From 709d000780b2e7072a6adcf81691e0402a3fdb22 Mon Sep 17 00:00:00 2001
From: G <bharath78910@gmail.com>
Date: Sat, 20 Jun 2026 23:13:17 +0530
Subject: [PATCH 1/2] core page index cache changes

Signed-off-by: G <bharath78910@gmail.com>
---
 .../rust/src/cache.rs                         |  183 --
 .../src/{ => cache}/custom_cache_manager.rs   |   65 +-
 .../rust/src/{ => cache}/eviction_policy.rs   |    0
 .../rust/src/cache/metadata_cache.rs          |  344 ++++
 .../rust/src/cache/mod.rs                     |   37 +
 .../rust/src/cache/page_index/cache_keys.rs   |   65 +
 .../rust/src/cache/page_index/cache_store.rs  |  195 ++
 .../page_index/column_schema_resolver.rs      |  116 ++
 .../rust/src/cache/page_index/mod.rs          |  188 ++
 .../src/cache/page_index/page_index_io.rs     | 1626 +++++++++++++++++
 .../rust/src/{ => cache}/statistics_cache.rs  |    0
 .../rust/src/indexed_executor.rs              |   75 +-
 .../rust/src/indexed_table/parquet_bridge.rs  |   58 +-
 .../rust/src/lib.rs                           |   11 +-
 .../rust/src/scoped_index_optimizer.rs        |  415 +++++
 .../rust/src/scoped_page_index_reader.rs      |  388 ++++
 .../rust/src/session_context.rs               |   29 +
 17 files changed, 3554 insertions(+), 241 deletions(-)
 delete mode 100644 sandbox/plugins/analytics-backend-datafusion/rust/src/cache.rs
 rename sandbox/plugins/analytics-backend-datafusion/rust/src/{ => cache}/custom_cache_manager.rs (88%)
 rename sandbox/plugins/analytics-backend-datafusion/rust/src/{ => cache}/eviction_policy.rs (100%)
 create mode 100644 sandbox/plugins/analytics-backend-datafusion/rust/src/cache/metadata_cache.rs
 create mode 100644 sandbox/plugins/analytics-backend-datafusion/rust/src/cache/mod.rs
 create mode 100644 sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/cache_keys.rs
 create mode 100644 sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/cache_store.rs
 create mode 100644 sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/column_schema_resolver.rs
 create mode 100644 sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/mod.rs
 create mode 100644 sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/page_index_io.rs
 rename sandbox/plugins/analytics-backend-datafusion/rust/src/{ => cache}/statistics_cache.rs (100%)
 create mode 100644 sandbox/plugins/analytics-backend-datafusion/rust/src/scoped_index_optimizer.rs
 create mode 100644 sandbox/plugins/analytics-backend-datafusion/rust/src/scoped_page_index_reader.rs

diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/cache.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache.rs
deleted file mode 100644
index d5fb186acbc51..0000000000000
--- a/sandbox/plugins/analytics-backend-datafusion/rust/src/cache.rs
+++ /dev/null
@@ -1,183 +0,0 @@
-/*
- * SPDX-License-Identifier: Apache-2.0
- *
- * The OpenSearch Contributors require contributions made to
- * this file be licensed under the Apache-2.0 license or a
- * compatible open source license.
- */
-
-use std::sync::atomic::{AtomicUsize, Ordering};
-use std::sync::{Arc, Mutex};
-
-use datafusion::execution::cache::cache_manager::{
-    CachedFileMetadataEntry, FileMetadataCache, FileMetadataCacheEntry,
-};
-use datafusion::execution::cache::DefaultFilesMetadataCache;
-use datafusion::execution::cache::CacheAccessor;
-use log::error;
-use object_store::path::Path;
-
-// Cache type constants
-pub const CACHE_TYPE_METADATA: &str = "METADATA";
-pub const CACHE_TYPE_STATS: &str = "STATISTICS";
-
-// Helper function to log cache operations
-fn log_cache_error(operation: &str, error: &str) {
-    error!("[CACHE ERROR] {} operation failed: {}", operation, error);
-}
-
-// Wrapper to make Mutex<DefaultFilesMetadataCache> implement FileMetadataCache
-pub struct MutexFileMetadataCache {
-    pub inner: Mutex<DefaultFilesMetadataCache>,
-    hit_count: AtomicUsize,
-    miss_count: AtomicUsize,
-}
-
-impl MutexFileMetadataCache {
-    pub fn new(cache: DefaultFilesMetadataCache) -> Self {
-        Self {
-            inner: Mutex::new(cache),
-            hit_count: AtomicUsize::new(0),
-            miss_count: AtomicUsize::new(0),
-        }
-    }
-
-    pub fn hit_count(&self) -> usize {
-        self.hit_count.load(Ordering::Relaxed)
-    }
-
-    pub fn miss_count(&self) -> usize {
-        self.miss_count.load(Ordering::Relaxed)
-    }
-
-    pub fn reset_stats(&self) {
-        self.hit_count.store(0, Ordering::Relaxed);
-        self.miss_count.store(0, Ordering::Relaxed);
-    }
-
-    pub fn clear_cache(&self) {
-        if let Ok(cache) = self.inner.lock() {
-            cache.clear();
-        }
-    }
-
-    pub fn update_cache_limit(&self, new_limit: usize) {
-        if let Ok(cache) = self.inner.lock() {
-            cache.update_cache_limit(new_limit);
-        }
-    }
-
-    pub fn get_cache_limit(&self) -> usize {
-        if let Ok(cache) = self.inner.lock() {
-            cache.cache_limit()
-        } else {
-            0
-        }
-    }
-}
-
-impl CacheAccessor<Path, CachedFileMetadataEntry> for MutexFileMetadataCache {
-    fn get(&self, k: &Path) -> Option<CachedFileMetadataEntry> {
-        match self.inner.lock() {
-            Ok(cache) => {
-                let result = cache.get(k);
-                if result.is_some() {
-                    self.hit_count.fetch_add(1, Ordering::Relaxed);
-                } else {
-                    self.miss_count.fetch_add(1, Ordering::Relaxed);
-                }
-                result
-            }
-            Err(e) => {
-                log_cache_error("get", &e.to_string());
-                None
-            }
-        }
-    }
-
-    fn put(&self, k: &Path, v: CachedFileMetadataEntry) -> Option<CachedFileMetadataEntry> {
-        match self.inner.lock() {
-            Ok(cache) => cache.put(k, v),
-            Err(e) => {
-                log_cache_error("put", &e.to_string());
-                None
-            }
-        }
-    }
-
-    fn remove(&self, k: &Path) -> Option<CachedFileMetadataEntry> {
-        match self.inner.lock() {
-            Ok(cache) => cache.remove(k),
-            Err(e) => {
-                log_cache_error("remove", &e.to_string());
-                None
-            }
-        }
-    }
-
-    fn contains_key(&self, k: &Path) -> bool {
-        match self.inner.lock() {
-            Ok(cache) => cache.contains_key(k),
-            Err(e) => {
-                log_cache_error("contains_key", &e.to_string());
-                false
-            }
-        }
-    }
-
-    fn len(&self) -> usize {
-        match self.inner.lock() {
-            Ok(cache) => cache.len(),
-            Err(e) => {
-                log_cache_error("len", &e.to_string());
-                0
-            }
-        }
-    }
-
-    fn clear(&self) {
-        match self.inner.lock() {
-            Ok(cache) => cache.clear(),
-            Err(e) => log_cache_error("clear", &e.to_string()),
-        }
-    }
-
-    fn name(&self) -> String {
-        match self.inner.lock() {
-            Ok(cache) => cache.name(),
-            Err(e) => {
-                log_cache_error("name", &e.to_string());
-                "cache_error".to_string()
-            }
-        }
-    }
-}
-
-impl FileMetadataCache for MutexFileMetadataCache {
-    fn cache_limit(&self) -> usize {
-        match self.inner.lock() {
-            Ok(cache) => cache.cache_limit(),
-            Err(e) => {
-                log_cache_error("cache_limit", &e.to_string());
-                0
-            }
-        }
-    }
-
-    fn update_cache_limit(&self, limit: usize) {
-        match self.inner.lock() {
-            Ok(cache) => cache.update_cache_limit(limit),
-            Err(e) => log_cache_error("update_cache_limit", &e.to_string()),
-        }
-    }
-
-    fn list_entries(&self) -> std::collections::HashMap<Path, FileMetadataCacheEntry> {
-        match self.inner.lock() {
-            Ok(cache) => cache.list_entries(),
-            Err(e) => {
-                log_cache_error("list_entries", &e.to_string());
-                std::collections::HashMap::new()
-            }
-        }
-    }
-}
diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/custom_cache_manager.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/custom_cache_manager.rs
similarity index 88%
rename from sandbox/plugins/analytics-backend-datafusion/rust/src/custom_cache_manager.rs
rename to sandbox/plugins/analytics-backend-datafusion/rust/src/cache/custom_cache_manager.rs
index 097d3657b9e8e..d170a274a73bb 100644
--- a/sandbox/plugins/analytics-backend-datafusion/rust/src/custom_cache_manager.rs
+++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/custom_cache_manager.rs
@@ -10,13 +10,14 @@ use std::sync::Arc;
 use datafusion::execution::cache::cache_manager::{FileMetadataCache, FileStatisticsCache, CacheManagerConfig};
 use datafusion::execution::cache::file_statistics_cache::DefaultFileStatisticsCache;
 use datafusion::execution::cache::CacheAccessor;
-use crate::statistics_cache::compute_parquet_statistics;
-use crate::cache::MutexFileMetadataCache;
-use crate::statistics_cache::CustomStatisticsCache;
+use crate::cache::statistics_cache::compute_parquet_statistics;
+use crate::cache::metadata_cache::MutexFileMetadataCache;
+use crate::cache::statistics_cache::CustomStatisticsCache;
 use object_store::path::Path;
 use object_store::ObjectMeta;
-use datafusion::datasource::physical_plan::parquet::metadata::DFParquetMetadata;
+use object_store::ObjectStore;
 use log::{debug, error};
+use crate::indexed_table::parquet_bridge;
 
 /// Create ObjectMeta from a local file path.
 fn create_object_meta_from_file(file_path: &str) -> Result<Vec<ObjectMeta>, datafusion::common::DataFusionError> {
@@ -229,14 +230,14 @@ impl CustomCacheManager {
     /// Check if a file exists in a specific cache type
     pub fn contains_file_by_type(&self, file_path: &str, cache_type: &str) -> bool {
         match cache_type {
-            crate::cache::CACHE_TYPE_METADATA => {
+            crate::cache::metadata_cache::CACHE_TYPE_METADATA => {
                 let path = Path::from(file_path);
                 self.file_metadata_cache
                     .as_ref()
                     .and_then(|cache| cache.get(&path))
                     .is_some()
             }
-            crate::cache::CACHE_TYPE_STATS => {
+            crate::cache::metadata_cache::CACHE_TYPE_STATS => {
                 self.statistics_cache
                     .as_ref()
                     .map_or(false, |cache| cache.contains_key(&Path::from(file_path)))
@@ -294,7 +295,7 @@ impl CustomCacheManager {
     /// Clear specific cache type
     pub fn clear_cache_type(&self, cache_type: &str) -> Result<(), String> {
         match cache_type {
-            crate::cache::CACHE_TYPE_METADATA => {
+            crate::cache::metadata_cache::CACHE_TYPE_METADATA => {
                 if let Some(cache) = &self.file_metadata_cache {
                     cache.clear();
                     Ok(())
@@ -302,7 +303,7 @@ impl CustomCacheManager {
                     Err("No metadata cache configured".to_string())
                 }
             }
-            crate::cache::CACHE_TYPE_STATS => {
+            crate::cache::metadata_cache::CACHE_TYPE_STATS => {
                 if let Some(cache) = &self.statistics_cache {
                     cache.clear();
                     Ok(())
@@ -317,7 +318,7 @@ impl CustomCacheManager {
     /// Get memory consumed by specific cache type
     pub fn get_memory_consumed_by_type(&self, cache_type: &str) -> Result<usize, String> {
         match cache_type {
-            crate::cache::CACHE_TYPE_METADATA => {
+            crate::cache::metadata_cache::CACHE_TYPE_METADATA => {
                 if let Some(cache) = &self.file_metadata_cache {
                     if let Ok(cache_guard) = cache.inner.lock() {
                         Ok(cache_guard.memory_used())
@@ -328,7 +329,7 @@ impl CustomCacheManager {
                     Err("No metadata cache configured".to_string())
                 }
             }
-            crate::cache::CACHE_TYPE_STATS => {
+            crate::cache::metadata_cache::CACHE_TYPE_STATS => {
                 if let Some(cache) = &self.statistics_cache {
                     Ok(cache.memory_consumed())
                 } else {
@@ -351,42 +352,20 @@ impl CustomCacheManager {
         let object_meta = object_metas.first()
             .ok_or_else(|| "No object metadata returned".to_string())?;
 
-        let store = Arc::new(object_store::local::LocalFileSystem::new());
+        let store: Arc<dyn ObjectStore> = Arc::new(object_store::local::LocalFileSystem::new());
 
-        // Get cache reference for DataFusion metadata loading
-        let cache_ref = self.file_metadata_cache.as_ref()
-            .ok_or_else(|| "No file metadata cache configured".to_string())?;
+        let metadata_cache = self.file_metadata_cache.as_ref()
+            .ok_or_else(|| "No file metadata cache configured".to_string())?
+            .clone() as Arc<dyn FileMetadataCache>;
 
-        let metadata_cache = cache_ref.clone() as Arc<dyn FileMetadataCache>;
-
-        // Use DataFusion's metadata loading by passing reference to file_metadata_cache to get complete metadata
-        // IMPORTANT: When a cache is provided to DFParquetMetadata, fetch_metadata() will:
-        // 1. Enable page index loading (with_page_indexes(true))
-        // 2. Load the complete metadata including column and offset indexes
-        // 3. Automatically put the metadata into the cache (lines 155-160 in datafusion's metadata.rs)
-        // This ensures we cache exactly what DataFusion would cache during query execution
-        let _parquet_metadata = rt_handle.block_on(async {
-            let df_metadata = DFParquetMetadata::new(store.as_ref(), object_meta)
-                .with_file_metadata_cache(Some(metadata_cache));
-
-            // fetch_metadata() performs the cache put operation internally
-            df_metadata.fetch_metadata().await
-                .map_err(|e| format!("Failed to fetch metadata: {}", e))
+        // Warm the level-1 metadata cache footer-only. `load_parquet_metadata`
+        // fetches with PageIndexPolicy::Skip — only footer bytes, no page index IO.
+        // On success the entry is in the cache; on failure the error propagates.
+        let location = object_meta.location.clone();
+        rt_handle.block_on(async {
+            parquet_bridge::load_parquet_metadata(store, &location, metadata_cache).await
         })?;
-
-        // Verify the metadata was cached properly
-        match cache_ref.inner.lock() {
-            Ok(cache_guard) => {
-                let path = Path::from(file_path.to_string());
-                if cache_guard.contains_key(&path) {
-                    Ok(true)
-                } else {
-                    debug!("[CACHE ERROR] Failed to cache metadata for: {}", file_path);
-                    Ok(false)
-                }
-            }
-            Err(e) => Err(format!("Failed to verify cache: {}", e))
-        }
+        Ok(true)
     }
 
     /// Compute and put statistics into cache
diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/eviction_policy.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/eviction_policy.rs
similarity index 100%
rename from sandbox/plugins/analytics-backend-datafusion/rust/src/eviction_policy.rs
rename to sandbox/plugins/analytics-backend-datafusion/rust/src/cache/eviction_policy.rs
diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/metadata_cache.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/metadata_cache.rs
new file mode 100644
index 0000000000000..e71fe68a03a39
--- /dev/null
+++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/metadata_cache.rs
@@ -0,0 +1,344 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+use std::sync::atomic::{AtomicUsize, Ordering};
+use std::sync::{Arc, Mutex};
+
+use datafusion::datasource::physical_plan::parquet::metadata::CachedParquetMetaData;
+use datafusion::execution::cache::cache_manager::{
+    CachedFileMetadataEntry, FileMetadataCache, FileMetadataCacheEntry,
+};
+use datafusion::execution::cache::CacheAccessor;
+use datafusion::execution::cache::DefaultFilesMetadataCache;
+use datafusion::parquet::file::metadata::ParquetMetaData;
+use log::error;
+use object_store::path::Path;
+
+// Cache type constants
+pub const CACHE_TYPE_METADATA: &str = "METADATA";
+pub const CACHE_TYPE_STATS: &str = "STATISTICS";
+
+// Helper that logs a failed cache operation (and its error text) at error level.
+fn log_cache_error(operation: &str, error: &str) {
+    error!("[CACHE ERROR] {} operation failed: {}", operation, error);
+}
+
+/// Return a cache entry whose `ParquetMetaData` carries footer-only metadata (no
+/// `ColumnIndex` / `OffsetIndex`). If the entry already lacks a page index — or
+/// isn't a `CachedParquetMetaData` at all — it's returned unchanged (no clone, no
+/// rebuild).
+///
+/// This is the single chokepoint that enforces the footer-only invariant: every
+/// `put` runs the entry through here before it lands in the shared LRU.
+fn strip_page_index(entry: CachedFileMetadataEntry) -> CachedFileMetadataEntry {
+    let Some(cached) = entry
+        .file_metadata
+        .as_any()
+        .downcast_ref::<CachedParquetMetaData>()
+    else {
+        return entry;
+    };
+    let meta = cached.parquet_metadata();
+    if meta.column_index().is_none() && meta.offset_index().is_none() {
+        // Already footer-only — keep the existing Arc, avoid a rebuild.
+        return entry;
+    }
+    // Rebuild without the page index. The heavy decoded `ColumnIndex` /
+    // `OffsetIndex` are released when the original Arc drops; the footer
+    // (row-group + column chunk stats) is preserved.
+    let stripped = ParquetMetaData::clone(meta)
+        .into_builder()
+        .set_column_index(None)
+        .set_offset_index(None)
+        .build();
+    CachedFileMetadataEntry::new(
+        entry.meta,
+        Arc::new(CachedParquetMetaData::new(Arc::new(stripped))),
+    )
+}
+
+/// Wraps `Mutex<DefaultFilesMetadataCache>` so it implements `FileMetadataCache`; also counts hits/misses.
+pub struct MutexFileMetadataCache {
+    pub inner: Mutex<DefaultFilesMetadataCache>,
+    hit_count: AtomicUsize,
+    miss_count: AtomicUsize,
+}
+
+impl MutexFileMetadataCache {
+    pub fn new(cache: DefaultFilesMetadataCache) -> Self {
+        Self {
+            inner: Mutex::new(cache),
+            hit_count: AtomicUsize::new(0),
+            miss_count: AtomicUsize::new(0),
+        }
+    }
+
+    pub fn hit_count(&self) -> usize {
+        self.hit_count.load(Ordering::Relaxed)
+    }
+
+    pub fn miss_count(&self) -> usize {
+        self.miss_count.load(Ordering::Relaxed)
+    }
+
+    pub fn reset_stats(&self) {
+        self.hit_count.store(0, Ordering::Relaxed);
+        self.miss_count.store(0, Ordering::Relaxed);
+    }
+
+    pub fn clear_cache(&self) {
+        if let Ok(cache) = self.inner.lock() {
+            cache.clear();
+        }
+    }
+
+    pub fn update_cache_limit(&self, new_limit: usize) {
+        if let Ok(cache) = self.inner.lock() {
+            cache.update_cache_limit(new_limit);
+        }
+    }
+
+    pub fn get_cache_limit(&self) -> usize {
+        if let Ok(cache) = self.inner.lock() {
+            cache.cache_limit()
+        } else {
+            0
+        }
+    }
+}
+
+impl CacheAccessor<Path, CachedFileMetadataEntry> for MutexFileMetadataCache {
+    fn get(&self, k: &Path) -> Option<CachedFileMetadataEntry> {
+        match self.inner.lock() {
+            Ok(cache) => {
+                let result = cache.get(k);
+                if result.is_some() {
+                    self.hit_count.fetch_add(1, Ordering::Relaxed);
+                } else {
+                    self.miss_count.fetch_add(1, Ordering::Relaxed);
+                }
+                result
+            }
+            Err(e) => {
+                log_cache_error("get", &e.to_string());
+                None
+            }
+        }
+    }
+
+    fn put(&self, k: &Path, v: CachedFileMetadataEntry) -> Option<CachedFileMetadataEntry> {
+        // Enforce the footer-only invariant at the single cache chokepoint.
+        //
+        // DataFusion's parquet paths (`infer_schema`, the scan opener,
+        // `fetch_statistics`) hand this cache to `DFParquetMetadata::fetch_metadata`,
+        // which force-decodes the FULL page index (`ColumnIndex` + `OffsetIndex`
+        // for every column of every row group) before calling `put`. On wide
+        // schemas that decoded index dominates the native heap and, since this is
+        // a shared LRU keyed by path, also evicts the small footer-only entries
+        // the scan paths depend on.
+        //
+        // We can't stop DataFusion from decoding it, but we can refuse to retain
+        // it: strip the page index here so the level-1 cache only ever holds
+        // footer-only metadata (row-group + file stats). Page-level pruning is
+        // unaffected — both scan paths rebuild a predicate-scoped page index per
+        // query through the shared scoped cache (`crate::cache::page_index`).
+        let v = strip_page_index(v);
+        match self.inner.lock() {
+            Ok(cache) => cache.put(k, v),
+            Err(e) => {
+                log_cache_error("put", &e.to_string());
+                None
+            }
+        }
+    }
+
+    fn remove(&self, k: &Path) -> Option<CachedFileMetadataEntry> {
+        match self.inner.lock() {
+            Ok(cache) => cache.remove(k),
+            Err(e) => {
+                log_cache_error("remove", &e.to_string());
+                None
+            }
+        }
+    }
+
+    fn contains_key(&self, k: &Path) -> bool {
+        match self.inner.lock() {
+            Ok(cache) => cache.contains_key(k),
+            Err(e) => {
+                log_cache_error("contains_key", &e.to_string());
+                false
+            }
+        }
+    }
+
+    fn len(&self) -> usize {
+        match self.inner.lock() {
+            Ok(cache) => cache.len(),
+            Err(e) => {
+                log_cache_error("len", &e.to_string());
+                0
+            }
+        }
+    }
+
+    fn clear(&self) {
+        match self.inner.lock() {
+            Ok(cache) => cache.clear(),
+            Err(e) => log_cache_error("clear", &e.to_string()),
+        }
+    }
+
+    fn name(&self) -> String {
+        match self.inner.lock() {
+            Ok(cache) => cache.name(),
+            Err(e) => {
+                log_cache_error("name", &e.to_string());
+                "cache_error".to_string()
+            }
+        }
+    }
+}
+
+impl FileMetadataCache for MutexFileMetadataCache {
+    fn cache_limit(&self) -> usize {
+        match self.inner.lock() {
+            Ok(cache) => cache.cache_limit(),
+            Err(e) => {
+                log_cache_error("cache_limit", &e.to_string());
+                0
+            }
+        }
+    }
+
+    fn update_cache_limit(&self, limit: usize) {
+        match self.inner.lock() {
+            Ok(cache) => cache.update_cache_limit(limit),
+            Err(e) => log_cache_error("update_cache_limit", &e.to_string()),
+        }
+    }
+
+    fn list_entries(&self) -> std::collections::HashMap<Path, FileMetadataCacheEntry> {
+        match self.inner.lock() {
+            Ok(cache) => cache.list_entries(),
+            Err(e) => {
+                log_cache_error("list_entries", &e.to_string());
+                std::collections::HashMap::new()
+            }
+        }
+    }
+}
+
+#[cfg(test)]
+mod strip_page_index_tests {
+    use super::*;
+    use datafusion::arrow::array::{Int64Array, RecordBatch};
+    use datafusion::arrow::datatypes::{DataType, Field, Schema};
+    use datafusion::parquet::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions};
+    use datafusion::parquet::arrow::ArrowWriter;
+    use datafusion::parquet::file::properties::{EnabledStatistics, WriterProperties};
+    use object_store::ObjectMeta;
+    use prost::bytes::Bytes;
+
+    fn parquet_with_page_index() -> Bytes {
+        let schema = Arc::new(Schema::new(vec![Field::new("v", DataType::Int64, false)]));
+        let batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![Arc::new(Int64Array::from((0..4096i64).collect::<Vec<_>>()))],
+        )
+        .unwrap();
+        let props = WriterProperties::builder()
+            .set_statistics_enabled(EnabledStatistics::Page)
+            .set_data_page_row_count_limit(128)
+            .build();
+        let mut buf: Vec<u8> = Vec::new();
+        let mut w = ArrowWriter::try_new(&mut buf, schema, Some(props)).unwrap();
+        w.write(&batch).unwrap();
+        w.close().unwrap();
+        Bytes::from(buf)
+    }
+
+    fn object_meta(bytes: &Bytes) -> ObjectMeta {
+        ObjectMeta {
+            location: Path::from("data.parquet"),
+            last_modified: chrono::Utc::now(),
+            size: bytes.len() as u64,
+            e_tag: None,
+            version: None,
+        }
+    }
+
+    fn full_index_entry(bytes: &Bytes) -> CachedFileMetadataEntry {
+        let meta = ArrowReaderMetadata::load(
+            &bytes.clone(),
+            ArrowReaderOptions::new().with_page_index(true),
+        )
+        .unwrap();
+        let pq = meta.metadata().clone();
+        assert!(pq.column_index().is_some() && pq.offset_index().is_some());
+        CachedFileMetadataEntry::new(object_meta(bytes), Arc::new(CachedParquetMetaData::new(pq)))
+    }
+
+    fn page_index_present(entry: &CachedFileMetadataEntry) -> bool {
+        let cached = entry
+            .file_metadata
+            .as_any()
+            .downcast_ref::<CachedParquetMetaData>()
+            .unwrap();
+        let m = cached.parquet_metadata();
+        m.column_index().is_some() || m.offset_index().is_some()
+    }
+
+    #[test]
+    fn put_strips_page_index_and_get_returns_footer_only() {
+        let bytes = parquet_with_page_index();
+        let entry = full_index_entry(&bytes);
+        assert!(page_index_present(&entry), "precondition: entry has page index");
+
+        let cache = MutexFileMetadataCache::new(DefaultFilesMetadataCache::new(64 * 1024 * 1024));
+        let key = Path::from("data.parquet");
+        cache.put(&key, entry);
+
+        let got = cache.get(&key).expect("entry must be retrievable");
+        assert!(!page_index_present(&got), "cached entry must be footer-only after put");
+        let cached = got
+            .file_metadata
+            .as_any()
+            .downcast_ref::<CachedParquetMetaData>()
+            .unwrap();
+        let m = cached.parquet_metadata();
+        assert!(m.num_row_groups() > 0);
+        assert!(m.row_group(0).column(0).statistics().is_some(), "footer stats must survive");
+    }
+
+    #[test]
+    fn strip_is_noop_for_footer_only_entry() {
+        let bytes = parquet_with_page_index();
+        let meta = ArrowReaderMetadata::load(
+            &bytes.clone(),
+            ArrowReaderOptions::new().with_page_index(false),
+        )
+        .unwrap();
+        let pq = meta.metadata().clone();
+        assert!(pq.column_index().is_none() && pq.offset_index().is_none());
+        let entry = CachedFileMetadataEntry::new(
+            object_meta(&bytes),
+            Arc::new(CachedParquetMetaData::new(Arc::clone(&pq))),
+        );
+        let stripped = strip_page_index(entry);
+        let cached = stripped
+            .file_metadata
+            .as_any()
+            .downcast_ref::<CachedParquetMetaData>()
+            .unwrap();
+        assert!(
+            Arc::ptr_eq(cached.parquet_metadata(), &pq),
+            "footer-only entry must be returned unchanged (same Arc)"
+        );
+    }
+}
diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/mod.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/mod.rs
new file mode 100644
index 0000000000000..31cd378c69518
--- /dev/null
+++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/mod.rs
@@ -0,0 +1,37 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+//! Cache infrastructure for the analytics backend.
+//!
+//! # Structure
+//!
+//! - [`eviction_policy`] — pluggable eviction policy trait (`CachePolicy`) and
+//!   built-in implementations (`LruPolicy`, `LfuPolicy`). Add new policies here
+//!   (e.g. S3-FIFO) without touching the cache implementations.
+//! - [`metadata_cache`] — `MutexFileMetadataCache`: wraps DataFusion's
+//!   `DefaultFilesMetadataCache` with hit/miss counters and enforces the
+//!   footer-only invariant via `strip_page_index` at every `put`.
+//! - [`statistics_cache`] — `CustomStatisticsCache`: byte-bounded LRU cache
+//!   for per-file `Statistics` (row-group min/max/null-count).
+//! - [`custom_cache_manager`] — `CustomCacheManager`: ties the metadata and
+//!   statistics caches together for pre-warming and lifecycle management.
+//! - [`page_index`] — scoped parquet page-index caches (ColumnIndex +
+//!   OffsetIndex), cell-granular and backed by `BoundedCache` /
+//!   `Box<dyn CachePolicy>`.
+
+pub mod custom_cache_manager;
+pub mod eviction_policy;
+pub mod metadata_cache;
+pub mod page_index;
+pub mod statistics_cache;
+
+// Flat re-exports so existing call sites keep working without path changes.
+pub use custom_cache_manager::CustomCacheManager;
+pub use eviction_policy::{CachePolicy, CacheResult, PolicyType, create_policy};
+pub use metadata_cache::{MutexFileMetadataCache, CACHE_TYPE_METADATA, CACHE_TYPE_STATS};
+pub use statistics_cache::{CustomStatisticsCache, compute_parquet_statistics};
diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/cache_keys.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/cache_keys.rs
new file mode 100644
index 0000000000000..8bcc121a1646e
--- /dev/null
+++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/cache_keys.rs
@@ -0,0 +1,65 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+//! Cache key types for the two scoped page-index caches.
+
+use std::fmt::Display;
+use std::sync::Arc;
+
+use parquet::file::page_index::offset_index::OffsetIndexMetaData;
+
+/// ColumnIndex cache key — one decoded `ColumnIndexMetaData` **cell** per
+/// `(file, column, row-group)`. The page index for a given column+RG is an
+/// intrinsic property of the file: it is identical no matter which *other*
+/// columns a query filters on, or which literal a predicate uses. Keying at the
+/// cell granularity means a column's per-page string min/max is decoded and
+/// stored **once per file**, then reused by every query whose predicate touches
+/// that column — regardless of the predicate-column *combination* or the
+/// surviving-row-group *set*. (The prior set-keyed design re-decoded and
+/// re-stored a column for every distinct predicate/RG combination — storage grew
+/// with query diversity, not schema width.)
+///
+/// Both scan paths resolve the same `(file, col, rg)` for the same logical request,
+/// so cells are shared across paths. Its `Display` form is `path:col:rg`.
+#[derive(Clone, PartialEq, Eq, Hash, Debug)]
+pub(crate) struct CiCellKey {
+    pub(crate) path: Arc<str>,
+    pub(crate) col: usize,
+    pub(crate) rg: usize,
+}
+
+impl Display for CiCellKey {
+    fn fmt(&self, out: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        out.write_fmt(format_args!("{}:{}:{}", self.path, self.col, self.rg))
+    }
+}
+
+/// OffsetIndex cache key — one decoded value per `(file, column)`, where the
+/// value is that column's `OffsetIndexMetaData` for **every** row group (a
+/// `Vec` indexed by RG). Unlike the ColumnIndex, the OffsetIndex is read at scan
+/// time for any RG DataFusion chooses to scan — and DataFusion picks that set
+/// itself, after our load — so a column's OffsetIndex must always cover all RGs
+/// (an empty entry on a scanned RG panics / breaks reads). RG can therefore never
+/// be a key axis here; the cell is the whole-column, all-RG offset index. Keyed
+/// only on `(file, col)`, so any query that reads a column reuses its offset index
+/// irrespective of projection or predicate. Its `Display` form is `path:col`.
+#[derive(Clone, PartialEq, Eq, Hash, Debug)]
+pub(crate) struct OiCellKey {
+    pub(crate) path: Arc<str>,
+    pub(crate) col: usize,
+}
+
+impl Display for OiCellKey {
+    fn fmt(&self, out: &mut std::fmt::Formatter<'_>) -> std::fmt::Result {
+        out.write_fmt(format_args!("{}:{}", self.path, self.col))
+    }
+}
+
+/// One column's OffsetIndex across all row groups (indexed by RG). The value type
+/// of `OFFSET_INDEX_CACHE` (declared in the parent `page_index` module).
+pub(crate) type OiColumn = Vec<OffsetIndexMetaData>;
diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/cache_store.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/cache_store.rs
new file mode 100644
index 0000000000000..90977cfe1364a
--- /dev/null
+++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/cache_store.rs
@@ -0,0 +1,195 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+//! Byte-bounded cache with a pluggable eviction policy, used by the two
+//! scoped page-index caches ([`COLUMN_INDEX_CACHE`] / [`OFFSET_INDEX_CACHE`]
+//! defined in `mod.rs`).
+
+use std::fmt::Display;
+use std::hash::Hash;
+use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering::Relaxed};
+use std::sync::Mutex;
+
+use dashmap::DashMap;
+
+use crate::cache::eviction_policy::{CachePolicy, PolicyType, create_policy};
+
+/// Default byte budget (150 MiB) for EACH scoped cache, used until the caller sets
+/// one from the runtime's configured limit (see [`super::set_column_index_cache_limit`]
+/// / [`super::set_offset_index_cache_limit`]). The two caches are budgeted
+/// independently: the ColumnIndex (per-page string min/max) is the heavy one and the
+/// OffsetIndex (fixed-width page offsets) is tiny, so they get separate,
+/// separately-tunable limits rather than sharing one number.
+///
+/// TODO: configure via settings
+pub(crate) const DEFAULT_SCOPED_CACHE_LIMIT: usize = 150 * 1024 * 1024;
+
+/// Snapshot of one scoped cache's counters plus occupancy. Surfaced on
+/// node-stats and used by tests to assert hits/misses without `Arc::ptr_eq`.
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)]
+pub struct ScopedCacheStats {
+    pub hits: u64, // lookups that found an entry
+    pub misses: u64, // lookups that found nothing
+    pub evictions: u64, // entries removed by budget eviction or by prefix eviction
+    pub entries: usize, // entries currently stored
+    pub used_bytes: usize, // sum of the caller-declared sizes of stored entries
+    pub limit_bytes: usize, // configured byte budget
+}
+
+/// Byte-bounded cache with a pluggable eviction policy.
+///
+/// `DashMap` shards the value store, so value lookups for different keys do not
+/// share one map-wide lock. The eviction policy (ordering metadata + victim
+/// selection) sits behind a single `Mutex`, taken by `insert`, by eviction, and by
+/// every `get` hit (`on_access`). Counters are atomics; `stats()` skips that mutex.
+pub(super) struct BoundedCache<K, V>
+where
+    K: Eq + Hash + Clone + Display + Send + Sync + 'static,
+    V: Clone + Send + Sync + 'static,
+{
+    /// Primary value store (sharded); each entry is `(value, size_in_bytes)`.
+    map: DashMap<K, (V, usize)>,
+    /// Reverse map: policy string key → typed cache key, needed to resolve
+    /// eviction candidates (the policy works with `String` keys).
+    reverse: DashMap<String, K>,
+    /// Eviction policy — behind a Mutex since mutation (on_insert/on_access/on_remove)
+    /// is not concurrent-safe. A `get` hit locks it briefly to record the access.
+    policy: Mutex<Box<dyn CachePolicy>>,
+    limit: AtomicUsize,
+    // lock-free counters
+    hits: AtomicU64,
+    misses: AtomicU64,
+    evictions: AtomicU64,
+    used_bytes: AtomicUsize,
+}
+
+impl<K, V> BoundedCache<K, V>
+where
+    K: Eq + Hash + Clone + Display + Send + Sync + 'static,
+    V: Clone + Send + Sync + 'static,
+{
+    pub(super) fn new(limit: usize, policy_type: PolicyType) -> Self { // empty cache, `limit` bytes
+        Self {
+            map: DashMap::new(),
+            reverse: DashMap::new(),
+            policy: Mutex::new(create_policy(policy_type)),
+            limit: AtomicUsize::new(limit),
+            hits: AtomicU64::new(0),
+            misses: AtomicU64::new(0),
+            evictions: AtomicU64::new(0),
+            used_bytes: AtomicUsize::new(0),
+        }
+    }
+
+    pub(super) fn get(&self, key: &K) -> Option<V> { // clone of the value, or `None` on a miss
+        match self.map.get(key) {
+            Some(entry) => {
+                let size = entry.1;
+                if let Ok(mut p) = self.policy.lock() { // every hit takes the policy mutex
+                    p.on_access(&key.to_string(), size);
+                }
+                self.hits.fetch_add(1, Relaxed);
+                Some(entry.0.clone())
+            }
+            None => {
+                self.misses.fetch_add(1, Relaxed);
+                None
+            }
+        }
+    }
+
+    pub(super) fn insert(&self, key: K, value: V, size: usize) { // `size`: caller-declared bytes
+        let limit = self.limit.load(Relaxed);
+        if size > limit { // larger than the whole budget: not cached (map is left untouched)
+            return;
+        }
+        let key_str = key.to_string();
+        if let Some(old) = self.map.insert(key.clone(), (value, size)) {
+            self.used_bytes.fetch_sub(old.1, Relaxed); // replaced entry: release its bytes
+        }
+        self.reverse.insert(key_str.clone(), key);
+        self.used_bytes.fetch_add(size, Relaxed);
+        if let Ok(mut p) = self.policy.lock() {
+            p.on_insert(&key_str, size);
+        }
+        self.evict(); // bring usage back under the limit if this insert overshot it
+    }
+
+    fn evict(&self) { // one pass: remove the policy's candidates when over the byte limit
+        let limit = self.limit.load(Relaxed);
+        let used = self.used_bytes.load(Relaxed);
+        if used <= limit {
+            return;
+        }
+        let candidates = if let Ok(p) = self.policy.lock() {
+            p.select_for_eviction(used - limit) // argument: bytes over budget
+        } else {
+            return;
+        };
+        for key_str in candidates { // the policy guard above is released before this loop
+            if let Some((_, typed_key)) = self.reverse.remove(&key_str) {
+                if let Some((_, (_, size))) = self.map.remove(&typed_key) {
+                    self.used_bytes.fetch_sub(size, Relaxed);
+                    self.evictions.fetch_add(1, Relaxed);
+                    if let Ok(mut p) = self.policy.lock() {
+                        p.on_remove(&key_str);
+                    }
+                }
+            }
+        }
+    }
+
+    pub(super) fn set_limit(&self, limit: usize) {
+        self.limit.store(limit, Relaxed);
+        self.evict(); // a limit below current usage evicts right away
+    }
+
+    pub(super) fn clear_keep_limit(&self) { // drops entries AND counters; only `limit` survives
+        self.map.clear();
+        self.reverse.clear();
+        if let Ok(mut p) = self.policy.lock() {
+            p.clear();
+        }
+        self.hits.store(0, Relaxed);
+        self.misses.store(0, Relaxed);
+        self.evictions.store(0, Relaxed);
+        self.used_bytes.store(0, Relaxed);
+    }
+
+    /// Remove all entries whose string key (the key's `Display` form) starts with
+    /// `prefix`. This is a plain string-prefix match; removals count as evictions.
+    pub(super) fn evict_by_prefix(&self, prefix: &str) {
+        let victims: Vec<String> = self.reverse
+            .iter()
+            .filter(|e| e.key().starts_with(prefix))
+            .map(|e| e.key().clone())
+            .collect();
+        for key_str in victims { // victims were collected first; removal happens after the scan
+            if let Some((_, typed_key)) = self.reverse.remove(&key_str) {
+                if let Some((_, (_, size))) = self.map.remove(&typed_key) {
+                    self.used_bytes.fetch_sub(size, Relaxed);
+                    self.evictions.fetch_add(1, Relaxed);
+                    if let Ok(mut p) = self.policy.lock() {
+                        p.on_remove(&key_str);
+                    }
+                }
+            }
+        }
+    }
+
+    pub(super) fn stats(&self) -> ScopedCacheStats { // point-in-time snapshot; no policy lock
+        ScopedCacheStats {
+            hits: self.hits.load(Relaxed),
+            misses: self.misses.load(Relaxed),
+            evictions: self.evictions.load(Relaxed),
+            entries: self.map.len(),
+            used_bytes: self.used_bytes.load(Relaxed),
+            limit_bytes: self.limit.load(Relaxed),
+        }
+    }
+}
diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/column_schema_resolver.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/column_schema_resolver.rs
new file mode 100644
index 0000000000000..bc4e711003a95
--- /dev/null
+++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/column_schema_resolver.rs
@@ -0,0 +1,116 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+//! Predicate-column name → parquet leaf-index resolution.
+//!
+//! Resolution is done against the file's OWN schema (derived from the footer)
+//! rather than the shared table schema to ensure correct leaf indices under
+//! schema evolution (see [`resolve_predicate_parquet_columns`] for details).
+
+use std::collections::HashSet;
+use std::sync::Arc;
+
+use arrow::datatypes::SchemaRef;
+use datafusion::parquet::arrow::arrow_reader::statistics::StatisticsConverter;
+use datafusion::parquet::file::metadata::ParquetMetaData;
+use parquet::arrow::parquet_to_arrow_schema;
+
+/// Translate the query's predicate-column names into **this file's** parquet leaf
+/// indices, resolving against the file's OWN schema so the indices stay correct
+/// even when the file is missing columns (schema evolution).
+///
+/// # Why the file's own schema, not the shared table schema
+///
+/// `StatisticsConverter`/`parquet_column` find a column by its position in the
+/// supplied arrow schema and then match that position to a parquet leaf
+/// (`get_column_root_idx`). The table schema is the **union** of all files'
+/// columns (N fields); a given file may physically contain fewer (M leaves — e.g.
+/// a merged file does not write columns that are all-null). Resolving against the
+/// N-field union can therefore map a column to the WRONG leaf of an M-leaf file.
+/// The scoped ColumnIndex/OffsetIndex would then be built at the wrong leaf while
+/// the real one stays an empty placeholder — and DataFusion's pruner, which
+/// resolves against the file's physical schema, reads the real leaf and panics on
+/// the empty `page_locations` (`statistics.rs` `page_locations.last().unwrap()`).
+///
+/// Deriving the arrow schema from the file footer (`parquet_to_arrow_schema`)
+/// gives a 1:1 field↔leaf correspondence for that file, so the resolved index
+/// matches what DataFusion dereferences. Columns absent from the file are skipped.
+///
+/// `_arrow_schema` (the union table schema) is consulted only as a fallback when
+/// the per-file schema cannot be derived.
+pub fn resolve_predicate_parquet_columns(
+    _arrow_schema: &SchemaRef,
+    metadata: &ParquetMetaData,
+    predicate_column_names: &[String],
+) -> Vec<usize> {
+    let file_meta = metadata.file_metadata();
+    // Per-file arrow schema: 1:1 with this file's parquet leaves, so a column's
+    // arrow position maps to its true leaf.
+    let derived =
+        parquet_to_arrow_schema(file_meta.schema_descr(), file_meta.key_value_metadata());
+    let Ok(file_schema) = derived else {
+        // Could not derive the file schema: resolve against the union schema
+        // instead; the caller still falls back to footer-only metadata on any
+        // downstream mismatch.
+        return resolve_with_schema(_arrow_schema, metadata, predicate_column_names);
+    };
+    let file_arrow_schema: SchemaRef = Arc::new(file_schema);
+    resolve_with_schema(&file_arrow_schema, metadata, predicate_column_names)
+}
+
+/// Resolve TWO name-sets (e.g. predicate columns and projection columns) against
+/// the same file, deriving the per-file arrow schema only once.
+///
+/// `parquet_to_arrow_schema` rebuilds the whole file's `Schema` and is the dominant
+/// cost of name→leaf resolution on wide schemas; the two callers in the indexed
+/// setup loop previously each rebuilt it, so sharing it here removes one full
+/// schema reconstruction per file per query.
+///
+/// Each returned `Vec` holds the same leaf indices that calling
+/// [`resolve_predicate_parquet_columns`] on that name-set alone would produce.
+/// On a derivation failure both name-sets are resolved against `union_schema`,
+/// mirroring the single-name-set fallback.
+pub fn resolve_predicate_parquet_columns_pair(
+    union_schema: &SchemaRef,
+    metadata: &ParquetMetaData,
+    names_a: &[String],
+    names_b: &[String],
+) -> (Vec<usize>, Vec<usize>) {
+    let file_meta = metadata.file_metadata();
+    // `None` when the per-file schema cannot be derived from the footer.
+    let file_arrow_schema: Option<SchemaRef> =
+        parquet_to_arrow_schema(file_meta.schema_descr(), file_meta.key_value_metadata())
+            .ok()
+            .map(Arc::new);
+    // Same fallback as the single-name path: resolve against the union schema.
+    let schema = file_arrow_schema.as_ref().unwrap_or(union_schema);
+    // `names_a` is resolved first, then `names_b`, both against the same schema.
+    let leaves_a = resolve_with_schema(schema, metadata, names_a);
+    let leaves_b = resolve_with_schema(schema, metadata, names_b);
+    // Returned in argument order.
+    (leaves_a, leaves_b)
+}
+
+/// Resolve column names → parquet leaf indices against a specific arrow schema, via
+/// the same `StatisticsConverter` mapping DataFusion's pruner uses. Unresolvable names
+/// are skipped; the result is de-duplicated and sorted ascending (deterministic).
+pub(super) fn resolve_with_schema(
+    arrow_schema: &SchemaRef,
+    metadata: &ParquetMetaData,
+    predicate_column_names: &[String],
+) -> Vec<usize> {
+    let parquet_schema = metadata.file_metadata().schema_descr();
+    let unique: HashSet<usize> = predicate_column_names
+        .iter()
+        .filter_map(|name| StatisticsConverter::try_new(name, arrow_schema, parquet_schema).ok())
+        .filter_map(|conv| conv.parquet_column_index())
+        .collect();
+    let mut leaves: Vec<usize> = unique.into_iter().collect();
+    leaves.sort_unstable(); // HashSet iteration order is arbitrary; fix it here
+    leaves
+}
diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/mod.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/mod.rs
new file mode 100644
index 0000000000000..760f46d6d65e0
--- /dev/null
+++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/mod.rs
@@ -0,0 +1,188 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+//! Scoped parquet page-index caches — TWO caches, by consumer.
+//!
+//! # Why this exists
+//!
+//! Parquet metadata loading pulls the **entire page index** — `ColumnIndex`
+//! (per-page min/max; the per-page *string* min/max is the heap hog) plus
+//! `OffsetIndex` (per-page byte offsets), for every column of every row group.
+//! On wide schemas this is very memory expensive.
+//! The level-1 metadata cache is kept footer-only (see
+//! [`crate::cache`]); this module rebuilds a *scoped* page index per query and
+//! caches it, shared by both scan paths (the DataFusion `ListingTable` path and
+//! the custom indexed-table executor).
+//!
+//! # Two caches, because the two indexes have different drivers
+//!
+//! The `ColumnIndex` and `OffsetIndex` are consumed by different parts of
+//! DataFusion / parquet, with **different natural cache keys**. Forcing
+//! them into one key makes the projection-driven OffsetIndex poison the
+//! predicate-driven ColumnIndex's broad cross-path sharing (the failure mode of
+//! the prior iteration). So they are split:
+//!
+//! - **ColumnIndex — predicate-driven.** Read only at *prune* time, and only for
+//!   the predicate column being evaluated
+//!   (`page_filter::PagesPruningStatistics`, `offset_index[rg][predicate_col]`).
+//!   Key: one cell per `(file, column, row_group)` (`cache_keys::CiCellKey`) — an
+//!   intrinsic property of the file, independent of the predicate combination
+//!   and of what you `SELECT`, so a cell is shared across scan paths **and**
+//!   across queries. This is the heavy index (string min/max) and the big heap win.
+//!   Loaded only for predicate columns (`NONE` placeholders elsewhere) and,
+//!   optionally, only for footer-stats survivors ([`surviving_row_groups`]).
+//!
+//! - **OffsetIndex — projection-driven.** Read at *scan* time for **projected**
+//!   columns (`InMemoryRowGroup::fetch_ranges`, `projection.leaf_included(idx)`),
+//!   and at prune time for the predicate column, and at column 0 for the
+//!   page-skip metric. Key: one cell per `(file, column)` (`cache_keys::OiCellKey`),
+//!   loaded for `predicate ∪ projection ∪ {0}`. This is the cheap, fixed-width
+//!   index (no per-page string stats). Built for **all row groups** (an empty
+//!   OffsetIndex on a row group DataFusion scans panics / breaks reads, and
+//!   DataFusion chooses the scanned set itself, after our load — see
+//!   HANDOFF_step2_rg_scoping.md §1e).
+//!
+//! Each cache stores only decoded cells (`ColumnIndexMetaData` per column+RG /
+//! `Vec<OffsetIndexMetaData>` per column) — never a full `ParquetMetaData` (no footer
+//! duplication). On lookup the two are **grafted** onto the caller's
+//! already-resident footer via [`ParquetMetaData::into_builder`] →
+//! `set_column_index`/`set_offset_index`.
+//!
+//! **Consequence for tests:** a lookup returns a *fresh* `Arc`, so `Arc::ptr_eq`
+//! is the wrong signal for "served from cache" — assert via the per-cache hit
+//! counters ([`column_index_cache_stats`] / [`offset_index_cache_stats`]).
+//!
+//! ## Correctness / fallback
+//!
+//! Any failure (file has no page index, a column lacks an index range, a
+//! decode/IO error) makes the load return `None`. The caller keeps its
+//! footer-only metadata and the pruner conservatively no-ops (scans the whole
+//! row group) — never a wrong result.
+//!
+//! ## Upstream note
+//!
+//! arrow-rs is moving toward first-class selective metadata decoding
+//! (apache/arrow-rs#8643 open; the `ParquetStatisticsPolicy::skip_except` pattern
+//! merged in #8797 / #8714 for encoding stats). None yet expose a page-index
+//! column/row-group projection, so we hand-roll it with the deprecated
+//! [`read_columns_indexes`]/[`read_offset_indexes`] (the only public subset
+//! decoders). Migrate to `ParquetMetaDataOptions` when it grows a page-index knob.
+
+pub mod cache_store;
+pub mod cache_keys;
+pub mod page_index_io;
+pub mod column_schema_resolver;
+
+use cache_store::{BoundedCache, DEFAULT_SCOPED_CACHE_LIMIT};
+use cache_keys::{CiCellKey, OiCellKey, OiColumn};
+
+use crate::cache::eviction_policy::PolicyType;
+use datafusion::parquet::file::page_index::column_index::ColumnIndexMetaData;
+use once_cell::sync::Lazy;
+
+pub use cache_store::ScopedCacheStats;
+pub use page_index_io::{
+    load_scoped_page_index,
+    load_scoped_page_index_cols,
+    load_scoped_page_index_rgs,
+    load_page_index_fully_scoped,
+    surviving_row_groups,
+};
+pub use column_schema_resolver::{
+    resolve_predicate_parquet_columns,
+    resolve_predicate_parquet_columns_pair,
+};
+
+// Process-global caches
+
+pub(crate) static COLUMN_INDEX_CACHE: Lazy<BoundedCache<CiCellKey, ColumnIndexMetaData>> =
+    Lazy::new(|| BoundedCache::new(DEFAULT_SCOPED_CACHE_LIMIT, PolicyType::Lru));
+
+pub(crate) static OFFSET_INDEX_CACHE: Lazy<BoundedCache<OiCellKey, OiColumn>> =
+    Lazy::new(|| BoundedCache::new(DEFAULT_SCOPED_CACHE_LIMIT, PolicyType::Lru));
+
+/// Set the ColumnIndex cache's byte budget. Called from startup wiring with the
+/// configured limit. Idempotent; shrinking evicts immediately. Zero ignored.
+pub fn set_column_index_cache_limit(limit: usize) {
+    if let Some(budget) = std::num::NonZeroUsize::new(limit) {
+        COLUMN_INDEX_CACHE.set_limit(budget.get());
+    }
+}
+
+/// Set the OffsetIndex cache's byte budget. Called from startup wiring with the
+/// configured limit. Idempotent; shrinking evicts immediately. Zero ignored.
+pub fn set_offset_index_cache_limit(limit: usize) {
+    if let Some(budget) = std::num::NonZeroUsize::new(limit) {
+        OFFSET_INDEX_CACHE.set_limit(budget.get());
+    }
+}
+
+/// Counters + occupancy of the ColumnIndex (predicate-driven) cache; no policy lock taken.
+pub fn column_index_cache_stats() -> ScopedCacheStats {
+    COLUMN_INDEX_CACHE.stats()
+}
+
+/// Counters + occupancy of the OffsetIndex (projection-driven) cache; no policy lock taken.
+pub fn offset_index_cache_stats() -> ScopedCacheStats {
+    OFFSET_INDEX_CACHE.stats()
+}
+
+/// Drop all entries and reset the hit/miss/eviction counters in BOTH caches, keeping the
+/// byte budgets. For operational testing — reset and re-measure without a cluster restart.
+pub fn clear_scoped_cache() {
+    COLUMN_INDEX_CACHE.clear_keep_limit();
+    OFFSET_INDEX_CACHE.clear_keep_limit();
+}
+
+/// Evict all page-index cells for a specific file from both caches.
+///
+/// Called when a segment file is deleted or replaced: the page-index caches have no
+/// freshness check (unlike the metadata cache's `is_valid_for`), so stale cells would
+/// otherwise be served as hits. Other files whose path merely starts with `file_path` stay.
+pub fn evict_file_from_scoped_cache(file_path: &str) {
+    let prefix = format!("{file_path}:"); // cell keys render as `path:col[:rg]`
+    COLUMN_INDEX_CACHE.evict_by_prefix(&prefix);
+    OFFSET_INDEX_CACHE.evict_by_prefix(&prefix);
+}
+
+/// Crate-wide guard so every test that touches the process-global caches mutually
+/// excludes (distinct fixtures alone aren't enough — the `InMemory` path is always
+/// "data.parquet"). Shared (not per-module) so all cache users serialize.
+#[cfg(test)]
+pub(crate) static SCOPED_CACHE_TEST_GUARD: std::sync::Mutex<()> = std::sync::Mutex::new(());
+
+/// Clear both caches (entries + counters) AND restore the default limit on each.
+/// Test-only; [`clear_scoped_cache`] alone keeps whatever limits are currently set.
+#[cfg(test)]
+pub(crate) fn clear_scoped_cache_for_test() {
+    clear_scoped_cache();
+    COLUMN_INDEX_CACHE.set_limit(DEFAULT_SCOPED_CACHE_LIMIT);
+    OFFSET_INDEX_CACHE.set_limit(DEFAULT_SCOPED_CACHE_LIMIT);
+}
+
+#[cfg(test)]
+pub(crate) fn set_column_index_cache_limit_for_test(limit: usize) {
+    COLUMN_INDEX_CACHE.set_limit(limit); // no zero guard here, unlike the public setter
+}
+
+/// Combined view of both caches — test-only convenience for assertions that only
+/// need "is the scoped machinery doing anything". Counters and occupancy are summed;
+/// `limit_bytes` is the larger of the two limits. Production code reads each cache separately.
+#[cfg(test)]
+pub(crate) fn scoped_cache_stats() -> ScopedCacheStats {
+    let ci = COLUMN_INDEX_CACHE.stats();
+    let oi = OFFSET_INDEX_CACHE.stats();
+    ScopedCacheStats {
+        hits: ci.hits + oi.hits,
+        misses: ci.misses + oi.misses,
+        evictions: ci.evictions + oi.evictions,
+        entries: ci.entries + oi.entries,
+        used_bytes: ci.used_bytes + oi.used_bytes,
+        limit_bytes: ci.limit_bytes.max(oi.limit_bytes),
+    }
+}
diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/page_index_io.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/page_index_io.rs
new file mode 100644
index 0000000000000..e2fd3f8c7695b
--- /dev/null
+++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/page_index_io.rs
@@ -0,0 +1,1626 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+//! Page-index load entry points and their supporting internals.
+//!
+//! The four public `load_scoped_page_index*` functions are the only callers of
+//! the cache machinery; all decoding, cache lookup, and grafting happens here.
+
+use std::collections::{HashMap, HashSet};
+use std::mem;
+use std::ops::Range;
+use std::sync::Arc;
+
+use arrow::datatypes::SchemaRef;
+use datafusion::parquet::errors::{ParquetError, Result as ParquetResult};
+use datafusion::parquet::file::metadata::{
+    ColumnChunkMetaData, OffsetIndexBuilder, ParquetColumnIndex, ParquetMetaData,
+    ParquetOffsetIndex,
+};
+use datafusion::parquet::file::page_index::column_index::ColumnIndexMetaData;
+use datafusion::parquet::file::page_index::index_reader::{
+    read_columns_indexes, read_offset_indexes,
+};
+use datafusion::parquet::file::reader::{ChunkReader, Length};
+use object_store::ObjectStore;
+use parquet::file::page_index::offset_index::OffsetIndexMetaData;
+use prost::bytes::{buf, Buf, Bytes};
+
+use super::cache_keys::{CiCellKey, OiCellKey, OiColumn};
+use super::{COLUMN_INDEX_CACHE, OFFSET_INDEX_CACHE};
+
+/// Load + graft a scoped page index: ColumnIndex for `predicate_cols` (all RGs),
+/// OffsetIndex for all columns/all RGs (Step-1 baseline). Always `None` for empty `predicate_cols`.
+pub async fn load_scoped_page_index(
+    store: &Arc<dyn ObjectStore>,
+    location: &object_store::path::Path,
+    footer_meta: &Arc<ParquetMetaData>,
+    predicate_cols: &[usize],
+) -> Option<Arc<ParquetMetaData>> {
+    attach_scoped_page_index_to_metadata(store, location, footer_meta, predicate_cols, None, None).await
+}
+
+/// Like [`load_scoped_page_index`], but the ColumnIndex is built only for the row
+/// groups in `surviving_rgs` (footer-stats survivors — [`surviving_row_groups`]); other
+/// RGs get a `NONE` placeholder. OffsetIndex stays all-columns. `None` if no predicate cols.
+pub async fn load_scoped_page_index_rgs(
+    store: &Arc<dyn ObjectStore>,
+    location: &object_store::path::Path,
+    footer_meta: &Arc<ParquetMetaData>,
+    predicate_cols: &[usize],
+    surviving_rgs: &[usize],
+) -> Option<Arc<ParquetMetaData>> {
+    attach_scoped_page_index_to_metadata(store, location, footer_meta, predicate_cols, Some(surviving_rgs), None).await
+}
+
+/// Like [`load_scoped_page_index`], but the OffsetIndex is built only for
+/// `projection_cols` (the loader unions in the predicate columns + column 0
+/// defensively); other columns get an empty placeholder. ColumnIndex stays
+/// all-RG. OffsetIndex cells are cached per column (key type: `OiCellKey`).
+pub async fn load_scoped_page_index_cols(
+    store: &Arc<dyn ObjectStore>,
+    location: &object_store::path::Path,
+    footer_meta: &Arc<ParquetMetaData>,
+    predicate_cols: &[usize],
+    projection_cols: &[usize],
+) -> Option<Arc<ParquetMetaData>> {
+    attach_scoped_page_index_to_metadata(store, location, footer_meta, predicate_cols, None, Some(projection_cols)).await
+}
+
+/// Fully scoped load: the ColumnIndex is restricted to `surviving_rgs` and the
+/// OffsetIndex to `projection_cols` (∪ predicate ∪ {0}). The Step-2 target both scan
+/// paths call once they know their surviving-RG set and projection.
+pub async fn load_page_index_fully_scoped(
+    store: &Arc<dyn ObjectStore>,
+    location: &object_store::path::Path,
+    footer_meta: &Arc<ParquetMetaData>,
+    predicate_cols: &[usize],
+    surviving_rgs: &[usize],
+    projection_cols: &[usize],
+) -> Option<Arc<ParquetMetaData>> {
+    let scoped_load = attach_scoped_page_index_to_metadata(
+        store,
+        location,
+        footer_meta,
+        predicate_cols,
+        Some(surviving_rgs),
+        Some(projection_cols),
+    );
+    scoped_load.await
+}
+
+//  Surviving-RG computation (footer-stats prune; superset of DF's set) - NOT WIRED YET
+
+/// Compute the row groups that pass footer RG-statistics pruning for `predicate`.
+///
+/// A **superset** of the row groups DataFusion will scan (DataFusion applies the
+/// same footer-stats pruning plus bloom/range/limit, which only remove more), so
+/// scoping the predicate-column ColumnIndex to this set is safe. Returns all row
+/// groups if the predicate can't be lowered, pruning errors, or stats are missing.
+/// Deterministic in `(footer_meta, schema, predicate)` → both scan paths agree.
+pub fn surviving_row_groups(
+    footer_meta: &ParquetMetaData,
+    arrow_schema: &SchemaRef,
+    predicate: &Arc<dyn datafusion::physical_expr::PhysicalExpr>,
+) -> Vec<usize> {
+    use arrow::array::{ArrayRef, BooleanArray, UInt64Array};
+    use datafusion::parquet::arrow::arrow_reader::statistics::StatisticsConverter;
+    use datafusion::physical_optimizer::pruning::{PruningPredicate, PruningStatistics};
+    use datafusion::scalar::ScalarValue;
+    use std::collections::HashSet;
+
+    let num_rgs = footer_meta.num_row_groups();
+    let all: Vec<usize> = (0..num_rgs).collect();
+    if num_rgs == 0 {
+        return all; // no row groups: the empty list
+    }
+
+    let Ok(pp) = PruningPredicate::try_new(Arc::clone(predicate), Arc::clone(arrow_schema)) else {
+        return all; // predicate could not be turned into a pruning predicate: keep every RG
+    };
+
+    struct RgStats<'a> { // adapter exposing the footer's row-group statistics to the pruner
+        meta: &'a ParquetMetaData,
+        schema: &'a SchemaRef,
+        num_rgs: usize,
+    }
+    impl<'a> RgStats<'a> {
+        fn conv(&self, col: &str) -> Option<StatisticsConverter<'_>> { // `None` if `col` can't be resolved
+            StatisticsConverter::try_new(col, self.schema, self.meta.file_metadata().schema_descr())
+                .ok()
+        }
+    }
+    impl<'a> PruningStatistics for RgStats<'a> {
+        fn min_values(&self, column: &datafusion::common::Column) -> Option<ArrayRef> {
+            self.conv(&column.name)?
+                .row_group_mins(self.meta.row_groups().iter())
+                .ok()
+        }
+        fn max_values(&self, column: &datafusion::common::Column) -> Option<ArrayRef> {
+            self.conv(&column.name)?
+                .row_group_maxes(self.meta.row_groups().iter())
+                .ok()
+        }
+        fn num_containers(&self) -> usize {
+            self.num_rgs // one container per row group
+        }
+        fn null_counts(&self, column: &datafusion::common::Column) -> Option<ArrayRef> {
+            self.conv(&column.name)?
+                .row_group_null_counts(self.meta.row_groups().iter())
+                .ok()
+                .map(|a| Arc::new(a) as ArrayRef)
+        }
+        fn row_counts(&self) -> Option<ArrayRef> {
+            let counts: Vec<u64> =
+                self.meta.row_groups().iter().map(|rg| rg.num_rows() as u64).collect();
+            Some(Arc::new(UInt64Array::from(counts)) as ArrayRef)
+        }
+        fn contained(
+            &self,
+            _column: &datafusion::common::Column,
+            _values: &HashSet<ScalarValue>,
+        ) -> Option<BooleanArray> {
+            None // no set-membership information is offered
+        }
+    }
+
+    let stats = RgStats { meta: footer_meta, schema: arrow_schema, num_rgs };
+    match pp.prune(&stats) {
+        Ok(mask) => mask // keep the indices whose mask entry is `true`, in ascending order
+            .iter()
+            .enumerate()
+            .filter_map(|(i, keep)| if *keep { Some(i) } else { None })
+            .collect(),
+        Err(_) => all, // pruning failed: keep every RG
+    }
+}
+
+async fn attach_scoped_page_index_to_metadata(
+    store: &Arc<dyn ObjectStore>,
+    location: &object_store::path::Path,
+    footer_meta: &Arc<ParquetMetaData>,
+    predicate_cols: &[usize],
+    surviving_rgs: Option<&[usize]>,
+    projection_cols: Option<&[usize]>,
+) -> Option<Arc<ParquetMetaData>> {
+    // There are two independent reasons to build anything:
+    //   * predicate columns → a ColumnIndex, used for page pruning;
+    //   * an explicit, non-empty projection → an OffsetIndex, so the parquet
+    //     reader can fetch just the matched rows' pages, not whole column chunks.
+    // A match-only query has no predicate columns but does project columns, so it
+    // must not bail out here. `projection_cols == None` (all columns) with no
+    // predicate is the legacy "nothing requested" case and yields `None`.
+    let wants_offset_index = matches!(projection_cols, Some(cols) if !cols.is_empty());
+    if predicate_cols.is_empty() && !wants_offset_index {
+        return None;
+    }
+    // CI and OI are independent, so `tokio::join!` polls both futures together:
+    // the two `store.get_ranges` calls are in flight at the same time and the
+    // wall-clock cost is max(ci_latency, oi_latency) instead of their sum. The
+    // CPU setup before each `.await` still runs sequentially on this task.
+    let column_index_fut = async {
+        if predicate_cols.is_empty() {
+            // No predicate: nothing to graft as ColumnIndex, which is not a failure.
+            return Some(None);
+        }
+        let built =
+            get_or_build_column_index(store, location, footer_meta, predicate_cols, surviving_rgs).await?;
+        Some(Some(built))
+    };
+    let offset_index_fut =
+        get_or_build_offset_index(store, location, footer_meta, predicate_cols, projection_cols);
+    let (column_index, offset_index) = tokio::join!(column_index_fut, offset_index_fut);
+    Some(graft(footer_meta, column_index?, offset_index?))
+}
+
+/// Graft a page-index pair onto the footer: the result is a new
+/// `ParquetMetaData` equal to `footer_meta` plus `column_index` (which may be
+/// `None`) and `offset_index`.
+///
+/// The builder needs an owned value, so the footer is cloned out of its `Arc`.
+/// With `ParquetMetaData.row_groups` held behind an `Arc` (see the arrow-rs
+/// change) that clone is a refcount bump, not a deep copy of every row group's
+/// column-chunk metadata. The deep copy used to cost ~60ms/query on wide,
+/// many-row-group footers (e.g. textbench's ~403 cols, ~64 RGs).
+fn graft(
+    footer_meta: &Arc<ParquetMetaData>,
+    column_index: Option<ParquetColumnIndex>,
+    offset_index: ParquetOffsetIndex,
+) -> Arc<ParquetMetaData> {
+    // `into_builder` consumes its receiver, hence the owned copy of the footer.
+    let owned: ParquetMetaData = (**footer_meta).clone();
+    let with_column_index = owned.into_builder().set_column_index(column_index);
+    let rebuilt = with_column_index.set_offset_index(Some(offset_index)).build();
+    Arc::new(rebuilt)
+}
+
+// ── ColumnIndex cache lookup + build (per `(file, col, rg)` cell) ────────────
+
+/// Build the full-width `[rg][col]` `ColumnIndex` matrix for one file. Only the
+/// cells at `predicate_cols` × built row groups hold real data; every other cell
+/// stays `NONE`. Each needed `(file, col, rg)` cell is looked up in the cache
+/// first and only the misses are decoded (then cached).
+///
+/// `surviving_rgs == None` builds every RG; `Some(set)` restricts the built RGs
+/// to footer-stats survivors ([`surviving_row_groups`]). The cache key is only
+/// `(file, col, rg)`, so a cell is reusable across predicates and survivor sets.
+/// `None` (→ footer-only fallback) if there are no row groups, a column is out
+/// of bounds, nothing is left to build, or decoding a missing cell fails.
+async fn get_or_build_column_index(
+    store: &Arc<dyn ObjectStore>,
+    location: &object_store::path::Path,
+    footer_meta: &Arc<ParquetMetaData>,
+    predicate_cols: &[usize],
+    surviving_rgs: Option<&[usize]>,
+) -> Option<ParquetColumnIndex> {
+    let num_rgs = footer_meta.num_row_groups();
+    if num_rgs == 0 {
+        return None;
+    }
+    let num_cols = footer_meta.file_metadata().schema_descr().num_columns();
+
+    // Out-of-bounds columns are a caller bug: loud in debug, footer-only in release.
+    let cols_in_bounds = predicate_cols.iter().all(|&i| i < num_cols);
+    debug_assert!(
+        cols_in_bounds,
+        "predicate_cols contains out-of-bounds index (num_cols={num_cols}): {predicate_cols:?}"
+    );
+    if !cols_in_bounds {
+        return None;
+    }
+
+    // Row groups whose (heavy) predicate-column ColumnIndex gets built.
+    let build_rgs: Vec<usize> = if let Some(set) = surviving_rgs {
+        debug_assert!(
+            set.iter().all(|&r| r < num_rgs),
+            "surviving_rgs contains out-of-bounds index (num_rgs={num_rgs}): {set:?}"
+        );
+        set.iter().copied().filter(|&r| r < num_rgs).collect()
+    } else {
+        (0..num_rgs).collect()
+    };
+    if build_rgs.is_empty() {
+        // E.g. an empty survivor set: nothing to build → footer-only fallback.
+        return None;
+    }
+
+    let path: Arc<str> = Arc::from(location.as_ref());
+
+    // Start from an all-`NONE` matrix; real cells are written over it below.
+    let mut col_index_matrix: ParquetColumnIndex = (0..num_rgs)
+        .map(|_| std::iter::repeat_with(|| ColumnIndexMetaData::NONE).take(num_cols).collect())
+        .collect();
+
+    // Phase 1: fill every needed cell that is already cached; remember the misses.
+    let mut misses: Vec<(usize, usize)> = Vec::new(); // (col, rg)
+    for &rg in &build_rgs {
+        for &col in predicate_cols {
+            let key = CiCellKey { path: Arc::clone(&path), col, rg };
+            if let Some(cell) = COLUMN_INDEX_CACHE.get(&key) {
+                col_index_matrix[rg][col] = cell;
+            } else {
+                misses.push((col, rg));
+            }
+        }
+    }
+    if misses.is_empty() {
+        return Some(col_index_matrix);
+    }
+    // Phase 2: decode the misses (vectored fetch grouped by RG), cache, then place.
+    for CiCell { col, rg, data, size } in
+        build_column_index_cells(store, location, footer_meta, &misses).await?
+    {
+        debug_assert!(
+            rg < col_index_matrix.len() && col < col_index_matrix[rg].len(),
+            "cell ({}, {}) out of matrix bounds ({num_rgs} rgs, {num_cols} cols)",
+            col, rg,
+        );
+        COLUMN_INDEX_CACHE.insert(CiCellKey { path: Arc::clone(&path), col, rg }, data.clone(), size);
+        col_index_matrix[rg][col] = data;
+    }
+    Some(col_index_matrix)
+}
+
+struct RgPlan { // one row group's share of a ColumnIndex fetch
+    rg: usize, // row-group index into the footer
+    cols: Vec<usize>, // column indices requested within this row group
+    chunks: Vec<ColumnChunkMetaData>, // chunk metadata cloned from the footer, parallel to `cols`
+    range_start: u64, // start of the byte range fetched for this row group (the reader's `base`)
+}
+
+struct CiCell { // one decoded `(col, rg)` ColumnIndex cell
+    col: usize, // column index of the cell
+    rg: usize, // row-group index of the cell
+    data: ColumnIndexMetaData, // the decoded entry
+    size: usize, // footer-recorded `column_index_length` of the cell (0 if absent), passed to the cache
+}
+
+struct OiCell { // one column's decoded OffsetIndex across all row groups
+    col: usize, // column index of the cell
+    data: OiColumn, // per-row-group entries, in row-group order
+    size: usize, // sum of the column's footer-recorded `offset_index_length` over all row groups
+}
+
+/// Fetch and decode the requested `(col, rg)` ColumnIndex cells. Requests are
+/// bucketed by row group so each RG's columns share one fetched range and one decode.
+/// `None` (→ footer-only fallback) if a column lacks a range or a fetch/decode fails.
+async fn build_column_index_cells(
+    store: &Arc<dyn ObjectStore>,
+    location: &object_store::path::Path,
+    footer_meta: &Arc<ParquetMetaData>,
+    col_rg_matrix: &[(usize, usize)],
+) -> Option<Vec<CiCell>> {
+    let mut cols_by_rg: HashMap<usize, Vec<usize>> = HashMap::new();
+    for &(col, rg) in col_rg_matrix {
+        cols_by_rg.entry(rg).or_default().push(col);
+    }
+
+    let mut plans: Vec<RgPlan> = Vec::with_capacity(cols_by_rg.len());
+    let mut fetch_ranges: Vec<Range<u64>> = Vec::with_capacity(cols_by_rg.len());
+    for (rg, cols) in cols_by_rg {
+        let row_group = footer_meta.row_group(rg);
+        let chunks: Vec<ColumnChunkMetaData> = cols.iter().map(|&c| row_group.column(c).clone()).collect();
+        let span = column_index_union(&chunks)?;
+        plans.push(RgPlan { rg, cols, chunks, range_start: span.start });
+        fetch_ranges.push(span);
+    }
+
+    let buffers = store.get_ranges(location, &fetch_ranges).await.ok()?;
+    if buffers.len() != fetch_ranges.len() {
+        return None;
+    }
+
+    let mut cells: Vec<CiCell> = Vec::with_capacity(col_rg_matrix.len());
+    for (plan, bytes) in plans.into_iter().zip(buffers) {
+        let reader = BufferChunkReader { base: plan.range_start, bytes };
+        // Deprecated but the only PUBLIC column-subset decoder (arrow-rs#8643).
+        #[allow(deprecated)]
+        let decoded = read_columns_indexes(&reader, &plan.chunks).ok()??;
+        if decoded.len() != plan.cols.len() {
+            return None;
+        }
+        let row_group = footer_meta.row_group(plan.rg);
+        cells.extend(decoded.into_iter().zip(plan.cols.iter()).map(|(data, &col)| {
+            let size = row_group.column(col).column_index_length().unwrap_or(0).max(0) as usize;
+            CiCell { col, rg: plan.rg, data, size }
+        }));
+    }
+    Some(cells)
+}
+
+// ── OffsetIndex cache lookup + build (per `(file, col)` cell, all RGs) ───────
+
+/// Build the full-width `[rg][col]` `OffsetIndex` matrix for one file: real
+/// entries at the resolved offset columns, one-page placeholders everywhere
+/// else. Columns come from per-`(file, col)` cache cells; only misses are decoded.
+///
+/// The resolved offset-column set is `predicate ∪ projection ∪ {0}` (`projection_cols
+/// == None` → all columns); see [`OiCellKey`] for why each must be real. A cached
+/// cell holds one column's OffsetIndex across **all** row groups and is keyed
+/// only on `(file, col)`, so every query reading that column can reuse it,
+/// irrespective of projection or predicate.
+async fn get_or_build_offset_index(
+    store: &Arc<dyn ObjectStore>,
+    location: &object_store::path::Path,
+    footer_meta: &Arc<ParquetMetaData>,
+    predicate_cols: &[usize],
+    projection_cols: Option<&[usize]>,
+) -> Option<ParquetOffsetIndex> {
+    let num_rgs = footer_meta.num_row_groups();
+    if num_rgs == 0 {
+        return None;
+    }
+    let num_cols = footer_meta.file_metadata().schema_descr().num_columns();
+
+    // Columns that need a real OffsetIndex: predicate ∪ projection ∪ {0}, clamped
+    // to the schema width. No explicit projection means every column. Column 0
+    // is always included because the stats/metric path reads it.
+    let off_cols: Vec<usize> = if let Some(proj_cols) = projection_cols {
+        let set: HashSet<usize> = std::iter::once(0)
+            .chain(predicate_cols.iter().copied())
+            .chain(proj_cols.iter().copied())
+            .collect();
+        debug_assert!(
+            set.iter().all(|&c| c < num_cols),
+            "column index out of bounds (num_cols={num_cols}): {set:?}"
+        );
+        set.into_iter().filter(|&c| c < num_cols).collect()
+    } else {
+        (0..num_cols).collect()
+    };
+    if off_cols.is_empty() {
+        return None;
+    }
+
+    let path: Arc<str> = Arc::from(location.as_ref());
+
+    // Placeholder for the columns we do not build: ONE page spanning the whole
+    // row group, never an empty page-locations list. The scoped OffsetIndex is
+    // grafted as a full-width `[rg][col]` matrix, and its consumers (DataFusion's
+    // page pruner, arrow's reader, our indexed pruner) index it by absolute
+    // column and then dereference `page_locations` (`.last()`, `[0]`,
+    // `windows(2)`). An EMPTY placeholder would panic there
+    // (`page_locations.last().unwrap()` etc.) as soon as any path touches a
+    // column we scoped out, and that is hard to rule out across every query
+    // shape (count/agg, SingleCollector prefetch, schema-evolved files). A
+    // one-page placeholder is always safe to dereference and only makes pruning
+    // conservative (1 page = all rows → the RG cannot be pruned); it never
+    // yields a wrong result.
+    //
+    // The placeholder depends only on the RG's row count, so it is built ONCE
+    // per RG and cloned across the columns instead of being constructed
+    // `num_cols` times (each a heap alloc). On wide schemas (clickbench ~105
+    // cols) that construction is the bulk of the per-file warm scoped-load cost;
+    // only the few scoped columns are overwritten with real data afterwards,
+    // the rest stay as clones of the placeholder.
+    let mut matrix: ParquetOffsetIndex = footer_meta
+        .row_groups()
+        .iter()
+        .map(|rg| {
+            let mut builder = OffsetIndexBuilder::new();
+            builder.append_offset_and_size(0, 0);
+            builder.append_row_count(rg.num_rows());
+            vec![builder.build(); num_cols]
+        })
+        .collect();
+
+    // Phase 1: scatter the cached columns into the matrix; remember the misses.
+    let mut missing: Vec<usize> = Vec::new();
+    for &col in &off_cols {
+        let key = OiCellKey { path: Arc::clone(&path), col };
+        if let Some(column) = OFFSET_INDEX_CACHE.get(&key) {
+            scatter_offset_column(&mut matrix, col, &column);
+        } else {
+            missing.push(col);
+        }
+    }
+    if missing.is_empty() {
+        return Some(matrix);
+    }
+
+    // Phase 2: decode the missing columns (each spanning all RGs), publish them
+    // to the cache, then move them into the matrix.
+    for OiCell { col, data, size } in
+        build_offset_index_columns(store, location, footer_meta, &missing, num_rgs).await?
+    {
+        // The cache keeps the clone; the matrix takes the original by move.
+        OFFSET_INDEX_CACHE.insert(OiCellKey { path: Arc::clone(&path), col }, data.clone(), size);
+        scatter_offset_column_owned(&mut matrix, col, data);
+    }
+    Some(matrix)
+}
+
+/// Copy `column`'s per-RG OffsetIndex entries (indexed by RG) into column `col`
+/// of the matrix, cloning each entry.
+fn scatter_offset_column(matrix: &mut ParquetOffsetIndex, col: usize, column: &OiColumn) {
+    // `zip` stops at the shorter side, so entries past the last matrix row are ignored.
+    for (row, entry) in matrix.iter_mut().zip(column.iter()) {
+        row[col] = entry.clone();
+    }
+}
+
+/// Consuming variant of [`scatter_offset_column`]: used after the cache insert,
+/// so the per-RG entries are moved into the matrix rather than cloned.
+fn scatter_offset_column_owned(matrix: &mut ParquetOffsetIndex, col: usize, column: OiColumn) {
+    // Pair each matrix row with its entry; entries beyond the last row are dropped.
+    let rows = matrix.iter_mut();
+    for (row, entry) in rows.zip(column) {
+        row[col] = entry;
+    }
+}
+
+/// Fetch and decode the OffsetIndex of each requested column across **every**
+/// row group (read-time safety — see [`OiCellKey`]). `None` if any column lacks
+/// an offset-index range or a fetch/decode fails (→ footer-only fallback).
+async fn build_offset_index_columns(
+    store: &Arc<dyn ObjectStore>,
+    location: &object_store::path::Path,
+    footer_meta: &Arc<ParquetMetaData>,
+    cols: &[usize],
+    num_rgs: usize,
+) -> Option<Vec<OiCell>> {
+    struct RgPlan {
+        chunks: Vec<ColumnChunkMetaData>,
+        range_start: u64,
+    }
+    let mut plans: Vec<RgPlan> = Vec::with_capacity(num_rgs);
+    let mut fetch_ranges: Vec<Range<u64>> = Vec::with_capacity(num_rgs);
+    for rg_idx in 0..num_rgs {
+        let row_group = footer_meta.row_group(rg_idx);
+        let chunks: Vec<ColumnChunkMetaData> = cols.iter().map(|&c| row_group.column(c).clone()).collect();
+        let span = offset_index_union(&chunks)?;
+        plans.push(RgPlan { chunks, range_start: span.start });
+        fetch_ranges.push(span);
+    }
+
+    let buffers = store.get_ranges(location, &fetch_ranges).await.ok()?;
+    if buffers.len() != fetch_ranges.len() {
+        return None;
+    }
+
+    // `columns[k]` accumulates the entries of `cols[k]`, one push per row group.
+    let mut columns: Vec<OiColumn> = (0..cols.len()).map(|_| Vec::with_capacity(num_rgs)).collect();
+    for (plan, bytes) in plans.into_iter().zip(buffers) {
+        let reader = BufferChunkReader { base: plan.range_start, bytes };
+        #[allow(deprecated)]
+        let decoded = read_offset_indexes(&reader, &plan.chunks).ok()??;
+        if decoded.len() != cols.len() {
+            return None;
+        }
+        for (slot, entry) in columns.iter_mut().zip(decoded) {
+            slot.push(entry);
+        }
+    }
+
+    // `size` is the column's footer-recorded offset-index length summed over all RGs.
+    let cells = cols.iter().enumerate().map(|(k, &col)| {
+        let size: usize = footer_meta
+            .row_groups()
+            .iter()
+            .map(|rg| rg.column(col).offset_index_length().unwrap_or(0).max(0) as usize)
+            .sum();
+        OiCell { col, data: mem::take(&mut columns[k]), size }
+    }).collect();
+    Some(cells)
+}
+
+/// Smallest byte range covering the `column_index` of every given chunk. `None`
+/// if any chunk lacks a column index (all predicate columns must have one, else
+/// the caller falls back to footer-only) or records a negative offset/length.
+fn column_index_union(chunks: &[ColumnChunkMetaData]) -> Option<Range<u64>> {
+    range_union(chunks, |chunk| {
+        let (offset, length) = chunk.column_index_offset().zip(chunk.column_index_length())?;
+        let start = u64::try_from(offset).ok()?;
+        Some(start..start + u64::try_from(length).ok()?)
+    })
+}
+
+/// Smallest byte range covering the `offset_index` of every given chunk.
+fn offset_index_union(chunks: &[ColumnChunkMetaData]) -> Option<Range<u64>> {
+    range_union(chunks, |chunk| {
+        let (offset, length) = chunk.offset_index_offset().zip(chunk.offset_index_length())?;
+        let start = u64::try_from(offset).ok()?;
+        Some(start..start + u64::try_from(length).ok()?)
+    })
+}
+
+/// Fold the per-chunk ranges produced by `f` into their covering range. `None`
+/// if `chunks` is empty or `f` yields `None` for any chunk (caller falls back).
+fn range_union(
+    chunks: &[ColumnChunkMetaData],
+    f: impl Fn(&ColumnChunkMetaData) -> Option<Range<u64>>,
+) -> Option<Range<u64>> {
+    let mut ranges = chunks.iter().map(f);
+    // A missing range anywhere aborts the whole union via `?`.
+    let first = ranges.next()??;
+    ranges.try_fold(first, |acc, next| {
+        let next = next?;
+        Some(acc.start.min(next.start)..acc.end.max(next.end))
+    })
+}
+
+/// A [`ChunkReader`] over an in-memory byte buffer representing the file region
+/// `[base, base + bytes.len())`. The arrow-rs page-index readers call
+/// `get_bytes(absolute_offset, len)`; we translate into the buffer.
+struct BufferChunkReader {
+    base: u64, // absolute file offset of the first byte of `bytes`
+    bytes: Bytes, // the buffered file region `[base, base + bytes.len())`
+}
+
+impl Length for BufferChunkReader {
+    fn len(&self) -> u64 {
+        self.base + self.bytes.len() as u64 // absolute end offset of the buffered region
+    }
+}
+
+impl ChunkReader for BufferChunkReader {
+    type T = buf::Reader<Bytes>;
+
+    /// Reader over everything from absolute offset `start` to the buffer's end.
+    fn get_read(&self, start: u64) -> ParquetResult<Self::T> {
+        self.rel(start, 0).map(|rel| self.bytes.slice(rel..).reader())
+    }
+
+    /// The `length` bytes at absolute offset `start`, bounds-checked by `rel`.
+    fn get_bytes(&self, start: u64, length: usize) -> ParquetResult<Bytes> {
+        self.rel(start, length).map(|rel| self.bytes.slice(rel..rel + length))
+    }
+}
+
+impl BufferChunkReader {
+    /// Translate an absolute file offset `start` to a buffer-relative index.
+    /// `read_columns_indexes`/`read_offset_indexes` call `get_bytes` with absolute
+    /// file offsets (from chunk metadata) and `self.base` is the absolute start
+    /// of the fetched buffer, so `start - base` is the position in `self.bytes`.
+    fn rel(&self, start: u64, length: usize) -> ParquetResult<usize> {
+        let rel = start.checked_sub(self.base).ok_or_else(|| {
+            ParquetError::General(format!(
+                "page-index read offset {start} precedes buffer base {}",
+                self.base
+            ))
+        })?;
+        let rel = usize::try_from(rel)
+            .map_err(|e| ParquetError::General(format!("offset overflow: {e}")))?;
+        // Saturate so an overflowing end is rejected instead of wrapping (or panicking).
+        let end = rel.saturating_add(length);
+        if end > self.bytes.len() {
+            return Err(ParquetError::General(format!(
+                "page-index read [{rel}..{end}) exceeds buffer of len {}",
+                self.bytes.len()
+            )));
+        }
+        Ok(rel)
+    }
+}
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use super::super::{
+        clear_scoped_cache_for_test, column_index_cache_stats, offset_index_cache_stats,
+        scoped_cache_stats, set_column_index_cache_limit_for_test, ScopedCacheStats,
+        SCOPED_CACHE_TEST_GUARD,
+    };
+    use super::super::column_schema_resolver::{resolve_predicate_parquet_columns, resolve_predicate_parquet_columns_pair};
+    use crate::indexed_table::page_pruner::{build_pruning_predicate, PagePruner};
+    use arrow::array::{Int32Array, RecordBatch};
+    use arrow::datatypes::{DataType, Field, Schema};
+    use datafusion::common::ScalarValue;
+    use datafusion::logical_expr::Operator;
+    use datafusion::parquet::arrow::arrow_reader::{
+        ArrowReaderMetadata, ArrowReaderOptions, RowSelection, RowSelector,
+    };
+    use datafusion::parquet::arrow::ArrowWriter;
+    use datafusion::parquet::file::properties::{EnabledStatistics, WriterProperties};
+    use datafusion::physical_expr::expressions::{BinaryExpr, Column as PhysColumn, Literal};
+    use datafusion::physical_expr::PhysicalExpr;
+    use object_store::memory::InMemory;
+    use object_store::path::Path as ObjPath;
+    use object_store::{ObjectStoreExt, PutPayload};
+
+    use super::super::SCOPED_CACHE_TEST_GUARD as CACHE_TEST_GUARD;
+
+    // ── fixtures + expr helpers ──────────────────────────────────────────
+
+    /// Fixture: columns `price` (0..32) and `qty` (100..132), 32 rows in a single
+    /// row group, pages of 8 rows (4 pages), page-level statistics enabled.
+    fn two_col_parquet() -> (Bytes, SchemaRef) {
+        let fields = vec![
+            Field::new("price", DataType::Int32, false),
+            Field::new("qty", DataType::Int32, false),
+        ];
+        let schema: SchemaRef = Arc::new(Schema::new(fields));
+        let price = Int32Array::from((0..32).collect::<Vec<i32>>());
+        let qty = Int32Array::from((100..132).collect::<Vec<i32>>());
+        let batch =
+            RecordBatch::try_new(Arc::clone(&schema), vec![Arc::new(price), Arc::new(qty)]).unwrap();
+
+        let props = WriterProperties::builder()
+            .set_max_row_group_size(32)
+            .set_data_page_row_count_limit(8)
+            .set_write_batch_size(8)
+            .set_statistics_enabled(EnabledStatistics::Page)
+            .build();
+        let mut encoded: Vec<u8> = Vec::new();
+        let mut writer = ArrowWriter::try_new(&mut encoded, Arc::clone(&schema), Some(props)).unwrap();
+        writer.write(&batch).unwrap();
+        writer.close().unwrap();
+        (Bytes::from(encoded), schema)
+    }
+
+    /// Fixture: `id` = 0..40 and `v` = id*2, written as 4 row groups of 10 rows
+    /// with a 5-row page limit and page-level statistics enabled.
+    fn four_rg_parquet() -> (Bytes, SchemaRef) {
+        let fields = vec![
+            Field::new("id", DataType::Int32, false),
+            Field::new("v", DataType::Int32, false),
+        ];
+        let schema: SchemaRef = Arc::new(Schema::new(fields));
+        let id = Int32Array::from((0..40).collect::<Vec<i32>>());
+        let v = Int32Array::from((0..40).map(|x| x * 2).collect::<Vec<i32>>());
+        let batch =
+            RecordBatch::try_new(Arc::clone(&schema), vec![Arc::new(id), Arc::new(v)]).unwrap();
+
+        let props = WriterProperties::builder()
+            .set_max_row_group_size(10)
+            .set_data_page_row_count_limit(5)
+            .set_write_batch_size(5)
+            .set_statistics_enabled(EnabledStatistics::Page)
+            .build();
+        let mut encoded: Vec<u8> = Vec::new();
+        let mut writer = ArrowWriter::try_new(&mut encoded, Arc::clone(&schema), Some(props)).unwrap();
+        writer.write(&batch).unwrap();
+        writer.close().unwrap();
+        (Bytes::from(encoded), schema)
+    }
+
+    /// Fixture: 256 rows in one row group with two `Int32` columns (`n0`, `n1`)
+    /// and two padded `Utf8` columns (`s0`, `s1`); 32-row page limit.
+    fn wide4_parquet() -> (Bytes, SchemaRef) {
+        use arrow::array::StringArray;
+        const ROWS: i32 = 256;
+        let schema: SchemaRef = Arc::new(Schema::new(vec![
+            Field::new("n0", DataType::Int32, false),
+            Field::new("n1", DataType::Int32, false),
+            Field::new("s0", DataType::Utf8, false),
+            Field::new("s1", DataType::Utf8, false),
+        ]));
+        let ints = || Int32Array::from((0..ROWS).collect::<Vec<i32>>());
+        let s0: Vec<String> = (0..ROWS).map(|r| format!("s0_{r:05}_padpadpad")).collect();
+        let s1: Vec<String> = (0..ROWS).map(|r| format!("s1_{r:05}_padpadpad")).collect();
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![
+                Arc::new(ints()),
+                Arc::new(ints()),
+                Arc::new(StringArray::from(s0)),
+                Arc::new(StringArray::from(s1)),
+            ],
+        )
+        .unwrap();
+        let props = WriterProperties::builder()
+            .set_max_row_group_size(ROWS as usize)
+            .set_data_page_row_count_limit(32)
+            .set_write_batch_size(32)
+            .set_statistics_enabled(EnabledStatistics::Page)
+            .build();
+        let mut encoded: Vec<u8> = Vec::new();
+        let mut writer = ArrowWriter::try_new(&mut encoded, Arc::clone(&schema), Some(props)).unwrap();
+        writer.write(&batch).unwrap();
+        writer.close().unwrap();
+        (Bytes::from(encoded), schema)
+    }
+
+    async fn stage(bytes: Bytes) -> (Arc<dyn ObjectStore>, ObjPath) {
+        let location = ObjPath::from("data.parquet");
+        let store: Arc<dyn ObjectStore> = Arc::new(InMemory::new()); // fresh, test-local store
+        store.put(&location, PutPayload::from_bytes(bytes)).await.unwrap();
+        (store, location)
+    }
+
+    fn footer_only(bytes: &Bytes) -> Arc<ParquetMetaData> {
+        // `load` only borrows its reader, so `bytes` is passed as-is (no clone).
+        let options = ArrowReaderOptions::new().with_page_index(false);
+        let loaded = ArrowReaderMetadata::load(bytes, options).unwrap();
+        Arc::clone(loaded.metadata())
+    }
+
+    fn full_index(bytes: &Bytes) -> Arc<ParquetMetaData> {
+        // `load` only borrows its reader, so `bytes` is passed as-is (no clone).
+        let options = ArrowReaderOptions::new().with_page_index(true);
+        let loaded = ArrowReaderMetadata::load(bytes, options).unwrap();
+        Arc::clone(loaded.metadata())
+    }
+
+    fn col(name: &str, idx: usize) -> Arc<dyn PhysicalExpr> {
+        Arc::new(PhysColumn::new(name, idx)) // physical column reference `name` at index `idx`
+    }
+    fn lit_int(v: i32) -> Arc<dyn PhysicalExpr> {
+        Arc::new(Literal::new(ScalarValue::Int32(Some(v)))) // non-null Int32 literal
+    }
+    fn pred(name: &str, idx: usize, op: Operator, v: i32) -> Arc<dyn PhysicalExpr> {
+        Arc::new(BinaryExpr::new(col(name, idx), op, lit_int(v))) // the expression `<name> <op> <v>`
+    }
+    fn kept(sel: &RowSelection) -> usize {
+        sel.iter().map(|s| if s.skip { 0 } else { s.row_count }).sum() // rows in non-skip selectors
+    }
+    fn ci() -> ScopedCacheStats {
+        column_index_cache_stats() // shorthand for the ColumnIndex cache stats
+    }
+    fn oi() -> ScopedCacheStats {
+        offset_index_cache_stats() // shorthand for the OffsetIndex cache stats
+    }
+
+    /// Decode leaf column `leaf_col` of row group 0 using `meta` and `selection`,
+    /// returning the selected `Int32` values; any reader error becomes `Err(String)`.
+    fn read_selected_column(
+        bytes: &Bytes,
+        meta: &Arc<ParquetMetaData>,
+        leaf_col: usize,
+        selection: RowSelection,
+    ) -> std::result::Result<Vec<i32>, String> {
+        use datafusion::parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder;
+        use datafusion::parquet::arrow::ProjectionMask;
+
+        let arm = ArrowReaderMetadata::try_new(Arc::clone(meta), ArrowReaderOptions::new())
+            .map_err(|e| format!("try_new metadata: {e}"))?;
+        let builder = ParquetRecordBatchReaderBuilder::new_with_metadata(bytes.clone(), arm);
+        let proj = ProjectionMask::leaves(builder.parquet_schema(), [leaf_col]);
+        let reader = builder
+            .with_row_groups(vec![0])
+            .with_projection(proj)
+            .with_row_selection(selection)
+            .build()
+            .map_err(|e| format!("build reader: {e}"))?;
+        let mut out = Vec::new();
+        for next in reader {
+            let batch = next.map_err(|e| format!("read batch: {e}"))?;
+            let a = batch
+                .column(0)
+                .as_any()
+                .downcast_ref::<Int32Array>()
+                .ok_or("projected column was not Int32")?;
+            out.extend(a.values().iter().copied());
+        }
+        Ok(out)
+    }
+
+    // ── baseline / correctness ────────────────────────────────────────────
+
+    #[tokio::test]
+    async fn footer_only_has_no_page_index() {
+        // A footer loaded with `with_page_index(false)` carries neither index.
+        let meta = footer_only(&two_col_parquet().0);
+        assert!(meta.column_index().is_none());
+        assert!(meta.offset_index().is_none());
+    }
+
+    #[tokio::test]
+    async fn empty_column_set_returns_none() {
+        let _guard = CACHE_TEST_GUARD.lock().unwrap();
+        clear_scoped_cache_for_test();
+        let (bytes, _) = two_col_parquet();
+        let footer = footer_only(&bytes);
+        let (store, loc) = stage(bytes).await;
+        let loaded = load_scoped_page_index(&store, &loc, &footer, &[]).await;
+        assert!(loaded.is_none());
+        assert_eq!((ci().entries, oi().entries), (0, 0)); // and nothing was cached
+    }
+
+    #[tokio::test]
+    async fn scoped_index_is_predicate_scoped_for_column_index() {
+        let _guard = CACHE_TEST_GUARD.lock().unwrap();
+        clear_scoped_cache_for_test();
+        let (bytes, schema) = two_col_parquet();
+        let footer = footer_only(&bytes);
+        let (store, loc) = stage(bytes).await;
+
+        // `price` is the first column, so it resolves to parquet leaf 0.
+        let predicate = ["price".to_string()];
+        let cols = resolve_predicate_parquet_columns(&schema, &footer, &predicate);
+        assert_eq!(cols, vec![0]);
+
+        let aug = load_scoped_page_index(&store, &loc, &footer, &cols).await.unwrap();
+        let (ci_matrix, oi_matrix) = (aug.column_index().unwrap(), aug.offset_index().unwrap());
+        assert!(!matches!(ci_matrix[0][0], ColumnIndexMetaData::NONE), "predicate col has real CI");
+        assert!(matches!(ci_matrix[0][1], ColumnIndexMetaData::NONE), "non-predicate col CI is NONE");
+        // This loader entry point applies no projection, so every column gets a real entry.
+        let real_pages = |col: usize| !oi_matrix[0][col].page_locations().is_empty();
+        assert!(real_pages(0) && real_pages(1), "OffsetIndex real for every column (all-col default)");
+    }
+
+    // `resolve_predicate_parquet_columns_pair` must agree with two independent
+    // single-list resolves; it only shares the per-file arrow-schema derivation.
+    #[tokio::test]
+    async fn resolve_pair_equals_two_single_resolves() {
+        let (bytes, schema) = two_col_parquet();
+        let footer = footer_only(&bytes);
+        let names_a = vec!["price".to_string()];
+        let names_b = vec!["qty".to_string(), "price".to_string()];
+        let sorted = |mut cols: Vec<usize>| { cols.sort_unstable(); cols };
+        let single_a = sorted(resolve_predicate_parquet_columns(&schema, &footer, &names_a));
+        let single_b = sorted(resolve_predicate_parquet_columns(&schema, &footer, &names_b));
+        let (pair_a, pair_b) =
+            resolve_predicate_parquet_columns_pair(&schema, &footer, &names_a, &names_b);
+        let (pair_a, pair_b) = (sorted(pair_a), sorted(pair_b));
+        assert_eq!(pair_a, single_a, "pair predicate result must match single");
+        assert_eq!(pair_b, single_b, "pair projection result must match single");
+        assert_eq!(pair_a, vec![0]);
+        assert_eq!(pair_b, vec![0, 1]);
+    }
+
+    /// Regression: a match()-only query has NO residual predicate columns
+    /// (`parquet_cols` empty) but DOES project columns (`offset_cols` non-empty).
+    /// The scoped load must still build the OffsetIndex for the projected column,
+    /// so that the parquet reader fetches only matched-row pages, not whole chunks.
+    /// (Bug: the load short-circuited on empty `parquet_cols`, skipping the
+    /// OffsetIndex → reader over-read ~2.5× the bytes on `... | stats ... by URL`.)
+    #[tokio::test]
+    async fn projection_only_builds_offset_index_without_predicate() {
+        let _guard = CACHE_TEST_GUARD.lock().unwrap();
+        clear_scoped_cache_for_test();
+        let (bytes, _) = two_col_parquet(); // price = col 0, qty = col 1
+        let footer = footer_only(&bytes);
+        let (store, loc) = stage(bytes).await;
+
+        // Empty predicate set; the projection asks for `qty` (col 1) only.
+        let (parquet_cols, offset_cols): (Vec<usize>, Vec<usize>) = (vec![], vec![1]);
+        let aug = load_scoped_page_index_cols(&store, &loc, &footer, &parquet_cols, &offset_cols)
+            .await
+            .expect("projection-only load must produce grafted metadata (not None)");
+
+        // Without a predicate there is nothing to put in the ColumnIndex.
+        assert!(aug.column_index().is_none(), "no predicate → ColumnIndex absent");
+
+        // The OffsetIndex must be real for the projected col (1) and for col 0,
+        // which the loader always unions in; nothing else about the load changes.
+        let oi_matrix = aug.offset_index().expect("OffsetIndex must be grafted");
+        let has_pages = |col: usize| !oi_matrix[0][col].page_locations().is_empty();
+        assert!(has_pages(1), "projected col qty has real OffsetIndex");
+        assert!(has_pages(0), "col 0 OffsetIndex real (always unioned)");
+    }
+
+    #[tokio::test]
+    async fn scoped_pruning_matches_full_index() {
+        let _guard = CACHE_TEST_GUARD.lock().unwrap();
+        clear_scoped_cache_for_test();
+        let (bytes, schema) = two_col_parquet();
+        let (store, loc) = stage(bytes.clone()).await;
+        let fo = footer_only(&bytes);
+        let price_cols = resolve_predicate_parquet_columns(&schema, &fo, &["price".to_string()]);
+        let scoped_meta = load_scoped_page_index(&store, &loc, &fo, &price_cols).await.unwrap();
+        let price_ge_20 = pred("price", 0, Operator::GtEq, 20);
+        let pruning = build_pruning_predicate(&price_ge_20, schema.clone()).unwrap();
+        let scoped = PagePruner::new(&schema, scoped_meta).prune_rg(&pruning, 0, None);
+        let full = PagePruner::new(&schema, full_index(&bytes)).prune_rg(&pruning, 0, None);
+        assert_eq!(scoped.as_ref().map(kept), full.as_ref().map(kept));
+        assert_eq!(scoped.as_ref().map(kept), Some(16));
+    }
+
+    /// Schema-evolution fixture. The file's physical column order is
+    /// `[extra, price]`, which makes `price` parquet leaf **1**; it holds 32 rows
+    /// in 4 pages. Tests use it to prove that predicate→leaf resolution does NOT
+    /// depend on where a column sits in a wider *union* schema.
+    fn evolved_extra_price_parquet() -> Bytes {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("extra", DataType::Int32, false),
+            Field::new("price", DataType::Int32, false),
+        ]));
+        let extra_vals: Vec<i32> = (1000..1032).collect();
+        let price_vals: Vec<i32> = (0..32).collect();
+        let props = WriterProperties::builder()
+            .set_max_row_group_size(32)
+            .set_data_page_row_count_limit(8)
+            .set_write_batch_size(8)
+            .set_statistics_enabled(EnabledStatistics::Page)
+            .build();
+        let batch = RecordBatch::try_new(
+            Arc::clone(&schema),
+            vec![Arc::new(Int32Array::from(extra_vals)), Arc::new(Int32Array::from(price_vals))],
+        )
+        .unwrap();
+        let mut out: Vec<u8> = Vec::new();
+        let mut writer = ArrowWriter::try_new(&mut out, schema, Some(props)).unwrap();
+        writer.write(&batch).unwrap();
+        writer.close().unwrap();
+        Bytes::from(out)
+    }
+
+    /// Regression for the schema-evolution wrong-count bug. The query's (union)
+    /// schema may list `price` at a DIFFERENT position than the file's physical
+    /// layout; the predicate must still resolve to the file's TRUE leaf, and scoped
+    /// pruning must match the full-index pruning. Previously the resolver used the
+    /// union-schema position, scoped the page index at the wrong leaf, and the
+    /// residual mis-pruned → over-count.
+    #[tokio::test]
+    async fn scoped_resolution_is_per_file_under_schema_evolution() {
+        let _guard = CACHE_TEST_GUARD.lock().unwrap();
+        clear_scoped_cache_for_test();
+        let bytes = evolved_extra_price_parquet();
+        let (store, loc) = stage(bytes.clone()).await;
+        let fo = footer_only(&bytes);
+
+        // The UNION/table schema carried by the query puts `price` at position 0,
+        // whereas in THIS file `price` is physically leaf 1 (after `extra`).
+        let union_schema: SchemaRef = Arc::new(Schema::new(vec![
+            Field::new("price", DataType::Int32, false),
+            Field::new("qty", DataType::Int32, false),
+            Field::new("extra", DataType::Int32, false),
+        ]));
+
+        // Expect the file's TRUE leaf for `price` (1), NOT the union position 0
+        // (which is `extra` in this file).
+        let leaves = resolve_predicate_parquet_columns(&union_schema, &fo, &["price".to_string()]);
+        assert_eq!(leaves, vec![1], "price must resolve to its per-file leaf (1), not union pos 0");
+        let scoped_meta = load_scoped_page_index(&store, &loc, &fo, &leaves).await.unwrap();
+        // `price` pages: 0..8,8..16,16..24,24..32, so `price >= 20` keeps the last
+        // two pages (rows 16..32 = 16 rows). The pruning predicate is built against
+        // the FILE schema (price at index 1) so the converter matches the data.
+        let file_schema: SchemaRef = Arc::new(Schema::new(vec![
+            Field::new("extra", DataType::Int32, false),
+            Field::new("price", DataType::Int32, false),
+        ]));
+        let price_ge_20 = pred("price", 1, Operator::GtEq, 20);
+        let pruning = build_pruning_predicate(&price_ge_20, Arc::clone(&file_schema)).unwrap();
+        let scoped = PagePruner::new(&file_schema, scoped_meta).prune_rg(&pruning, 0, None);
+        let full = PagePruner::new(&file_schema, full_index(&bytes)).prune_rg(&pruning, 0, None);
+        let (s_kept, f_kept) = (scoped.as_ref().map(kept), full.as_ref().map(kept));
+        assert_eq!(s_kept, f_kept, "scoped pruning must match full index");
+        assert_eq!(s_kept, Some(16));
+        clear_scoped_cache_for_test();
+    }
+
+    /// Page pruning over a column-scoped index must be a SAFE SUPERSET of the
+    /// full-index pruning: it may never drop a row the full index would keep (that
+    /// would be an under-count / lost result). Keeping extra rows is fine, since the
+    /// residual mask drops them post-decode, so equality is NOT required and is the
+    /// wrong invariant. Note the check below compares kept-row COUNTS only.
+    ///
+    /// The hazard this guards: with the page index scoped to `price` only, `qty`
+    /// is a one-page non-panicking OffsetIndex placeholder with a `NONE`
+    /// ColumnIndex. If the pruner TRUSTED that placeholder's (absent) stats it
+    /// would build a bogus single-page grid for `qty` and could mis-prune. The
+    /// `page_pruner` fix treats a `NONE`-ColumnIndex column as "no usable stats"
+    /// (like a schema-evolution-absent column) → it contributes "unknown" and
+    /// never prunes on `qty`, so the scoped result stays a conservative superset.
+    #[tokio::test]
+    async fn scoped_pruning_is_safe_superset_with_placeholdered_residual_col() {
+        let _guard = CACHE_TEST_GUARD.lock().unwrap();
+        clear_scoped_cache_for_test();
+        // Fixture: 1 RG with 4 pages per column. price 0..32 (pages 0..8,8..16,
+        // 16..24,24..32); qty 100..132 (pages 100..108,108..116,116..124,124..132).
+        let (bytes, schema) = two_col_parquet();
+        let (store, loc) = stage(bytes.clone()).await;
+        let fo = footer_only(&bytes);
+
+        // Scope the page index to `price` ONLY, as the predicate-scoped indexed path
+        // would; `qty` then carries the one-page placeholder + NONE ColumnIndex.
+        let price_cols = resolve_predicate_parquet_columns(&schema, &fo, &["price".to_string()]);
+        assert_eq!(price_cols, vec![0]);
+        let scoped_meta = load_scoped_page_index(&store, &loc, &fo, &price_cols).await.unwrap();
+        let full_meta = full_index(&bytes);
+
+        // The residual touches BOTH columns: price >= 16 (full keeps pages 2,3) AND
+        // qty <= 115 (full keeps pages 0,1) → the full intersection prunes to 0 rows.
+        let on_price = pred("price", 0, Operator::GtEq, 16);
+        let on_qty = pred("qty", 1, Operator::LtEq, 115);
+        let residual: Arc<dyn PhysicalExpr> =
+            Arc::new(BinaryExpr::new(on_price, Operator::And, on_qty));
+        let pruning = build_pruning_predicate(&residual, schema.clone()).unwrap();
+
+        let scoped_sel = PagePruner::new(&schema, scoped_meta).prune_rg(&pruning, 0, None);
+        let full_sel = PagePruner::new(&schema, full_meta).prune_rg(&pruning, 0, None);
+        // Superset invariant: scoped must keep AT LEAST as many rows as full does.
+        // Here it keeps more (16 vs 0) because it correctly cannot prune on the
+        // placeholdered `qty`; that is safe, as the residual mask removes the
+        // extras post-decode. A scoped result SMALLER than full would be the real
+        // bug (lost rows). `None` = "kept everything" (no pruning) = the maximal
+        // superset, also safe, hence the `usize::MAX` stand-in below.
+        let scoped_kept = scoped_sel.as_ref().map_or(usize::MAX, kept);
+        let full_kept = full_sel.as_ref().map_or(usize::MAX, kept);
+        assert!(
+            scoped_kept >= full_kept,
+            "scoped page pruning must be a safe superset of full ({} kept) but kept fewer ({})",
+            full_kept, scoped_kept
+        );
+        clear_scoped_cache_for_test();
+    }
+
+    #[tokio::test]
+    async fn scoped_index_reads_non_predicate_projected_column() {
+        let _guard = CACHE_TEST_GUARD.lock().unwrap();
+        clear_scoped_cache_for_test();
+        let (bytes, schema) = two_col_parquet();
+        let (store, loc) = stage(bytes.clone()).await;
+        let fo = footer_only(&bytes);
+        let price_cols = resolve_predicate_parquet_columns(&schema, &fo, &["price".to_string()]);
+        let scoped = load_scoped_page_index(&store, &loc, &fo, &price_cols).await.unwrap();
+        let full = full_index(&bytes);
+        // Skip the first 16 rows and read the last 16 of `qty` (col 1) through each index.
+        let last_half = RowSelection::from(vec![RowSelector::skip(16), RowSelector::select(16)]);
+        let via_scoped = read_selected_column(&bytes, &scoped, 1, last_half.clone()).unwrap();
+        let via_full = read_selected_column(&bytes, &full, 1, last_half).unwrap();
+        assert_eq!(via_scoped, (116..132).collect::<Vec<i32>>());
+        assert_eq!(via_scoped, via_full);
+    }
+
+    // ── cache behavior: hits, independence, eviction ──────────────────────
+
+    /// A repeated, identical load is answered from BOTH caches: hits go up while
+    /// misses, entries and bytes stay unchanged. Cells: predicate `price` → 1 CI
+    /// cell `(col0,rg0)`; the default all-column OffsetIndex → 2 OI cells.
+    #[tokio::test]
+    async fn second_load_is_cache_hit() {
+        let _guard = CACHE_TEST_GUARD.lock().unwrap();
+        clear_scoped_cache_for_test();
+        let (bytes, schema) = two_col_parquet();
+        let fo = footer_only(&bytes);
+        let (store, loc) = stage(bytes).await;
+        let price_cols = resolve_predicate_parquet_columns(&schema, &fo, &["price".to_string()]);
+
+        let _ = load_scoped_page_index(&store, &loc, &fo, &price_cols).await.unwrap();
+        let (ci0, oi0) = (ci(), oi());
+        assert_eq!((ci0.hits, ci0.misses, ci0.entries), (0, 1, 1), "1 CI cell (price,rg0)");
+        assert_eq!((oi0.hits, oi0.misses, oi0.entries), (0, 2, 2), "2 OI cells (col0,col1)");
+        assert!(ci0.used_bytes > 0 && oi0.used_bytes > 0);
+
+        let _ = load_scoped_page_index(&store, &loc, &fo, &price_cols).await.unwrap();
+        let (ci1, oi1) = (ci(), oi());
+        assert_eq!((ci1.hits, ci1.misses, ci1.entries, ci1.used_bytes), (1, 1, 1, ci0.used_bytes));
+        assert_eq!((oi1.hits, oi1.misses, oi1.entries, oi1.used_bytes), (2, 2, 2, oi0.used_bytes));
+    }
+
+    /// Predicates on different columns get their own CI cells, yet the OffsetIndex
+    /// column cells are SHARED: both loads default to the all-column OffsetIndex, so
+    /// the second load finds the SAME 2 OI cells in cache and adds none. That is the
+    /// point of cell-keying — a column's index is stored once per file.
+    #[tokio::test]
+    async fn distinct_predicates_share_offset_index() {
+        let _guard = CACHE_TEST_GUARD.lock().unwrap();
+        clear_scoped_cache_for_test();
+        let (bytes, schema) = two_col_parquet();
+        let fo = footer_only(&bytes);
+        let (store, loc) = stage(bytes).await;
+        let price_cols = resolve_predicate_parquet_columns(&schema, &fo, &["price".to_string()]);
+        let qty_cols = resolve_predicate_parquet_columns(&schema, &fo, &["qty".to_string()]);
+
+        let _ = load_scoped_page_index(&store, &loc, &fo, &price_cols).await.unwrap();
+        let _ = load_scoped_page_index(&store, &loc, &fo, &qty_cols).await.unwrap();
+
+        let (ci_stats, oi_stats) = (ci(), oi());
+        assert_eq!(ci_stats.entries, 2, "distinct predicate cells: (price,rg0) + (qty,rg0)");
+        assert_eq!(oi_stats.entries, 2, "all-column OffsetIndex: 2 column cells, shared");
+        assert_eq!(oi_stats.hits, 2); // the qty load re-read the same 2 OI cells from cache
+    }
+
+    /// The cell-keying payoff: when a predicate ADDS a column, the cell decoded for
+    /// the earlier predicate is reused rather than re-decoded inside a new set-keyed
+    /// entry. Loading `price` and then `{price, qty}` makes `price`'s cell a HIT and
+    /// decodes only `qty`'s cell. (Under the old set-keyed cache this was a full miss
+    /// that re-decoded `price`.)
+    #[tokio::test]
+    async fn adding_predicate_column_reuses_existing_cell() {
+        let _guard = CACHE_TEST_GUARD.lock().unwrap();
+        clear_scoped_cache_for_test();
+        let (bytes, schema) = two_col_parquet();
+        let fo = footer_only(&bytes);
+        let (store, loc) = stage(bytes).await;
+        let price_only = resolve_predicate_parquet_columns(&schema, &fo, &["price".to_string()]);
+        let price_and_qty = resolve_predicate_parquet_columns(
+            &schema,
+            &fo,
+            &["price".to_string(), "qty".to_string()],
+        );
+
+        let _ = load_scoped_page_index(&store, &loc, &fo, &price_only).await.unwrap();
+        assert_eq!((ci().hits, ci().misses, ci().entries), (0, 1, 1), "price cell decoded");
+
+        // The predicate now spans {price, qty}: price's cell hits, qty's cell misses.
+        let _ = load_scoped_page_index(&store, &loc, &fo, &price_and_qty).await.unwrap();
+        assert_eq!(
+            (ci().hits, ci().misses, ci().entries),
+            (1, 2, 2),
+            "price cell reused (hit); only qty cell freshly decoded"
+        );
+        clear_scoped_cache_for_test();
+    }
+
+    /// Predicates on the SAME column that differ only in their literal resolve to
+    /// the same `(file, col)` parquet column and therefore share one CI cell: the
+    /// predicate *value* never multiplies cache entries. (`status>=400` vs `status>=100`.)
+    #[tokio::test]
+    async fn different_literals_same_column_share_cell() {
+        let _guard = CACHE_TEST_GUARD.lock().unwrap();
+        clear_scoped_cache_for_test();
+        let (bytes, schema) = two_col_parquet();
+        let fo = footer_only(&bytes);
+        let (store, loc) = stage(bytes).await;
+        // Both predicates target `price` (col 0); the literal is the only difference
+        // and never enters the cache key, so one resolved column set stands for both.
+        let price_cols = resolve_predicate_parquet_columns(&schema, &fo, &["price".to_string()]);
+
+        let _ = load_scoped_page_index(&store, &loc, &fo, &price_cols).await.unwrap();
+        let _ = load_scoped_page_index(&store, &loc, &fo, &price_cols).await.unwrap();
+        assert_eq!(ci().entries, 1, "same column → one cell regardless of literal");
+        assert_eq!(ci().hits, 1);
+        clear_scoped_cache_for_test();
+    }
+
+    /// CI hit/miss bookkeeping while alternating between two predicate-column sets.
+    #[tokio::test]
+    async fn stats_count_hits_and_misses() {
+        let _guard = CACHE_TEST_GUARD.lock().unwrap();
+        clear_scoped_cache_for_test();
+        let (bytes, schema) = two_col_parquet();
+        let fo = footer_only(&bytes);
+        let (store, loc) = stage(bytes).await;
+        let price_cols = resolve_predicate_parquet_columns(&schema, &fo, &["price".to_string()]);
+        let qty_cols = resolve_predicate_parquet_columns(&schema, &fo, &["qty".to_string()]);
+
+        let _ = load_scoped_page_index(&store, &loc, &fo, &price_cols).await.unwrap();
+        assert_eq!((ci().hits, ci().misses), (0, 1)); // cold price cell
+        let _ = load_scoped_page_index(&store, &loc, &fo, &price_cols).await.unwrap();
+        assert_eq!((ci().hits, ci().misses), (1, 1)); // price again → hit
+        let _ = load_scoped_page_index(&store, &loc, &fo, &qty_cols).await.unwrap();
+        assert_eq!((ci().hits, ci().misses), (1, 2)); // cold qty cell
+        let _ = load_scoped_page_index(&store, &loc, &fo, &price_cols).await.unwrap();
+        let _ = load_scoped_page_index(&store, &loc, &fo, &qty_cols).await.unwrap();
+        let totals = ci();
+        assert_eq!((totals.hits, totals.misses, totals.entries, totals.evictions), (3, 2, 2, 0));
+    }
+
+    /// Byte-bounded LRU on the (now cell-keyed) ColumnIndex cache. The budget is
+    /// sized to hold ~1.5 cells, so loading two distinct column cells evicts the
+    /// LRU one; the cache must stay within its limit, must not degrade to "cache
+    /// nothing", and must keep the most-recently-used cell.
+    #[tokio::test]
+    async fn lru_evicts_over_byte_budget() {
+        let _guard = CACHE_TEST_GUARD.lock().unwrap();
+        clear_scoped_cache_for_test();
+        let (bytes, schema) = two_col_parquet();
+        let fo = footer_only(&bytes);
+        let (store, loc) = stage(bytes).await;
+        let price_cols = resolve_predicate_parquet_columns(&schema, &fo, &["price".to_string()]);
+        let qty_cols = resolve_predicate_parquet_columns(&schema, &fo, &["qty".to_string()]);
+
+        // Size one CI cell first (predicate `price` = col0 at the single RG), then
+        // pick a budget of ~1.5 cells so a second distinct cell forces an eviction.
+        let _ = load_scoped_page_index(&store, &loc, &fo, &price_cols).await.unwrap();
+        let cell_bytes = ci().used_bytes;
+        assert!(cell_bytes > 0);
+        let budget = cell_bytes + cell_bytes / 2;
+        clear_scoped_cache_for_test();
+        set_column_index_cache_limit_for_test(budget);
+
+        let _ = load_scoped_page_index(&store, &loc, &fo, &price_cols).await.unwrap(); // cell (col0)
+        let _ = load_scoped_page_index(&store, &loc, &fo, &qty_cols).await.unwrap(); // cell (col1) → evicts col0
+
+        assert!(ci().used_bytes <= budget, "CI bytes {} must stay within {}", ci().used_bytes, budget);
+        assert_eq!(ci().entries, 1, "only the most-recent cell fits");
+        assert!(ci().evictions >= 1, "the LRU cell must have evicted");
+
+        // The most-recently-used cell (qty/col1) must still be served from cache.
+        let hits_so_far = ci().hits;
+        let _ = load_scoped_page_index(&store, &loc, &fo, &qty_cols).await.unwrap();
+        assert_eq!(ci().hits, hits_so_far + 1, "MRU cell must remain cached");
+        // NOTE(review): presumably this clear also restores the default limit — confirm.
+        clear_scoped_cache_for_test();
+    }
+
+    // ── Step 2: RG-scoping the ColumnIndex ────────────────────────────────
+
+    #[test] // synchronous on purpose: nothing in the body awaits
+    fn surviving_row_groups_matches_footer_stats_prune() {
+        let (bytes, schema) = four_rg_parquet();
+        let fo = footer_only(&bytes);
+        assert_eq!(fo.num_row_groups(), 4);
+        let id_ge_25 = pred("id", 0, Operator::GtEq, 25); // expected survivors: RGs 2 and 3
+        assert_eq!(surviving_row_groups(&fo, &schema, &id_ge_25), vec![2, 3]);
+        let id_lt_12 = pred("id", 0, Operator::Lt, 12); // expected survivors: RGs 0 and 1
+        assert_eq!(surviving_row_groups(&fo, &schema, &id_lt_12), vec![0, 1]);
+    }
+
+    /// Loading with survivors {2,3} must graft a real ColumnIndex only for those
+    /// row groups (NONE elsewhere) while the OffsetIndex stays real everywhere.
+    #[tokio::test]
+    async fn rg_scoped_load_builds_column_index_only_for_survivors() {
+        let _guard = CACHE_TEST_GUARD.lock().unwrap();
+        clear_scoped_cache_for_test();
+        let (bytes, schema) = four_rg_parquet();
+        let (store, loc) = stage(bytes.clone()).await;
+        let fo = footer_only(&bytes);
+        let cols = resolve_predicate_parquet_columns(&schema, &fo, &["id".to_string()]);
+        let surviving = vec![2usize, 3usize];
+
+        let meta = load_scoped_page_index_rgs(&store, &loc, &fo, &cols, &surviving).await.unwrap();
+        let cix = meta.column_index().unwrap();
+        let oix = meta.offset_index().unwrap();
+        assert_eq!(cix.len(), 4);
+        for rg in surviving.iter().copied() {
+            assert!(!matches!(cix[rg][0], ColumnIndexMetaData::NONE), "survivor RG {rg} real CI");
+        }
+        for rg in [0usize, 1usize] {
+            assert!(matches!(cix[rg][0], ColumnIndexMetaData::NONE), "pruned RG {rg} NONE CI");
+        }
+        for (rg, col) in (0..4usize).flat_map(|rg| (0..2usize).map(move |col| (rg, col))) {
+            assert!(!oix[rg][col].page_locations().is_empty(), "OI real for all rg/col");
+        }
+        let full = full_index(&bytes);
+        let pp = build_pruning_predicate(&pred("id", 0, Operator::GtEq, 25), schema.clone()).unwrap();
+        let scoped_sel = PagePruner::new(&schema, Arc::clone(&meta)).prune_rg(&pp, 2, None);
+        let full_sel = PagePruner::new(&schema, full).prune_rg(&pp, 2, None);
+        assert_eq!(scoped_sel.as_ref().map(kept), full_sel.as_ref().map(kept));
+        clear_scoped_cache_for_test();
+    }
+
+    #[tokio::test]
+    async fn rg_scoping_reduces_column_index_bytes() {
+        let _guard = CACHE_TEST_GUARD.lock().unwrap();
+        clear_scoped_cache_for_test();
+        let (bytes, schema) = four_rg_parquet();
+        let fo = footer_only(&bytes);
+        let (store, loc) = stage(bytes).await;
+        let id_cols = resolve_predicate_parquet_columns(&schema, &fo, &["id".to_string()]);
+
+        let _ = load_scoped_page_index(&store, &loc, &fo, &id_cols).await.unwrap();
+        let all_rg = ci().used_bytes; // baseline: load without RG scoping
+        clear_scoped_cache_for_test();
+        let _ = load_scoped_page_index_rgs(&store, &loc, &fo, &id_cols, &[2, 3]).await.unwrap();
+        assert!(ci().used_bytes < all_rg, "RG-scoped CI bytes {} < all-RG {}", ci().used_bytes, all_rg);
+        clear_scoped_cache_for_test();
+    }
+
+    /// CI cells are keyed per `(col, rg)`. Survivors {2,3} cache the cells (id,rg2)
+    /// and (id,rg3); loading the same survivor set again hits both, and a different
+    /// survivor set {0,1} adds two fresh cells. A column's per-RG index is thus
+    /// reused across survivor sets instead of being re-decoded per set.
+    #[tokio::test]
+    async fn rg_scoped_key_includes_surviving_rgs() {
+        let _guard = CACHE_TEST_GUARD.lock().unwrap();
+        clear_scoped_cache_for_test();
+        let (bytes, schema) = four_rg_parquet();
+        let fo = footer_only(&bytes);
+        let (store, loc) = stage(bytes).await;
+        let id_cols = resolve_predicate_parquet_columns(&schema, &fo, &["id".to_string()]);
+
+        let _ = load_scoped_page_index_rgs(&store, &loc, &fo, &id_cols, &[2, 3]).await.unwrap();
+        assert_eq!((ci().misses, ci().entries), (2, 2), "cells (id,rg2)+(id,rg3)");
+        let _ = load_scoped_page_index_rgs(&store, &loc, &fo, &id_cols, &[2, 3]).await.unwrap();
+        assert_eq!((ci().hits, ci().entries), (2, 2), "same survivors → both cells hit");
+        let _ = load_scoped_page_index_rgs(&store, &loc, &fo, &id_cols, &[0, 1]).await.unwrap();
+        assert_eq!((ci().misses, ci().entries), (4, 4), "new survivors → 2 fresh cells");
+        // The OffsetIndex stayed all-columns for all three loads → 2 shared column cells.
+        assert_eq!(oi().entries, 2);
+        clear_scoped_cache_for_test();
+    }
+
+    /// Survivor sets that partially overlap decode only the NEW row groups. After
+    /// loading survivors {2,3} (cells rg2,rg3), loading {1,2,3} hits rg2+rg3 and
+    /// freshly decodes rg1 alone. This shows RG-scoping reuses per-RG cells across
+    /// overlapping survivor sets rather than re-decoding the whole set.
+    #[tokio::test]
+    async fn overlapping_survivor_sets_decode_only_new_rgs() {
+        let _guard = CACHE_TEST_GUARD.lock().unwrap();
+        clear_scoped_cache_for_test();
+        let (bytes, schema) = four_rg_parquet();
+        let fo = footer_only(&bytes);
+        let (store, loc) = stage(bytes).await;
+        let id_cols = resolve_predicate_parquet_columns(&schema, &fo, &["id".to_string()]);
+
+        let _ = load_scoped_page_index_rgs(&store, &loc, &fo, &id_cols, &[2, 3]).await.unwrap();
+        assert_eq!((ci().hits, ci().misses, ci().entries), (0, 2, 2), "cells (id,rg2)+(id,rg3)");
+
+        // {1,2,3}: rg2 and rg3 are already cached (2 hits); rg1 is the only miss.
+        let _ = load_scoped_page_index_rgs(&store, &loc, &fo, &id_cols, &[1, 2, 3]).await.unwrap();
+        assert_eq!(
+            (ci().hits, ci().misses, ci().entries),
+            (2, 3, 3),
+            "rg2+rg3 reused (2 hits); only rg1 freshly decoded"
+        );
+        clear_scoped_cache_for_test();
+    }
+
+    /// Column-axis payoff on `wide4` (1 RG): a query whose predicate introduces a
+    /// new column decodes only the genuinely new `(col, rg)` cell and reuses every
+    /// cell an earlier query left behind. (With a single RG, the RG axis is not
+    /// exercised here.) Asserts CI cell-level hit/miss deltas after each load.
+    #[tokio::test]
+    async fn new_column_combination_caches_only_new_column_cells() {
+        let _guard = CACHE_TEST_GUARD.lock().unwrap();
+        clear_scoped_cache_for_test();
+        let (bytes, schema) = wide4_parquet(); // n0,n1,s0,s1 — 1 RG
+        let fo = footer_only(&bytes);
+        let (store, loc) = stage(bytes).await;
+        let n0_cols = resolve_predicate_parquet_columns(&schema, &fo, &["n0".to_string()]);
+        let n0_n1_cols = resolve_predicate_parquet_columns(
+            &schema,
+            &fo,
+            &["n0".to_string(), "n1".to_string()],
+        );
+        let n1_s0_cols = resolve_predicate_parquet_columns(
+            &schema,
+            &fo,
+            &["n1".to_string(), "s0".to_string()],
+        );
+
+        // {n0}: one cold cell.
+        let _ = load_scoped_page_index(&store, &loc, &fo, &n0_cols).await.unwrap();
+        assert_eq!((ci().hits, ci().misses, ci().entries), (0, 1, 1));
+        // {n0,n1}: n0 is a hit, n1 is new.
+        let _ = load_scoped_page_index(&store, &loc, &fo, &n0_n1_cols).await.unwrap();
+        assert_eq!((ci().hits, ci().misses, ci().entries), (1, 2, 2), "n0 reused; n1 new");
+        // {n1,s0}: n1 is a hit, s0 is new.
+        let _ = load_scoped_page_index(&store, &loc, &fo, &n1_s0_cols).await.unwrap();
+        assert_eq!((ci().hits, ci().misses, ci().entries), (2, 3, 3), "n1 reused; s0 new");
+        clear_scoped_cache_for_test();
+    }
+
+    /// OffsetIndex counterpart: changing the projection caches only the new column
+    /// cells. Projecting {s0} loads offset cols n1∪s0∪{0}; projecting {s1} next
+    /// loads n1∪s1∪{0}, so the shared cols (0, n1) hit and only the newly projected
+    /// column is decoded.
+    #[tokio::test]
+    async fn different_projections_cache_only_new_offset_columns() {
+        let _guard = CACHE_TEST_GUARD.lock().unwrap();
+        clear_scoped_cache_for_test();
+        let (bytes, schema) = wide4_parquet(); // n0=0,n1=1,s0=2,s1=3 — 1 RG
+        let fo = footer_only(&bytes);
+        let (store, loc) = stage(bytes).await;
+        let n1_cols = resolve_predicate_parquet_columns(&schema, &fo, &["n1".to_string()]);
+
+        // Projecting s0 (col 2): offset cols = {0, 1(n1), 2(s0)} → 3 cold cells.
+        let _ = load_scoped_page_index_cols(&store, &loc, &fo, &n1_cols, &[2]).await.unwrap();
+        assert_eq!((oi().hits, oi().misses, oi().entries), (0, 3, 3), "cols 0,1,2");
+
+        // Projecting s1 (col 3): offset cols = {0, 1, 3}. Cols 0 and 1 hit; col 3 is new.
+        let _ = load_scoped_page_index_cols(&store, &loc, &fo, &n1_cols, &[3]).await.unwrap();
+        assert_eq!(
+            (oi().hits, oi().misses, oi().entries),
+            (2, 4, 4),
+            "cols 0,1 reused (2 hits); only col 3 freshly decoded"
+        );
+        clear_scoped_cache_for_test();
+    }
+
+    // ── Step 2: column-scoping the OffsetIndex ────────────────────────────
+
+    #[tokio::test]
+    async fn col_scoped_offset_index_only_for_requested_columns() {
+        let _guard = CACHE_TEST_GUARD.lock().unwrap();
+        clear_scoped_cache_for_test();
+        let (bytes, schema) = wide4_parquet();
+        let (store, loc) = stage(bytes.clone()).await;
+        let fo = footer_only(&bytes);
+        let n1_cols = resolve_predicate_parquet_columns(&schema, &fo, &["n1".to_string()]);
+        assert_eq!(n1_cols, vec![1]);
+
+        let scoped = load_scoped_page_index_cols(&store, &loc, &fo, &n1_cols, &[2]).await.unwrap();
+        let o = scoped.offset_index().unwrap();
+        // wide4 has 256 rows / page-size 32 → real columns span several pages.
+        let full = full_index(&bytes);
+        let real_pages = full.offset_index().unwrap()[0][0].page_locations().len();
+        assert!(real_pages > 1, "fixture should have multi-page columns");
+        // The scoped columns (predicate 1 ∪ projection 2 ∪ metric 0) carry the REAL
+        // page index; every other column carries one whole-RG placeholder page (kept
+        // non-empty so a consumer dereference is safe — empty would panic).
+        for c in [0usize, 1, 2] {
+            assert_eq!(o[0][c].page_locations().len(), real_pages, "col {c} (pred/proj/metric) real OI");
+        }
+        assert_eq!(
+            o[0][3].page_locations().len(),
+            1,
+            "col 3 (scoped out) OI is a single-page placeholder, not real and not empty"
+        );
+    }
+
+    #[tokio::test]
+    async fn col_scoped_reads_projected_non_predicate_column() {
+        let _guard = CACHE_TEST_GUARD.lock().unwrap();
+        clear_scoped_cache_for_test();
+        let (bytes, schema) = two_col_parquet();
+        let (store, loc) = stage(bytes.clone()).await;
+        let fo = footer_only(&bytes);
+        let price_cols = resolve_predicate_parquet_columns(&schema, &fo, &["price".to_string()]);
+        let meta = load_scoped_page_index_cols(&store, &loc, &fo, &price_cols, &[1]).await.unwrap();
+        let full = full_index(&bytes);
+        // Skip the first 16 rows and read the last 16 of `qty` (col 1) through each index.
+        let last_half = RowSelection::from(vec![RowSelector::skip(16), RowSelector::select(16)]);
+        let via_scoped = read_selected_column(&bytes, &meta, 1, last_half.clone()).unwrap();
+        let via_full = read_selected_column(&bytes, &full, 1, last_half).unwrap();
+        assert_eq!(via_scoped, (116..132).collect::<Vec<i32>>());
+        assert_eq!(via_scoped, via_full);
+    }
+
+    #[tokio::test]
+    async fn col_scoping_reduces_offset_index_bytes() {
+        let _guard = CACHE_TEST_GUARD.lock().unwrap();
+        clear_scoped_cache_for_test();
+        let (bytes, schema) = wide4_parquet();
+        let fo = footer_only(&bytes);
+        let (store, loc) = stage(bytes).await;
+        let n1_cols = resolve_predicate_parquet_columns(&schema, &fo, &["n1".to_string()]);
+
+        let _ = load_scoped_page_index(&store, &loc, &fo, &n1_cols).await.unwrap();
+        let all_cols = oi().used_bytes; // baseline: load without column scoping
+        clear_scoped_cache_for_test();
+        let _ = load_scoped_page_index_cols(&store, &loc, &fo, &n1_cols, &[2]).await.unwrap();
+        assert!(oi().used_bytes < all_cols, "col-scoped OI {} < all-col {}", oi().used_bytes, all_cols);
+        clear_scoped_cache_for_test();
+    }
+
+    /// Cell-keying makes OffsetIndex reuse automatic: an all-columns load caches
+    /// per-column cells, and a later column-scoped load whose column set is already
+    /// covered simply hits them. No new entries appear and, despite this test's
+    /// name, no "collapse to all-columns sentinel" (the old set-keyed trick) is needed.
+    #[tokio::test]
+    async fn col_scoping_full_coverage_collapses_to_all_columns_entry() {
+        let _guard = CACHE_TEST_GUARD.lock().unwrap();
+        clear_scoped_cache_for_test();
+        let (bytes, schema) = two_col_parquet();
+        let fo = footer_only(&bytes);
+        let (store, loc) = stage(bytes).await;
+        let price_cols = resolve_predicate_parquet_columns(&schema, &fo, &["price".to_string()]);
+
+        let _ = load_scoped_page_index(&store, &loc, &fo, &price_cols).await.unwrap();
+        assert_eq!(oi().entries, 2, "all-columns load caches 2 column cells");
+        // Projecting {1} gives the union {0,1}: both columns are already cached → 2 hits.
+        let _ = load_scoped_page_index_cols(&store, &loc, &fo, &price_cols, &[1]).await.unwrap();
+        assert_eq!(oi().entries, 2, "covered columns reuse their cells, no new entries");
+        assert_eq!(oi().hits, 2);
+        clear_scoped_cache_for_test();
+    }
+
+    /// The fully-scoped entry point: CI RG-scoped + OI column-scoped together.
+    #[tokio::test]
+    async fn fully_scoped_load_combines_both_axes() {
+        let _g = CACHE_TEST_GUARD.lock().unwrap();
+        clear_scoped_cache_for_test();
+        let (bytes, schema) = four_rg_parquet();
+        let (store, loc) = stage(bytes.clone()).await;
+        let fo = footer_only(&bytes);
+        let cols = resolve_predicate_parquet_columns(&schema, &fo, &["id".to_string()]);
+
+        // CI scoped to RGs {2,3}; OI projection {1} plus the predicate column = both columns.
+        let aug = load_page_index_fully_scoped(&store, &loc, &fo, &cols, &[2, 3], &[1])
+            .await
+            .unwrap();
+        let c = aug.column_index().unwrap();
+        assert!(matches!(c[0][0], ColumnIndexMetaData::NONE), "RG0 pruned → NONE CI");
+        assert!(!matches!(c[2][0], ColumnIndexMetaData::NONE), "RG2 survivor → real CI");
+        // CI cells (id,rg2)+(id,rg3) = 2; OI cells (col0)+(col1) = 2.
+        assert_eq!(ci().entries, 2);
+        assert_eq!(oi().entries, 2);
+        clear_scoped_cache_for_test();
+    }
+
+    // ── Eviction on file deletion ─────────────────────────────────────────────
+
+    /// After a file is evicted, none of its CI or OI cells remain cached, so the
+    /// next load for the same path misses instead of serving a stale hit.
+    #[tokio::test]
+    async fn evict_file_clears_all_cells_for_that_path() {
+        let _serial = CACHE_TEST_GUARD.lock().unwrap();
+        clear_scoped_cache_for_test();
+        let (bytes, schema) = two_col_parquet();
+        let footer = footer_only(&bytes);
+        let (store, loc) = stage(bytes.clone()).await;
+        let pred_cols = resolve_predicate_parquet_columns(&schema, &footer, &["price".to_string()]);
+
+        // Populate the caches: one CI cell (price, rg0) and two OI cells (col0, col1).
+        let _ = load_scoped_page_index_cols(&store, &loc, &footer, &pred_cols, &[0, 1]).await.unwrap();
+        assert_eq!(ci().entries, 1);
+        assert_eq!(oi().entries, 2);
+        assert_eq!(ci().misses, 1);
+
+        // Drop every cell belonging to this path.
+        super::super::evict_file_from_scoped_cache(loc.as_ref());
+        assert_eq!(ci().entries, 0, "CI cells must be gone after eviction");
+        assert_eq!(oi().entries, 0, "OI cells must be gone after eviction");
+
+        // Loading again has to go back to the file rather than hit a stale cell.
+        let _ = load_scoped_page_index_cols(&store, &loc, &footer, &pred_cols, &[0, 1]).await.unwrap();
+        assert_eq!(ci().misses, 2, "second load after eviction must be a cache miss");
+        assert_eq!(ci().hits, 0, "no hits — eviction prevented serving stale data");
+
+        clear_scoped_cache_for_test();
+    }
+
+    /// Eviction is per-file: dropping file A's cells leaves file B's cells cached.
+    #[tokio::test]
+    async fn evict_file_does_not_affect_other_files() {
+        let _serial = CACHE_TEST_GUARD.lock().unwrap();
+        clear_scoped_cache_for_test();
+        let (bytes, schema) = two_col_parquet();
+        let pred_cols = resolve_predicate_parquet_columns(&schema, &footer_only(&bytes), &["price".to_string()]);
+
+        // The same bytes are written to two distinct paths of one in-memory store.
+        let store: Arc<dyn object_store::ObjectStore> = Arc::new(object_store::memory::InMemory::new());
+        let loc_a = object_store::path::Path::from("file_a.parquet");
+        let loc_b = object_store::path::Path::from("file_b.parquet");
+        store.put(&loc_a, object_store::PutPayload::from_bytes(bytes.clone())).await.unwrap();
+        store.put(&loc_b, object_store::PutPayload::from_bytes(bytes.clone())).await.unwrap();
+
+        let footer = footer_only(&bytes);
+        let _ = load_scoped_page_index_cols(&store, &loc_a, &footer, &pred_cols, &[0, 1]).await.unwrap();
+        let _ = load_scoped_page_index_cols(&store, &loc_b, &footer, &pred_cols, &[0, 1]).await.unwrap();
+        assert_eq!(ci().entries, 2, "one CI cell per file");
+        assert_eq!(oi().entries, 4, "two OI cells per file");
+
+        // Remove file_a's cells only.
+        super::super::evict_file_from_scoped_cache(loc_a.as_ref());
+        assert_eq!(ci().entries, 1, "only file_a's CI cell removed");
+        assert_eq!(oi().entries, 2, "only file_a's OI cells removed");
+
+        // A reload of file_b must be served from its surviving CI cell.
+        let ci_hits_before = ci().hits;
+        let _ = load_scoped_page_index_cols(&store, &loc_b, &footer, &pred_cols, &[0, 1]).await.unwrap();
+        assert_eq!(ci().hits, ci_hits_before + 1, "file_b CI cell must still be cached");
+
+        clear_scoped_cache_for_test();
+    }
+
+}
diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/statistics_cache.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/statistics_cache.rs
similarity index 100%
rename from sandbox/plugins/analytics-backend-datafusion/rust/src/statistics_cache.rs
rename to sandbox/plugins/analytics-backend-datafusion/rust/src/cache/statistics_cache.rs
diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/indexed_executor.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/indexed_executor.rs
index 09a691d7fbeda..5d882c7507754 100644
--- a/sandbox/plugins/analytics-backend-datafusion/rust/src/indexed_executor.rs
+++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/indexed_executor.rs
@@ -67,7 +67,7 @@ use crate::indexed_table::table_provider::{
     EvaluatorFactory, IndexedTableConfig, IndexedTableProvider, SegmentFileInfo,
 };
 
-use std::collections::{BTreeSet, HashMap};
+use std::collections::{HashMap, HashSet};
 use std::fmt;
 
 use crate::api::ShardView;
@@ -304,7 +304,7 @@ fn collect_predicate_column_indices(extraction: Option<&ExtractionResult>) -> Ve
     let Some(e) = extraction else { return vec![] };
     let mut exprs = Vec::new();
     collect_predicate_exprs(&e.tree, &mut exprs);
-    let mut indices = BTreeSet::new();
+    let mut indices = HashSet::new();
     for expr in &exprs {
         let _ = expr.apply(|node| {
             if let Some(col) = node.downcast_ref::<Column>() {
@@ -315,6 +315,45 @@ fn collect_predicate_column_indices(extraction: Option<&ExtractionResult>) -> Ve
     }
     indices.into_iter().collect()
 }
+
+/// Sorted, de-duplicated names of the `schema` fields that the extraction's predicates reference.
+fn collect_predicate_column_names(
+    extraction: Option<&ExtractionResult>,
+    schema: &SchemaRef,
+) -> Vec<String> {
+    let Some(e) = extraction else { return vec![] };
+    let mut exprs = Vec::new();
+    collect_predicate_exprs(&e.tree, &mut exprs);
+    let mut names = std::collections::BTreeSet::new(); // ordered ⇒ deterministic output
+    for expr in &exprs {
+        let _ = expr.apply(|node| {
+            let field = node.downcast_ref::<Column>().and_then(|c| schema.fields().get(c.index()));
+            if let Some(field) = field {
+                names.insert(field.name().to_string());
+            }
+            Ok(TreeNodeRecursion::Continue)
+        });
+    }
+    names.into_iter().collect()
+}
+
+/// Sorted, de-duplicated names of every column referenced by any expression in `plan`.
+fn collect_plan_column_names(plan: &datafusion::logical_expr::LogicalPlan) -> Vec<String> {
+    let mut names = std::collections::BTreeSet::new(); // ordered ⇒ deterministic output
+    let _ = plan.apply(|node| {
+        let _ = node.apply_expressions(|expr| {
+            let _ = expr.apply(|e| {
+                let Expr::Column(col) = e else { return Ok(TreeNodeRecursion::Continue) };
+                names.insert(col.name().to_string());
+                Ok(TreeNodeRecursion::Continue)
+            });
+            Ok(TreeNodeRecursion::Continue)
+        });
+        Ok(TreeNodeRecursion::Continue)
+    });
+    names.into_iter().collect()
+}
+
 /// For a tree classified as `SingleCollector`, walk it to find the single
 /// Collector leaf and return its query bytes.
 fn single_collector_id(tree: &BoolNode) -> Option<i32> {
@@ -924,6 +963,38 @@ async unsafe fn execute_indexed_with_context_inner(
 
     let predicate_columns = collect_predicate_column_indices(extraction.as_ref());
 
+    // Augment each segment's footer-only metadata with a scoped page index so
+    // the indexed PagePruner can page-prune. Both predicate (→ ColumnIndex) and
+    // projection (→ OffsetIndex) are wired — a match()-only query still needs a
+    // scoped OffsetIndex so the reader fetches only matched pages.
+    let predicate_column_names = collect_predicate_column_names(extraction.as_ref(), &schema);
+    let projection_column_names = collect_plan_column_names(&logical_plan);
+    if !predicate_column_names.is_empty() || !projection_column_names.is_empty() {
+        for segment in segments.iter_mut() {
+            let (parquet_cols, offset_cols) =
+                crate::parquet_page_cache::resolve_predicate_parquet_columns_pair(
+                    &schema,
+                    &segment.metadata,
+                    &predicate_column_names,
+                    &projection_column_names,
+                );
+            if parquet_cols.is_empty() && offset_cols.is_empty() {
+                continue;
+            }
+            if let Some(augmented) = crate::parquet_page_cache::load_scoped_page_index_cols(
+                &store,
+                &segment.object_path,
+                &segment.metadata,
+                &parquet_cols,
+                &offset_cols,
+            )
+            .await
+            {
+                segment.metadata = augmented;
+            }
+        }
+    }
+
     let factory: EvaluatorFactory = match classification {
         FilterClass::None => {
             // Predicate-only scan: page-pruned universe, residual applied in
diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/indexed_table/parquet_bridge.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/indexed_table/parquet_bridge.rs
index 5231187a694e2..65d7dab71ba02 100644
--- a/sandbox/plugins/analytics-backend-datafusion/rust/src/indexed_table/parquet_bridge.rs
+++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/indexed_table/parquet_bridge.rs
@@ -24,10 +24,13 @@ use std::time::{Duration, Instant};
 
 use datafusion::arrow::datatypes::SchemaRef;
 use datafusion::common::Result;
-use datafusion::datasource::physical_plan::parquet::metadata::DFParquetMetadata;
+use datafusion::datasource::physical_plan::parquet::metadata::CachedParquetMetaData;
 use datafusion::datasource::physical_plan::parquet::{
     ParquetAccessPlan, ParquetFileMetrics, ParquetFileReaderFactory, RowGroupAccess,
 };
+use datafusion::execution::cache::cache_manager::CachedFileMetadataEntry;
+use datafusion::parquet::arrow::async_reader::ParquetObjectReader;
+use datafusion::parquet::file::metadata::{PageIndexPolicy, ParquetMetaDataReader};
 use datafusion::datasource::physical_plan::ParquetSource;
 use datafusion::execution::cache::cache_manager::FileMetadataCache;
 use datafusion::execution::object_store::ObjectStoreUrl;
@@ -48,8 +51,11 @@ use prost::bytes::Bytes;
 
 // ── Parquet Metadata Loading ─────────────────────────────────────────
 
-/// Load parquet metadata via DataFusion's `DFParquetMetadata`, consulting the
-/// caller-supplied `FileMetadataCache`.
+/// Load footer-only parquet metadata, consulting the caller-supplied cache.
+///
+/// On a cache hit the cached (footer-only) metadata is returned with no IO beyond the `head`.
+/// On a cache miss we fetch with `PageIndexPolicy::Skip` — never fetching page
+/// index bytes — then store the footer in the cache for future hits.
 pub async fn load_parquet_metadata(
     store: Arc<dyn ObjectStore>,
     location: &object_store::path::Path,
@@ -58,18 +64,50 @@ pub async fn load_parquet_metadata(
     let meta = store
         .head(location)
         .await
-        .map_err(|e| format!("object-store head {}: {}", location, e))?;
+        .map_err(|e| format!("object-store head {location}: {e}"))?;
     let size = meta.size;
 
-    let pq_meta = DFParquetMetadata::new(&*store, &meta)
-        .with_file_metadata_cache(Some(metadata_cache))
-        .fetch_metadata()
-        .await
-        .map_err(|e| format!("load parquet metadata {}: {}", location, e))?;
+    // Cache hit — reuse the footer-only metadata; the `head` above is the only IO.
+    let pq_meta = if let Some(entry) = metadata_cache.get(location) {
+        if entry.is_valid_for(&meta) {
+            entry
+                .file_metadata
+                .as_any()
+                .downcast_ref::<CachedParquetMetaData>()
+                .map(|cached| Arc::clone(cached.parquet_metadata()))
+        } else {
+            None
+        }
+    } else {
+        None
+    };
+
+    // Cache miss — fetch footer only, no page index bytes.
+    let pq_meta = match pq_meta {
+        Some(m) => m,
+        None => {
+            let mut reader = ParquetObjectReader::new(Arc::clone(&store), location.clone());
+            let fetched = Arc::new(
+                ParquetMetaDataReader::new()
+                    .with_page_index_policy(PageIndexPolicy::Skip)
+                    .load_and_finish(&mut reader, size)
+                    .await
+                    .map_err(|e| format!("load parquet metadata {location}: {e}"))?,
+            );
+            metadata_cache.put(
+                location,
+                CachedFileMetadataEntry::new(
+                    meta,
+                    Arc::new(CachedParquetMetaData::new(Arc::clone(&fetched))),
+                ),
+            );
+            fetched
+        }
+    };
 
     let file_meta = pq_meta.file_metadata();
     let schema = parquet_to_arrow_schema(file_meta.schema_descr(), file_meta.key_value_metadata())
-        .map_err(|e| format!("parquet_to_arrow_schema {}: {}", location, e))?;
+        .map_err(|e| format!("parquet_to_arrow_schema {location}: {e}"))?;
 
     Ok((Arc::new(schema), size, pq_meta))
 }
diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/lib.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/lib.rs
index c1c853db2c04a..4eb3f603d6fd0 100644
--- a/sandbox/plugins/analytics-backend-datafusion/rust/src/lib.rs
+++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/lib.rs
@@ -25,9 +25,7 @@ pub mod api;
 pub mod cache;
 pub mod cancellation;
 pub mod cross_rt_stream;
-pub mod custom_cache_manager;
 pub mod datafusion_query_config;
-pub mod eviction_policy;
 pub mod executor;
 pub mod ffm;
 pub mod indexed_executor;
@@ -50,7 +48,6 @@ pub mod shard_table_provider;
 pub mod runtime_manager;
 pub mod schema_coerce;
 pub mod session_context;
-pub mod statistics_cache;
 pub mod udaf;
 pub mod udf;
 pub mod udwf;
@@ -58,3 +55,11 @@ pub mod native_node_stats;
 pub mod search_stats;
 pub mod stats;
 pub mod task_monitors;
+pub mod scoped_index_optimizer;
+pub mod scoped_page_index_reader;
+
+// Path aliases — old module names still resolve unchanged.
+pub use cache::statistics_cache;
+pub use cache::eviction_policy;
+pub use cache::custom_cache_manager;
+pub use cache::page_index as parquet_page_cache;
diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/scoped_index_optimizer.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/scoped_index_optimizer.rs
new file mode 100644
index 0000000000000..9da4e511f869e
--- /dev/null
+++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/scoped_index_optimizer.rs
@@ -0,0 +1,415 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+//! Physical optimizer rule that installs the scoped page-index reader factory on
+//! **every** parquet scan in the plan — provider-agnostic.
+//!
+//! # Why a rule (not a TableProvider)
+//!
+//! The scoped page-index loader is a property of *how we read parquet*, not of
+//! *which TableProvider* produced the scan. Wiring it into a specific provider
+//! leaves other scan paths on DataFusion's default reader, which loads the full
+//! all-column page index every query and caches none of it.
+//!
+//! This rule walks the physical plan, finds each parquet `DataSourceExec`, reads
+//! the predicate already pushed onto its `ParquetSource`, derives the predicate
+//! columns, and swaps in a [`ScopedPageIndexReaderFactory`] scoped to those
+//! columns. It runs after DataFusion's own optimizers (which is when filter
+//! pushdown has populated `ParquetSource::predicate`), so it works uniformly for
+//! `ListingTable`, `ShardTableProvider`, and any future parquet provider.
+//!
+//! # Replace, do NOT skip-if-present
+//!
+//! DataFusion's `ParquetFormat::create_physical_plan` ALWAYS pre-installs its own
+//! `CachedParquetFileReaderFactory` (the full all-column page-index loader). That
+//! is exactly the factory we want to replace, so this rule does not skip a scan
+//! just because a factory is already set. (A skip-if-present guard was the
+//! original bug that made the end-to-end listing scan never use the scoped
+//! reader.) The indexed path does not run this rule — it uses its own executor.
+//!
+//! # No-op cases (left exactly as DataFusion would run them)
+//!
+//!  - A `DataSourceExec` that isn't parquet — skipped.
+//!  - A parquet scan that has no predicate on file columns AND no narrowing
+//!    projection — skipped (nothing to scope; the opener loads on demand as today).
+
+use std::sync::Arc;
+
+use datafusion::common::config::ConfigOptions;
+use datafusion::common::tree_node::{Transformed, TreeNode};
+use datafusion::common::Result;
+use datafusion::datasource::physical_plan::{FileSource, ParquetSource};
+use datafusion::datasource::source::DataSourceExec;
+use datafusion::execution::cache::cache_manager::FileMetadataCache;
+use datafusion::physical_expr::utils::collect_columns;
+use datafusion::physical_optimizer::PhysicalOptimizerRule;
+use datafusion::physical_plan::ExecutionPlan;
+use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder};
+use object_store::ObjectStore;
+
+use crate::scoped_page_index_reader::ScopedPageIndexReaderFactory;
+
+/// Physical optimizer rule that installs the scoped page-index reader factory on
+/// parquet scans. A `PhysicalOptimizerRule` has no access to the session, so the
+/// rule carries these handles; the caller constructs it from the query's `RuntimeEnv`.
+#[derive(Debug)]
+pub struct ScopedPageIndexOptimizer {
+    /// Object store passed to every reader factory this rule installs.
+    store: Arc<dyn ObjectStore>,
+    /// Shared file-metadata cache passed to every reader factory this rule installs.
+    metadata_cache: Arc<dyn FileMetadataCache>,
+}
+
+impl ScopedPageIndexOptimizer {
+    pub fn new(store: Arc<dyn ObjectStore>, metadata_cache: Arc<dyn FileMetadataCache>) -> Self {
+        Self { store, metadata_cache }
+    }
+}
+
+impl PhysicalOptimizerRule for ScopedPageIndexOptimizer {
+    /// Rewrites each parquet `DataSourceExec` that has predicate columns or a narrowing
+    /// projection to read through a `ScopedPageIndexReaderFactory`; other nodes are untouched.
+    fn optimize(
+        &self,
+        plan: Arc<dyn ExecutionPlan>,
+        _config: &ConfigOptions,
+    ) -> Result<Arc<dyn ExecutionPlan>> {
+        let rewritten = plan.transform_up(|node| {
+            let Some(dse) = node.downcast_ref::<DataSourceExec>() else {
+                return Ok(Transformed::no(node));
+            };
+            let Some(config) = dse.data_source().as_ref().downcast_ref::<FileScanConfig>() else {
+                return Ok(Transformed::no(node));
+            };
+            let Some(parquet) = (config.file_source().as_ref() as &dyn std::any::Any)
+                .downcast_ref::<ParquetSource>()
+            else {
+                return Ok(Transformed::no(node));
+            };
+
+            let file_schema = config.file_schema();
+            let predicate = parquet.filter();
+
+            // Predicate column NAMES — empty when there's no pushed-down filter.
+            let predicate_names: Vec<String> = predicate
+                .as_ref()
+                .map(|p| {
+                    let mut names: Vec<String> = collect_columns(p)
+                        .into_iter()
+                        .map(|c| c.name().to_string())
+                        .filter(|n| file_schema.index_of(n).is_ok())
+                        .collect();
+                    names.sort();
+                    names.dedup();
+                    names
+                })
+                .unwrap_or_default();
+
+            // Projected column NAMES — the file columns this scan reads, per `projected_schema()`.
+            let projection_names: Vec<String> = match config.projected_schema() {
+                Ok(ps) => ps
+                    .fields()
+                    .iter()
+                    .map(|f| f.name().to_string())
+                    .filter(|n| file_schema.index_of(n).is_ok())
+                    .collect(),
+                Err(_) => Vec::new(),
+            };
+
+            // Only scope when there's something to scope to. "Projected" means a NON-EMPTY
+            // strict subset of the file columns: with no pushed projection the list is the
+            // full schema, and an empty list would be read as "all columns" further down.
+            let is_projected = !projection_names.is_empty()
+                && projection_names.len() < file_schema.fields().len();
+            if predicate_names.is_empty() && !is_projected {
+                return Ok(Transformed::no(node));
+            }
+            // Pass empty projection when the scan reads all columns — the factory
+            // will build all-column OffsetIndex (existing behavior).
+            let projection_names = if is_projected { projection_names } else { Vec::new() };
+
+            // Build the scoped factory and reinstall the source. The predicate is retained for
+            // parity, not RG scoping: Step 1 builds an all-row-group, column-scoped page index.
+            let factory = Arc::new(ScopedPageIndexReaderFactory::new(
+                Arc::clone(&self.store),
+                Arc::clone(&self.metadata_cache),
+                predicate_names,
+                projection_names,
+                predicate,
+                Arc::clone(file_schema),
+            ));
+            let new_source = parquet.clone().with_parquet_file_reader_factory(factory);
+            let new_config = FileScanConfigBuilder::from(config.clone())
+                .with_source(Arc::new(new_source))
+                .build();
+            let new_dse: Arc<dyn ExecutionPlan> = DataSourceExec::from_data_source(new_config);
+            Ok(Transformed::yes(new_dse))
+        })?;
+        Ok(rewritten.data)
+    }
+
+    fn name(&self) -> &str {
+        "ScopedPageIndexOptimizer"
+    }
+
+    /// Schema check stays on: only the reader factory is swapped, so the schema cannot change.
+    fn schema_check(&self) -> bool {
+        true
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::datatypes::{DataType, Field, Schema};
+    use datafusion::execution::cache::DefaultFilesMetadataCache;
+    use datafusion::execution::object_store::ObjectStoreUrl;
+    use datafusion::logical_expr::Operator;
+    use datafusion::physical_expr::expressions::{lit, BinaryExpr, Column};
+    use datafusion::physical_expr::PhysicalExpr;
+    use datafusion_datasource::table_schema::TableSchema;
+    use object_store::memory::InMemory;
+    use crate::cache::page_index;
+    use crate::parquet_page_cache::{clear_scoped_cache_for_test, scoped_cache_stats};
+
+    /// Two-column test schema: non-nullable `Int32` fields `a` and `b`.
+    fn schema() -> Arc<Schema> {
+        let int32 = |name: &str| Field::new(name, DataType::Int32, false);
+        let fields = vec![int32("a"), int32("b")];
+        Arc::new(Schema::new(fields))
+    }
+
+    /// Fresh in-memory object store plus a 64 MiB default file-metadata cache.
+    fn deps() -> (Arc<dyn ObjectStore>, Arc<dyn FileMetadataCache>) {
+        let store: Arc<dyn ObjectStore> = Arc::new(InMemory::new());
+        let cache: Arc<dyn FileMetadataCache> = Arc::new(DefaultFilesMetadataCache::new(64 * 1024 * 1024));
+        (store, cache)
+    }
+
+    /// Wraps `parquet` in a `DataSourceExec` over the local-filesystem store URL.
+    fn datasource_exec(parquet: ParquetSource) -> Arc<dyn ExecutionPlan> {
+        let url = ObjectStoreUrl::local_filesystem();
+        DataSourceExec::from_data_source(FileScanConfigBuilder::new(url, Arc::new(parquet)).build())
+    }
+
+    /// Physical predicate `a > 5`, with `a` bound to column index 0.
+    fn predicate_on_a() -> Arc<dyn PhysicalExpr> {
+        let column_a: Arc<dyn PhysicalExpr> = Arc::new(Column::new("a", 0));
+        let five = lit(5i32);
+        let greater_than = BinaryExpr::new(column_a, Operator::Gt, five);
+        Arc::new(greater_than)
+    }
+
+    fn parquet_for(sch: &Arc<Schema>) -> ParquetSource {
+        ParquetSource::new(TableSchema::new(Arc::clone(sch), vec![]))
+    }
+
+    /// Whether the parquet scan has a reader factory installed; `None` if `plan` is not one.
+    fn get_factory(plan: &Arc<dyn ExecutionPlan>) -> Option<bool> {
+        let exec = plan.downcast_ref::<DataSourceExec>()?;
+        let data_source = exec.data_source().as_ref() as &dyn std::any::Any;
+        let scan = data_source.downcast_ref::<FileScanConfig>()?;
+        let file_source = scan.file_source().as_ref() as &dyn std::any::Any;
+        file_source.downcast_ref::<ParquetSource>().map(|p| p.parquet_file_reader_factory().is_some())
+    }
+
+    /// A scan carrying a pushed-down predicate must come out with a reader factory installed.
+    #[test]
+    fn installs_factory_when_predicate_present() {
+        let (store, cache) = deps();
+        let plan = datasource_exec(parquet_for(&schema()).with_predicate(predicate_on_a()));
+        assert_eq!(get_factory(&plan), Some(false), "precondition: no factory yet");
+
+        let optimized = ScopedPageIndexOptimizer::new(store, cache)
+            .optimize(plan, &ConfigOptions::default())
+            .unwrap();
+        assert_eq!(
+            get_factory(&optimized),
+            Some(true),
+            "optimizer must install a scoped reader factory when a predicate is present"
+        );
+    }
+
+    #[test]
+    fn noop_without_predicate_or_projection() {
+        let (store, cache) = deps();
+        // Bare scan: neither a predicate nor a pushed projection, so nothing to scope.
+        let plan = datasource_exec(parquet_for(&schema()));
+        let optimized = ScopedPageIndexOptimizer::new(store, cache)
+            .optimize(plan, &ConfigOptions::default())
+            .unwrap();
+        assert_eq!(
+            get_factory(&optimized),
+            Some(false),
+            "no predicate and no projection → nothing to scope → no factory installed"
+        );
+    }
+
+    /// A projection-only scan (no predicate pushed down) still needs a scoped
+    /// OffsetIndex so the parquet reader fetches only matched-row pages instead
+    /// of whole column chunks. The factory must be installed from the projected
+    /// schema even when `parquet.filter()` is `None`.
+    #[test]
+    fn installs_factory_for_projection_only_scan() {
+        let sch = schema(); // fields: a(0), b(1)
+        let (store, cache) = deps();
+
+        // Project only column `a` — no predicate, so the projection is the only
+        // thing the rule can scope on.
+        let parquet = parquet_for(&sch);
+        let config = FileScanConfigBuilder::new(
+            ObjectStoreUrl::local_filesystem(),
+            Arc::new(parquet),
+        )
+        .with_projection(Some(vec![0])) // project `a` only
+        .build();
+        let plan: Arc<dyn ExecutionPlan> = DataSourceExec::from_data_source(config);
+        assert_eq!(get_factory(&plan), Some(false), "precondition: no factory yet");
+
+        // Optimizing must install the factory from the projected schema alone.
+        let rule = ScopedPageIndexOptimizer::new(store, cache);
+        let out = rule.optimize(plan, &ConfigOptions::default()).unwrap();
+        assert_eq!(
+            get_factory(&out),
+            Some(true),
+            "projection-only scan must get a scoped factory for OffsetIndex scoping"
+        );
+    }
+
+    /// The rule rewrites the scan even when a factory is already installed (DataFusion's
+    /// `ParquetFormat` always pre-installs one); a scoped factory stands in for it here.
+    #[test]
+    fn replaces_existing_default_factory() {
+        let sch = schema();
+        let (store, cache) = deps();
+        let pre = Arc::new(ScopedPageIndexReaderFactory::new(
+            Arc::clone(&store),
+            Arc::clone(&cache),
+            vec!["a".to_string()],
+            vec!["a".to_string()],
+            None,
+            sch.clone(),
+        ));
+        let parquet = parquet_for(&sch)
+            .with_predicate(predicate_on_a())
+            .with_parquet_file_reader_factory(pre);
+        let plan = datasource_exec(parquet);
+        assert_eq!(get_factory(&plan), Some(true), "precondition: a factory is present");
+        // Optimize a clone of the Arc so the original plan stays available for comparison.
+        let rule = ScopedPageIndexOptimizer::new(store, cache);
+        let out = rule.optimize(Arc::clone(&plan), &ConfigOptions::default()).unwrap();
+        assert_eq!(get_factory(&out), Some(true), "scoped factory present after rule");
+        assert!(
+            !Arc::ptr_eq(&plan, &out),
+            "rule must rewrite the scan to install the scoped factory, replacing the default"
+        );
+    }
+
+    /// End-to-end through a real `SessionContext` + stock `ListingTable`: write a
+    /// parquet file, register it, plan `SELECT s0 WHERE n1 >= k`, apply the rule,
+    /// execute, and assert (a) results are correct and (b) the shared scoped
+    /// page-index cache filled — proving the rule installs a working scoped reader
+    /// on the vanilla listing path.
+    #[tokio::test]
+    async fn end_to_end_listing_scan_fills_scoped_cache() {
+        use arrow::array::{Int32Array, StringArray};
+        use arrow::record_batch::RecordBatch;
+        use datafusion::datasource::file_format::parquet::ParquetFormat;
+        use datafusion::datasource::listing::{
+            ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl,
+        };
+        use datafusion::parquet::arrow::ArrowWriter;
+        use datafusion::parquet::file::properties::{EnabledStatistics, WriterProperties};
+        use datafusion::prelude::SessionContext;
+        use futures::StreamExt;
+
+        // Serialize on the shared guard — this asserts on the global cache.
+        let _g = page_index::SCOPED_CACHE_TEST_GUARD
+            .lock()
+            .unwrap();
+        crate::cache::page_index::clear_scoped_cache_for_test();
+        // Four columns: two Int32 (n0, n1) and two Utf8 (s0, s1), 4096 rows each.
+        let sch = Arc::new(Schema::new(vec![
+            Field::new("n0", DataType::Int32, false),
+            Field::new("n1", DataType::Int32, false),
+            Field::new("s0", DataType::Utf8, false),
+            Field::new("s1", DataType::Utf8, false),
+        ]));
+        const ROWS: i32 = 4096;
+        let n0: Vec<i32> = (0..ROWS).collect();
+        let n1: Vec<i32> = (0..ROWS).collect();
+        let s0: Vec<String> = (0..ROWS).map(|r| format!("s0_{r:06}_padding_padding")).collect();
+        let s1: Vec<String> = (0..ROWS).map(|r| format!("s1_{r:06}_padding_padding")).collect();
+        let batch = RecordBatch::try_new(
+            sch.clone(),
+            vec![
+                Arc::new(Int32Array::from(n0)),
+                Arc::new(Int32Array::from(n1)),
+                Arc::new(StringArray::from(s0)),
+                Arc::new(StringArray::from(s1)),
+            ],
+        )
+        .unwrap();
+        // Write one parquet file with page-level statistics and 256-row data pages.
+        let dir = std::env::temp_dir().join(format!("scoped_e2e_{}", std::process::id()));
+        let _ = std::fs::create_dir_all(&dir);
+        let file_path = dir.join("data.parquet");
+        {
+            let props = WriterProperties::builder()
+                .set_data_page_row_count_limit(256)
+                .set_write_batch_size(256)
+                .set_statistics_enabled(EnabledStatistics::Page)
+                .build();
+            let f = std::fs::File::create(&file_path).unwrap();
+            let mut w = ArrowWriter::try_new(f, sch.clone(), Some(props)).unwrap();
+            w.write(&batch).unwrap();
+            w.close().unwrap();
+        }
+        // Register the directory as a stock `ListingTable` named `t` on a local-filesystem store.
+        let ctx = SessionContext::new();
+        let store: Arc<dyn ObjectStore> = Arc::new(object_store::local::LocalFileSystem::new());
+        let table_url = ListingTableUrl::parse(format!("file://{}", dir.to_str().unwrap())).unwrap();
+        ctx.register_object_store(table_url.as_ref(), Arc::clone(&store));
+        let listing_options = ListingOptions::new(Arc::new(ParquetFormat::new()))
+            .with_file_extension(".parquet")
+            .with_collect_stat(true);
+        let resolved = listing_options.infer_schema(&ctx.state(), &table_url).await.unwrap();
+        let config = ListingTableConfig::new(table_url.clone())
+            .with_listing_options(listing_options)
+            .with_schema(resolved);
+        let provider = Arc::new(ListingTable::try_new(config).unwrap());
+        ctx.register_table("t", provider).unwrap();
+
+        // n1 >= 4080 keeps rows 4080..4096 (16 rows); project the non-predicate s0.
+        let df = ctx.sql("SELECT s0 FROM t WHERE n1 >= 4080").await.unwrap();
+        let physical = df.create_physical_plan().await.unwrap();
+        // Apply the rule by hand, using the session's shared file-metadata cache.
+        let metadata_cache = ctx.runtime_env().cache_manager.get_file_metadata_cache();
+        let rule = ScopedPageIndexOptimizer::new(Arc::clone(&store), metadata_cache);
+        let physical = rule.optimize(physical, &ConfigOptions::default()).unwrap();
+        // Execute and count the rows that come back.
+        let mut stream =
+            datafusion::physical_plan::execute_stream(physical, ctx.task_ctx()).unwrap();
+        let mut rows = 0usize;
+        while let Some(b) = stream.next().await {
+            rows += b.unwrap().num_rows();
+        }
+
+        assert_eq!(rows, 16, "predicate n1>=4080 must keep 16 rows");
+
+        let stats = scoped_cache_stats();
+        assert!(
+            stats.entries >= 1 && stats.used_bytes > 0,
+            "scoped cache must have filled on the listing path: {stats:?}"
+        );
+        assert!(stats.misses >= 1, "first scan must register a scoped-cache miss: {stats:?}");
+
+        clear_scoped_cache_for_test();
+        let _ = std::fs::remove_dir_all(&dir);
+    }
+}
diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/scoped_page_index_reader.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/scoped_page_index_reader.rs
new file mode 100644
index 0000000000000..d7aed34584e1c
--- /dev/null
+++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/scoped_page_index_reader.rs
@@ -0,0 +1,388 @@
+/*
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * The OpenSearch Contributors require contributions made to
+ * this file be licensed under the Apache-2.0 license or a
+ * compatible open source license.
+ */
+
+//! Scoped page-index reader factory for the **listing-table** scan path.
+//!
+//! # Why this exists
+//!
+//! The listing-table path (`ShardTableProvider` / vanilla `ListingTable`) uses
+//! DataFusion's default reader factory, so when page pruning is enabled the
+//! `ParquetOpener` loads the **entire** page index (`ColumnIndex` + `OffsetIndex`
+//! for *every* column) of each surviving file, every query, and caches none of
+//! it. On wide schemas the `ColumnIndex` (per-page string min/max) dominates the
+//! native heap.
+//!
+//! This factory closes that gap using the unified scoped cache
+//! ([`crate::cache::page_index`]). The seam is DataFusion's
+//! [`ParquetFileReaderFactory`]: the `ParquetOpener` asks the reader for metadata
+//! via `get_metadata`, and — per `opener::load_page_index` — if the returned
+//! `ParquetMetaData` *already* carries a page index, the opener uses it and skips
+//! the full, all-column load. So our reader's `get_metadata`:
+//!
+//!   1. loads footer-only metadata (shared metadata-cache hit — see
+//!      [`crate::indexed_table::parquet_bridge::load_parquet_metadata`]), then
+//!   2. augments it with a page index scoped to the query's columns via
+//!      [`crate::cache::page_index::load_scoped_page_index_cols`]
+//!      (real `ColumnIndex` for predicate columns, real `OffsetIndex` scoped to
+//!      the predicate ∪ projection columns), and
+//!   3. returns that augmented metadata.
+//!
+//! The scoped `(file, predicate-columns)` cache is shared with the indexed path,
+//! so repeated queries reuse the decoded index across both scan paths.
+//!
+//! # Why all row groups (no RG scoping here)
+//!
+//! `ScopedPageIndexOptimizer` only swaps the reader factory; DataFusion still
+//! selects which row groups to scan via its OWN RG-statistics pruning, and its
+//! page-pruner + reader then dereference `column_index[rg][col]` /
+//! `offset_index[rg][col]` for *its* chosen RGs — a set independent of anything
+//! we could compute here. Leaving a placeholder entry on an RG DataFusion still
+//! touches panics (`page_row_counts.first().unwrap()` on an empty `OffsetIndex`),
+//! and its page-index gate is per-FILE (both indexes must be `Some`), so a
+//! partial page index would lie to it. So the page index is built for ALL row
+//! groups, column-scoped only — heap stays bounded because the heavy
+//! `ColumnIndex` is scoped to predicate columns and only the cheap all-column
+//! `OffsetIndex` spans every RG (and that is required for correctness at read
+//! time anyway).
+//!
+//! # Fallback
+//!
+//! If there are neither predicate nor projection columns, or scoped augmentation
+//! fails for a file (no page index, decode/IO error), `get_metadata` returns the
+//! footer metadata as loaded and the opener loads the page index on demand exactly as
+//! today — correct, just without the scoping benefit for that file. Never a wrong result.
+
+use std::sync::Arc;
+
+use arrow::datatypes::SchemaRef;
+use datafusion::datasource::physical_plan::parquet::{ParquetFileMetrics, ParquetFileReaderFactory};
+use datafusion::execution::cache::cache_manager::FileMetadataCache;
+use datafusion::parquet::arrow::arrow_reader::ArrowReaderOptions;
+use datafusion::parquet::arrow::async_reader::AsyncFileReader;
+use datafusion::parquet::errors::ParquetError;
+use datafusion::parquet::file::metadata::ParquetMetaData;
+use datafusion::physical_expr::PhysicalExpr;
+use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
+use datafusion_datasource::PartitionedFile;
+use futures::future::BoxFuture;
+use futures::FutureExt;
+use object_store::{ObjectStore, ObjectStoreExt};
+use prost::bytes::Bytes;
+
+use crate::cache::page_index::{load_scoped_page_index_cols, resolve_predicate_parquet_columns_pair};
+use crate::indexed_table::parquet_bridge::load_parquet_metadata;
+
+/// A [`ParquetFileReaderFactory`] that, on `get_metadata`, returns metadata whose
+/// page index is scoped to the query's predicate columns. Data reads go straight
+/// to the object store.
+///
+/// Carries predicate/projection column *names* + the file schema rather than
+/// pre-resolved parquet indices: the reader resolves names → parquet leaf indices
+/// per file via `resolve_predicate_parquet_columns_pair`, robust to schema
+/// evolution across files (a column absent from one file is just skipped).
+#[derive(Debug)]
+pub struct ScopedPageIndexReaderFactory {
+    store: Arc<dyn ObjectStore>,
+    metadata_cache: Arc<dyn FileMetadataCache>,
+    /// File-column names referenced by the query predicate. When this AND
+    /// `projection_column_names` are both empty, `get_metadata` skips scoping and
+    /// returns the footer metadata as loaded.
+    predicate_column_names: Arc<Vec<String>>,
+    /// File-column names this scan PROJECTS (reads). Used to scope the OffsetIndex to
+    /// `predicate ∪ projection` instead of all columns. Empty is intended to fall back
+    /// to all-column offsets (old behavior) — NOTE(review): confirm in the loader.
+    projection_column_names: Arc<Vec<String>>,
+    /// The physical predicate (if any). Retained in the constructor signature for
+    /// parity with the indexed path, but intentionally NOT used for RG scoping
+    /// here (see module docs).
+    #[allow(dead_code)]
+    predicate: Option<Arc<dyn PhysicalExpr>>,
+    /// File schema (no partition columns), for per-file column resolution.
+    file_schema: SchemaRef,
+}
+
+impl ScopedPageIndexReaderFactory {
+    pub fn new(
+        store: Arc<dyn ObjectStore>,
+        metadata_cache: Arc<dyn FileMetadataCache>,
+        predicate_column_names: Vec<String>,
+        projection_column_names: Vec<String>,
+        predicate: Option<Arc<dyn PhysicalExpr>>,
+        file_schema: SchemaRef,
+    ) -> Self {
+        Self {
+            store,
+            metadata_cache,
+            predicate_column_names: Arc::new(predicate_column_names),
+            projection_column_names: Arc::new(projection_column_names),
+            predicate,
+            file_schema,
+        }
+    }
+}
+
+impl ParquetFileReaderFactory for ScopedPageIndexReaderFactory {
+    fn create_reader(
+        &self,
+        partition_index: usize,
+        file: PartitionedFile,
+        _metadata_size_hint: Option<usize>,
+        metrics: &ExecutionPlanMetricsSet,
+    ) -> datafusion::common::Result<Box<dyn AsyncFileReader + Send>> {
+        let file_metrics =
+            ParquetFileMetrics::new(partition_index, file.object_meta.location.as_ref(), metrics);
+        Ok(Box::new(ScopedPageIndexReader {
+            store: Arc::clone(&self.store),
+            metadata_cache: Arc::clone(&self.metadata_cache),
+            predicate_column_names: Arc::clone(&self.predicate_column_names),
+            projection_column_names: Arc::clone(&self.projection_column_names),
+            file_schema: Arc::clone(&self.file_schema),
+            location: file.object_meta.location.clone(),
+            metrics: file_metrics,
+        }))
+    }
+}
+
+struct ScopedPageIndexReader {
+    store: Arc<dyn ObjectStore>,
+    metadata_cache: Arc<dyn FileMetadataCache>,
+    predicate_column_names: Arc<Vec<String>>,
+    projection_column_names: Arc<Vec<String>>,
+    file_schema: SchemaRef,
+    location: object_store::path::Path,
+    metrics: ParquetFileMetrics,
+}
+
+impl AsyncFileReader for ScopedPageIndexReader {
+    fn get_bytes(
+        &mut self,
+        range: std::ops::Range<u64>,
+    ) -> BoxFuture<'_, datafusion::parquet::errors::Result<Bytes>> {
+        self.metrics.bytes_scanned.add((range.end - range.start) as usize);
+        let store = Arc::clone(&self.store);
+        let location = self.location.clone();
+        // IO-runtime dispatch is handled by the store wrapper around the
+        // registered store, so a plain `.await` already runs on the IO runtime.
+        async move {
+            store
+                .get_range(&location, range)
+                .await
+                .map_err(|e| ParquetError::External(Box::new(e)))
+        }
+        .boxed()
+    }
+
+    fn get_byte_ranges(
+        &mut self,
+        ranges: Vec<std::ops::Range<u64>>,
+    ) -> BoxFuture<'_, datafusion::parquet::errors::Result<Vec<Bytes>>> {
+        let total: u64 = ranges.iter().map(|r| r.end - r.start).sum();
+        self.metrics.bytes_scanned.add(total as usize);
+        let store = Arc::clone(&self.store);
+        let location = self.location.clone();
+        async move {
+            store
+                .get_ranges(&location, &ranges)
+                .await
+                .map_err(|e| ParquetError::External(Box::new(e)))
+        }
+        .boxed()
+    }
+
+    fn get_metadata(
+        &mut self,
+        _options: Option<&ArrowReaderOptions>,
+    ) -> BoxFuture<'_, datafusion::parquet::errors::Result<Arc<ParquetMetaData>>> {
+        let store = Arc::clone(&self.store);
+        let metadata_cache = Arc::clone(&self.metadata_cache);
+        let predicate_names = Arc::clone(&self.predicate_column_names);
+        let projection_names = Arc::clone(&self.projection_column_names);
+        let file_schema = Arc::clone(&self.file_schema);
+        let location = self.location.clone();
+        async move {
+            // 1. Footer-only metadata (shared metadata-cache hit if pre-seeded).
+            let (_schema, _size, footer) =
+                load_parquet_metadata(Arc::clone(&store), &location, Arc::clone(&metadata_cache))
+                    .await
+                    .map_err(|e| ParquetError::General(format!("footer metadata {location}: {e}")))?;
+
+            // 2. Resolve predicate + projection names → parquet leaf indices, then
+            //    augment with a column-scoped page index. Gated on either being
+            //    non-empty: a projection-only query still needs a scoped OffsetIndex.
+            if !predicate_names.is_empty() || !projection_names.is_empty() {
+                let (parquet_cols, offset_cols) = resolve_predicate_parquet_columns_pair(
+                    &file_schema, &footer, &predicate_names, &projection_names,
+                );
+                if let Some(augmented) = load_scoped_page_index_cols(
+                    &store,
+                    &location,
+                    &footer,
+                    &parquet_cols,
+                    &offset_cols,
+                )
+                .await
+                {
+                    return Ok(augmented);
+                }
+            }
+
+            Ok(footer)
+        }
+        .boxed()
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+    use arrow::array::{Int32Array, RecordBatch};
+    use arrow::datatypes::{DataType, Field, Schema};
+    use datafusion::parquet::arrow::ArrowWriter;
+    use datafusion::parquet::file::page_index::column_index::ColumnIndexMetaData;
+    use datafusion::parquet::file::properties::{EnabledStatistics, WriterProperties};
+    use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet;
+    use object_store::memory::InMemory;
+    use object_store::path::Path as ObjPath;
+    use object_store::{ObjectStore, ObjectStoreExt, PutPayload};
+
+    // Shared crate-wide guard so all users of the one process-global scoped cache
+    // mutually exclude.
+    use crate::cache::page_index::SCOPED_CACHE_TEST_GUARD as SCOPED_TEST_GUARD;
+
+    /// Two int columns (`price`, `qty`), one row group, four 8-row data pages.
+    fn two_col_parquet() -> (Bytes, SchemaRef) {
+        let schema = Arc::new(Schema::new(vec![
+            Field::new("price", DataType::Int32, false),
+            Field::new("qty", DataType::Int32, false),
+        ]));
+        let prices: Vec<i32> = (0..32).collect();
+        let qtys: Vec<i32> = (100..132).collect();
+        let batch = RecordBatch::try_new(
+            schema.clone(),
+            vec![
+                Arc::new(Int32Array::from(prices)),
+                Arc::new(Int32Array::from(qtys)),
+            ],
+        )
+        .unwrap();
+        let props = WriterProperties::builder()
+            .set_max_row_group_size(32)
+            .set_data_page_row_count_limit(8)
+            .set_write_batch_size(8)
+            .set_statistics_enabled(EnabledStatistics::Page)
+            .build();
+        let mut buf: Vec<u8> = Vec::new();
+        let mut w = ArrowWriter::try_new(&mut buf, schema.clone(), Some(props)).unwrap();
+        w.write(&batch).unwrap();
+        w.close().unwrap();
+        (Bytes::from(buf), schema)
+    }
+
+    async fn stage(bytes: Bytes) -> (Arc<dyn ObjectStore>, ObjPath) {
+        let store: Arc<dyn ObjectStore> = Arc::new(InMemory::new());
+        let loc = ObjPath::from("data.parquet");
+        store.put(&loc, PutPayload::from_bytes(bytes)).await.unwrap();
+        (store, loc)
+    }
+
+    fn fresh_cache() -> Arc<dyn FileMetadataCache> {
+        Arc::new(crate::cache::MutexFileMetadataCache::new(
+            datafusion::execution::cache::DefaultFilesMetadataCache::new(64 * 1024 * 1024),
+        ))
+    }
+
+    fn metrics() -> ExecutionPlanMetricsSet {
+        ExecutionPlanMetricsSet::new()
+    }
+
+    /// `get_metadata` must return metadata whose page index is scoped to the predicate
+    /// column (`price`): real ColumnIndex for `price`, NONE placeholder for `qty`, and a
+    /// REAL OffsetIndex for BOTH columns; it also fills the shared scoped cache. The guard
+    /// is taken poison-tolerantly so one failed test cannot cascade into the others.
+    #[tokio::test]
+    async fn get_metadata_returns_scoped_page_index() {
+        let _g = SCOPED_TEST_GUARD.lock().unwrap_or_else(|e| e.into_inner());
+        crate::cache::page_index::clear_scoped_cache_for_test();
+
+        let (bytes, schema) = two_col_parquet();
+        let (store, loc) = stage(bytes).await;
+        let factory = ScopedPageIndexReaderFactory::new(
+            Arc::clone(&store),
+            fresh_cache(),
+            vec!["price".to_string()],
+            // Project both columns so the OffsetIndex is built for both (this test
+            // asserts a real OffsetIndex for every column).
+            vec!["price".to_string(), "qty".to_string()],
+            None,
+            schema,
+        );
+        let pf = PartitionedFile::new(loc.as_ref().to_string(), 0);
+        let m = metrics();
+        let mut reader = factory.create_reader(0, pf, None, &m).unwrap();
+
+        let meta = reader.get_metadata(None).await.unwrap();
+        let ci = meta.column_index().expect("augmented metadata has column index");
+        let oi = meta.offset_index().expect("augmented metadata has offset index");
+        assert!(
+            !matches!(ci[0][0], ColumnIndexMetaData::NONE),
+            "predicate col (price) must have a real ColumnIndex"
+        );
+        assert!(
+            matches!(ci[0][1], ColumnIndexMetaData::NONE),
+            "non-predicate col (qty) ColumnIndex must be a NONE placeholder"
+        );
+        assert!(
+            !oi[0][0].page_locations().is_empty() && !oi[0][1].page_locations().is_empty(),
+            "OffsetIndex must be real for every column"
+        );
+
+        let stats = crate::cache::page_index::scoped_cache_stats();
+        assert!(stats.entries >= 1 && stats.misses >= 1 && stats.used_bytes > 0);
+
+        crate::cache::page_index::clear_scoped_cache_for_test();
+    }
+
+    /// No predicate/projection columns → no scoping: `get_metadata` returns the footer
+    /// load as-is and the scoped cache is never touched (guard is poison-tolerant).
+    ///
+    /// Note: we deliberately do NOT assert the returned metadata has no page
+    /// index. Until the base metadata-cache strip lands (Step 1e), the shared
+    /// `load_parquet_metadata` still loads the full page index when a metadata
+    /// cache is present (DataFusion's `PageIndexPolicy::Optional`). The invariant
+    /// this reader guarantees with no predicate is "no scoping", i.e. the scoped
+    /// cache stays empty — which holds before and after 1e.
+    #[tokio::test]
+    async fn get_metadata_no_predicate_does_not_scope() {
+        let _g = SCOPED_TEST_GUARD.lock().unwrap_or_else(|e| e.into_inner());
+        crate::cache::page_index::clear_scoped_cache_for_test();
+
+        let (bytes, schema) = two_col_parquet();
+        let (store, loc) = stage(bytes).await;
+        let factory = ScopedPageIndexReaderFactory::new(
+            Arc::clone(&store),
+            fresh_cache(),
+            vec![],
+            vec![],
+            None,
+            schema,
+        );
+        let pf = PartitionedFile::new(loc.as_ref().to_string(), 0);
+        let m = metrics();
+        let mut reader = factory.create_reader(0, pf, None, &m).unwrap();
+
+        let _meta = reader.get_metadata(None).await.unwrap();
+        let stats = crate::cache::page_index::scoped_cache_stats();
+        assert_eq!(
+            (stats.entries, stats.misses, stats.hits),
+            (0, 0, 0),
+            "no predicate → scoped cache must be untouched"
+        );
+
+        crate::cache::page_index::clear_scoped_cache_for_test();
+    }
+}
diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/session_context.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/session_context.rs
index bfae9428738bf..0b21b59377df8 100644
--- a/sandbox/plugins/analytics-backend-datafusion/rust/src/session_context.rs
+++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/session_context.rs
@@ -32,6 +32,7 @@ use object_store::ObjectMeta;
 use crate::api::{DataFusionRuntime, ShardView};
 use crate::datafusion_query_config::DatafusionQueryConfig;
 use crate::query_tracker::QueryTrackingContext;
+use crate::scoped_index_optimizer::ScopedPageIndexOptimizer;
 
 /// Opaque handle holding a configured SessionContext between FFM calls.
 pub struct SessionContextHandle {
@@ -229,6 +230,15 @@ pub async unsafe fn create_session_context(
             );
     }
 
+    // Install the scoped page-index reader factory on every parquet scan.
+    // Registered AFTER ProjectRowIdOptimizer so it sees the final DataSourceExec.
+    state_builder = state_builder.with_physical_optimizer_rule(Arc::new(
+        ScopedPageIndexOptimizer::new(
+            Arc::clone(&shard_view.store),
+            runtime.runtime_env.cache_manager.get_file_metadata_cache(),
+        ),
+    ));
+
     let state = state_builder.build();
 
     let ctx = SessionContext::new_with_state(state);
@@ -270,6 +280,25 @@ pub async unsafe fn create_session_context(
         table_name.to_string()
     };
 
+    // Pre-warm the metadata cache footer-only before infer_schema fires.
+    // infer_schema calls DFParquetMetadata::fetch_metadata with PageIndexPolicy::Optional
+    // on a cache miss — fetching full page index bytes. By pre-warming here with
+    // PageIndexPolicy::Skip via load_parquet_metadata, every infer_schema call becomes
+    // a cache hit and never touches the page index bytes.
+    // Cache key is meta.location (Path) — same key infer_schema uses. Best-effort: the
+    // result of a failed pre-warm is discarded. Empty shard: loop is a no-op; infer_schema is also skipped below.
+    {
+        let metadata_cache = runtime.runtime_env.cache_manager.get_file_metadata_cache();
+        for meta in shard_view.object_metas.as_ref() {
+            let _ = crate::indexed_table::parquet_bridge::load_parquet_metadata(
+                Arc::clone(&shard_view.store),
+                &meta.location,
+                Arc::clone(&metadata_cache),
+            )
+            .await;
+        }
+    }
+
     // Empty shard: skip infer_schema (errors on zero files); widen_schema_from_plan
     // below populates columns from the substrait base_schema.
     let inferred: arrow::datatypes::SchemaRef = if shard_view.object_metas.is_empty() {

From 4483eae400b6ae08956cacf9072633c0a499eddc Mon Sep 17 00:00:00 2001
From: G <bharath78910@gmail.com>
Date: Sun, 21 Jun 2026 02:57:31 +0530
Subject: [PATCH 2/2] addressing comments

Signed-off-by: G <bharath78910@gmail.com>
---
 .../page_index/column_schema_resolver.rs      | 24 +++++++++----------
 .../rust/src/cache/page_index/mod.rs          |  3 +--
 .../src/cache/page_index/page_index_io.rs     | 24 +++++++++++--------
 3 files changed, 26 insertions(+), 25 deletions(-)

diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/column_schema_resolver.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/column_schema_resolver.rs
index bc4e711003a95..984d387446dcb 100644
--- a/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/column_schema_resolver.rs
+++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/column_schema_resolver.rs
@@ -56,9 +56,10 @@ pub fn resolve_predicate_parquet_columns(
         metadata.file_metadata().key_value_metadata(),
     ) {
         Ok(s) => Arc::new(s),
-        // If we can't derive the file schema, fall back to the union schema; the
-        // caller still falls back to footer-only on any downstream mismatch.
-        Err(_) => return resolve_with_schema(_arrow_schema, metadata, predicate_column_names),
+        // If we can't derive the file schema (malformed footer, unsupported type),
+        // return empty. Empty is the safe conservative choice: the caller skips the
+        // scoped load and falls back to footer-only.
+        Err(_) => return vec![],
     };
     resolve_with_schema(&file_arrow_schema, metadata, predicate_column_names)
 }
@@ -71,10 +72,10 @@ pub fn resolve_predicate_parquet_columns(
 /// full schema reconstruction per file per query. Pure refactor — each returned
 /// Vec is identical to calling `resolve_predicate_parquet_columns` separately.
 pub fn resolve_predicate_parquet_columns_pair(
-    union_schema: &SchemaRef,
+    _union_schema: &SchemaRef,
     metadata: &ParquetMetaData,
-    names_a: &[String],
-    names_b: &[String],
+    predicate_col_names: &[String],
+    projection_col_names: &[String],
 ) -> (Vec<usize>, Vec<usize>) {
     let parquet_schema = metadata.file_metadata().schema_descr();
     match parquet_to_arrow_schema(
@@ -84,15 +85,12 @@ pub fn resolve_predicate_parquet_columns_pair(
         Ok(s) => {
             let file_arrow_schema = Arc::new(s);
             (
-                resolve_with_schema(&file_arrow_schema, metadata, names_a),
-                resolve_with_schema(&file_arrow_schema, metadata, names_b),
+                resolve_with_schema(&file_arrow_schema, metadata, predicate_col_names),
+                resolve_with_schema(&file_arrow_schema, metadata, projection_col_names),
             )
         }
-        // Same fallback as the single-name path: resolve against the union schema.
-        Err(_) => (
-            resolve_with_schema(union_schema, metadata, names_a),
-            resolve_with_schema(union_schema, metadata, names_b),
-        ),
+        // Can't derive the file schema — return empty for both sets.
+        Err(_) => (vec![], vec![]),
     }
 }
 
diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/mod.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/mod.rs
index 760f46d6d65e0..1157ba370d285 100644
--- a/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/mod.rs
+++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/mod.rs
@@ -44,8 +44,7 @@
 //!   `projection_cols = predicate ∪ projection ∪ {0}`. This is the cheap, fixed-width
 //!   index (no per-page string stats). Built for **all row groups** (an empty
 //!   OffsetIndex on a row group DataFusion scans panics / breaks reads, and
-//!   DataFusion chooses the scanned set itself, after our load — see
-//!   HANDOFF_step2_rg_scoping.md §1e).
+//!   DataFusion chooses the scanned set itself, after our load).
 //!
 //! Each cache stores only its decoded vector (`ParquetColumnIndex` /
 //! `ParquetOffsetIndex`) — never a full `ParquetMetaData` (no footer
diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/page_index_io.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/page_index_io.rs
index e2fd3f8c7695b..340e4d67f19f9 100644
--- a/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/page_index_io.rs
+++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/page_index_io.rs
@@ -16,7 +16,9 @@ use std::mem;
 use std::ops::Range;
 use std::sync::Arc;
 
+use arrow::array::{ArrayRef, BooleanArray, UInt64Array};
 use arrow::datatypes::SchemaRef;
+use datafusion::parquet::arrow::arrow_reader::statistics::StatisticsConverter;
 use datafusion::parquet::errors::{ParquetError, Result as ParquetResult};
 use datafusion::parquet::file::metadata::{
     ColumnChunkMetaData, OffsetIndexBuilder, ParquetColumnIndex, ParquetMetaData,
@@ -27,6 +29,9 @@ use datafusion::parquet::file::page_index::index_reader::{
     read_columns_indexes, read_offset_indexes,
 };
 use datafusion::parquet::file::reader::{ChunkReader, Length};
+use datafusion::physical_optimizer::pruning::{PruningPredicate, PruningStatistics};
+use datafusion::physical_expr::PhysicalExpr;
+use datafusion::scalar::ScalarValue;
 use object_store::ObjectStore;
 use parquet::file::page_index::offset_index::OffsetIndexMetaData;
 use prost::bytes::{buf, Buf, Bytes};
@@ -106,13 +111,8 @@ pub async fn load_page_index_fully_scoped(
 pub fn surviving_row_groups(
     footer_meta: &ParquetMetaData,
     arrow_schema: &SchemaRef,
-    predicate: &Arc<dyn datafusion::physical_expr::PhysicalExpr>,
+    predicate: &Arc<dyn PhysicalExpr>,
 ) -> Vec<usize> {
-    use arrow::array::{ArrayRef, BooleanArray, UInt64Array};
-    use datafusion::parquet::arrow::arrow_reader::statistics::StatisticsConverter;
-    use datafusion::physical_optimizer::pruning::{PruningPredicate, PruningStatistics};
-    use datafusion::scalar::ScalarValue;
-    use std::collections::HashSet;
 
     let num_rgs = footer_meta.num_row_groups();
     let all: Vec<usize> = (0..num_rgs).collect();
@@ -417,14 +417,18 @@ async fn get_or_build_offset_index(
     }
     let num_cols = footer_meta.file_metadata().schema_descr().num_columns();
 
-    // Resolve which columns need a real OffsetIndex: predicate ∪ projection ∪ {0},
-    // clamped. `None` → all columns.
-    // First column {0} , is always needed as it's used in stats.
+    // Resolve which columns need a real OffsetIndex:
+    //   None   → no explicit projection, read everything → all columns get a real entry.
+    //   Some   → build {col 0} ∪ predicate_cols ∪ proj_cols, clamped to num_cols.
+    //            Col 0 is always included because the page-skip metric reads it
+    //            regardless of what the query projects or filters on.
+    //            Predicate-only queries (empty proj_cols) still get col 0 + predicate
+    //            columns; projection-only queries get col 0 + projected columns.
     let off_cols: Vec<usize> = match projection_cols {
         None => (0..num_cols).collect(),
         Some(proj_cols) => {
             let mut set: HashSet<usize> = HashSet::new();
-            set.insert(0); // metric reads column 0
+            set.insert(0); // page-skip metric always reads col 0
             for &c in predicate_cols {
                 set.insert(c);
             }