From 709d000780b2e7072a6adcf81691e0402a3fdb22 Mon Sep 17 00:00:00 2001 From: G Date: Sat, 20 Jun 2026 23:13:17 +0530 Subject: [PATCH 1/2] core page index cache changes Signed-off-by: G --- .../rust/src/cache.rs | 183 -- .../src/{ => cache}/custom_cache_manager.rs | 65 +- .../rust/src/{ => cache}/eviction_policy.rs | 0 .../rust/src/cache/metadata_cache.rs | 344 ++++ .../rust/src/cache/mod.rs | 37 + .../rust/src/cache/page_index/cache_keys.rs | 65 + .../rust/src/cache/page_index/cache_store.rs | 195 ++ .../page_index/column_schema_resolver.rs | 116 ++ .../rust/src/cache/page_index/mod.rs | 188 ++ .../src/cache/page_index/page_index_io.rs | 1626 +++++++++++++++++ .../rust/src/{ => cache}/statistics_cache.rs | 0 .../rust/src/indexed_executor.rs | 75 +- .../rust/src/indexed_table/parquet_bridge.rs | 58 +- .../rust/src/lib.rs | 11 +- .../rust/src/scoped_index_optimizer.rs | 415 +++++ .../rust/src/scoped_page_index_reader.rs | 388 ++++ .../rust/src/session_context.rs | 29 + 17 files changed, 3554 insertions(+), 241 deletions(-) delete mode 100644 sandbox/plugins/analytics-backend-datafusion/rust/src/cache.rs rename sandbox/plugins/analytics-backend-datafusion/rust/src/{ => cache}/custom_cache_manager.rs (88%) rename sandbox/plugins/analytics-backend-datafusion/rust/src/{ => cache}/eviction_policy.rs (100%) create mode 100644 sandbox/plugins/analytics-backend-datafusion/rust/src/cache/metadata_cache.rs create mode 100644 sandbox/plugins/analytics-backend-datafusion/rust/src/cache/mod.rs create mode 100644 sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/cache_keys.rs create mode 100644 sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/cache_store.rs create mode 100644 sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/column_schema_resolver.rs create mode 100644 sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/mod.rs create mode 100644 
sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/page_index_io.rs rename sandbox/plugins/analytics-backend-datafusion/rust/src/{ => cache}/statistics_cache.rs (100%) create mode 100644 sandbox/plugins/analytics-backend-datafusion/rust/src/scoped_index_optimizer.rs create mode 100644 sandbox/plugins/analytics-backend-datafusion/rust/src/scoped_page_index_reader.rs diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/cache.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache.rs deleted file mode 100644 index d5fb186acbc51..0000000000000 --- a/sandbox/plugins/analytics-backend-datafusion/rust/src/cache.rs +++ /dev/null @@ -1,183 +0,0 @@ -/* - * SPDX-License-Identifier: Apache-2.0 - * - * The OpenSearch Contributors require contributions made to - * this file be licensed under the Apache-2.0 license or a - * compatible open source license. - */ - -use std::sync::atomic::{AtomicUsize, Ordering}; -use std::sync::{Arc, Mutex}; - -use datafusion::execution::cache::cache_manager::{ - CachedFileMetadataEntry, FileMetadataCache, FileMetadataCacheEntry, -}; -use datafusion::execution::cache::DefaultFilesMetadataCache; -use datafusion::execution::cache::CacheAccessor; -use log::error; -use object_store::path::Path; - -// Cache type constants -pub const CACHE_TYPE_METADATA: &str = "METADATA"; -pub const CACHE_TYPE_STATS: &str = "STATISTICS"; - -// Helper function to log cache operations -fn log_cache_error(operation: &str, error: &str) { - error!("[CACHE ERROR] {} operation failed: {}", operation, error); -} - -// Wrapper to make Mutex implement FileMetadataCache -pub struct MutexFileMetadataCache { - pub inner: Mutex, - hit_count: AtomicUsize, - miss_count: AtomicUsize, -} - -impl MutexFileMetadataCache { - pub fn new(cache: DefaultFilesMetadataCache) -> Self { - Self { - inner: Mutex::new(cache), - hit_count: AtomicUsize::new(0), - miss_count: AtomicUsize::new(0), - } - } - - pub fn hit_count(&self) -> usize { - 
self.hit_count.load(Ordering::Relaxed) - } - - pub fn miss_count(&self) -> usize { - self.miss_count.load(Ordering::Relaxed) - } - - pub fn reset_stats(&self) { - self.hit_count.store(0, Ordering::Relaxed); - self.miss_count.store(0, Ordering::Relaxed); - } - - pub fn clear_cache(&self) { - if let Ok(cache) = self.inner.lock() { - cache.clear(); - } - } - - pub fn update_cache_limit(&self, new_limit: usize) { - if let Ok(cache) = self.inner.lock() { - cache.update_cache_limit(new_limit); - } - } - - pub fn get_cache_limit(&self) -> usize { - if let Ok(cache) = self.inner.lock() { - cache.cache_limit() - } else { - 0 - } - } -} - -impl CacheAccessor for MutexFileMetadataCache { - fn get(&self, k: &Path) -> Option { - match self.inner.lock() { - Ok(cache) => { - let result = cache.get(k); - if result.is_some() { - self.hit_count.fetch_add(1, Ordering::Relaxed); - } else { - self.miss_count.fetch_add(1, Ordering::Relaxed); - } - result - } - Err(e) => { - log_cache_error("get", &e.to_string()); - None - } - } - } - - fn put(&self, k: &Path, v: CachedFileMetadataEntry) -> Option { - match self.inner.lock() { - Ok(cache) => cache.put(k, v), - Err(e) => { - log_cache_error("put", &e.to_string()); - None - } - } - } - - fn remove(&self, k: &Path) -> Option { - match self.inner.lock() { - Ok(cache) => cache.remove(k), - Err(e) => { - log_cache_error("remove", &e.to_string()); - None - } - } - } - - fn contains_key(&self, k: &Path) -> bool { - match self.inner.lock() { - Ok(cache) => cache.contains_key(k), - Err(e) => { - log_cache_error("contains_key", &e.to_string()); - false - } - } - } - - fn len(&self) -> usize { - match self.inner.lock() { - Ok(cache) => cache.len(), - Err(e) => { - log_cache_error("len", &e.to_string()); - 0 - } - } - } - - fn clear(&self) { - match self.inner.lock() { - Ok(cache) => cache.clear(), - Err(e) => log_cache_error("clear", &e.to_string()), - } - } - - fn name(&self) -> String { - match self.inner.lock() { - Ok(cache) => cache.name(), - 
Err(e) => { - log_cache_error("name", &e.to_string()); - "cache_error".to_string() - } - } - } -} - -impl FileMetadataCache for MutexFileMetadataCache { - fn cache_limit(&self) -> usize { - match self.inner.lock() { - Ok(cache) => cache.cache_limit(), - Err(e) => { - log_cache_error("cache_limit", &e.to_string()); - 0 - } - } - } - - fn update_cache_limit(&self, limit: usize) { - match self.inner.lock() { - Ok(cache) => cache.update_cache_limit(limit), - Err(e) => log_cache_error("update_cache_limit", &e.to_string()), - } - } - - fn list_entries(&self) -> std::collections::HashMap { - match self.inner.lock() { - Ok(cache) => cache.list_entries(), - Err(e) => { - log_cache_error("list_entries", &e.to_string()); - std::collections::HashMap::new() - } - } - } -} diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/custom_cache_manager.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/custom_cache_manager.rs similarity index 88% rename from sandbox/plugins/analytics-backend-datafusion/rust/src/custom_cache_manager.rs rename to sandbox/plugins/analytics-backend-datafusion/rust/src/cache/custom_cache_manager.rs index 097d3657b9e8e..d170a274a73bb 100644 --- a/sandbox/plugins/analytics-backend-datafusion/rust/src/custom_cache_manager.rs +++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/custom_cache_manager.rs @@ -10,13 +10,14 @@ use std::sync::Arc; use datafusion::execution::cache::cache_manager::{FileMetadataCache, FileStatisticsCache, CacheManagerConfig}; use datafusion::execution::cache::file_statistics_cache::DefaultFileStatisticsCache; use datafusion::execution::cache::CacheAccessor; -use crate::statistics_cache::compute_parquet_statistics; -use crate::cache::MutexFileMetadataCache; -use crate::statistics_cache::CustomStatisticsCache; +use crate::cache::statistics_cache::compute_parquet_statistics; +use crate::cache::metadata_cache::MutexFileMetadataCache; +use crate::cache::statistics_cache::CustomStatisticsCache; use 
object_store::path::Path; use object_store::ObjectMeta; -use datafusion::datasource::physical_plan::parquet::metadata::DFParquetMetadata; +use object_store::ObjectStore; use log::{debug, error}; +use crate::indexed_table::parquet_bridge; /// Create ObjectMeta from a local file path. fn create_object_meta_from_file(file_path: &str) -> Result, datafusion::common::DataFusionError> { @@ -229,14 +230,14 @@ impl CustomCacheManager { /// Check if a file exists in a specific cache type pub fn contains_file_by_type(&self, file_path: &str, cache_type: &str) -> bool { match cache_type { - crate::cache::CACHE_TYPE_METADATA => { + crate::cache::metadata_cache::CACHE_TYPE_METADATA => { let path = Path::from(file_path); self.file_metadata_cache .as_ref() .and_then(|cache| cache.get(&path)) .is_some() } - crate::cache::CACHE_TYPE_STATS => { + crate::cache::metadata_cache::CACHE_TYPE_STATS => { self.statistics_cache .as_ref() .map_or(false, |cache| cache.contains_key(&Path::from(file_path))) @@ -294,7 +295,7 @@ impl CustomCacheManager { /// Clear specific cache type pub fn clear_cache_type(&self, cache_type: &str) -> Result<(), String> { match cache_type { - crate::cache::CACHE_TYPE_METADATA => { + crate::cache::metadata_cache::CACHE_TYPE_METADATA => { if let Some(cache) = &self.file_metadata_cache { cache.clear(); Ok(()) @@ -302,7 +303,7 @@ impl CustomCacheManager { Err("No metadata cache configured".to_string()) } } - crate::cache::CACHE_TYPE_STATS => { + crate::cache::metadata_cache::CACHE_TYPE_STATS => { if let Some(cache) = &self.statistics_cache { cache.clear(); Ok(()) @@ -317,7 +318,7 @@ impl CustomCacheManager { /// Get memory consumed by specific cache type pub fn get_memory_consumed_by_type(&self, cache_type: &str) -> Result { match cache_type { - crate::cache::CACHE_TYPE_METADATA => { + crate::cache::metadata_cache::CACHE_TYPE_METADATA => { if let Some(cache) = &self.file_metadata_cache { if let Ok(cache_guard) = cache.inner.lock() { Ok(cache_guard.memory_used()) @@ 
-328,7 +329,7 @@ impl CustomCacheManager { Err("No metadata cache configured".to_string()) } } - crate::cache::CACHE_TYPE_STATS => { + crate::cache::metadata_cache::CACHE_TYPE_STATS => { if let Some(cache) = &self.statistics_cache { Ok(cache.memory_consumed()) } else { @@ -351,42 +352,20 @@ impl CustomCacheManager { let object_meta = object_metas.first() .ok_or_else(|| "No object metadata returned".to_string())?; - let store = Arc::new(object_store::local::LocalFileSystem::new()); + let store: Arc = Arc::new(object_store::local::LocalFileSystem::new()); - // Get cache reference for DataFusion metadata loading - let cache_ref = self.file_metadata_cache.as_ref() - .ok_or_else(|| "No file metadata cache configured".to_string())?; + let metadata_cache = self.file_metadata_cache.as_ref() + .ok_or_else(|| "No file metadata cache configured".to_string())? + .clone() as Arc; - let metadata_cache = cache_ref.clone() as Arc; - - // Use DataFusion's metadata loading by passing reference to file_metadata_cache to get complete metadata - // IMPORTANT: When a cache is provided to DFParquetMetadata, fetch_metadata() will: - // 1. Enable page index loading (with_page_indexes(true)) - // 2. Load the complete metadata including column and offset indexes - // 3. Automatically put the metadata into the cache (lines 155-160 in datafusion's metadata.rs) - // This ensures we cache exactly what DataFusion would cache during query execution - let _parquet_metadata = rt_handle.block_on(async { - let df_metadata = DFParquetMetadata::new(store.as_ref(), object_meta) - .with_file_metadata_cache(Some(metadata_cache)); - - // fetch_metadata() performs the cache put operation internally - df_metadata.fetch_metadata().await - .map_err(|e| format!("Failed to fetch metadata: {}", e)) + // Warm the level-1 metadata cache footer-only. `load_parquet_metadata` + // fetches with PageIndexPolicy::Skip — only footer bytes, no page index IO. 
+ // On success the entry is in the cache; on failure the error propagates. + let location = object_meta.location.clone(); + rt_handle.block_on(async { + parquet_bridge::load_parquet_metadata(store, &location, metadata_cache).await })?; - - // Verify the metadata was cached properly - match cache_ref.inner.lock() { - Ok(cache_guard) => { - let path = Path::from(file_path.to_string()); - if cache_guard.contains_key(&path) { - Ok(true) - } else { - debug!("[CACHE ERROR] Failed to cache metadata for: {}", file_path); - Ok(false) - } - } - Err(e) => Err(format!("Failed to verify cache: {}", e)) - } + Ok(true) } /// Compute and put statistics into cache diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/eviction_policy.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/eviction_policy.rs similarity index 100% rename from sandbox/plugins/analytics-backend-datafusion/rust/src/eviction_policy.rs rename to sandbox/plugins/analytics-backend-datafusion/rust/src/cache/eviction_policy.rs diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/metadata_cache.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/metadata_cache.rs new file mode 100644 index 0000000000000..e71fe68a03a39 --- /dev/null +++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/metadata_cache.rs @@ -0,0 +1,344 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. 
+ */ + +use std::sync::atomic::{AtomicUsize, Ordering}; +use std::sync::{Arc, Mutex}; + +use datafusion::datasource::physical_plan::parquet::metadata::CachedParquetMetaData; +use datafusion::execution::cache::cache_manager::{ + CachedFileMetadataEntry, FileMetadataCache, FileMetadataCacheEntry, +}; +use datafusion::execution::cache::CacheAccessor; +use datafusion::execution::cache::DefaultFilesMetadataCache; +use datafusion::parquet::file::metadata::ParquetMetaData; +use log::error; +use object_store::path::Path; + +// Cache type constants +pub const CACHE_TYPE_METADATA: &str = "METADATA"; +pub const CACHE_TYPE_STATS: &str = "STATISTICS"; + +// Helper function to log cache operations +fn log_cache_error(operation: &str, error: &str) { + error!("[CACHE ERROR] {} operation failed: {}", operation, error); +} + +/// Return a cache entry whose `ParquetMetaData` carries footer-only metadata (no +/// `ColumnIndex` / `OffsetIndex`). If the entry already lacks a page index — or +/// isn't a `CachedParquetMetaData` at all — it's returned unchanged (no clone, no +/// rebuild). +/// +/// This is the single chokepoint that enforces the footer-only invariant: every +/// `put` runs the entry through here before it lands in the shared LRU. +fn strip_page_index(entry: CachedFileMetadataEntry) -> CachedFileMetadataEntry { + let Some(cached) = entry + .file_metadata + .as_any() + .downcast_ref::() + else { + return entry; + }; + let meta = cached.parquet_metadata(); + if meta.column_index().is_none() && meta.offset_index().is_none() { + // Already footer-only — keep the existing Arc, avoid a rebuild. + return entry; + } + // Rebuild without the page index. The heavy decoded `ColumnIndex` / + // `OffsetIndex` are released when the original Arc drops; the footer + // (row-group + column chunk stats) is preserved. 
+ let stripped = ParquetMetaData::clone(meta) + .into_builder() + .set_column_index(None) + .set_offset_index(None) + .build(); + CachedFileMetadataEntry::new( + entry.meta, + Arc::new(CachedParquetMetaData::new(Arc::new(stripped))), + ) +} + +// Wrapper to make Mutex implement FileMetadataCache +pub struct MutexFileMetadataCache { + pub inner: Mutex, + hit_count: AtomicUsize, + miss_count: AtomicUsize, +} + +impl MutexFileMetadataCache { + pub fn new(cache: DefaultFilesMetadataCache) -> Self { + Self { + inner: Mutex::new(cache), + hit_count: AtomicUsize::new(0), + miss_count: AtomicUsize::new(0), + } + } + + pub fn hit_count(&self) -> usize { + self.hit_count.load(Ordering::Relaxed) + } + + pub fn miss_count(&self) -> usize { + self.miss_count.load(Ordering::Relaxed) + } + + pub fn reset_stats(&self) { + self.hit_count.store(0, Ordering::Relaxed); + self.miss_count.store(0, Ordering::Relaxed); + } + + pub fn clear_cache(&self) { + if let Ok(cache) = self.inner.lock() { + cache.clear(); + } + } + + pub fn update_cache_limit(&self, new_limit: usize) { + if let Ok(cache) = self.inner.lock() { + cache.update_cache_limit(new_limit); + } + } + + pub fn get_cache_limit(&self) -> usize { + if let Ok(cache) = self.inner.lock() { + cache.cache_limit() + } else { + 0 + } + } +} + +impl CacheAccessor for MutexFileMetadataCache { + fn get(&self, k: &Path) -> Option { + match self.inner.lock() { + Ok(cache) => { + let result = cache.get(k); + if result.is_some() { + self.hit_count.fetch_add(1, Ordering::Relaxed); + } else { + self.miss_count.fetch_add(1, Ordering::Relaxed); + } + result + } + Err(e) => { + log_cache_error("get", &e.to_string()); + None + } + } + } + + fn put(&self, k: &Path, v: CachedFileMetadataEntry) -> Option { + // Enforce the footer-only invariant at the single cache chokepoint. 
+ // + // DataFusion's parquet paths (`infer_schema`, the scan opener, + // `fetch_statistics`) hand this cache to `DFParquetMetadata::fetch_metadata`, + // which force-decodes the FULL page index (`ColumnIndex` + `OffsetIndex` + // for every column of every row group) before calling `put`. On wide + // schemas that decoded index dominates the native heap and, since this is + // a shared LRU keyed by path, also evicts the small footer-only entries + // the scan paths depend on. + // + // We can't stop DataFusion from decoding it, but we can refuse to retain + // it: strip the page index here so the level-1 cache only ever holds + // footer-only metadata (row-group + file stats). Page-level pruning is + // unaffected — both scan paths rebuild a predicate-scoped page index per + // query through the shared scoped cache (`parquet_page_cache`). + let v = strip_page_index(v); + match self.inner.lock() { + Ok(cache) => cache.put(k, v), + Err(e) => { + log_cache_error("put", &e.to_string()); + None + } + } + } + + fn remove(&self, k: &Path) -> Option { + match self.inner.lock() { + Ok(cache) => cache.remove(k), + Err(e) => { + log_cache_error("remove", &e.to_string()); + None + } + } + } + + fn contains_key(&self, k: &Path) -> bool { + match self.inner.lock() { + Ok(cache) => cache.contains_key(k), + Err(e) => { + log_cache_error("contains_key", &e.to_string()); + false + } + } + } + + fn len(&self) -> usize { + match self.inner.lock() { + Ok(cache) => cache.len(), + Err(e) => { + log_cache_error("len", &e.to_string()); + 0 + } + } + } + + fn clear(&self) { + match self.inner.lock() { + Ok(cache) => cache.clear(), + Err(e) => log_cache_error("clear", &e.to_string()), + } + } + + fn name(&self) -> String { + match self.inner.lock() { + Ok(cache) => cache.name(), + Err(e) => { + log_cache_error("name", &e.to_string()); + "cache_error".to_string() + } + } + } +} + +impl FileMetadataCache for MutexFileMetadataCache { + fn cache_limit(&self) -> usize { + match 
self.inner.lock() { + Ok(cache) => cache.cache_limit(), + Err(e) => { + log_cache_error("cache_limit", &e.to_string()); + 0 + } + } + } + + fn update_cache_limit(&self, limit: usize) { + match self.inner.lock() { + Ok(cache) => cache.update_cache_limit(limit), + Err(e) => log_cache_error("update_cache_limit", &e.to_string()), + } + } + + fn list_entries(&self) -> std::collections::HashMap { + match self.inner.lock() { + Ok(cache) => cache.list_entries(), + Err(e) => { + log_cache_error("list_entries", &e.to_string()); + std::collections::HashMap::new() + } + } + } +} + +#[cfg(test)] +mod strip_page_index_tests { + use super::*; + use datafusion::arrow::array::{Int64Array, RecordBatch}; + use datafusion::arrow::datatypes::{DataType, Field, Schema}; + use datafusion::parquet::arrow::arrow_reader::{ArrowReaderMetadata, ArrowReaderOptions}; + use datafusion::parquet::arrow::ArrowWriter; + use datafusion::parquet::file::properties::{EnabledStatistics, WriterProperties}; + use object_store::ObjectMeta; + use prost::bytes::Bytes; + + fn parquet_with_page_index() -> Bytes { + let schema = Arc::new(Schema::new(vec![Field::new("v", DataType::Int64, false)])); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int64Array::from((0..4096i64).collect::>()))], + ) + .unwrap(); + let props = WriterProperties::builder() + .set_statistics_enabled(EnabledStatistics::Page) + .set_data_page_row_count_limit(128) + .build(); + let mut buf: Vec = Vec::new(); + let mut w = ArrowWriter::try_new(&mut buf, schema, Some(props)).unwrap(); + w.write(&batch).unwrap(); + w.close().unwrap(); + Bytes::from(buf) + } + + fn object_meta(bytes: &Bytes) -> ObjectMeta { + ObjectMeta { + location: Path::from("data.parquet"), + last_modified: chrono::Utc::now(), + size: bytes.len() as u64, + e_tag: None, + version: None, + } + } + + fn full_index_entry(bytes: &Bytes) -> CachedFileMetadataEntry { + let meta = ArrowReaderMetadata::load( + &bytes.clone(), + 
ArrowReaderOptions::new().with_page_index(true), + ) + .unwrap(); + let pq = meta.metadata().clone(); + assert!(pq.column_index().is_some() && pq.offset_index().is_some()); + CachedFileMetadataEntry::new(object_meta(bytes), Arc::new(CachedParquetMetaData::new(pq))) + } + + fn page_index_present(entry: &CachedFileMetadataEntry) -> bool { + let cached = entry + .file_metadata + .as_any() + .downcast_ref::() + .unwrap(); + let m = cached.parquet_metadata(); + m.column_index().is_some() || m.offset_index().is_some() + } + + #[test] + fn put_strips_page_index_and_get_returns_footer_only() { + let bytes = parquet_with_page_index(); + let entry = full_index_entry(&bytes); + assert!(page_index_present(&entry), "precondition: entry has page index"); + + let cache = MutexFileMetadataCache::new(DefaultFilesMetadataCache::new(64 * 1024 * 1024)); + let key = Path::from("data.parquet"); + cache.put(&key, entry); + + let got = cache.get(&key).expect("entry must be retrievable"); + assert!(!page_index_present(&got), "cached entry must be footer-only after put"); + let cached = got + .file_metadata + .as_any() + .downcast_ref::() + .unwrap(); + let m = cached.parquet_metadata(); + assert!(m.num_row_groups() > 0); + assert!(m.row_group(0).column(0).statistics().is_some(), "footer stats must survive"); + } + + #[test] + fn strip_is_noop_for_footer_only_entry() { + let bytes = parquet_with_page_index(); + let meta = ArrowReaderMetadata::load( + &bytes.clone(), + ArrowReaderOptions::new().with_page_index(false), + ) + .unwrap(); + let pq = meta.metadata().clone(); + assert!(pq.column_index().is_none() && pq.offset_index().is_none()); + let entry = CachedFileMetadataEntry::new( + object_meta(&bytes), + Arc::new(CachedParquetMetaData::new(Arc::clone(&pq))), + ); + let stripped = strip_page_index(entry); + let cached = stripped + .file_metadata + .as_any() + .downcast_ref::() + .unwrap(); + assert!( + Arc::ptr_eq(cached.parquet_metadata(), &pq), + "footer-only entry must be returned 
unchanged (same Arc)" + ); + } +} diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/mod.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/mod.rs new file mode 100644 index 0000000000000..31cd378c69518 --- /dev/null +++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/mod.rs @@ -0,0 +1,37 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +//! Cache infrastructure for the analytics backend. +//! +//! # Structure +//! +//! - [`eviction_policy`] — pluggable eviction policy trait (`CachePolicy`) and +//! built-in implementations (`LruPolicy`, `LfuPolicy`). Add new policies here +//! (e.g. S3-FIFO) without touching the cache implementations. +//! - [`metadata_cache`] — `MutexFileMetadataCache`: wraps DataFusion's +//! `DefaultFilesMetadataCache` with hit/miss counters and enforces the +//! footer-only invariant via `strip_page_index` at every `put`. +//! - [`statistics_cache`] — `CustomStatisticsCache`: byte-bounded LRU cache +//! for per-file `Statistics` (row-group min/max/null-count). +//! - [`custom_cache_manager`] — `CustomCacheManager`: ties the metadata and +//! statistics caches together for pre-warming and lifecycle management. +//! - [`page_index`] — scoped parquet page-index caches (ColumnIndex + +//! OffsetIndex), cell-granular and backed by `BoundedCache` / +//! `Box`. + +pub mod custom_cache_manager; +pub mod eviction_policy; +pub mod metadata_cache; +pub mod page_index; +pub mod statistics_cache; + +// Flat re-exports so existing call sites keep working without path changes. 
+pub use custom_cache_manager::CustomCacheManager; +pub use eviction_policy::{CachePolicy, CacheResult, PolicyType, create_policy}; +pub use metadata_cache::{MutexFileMetadataCache, CACHE_TYPE_METADATA, CACHE_TYPE_STATS}; +pub use statistics_cache::{CustomStatisticsCache, compute_parquet_statistics}; diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/cache_keys.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/cache_keys.rs new file mode 100644 index 0000000000000..8bcc121a1646e --- /dev/null +++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/cache_keys.rs @@ -0,0 +1,65 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +//! Cache key types for the two scoped page-index caches. + +use std::fmt::Display; +use std::sync::Arc; + +use parquet::file::page_index::offset_index::OffsetIndexMetaData; + +/// ColumnIndex cache key — one decoded `ColumnIndexMetaData` **cell** per +/// `(file, column, row-group)`. The page index for a given column+RG is an +/// intrinsic property of the file: it is identical no matter which *other* +/// columns a query filters on, or which literal a predicate uses. Keying at the +/// cell granularity means a column's per-page string min/max is decoded and +/// stored **once per file**, then reused by every query whose predicate touches +/// that column — regardless of the predicate-column *combination* or the +/// surviving-row-group *set*. (The prior set-keyed design re-decoded and +/// re-stored a column for every distinct predicate/RG combination — storage grew +/// with query diversity, not schema width.) +/// +/// Both scan paths resolve the same `(file, col, rg)` for the same logical +/// request, so cells are shared across paths → cross-path sharing. 
+#[derive(Clone, PartialEq, Eq, Hash, Debug)] +pub(crate) struct CiCellKey { + pub(crate) path: Arc, + pub(crate) col: usize, + pub(crate) rg: usize, +} + +impl Display for CiCellKey { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}:{}:{}", self.path, self.col, self.rg) + } +} + +/// OffsetIndex cache key — one decoded value per `(file, column)`, where the +/// value is that column's `OffsetIndexMetaData` for **every** row group (a +/// `Vec` indexed by RG). Unlike the ColumnIndex, the OffsetIndex is read at scan +/// time for any RG DataFusion chooses to scan — and DataFusion picks that set +/// itself, after our load — so a column's OffsetIndex must always cover all RGs +/// (an empty entry on a scanned RG panics / breaks reads). RG can therefore never +/// be a key axis here; the cell is the whole-column, all-RG offset index. Keyed +/// only on `(file, col)`, so any query that reads a column reuses its offset +/// index irrespective of projection or predicate. +#[derive(Clone, PartialEq, Eq, Hash, Debug)] +pub(crate) struct OiCellKey { + pub(crate) path: Arc, + pub(crate) col: usize, +} + +impl Display for OiCellKey { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}:{}", self.path, self.col) + } +} + +/// One column's OffsetIndex across all row groups (indexed by RG). The value type +/// of [`OFFSET_INDEX_CACHE`]. 
+pub(crate) type OiColumn = Vec; diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/cache_store.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/cache_store.rs new file mode 100644 index 0000000000000..90977cfe1364a --- /dev/null +++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/cache_store.rs @@ -0,0 +1,195 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +//! Byte-bounded cache with a pluggable eviction policy, used by the two +//! scoped page-index caches ([`COLUMN_INDEX_CACHE`] / [`OFFSET_INDEX_CACHE`] +//! defined in `mod.rs`). + +use std::fmt::Display; +use std::hash::Hash; +use std::sync::atomic::{AtomicU64, AtomicUsize, Ordering::Relaxed}; +use std::sync::Mutex; + +use dashmap::DashMap; + +use crate::cache::eviction_policy::{CachePolicy, PolicyType, create_policy}; + +/// Default byte budget for EACH scoped cache, used until the caller sets one from +/// the runtime's configured limit (see [`set_column_index_cache_limit`] / +/// [`set_offset_index_cache_limit`]). The two caches are budgeted independently: +/// the ColumnIndex (per-page string min/max) is the heavy one and the OffsetIndex +/// (fixed-width page offsets) is tiny, so they get separate, separately-tunable +/// limits rather than sharing one number. +/// +/// TODO : configure via settings +pub(crate) const DEFAULT_SCOPED_CACHE_LIMIT: usize = 150 * 1024 * 1024; + +/// Snapshot of one scoped cache's counters plus occupancy. Surfaced on +/// node-stats and used by tests to assert hits/misses without `Arc::ptr_eq`. 
+#[derive(Debug, Clone, Copy, PartialEq, Eq, Default)] +pub struct ScopedCacheStats { + pub hits: u64, + pub misses: u64, + pub evictions: u64, + pub entries: usize, + pub used_bytes: usize, + pub limit_bytes: usize, +} + +/// Byte-bounded cache with a pluggable eviction policy. +/// +/// `DashMap` shards the value store so concurrent `get` calls are not serialized +/// by a global map lock. The eviction policy (ordering metadata + victim selection) +/// sits behind a `Mutex` that is locked on `insert`, eviction and `set_limit`, and briefly +/// on `get` to record the access. Counters are atomics so `stats()` never takes that lock. +pub(super) struct BoundedCache +where + K: Eq + Hash + Clone + Display + Send + Sync + 'static, + V: Clone + Send + Sync + 'static, +{ + /// Primary value store — concurrent reads with no global lock. + map: DashMap, + /// Reverse map: policy string key → typed cache key, needed to resolve + /// eviction candidates (the policy works with `String` keys). + reverse: DashMap, + /// Eviction policy — behind a Mutex since mutation (on_insert/select_for_eviction) + /// is not concurrent-safe. Also locked briefly by `get` to record the access. 
+ policy: Mutex>, + limit: AtomicUsize, + // lock-free counters + hits: AtomicU64, + misses: AtomicU64, + evictions: AtomicU64, + used_bytes: AtomicUsize, +} + +impl BoundedCache +where + K: Eq + Hash + Clone + Display + Send + Sync + 'static, + V: Clone + Send + Sync + 'static, +{ + pub(super) fn new(limit: usize, policy_type: PolicyType) -> Self { + Self { + map: DashMap::new(), + reverse: DashMap::new(), + policy: Mutex::new(create_policy(policy_type)), + limit: AtomicUsize::new(limit), + hits: AtomicU64::new(0), + misses: AtomicU64::new(0), + evictions: AtomicU64::new(0), + used_bytes: AtomicUsize::new(0), + } + } + + pub(super) fn get(&self, key: &K) -> Option { + match self.map.get(key) { + Some(entry) => { + let size = entry.1; + if let Ok(mut p) = self.policy.lock() { + p.on_access(&key.to_string(), size); + } + self.hits.fetch_add(1, Relaxed); + Some(entry.0.clone()) + } + None => { + self.misses.fetch_add(1, Relaxed); + None + } + } + } + + pub(super) fn insert(&self, key: K, value: V, size: usize) { + let limit = self.limit.load(Relaxed); + if size > limit { + return; + } + let key_str = key.to_string(); + if let Some(old) = self.map.insert(key.clone(), (value, size)) { + self.used_bytes.fetch_sub(old.1, Relaxed); + } + self.reverse.insert(key_str.clone(), key); + self.used_bytes.fetch_add(size, Relaxed); + if let Ok(mut p) = self.policy.lock() { + p.on_insert(&key_str, size); + } + self.evict(); + } + + fn evict(&self) { + let limit = self.limit.load(Relaxed); + let used = self.used_bytes.load(Relaxed); + if used <= limit { + return; + } + let candidates = if let Ok(p) = self.policy.lock() { + p.select_for_eviction(used - limit) + } else { + return; + }; + for key_str in candidates { + if let Some((_, typed_key)) = self.reverse.remove(&key_str) { + if let Some((_, (_, size))) = self.map.remove(&typed_key) { + self.used_bytes.fetch_sub(size, Relaxed); + self.evictions.fetch_add(1, Relaxed); + if let Ok(mut p) = self.policy.lock() { + 
p.on_remove(&key_str); + } + } + } + } + } + + pub(super) fn set_limit(&self, limit: usize) { + self.limit.store(limit, Relaxed); + self.evict(); + } + + pub(super) fn clear_keep_limit(&self) { + self.map.clear(); + self.reverse.clear(); + if let Ok(mut p) = self.policy.lock() { + p.clear(); + } + self.hits.store(0, Relaxed); + self.misses.store(0, Relaxed); + self.evictions.store(0, Relaxed); + self.used_bytes.store(0, Relaxed); + } + + /// Remove all entries whose string key starts with `prefix` (used to evict + /// all cells for a given file path when the file is deleted/replaced). + pub(super) fn evict_by_prefix(&self, prefix: &str) { + let victims: Vec = self.reverse + .iter() + .filter(|e| e.key().starts_with(prefix)) + .map(|e| e.key().clone()) + .collect(); + for key_str in victims { + if let Some((_, typed_key)) = self.reverse.remove(&key_str) { + if let Some((_, (_, size))) = self.map.remove(&typed_key) { + self.used_bytes.fetch_sub(size, Relaxed); + self.evictions.fetch_add(1, Relaxed); + if let Ok(mut p) = self.policy.lock() { + p.on_remove(&key_str); + } + } + } + } + } + + pub(super) fn stats(&self) -> ScopedCacheStats { + ScopedCacheStats { + hits: self.hits.load(Relaxed), + misses: self.misses.load(Relaxed), + evictions: self.evictions.load(Relaxed), + entries: self.map.len(), + used_bytes: self.used_bytes.load(Relaxed), + limit_bytes: self.limit.load(Relaxed), + } + } +} diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/column_schema_resolver.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/column_schema_resolver.rs new file mode 100644 index 0000000000000..bc4e711003a95 --- /dev/null +++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/column_schema_resolver.rs @@ -0,0 +1,116 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible 
open source license. + */ + +//! Predicate-column name → parquet leaf-index resolution. +//! +//! Resolution is done against the file's OWN schema (derived from the footer) +//! rather than the shared table schema to ensure correct leaf indices under +//! schema evolution (see [`resolve_predicate_parquet_columns`] for details). + +use std::collections::HashSet; +use std::sync::Arc; + +use arrow::datatypes::SchemaRef; +use datafusion::parquet::arrow::arrow_reader::statistics::StatisticsConverter; +use datafusion::parquet::file::metadata::ParquetMetaData; +use parquet::arrow::parquet_to_arrow_schema; + +/// Map the query's predicate-column names to **this file's** parquet leaf +/// indices, resolving against the file's OWN schema so the indices are correct +/// even when the file is missing columns (schema evolution). +/// +/// # Why the file's own schema, not the shared table schema +/// +/// `StatisticsConverter`/`parquet_column` map a column by finding its position in +/// the supplied arrow schema and then matching that position to a parquet leaf +/// (`get_column_root_idx`). The table schema is the **union** of all +/// files' columns [N]; a given file may physically contain fewer[M] (e.g. +/// the merged file has M leaves — the absent columns are all-null and not +/// written). Resolving against the N-field union therefore maps a column to the +/// WRONG leaf in a M-leaf file. We would then build +/// the scoped ColumnIndex/OffsetIndex at the wrong leaf and leave the real one an +/// empty placeholder — and DataFusion's pruner, which resolves against the file's +/// physical schema, reads the real leaf and panics on the empty `page_locations` +/// (`statistics.rs` `page_locations.last().unwrap()`). +/// +/// Deriving the arrow schema from the file footer (`parquet_to_arrow_schema`) +/// gives a 1:1 field↔leaf correspondence for that file, so the resolved index +/// matches what DataFusion dereferences. Columns absent from the file are skipped. 
+pub fn resolve_predicate_parquet_columns( + _arrow_schema: &SchemaRef, + metadata: &ParquetMetaData, + predicate_column_names: &[String], +) -> Vec { + let parquet_schema = metadata.file_metadata().schema_descr(); + // Per-file arrow schema: 1:1 with this file's parquet leaves, so a column's + // arrow position maps to its true leaf. (The passed `_arrow_schema` is the + // union table schema and is intentionally NOT used for index resolution — + // see the doc comment.) + let file_arrow_schema = match parquet_to_arrow_schema( + parquet_schema, + metadata.file_metadata().key_value_metadata(), + ) { + Ok(s) => Arc::new(s), + // If we can't derive the file schema, fall back to the union schema; the + // caller still falls back to footer-only on any downstream mismatch. + Err(_) => return resolve_with_schema(_arrow_schema, metadata, predicate_column_names), + }; + resolve_with_schema(&file_arrow_schema, metadata, predicate_column_names) +} + +/// Resolve TWO name-sets (e.g. predicate columns and projection columns) against +/// the same file in one pass. Deriving the per-file arrow schema +/// (`parquet_to_arrow_schema`) is the dominant cost of name→leaf resolution on +/// wide schemas (it rebuilds the whole file's Schema); the two callers in the +/// indexed setup loop previously each rebuilt it, so doing it once here removes a +/// full schema reconstruction per file per query. Pure refactor — each returned +/// Vec is identical to calling `resolve_predicate_parquet_columns` separately. 
+pub fn resolve_predicate_parquet_columns_pair( + union_schema: &SchemaRef, + metadata: &ParquetMetaData, + names_a: &[String], + names_b: &[String], +) -> (Vec, Vec) { + let parquet_schema = metadata.file_metadata().schema_descr(); + match parquet_to_arrow_schema( + parquet_schema, + metadata.file_metadata().key_value_metadata(), + ) { + Ok(s) => { + let file_arrow_schema = Arc::new(s); + ( + resolve_with_schema(&file_arrow_schema, metadata, names_a), + resolve_with_schema(&file_arrow_schema, metadata, names_b), + ) + } + // Same fallback as the single-name path: resolve against the union schema. + Err(_) => ( + resolve_with_schema(union_schema, metadata, names_a), + resolve_with_schema(union_schema, metadata, names_b), + ), + } +} + +/// Resolve predicate column names → parquet leaf indices against a specific arrow +/// schema, via the same `StatisticsConverter` mapping DataFusion's pruner uses. +pub(super) fn resolve_with_schema( + arrow_schema: &SchemaRef, + metadata: &ParquetMetaData, + predicate_column_names: &[String], +) -> Vec { + let parquet_schema = metadata.file_metadata().schema_descr(); + let mut set = HashSet::new(); + for name in predicate_column_names { + if let Ok(conv) = StatisticsConverter::try_new(name, arrow_schema, parquet_schema) { + if let Some(idx) = conv.parquet_column_index() { + set.insert(idx); + } + } + } + set.into_iter().collect() +} diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/mod.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/mod.rs new file mode 100644 index 0000000000000..760f46d6d65e0 --- /dev/null +++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/mod.rs @@ -0,0 +1,188 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +//! 
Scoped parquet page-index caches — TWO caches, by consumer.
//!
//! # Why this exists
//!
//! Parquet metadata loading pulls the **entire page index** — `ColumnIndex`
//! (per-page min/max; the per-page *string* min/max is the heap hog) plus
//! `OffsetIndex` (per-page byte offsets), for every column of every row group.
//! On wide schemas this is very memory expensive.
//! The level-1 metadata cache is kept footer-only (see
//! [`crate::cache`]); this module rebuilds a *scoped* page index per query and
//! caches it, shared by both scan paths (the DataFusion `ListingTable` path and
//! the custom indexed-table executor).
//!
//! # Two caches, because the two indexes have different drivers
//!
//! The `ColumnIndex` and `OffsetIndex` are consumed by different parts of
//! DataFusion / parquet, with **different natural cache keys**. Forcing
//! them into one key makes the projection-driven OffsetIndex poison the
//! predicate-driven ColumnIndex's broad cross-path sharing (the failure mode of
//! the prior iteration). So they are split:
//!
//! - **ColumnIndex — predicate-driven.** Read only at *prune* time, and only for
//!   the predicate column being evaluated
//!   (`page_filter::PagesPruningStatistics`, `offset_index[rg][predicate_col]`).
//!   Key: one cached cell per `(file, col, rg)` (`CiCellKey`). A cell does not
//!   depend on what you `SELECT`, so it is shared across scan paths **and**
//!   across queries with different projections, predicate combinations, or
//!   surviving-row-group sets. This is the heavy index (string min/max) and the
//!   big heap win.
//!   Scoped to predicate columns (`NONE` placeholders elsewhere) and, optionally,
//!   to the row groups that pass footer-stats pruning ([`surviving_row_groups`]).
//!
//! - **OffsetIndex — projection-driven.** Read at *scan* time for **projected**
//!   columns (`InMemoryRowGroup::fetch_ranges`, `projection.leaf_included(idx)`),
//!   and at prune time for the predicate column, and at column 0 for the
//!   page-skip metric. Key: one cached cell per `(file, col)` (`OiCellKey`); the
//!   columns requested for a query are `predicate ∪ projection ∪ {0}`. This is
//!   the cheap, fixed-width index (no per-page string stats). Each cell covers
//!   **all row groups** (an empty
//!   OffsetIndex on a row group DataFusion scans panics / breaks reads, and
//!   DataFusion chooses the scanned set itself, after our load — see
//!   HANDOFF_step2_rg_scoping.md §1e).
//!
//! Each cache stores only decoded cells (one `ColumnIndexMetaData` per
//! `(col, rg)`; one `OiColumn` per column) — never a full `ParquetMetaData` (no
//! footer duplication). On lookup the cells are assembled into full-width
//! `ParquetColumnIndex` / `ParquetOffsetIndex` matrices and **grafted** onto the
//! caller's already-resident footer via [`ParquetMetaData::into_builder`] →
//! `set_column_index`/`set_offset_index`.
//!
//! **Consequence for tests:** a lookup returns a *fresh* `Arc`, so `Arc::ptr_eq`
//! is the wrong signal for "served from cache" — assert via the per-cache hit
//! counters ([`column_index_cache_stats`] / [`offset_index_cache_stats`]).
//!
//! ## Correctness / fallback
//!
//! Any failure (file has no page index, a column lacks an index range, a
//! decode/IO error) makes the load return `None`. The caller keeps its
//! footer-only metadata and the pruner conservatively no-ops (scans the whole
//! row group) — never a wrong result.
//!
//! ## Upstream note
//!
//! arrow-rs is moving toward first-class selective metadata decoding
//! (apache/arrow-rs#8643 open; the `ParquetStatisticsPolicy::skip_except` pattern
//! merged in #8797 / #8714 for encoding stats). None yet expose a page-index
//! column/row-group projection, so we hand-roll it with the deprecated
//! [`read_columns_indexes`]/[`read_offset_indexes`] (the only public subset
//! decoders). Migrate to `ParquetMetaDataOptions` when it grows a page-index knob.
+ +pub mod cache_store; +pub mod cache_keys; +pub mod page_index_io; +pub mod column_schema_resolver; + +use cache_store::{BoundedCache, DEFAULT_SCOPED_CACHE_LIMIT}; +use cache_keys::{CiCellKey, OiCellKey, OiColumn}; + +use crate::cache::eviction_policy::PolicyType; +use datafusion::parquet::file::page_index::column_index::ColumnIndexMetaData; +use once_cell::sync::Lazy; + +pub use cache_store::ScopedCacheStats; +pub use page_index_io::{ + load_scoped_page_index, + load_scoped_page_index_cols, + load_scoped_page_index_rgs, + load_page_index_fully_scoped, + surviving_row_groups, +}; +pub use column_schema_resolver::{ + resolve_predicate_parquet_columns, + resolve_predicate_parquet_columns_pair, +}; + +// Process-global caches + +pub(crate) static COLUMN_INDEX_CACHE: Lazy> = + Lazy::new(|| BoundedCache::new(DEFAULT_SCOPED_CACHE_LIMIT, PolicyType::Lru)); + +pub(crate) static OFFSET_INDEX_CACHE: Lazy> = + Lazy::new(|| BoundedCache::new(DEFAULT_SCOPED_CACHE_LIMIT, PolicyType::Lru)); + +/// Set the ColumnIndex cache's byte budget. Called from startup wiring with the +/// configured limit. Idempotent; shrinking evicts immediately. Zero ignored. +pub fn set_column_index_cache_limit(limit: usize) { + if limit > 0 { + COLUMN_INDEX_CACHE.set_limit(limit); + } +} + +/// Set the OffsetIndex cache's byte budget. Called from startup wiring with the +/// configured limit. Idempotent; shrinking evicts immediately. Zero ignored. +pub fn set_offset_index_cache_limit(limit: usize) { + if limit > 0 { + OFFSET_INDEX_CACHE.set_limit(limit); + } +} + +/// Counters + occupancy of the ColumnIndex (predicate-driven) cache. Lock-free. +pub fn column_index_cache_stats() -> ScopedCacheStats { + COLUMN_INDEX_CACHE.stats() +} + +/// Counters + occupancy of the OffsetIndex (projection-driven) cache. Lock-free. +pub fn offset_index_cache_stats() -> ScopedCacheStats { + OFFSET_INDEX_CACHE.stats() +} + +/// Drop all entries and reset counters in BOTH caches, keeping the budgets. 
For +/// operational testing — reset and re-measure without a cluster restart. +pub fn clear_scoped_cache() { + COLUMN_INDEX_CACHE.clear_keep_limit(); + OFFSET_INDEX_CACHE.clear_keep_limit(); +} + +/// Evict all page-index cells for a specific file from both caches. +/// +/// Called when a segment file is deleted or replaced so stale cells don't survive +/// in the cache under the same `(path, col, rg)` key. The page-index caches have +/// no freshness check (unlike the metadata cache's `is_valid_for`), so stale cells +/// from a re-written file would otherwise be served as hits — wrong data. +pub fn evict_file_from_scoped_cache(file_path: &str) { + COLUMN_INDEX_CACHE.evict_by_prefix(file_path); + OFFSET_INDEX_CACHE.evict_by_prefix(file_path); +} + +/// Crate-wide guard so every test that touches the process-global caches mutually +/// excludes (distinct fixtures alone aren't enough — the `InMemory` path is always +/// "data.parquet"). Shared (not per-module) so all cache users serialize. +#[cfg(test)] +pub(crate) static SCOPED_CACHE_TEST_GUARD: std::sync::Mutex<()> = std::sync::Mutex::new(()); + +/// Clear both caches AND restore the default limit on each. +#[cfg(test)] +pub(crate) fn clear_scoped_cache_for_test() { + COLUMN_INDEX_CACHE.clear_keep_limit(); + COLUMN_INDEX_CACHE.set_limit(DEFAULT_SCOPED_CACHE_LIMIT); + OFFSET_INDEX_CACHE.clear_keep_limit(); + OFFSET_INDEX_CACHE.set_limit(DEFAULT_SCOPED_CACHE_LIMIT); +} + +#[cfg(test)] +pub(crate) fn set_column_index_cache_limit_for_test(limit: usize) { + COLUMN_INDEX_CACHE.set_limit(limit); +} + +/// Combined view (sum of both caches) — test-only convenience for assertions that +/// only need "is the scoped machinery doing anything". Production code reads the +/// two caches separately ([`column_index_cache_stats`] / [`offset_index_cache_stats`]). 
+#[cfg(test)] +pub(crate) fn scoped_cache_stats() -> ScopedCacheStats { + let a = column_index_cache_stats(); + let b = offset_index_cache_stats(); + ScopedCacheStats { + hits: a.hits + b.hits, + misses: a.misses + b.misses, + evictions: a.evictions + b.evictions, + entries: a.entries + b.entries, + used_bytes: a.used_bytes + b.used_bytes, + limit_bytes: a.limit_bytes.max(b.limit_bytes), + } +} diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/page_index_io.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/page_index_io.rs new file mode 100644 index 0000000000000..e2fd3f8c7695b --- /dev/null +++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/page_index_io.rs @@ -0,0 +1,1626 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +//! Page-index load entry points and their supporting internals. +//! +//! The four public `load_scoped_page_index*` functions are the only callers of +//! the cache machinery; all decoding, cache lookup, and grafting happens here. 
+ +use std::collections::{HashMap, HashSet}; +use std::mem; +use std::ops::Range; +use std::sync::Arc; + +use arrow::datatypes::SchemaRef; +use datafusion::parquet::errors::{ParquetError, Result as ParquetResult}; +use datafusion::parquet::file::metadata::{ + ColumnChunkMetaData, OffsetIndexBuilder, ParquetColumnIndex, ParquetMetaData, + ParquetOffsetIndex, +}; +use datafusion::parquet::file::page_index::column_index::ColumnIndexMetaData; +use datafusion::parquet::file::page_index::index_reader::{ + read_columns_indexes, read_offset_indexes, +}; +use datafusion::parquet::file::reader::{ChunkReader, Length}; +use object_store::ObjectStore; +use parquet::file::page_index::offset_index::OffsetIndexMetaData; +use prost::bytes::{buf, Buf, Bytes}; + +use super::cache_keys::{CiCellKey, OiCellKey, OiColumn}; +use super::{COLUMN_INDEX_CACHE, OFFSET_INDEX_CACHE}; + +/// Load + graft a scoped page index: ColumnIndex for `predicate_cols` (all RGs), +/// OffsetIndex for all columns/all RGs. The Step-1 baseline. +pub async fn load_scoped_page_index( + store: &Arc, + location: &object_store::path::Path, + footer_meta: &Arc, + predicate_cols: &[usize], +) -> Option> { + attach_scoped_page_index_to_metadata(store, location, footer_meta, predicate_cols, None, None).await +} + +/// Like [`load_scoped_page_index`], but the ColumnIndex is built only for the row +/// groups in `surviving_rgs` (footer-stats survivors — [`surviving_row_groups`]); +/// other RGs get a `NONE` ColumnIndex placeholder. OffsetIndex stays all-columns. 
+pub async fn load_scoped_page_index_rgs( + store: &Arc, + location: &object_store::path::Path, + footer_meta: &Arc, + predicate_cols: &[usize], + surviving_rgs: &[usize], +) -> Option> { + attach_scoped_page_index_to_metadata(store, location, footer_meta, predicate_cols, Some(surviving_rgs), None).await +} + +/// Like [`load_scoped_page_index`], but the OffsetIndex is built only for +/// `projection_cols` (the loader unions in the predicate columns + column 0 +/// defensively); other columns get an empty placeholder. ColumnIndex stays +/// all-RG. See [`OiKey`] for which columns must be real and why. +pub async fn load_scoped_page_index_cols( + store: &Arc, + location: &object_store::path::Path, + footer_meta: &Arc, + predicate_cols: &[usize], + projection_cols: &[usize], +) -> Option> { + attach_scoped_page_index_to_metadata(store, location, footer_meta, predicate_cols, None, Some(projection_cols)).await +} + +/// Fully scoped: ColumnIndex RG-scoped to `surviving_rgs`, OffsetIndex +/// column-scoped to `projection_cols` (∪ predicate ∪ {0}). The Step-2 target both +/// scan paths call once they know their surviving-RG set and projection. +pub async fn load_page_index_fully_scoped( + store: &Arc, + location: &object_store::path::Path, + footer_meta: &Arc, + predicate_cols: &[usize], + surviving_rgs: &[usize], + projection_cols: &[usize], +) -> Option> { + attach_scoped_page_index_to_metadata( + store, + location, + footer_meta, + predicate_cols, + Some(surviving_rgs), + Some(projection_cols), + ) + .await +} + +// Surviving-RG computation (footer-stats prune; superset of DF's set) - NOT WIRED YET + +/// Compute the row groups that pass footer RG-statistics pruning for `predicate`. +/// +/// A **superset** of the row groups DataFusion will scan (DataFusion applies the +/// same footer-stats pruning plus bloom/range/limit, which only remove more), so +/// scoping the predicate-column ColumnIndex to this set is safe. 
Returns all row +/// groups if the predicate can't be lowered or stats are missing. Deterministic +/// in `(footer_meta, schema, predicate)` → both scan paths agree. +pub fn surviving_row_groups( + footer_meta: &ParquetMetaData, + arrow_schema: &SchemaRef, + predicate: &Arc, +) -> Vec { + use arrow::array::{ArrayRef, BooleanArray, UInt64Array}; + use datafusion::parquet::arrow::arrow_reader::statistics::StatisticsConverter; + use datafusion::physical_optimizer::pruning::{PruningPredicate, PruningStatistics}; + use datafusion::scalar::ScalarValue; + use std::collections::HashSet; + + let num_rgs = footer_meta.num_row_groups(); + let all: Vec = (0..num_rgs).collect(); + if num_rgs == 0 { + return all; + } + + let Ok(pp) = PruningPredicate::try_new(Arc::clone(predicate), Arc::clone(arrow_schema)) else { + return all; + }; + + struct RgStats<'a> { + meta: &'a ParquetMetaData, + schema: &'a SchemaRef, + num_rgs: usize, + } + impl<'a> RgStats<'a> { + fn conv(&self, col: &str) -> Option> { + StatisticsConverter::try_new(col, self.schema, self.meta.file_metadata().schema_descr()) + .ok() + } + } + impl<'a> PruningStatistics for RgStats<'a> { + fn min_values(&self, column: &datafusion::common::Column) -> Option { + self.conv(&column.name)? + .row_group_mins(self.meta.row_groups().iter()) + .ok() + } + fn max_values(&self, column: &datafusion::common::Column) -> Option { + self.conv(&column.name)? + .row_group_maxes(self.meta.row_groups().iter()) + .ok() + } + fn num_containers(&self) -> usize { + self.num_rgs + } + fn null_counts(&self, column: &datafusion::common::Column) -> Option { + self.conv(&column.name)? 
+ .row_group_null_counts(self.meta.row_groups().iter()) + .ok() + .map(|a| Arc::new(a) as ArrayRef) + } + fn row_counts(&self) -> Option { + let counts: Vec = + self.meta.row_groups().iter().map(|rg| rg.num_rows() as u64).collect(); + Some(Arc::new(UInt64Array::from(counts)) as ArrayRef) + } + fn contained( + &self, + _column: &datafusion::common::Column, + _values: &HashSet, + ) -> Option { + None + } + } + + let stats = RgStats { meta: footer_meta, schema: arrow_schema, num_rgs }; + match pp.prune(&stats) { + Ok(mask) => mask + .iter() + .enumerate() + .filter_map(|(i, keep)| if *keep { Some(i) } else { None }) + .collect(), + Err(_) => all, + } +} + +async fn attach_scoped_page_index_to_metadata( + store: &Arc, + location: &object_store::path::Path, + footer_meta: &Arc, + predicate_cols: &[usize], + surviving_rgs: Option<&[usize]>, + projection_cols: Option<&[usize]>, +) -> Option> { + // Nothing to build only when there is NEITHER a predicate (ColumnIndex for + // pruning) NOR an explicit projection (OffsetIndex for page-level IO). A + // match-only query has empty `predicate_cols` but a real, non-empty + // `projection_cols` (the projected columns) — it still needs the OffsetIndex so + // the parquet reader can fetch just the matched rows' pages instead of whole + // column chunks. `projection_cols == None` (all-columns) with no predicate is the + // legacy "nothing requested" case → return None as before. + let has_offset_work = projection_cols.map(|c| !c.is_empty()).unwrap_or(false); + if predicate_cols.is_empty() && !has_offset_work { + return None; + } + // CI and OI are independent — run their IO concurrently. + // CPU setup before each `.await` still runs sequentially (same thread), but + // both `store.get_ranges` calls are in-flight at the same time so wall-clock + // time = max(ci_latency, oi_latency) instead of their sum. 
+ let (ci_result, oi_result) = tokio::join!( + async { + if predicate_cols.is_empty() { + Some(None) + } else { + Some(Some(get_or_build_column_index(store, location, footer_meta, predicate_cols, surviving_rgs).await?)) + } + }, + get_or_build_offset_index(store, location, footer_meta, predicate_cols, projection_cols), + ); + let column_index = ci_result?; + let offset_index = oi_result?; + Some(graft(footer_meta, column_index, offset_index)) +} + +/// Build a fresh `ParquetMetaData` = `footer` with the page-index pair grafted +/// on. Clones the footer to get an owned value for the builder — but with +/// `ParquetMetaData.row_groups` held behind an `Arc` (see the arrow-rs change), +/// that clone is a refcount bump, not a deep copy of every row group's +/// column-chunk metadata. On wide / many-row-group files (e.g. textbench's +/// ~403-col, ~64-RG footers) that deep copy was ~60ms/query; sharing makes the +/// graft effectively free. +fn graft( + footer_meta: &Arc, + column_index: Option, + offset_index: ParquetOffsetIndex, +) -> Arc { + let base = ParquetMetaData::clone(footer_meta); + let rebuilt = base + .into_builder() + .set_column_index(column_index) + .set_offset_index(Some(offset_index)) + .build(); + Arc::new(rebuilt) +} + +// ── ColumnIndex cache lookup + build (per `(file, col, rg)` cell) ──────────── + +/// Assemble the full-width `[rg][col]` `ColumnIndex` matrix (real cells only at +/// `predicate_cols` × built RGs; `NONE` everywhere else) by looking up each +/// `(file, col, rg)` cell in the cache and decoding only the cells that miss. +/// +/// `surviving_rgs == None` builds every RG; `Some(set)` restricts the built RGs +/// to footer-stats survivors ([`surviving_row_groups`]). Either way a cell is +/// keyed solely on `(file, col, rg)`, so it is decoded once per file and reused +/// across every predicate combination and surviving-RG set that touches it. 
+async fn get_or_build_column_index( + store: &Arc, + location: &object_store::path::Path, + footer_meta: &Arc, + predicate_cols: &[usize], + surviving_rgs: Option<&[usize]>, +) -> Option { + let num_rgs = footer_meta.num_row_groups(); + if num_rgs == 0 { + return None; + } + let num_cols = footer_meta.file_metadata().schema_descr().num_columns(); + + debug_assert!( + predicate_cols.iter().all(|&i| i < num_cols), + "predicate_cols contains out-of-bounds index (num_cols={num_cols}): {predicate_cols:?}" + ); + if predicate_cols.iter().any(|&i| i >= num_cols) { + return None; + } + + // Which RGs to build the (heavy) predicate-column ColumnIndex for. + let build_rgs: Vec = match surviving_rgs { + None => (0..num_rgs).collect(), + Some(set) => { + debug_assert!( + set.iter().all(|&r| r < num_rgs), + "surviving_rgs contains out-of-bounds index (num_rgs={num_rgs}): {set:?}" + ); + set.iter().copied().filter(|&r| r < num_rgs).collect() + } + }; + if build_rgs.is_empty() { + // Nothing to build (e.g. an empty survivor set) → footer-only fallback. + return None; + } + + let path: Arc = Arc::from(location.as_ref()); + + // Initially filled NONE for all RGs and all cols - as placeholders + let mut col_index_matrix: ParquetColumnIndex = (0..num_rgs) + .map(|_| (0..num_cols).map(|_| ColumnIndexMetaData::NONE).collect()) + .collect(); + + // Phase 1: serve every needed cell that is already cached; collect misses. + let mut missing_col_rg_matrix: Vec<(usize, usize)> = Vec::new(); // (col, rg) + for &rg in &build_rgs { + for &col in predicate_cols { + let key = CiCellKey { path: path.clone(), col, rg }; + match COLUMN_INDEX_CACHE.get(&key) { + Some(cell) => col_index_matrix[rg][col] = cell, + None => missing_col_rg_matrix.push((col, rg)), + } + } + } + // Phase 2: decode the missing cells (vectored fetch grouped by RG), place + // them in the matrix, and populate the cache. 
+ if !missing_col_rg_matrix.is_empty() { + let built = build_column_index_cells(store, location, footer_meta, &missing_col_rg_matrix).await?; + for cell in built { + debug_assert!( + cell.rg < col_index_matrix.len() && cell.col < col_index_matrix[cell.rg].len(), + "cell ({}, {}) out of matrix bounds ({num_rgs} rgs, {num_cols} cols)", + cell.col, cell.rg, + ); + COLUMN_INDEX_CACHE.insert( + CiCellKey { path: path.clone(), col: cell.col, rg: cell.rg }, + cell.data.clone(), + cell.size, + ); + col_index_matrix[cell.rg][cell.col] = cell.data; + } + } + + Some(col_index_matrix) +} + +struct RgPlan { + rg: usize, + cols: Vec, + chunks: Vec, + range_start: u64, +} + +struct CiCell { + col: usize, + rg: usize, + data: ColumnIndexMetaData, + size: usize, +} + +struct OiCell { + col: usize, + data: OiColumn, + size: usize, +} + +/// Range-read + decode the requested `(col, rg)` ColumnIndex cells, grouping by +/// row group so each RG's columns share one vectored fetch + decode. `None` if +/// any requested column lacks a column-index range (→ footer-only fallback). 
+async fn build_column_index_cells( + store: &Arc, + location: &object_store::path::Path, + footer_meta: &Arc, + col_rg_matrix: &[(usize, usize)], +) -> Option> { + let mut by_rg: HashMap> = HashMap::new(); + for &(col, rg) in col_rg_matrix { + by_rg.entry(rg).or_default().push(col); + } + + let mut plans: Vec = Vec::with_capacity(by_rg.len()); + let mut fetch_ranges: Vec> = Vec::with_capacity(by_rg.len()); + for (rg, cols) in by_rg { + let rgm = footer_meta.row_group(rg); + let chunks: Vec = cols.iter().map(|&i| rgm.column(i).clone()).collect(); + let range = column_index_union(&chunks)?; + plans.push(RgPlan { rg, cols, chunks, range_start: range.start }); + fetch_ranges.push(range); + } + + let buffers = store.get_ranges(location, &fetch_ranges).await.ok()?; + if buffers.len() != fetch_ranges.len() { + return None; + } + + let mut out: Vec = Vec::with_capacity(col_rg_matrix.len()); + for (plan, buf) in plans.iter().zip(buffers.iter()) { + let reader = BufferChunkReader { base: plan.range_start, bytes: buf.clone() }; + // Deprecated but the only PUBLIC column-subset decoder (arrow-rs#8643). + #[allow(deprecated)] + let decoded = read_columns_indexes(&reader, &plan.chunks).ok()??; + if decoded.len() != plan.cols.len() { + return None; + } + let rgm = footer_meta.row_group(plan.rg); + for (entry, &col) in decoded.into_iter().zip(plan.cols.iter()) { + let size = rgm.column(col).column_index_length().unwrap_or(0).max(0) as usize; + out.push(CiCell { col, rg: plan.rg, data: entry, size }); + } + } + Some(out) +} + +// ── OffsetIndex cache lookup + build (per `(file, col)` cell, all RGs) ─────── + +/// Assemble the full-width `[rg][col]` `OffsetIndex` matrix (real entries only at +/// the resolved offset columns; empty placeholders elsewhere) from per-`(file, +/// col)` cells, decoding only the columns that miss. 
+/// +/// The resolved offset-column set is `predicate ∪ projection ∪ {0}` (`projection_cols +/// == None` → all columns); see [`OiCellKey`] for why each must be real. Each +/// cached cell is a column's OffsetIndex across **all** row groups, keyed only on +/// `(file, col)`, so it is decoded once per file and reused across every query +/// that reads that column irrespective of projection or predicate. +async fn get_or_build_offset_index( + store: &Arc, + location: &object_store::path::Path, + footer_meta: &Arc, + predicate_cols: &[usize], + projection_cols: Option<&[usize]>, +) -> Option { + let num_rgs = footer_meta.num_row_groups(); + if num_rgs == 0 { + return None; + } + let num_cols = footer_meta.file_metadata().schema_descr().num_columns(); + + // Resolve which columns need a real OffsetIndex: predicate ∪ projection ∪ {0}, + // clamped. `None` → all columns. + // First column {0} , is always needed as it's used in stats. + let off_cols: Vec = match projection_cols { + None => (0..num_cols).collect(), + Some(proj_cols) => { + let mut set: HashSet = HashSet::new(); + set.insert(0); // metric reads column 0 + for &c in predicate_cols { + set.insert(c); + } + for &c in proj_cols { + set.insert(c); + } + debug_assert!( + set.iter().all(|&c| c < num_cols), + "column index out of bounds (num_cols={num_cols}): {set:?}" + ); + set.into_iter().filter(|&c| c < num_cols).collect() + } + }; + if off_cols.is_empty() { + return None; + } + + let path: Arc = Arc::from(location.as_ref()); + // Placeholder for columns we don't build: a SINGLE page spanning the whole row + // group, NOT an empty page-locations list. A scoped OffsetIndex is grafted as a + // full-width `[rg][col]` matrix; consumers (DataFusion's page pruner, arrow's + // reader, our indexed pruner) index it by absolute column and dereference + // `page_locations` (`.last()`, `[0]`, `windows(2)`). An EMPTY placeholder + // panics those (`page_locations.last().unwrap()` etc.) 
if any path touches a + // column we scoped out — which is hard to predict across every query shape + // (count/agg, SingleCollector prefetch, schema-evolved files). A one-page + // placeholder is always safe to dereference and makes pruning conservatively + // keep the whole RG (1 page = all rows → can't prune), never a wrong result. + let placeholder_for = |rg_idx: usize| -> OffsetIndexMetaData { + let mut b = OffsetIndexBuilder::new(); + b.append_offset_and_size(0, 0); + b.append_row_count(footer_meta.row_group(rg_idx).num_rows()); + b.build() + }; + // The placeholder is identical for every column within a row group (it only + // depends on the RG's row count). Build it ONCE per RG and clone it across the + // columns, instead of constructing `num_cols` identical OffsetIndexMetaData + // (each a heap alloc) per RG. On wide schemas (clickbench ~105 cols) this is the + // bulk of the per-file warm scoped-load cost — only the few scoped columns get + // real data scattered in afterward; the rest stay as clones of this placeholder. + let mut matrix: ParquetOffsetIndex = (0..num_rgs) + .map(|rg| { + let ph = placeholder_for(rg); + vec![ph; num_cols] + }) + .collect(); + + // Phase 1: serve cached columns; collect misses. + let mut missing: Vec = Vec::new(); + for &col in &off_cols { + let key = OiCellKey { path: path.clone(), col }; + match OFFSET_INDEX_CACHE.get(&key) { + Some(column) => scatter_offset_column(&mut matrix, col, &column), + None => missing.push(col), + } + } + + // Phase 2: decode the missing columns (each spanning all RGs), scatter into + // the matrix, and populate the cache. 
+ if !missing.is_empty() { + let built = build_offset_index_columns(store, location, footer_meta, &missing, num_rgs).await?; + for cell in built { + OFFSET_INDEX_CACHE.insert(OiCellKey { path: path.clone(), col: cell.col }, cell.data.clone(), cell.size); + scatter_offset_column_owned(&mut matrix, cell.col, cell.data); + } + } + + Some(matrix) +} + +/// Place a column's all-RG OffsetIndex (indexed by RG) into the matrix at `col`. +fn scatter_offset_column(matrix: &mut ParquetOffsetIndex, col: usize, column: &OiColumn) { + for (rg, entry) in column.iter().enumerate() { + if rg < matrix.len() { + matrix[rg][col] = entry.clone(); + } + } +} + +/// Consuming version of [`scatter_offset_column`] — used after inserting into +/// the cache so we move rather than clone the per-RG entries. +fn scatter_offset_column_owned(matrix: &mut ParquetOffsetIndex, col: usize, column: OiColumn) { + for (rg, entry) in column.into_iter().enumerate() { + if rg < matrix.len() { + matrix[rg][col] = entry; + } + } +} + +/// Range-read + decode the OffsetIndex for each requested column across **every** +/// row group (read-time safety — see [`OiCellKey`]). `None` if any column lacks +/// an offset-index range (→ footer-only fallback). 
+async fn build_offset_index_columns( + store: &Arc, + location: &object_store::path::Path, + footer_meta: &Arc, + cols: &[usize], + num_rgs: usize, +) -> Option> { + struct RgPlan { + chunks: Vec, + range_start: u64, + } + let mut plans: Vec = Vec::with_capacity(num_rgs); + let mut fetch_ranges: Vec> = Vec::with_capacity(num_rgs); + for rg_idx in 0..num_rgs { + let rg = footer_meta.row_group(rg_idx); + let chunks: Vec = cols.iter().map(|&i| rg.column(i).clone()).collect(); + let range = offset_index_union(&chunks)?; + plans.push(RgPlan { chunks, range_start: range.start }); + fetch_ranges.push(range); + } + + let buffers = store.get_ranges(location, &fetch_ranges).await.ok()?; + if buffers.len() != fetch_ranges.len() { + return None; + } + + // Per-column accumulator: one OiColumn slot per requested col, filled RG by RG. + let mut columns: Vec = cols.iter().map(|_| Vec::with_capacity(num_rgs)).collect(); + for (plan, buf) in plans.iter().zip(buffers.iter()) { + let reader = BufferChunkReader { base: plan.range_start, bytes: buf.clone() }; + #[allow(deprecated)] + let decoded = read_offset_indexes(&reader, &plan.chunks).ok()??; + if decoded.len() != cols.len() { + return None; + } + for (k, entry) in decoded.into_iter().enumerate() { + columns[k].push(entry); + } + } + + let mut out: Vec = Vec::with_capacity(cols.len()); + for (k, &col) in cols.iter().enumerate() { + let size = footer_meta + .row_groups() + .iter() + .map(|rg| rg.column(col).offset_index_length().unwrap_or(0).max(0) as usize) + .sum(); + out.push(OiCell { col, data: mem::take(&mut columns[k]), size }); + } + Some(out) +} + +/// Union of `column_index` byte ranges across the given column chunks. `None` if +/// any chunk lacks a column index (we require all predicate columns to have one, +/// else fall back to footer-only). 
+fn column_index_union(chunks: &[ColumnChunkMetaData]) -> Option> { + range_union(chunks, |c| { + let off = u64::try_from(c.column_index_offset()?).ok()?; + let len = u64::try_from(c.column_index_length()?).ok()?; + Some(off..off + len) + }) +} + +/// Union of `offset_index` byte ranges across the given column chunks. +fn offset_index_union(chunks: &[ColumnChunkMetaData]) -> Option> { + range_union(chunks, |c| { + let off = u64::try_from(c.offset_index_offset()?).ok()?; + let len = u64::try_from(c.offset_index_length()?).ok()?; + Some(off..off + len) + }) +} + +fn range_union( + chunks: &[ColumnChunkMetaData], + f: impl Fn(&ColumnChunkMetaData) -> Option>, +) -> Option> { + let mut acc: Option> = None; + for c in chunks { + let r = f(c)?; // any missing range → bail (caller falls back) + acc = Some(match acc { + None => r, + Some(a) => a.start.min(r.start)..a.end.max(r.end), + }); + } + acc +} + +/// A [`ChunkReader`] over an in-memory byte buffer representing the file region +/// `[base, base + bytes.len())`. The arrow-rs page-index readers call +/// `get_bytes(absolute_offset, len)`; we translate into the buffer. +struct BufferChunkReader { + base: u64, + bytes: Bytes, +} + +impl Length for BufferChunkReader { + fn len(&self) -> u64 { + self.base + self.bytes.len() as u64 + } +} + +impl ChunkReader for BufferChunkReader { + type T = buf::Reader; + + fn get_read(&self, start: u64) -> ParquetResult { + let rel = self.rel(start, 0)?; + Ok(self.bytes.slice(rel..).reader()) + } + + fn get_bytes(&self, start: u64, length: usize) -> ParquetResult { + let rel = self.rel(start, length)?; + Ok(self.bytes.slice(rel..rel + length)) + } +} + +impl BufferChunkReader { + /// Translate an absolute file offset `start` to a buffer-relative index. 
+ /// The fork's `read_columns_indexes`/`read_offset_indexes` call `get_bytes` + /// with absolute file offsets (from chunk metadata); `self.base` is the + /// absolute start of the fetched buffer, so `start - base` gives the + /// position within `self.bytes`. + fn rel(&self, start: u64, length: usize) -> ParquetResult { + let rel = start.checked_sub(self.base).ok_or_else(|| { + ParquetError::General(format!( + "page-index read offset {start} precedes buffer base {}", + self.base + )) + })?; + let rel = usize::try_from(rel) + .map_err(|e| ParquetError::General(format!("offset overflow: {e}")))?; + if rel + length > self.bytes.len() { + return Err(ParquetError::General(format!( + "page-index read [{rel}..{}) exceeds buffer of len {}", + rel + length, + self.bytes.len() + ))); + } + Ok(rel) + } +} +#[cfg(test)] +mod tests { + use super::*; + use super::super::{ + clear_scoped_cache_for_test, column_index_cache_stats, offset_index_cache_stats, + scoped_cache_stats, set_column_index_cache_limit_for_test, ScopedCacheStats, + SCOPED_CACHE_TEST_GUARD, + }; + use super::super::column_schema_resolver::{resolve_predicate_parquet_columns, resolve_predicate_parquet_columns_pair}; + use crate::indexed_table::page_pruner::{build_pruning_predicate, PagePruner}; + use arrow::array::{Int32Array, RecordBatch}; + use arrow::datatypes::{DataType, Field, Schema}; + use datafusion::common::ScalarValue; + use datafusion::logical_expr::Operator; + use datafusion::parquet::arrow::arrow_reader::{ + ArrowReaderMetadata, ArrowReaderOptions, RowSelection, RowSelector, + }; + use datafusion::parquet::arrow::ArrowWriter; + use datafusion::parquet::file::properties::{EnabledStatistics, WriterProperties}; + use datafusion::physical_expr::expressions::{BinaryExpr, Column as PhysColumn, Literal}; + use datafusion::physical_expr::PhysicalExpr; + use object_store::memory::InMemory; + use object_store::path::Path as ObjPath; + use object_store::{ObjectStoreExt, PutPayload}; + + use 
super::super::SCOPED_CACHE_TEST_GUARD as CACHE_TEST_GUARD; + + // ── fixtures + expr helpers ────────────────────────────────────────── + + /// 2 columns (`price`, `qty`), 32 rows, 1 row group, 4 pages of 8 rows. + fn two_col_parquet() -> (Bytes, SchemaRef) { + let schema = Arc::new(Schema::new(vec![ + Field::new("price", DataType::Int32, false), + Field::new("qty", DataType::Int32, false), + ])); + let prices: Vec = (0..32).collect(); + let qtys: Vec = (100..132).collect(); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from(prices)), Arc::new(Int32Array::from(qtys))], + ) + .unwrap(); + let props = WriterProperties::builder() + .set_max_row_group_size(32) + .set_data_page_row_count_limit(8) + .set_write_batch_size(8) + .set_statistics_enabled(EnabledStatistics::Page) + .build(); + let mut buf: Vec = Vec::new(); + let mut w = ArrowWriter::try_new(&mut buf, schema.clone(), Some(props)).unwrap(); + w.write(&batch).unwrap(); + w.close().unwrap(); + (Bytes::from(buf), schema) + } + + /// 4 row groups of 10 rows (`id` 0..40, `v` = id*2), page size 5. + fn four_rg_parquet() -> (Bytes, SchemaRef) { + let schema = Arc::new(Schema::new(vec![ + Field::new("id", DataType::Int32, false), + Field::new("v", DataType::Int32, false), + ])); + let ids: Vec = (0..40).collect(); + let vs: Vec = (0..40).map(|x| x * 2).collect(); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from(ids)), Arc::new(Int32Array::from(vs))], + ) + .unwrap(); + let props = WriterProperties::builder() + .set_max_row_group_size(10) + .set_data_page_row_count_limit(5) + .set_write_batch_size(5) + .set_statistics_enabled(EnabledStatistics::Page) + .build(); + let mut buf: Vec = Vec::new(); + let mut w = ArrowWriter::try_new(&mut buf, schema.clone(), Some(props)).unwrap(); + w.write(&batch).unwrap(); + w.close().unwrap(); + (Bytes::from(buf), schema) + } + + /// 4 columns (2 int `n0`,`n1` + 2 wide string `s0`,`s1`), 1 RG, multiple pages. 
+ fn wide4_parquet() -> (Bytes, SchemaRef) { + use arrow::array::StringArray; + let schema = Arc::new(Schema::new(vec![ + Field::new("n0", DataType::Int32, false), + Field::new("n1", DataType::Int32, false), + Field::new("s0", DataType::Utf8, false), + Field::new("s1", DataType::Utf8, false), + ])); + const ROWS: i32 = 256; + let n0: Vec = (0..ROWS).collect(); + let n1: Vec = (0..ROWS).collect(); + let s0: Vec = (0..ROWS).map(|r| format!("s0_{r:05}_padpadpad")).collect(); + let s1: Vec = (0..ROWS).map(|r| format!("s1_{r:05}_padpadpad")).collect(); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(n0)), + Arc::new(Int32Array::from(n1)), + Arc::new(StringArray::from(s0)), + Arc::new(StringArray::from(s1)), + ], + ) + .unwrap(); + let props = WriterProperties::builder() + .set_max_row_group_size(ROWS as usize) + .set_data_page_row_count_limit(32) + .set_write_batch_size(32) + .set_statistics_enabled(EnabledStatistics::Page) + .build(); + let mut buf: Vec = Vec::new(); + let mut w = ArrowWriter::try_new(&mut buf, schema.clone(), Some(props)).unwrap(); + w.write(&batch).unwrap(); + w.close().unwrap(); + (Bytes::from(buf), schema) + } + + async fn stage(bytes: Bytes) -> (Arc, ObjPath) { + let store: Arc = Arc::new(InMemory::new()); + let loc = ObjPath::from("data.parquet"); + store.put(&loc, PutPayload::from_bytes(bytes)).await.unwrap(); + (store, loc) + } + + fn footer_only(bytes: &Bytes) -> Arc { + ArrowReaderMetadata::load(&bytes.clone(), ArrowReaderOptions::new().with_page_index(false)) + .unwrap() + .metadata() + .clone() + } + + fn full_index(bytes: &Bytes) -> Arc { + ArrowReaderMetadata::load(&bytes.clone(), ArrowReaderOptions::new().with_page_index(true)) + .unwrap() + .metadata() + .clone() + } + + fn col(name: &str, idx: usize) -> Arc { + Arc::new(PhysColumn::new(name, idx)) + } + fn lit_int(v: i32) -> Arc { + Arc::new(Literal::new(ScalarValue::Int32(Some(v)))) + } + fn pred(name: &str, idx: usize, op: Operator, v: i32) 
-> Arc { + Arc::new(BinaryExpr::new(col(name, idx), op, lit_int(v))) + } + fn kept(sel: &RowSelection) -> usize { + sel.iter().filter(|s| !s.skip).map(|s| s.row_count).sum() + } + fn ci() -> ScopedCacheStats { + column_index_cache_stats() + } + fn oi() -> ScopedCacheStats { + offset_index_cache_stats() + } + + fn read_selected_column( + bytes: &Bytes, + meta: &Arc, + leaf_col: usize, + selection: RowSelection, + ) -> std::result::Result, String> { + use datafusion::parquet::arrow::arrow_reader::ParquetRecordBatchReaderBuilder; + use datafusion::parquet::arrow::ProjectionMask; + + let arm = ArrowReaderMetadata::try_new(Arc::clone(meta), ArrowReaderOptions::new()) + .map_err(|e| format!("try_new metadata: {e}"))?; + let builder = ParquetRecordBatchReaderBuilder::new_with_metadata(bytes.clone(), arm); + let proj = ProjectionMask::leaves(builder.parquet_schema(), [leaf_col]); + let mut reader = builder + .with_row_groups(vec![0]) + .with_projection(proj) + .with_row_selection(selection) + .build() + .map_err(|e| format!("build reader: {e}"))?; + let mut out = Vec::new(); + while let Some(next) = reader.next() { + let batch = next.map_err(|e| format!("read batch: {e}"))?; + let a = batch + .column(0) + .as_any() + .downcast_ref::() + .ok_or("projected column was not Int32")?; + for i in 0..a.len() { + out.push(a.value(i)); + } + } + Ok(out) + } + + // ── baseline / correctness ──────────────────────────────────────────── + + #[tokio::test] + async fn footer_only_has_no_page_index() { + let (bytes, _schema) = two_col_parquet(); + let fo = footer_only(&bytes); + assert!(fo.column_index().is_none()); + assert!(fo.offset_index().is_none()); + } + + #[tokio::test] + async fn empty_column_set_returns_none() { + let _g = CACHE_TEST_GUARD.lock().unwrap(); + clear_scoped_cache_for_test(); + let (bytes, _schema) = two_col_parquet(); + let (store, loc) = stage(bytes.clone()).await; + let fo = footer_only(&bytes); + assert!(load_scoped_page_index(&store, &loc, &fo, 
&[]).await.is_none()); + assert_eq!(ci().entries, 0); + assert_eq!(oi().entries, 0); + } + + #[tokio::test] + async fn scoped_index_is_predicate_scoped_for_column_index() { + let _g = CACHE_TEST_GUARD.lock().unwrap(); + clear_scoped_cache_for_test(); + let (bytes, schema) = two_col_parquet(); + let (store, loc) = stage(bytes.clone()).await; + let fo = footer_only(&bytes); + + let cols = resolve_predicate_parquet_columns(&schema, &fo, &["price".to_string()]); + assert_eq!(cols, vec![0]); + + let aug = load_scoped_page_index(&store, &loc, &fo, &cols).await.unwrap(); + let c = aug.column_index().unwrap(); + let o = aug.offset_index().unwrap(); + assert!(!matches!(c[0][0], ColumnIndexMetaData::NONE), "predicate col has real CI"); + assert!(matches!(c[0][1], ColumnIndexMetaData::NONE), "non-predicate col CI is NONE"); + assert!( + !o[0][0].page_locations().is_empty() && !o[0][1].page_locations().is_empty(), + "OffsetIndex real for every column (all-col default)" + ); + } + + // The pair-resolve must return exactly what two separate single-name resolves + // return — it only shares the per-file arrow-schema derivation, nothing else. 
+ #[tokio::test] + async fn resolve_pair_equals_two_single_resolves() { + let (bytes, schema) = two_col_parquet(); + let fo = footer_only(&bytes); + let names_a = vec!["price".to_string()]; + let names_b = vec!["qty".to_string(), "price".to_string()]; + let mut single_a = resolve_predicate_parquet_columns(&schema, &fo, &names_a); + let mut single_b = resolve_predicate_parquet_columns(&schema, &fo, &names_b); + let (mut pair_a, mut pair_b) = + resolve_predicate_parquet_columns_pair(&schema, &fo, &names_a, &names_b); + single_a.sort_unstable(); single_b.sort_unstable(); + pair_a.sort_unstable(); pair_b.sort_unstable(); + assert_eq!(pair_a, single_a, "pair predicate result must match single"); + assert_eq!(pair_b, single_b, "pair projection result must match single"); + assert_eq!(pair_a, vec![0]); + assert_eq!(pair_b, vec![0, 1]); + } + + /// Regression: a match()-only query has NO residual predicate columns + /// (`parquet_cols` empty) but DOES project columns (`offset_cols` non-empty). + /// The scoped load must still build the OffsetIndex for the projected column + /// so the parquet reader fetches only matched-row pages, not whole chunks. + /// (Bug: the load short-circuited on empty `parquet_cols`, skipping the + /// OffsetIndex → reader over-read ~2.5× the bytes on `... | stats ... by URL`.) + #[tokio::test] + async fn projection_only_builds_offset_index_without_predicate() { + let _g = CACHE_TEST_GUARD.lock().unwrap(); + clear_scoped_cache_for_test(); + let (bytes, _schema) = two_col_parquet(); // price=col0, qty=col1 + let (store, loc) = stage(bytes.clone()).await; + let fo = footer_only(&bytes); + + // No predicate columns; project qty (col 1). + let parquet_cols: Vec = vec![]; + let offset_cols: Vec = vec![1]; + let aug = load_scoped_page_index_cols(&store, &loc, &fo, &parquet_cols, &offset_cols) + .await + .expect("projection-only load must produce grafted metadata (not None)"); + + // No predicate → no ColumnIndex grafted. 
+ assert!(aug.column_index().is_none(), "no predicate → ColumnIndex absent"); + + // OffsetIndex must be real for the projected col (1) AND col 0 (loader + // always unions in {0}); other behavior unchanged. + let o = aug.offset_index().expect("OffsetIndex must be grafted"); + assert!(!o[0][1].page_locations().is_empty(), "projected col qty has real OffsetIndex"); + assert!(!o[0][0].page_locations().is_empty(), "col 0 OffsetIndex real (always unioned)"); + } + + #[tokio::test] + async fn scoped_pruning_matches_full_index() { + let _g = CACHE_TEST_GUARD.lock().unwrap(); + clear_scoped_cache_for_test(); + let (bytes, schema) = two_col_parquet(); + let (store, loc) = stage(bytes.clone()).await; + let fo = footer_only(&bytes); + let cols = resolve_predicate_parquet_columns(&schema, &fo, &["price".to_string()]); + let aug = load_scoped_page_index(&store, &loc, &fo, &cols).await.unwrap(); + let full = full_index(&bytes); + let pp = build_pruning_predicate(&pred("price", 0, Operator::GtEq, 20), schema.clone()).unwrap(); + let s = PagePruner::new(&schema, Arc::clone(&aug)).prune_rg(&pp, 0, None); + let f = PagePruner::new(&schema, full).prune_rg(&pp, 0, None); + assert_eq!(s.as_ref().map(kept), f.as_ref().map(kept)); + assert_eq!(s.as_ref().map(kept), Some(16)); + } + + /// Schema-evolution fixture: a file whose physical layout is `[extra, price]` + /// (so `price` is parquet leaf **1**), 32 rows / 4 pages. Used to prove the + /// predicate→leaf resolution does NOT depend on a column's position in a wider + /// *union* schema. 
+ fn evolved_extra_price_parquet() -> Bytes { + let schema = Arc::new(Schema::new(vec![ + Field::new("extra", DataType::Int32, false), + Field::new("price", DataType::Int32, false), + ])); + let extra: Vec = (1000..1032).collect(); + let prices: Vec = (0..32).collect(); + let batch = RecordBatch::try_new( + schema.clone(), + vec![Arc::new(Int32Array::from(extra)), Arc::new(Int32Array::from(prices))], + ) + .unwrap(); + let props = WriterProperties::builder() + .set_max_row_group_size(32) + .set_data_page_row_count_limit(8) + .set_write_batch_size(8) + .set_statistics_enabled(EnabledStatistics::Page) + .build(); + let mut buf: Vec = Vec::new(); + let mut w = ArrowWriter::try_new(&mut buf, schema.clone(), Some(props)).unwrap(); + w.write(&batch).unwrap(); + w.close().unwrap(); + Bytes::from(buf) + } + + /// Regression for the schema-evolution wrong-count bug: when the query's + /// (union) schema lists `price` at a DIFFERENT position than the file's + /// physical layout, the predicate must still resolve to the file's TRUE leaf + /// and scoped pruning must match the full-index pruning. Previously the + /// resolver used the union-schema position, scoped the page index at the wrong + /// leaf, and the residual mis-pruned → over-count. + #[tokio::test] + async fn scoped_resolution_is_per_file_under_schema_evolution() { + let _g = CACHE_TEST_GUARD.lock().unwrap(); + clear_scoped_cache_for_test(); + let bytes = evolved_extra_price_parquet(); + let (store, loc) = stage(bytes.clone()).await; + let fo = footer_only(&bytes); + + // The UNION/table schema the query carries: `price` at position 0 — but in + // THIS file `price` is physically leaf 1 (after `extra`). 
+ let union_schema: SchemaRef = Arc::new(Schema::new(vec![ + Field::new("price", DataType::Int32, false), + Field::new("qty", DataType::Int32, false), + Field::new("extra", DataType::Int32, false), + ])); + + // Must resolve to the file's TRUE leaf for `price` = 1, NOT the union + // position 0 (which is `extra` in this file). + let cols = resolve_predicate_parquet_columns(&union_schema, &fo, &["price".to_string()]); + assert_eq!(cols, vec![1], "price must resolve to its per-file leaf (1), not union pos 0"); + + let aug = load_scoped_page_index(&store, &loc, &fo, &cols).await.unwrap(); + let full = full_index(&bytes); + // `price` pages: 0..8,8..16,16..24,24..32; `price >= 20` keeps the last two + // pages (rows 16..32 = 16 rows). Build the pruning predicate against the + // FILE schema (price at index 1) so the converter matches the data. + let file_schema: SchemaRef = Arc::new(Schema::new(vec![ + Field::new("extra", DataType::Int32, false), + Field::new("price", DataType::Int32, false), + ])); + let pp = build_pruning_predicate(&pred("price", 1, Operator::GtEq, 20), file_schema.clone()).unwrap(); + let s = PagePruner::new(&file_schema, Arc::clone(&aug)).prune_rg(&pp, 0, None); + let f = PagePruner::new(&file_schema, full).prune_rg(&pp, 0, None); + assert_eq!(s.as_ref().map(kept), f.as_ref().map(kept), "scoped pruning must match full index"); + assert_eq!(s.as_ref().map(kept), Some(16)); + clear_scoped_cache_for_test(); + } + + /// Page pruning over a column-scoped index must be a SAFE SUPERSET of the + /// full-index pruning — never drop a row the full index would keep (that + /// would be an under-count / lost result). It MAY keep extra rows (the + /// residual mask drops them post-decode), so equality is NOT required and is + /// the wrong invariant. + /// + /// The hazard this guards: with the page index scoped to `price` only, `qty` + /// is a one-page non-panicking OffsetIndex placeholder with a `NONE` + /// ColumnIndex. 
If the pruner TRUSTED that placeholder's (absent) stats it + /// would build a bogus single-page grid for `qty` and could mis-prune. The + /// `page_pruner` fix treats a `NONE`-ColumnIndex column as "no usable stats" + /// (like a schema-evolution-absent column) → it contributes "unknown" and + /// never prunes on `qty`, so the scoped result stays a conservative superset. + #[tokio::test] + async fn scoped_pruning_is_safe_superset_with_placeholdered_residual_col() { + let _g = CACHE_TEST_GUARD.lock().unwrap(); + clear_scoped_cache_for_test(); + // price 0..32 (pages 0..8,8..16,16..24,24..32); qty 100..132 (pages + // 100..108,108..116,116..124,124..132). 1 RG, 4 pages each. + let (bytes, schema) = two_col_parquet(); + let (store, loc) = stage(bytes.clone()).await; + let fo = footer_only(&bytes); + + // Scope the page index to `price` ONLY (mimics the predicate-scoped indexed + // path). `qty` therefore gets the one-page placeholder + NONE ColumnIndex. + let cols = resolve_predicate_parquet_columns(&schema, &fo, &["price".to_string()]); + assert_eq!(cols, vec![0]); + let aug = load_scoped_page_index(&store, &loc, &fo, &cols).await.unwrap(); + let full = full_index(&bytes); + + // Residual references BOTH columns: price >= 16 (full keeps pages 2,3) AND + // qty <= 115 (full keeps pages 0,1) → full intersection prunes to 0 rows. + let price_ge = pred("price", 0, Operator::GtEq, 16); + let qty_le = pred("qty", 1, Operator::LtEq, 115); + let residual: Arc = + Arc::new(BinaryExpr::new(price_ge, Operator::And, qty_le)); + let pp = build_pruning_predicate(&residual, schema.clone()).unwrap(); + + let s_kept = PagePruner::new(&schema, Arc::clone(&aug)).prune_rg(&pp, 0, None).map(|s| kept(&s)); + let f_kept = PagePruner::new(&schema, full).prune_rg(&pp, 0, None).map(|s| kept(&s)); + // Superset invariant: scoped must keep AT LEAST what full keeps (never + // fewer). 
It keeps more here (16 vs 0) because it correctly cannot prune + // the placeholdered `qty` — that's safe; the residual mask removes the + // extras post-decode. A scoped result SMALLER than full would be the real + // bug (lost rows). `None` = "kept everything" (no pruning) = the maximal + // superset, also safe. + let s = s_kept.unwrap_or(usize::MAX); + let f = f_kept.unwrap_or(usize::MAX); + assert!( + s >= f, + "scoped page pruning must be a safe superset of full ({} kept) but kept fewer ({})", + f, s + ); + clear_scoped_cache_for_test(); + } + + #[tokio::test] + async fn scoped_index_reads_non_predicate_projected_column() { + let _g = CACHE_TEST_GUARD.lock().unwrap(); + clear_scoped_cache_for_test(); + let (bytes, schema) = two_col_parquet(); + let (store, loc) = stage(bytes.clone()).await; + let fo = footer_only(&bytes); + let cols = resolve_predicate_parquet_columns(&schema, &fo, &["price".to_string()]); + let aug = load_scoped_page_index(&store, &loc, &fo, &cols).await.unwrap(); + let selection = RowSelection::from(vec![RowSelector::skip(16), RowSelector::select(16)]); + let scoped_vals = read_selected_column(&bytes, &aug, 1, selection.clone()).unwrap(); + let full = full_index(&bytes); + let full_vals = read_selected_column(&bytes, &full, 1, selection).unwrap(); + let expected: Vec = (116..132).collect(); + assert_eq!(scoped_vals, expected); + assert_eq!(scoped_vals, full_vals); + } + + // ── cache behavior: hits, independence, eviction ────────────────────── + + /// Second identical load is a pure hit in BOTH caches; no new cells/bytes. + /// Cells: predicate `price` → 1 CI cell `(col0,rg0)`; all-column OffsetIndex + /// (the default) → 2 OI cells (one per column). 
+ #[tokio::test] + async fn second_load_is_cache_hit() { + let _g = CACHE_TEST_GUARD.lock().unwrap(); + clear_scoped_cache_for_test(); + let (bytes, schema) = two_col_parquet(); + let (store, loc) = stage(bytes.clone()).await; + let fo = footer_only(&bytes); + let cols = resolve_predicate_parquet_columns(&schema, &fo, &["price".to_string()]); + + let _ = load_scoped_page_index(&store, &loc, &fo, &cols).await.unwrap(); + let (c1, o1) = (ci(), oi()); + assert_eq!((c1.hits, c1.misses, c1.entries), (0, 1, 1), "1 CI cell (price,rg0)"); + assert_eq!((o1.hits, o1.misses, o1.entries), (0, 2, 2), "2 OI cells (col0,col1)"); + assert!(c1.used_bytes > 0 && o1.used_bytes > 0); + + let _ = load_scoped_page_index(&store, &loc, &fo, &cols).await.unwrap(); + let (c2, o2) = (ci(), oi()); + assert_eq!((c2.hits, c2.misses, c2.entries, c2.used_bytes), (1, 1, 1, c1.used_bytes)); + assert_eq!((o2.hits, o2.misses, o2.entries, o2.used_bytes), (2, 2, 2, o1.used_bytes)); + } + + /// Distinct predicate columns → distinct CI cells, but the OffsetIndex column + /// cells are SHARED. Both loads default to the all-column OffsetIndex, so the + /// second load re-reads the SAME 2 OI cells from cache (no new cells). This is + /// the whole point of cell-keying: a column's index is stored once per file. 
+ #[tokio::test] + async fn distinct_predicates_share_offset_index() { + let _g = CACHE_TEST_GUARD.lock().unwrap(); + clear_scoped_cache_for_test(); + let (bytes, schema) = two_col_parquet(); + let (store, loc) = stage(bytes.clone()).await; + let fo = footer_only(&bytes); + let c_price = resolve_predicate_parquet_columns(&schema, &fo, &["price".to_string()]); + let c_qty = resolve_predicate_parquet_columns(&schema, &fo, &["qty".to_string()]); + + let _ = load_scoped_page_index(&store, &loc, &fo, &c_price).await.unwrap(); + let _ = load_scoped_page_index(&store, &loc, &fo, &c_qty).await.unwrap(); + + assert_eq!(ci().entries, 2, "distinct predicate cells: (price,rg0) + (qty,rg0)"); + assert_eq!(oi().entries, 2, "all-column OffsetIndex: 2 column cells, shared"); + // Second (qty) load re-read the same 2 OI cells from cache. + assert_eq!(oi().hits, 2); + } + + /// The cell-keying payoff: a predicate that ADDS a column reuses the cell the + /// first predicate already decoded, instead of re-decoding it inside a new + /// set-keyed entry. `price` then `{price, qty}` → `price`'s cell is a HIT; only + /// `qty`'s cell is freshly decoded. (Under the old set-keyed cache this was a + /// full miss that re-decoded `price`.) + #[tokio::test] + async fn adding_predicate_column_reuses_existing_cell() { + let _g = CACHE_TEST_GUARD.lock().unwrap(); + clear_scoped_cache_for_test(); + let (bytes, schema) = two_col_parquet(); + let (store, loc) = stage(bytes.clone()).await; + let fo = footer_only(&bytes); + let c_price = resolve_predicate_parquet_columns(&schema, &fo, &["price".to_string()]); + let c_both = resolve_predicate_parquet_columns( + &schema, + &fo, + &["price".to_string(), "qty".to_string()], + ); + + let _ = load_scoped_page_index(&store, &loc, &fo, &c_price).await.unwrap(); + assert_eq!((ci().hits, ci().misses, ci().entries), (0, 1, 1), "price cell decoded"); + + // Predicate now covers {price, qty}: price's cell hits, qty's cell misses. 
+ let _ = load_scoped_page_index(&store, &loc, &fo, &c_both).await.unwrap(); + assert_eq!( + (ci().hits, ci().misses, ci().entries), + (1, 2, 2), + "price cell reused (hit); only qty cell freshly decoded" + ); + clear_scoped_cache_for_test(); + } + + /// Two predicates on the SAME column with DIFFERENT literals resolve to the + /// same `(file, col)` parquet column, so they share the one CI cell — predicate + /// *value* never multiplies cache entries. (`status>=400` vs `status>=100`.) + #[tokio::test] + async fn different_literals_same_column_share_cell() { + let _g = CACHE_TEST_GUARD.lock().unwrap(); + clear_scoped_cache_for_test(); + let (bytes, schema) = two_col_parquet(); + let (store, loc) = stage(bytes.clone()).await; + let fo = footer_only(&bytes); + // Both predicates are on `price` (col 0) — only the literal differs, which + // never enters the cache key. + let cols = resolve_predicate_parquet_columns(&schema, &fo, &["price".to_string()]); + + let _ = load_scoped_page_index(&store, &loc, &fo, &cols).await.unwrap(); + let _ = load_scoped_page_index(&store, &loc, &fo, &cols).await.unwrap(); + assert_eq!(ci().entries, 1, "same column → one cell regardless of literal"); + assert_eq!(ci().hits, 1); + clear_scoped_cache_for_test(); + } + + /// CI hit/miss accounting across two predicate-column sets. 
+ #[tokio::test] + async fn stats_count_hits_and_misses() { + let _g = CACHE_TEST_GUARD.lock().unwrap(); + clear_scoped_cache_for_test(); + let (bytes, schema) = two_col_parquet(); + let (store, loc) = stage(bytes.clone()).await; + let fo = footer_only(&bytes); + let c_price = resolve_predicate_parquet_columns(&schema, &fo, &["price".to_string()]); + let c_qty = resolve_predicate_parquet_columns(&schema, &fo, &["qty".to_string()]); + + let _ = load_scoped_page_index(&store, &loc, &fo, &c_price).await.unwrap(); + assert_eq!((ci().hits, ci().misses), (0, 1)); + let _ = load_scoped_page_index(&store, &loc, &fo, &c_price).await.unwrap(); + assert_eq!((ci().hits, ci().misses), (1, 1)); + let _ = load_scoped_page_index(&store, &loc, &fo, &c_qty).await.unwrap(); + assert_eq!((ci().hits, ci().misses), (1, 2)); + let _ = load_scoped_page_index(&store, &loc, &fo, &c_price).await.unwrap(); + let _ = load_scoped_page_index(&store, &loc, &fo, &c_qty).await.unwrap(); + let s = ci(); + assert_eq!((s.hits, s.misses, s.entries, s.evictions), (3, 2, 2, 0)); + } + + /// Byte-bounded LRU on the (now cell-keyed) ColumnIndex cache: with the budget + /// sized to hold ~1.5 cells, loading two distinct column cells evicts the LRU + /// one; the cache never exceeds its limit and never degrades to "cache + /// nothing"; the most-recently-used cell survives. + #[tokio::test] + async fn lru_evicts_over_byte_budget() { + let _g = CACHE_TEST_GUARD.lock().unwrap(); + clear_scoped_cache_for_test(); + let (bytes, schema) = two_col_parquet(); + let (store, loc) = stage(bytes.clone()).await; + let fo = footer_only(&bytes); + let c_price = resolve_predicate_parquet_columns(&schema, &fo, &["price".to_string()]); + let c_qty = resolve_predicate_parquet_columns(&schema, &fo, &["qty".to_string()]); + + // Measure one CI cell (predicate `price` = col0 at the single RG), then set + // a budget of ~1.5 cells so a second distinct cell forces an eviction. 
+ let _ = load_scoped_page_index(&store, &loc, &fo, &c_price).await.unwrap(); + let one_cell = ci().used_bytes; + assert!(one_cell > 0); + let budget = one_cell + one_cell / 2; + clear_scoped_cache_for_test(); + set_column_index_cache_limit_for_test(budget); + + let _ = load_scoped_page_index(&store, &loc, &fo, &c_price).await.unwrap(); // cell (col0) + let _ = load_scoped_page_index(&store, &loc, &fo, &c_qty).await.unwrap(); // cell (col1) → evicts col0 + + assert!(ci().used_bytes <= budget, "CI bytes {} must stay within {}", ci().used_bytes, budget); + assert_eq!(ci().entries, 1, "only the most-recent cell fits"); + assert!(ci().evictions >= 1, "the LRU cell must have evicted"); + + // The most-recently-used cell (qty/col1) must still be a hit. + let hits_before = ci().hits; + let _ = load_scoped_page_index(&store, &loc, &fo, &c_qty).await.unwrap(); + assert_eq!(ci().hits, hits_before + 1, "MRU cell must remain cached"); + + clear_scoped_cache_for_test(); + } + + // ── Step 2: RG-scoping the ColumnIndex ──────────────────────────────── + + #[tokio::test] + async fn surviving_row_groups_matches_footer_stats_prune() { + let (bytes, schema) = four_rg_parquet(); + let fo = footer_only(&bytes); + assert_eq!(fo.num_row_groups(), 4); + let p = pred("id", 0, Operator::GtEq, 25); + assert_eq!(surviving_row_groups(&fo, &schema, &p), vec![2, 3]); + let p2 = pred("id", 0, Operator::Lt, 12); + assert_eq!(surviving_row_groups(&fo, &schema, &p2), vec![0, 1]); + } + + #[tokio::test] + async fn rg_scoped_load_builds_column_index_only_for_survivors() { + let _g = CACHE_TEST_GUARD.lock().unwrap(); + clear_scoped_cache_for_test(); + let (bytes, schema) = four_rg_parquet(); + let (store, loc) = stage(bytes.clone()).await; + let fo = footer_only(&bytes); + let cols = resolve_predicate_parquet_columns(&schema, &fo, &["id".to_string()]); + let surviving = vec![2usize, 3usize]; + + let aug = load_scoped_page_index_rgs(&store, &loc, &fo, &cols, &surviving).await.unwrap(); + let c = 
aug.column_index().unwrap(); + let o = aug.offset_index().unwrap(); + assert_eq!(c.len(), 4); + for &rg in &surviving { + assert!(!matches!(c[rg][0], ColumnIndexMetaData::NONE), "survivor RG {rg} real CI"); + } + for &rg in &[0usize, 1usize] { + assert!(matches!(c[rg][0], ColumnIndexMetaData::NONE), "pruned RG {rg} NONE CI"); + } + for rg in 0..4 { + for cc in 0..2 { + assert!(!o[rg][cc].page_locations().is_empty(), "OI real for all rg/col"); + } + } + let full = full_index(&bytes); + let pp = build_pruning_predicate(&pred("id", 0, Operator::GtEq, 25), schema.clone()).unwrap(); + let s = PagePruner::new(&schema, Arc::clone(&aug)).prune_rg(&pp, 2, None); + let f = PagePruner::new(&schema, full).prune_rg(&pp, 2, None); + assert_eq!(s.as_ref().map(kept), f.as_ref().map(kept)); + clear_scoped_cache_for_test(); + } + + #[tokio::test] + async fn rg_scoping_reduces_column_index_bytes() { + let _g = CACHE_TEST_GUARD.lock().unwrap(); + clear_scoped_cache_for_test(); + let (bytes, schema) = four_rg_parquet(); + let (store, loc) = stage(bytes.clone()).await; + let fo = footer_only(&bytes); + let cols = resolve_predicate_parquet_columns(&schema, &fo, &["id".to_string()]); + + let _ = load_scoped_page_index(&store, &loc, &fo, &cols).await.unwrap(); + let all_rg = ci().used_bytes; + clear_scoped_cache_for_test(); + let _ = load_scoped_page_index_rgs(&store, &loc, &fo, &cols, &[2, 3]).await.unwrap(); + assert!(ci().used_bytes < all_rg, "RG-scoped CI bytes {} < all-RG {}", ci().used_bytes, all_rg); + clear_scoped_cache_for_test(); + } + + /// CI cells are keyed per `(col, rg)`. Loading survivors {2,3} caches cells + /// (id,rg2) + (id,rg3); reloading the same survivor set hits both; a different + /// survivor set {0,1} adds two fresh cells. So a column's per-RG index is + /// reused across overlapping survivor sets instead of re-decoded per set. 
+ #[tokio::test] + async fn rg_scoped_key_includes_surviving_rgs() { + let _g = CACHE_TEST_GUARD.lock().unwrap(); + clear_scoped_cache_for_test(); + let (bytes, schema) = four_rg_parquet(); + let (store, loc) = stage(bytes.clone()).await; + let fo = footer_only(&bytes); + let cols = resolve_predicate_parquet_columns(&schema, &fo, &["id".to_string()]); + + let _ = load_scoped_page_index_rgs(&store, &loc, &fo, &cols, &[2, 3]).await.unwrap(); + assert_eq!((ci().misses, ci().entries), (2, 2), "cells (id,rg2)+(id,rg3)"); + let _ = load_scoped_page_index_rgs(&store, &loc, &fo, &cols, &[2, 3]).await.unwrap(); + assert_eq!((ci().hits, ci().entries), (2, 2), "same survivors → both cells hit"); + let _ = load_scoped_page_index_rgs(&store, &loc, &fo, &cols, &[0, 1]).await.unwrap(); + assert_eq!((ci().misses, ci().entries), (4, 4), "new survivors → 2 fresh cells"); + // OI stayed all-columns across all three → 2 column cells, shared. + assert_eq!(oi().entries, 2); + clear_scoped_cache_for_test(); + } + + /// Partial-overlap survivor sets only decode the NEW row groups. Load + /// survivors {2,3} (cells rg2,rg3), then {1,2,3}: rg2+rg3 hit, only rg1 is + /// freshly decoded. Proves RG-scoping reuses per-RG cells across overlapping + /// survivor sets rather than re-decoding the whole set. + #[tokio::test] + async fn overlapping_survivor_sets_decode_only_new_rgs() { + let _g = CACHE_TEST_GUARD.lock().unwrap(); + clear_scoped_cache_for_test(); + let (bytes, schema) = four_rg_parquet(); + let (store, loc) = stage(bytes.clone()).await; + let fo = footer_only(&bytes); + let cols = resolve_predicate_parquet_columns(&schema, &fo, &["id".to_string()]); + + let _ = load_scoped_page_index_rgs(&store, &loc, &fo, &cols, &[2, 3]).await.unwrap(); + assert_eq!((ci().hits, ci().misses, ci().entries), (0, 2, 2), "cells (id,rg2)+(id,rg3)"); + + // {1,2,3}: rg2 & rg3 are cached (2 hits); only rg1 is new (1 miss). 
+ let _ = load_scoped_page_index_rgs(&store, &loc, &fo, &cols, &[1, 2, 3]).await.unwrap(); + assert_eq!( + (ci().hits, ci().misses, ci().entries), + (2, 3, 3), + "rg2+rg3 reused (2 hits); only rg1 freshly decoded" + ); + clear_scoped_cache_for_test(); + } + + /// The combined payoff across BOTH axes: a second query that adds a new + /// predicate column AND scans a wider RG set decodes only the genuinely new + /// `(col, rg)` cells. Uses `wide4` (1 RG) for the column axis and asserts CI + /// cell-level hit/miss deltas. + #[tokio::test] + async fn new_column_combination_caches_only_new_column_cells() { + let _g = CACHE_TEST_GUARD.lock().unwrap(); + clear_scoped_cache_for_test(); + let (bytes, schema) = wide4_parquet(); // n0,n1,s0,s1 — 1 RG + let (store, loc) = stage(bytes.clone()).await; + let fo = footer_only(&bytes); + let c_n0 = resolve_predicate_parquet_columns(&schema, &fo, &["n0".to_string()]); + let c_n0_n1 = resolve_predicate_parquet_columns( + &schema, + &fo, + &["n0".to_string(), "n1".to_string()], + ); + let c_n1_s0 = resolve_predicate_parquet_columns( + &schema, + &fo, + &["n1".to_string(), "s0".to_string()], + ); + + // {n0}: 1 new cell. + let _ = load_scoped_page_index(&store, &loc, &fo, &c_n0).await.unwrap(); + assert_eq!((ci().hits, ci().misses, ci().entries), (0, 1, 1)); + // {n0,n1}: n0 hits, n1 new. + let _ = load_scoped_page_index(&store, &loc, &fo, &c_n0_n1).await.unwrap(); + assert_eq!((ci().hits, ci().misses, ci().entries), (1, 2, 2), "n0 reused; n1 new"); + // {n1,s0}: n1 hits, s0 new. + let _ = load_scoped_page_index(&store, &loc, &fo, &c_n1_s0).await.unwrap(); + assert_eq!((ci().hits, ci().misses, ci().entries), (2, 3, 3), "n1 reused; s0 new"); + clear_scoped_cache_for_test(); + } + + /// OffsetIndex equivalent: different projections cache only the new column + /// cells. Project {s0} (offset cols n1∪s0∪{0}), then {s1} (offset cols + /// n1∪s1∪{0}) — the shared cols (0, n1) hit; only the genuinely new projected + /// column is decoded. 
+ #[tokio::test] + async fn different_projections_cache_only_new_offset_columns() { + let _g = CACHE_TEST_GUARD.lock().unwrap(); + clear_scoped_cache_for_test(); + let (bytes, schema) = wide4_parquet(); // n0=0,n1=1,s0=2,s1=3 — 1 RG + let (store, loc) = stage(bytes.clone()).await; + let fo = footer_only(&bytes); + let pred_cols = resolve_predicate_parquet_columns(&schema, &fo, &["n1".to_string()]); + + // Project s0 (col 2): offset cols = {0, 1(n1), 2(s0)} → 3 new cells. + let _ = load_scoped_page_index_cols(&store, &loc, &fo, &pred_cols, &[2]).await.unwrap(); + assert_eq!((oi().hits, oi().misses, oi().entries), (0, 3, 3), "cols 0,1,2"); + + // Project s1 (col 3): offset cols = {0, 1, 3}. Cols 0 & 1 hit; col 3 new. + let _ = load_scoped_page_index_cols(&store, &loc, &fo, &pred_cols, &[3]).await.unwrap(); + assert_eq!( + (oi().hits, oi().misses, oi().entries), + (2, 4, 4), + "cols 0,1 reused (2 hits); only col 3 freshly decoded" + ); + clear_scoped_cache_for_test(); + } + + // ── Step 2: column-scoping the OffsetIndex ──────────────────────────── + + #[tokio::test] + async fn col_scoped_offset_index_only_for_requested_columns() { + let _g = CACHE_TEST_GUARD.lock().unwrap(); + clear_scoped_cache_for_test(); + let (bytes, schema) = wide4_parquet(); + let (store, loc) = stage(bytes.clone()).await; + let fo = footer_only(&bytes); + let pred_cols = resolve_predicate_parquet_columns(&schema, &fo, &["n1".to_string()]); + assert_eq!(pred_cols, vec![1]); + + let aug = load_scoped_page_index_cols(&store, &loc, &fo, &pred_cols, &[2]).await.unwrap(); + let o = aug.offset_index().unwrap(); + // wide4 has 256 rows / page-size 32 → real columns have multiple pages. 
+ let full = full_index(&bytes); + let real_pages = full.offset_index().unwrap()[0][0].page_locations().len(); + assert!(real_pages > 1, "fixture should have multi-page columns"); + // Scoped columns (predicate 1 ∪ projection 2 ∪ metric 0) carry the REAL + // page index; the rest carry a single whole-RG placeholder page (non-empty + // so any consumer dereference is safe — never empty, which would panic). + for &c in &[0usize, 1, 2] { + assert_eq!(o[0][c].page_locations().len(), real_pages, "col {c} (pred/proj/metric) real OI"); + } + assert_eq!( + o[0][3].page_locations().len(), + 1, + "col 3 (scoped out) OI is a single-page placeholder, not real and not empty" + ); + } + + #[tokio::test] + async fn col_scoped_reads_projected_non_predicate_column() { + let _g = CACHE_TEST_GUARD.lock().unwrap(); + clear_scoped_cache_for_test(); + let (bytes, schema) = two_col_parquet(); + let (store, loc) = stage(bytes.clone()).await; + let fo = footer_only(&bytes); + let pred_cols = resolve_predicate_parquet_columns(&schema, &fo, &["price".to_string()]); + let aug = load_scoped_page_index_cols(&store, &loc, &fo, &pred_cols, &[1]).await.unwrap(); + let selection = RowSelection::from(vec![RowSelector::skip(16), RowSelector::select(16)]); + let scoped_vals = read_selected_column(&bytes, &aug, 1, selection.clone()).unwrap(); + let full = full_index(&bytes); + let full_vals = read_selected_column(&bytes, &full, 1, selection).unwrap(); + let expected: Vec = (116..132).collect(); + assert_eq!(scoped_vals, expected); + assert_eq!(scoped_vals, full_vals); + } + + #[tokio::test] + async fn col_scoping_reduces_offset_index_bytes() { + let _g = CACHE_TEST_GUARD.lock().unwrap(); + clear_scoped_cache_for_test(); + let (bytes, schema) = wide4_parquet(); + let (store, loc) = stage(bytes.clone()).await; + let fo = footer_only(&bytes); + let pred_cols = resolve_predicate_parquet_columns(&schema, &fo, &["n1".to_string()]); + + let _ = load_scoped_page_index(&store, &loc, &fo, 
&pred_cols).await.unwrap(); + let all_cols = oi().used_bytes; + clear_scoped_cache_for_test(); + let _ = load_scoped_page_index_cols(&store, &loc, &fo, &pred_cols, &[2]).await.unwrap(); + assert!(oi().used_bytes < all_cols, "col-scoped OI {} < all-col {}", oi().used_bytes, all_cols); + clear_scoped_cache_for_test(); + } + + /// Cell-keying makes OffsetIndex reuse automatic: an all-columns load caches + /// per-column cells, and a later column-scoped load whose set is covered by + /// those cells hits them — no new entries, no special "collapse to all-columns + /// sentinel" needed (the prior set-keyed design's mechanism). + #[tokio::test] + async fn col_scoping_full_coverage_collapses_to_all_columns_entry() { + let _g = CACHE_TEST_GUARD.lock().unwrap(); + clear_scoped_cache_for_test(); + let (bytes, schema) = two_col_parquet(); + let (store, loc) = stage(bytes.clone()).await; + let fo = footer_only(&bytes); + let pred_cols = resolve_predicate_parquet_columns(&schema, &fo, &["price".to_string()]); + + let _ = load_scoped_page_index(&store, &loc, &fo, &pred_cols).await.unwrap(); + assert_eq!(oi().entries, 2, "all-columns load caches 2 column cells"); + // Project {1}; union {0,1} = both columns, both already cached → 2 hits. + let _ = load_scoped_page_index_cols(&store, &loc, &fo, &pred_cols, &[1]).await.unwrap(); + assert_eq!(oi().entries, 2, "covered columns reuse their cells, no new entries"); + assert_eq!(oi().hits, 2); + clear_scoped_cache_for_test(); + } + + /// The fully-scoped entry point: CI RG-scoped + OI column-scoped together. + #[tokio::test] + async fn fully_scoped_load_combines_both_axes() { + let _g = CACHE_TEST_GUARD.lock().unwrap(); + clear_scoped_cache_for_test(); + let (bytes, schema) = four_rg_parquet(); + let (store, loc) = stage(bytes.clone()).await; + let fo = footer_only(&bytes); + let cols = resolve_predicate_parquet_columns(&schema, &fo, &["id".to_string()]); + + // CI scoped to RGs {2,3}; OI scoped to {0,1} = all 2 cols (collapses to all). 
+ let aug = load_page_index_fully_scoped(&store, &loc, &fo, &cols, &[2, 3], &[1]) + .await + .unwrap(); + let c = aug.column_index().unwrap(); + assert!(matches!(c[0][0], ColumnIndexMetaData::NONE), "RG0 pruned → NONE CI"); + assert!(!matches!(c[2][0], ColumnIndexMetaData::NONE), "RG2 survivor → real CI"); + // CI cells (id,rg2)+(id,rg3) = 2; OI cells (col0)+(col1) = 2. + assert_eq!(ci().entries, 2); + assert_eq!(oi().entries, 2); + clear_scoped_cache_for_test(); + } + + // ── Eviction on file deletion ───────────────────────────────────────────── + + /// Evicting a file removes ALL its CI and OI cells from the caches — + /// a subsequent load for the same path is a miss, not a stale hit. + #[tokio::test] + async fn evict_file_clears_all_cells_for_that_path() { + let _g = CACHE_TEST_GUARD.lock().unwrap(); + clear_scoped_cache_for_test(); + let (bytes, schema) = two_col_parquet(); + let (store, loc) = stage(bytes.clone()).await; + let fo = footer_only(&bytes); + let cols = resolve_predicate_parquet_columns(&schema, &fo, &["price".to_string()]); + + // Warm: 1 CI cell (price,rg0) + 2 OI cells (col0, col1). + let _ = load_scoped_page_index_cols(&store, &loc, &fo, &cols, &[0, 1]).await.unwrap(); + assert_eq!(ci().entries, 1); + assert_eq!(oi().entries, 2); + assert_eq!(ci().misses, 1); + + // Evict the file. + super::super::evict_file_from_scoped_cache(loc.as_ref()); + assert_eq!(ci().entries, 0, "CI cells must be gone after eviction"); + assert_eq!(oi().entries, 0, "OI cells must be gone after eviction"); + + // Reload — must be a miss, not a hit from stale cache. + let _ = load_scoped_page_index_cols(&store, &loc, &fo, &cols, &[0, 1]).await.unwrap(); + assert_eq!(ci().misses, 2, "second load after eviction must be a cache miss"); + assert_eq!(ci().hits, 0, "no hits — eviction prevented serving stale data"); + + clear_scoped_cache_for_test(); + } + + /// Evicting file A does not remove cells for file B. Cross-file isolation. 
+ #[tokio::test] + async fn evict_file_does_not_affect_other_files() { + let _g = CACHE_TEST_GUARD.lock().unwrap(); + clear_scoped_cache_for_test(); + let (bytes, schema) = two_col_parquet(); + let cols = resolve_predicate_parquet_columns(&schema, &footer_only(&bytes), &["price".to_string()]); + + // Stage two identical files at different paths. + let store_a: Arc = Arc::new(object_store::memory::InMemory::new()); + let loc_a = object_store::path::Path::from("file_a.parquet"); + let loc_b = object_store::path::Path::from("file_b.parquet"); + store_a.put(&loc_a, object_store::PutPayload::from_bytes(bytes.clone())).await.unwrap(); + store_a.put(&loc_b, object_store::PutPayload::from_bytes(bytes.clone())).await.unwrap(); + + let fo = footer_only(&bytes); + let _ = load_scoped_page_index_cols(&store_a, &loc_a, &fo, &cols, &[0, 1]).await.unwrap(); + let _ = load_scoped_page_index_cols(&store_a, &loc_b, &fo, &cols, &[0, 1]).await.unwrap(); + assert_eq!(ci().entries, 2, "one CI cell per file"); + assert_eq!(oi().entries, 4, "two OI cells per file"); + + // Evict only file_a. + super::super::evict_file_from_scoped_cache(loc_a.as_ref()); + assert_eq!(ci().entries, 1, "only file_a's CI cell removed"); + assert_eq!(oi().entries, 2, "only file_a's OI cells removed"); + + // file_b's cells are still hits. 
+ let hits_before = ci().hits; + let _ = load_scoped_page_index_cols(&store_a, &loc_b, &fo, &cols, &[0, 1]).await.unwrap(); + assert_eq!(ci().hits, hits_before + 1, "file_b CI cell must still be cached"); + + clear_scoped_cache_for_test(); + } + +} diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/statistics_cache.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/statistics_cache.rs similarity index 100% rename from sandbox/plugins/analytics-backend-datafusion/rust/src/statistics_cache.rs rename to sandbox/plugins/analytics-backend-datafusion/rust/src/cache/statistics_cache.rs diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/indexed_executor.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/indexed_executor.rs index 09a691d7fbeda..5d882c7507754 100644 --- a/sandbox/plugins/analytics-backend-datafusion/rust/src/indexed_executor.rs +++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/indexed_executor.rs @@ -67,7 +67,7 @@ use crate::indexed_table::table_provider::{ EvaluatorFactory, IndexedTableConfig, IndexedTableProvider, SegmentFileInfo, }; -use std::collections::{BTreeSet, HashMap}; +use std::collections::{BTreeSet, HashMap}; use std::fmt; use crate::api::ShardView; @@ -304,7 +304,7 @@ fn collect_predicate_column_indices(extraction: Option<&ExtractionResult>) -> Ve let Some(e) = extraction else { return vec![] }; let mut exprs = Vec::new(); collect_predicate_exprs(&e.tree, &mut exprs); - let mut indices = BTreeSet::new(); + let mut indices = BTreeSet::new(); for expr in &exprs { let _ = expr.apply(|node| { if let Some(col) = node.downcast_ref::() { @@ -315,6 +315,45 @@ fn collect_predicate_column_indices(extraction: Option<&ExtractionResult>) -> Ve } indices.into_iter().collect() } + +fn collect_predicate_column_names( + extraction: Option<&ExtractionResult>, + schema: &SchemaRef, +) -> Vec { + let Some(e) = extraction else { return vec![] }; + let mut exprs = Vec::new(); +
collect_predicate_exprs(&e.tree, &mut exprs); + let mut names = BTreeSet::new(); + for expr in &exprs { + let _ = expr.apply(|node| { + if let Some(col) = node.downcast_ref::() { + if let Some(field) = schema.fields().get(col.index()) { + names.insert(field.name().to_string()); + } + } + Ok(TreeNodeRecursion::Continue) + }); + } + names.into_iter().collect() +} + +fn collect_plan_column_names(plan: &datafusion::logical_expr::LogicalPlan) -> Vec { + let mut names = BTreeSet::new(); + let _ = plan.apply(|node| { + let _ = node.apply_expressions(|expr| { + let _ = expr.apply(|e| { + if let Expr::Column(col) = e { + names.insert(col.name().to_string()); + } + Ok(TreeNodeRecursion::Continue) + }); + Ok(TreeNodeRecursion::Continue) + }); + Ok(TreeNodeRecursion::Continue) + }); + names.into_iter().collect() +} + /// For a tree classified as `SingleCollector`, walk it to find the single /// Collector leaf and return its query bytes. fn single_collector_id(tree: &BoolNode) -> Option { @@ -924,6 +963,38 @@ async unsafe fn execute_indexed_with_context_inner( let predicate_columns = collect_predicate_column_indices(extraction.as_ref()); + // Augment each segment's footer-only metadata with a scoped page index so + // the indexed PagePruner can page-prune. Both predicate (→ ColumnIndex) and + // projection (→ OffsetIndex) are wired — a match()-only query still needs a + // scoped OffsetIndex so the reader fetches only matched pages.
+ let predicate_column_names = collect_predicate_column_names(extraction.as_ref(), &schema); + let projection_column_names = collect_plan_column_names(&logical_plan); + if !predicate_column_names.is_empty() || !projection_column_names.is_empty() { + for segment in segments.iter_mut() { + let (parquet_cols, offset_cols) = + crate::parquet_page_cache::resolve_predicate_parquet_columns_pair( + &schema, + &segment.metadata, + &predicate_column_names, + &projection_column_names, + ); + if parquet_cols.is_empty() && offset_cols.is_empty() { + continue; + } + if let Some(augmented) = crate::parquet_page_cache::load_scoped_page_index_cols( + &store, + &segment.object_path, + &segment.metadata, + &parquet_cols, + &offset_cols, + ) + .await + { + segment.metadata = augmented; + } + } + } + let factory: EvaluatorFactory = match classification { FilterClass::None => { // Predicate-only scan: page-pruned universe, residual applied in diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/indexed_table/parquet_bridge.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/indexed_table/parquet_bridge.rs index 5231187a694e2..65d7dab71ba02 100644 --- a/sandbox/plugins/analytics-backend-datafusion/rust/src/indexed_table/parquet_bridge.rs +++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/indexed_table/parquet_bridge.rs @@ -24,10 +24,13 @@ use std::time::{Duration, Instant}; use datafusion::arrow::datatypes::SchemaRef; use datafusion::common::Result; -use datafusion::datasource::physical_plan::parquet::metadata::DFParquetMetadata; +use datafusion::datasource::physical_plan::parquet::metadata::CachedParquetMetaData; use datafusion::datasource::physical_plan::parquet::{ ParquetAccessPlan, ParquetFileMetrics, ParquetFileReaderFactory, RowGroupAccess, }; +use datafusion::execution::cache::cache_manager::CachedFileMetadataEntry; +use datafusion::parquet::arrow::async_reader::ParquetObjectReader; +use datafusion::parquet::file::metadata::{PageIndexPolicy, 
ParquetMetaDataReader}; use datafusion::datasource::physical_plan::ParquetSource; use datafusion::execution::cache::cache_manager::FileMetadataCache; use datafusion::execution::object_store::ObjectStoreUrl; @@ -48,8 +51,11 @@ use prost::bytes::Bytes; // ── Parquet Metadata Loading ───────────────────────────────────────── -/// Load parquet metadata via DataFusion's `DFParquetMetadata`, consulting the -/// caller-supplied `FileMetadataCache`. +/// Load footer-only parquet metadata, consulting the caller-supplied cache. +/// +/// The object-store `head` call always runs. On a cache hit that is still valid +/// for that `head` result, the cached metadata is returned with no further IO. +/// On a miss we fetch with `PageIndexPolicy::Skip` and cache the fetched footer. pub async fn load_parquet_metadata( store: Arc, location: &object_store::path::Path, @@ -58,18 +64,50 @@ pub async fn load_parquet_metadata( let meta = store .head(location) .await - .map_err(|e| format!("object-store head {}: {}", location, e))?; + .map_err(|e| format!("object-store head {location}: {e}"))?; let size = meta.size; - let pq_meta = DFParquetMetadata::new(&*store, &meta) - .with_file_metadata_cache(Some(metadata_cache)) - .fetch_metadata() - .await - .map_err(|e| format!("load parquet metadata {}: {}", location, e))?; + // Cache hit — reuse the cached metadata; no IO beyond the `head` above. + let pq_meta = if let Some(entry) = metadata_cache.get(location) { + if entry.is_valid_for(&meta) { + entry + .file_metadata + .as_any() + .downcast_ref::() + .map(|cached| Arc::clone(cached.parquet_metadata())) + } else { + None + } + } else { + None + }; + + // Cache miss — fetch footer only, no page index bytes.
+ let pq_meta = match pq_meta { + Some(m) => m, + None => { + let mut reader = ParquetObjectReader::new(Arc::clone(&store), location.clone()); + let fetched = Arc::new( + ParquetMetaDataReader::new() + .with_page_index_policy(PageIndexPolicy::Skip) + .load_and_finish(&mut reader, size) + .await + .map_err(|e| format!("load parquet metadata {location}: {e}"))?, + ); + metadata_cache.put( + location, + CachedFileMetadataEntry::new( + meta, + Arc::new(CachedParquetMetaData::new(Arc::clone(&fetched))), + ), + ); + fetched + } + }; let file_meta = pq_meta.file_metadata(); let schema = parquet_to_arrow_schema(file_meta.schema_descr(), file_meta.key_value_metadata()) - .map_err(|e| format!("parquet_to_arrow_schema {}: {}", location, e))?; + .map_err(|e| format!("parquet_to_arrow_schema {location}: {e}"))?; Ok((Arc::new(schema), size, pq_meta)) } diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/lib.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/lib.rs index c1c853db2c04a..4eb3f603d6fd0 100644 --- a/sandbox/plugins/analytics-backend-datafusion/rust/src/lib.rs +++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/lib.rs @@ -25,9 +25,7 @@ pub mod api; pub mod cache; pub mod cancellation; pub mod cross_rt_stream; -pub mod custom_cache_manager; pub mod datafusion_query_config; -pub mod eviction_policy; pub mod executor; pub mod ffm; pub mod indexed_executor; @@ -50,7 +48,6 @@ pub mod shard_table_provider; pub mod runtime_manager; pub mod schema_coerce; pub mod session_context; -pub mod statistics_cache; pub mod udaf; pub mod udf; pub mod udwf; @@ -58,3 +55,11 @@ pub mod native_node_stats; pub mod search_stats; pub mod stats; pub mod task_monitors; +pub mod scoped_index_optimizer; +pub mod scoped_page_index_reader; + +// Path aliases — old module names still resolve unchanged. 
+pub use cache::statistics_cache; +pub use cache::eviction_policy; +pub use cache::custom_cache_manager; +pub use cache::page_index as parquet_page_cache; diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/scoped_index_optimizer.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/scoped_index_optimizer.rs new file mode 100644 index 0000000000000..9da4e511f869e --- /dev/null +++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/scoped_index_optimizer.rs @@ -0,0 +1,415 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +//! Physical optimizer rule that installs the scoped page-index reader factory on +//! **every** parquet scan in the plan — provider-agnostic. +//! +//! # Why a rule (not a TableProvider) +//! +//! The scoped page-index loader is a property of *how we read parquet*, not of +//! *which TableProvider* produced the scan. Wiring it into a specific provider +//! leaves other scan paths on DataFusion's default reader, which loads the full +//! all-column page index every query and caches none of it. +//! +//! This rule walks the physical plan, finds each parquet `DataSourceExec`, reads +//! the predicate already pushed onto its `ParquetSource`, derives the predicate +//! columns, and swaps in a [`ScopedPageIndexReaderFactory`] scoped to those +//! columns. It runs after DataFusion's own optimizers (which is when filter +//! pushdown has populated `ParquetSource::predicate`), so it works uniformly for +//! `ListingTable`, `ShardTableProvider`, and any future parquet provider. +//! +//! # Replace, do NOT skip-if-present +//! +//! DataFusion's `ParquetFormat::create_physical_plan` ALWAYS pre-installs its own +//! `CachedParquetFileReaderFactory` (the full all-column page-index loader). That +//! 
is exactly the factory we want to replace, so this rule does not skip a scan +//! just because a factory is already set. (A skip-if-present guard was the +//! original bug that made the end-to-end listing scan never use the scoped +//! reader.) The indexed path does not run this rule — it uses its own executor. +//! +//! # No-op cases (left exactly as DataFusion would run them) +//! +//! - A `DataSourceExec` that isn't parquet — skipped. +//! - A parquet scan with no predicate columns in the file schema and a +//! full-schema projection — skipped (nothing to scope; loads on demand as today). + +use std::sync::Arc; + +use datafusion::common::config::ConfigOptions; +use datafusion::common::tree_node::{Transformed, TreeNode}; +use datafusion::common::Result; +use datafusion::datasource::physical_plan::{FileSource, ParquetSource}; +use datafusion::datasource::source::DataSourceExec; +use datafusion::execution::cache::cache_manager::FileMetadataCache; +use datafusion::physical_expr::utils::collect_columns; +use datafusion::physical_optimizer::PhysicalOptimizerRule; +use datafusion::physical_plan::ExecutionPlan; +use datafusion_datasource::file_scan_config::{FileScanConfig, FileScanConfigBuilder}; +use object_store::ObjectStore; + +use crate::scoped_page_index_reader::ScopedPageIndexReaderFactory; + +/// Installs the scoped page-index reader factory on parquet scans. +/// +/// Carries the object store and shared metadata cache because a +/// `PhysicalOptimizerRule` has no access to the session; the caller constructs +/// it from the query's `RuntimeEnv`.
+#[derive(Debug)] +pub struct ScopedPageIndexOptimizer { + store: Arc, + metadata_cache: Arc, +} + +impl ScopedPageIndexOptimizer { + pub fn new(store: Arc, metadata_cache: Arc) -> Self { + Self { store, metadata_cache } + } +} + +impl PhysicalOptimizerRule for ScopedPageIndexOptimizer { + fn optimize( + &self, + plan: Arc, + _config: &ConfigOptions, + ) -> Result> { + let rewritten = plan.transform_up(|node| { + let Some(dse) = node.downcast_ref::() else { + return Ok(Transformed::no(node)); + }; + let Some(config) = dse.data_source().as_ref().downcast_ref::() else { + return Ok(Transformed::no(node)); + }; + let Some(parquet) = (config.file_source().as_ref() as &dyn std::any::Any) + .downcast_ref::() + else { + return Ok(Transformed::no(node)); + }; + + let file_schema = config.file_schema(); + let predicate = parquet.filter(); + + // Predicate column NAMES — empty when there's no pushed-down filter. + let mut predicate_names: Vec = predicate + .as_ref() + .map(|p| { + let mut names: Vec = collect_columns(p) + .into_iter() + .map(|c| c.name().to_string()) + .filter(|n| file_schema.index_of(n).is_ok()) + .collect(); + names.sort(); + names.dedup(); + names + }) + .unwrap_or_default(); + + // Projected column NAMES — the columns this scan actually reads. + // `projected_schema()` reflects the projection pushed into the scan. + let projection_names: Vec = match config.projected_schema() { + Ok(ps) => ps + .fields() + .iter() + .map(|f| f.name().to_string()) + .filter(|n| file_schema.index_of(n).is_ok()) + .collect(), + Err(_) => Vec::new(), + }; + + // Only scope when there's something to scope to. A full-schema scan + // with no predicate gains nothing from scoping — skip it. + // `projected_schema()` returns the full schema when no projection is + // pushed, so we check whether the projection is a strict subset. 
+ let is_projected = projection_names.len() < file_schema.fields().len(); + if predicate_names.is_empty() && !is_projected { + return Ok(Transformed::no(node)); + } + // Pass empty projection when the scan reads all columns — the factory + // will build all-column OffsetIndex (existing behavior). + let projection_names = if is_projected { projection_names } else { Vec::new() }; + + // Build the scoped factory and reinstall the source. The predicate is + // retained for parity but not used for RG scoping (Step 1 builds an + // all-row-group, column-scoped page index — see the reader's docs). + let factory = Arc::new(ScopedPageIndexReaderFactory::new( + Arc::clone(&self.store), + Arc::clone(&self.metadata_cache), + predicate_names, + projection_names, + predicate, + Arc::clone(file_schema), + )); + let new_source = parquet.clone().with_parquet_file_reader_factory(factory); + let new_config = FileScanConfigBuilder::from(config.clone()) + .with_source(Arc::new(new_source)) + .build(); + let new_dse: Arc = DataSourceExec::from_data_source(new_config); + Ok(Transformed::yes(new_dse)) + })?; + Ok(rewritten.data) + } + + fn name(&self) -> &str { + "ScopedPageIndexOptimizer" + } + + /// We swap a reader factory only; the scan's output schema is unchanged. 
+ fn schema_check(&self) -> bool { + true + } +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow::datatypes::{DataType, Field, Schema}; + use datafusion::execution::cache::DefaultFilesMetadataCache; + use datafusion::execution::object_store::ObjectStoreUrl; + use datafusion::logical_expr::Operator; + use datafusion::physical_expr::expressions::{lit, BinaryExpr, Column}; + use datafusion::physical_expr::PhysicalExpr; + use datafusion_datasource::table_schema::TableSchema; + use object_store::memory::InMemory; + use crate::cache::page_index; + use crate::parquet_page_cache::{clear_scoped_cache_for_test, scoped_cache_stats}; + + fn schema() -> Arc { + Arc::new(Schema::new(vec![ + Field::new("a", DataType::Int32, false), + Field::new("b", DataType::Int32, false), + ])) + } + + fn deps() -> (Arc, Arc) { + ( + Arc::new(InMemory::new()), + Arc::new(DefaultFilesMetadataCache::new(64 * 1024 * 1024)), + ) + } + + fn datasource_exec(parquet: ParquetSource) -> Arc { + let config = + FileScanConfigBuilder::new(ObjectStoreUrl::local_filesystem(), Arc::new(parquet)).build(); + DataSourceExec::from_data_source(config) + } + + fn predicate_on_a() -> Arc { + Arc::new(BinaryExpr::new( + Arc::new(Column::new("a", 0)), + Operator::Gt, + lit(5i32), + )) + } + + fn parquet_for(sch: &Arc) -> ParquetSource { + ParquetSource::new(TableSchema::new(sch.clone(), vec![])) + } + + fn get_factory(plan: &Arc) -> Option { + let dse = plan.downcast_ref::()?; + let cfg = (dse.data_source().as_ref() as &dyn std::any::Any) + .downcast_ref::()?; + let pq = (cfg.file_source().as_ref() as &dyn std::any::Any) + .downcast_ref::()?; + Some(pq.parquet_file_reader_factory().is_some()) + } + + #[test] + fn installs_factory_when_predicate_present() { + let sch = schema(); + let (store, cache) = deps(); + let parquet = parquet_for(&sch).with_predicate(predicate_on_a()); + let plan = datasource_exec(parquet); + assert_eq!(get_factory(&plan), Some(false), "precondition: no factory yet"); + + let rule = 
ScopedPageIndexOptimizer::new(store, cache); + let out = rule.optimize(plan, &ConfigOptions::default()).unwrap(); + assert_eq!( + get_factory(&out), + Some(true), + "optimizer must install a scoped reader factory when a predicate is present" + ); + } + + #[test] + fn noop_without_predicate_or_projection() { + let sch = schema(); + let (store, cache) = deps(); + // No predicate, no pushed projection — nothing to scope. + let plan = datasource_exec(parquet_for(&sch)); + let rule = ScopedPageIndexOptimizer::new(store, cache); + let out = rule.optimize(plan, &ConfigOptions::default()).unwrap(); + assert_eq!( + get_factory(&out), + Some(false), + "no predicate and no projection → nothing to scope → no factory installed" + ); + } + + /// A projection-only scan (no predicate pushed down) still needs a scoped + /// OffsetIndex so the parquet reader fetches only matched-row pages instead + /// of whole column chunks. The factory must be installed from the projected + /// schema even when `parquet.filter()` is `None`. + #[test] + fn installs_factory_for_projection_only_scan() { + use datafusion::parquet::arrow::ProjectionMask; + use datafusion_datasource::file_scan_config::FileScanConfigBuilder; + + let sch = schema(); // fields: a(0), b(1) + let (store, cache) = deps(); + + // Project only column `a` — no predicate. 
+ let parquet = parquet_for(&sch); + let config = FileScanConfigBuilder::new( + ObjectStoreUrl::local_filesystem(), + Arc::new(parquet), + ) + .with_projection(Some(vec![0])) // project `a` only + .build(); + let plan = DataSourceExec::from_data_source(config); + + let rule = ScopedPageIndexOptimizer::new(store, cache); + let out = rule.optimize(plan, &ConfigOptions::default()).unwrap(); + assert_eq!( + get_factory(&out), + Some(true), + "projection-only scan must get a scoped factory for OffsetIndex scoping" + ); + } + + /// The rule REPLACES an already-installed factory when a predicate is present + /// (DataFusion's `ParquetFormat` always pre-installs its own). + #[test] + fn replaces_existing_default_factory() { + let sch = schema(); + let (store, cache) = deps(); + let pre = Arc::new(ScopedPageIndexReaderFactory::new( + Arc::clone(&store), + Arc::clone(&cache), + vec!["a".to_string()], + vec!["a".to_string()], + None, + sch.clone(), + )); + let parquet = parquet_for(&sch) + .with_predicate(predicate_on_a()) + .with_parquet_file_reader_factory(pre); + let plan = datasource_exec(parquet); + assert_eq!(get_factory(&plan), Some(true), "precondition: a factory is present"); + + let rule = ScopedPageIndexOptimizer::new(store, cache); + let out = rule.optimize(Arc::clone(&plan), &ConfigOptions::default()).unwrap(); + assert_eq!(get_factory(&out), Some(true), "scoped factory present after rule"); + assert!( + !Arc::ptr_eq(&plan, &out), + "rule must rewrite the scan to install the scoped factory, replacing the default" + ); + } + + /// End-to-end through a real `SessionContext` + stock `ListingTable`: write a + /// parquet file, register it, plan `SELECT s0 WHERE n1 >= k`, apply the rule, + /// execute, and assert (a) results are correct and (b) the shared scoped + /// page-index cache filled — proving the rule installs a working scoped reader + /// on the vanilla listing path. 
+ #[tokio::test] + async fn end_to_end_listing_scan_fills_scoped_cache() { + use arrow::array::{Int32Array, StringArray}; + use arrow::record_batch::RecordBatch; + use datafusion::datasource::file_format::parquet::ParquetFormat; + use datafusion::datasource::listing::{ + ListingOptions, ListingTable, ListingTableConfig, ListingTableUrl, + }; + use datafusion::parquet::arrow::ArrowWriter; + use datafusion::parquet::file::properties::{EnabledStatistics, WriterProperties}; + use datafusion::prelude::SessionContext; + use futures::StreamExt; + + // Serialize on the shared guard — this asserts on the global cache. + let _g = page_index::SCOPED_CACHE_TEST_GUARD + .lock() + .unwrap(); + crate::cache::page_index::clear_scoped_cache_for_test(); + + let sch = Arc::new(Schema::new(vec![ + Field::new("n0", DataType::Int32, false), + Field::new("n1", DataType::Int32, false), + Field::new("s0", DataType::Utf8, false), + Field::new("s1", DataType::Utf8, false), + ])); + const ROWS: i32 = 4096; + let n0: Vec = (0..ROWS).collect(); + let n1: Vec = (0..ROWS).collect(); + let s0: Vec = (0..ROWS).map(|r| format!("s0_{r:06}_padding_padding")).collect(); + let s1: Vec = (0..ROWS).map(|r| format!("s1_{r:06}_padding_padding")).collect(); + let batch = RecordBatch::try_new( + sch.clone(), + vec![ + Arc::new(Int32Array::from(n0)), + Arc::new(Int32Array::from(n1)), + Arc::new(StringArray::from(s0)), + Arc::new(StringArray::from(s1)), + ], + ) + .unwrap(); + + let dir = std::env::temp_dir().join(format!("scoped_e2e_{}", std::process::id())); + let _ = std::fs::create_dir_all(&dir); + let file_path = dir.join("data.parquet"); + { + let props = WriterProperties::builder() + .set_data_page_row_count_limit(256) + .set_write_batch_size(256) + .set_statistics_enabled(EnabledStatistics::Page) + .build(); + let f = std::fs::File::create(&file_path).unwrap(); + let mut w = ArrowWriter::try_new(f, sch.clone(), Some(props)).unwrap(); + w.write(&batch).unwrap(); + w.close().unwrap(); + } + + let ctx = 
SessionContext::new(); + let store: Arc = Arc::new(object_store::local::LocalFileSystem::new()); + let table_url = ListingTableUrl::parse(format!("file://{}", dir.to_str().unwrap())).unwrap(); + ctx.register_object_store(table_url.as_ref(), Arc::clone(&store)); + let listing_options = ListingOptions::new(Arc::new(ParquetFormat::new())) + .with_file_extension(".parquet") + .with_collect_stat(true); + let resolved = listing_options.infer_schema(&ctx.state(), &table_url).await.unwrap(); + let config = ListingTableConfig::new(table_url.clone()) + .with_listing_options(listing_options) + .with_schema(resolved); + let provider = Arc::new(ListingTable::try_new(config).unwrap()); + ctx.register_table("t", provider).unwrap(); + + // n1 >= 4080 keeps rows 4080..4096 (16 rows); project the non-predicate s0. + let df = ctx.sql("SELECT s0 FROM t WHERE n1 >= 4080").await.unwrap(); + let physical = df.create_physical_plan().await.unwrap(); + + let metadata_cache = ctx.runtime_env().cache_manager.get_file_metadata_cache(); + let rule = ScopedPageIndexOptimizer::new(Arc::clone(&store), metadata_cache); + let physical = rule.optimize(physical, &ConfigOptions::default()).unwrap(); + + let mut stream = + datafusion::physical_plan::execute_stream(physical, ctx.task_ctx()).unwrap(); + let mut rows = 0usize; + while let Some(b) = stream.next().await { + rows += b.unwrap().num_rows(); + } + + assert_eq!(rows, 16, "predicate n1>=4080 must keep 16 rows"); + + let stats = scoped_cache_stats(); + assert!( + stats.entries >= 1 && stats.used_bytes > 0, + "scoped cache must have filled on the listing path: {stats:?}" + ); + assert!(stats.misses >= 1, "first scan must register a scoped-cache miss: {stats:?}"); + + clear_scoped_cache_for_test(); + let _ = std::fs::remove_dir_all(&dir); + } +} diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/scoped_page_index_reader.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/scoped_page_index_reader.rs new file mode 100644 index 
0000000000000..d7aed34584e1c --- /dev/null +++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/scoped_page_index_reader.rs @@ -0,0 +1,388 @@ +/* + * SPDX-License-Identifier: Apache-2.0 + * + * The OpenSearch Contributors require contributions made to + * this file be licensed under the Apache-2.0 license or a + * compatible open source license. + */ + +//! Scoped page-index reader factory for the **listing-table** scan path. +//! +//! # Why this exists +//! +//! The listing-table path (`ShardTableProvider` / vanilla `ListingTable`) uses +//! DataFusion's default reader factory, so when page pruning is enabled the +//! `ParquetOpener` loads the **entire** page index (`ColumnIndex` + `OffsetIndex` +//! for *every* column) of each surviving file, every query, and caches none of +//! it. On wide schemas the `ColumnIndex` (per-page string min/max) dominates the +//! native heap. +//! +//! This factory closes that gap using the unified scoped cache +//! ([`crate::cache::page_index`]). The seam is DataFusion's +//! [`ParquetFileReaderFactory`]: the `ParquetOpener` asks the reader for metadata +//! via `get_metadata`, and — per `opener::load_page_index` — if the returned +//! `ParquetMetaData` *already* carries a page index, the opener uses it and skips +//! the full, all-column load. So our reader's `get_metadata`: +//! +//! 1. loads footer-only metadata (shared metadata-cache hit — see +//! [`crate::indexed_table::parquet_bridge::load_parquet_metadata`]), then +//! 2. augments it with a column-scoped page index via +//! [`crate::cache::page_index::load_scoped_page_index_cols`] +//! (real `ColumnIndex` for predicate columns, real `OffsetIndex` for the +//! predicate ∪ projection columns, or all columns when no projection is given), and +//! 3. returns that augmented metadata. +//! +//! The scoped `(file, predicate-columns)` cache is shared with the indexed path, +//! so repeated queries reuse the decoded index across both scan paths. +//! +//! # Why all row groups (no RG scoping here) +//! +//! 
`ScopedPageIndexOptimizer` only swaps the reader factory; DataFusion still +//! selects which row groups to scan via its OWN RG-statistics pruning, and its +//! page-pruner + reader then dereference `column_index[rg][col]` / +//! `offset_index[rg][col]` for *its* chosen RGs — a set independent of anything +//! we could compute here. A placeholder entry on an RG that DataFusion still +//! touches causes a panic (`page_row_counts.first().unwrap()` on an empty `OffsetIndex`), +//! and its page-index gate is per-FILE (both indexes must be `Some`), so a +//! partial page index would lie to it. So the page index is built for ALL row +//! groups, column-scoped only — heap stays bounded because the heavy +//! `ColumnIndex` is scoped to predicate columns, while the cheap `OffsetIndex` +//! covers predicate ∪ projection columns, or all columns for an unprojected scan +//! (and a real `OffsetIndex` is required for correctness at read time anyway). +//! +//! # Fallback +//! +//! If there are neither predicate nor projection columns, or scoped augmentation +//! fails for a file (no page index, decode/IO error), `get_metadata` returns the +//! footer-only metadata and the opener loads the page index on demand as today — +//! correct, just without the scoping benefit for that file. Never a wrong result. 
+ +use std::sync::Arc; + +use arrow::datatypes::SchemaRef; +use datafusion::datasource::physical_plan::parquet::{ParquetFileMetrics, ParquetFileReaderFactory}; +use datafusion::execution::cache::cache_manager::FileMetadataCache; +use datafusion::parquet::arrow::arrow_reader::ArrowReaderOptions; +use datafusion::parquet::arrow::async_reader::AsyncFileReader; +use datafusion::parquet::errors::ParquetError; +use datafusion::parquet::file::metadata::ParquetMetaData; +use datafusion::physical_expr::PhysicalExpr; +use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet; +use datafusion_datasource::PartitionedFile; +use futures::future::BoxFuture; +use futures::FutureExt; +use object_store::{ObjectStore, ObjectStoreExt}; +use prost::bytes::Bytes; + +use crate::cache::page_index::{load_scoped_page_index_cols, resolve_predicate_parquet_columns_pair}; +use crate::indexed_table::parquet_bridge::load_parquet_metadata; + +/// A [`ParquetFileReaderFactory`] that, on `get_metadata`, returns metadata whose +/// page index is scoped to the query's predicate columns. Data reads go straight +/// to the object store. +/// +/// Carries predicate column *names* + the file schema rather than pre-resolved +/// parquet indices: the reader resolves names → parquet leaf indices per file via +/// the same `resolve_predicate_parquet_columns` the indexed path uses, robust to +/// schema evolution across files (a column absent from one file is just skipped). +#[derive(Debug)] +pub struct ScopedPageIndexReaderFactory { + store: Arc, + metadata_cache: Arc, + /// File-column names referenced by the query predicate. Empty means "no + /// scoping" — `get_metadata` returns footer-only and the opener loads the + /// page index on demand as usual. + predicate_column_names: Arc>, + /// File-column names this scan PROJECTS (reads). Used to scope the + /// OffsetIndex to `predicate ∪ projection` instead of all columns. Empty = + /// fall back to all-column offsets (old behavior). 
+ projection_column_names: Arc>, + /// The physical predicate (if any). Retained in the constructor signature for + /// parity with the indexed path, but intentionally NOT used for RG scoping + /// here (see module docs). + #[allow(dead_code)] + predicate: Option>, + /// File schema (no partition columns), for per-file column resolution. + file_schema: SchemaRef, +} + +impl ScopedPageIndexReaderFactory { + pub fn new( + store: Arc, + metadata_cache: Arc, + predicate_column_names: Vec, + projection_column_names: Vec, + predicate: Option>, + file_schema: SchemaRef, + ) -> Self { + Self { + store, + metadata_cache, + predicate_column_names: Arc::new(predicate_column_names), + projection_column_names: Arc::new(projection_column_names), + predicate, + file_schema, + } + } +} + +impl ParquetFileReaderFactory for ScopedPageIndexReaderFactory { + fn create_reader( + &self, + partition_index: usize, + file: PartitionedFile, + _metadata_size_hint: Option, + metrics: &ExecutionPlanMetricsSet, + ) -> datafusion::common::Result> { + let file_metrics = + ParquetFileMetrics::new(partition_index, file.object_meta.location.as_ref(), metrics); + Ok(Box::new(ScopedPageIndexReader { + store: Arc::clone(&self.store), + metadata_cache: Arc::clone(&self.metadata_cache), + predicate_column_names: Arc::clone(&self.predicate_column_names), + projection_column_names: Arc::clone(&self.projection_column_names), + file_schema: Arc::clone(&self.file_schema), + location: file.object_meta.location.clone(), + metrics: file_metrics, + })) + } +} + +struct ScopedPageIndexReader { + store: Arc, + metadata_cache: Arc, + predicate_column_names: Arc>, + projection_column_names: Arc>, + file_schema: SchemaRef, + location: object_store::path::Path, + metrics: ParquetFileMetrics, +} + +impl AsyncFileReader for ScopedPageIndexReader { + fn get_bytes( + &mut self, + range: std::ops::Range, + ) -> BoxFuture<'_, datafusion::parquet::errors::Result> { + self.metrics.bytes_scanned.add((range.end - range.start) as 
usize); + let store = Arc::clone(&self.store); + let location = self.location.clone(); + // IO-runtime dispatch is handled by the store wrapper around the + // registered store, so a plain `.await` already runs on the IO runtime. + async move { + store + .get_range(&location, range) + .await + .map_err(|e| ParquetError::External(Box::new(e))) + } + .boxed() + } + + fn get_byte_ranges( + &mut self, + ranges: Vec>, + ) -> BoxFuture<'_, datafusion::parquet::errors::Result>> { + let total: u64 = ranges.iter().map(|r| r.end - r.start).sum(); + self.metrics.bytes_scanned.add(total as usize); + let store = Arc::clone(&self.store); + let location = self.location.clone(); + async move { + store + .get_ranges(&location, &ranges) + .await + .map_err(|e| ParquetError::External(Box::new(e))) + } + .boxed() + } + + fn get_metadata( + &mut self, + _options: Option<&ArrowReaderOptions>, + ) -> BoxFuture<'_, datafusion::parquet::errors::Result>> { + let store = Arc::clone(&self.store); + let metadata_cache = Arc::clone(&self.metadata_cache); + let predicate_names = Arc::clone(&self.predicate_column_names); + let projection_names = Arc::clone(&self.projection_column_names); + let file_schema = Arc::clone(&self.file_schema); + let location = self.location.clone(); + async move { + // 1. Footer-only metadata (shared metadata-cache hit if pre-seeded). + let (_schema, _size, footer) = + load_parquet_metadata(Arc::clone(&store), &location, Arc::clone(&metadata_cache)) + .await + .map_err(|e| ParquetError::General(format!("footer metadata {location}: {e}")))?; + + // 2. Resolve predicate + projection names → parquet leaf indices, then + // augment with a column-scoped page index. Gated on either being + // non-empty: a projection-only query still needs a scoped OffsetIndex. 
+ if !predicate_names.is_empty() || !projection_names.is_empty() { + let (parquet_cols, offset_cols) = resolve_predicate_parquet_columns_pair( + &file_schema, &footer, &predicate_names, &projection_names, + ); + if let Some(augmented) = load_scoped_page_index_cols( + &store, + &location, + &footer, + &parquet_cols, + &offset_cols, + ) + .await + { + return Ok(augmented); + } + } + + Ok(footer) + } + .boxed() + } +} + +#[cfg(test)] +mod tests { + use super::*; + use arrow::array::{Int32Array, RecordBatch}; + use arrow::datatypes::{DataType, Field, Schema}; + use datafusion::parquet::arrow::ArrowWriter; + use datafusion::parquet::file::page_index::column_index::ColumnIndexMetaData; + use datafusion::parquet::file::properties::{EnabledStatistics, WriterProperties}; + use datafusion::physical_plan::metrics::ExecutionPlanMetricsSet; + use object_store::memory::InMemory; + use object_store::path::Path as ObjPath; + use object_store::{ObjectStore, ObjectStoreExt, PutPayload}; + + // Shared crate-wide guard so all users of the one process-global scoped cache + // mutually exclude. + use crate::cache::page_index::SCOPED_CACHE_TEST_GUARD as SCOPED_TEST_GUARD; + + /// Two int columns (`price`, `qty`), one row group, four 8-row data pages. 
+ fn two_col_parquet() -> (Bytes, SchemaRef) { + let schema = Arc::new(Schema::new(vec![ + Field::new("price", DataType::Int32, false), + Field::new("qty", DataType::Int32, false), + ])); + let prices: Vec = (0..32).collect(); + let qtys: Vec = (100..132).collect(); + let batch = RecordBatch::try_new( + schema.clone(), + vec![ + Arc::new(Int32Array::from(prices)), + Arc::new(Int32Array::from(qtys)), + ], + ) + .unwrap(); + let props = WriterProperties::builder() + .set_max_row_group_size(32) + .set_data_page_row_count_limit(8) + .set_write_batch_size(8) + .set_statistics_enabled(EnabledStatistics::Page) + .build(); + let mut buf: Vec = Vec::new(); + let mut w = ArrowWriter::try_new(&mut buf, schema.clone(), Some(props)).unwrap(); + w.write(&batch).unwrap(); + w.close().unwrap(); + (Bytes::from(buf), schema) + } + + async fn stage(bytes: Bytes) -> (Arc, ObjPath) { + let store: Arc = Arc::new(InMemory::new()); + let loc = ObjPath::from("data.parquet"); + store.put(&loc, PutPayload::from_bytes(bytes)).await.unwrap(); + (store, loc) + } + + fn fresh_cache() -> Arc { + Arc::new(crate::cache::MutexFileMetadataCache::new( + datafusion::execution::cache::DefaultFilesMetadataCache::new(64 * 1024 * 1024), + )) + } + + fn metrics() -> ExecutionPlanMetricsSet { + ExecutionPlanMetricsSet::new() + } + + /// The factory's reader must, on `get_metadata`, return metadata whose page + /// index is scoped to the predicate column (`price`) — real ColumnIndex for + /// `price`, NONE placeholder for `qty` — while keeping a REAL OffsetIndex for + /// BOTH columns. Also fills the shared scoped cache. 
+ #[tokio::test] + async fn get_metadata_returns_scoped_page_index() { + let _g = SCOPED_TEST_GUARD.lock().unwrap(); + crate::cache::page_index::clear_scoped_cache_for_test(); + + let (bytes, schema) = two_col_parquet(); + let (store, loc) = stage(bytes).await; + let factory = ScopedPageIndexReaderFactory::new( + Arc::clone(&store), + fresh_cache(), + vec!["price".to_string()], + // Project both columns so the OffsetIndex is built for both (this test + // asserts a real OffsetIndex for every column). + vec!["price".to_string(), "qty".to_string()], + None, + schema, + ); + let pf = PartitionedFile::new(loc.as_ref().to_string(), 0); + let m = metrics(); + let mut reader = factory.create_reader(0, pf, None, &m).unwrap(); + + let meta = reader.get_metadata(None).await.unwrap(); + let ci = meta.column_index().expect("augmented metadata has column index"); + let oi = meta.offset_index().expect("augmented metadata has offset index"); + assert!( + !matches!(ci[0][0], ColumnIndexMetaData::NONE), + "predicate col (price) must have a real ColumnIndex" + ); + assert!( + matches!(ci[0][1], ColumnIndexMetaData::NONE), + "non-predicate col (qty) ColumnIndex must be a NONE placeholder" + ); + assert!( + !oi[0][0].page_locations().is_empty() && !oi[0][1].page_locations().is_empty(), + "OffsetIndex must be real for every column" + ); + + let stats = crate::cache::page_index::scoped_cache_stats(); + assert!(stats.entries >= 1 && stats.misses >= 1 && stats.used_bytes > 0); + + crate::cache::page_index::clear_scoped_cache_for_test(); + } + + /// No predicate columns → no scoping happens: `get_metadata` returns the + /// footer load as-is and the scoped cache is never touched. + /// + /// Note: we deliberately do NOT assert the returned metadata has no page + /// index. Until the base metadata-cache strip lands (Step 1e), the shared + /// `load_parquet_metadata` still loads the full page index when a metadata + /// cache is present (DataFusion's `PageIndexPolicy::Optional`). 
The invariant + /// this reader guarantees with no predicate is "no scoping", i.e. the scoped + /// cache stays empty — which holds before and after 1e. + #[tokio::test] + async fn get_metadata_no_predicate_does_not_scope() { + let _g = SCOPED_TEST_GUARD.lock().unwrap(); + crate::cache::page_index::clear_scoped_cache_for_test(); + + let (bytes, schema) = two_col_parquet(); + let (store, loc) = stage(bytes).await; + let factory = ScopedPageIndexReaderFactory::new( + Arc::clone(&store), + fresh_cache(), + vec![], + vec![], + None, + schema, + ); + let pf = PartitionedFile::new(loc.as_ref().to_string(), 0); + let m = metrics(); + let mut reader = factory.create_reader(0, pf, None, &m).unwrap(); + + let _meta = reader.get_metadata(None).await.unwrap(); + let stats = crate::cache::page_index::scoped_cache_stats(); + assert_eq!( + (stats.entries, stats.misses, stats.hits), + (0, 0, 0), + "no predicate → scoped cache must be untouched" + ); + + crate::cache::page_index::clear_scoped_cache_for_test(); + } +} diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/session_context.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/session_context.rs index bfae9428738bf..0b21b59377df8 100644 --- a/sandbox/plugins/analytics-backend-datafusion/rust/src/session_context.rs +++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/session_context.rs @@ -32,6 +32,7 @@ use object_store::ObjectMeta; use crate::api::{DataFusionRuntime, ShardView}; use crate::datafusion_query_config::DatafusionQueryConfig; use crate::query_tracker::QueryTrackingContext; +use crate::scoped_index_optimizer::ScopedPageIndexOptimizer; /// Opaque handle holding a configured SessionContext between FFM calls. pub struct SessionContextHandle { @@ -229,6 +230,15 @@ pub async unsafe fn create_session_context( ); } + // Install the scoped page-index reader factory on every parquet scan. + // Registered AFTER ProjectRowIdOptimizer so it sees the final DataSourceExec. 
+ state_builder = state_builder.with_physical_optimizer_rule(Arc::new( + ScopedPageIndexOptimizer::new( + Arc::clone(&shard_view.store), + runtime.runtime_env.cache_manager.get_file_metadata_cache(), + ), + )); + let state = state_builder.build(); let ctx = SessionContext::new_with_state(state); @@ -270,6 +280,25 @@ pub async unsafe fn create_session_context( table_name.to_string() }; + // Pre-warm the metadata cache footer-only before infer_schema fires. + // infer_schema calls DFParquetMetadata::fetch_metadata with PageIndexPolicy::Optional + // on a cache miss — fetching full page index bytes. By pre-warming here with + // PageIndexPolicy::Skip via load_parquet_metadata, every infer_schema call becomes + // a cache hit and never touches the page index bytes. + // Cache key is meta.location (Path) — same key infer_schema uses. + // Empty shard: loop is a no-op; infer_schema is also skipped below. + { + let metadata_cache = runtime.runtime_env.cache_manager.get_file_metadata_cache(); + for meta in shard_view.object_metas.as_ref() { + let _ = crate::indexed_table::parquet_bridge::load_parquet_metadata( + Arc::clone(&shard_view.store), + &meta.location, + Arc::clone(&metadata_cache), + ) + .await; + } + } + // Empty shard: skip infer_schema (errors on zero files); widen_schema_from_plan // below populates columns from the substrait base_schema. 
let inferred: arrow::datatypes::SchemaRef = if shard_view.object_metas.is_empty() { From 4483eae400b6ae08956cacf9072633c0a499eddc Mon Sep 17 00:00:00 2001 From: G Date: Sun, 21 Jun 2026 02:57:31 +0530 Subject: [PATCH 2/2] addressing comments Signed-off-by: G --- .../page_index/column_schema_resolver.rs | 24 +++++++++---------- .../rust/src/cache/page_index/mod.rs | 3 +-- .../src/cache/page_index/page_index_io.rs | 24 +++++++++++-------- 3 files changed, 26 insertions(+), 25 deletions(-) diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/column_schema_resolver.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/column_schema_resolver.rs index bc4e711003a95..984d387446dcb 100644 --- a/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/column_schema_resolver.rs +++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/column_schema_resolver.rs @@ -56,9 +56,10 @@ pub fn resolve_predicate_parquet_columns( metadata.file_metadata().key_value_metadata(), ) { Ok(s) => Arc::new(s), - // If we can't derive the file schema, fall back to the union schema; the - // caller still falls back to footer-only on any downstream mismatch. - Err(_) => return resolve_with_schema(_arrow_schema, metadata, predicate_column_names), + // If we can't derive the file schema (malformed footer, unsupported type), + // return empty. Empty is the safe conservative choice: the caller skips the + // scoped load and falls back to footer-only. + Err(_) => return vec![], }; resolve_with_schema(&file_arrow_schema, metadata, predicate_column_names) } @@ -71,10 +72,10 @@ pub fn resolve_predicate_parquet_columns( /// full schema reconstruction per file per query. Pure refactor — each returned /// Vec is identical to calling `resolve_predicate_parquet_columns` separately. 
pub fn resolve_predicate_parquet_columns_pair( - union_schema: &SchemaRef, + _union_schema: &SchemaRef, metadata: &ParquetMetaData, - names_a: &[String], - names_b: &[String], + predicate_col_names: &[String], + projection_col_names: &[String], ) -> (Vec, Vec) { let parquet_schema = metadata.file_metadata().schema_descr(); match parquet_to_arrow_schema( @@ -84,15 +85,12 @@ pub fn resolve_predicate_parquet_columns_pair( Ok(s) => { let file_arrow_schema = Arc::new(s); ( - resolve_with_schema(&file_arrow_schema, metadata, names_a), - resolve_with_schema(&file_arrow_schema, metadata, names_b), + resolve_with_schema(&file_arrow_schema, metadata, predicate_col_names), + resolve_with_schema(&file_arrow_schema, metadata, projection_col_names), ) } - // Same fallback as the single-name path: resolve against the union schema. - Err(_) => ( - resolve_with_schema(union_schema, metadata, names_a), - resolve_with_schema(union_schema, metadata, names_b), - ), + // Can't derive the file schema — return empty for both sets. + Err(_) => (vec![], vec![]), } } diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/mod.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/mod.rs index 760f46d6d65e0..1157ba370d285 100644 --- a/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/mod.rs +++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/mod.rs @@ -44,8 +44,7 @@ //! `projection_cols = predicate ∪ projection ∪ {0}`. This is the cheap, fixed-width //! index (no per-page string stats). Built for **all row groups** (an empty //! OffsetIndex on a row group DataFusion scans panics / breaks reads, and -//! DataFusion chooses the scanned set itself, after our load — see -//! HANDOFF_step2_rg_scoping.md §1e). +//! DataFusion chooses the scanned set itself, after our load). //! //! Each cache stores only its decoded vector (`ParquetColumnIndex` / //! 
`ParquetOffsetIndex`) — never a full `ParquetMetaData` (no footer diff --git a/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/page_index_io.rs b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/page_index_io.rs index e2fd3f8c7695b..340e4d67f19f9 100644 --- a/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/page_index_io.rs +++ b/sandbox/plugins/analytics-backend-datafusion/rust/src/cache/page_index/page_index_io.rs @@ -16,7 +16,9 @@ use std::mem; use std::ops::Range; use std::sync::Arc; +use arrow::array::{ArrayRef, BooleanArray, UInt64Array}; use arrow::datatypes::SchemaRef; +use datafusion::parquet::arrow::arrow_reader::statistics::StatisticsConverter; use datafusion::parquet::errors::{ParquetError, Result as ParquetResult}; use datafusion::parquet::file::metadata::{ ColumnChunkMetaData, OffsetIndexBuilder, ParquetColumnIndex, ParquetMetaData, @@ -27,6 +29,9 @@ use datafusion::parquet::file::page_index::index_reader::{ read_columns_indexes, read_offset_indexes, }; use datafusion::parquet::file::reader::{ChunkReader, Length}; +use datafusion::physical_optimizer::pruning::{PruningPredicate, PruningStatistics}; +use datafusion::physical_expr::PhysicalExpr; +use datafusion::scalar::ScalarValue; use object_store::ObjectStore; use parquet::file::page_index::offset_index::OffsetIndexMetaData; use prost::bytes::{buf, Buf, Bytes}; @@ -106,13 +111,8 @@ pub async fn load_page_index_fully_scoped( pub fn surviving_row_groups( footer_meta: &ParquetMetaData, arrow_schema: &SchemaRef, - predicate: &Arc, + predicate: &Arc, ) -> Vec { - use arrow::array::{ArrayRef, BooleanArray, UInt64Array}; - use datafusion::parquet::arrow::arrow_reader::statistics::StatisticsConverter; - use datafusion::physical_optimizer::pruning::{PruningPredicate, PruningStatistics}; - use datafusion::scalar::ScalarValue; - use std::collections::HashSet; let num_rgs = footer_meta.num_row_groups(); let all: Vec = (0..num_rgs).collect(); @@ 
-417,14 +417,18 @@ async fn get_or_build_offset_index( } let num_cols = footer_meta.file_metadata().schema_descr().num_columns(); - // Resolve which columns need a real OffsetIndex: predicate ∪ projection ∪ {0}, - // clamped. `None` → all columns. - // First column {0} , is always needed as it's used in stats. + // Resolve which columns need a real OffsetIndex: + // None → no explicit projection, read everything → all columns get a real entry. + // Some → build {col 0} ∪ predicate_cols ∪ proj_cols, clamped to num_cols. + // Col 0 is always included because the page-skip metric reads it + // regardless of what the query projects or filters on. + // Predicate-only queries (empty proj_cols) still get col 0 + predicate + // columns; projection-only queries get col 0 + projected columns. let off_cols: Vec = match projection_cols { None => (0..num_cols).collect(), Some(proj_cols) => { let mut set: HashSet = HashSet::new(); - set.insert(0); // metric reads column 0 + set.insert(0); // page-skip metric always reads col 0 for &c in predicate_cols { set.insert(c); }