diff --git a/src/auto.rs b/src/auto.rs index a380b26f..cc627fd3 100644 --- a/src/auto.rs +++ b/src/auto.rs @@ -9,8 +9,8 @@ use crate::vba::VbaProject; #[cfg(feature = "picture")] use crate::Picture; use crate::{ - open_workbook, open_workbook_from_rs, Data, DataRef, HeaderRow, Metadata, Ods, Range, Reader, - ReaderRef, Xls, Xlsb, Xlsx, + open_workbook, open_workbook_from_rs, Data, DataRef, HeaderRow, IndexSet, Metadata, Ods, Range, + Reader, ReaderRef, Xls, Xlsb, Xlsx, }; use std::fs::File; @@ -188,8 +188,32 @@ where match self { Sheets::Xlsx(e) => e.worksheet_range_ref(name).map_err(Error::Xlsx), Sheets::Xlsb(e) => e.worksheet_range_ref(name).map_err(Error::Xlsb), - Sheets::Xls(_) => unimplemented!(), - Sheets::Ods(_) => unimplemented!(), + _ => Err(Error::Msg( + // Xls and Ods are eager, owned-data readers and don't produce + // borrowed `DataRef` ranges (they don't implement `ReaderRef`) + "`worksheet_range_ref` is only supported for Xlsx and Xlsb", + )), + } + } + + fn worksheet_range_region_ref<'a>( + &'a mut self, + name: &str, + cols: impl Into, + rows: impl Into, + ) -> Result>, Self::Error> { + let cols = cols.into(); + let rows = rows.into(); + match self { + Sheets::Xlsx(e) => e + .worksheet_range_region_ref(name, cols, rows) + .map_err(Error::Xlsx), + Sheets::Xlsb(e) => e + .worksheet_range_region_ref(name, cols, rows) + .map_err(Error::Xlsb), + _ => Err(Error::Msg( + "`worksheet_range_region_ref` is only supported for Xlsx and Xlsb", + )), } } } diff --git a/src/index_set.rs b/src/index_set.rs new file mode 100644 index 00000000..38467b13 --- /dev/null +++ b/src/index_set.rs @@ -0,0 +1,246 @@ +use std::ops::{Range, RangeFrom, RangeFull, RangeInclusive, RangeTo, RangeToInclusive}; + +/// A normalized set of 0-based indices for projecting columns or rows. +/// +/// Constructed via `From`/`Into` from a range, single index, list, or list of +/// ranges. An empty set (eg: `..` or `IndexSet::default()`) selects *everything*. 
///
/// ```
/// use calamine::IndexSet;
///
/// let _: IndexSet = (0..5).into(); // contiguous range
/// let _: IndexSet = (5..).into(); // open-ended range
/// let _: IndexSet = [1, 3, 5].into(); // list of indexes
/// let _: IndexSet = [0..3, 8..10].into(); // disjoint ranges
/// let _: IndexSet = (..).into(); // everything
/// ```
///
/// Note: overlapping or duplicate inputs are merged.
///
/// Caveat: "everything" is encoded as an empty span list, so an input that
/// contributes no indices at all (an empty list, or a range with
/// `start >= end` such as `5..5`) also normalizes to "everything".
#[derive(Debug, Clone, Default, PartialEq, Eq)]
pub struct IndexSet {
    /// Sorted, merged, non-overlapping half-open `[start, end)` intervals.
    /// Empty == "everything", [`IndexSet::UNBOUNDED`] is the upper-bound sentinel.
    spans: Vec<(u32, u32)>,
}

impl IndexSet {
    /// Upper-bound sentinel used as the `end` of open-ended spans (`5..`).
    /// A span ending here is reported as unbounded by [`IndexSet::max_index`].
    pub(crate) const UNBOUNDED: u32 = u32::MAX;

    /// Build a normalized `IndexSet` from raw half-open `[start, end)` spans.
    ///
    /// Spans with `start >= end` contribute nothing; overlapping or adjacent
    /// spans are merged, and the result is sorted by start.
    fn from_spans(mut raw: Vec<(u32, u32)>) -> Self {
        raw.retain(|&(start, end)| start < end);
        raw.sort_unstable_by_key(|&(start, _)| start);

        let mut spans: Vec<(u32, u32)> = Vec::with_capacity(raw.len());
        for (start, end) in raw {
            match spans.last_mut() {
                // Touches or overlaps the previous span: grow it in place.
                Some(prev) if start <= prev.1 => prev.1 = prev.1.max(end),
                _ => spans.push((start, end)),
            }
        }
        IndexSet { spans }
    }

    /// True if this set selects everything (no projection).
    #[must_use]
    pub fn is_all(&self) -> bool {
        self.spans.is_empty()
    }

    /// True if index `i` is selected. An empty set selects everything.
    pub(crate) fn keep(&self, i: u32) -> bool {
        if self.spans.is_empty() {
            return true;
        }
        // `spans` is sorted by start, so the only span that can contain `i` is
        // the last one whose start is <= i; check `i` against its end.
        let after = self.spans.partition_point(|&(start, _)| start <= i);
        after
            .checked_sub(1)
            .is_some_and(|k| i < self.spans[k].1)
    }

    /// The maximum index this set can select, or [`IndexSet::UNBOUNDED`] if
    /// unbounded above (either "all", or an open-ended span such as `5..`).
    pub(crate) fn max_index(&self) -> u32 {
        match self.spans.last() {
            // "all", or a last span that runs to the sentinel.
            None | Some(&(_, Self::UNBOUNDED)) => Self::UNBOUNDED,
            // Half-open end -> inclusive last index (`end >= 1` since start < end).
            Some(&(_, end)) => end - 1,
        }
    }

    /// Number of selected indices that fall within `0..bound`.
    pub(crate) fn selected_count(&self, bound: u32) -> u64 {
        if self.spans.is_empty() {
            return u64::from(bound);
        }
        // `spans` is normalized (non-overlapping), so summing the clamped span
        // widths counts each selected index exactly once.
        self.spans
            .iter()
            .map(|&(start, end)| u64::from(end.min(bound) - start.min(bound)))
            .sum()
    }
}

/// A slice of half-open ranges; overlapping ranges are merged.
impl From<&[Range<u32>]> for IndexSet {
    fn from(ranges: &[Range<u32>]) -> Self {
        IndexSet::from_spans(ranges.iter().map(|r| (r.start, r.end)).collect())
    }
}

/// A slice of individual indices; duplicates and neighbours are merged.
impl From<&[u32]> for IndexSet {
    fn from(list: &[u32]) -> Self {
        IndexSet::from_spans(list.iter().map(|&i| (i, i.saturating_add(1))).collect())
    }
}

impl<const N: usize> From<[Range<u32>; N]> for IndexSet {
    fn from(ranges: [Range<u32>; N]) -> Self {
        IndexSet::from(&ranges[..])
    }
}

impl<const N: usize> From<[u32; N]> for IndexSet {
    fn from(list: [u32; N]) -> Self {
        IndexSet::from(&list[..])
    }
}

impl From<Range<u32>> for IndexSet {
    fn from(r: Range<u32>) -> Self {
        IndexSet::from_spans(vec![(r.start, r.end)])
    }
}

/// `start..` — open-ended, stored with the [`IndexSet::UNBOUNDED`] sentinel as end.
impl From<RangeFrom<u32>> for IndexSet {
    fn from(r: RangeFrom<u32>) -> Self {
        IndexSet::from_spans(vec![(r.start, IndexSet::UNBOUNDED)])
    }
}

/// `..` — selects everything.
impl From<RangeFull> for IndexSet {
    fn from(_: RangeFull) -> Self {
        IndexSet::default()
    }
}

impl From<RangeInclusive<u32>> for IndexSet {
    fn from(r: RangeInclusive<u32>) -> Self {
        let (start, end) = (*r.start(), *r.end());
        IndexSet::from_spans(vec![(start, end.saturating_add(1))])
    }
}

impl From<RangeTo<u32>> for IndexSet {
    fn from(r: RangeTo<u32>) -> Self {
        IndexSet::from_spans(vec![(0, r.end)])
    }
}

impl From<RangeToInclusive<u32>> for IndexSet {
    fn from(r: RangeToInclusive<u32>) -> Self {
        IndexSet::from_spans(vec![(0, r.end.saturating_add(1))])
    }
}

/// A single index.
impl From<u32> for IndexSet {
    fn from(i: u32) -> Self {
        IndexSet::from_spans(vec![(i, i.saturating_add(1))])
    }
}

impl From<Vec<Range<u32>>> for IndexSet {
    fn from(ranges: Vec<Range<u32>>) -> Self {
        IndexSet::from(&ranges[..])
    }
}

impl From<Vec<u32>> for IndexSet {
    fn from(list: Vec<u32>) -> Self {
        IndexSet::from(&list[..])
    }
}
+ #[rstest] + #[case::all(IndexSet::default(), &[(0, true), (999, true)])] + #[case::disjoint( + [0..3, 8..10].into(), + &[(2, true), (3, false), (7, false), (8, true), (9, true), (10, false)] + )] + #[case::half_open((0..5).into(), &[(0, true), (4, true), (5, false)])] + #[case::inclusive((0..=4).into(), &[(4, true), (5, false)])] + #[case::list([3u32, 1, 2, 1].into(), &[(0, false), (1, true), (3, true), (4, false)])] + #[case::open_ended((5..).into(), &[(4, false), (5, true), (1_000_000, true)])] + fn membership(#[case] set: IndexSet, #[case] probes: &[(u32, bool)]) { + for &(i, kept) in probes { + assert_eq!(set.keep(i), kept, "keep({i}) for {set:?}"); + } + } + + /// `max_index()` returns the inclusive upper bound, or `UNBOUNDED` when unbounded. + #[rstest] + #[case::all(IndexSet::default(), IndexSet::UNBOUNDED)] + #[case::disjoint([0..3, 8..10].into(), 9)] // last span's end - 1 + #[case::half_open((0..5).into(), 4)] // half-open -> inclusive last + #[case::inclusive((0..=4).into(), 4)] + #[case::open_ended((5..).into(), IndexSet::UNBOUNDED)] + #[case::single(3u32.into(), 3)] + fn max_index_is_inclusive_ceiling(#[case] set: IndexSet, #[case] expected: u32) { + assert_eq!(set.max_index(), expected, "max_index for {set:?}"); + } + + /// `selected_count(bound)` capacity estimate, clamped to the bound. 
+ #[rstest] + #[case::all(IndexSet::default(), 7, 7)] // "all" -> bound + #[case::beyond_bound((8..10).into(), 5, 0)] // span starts beyond bound + #[case::clamped([0..3, 8..10].into(), 9, 4)] // 3 + (9 - 8) + #[case::within_bound([0..3, 8..10].into(), 100, 5)] // 3 + 2 + fn selected_count_clamps_to_bound( + #[case] set: IndexSet, + #[case] bound: u32, + #[case] expected: u64, + ) { + assert_eq!( + set.selected_count(bound), + expected, + "selected_count({bound}) for {set:?}" + ); + } +} diff --git a/src/lib.rs b/src/lib.rs index de172593..072a7456 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -90,6 +90,7 @@ mod xlsx; mod de; mod errors; +mod index_set; pub mod changelog; pub mod vba; @@ -108,6 +109,7 @@ pub use crate::de::{ DeError, RangeDeserializer, RangeDeserializerBuilder, RowDeserializer, ToCellDeserializer, }; pub use crate::errors::Error; +pub use crate::index_set::IndexSet; pub use crate::ods::{Ods, OdsError}; pub use crate::xls::{Xls, XlsError, XlsOptions}; pub use crate::xlsb::{Xlsb, XlsbError}; @@ -470,8 +472,12 @@ where /// /// This is implemented only for [`calamine::Xlsx`](crate::Xlsx) and [`calamine::Xlsb`](crate::Xlsb), as Xls and Ods formats /// do not support lazy iteration. - fn worksheet_range_ref<'a>(&'a mut self, name: &str) - -> Result>, Self::Error>; + fn worksheet_range_ref<'a>( + &'a mut self, + name: &str, + ) -> Result>, Self::Error> { + self.worksheet_range_region_ref(name, .., ..) + } /// Get the nth worksheet range where shared string values are only borrowed. Shortcut for getting the nth /// worksheet name, then the corresponding worksheet. @@ -485,6 +491,36 @@ where let name = self.sheet_names().get(n)?.to_string(); Some(self.worksheet_range_ref(&name)) } + + /// Get the worksheet range where shared string values are only borrowed, + /// projected onto a subset of columns and rows. + /// + /// `cols` and `rows` accept anything convertible to an [`IndexSet`]; a + /// range, list of ranges, an index, or a list of indices. 
Overlapping + /// selections are merged. Pass `..` to select everything. + fn worksheet_range_region_ref<'a>( + &'a mut self, + name: &str, + cols: impl Into, + rows: impl Into, + ) -> Result>, Self::Error>; + + /// Owned-data equivalent of [`worksheet_range_region_ref`](ReaderRef::worksheet_range_region_ref); + /// see that method for the column/row-selection semantics. + fn worksheet_range_region( + &mut self, + name: &str, + cols: impl Into, + rows: impl Into, + ) -> Result, Self::Error> { + let rge = self.worksheet_range_region_ref(name, cols, rows)?; + let inner = rge.inner.into_iter().map(Into::into).collect(); + Ok(Range { + start: rge.start, + end: rge.end, + inner, + }) + } } /// Convenient function to open a file with a `BufReader`. diff --git a/src/utils.rs b/src/utils.rs index b4f4248b..52a7bcc6 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -11,6 +11,9 @@ use std::io::{Read, Seek}; use quick_xml::{escape::resolve_xml_entity, events::BytesRef}; use zip::read::ZipArchive; +use crate::datatype::DataRef; +use crate::{Cell, Dimensions, HeaderRow, IndexSet, Range}; + const UNICODE_ESCAPE_LENGTH: usize = 7; // Length of _x00HH_. macro_rules! from_err { @@ -65,6 +68,78 @@ pub fn read_f64(s: &[u8]) -> f64 { f64::from_le_bytes(s[..8].try_into().unwrap()) } +/// Collect worksheet cells into a [`Range`], honouring the header row and column/row +/// projection. `cols` and `rows` are normalized index sets, with an empty set +/// selecting every column/row. Overlapping or duplicate indices are merged. +pub fn collect_cells_into_range<'a, E>( + header_row: HeaderRow, + cols: &IndexSet, + rows: &IndexSet, + dimensions: Dimensions, + mut next_cell: impl FnMut() -> Result>>, E>, +) -> Result>, E> { + // `Row(idx)` floors the kept rows at the header row; `FirstNonEmptyRow` keeps all. 
+ let min_row = match header_row { + HeaderRow::FirstNonEmptyRow => 0, + HeaderRow::Row(idx) => idx, + }; + + // Keep non-empty cells within both projections; the header row is always retained. + let header_idx = match header_row { + HeaderRow::Row(idx) => Some(idx), + HeaderRow::FirstNonEmptyRow => None, + }; + let keep = |row: u32, col: u32| { + row >= min_row && cols.keep(col) && (rows.keep(row) || Some(row) == header_idx) + }; + + // When rows are bounded above, no cell past the last kept row can survive, so + // we can stop reading once we move beyond it. The header row is always retained, + // so the cutoff must clear it too. An unbounded `max_index` (IndexSet::UNBOUNDED) + // lets every cell through, so we scan the whole sheet. + let last_row = match header_idx { + Some(h) => rows.max_index().max(h), + None => rows.max_index(), + }; + + // Reserve for the *projected* cell count, not the full sheet. + let full_cols = dimensions.end.1 - dimensions.start.1 + 1; + let full_rows = dimensions.end.0 - dimensions.start.0 + 1; + let projected_len = dimensions + .len() + .saturating_mul(cols.selected_count(full_cols)) + .saturating_mul(rows.selected_count(full_rows)) + / (full_cols as u64).max(1) + / (full_rows as u64).max(1); + let mut cells = Vec::new(); + if projected_len < 100_000 { + cells.reserve(projected_len as usize); + } + while let Some(cell) = next_cell()? { + // Cells stream in ascending row order, so once we pass the last kept row + // nothing further can survive the projection: stop early. + if cell.pos.0 > last_row { + break; + } + if !matches!(cell.val, DataRef::Empty) && keep(cell.pos.0, cell.pos.1) { + cells.push(cell); + } + } + + // If no cell survived on the header row, anchor it with an empty cell in a + // kept column so it stays in the range. Skipped when no cells survived at all. 
+ if let HeaderRow::Row(header_row_idx) = header_row { + if let Some(first) = cells.first() { + if first.pos.0 != header_row_idx { + let col = first.pos.1; + cells.push(Cell::new((header_row_idx, col), DataRef::Empty)); + } + } + } + + Ok(Range::from_sparse(cells)) +} + /// Push literal column into a String buffer pub fn push_column(mut col: u32, buf: &mut String) { if col < 26 { @@ -1165,6 +1240,68 @@ pub const FTAB_ARGC: [u8; FTAB_LEN] = [ mod tests { use super::*; + /// A row-bounded projection stops pulling cells once it passes the last kept + /// row, instead of draining the whole sheet. Unbounded rows read everything. + #[test] + fn collect_breaks_past_last_kept_row() { + // Stream one non-empty cell per row across 1000 rows, counting pulls. + let make_reader = || { + let mut row = 0u32; + move || -> Result>>, std::convert::Infallible> { + if row >= 1000 { + return Ok(None); + } + let cell = Cell::new((row, 0), DataRef::Int(row as i64)); + row += 1; + Ok(Some(cell)) + } + }; + let dims = Dimensions { + start: (0, 0), + end: (999, 0), + }; + + // Bounded to rows 0..3: should stop after pulling row 3 (the first row + // beyond the kept window), not scan all 1000 rows. + let mut pulls = 0; + let mut reader = make_reader(); + let range = collect_cells_into_range( + HeaderRow::FirstNonEmptyRow, + &(..).into(), + &(0u32..3).into(), + dims, + || { + let c = reader(); + if matches!(c, Ok(Some(_))) { + pulls += 1; + } + c + }, + ) + .unwrap(); + assert_eq!(range.get_size(), (3, 1)); + assert_eq!(pulls, 4, "should break after the first out-of-range row"); + + // Unbounded rows: must consume every cell. 
+ let mut pulls = 0; + let mut reader = make_reader(); + collect_cells_into_range( + HeaderRow::FirstNonEmptyRow, + &(..).into(), + &(..).into(), + dims, + || { + let c = reader(); + if matches!(c, Ok(Some(_))) { + pulls += 1; + } + c + }, + ) + .unwrap(); + assert_eq!(pulls, 1000, "unbounded rows scan the whole sheet"); + } + #[test] fn sound_to_u32() { let data = b"ABCDEFGH"; diff --git a/src/xlsb/mod.rs b/src/xlsb/mod.rs index 42d417ad..85836b51 100644 --- a/src/xlsb/mod.rs +++ b/src/xlsb/mod.rs @@ -23,12 +23,12 @@ use zip::result::ZipError; use crate::datatype::DataRef; use crate::formats::{builtin_format_by_code, detect_custom_number_format, CellFormat}; use crate::utils::{ - build_zip_path_cache, cached_zip_path, push_column, read_f64, read_i32, read_u16, read_u32, - read_usize, + build_zip_path_cache, cached_zip_path, collect_cells_into_range, push_column, read_f64, + read_i32, read_u16, read_u32, read_usize, }; use crate::vba::VbaProject; use crate::{ - Cell, Data, HeaderRow, Metadata, Range, Reader, ReaderRef, Sheet, SheetType, SheetVisible, + Data, HeaderRow, IndexSet, Metadata, Range, Reader, ReaderRef, Sheet, SheetType, SheetVisible, }; /// A Xlsb specific error @@ -536,13 +536,7 @@ impl Reader for Xlsb { /// MS-XLSB 2.1.7.62 fn worksheet_range(&mut self, name: &str) -> Result, XlsbError> { - let rge = self.worksheet_range_ref(name)?; - let inner = rge.inner.into_iter().map(|v| v.into()).collect(); - Ok(Range { - start: rge.start, - end: rge.end, - inner, - }) + self.worksheet_range_region(name, .., ..) 
} /// MS-XLSB 2.1.7.62 @@ -580,66 +574,18 @@ impl Reader for Xlsb { } impl ReaderRef for Xlsb { - fn worksheet_range_ref<'a>(&'a mut self, name: &str) -> Result>, XlsbError> { + fn worksheet_range_region_ref<'a>( + &'a mut self, + name: &str, + cols: impl Into, + rows: impl Into, + ) -> Result>, XlsbError> { + let cols = cols.into(); + let rows = rows.into(); let header_row = self.options.header_row; let mut cell_reader = self.worksheet_cells_reader(name)?; - let len = cell_reader.dimensions().len(); - let mut cells = Vec::new(); - if len < 100_000 { - cells.reserve(len as usize); - } - - match header_row { - HeaderRow::FirstNonEmptyRow => { - // the header row is the row of the first non-empty cell - loop { - match cell_reader.next_cell() { - Ok(Some(Cell { - val: DataRef::Empty, - .. - })) => (), - Ok(Some(cell)) => cells.push(cell), - Ok(None) => break, - Err(e) => return Err(e), - } - } - } - HeaderRow::Row(header_row_idx) => { - // If `header_row` is a row index, we only add non-empty cells after this index. - loop { - match cell_reader.next_cell() { - Ok(Some(Cell { - val: DataRef::Empty, - .. - })) => (), - Ok(Some(cell)) => { - if cell.pos.0 >= header_row_idx { - cells.push(cell); - } - } - Ok(None) => break, - Err(e) => return Err(e), - } - } - - // If `header_row` is set and the first non-empty cell is not at the `header_row`, we add - // an empty cell at the beginning with row `header_row` and same column as the first non-empty cell. 
- if cells.first().is_some_and(|c| c.pos.0 != header_row_idx) { - cells.insert( - 0, - Cell { - pos: ( - header_row_idx, - cells.first().expect("cells should not be empty").pos.1, - ), - val: DataRef::Empty, - }, - ); - } - } - } - - Ok(Range::from_sparse(cells)) + let dims = cell_reader.dimensions(); + collect_cells_into_range(header_row, &cols, &rows, dims, || cell_reader.next_cell()) } } diff --git a/src/xlsx/mod.rs b/src/xlsx/mod.rs index 28e70cb8..a311543f 100644 --- a/src/xlsx/mod.rs +++ b/src/xlsx/mod.rs @@ -27,14 +27,15 @@ use zip::result::ZipError; use crate::datatype::DataRef; use crate::formats::{builtin_format_by_id, detect_custom_number_format, CellFormat}; use crate::utils::{ - build_zip_path_cache, cached_zip_path, unescape_entity_to_buffer, unescape_xml, + build_zip_path_cache, cached_zip_path, collect_cells_into_range, unescape_entity_to_buffer, + unescape_xml, }; use crate::vba::VbaProject; #[cfg(feature = "picture")] use crate::Picture; use crate::{ - Cell, CellErrorType, Data, Dimensions, HeaderRow, Metadata, Range, Reader, ReaderRef, Sheet, - SheetType, SheetVisible, Table, + CellErrorType, Data, Dimensions, HeaderRow, IndexSet, Metadata, Range, Reader, ReaderRef, + Sheet, SheetType, SheetVisible, Table, }; pub use cells_reader::{ XlsxCellFormula, XlsxCellFormulaMetadataRecord, XlsxCellReader, XlsxFormulaMetadata, @@ -2755,13 +2756,7 @@ impl Reader for Xlsx { } fn worksheet_range(&mut self, name: &str) -> Result, XlsxError> { - let rge = self.worksheet_range_ref(name)?; - let inner = rge.inner.into_iter().map(|v| v.into()).collect(); - Ok(Range { - start: rge.start, - end: rge.end, - inner, - }) + self.worksheet_range_region(name, .., ..) 
} fn worksheet_formula(&mut self, name: &str) -> Result, XlsxError> { @@ -2817,7 +2812,14 @@ impl Reader for Xlsx { } impl ReaderRef for Xlsx { - fn worksheet_range_ref<'a>(&'a mut self, name: &str) -> Result>, XlsxError> { + fn worksheet_range_region_ref<'a>( + &'a mut self, + name: &str, + cols: impl Into, + rows: impl Into, + ) -> Result>, XlsxError> { + let cols = cols.into(); + let rows = rows.into(); let header_row = self.options.header_row; let mut cell_reader = match self.worksheet_cells_reader(name) { Ok(reader) => reader, @@ -2827,63 +2829,8 @@ impl ReaderRef for Xlsx { } Err(e) => return Err(e), }; - let len = cell_reader.dimensions().len(); - let mut cells = Vec::new(); - if len < 100_000 { - cells.reserve(len as usize); - } - - match header_row { - HeaderRow::FirstNonEmptyRow => { - // the header row is the row of the first non-empty cell - loop { - match cell_reader.next_cell() { - Ok(Some(Cell { - val: DataRef::Empty, - .. - })) => (), - Ok(Some(cell)) => cells.push(cell), - Ok(None) => break, - Err(e) => return Err(e), - } - } - } - HeaderRow::Row(header_row_idx) => { - // If `header_row` is a row index, we only add non-empty cells after this index. - loop { - match cell_reader.next_cell() { - Ok(Some(Cell { - val: DataRef::Empty, - .. - })) => (), - Ok(Some(cell)) => { - if cell.pos.0 >= header_row_idx { - cells.push(cell); - } - } - Ok(None) => break, - Err(e) => return Err(e), - } - } - - // If `header_row` is set and the first non-empty cell is not at the `header_row`, we add - // an empty cell at the beginning with row `header_row` and same column as the first non-empty cell. 
- if cells.first().is_some_and(|c| c.pos.0 != header_row_idx) { - cells.insert( - 0, - Cell { - pos: ( - header_row_idx, - cells.first().expect("cells should not be empty").pos.1, - ), - val: DataRef::Empty, - }, - ); - } - } - } - - Ok(Range::from_sparse(cells)) + let dims = cell_reader.dimensions(); + collect_cells_into_range(header_row, &cols, &rows, dims, || cell_reader.next_cell()) } } diff --git a/tests/col-projection.xlsx b/tests/col-projection.xlsx new file mode 100644 index 00000000..ec005390 Binary files /dev/null and b/tests/col-projection.xlsx differ diff --git a/tests/test.rs b/tests/test.rs index e2b00a6d..b58346a2 100644 --- a/tests/test.rs +++ b/tests/test.rs @@ -6,8 +6,8 @@ use calamine::vba::Reference; use calamine::Data::{Bool, DateTime, DateTimeIso, DurationIso, Empty, Error, Float, Int, String}; use calamine::{ open_workbook, open_workbook_auto, DataRef, DataType, Dimensions, ExcelDateTime, - ExcelDateTimeType, HeaderRow, Ods, Range, Reader, ReaderRef, Sheet, SheetType, SheetVisible, - Xls, Xlsb, Xlsx, XlsxFormulaMetadata, + ExcelDateTimeType, HeaderRow, IndexSet, Ods, Range, Reader, ReaderRef, Sheet, SheetType, + SheetVisible, Xls, Xlsb, Xlsx, XlsxFormulaMetadata, }; use calamine::{CellErrorType::*, Data}; use rstest::rstest; @@ -3441,3 +3441,212 @@ fn test_hyperlinks_xlsx() { Err(calamine::XlsxError::WorksheetNotFound(_)) )); } + +/// Assert projected `cols` reproduce the full sheet's values +/// at the kept cols, with dropped columns reading `Empty`. 
+#[rstest] +#[case::disjoint("col-projection.xlsx", "Sheet1", IndexSet::from([0u32, 2]), &[0, 2])] +#[case::list("col-projection.xlsx", "Sheet1", IndexSet::from([0u32, 1, 2]), &[0, 1, 2])] +#[case::range("col-projection.xlsx", "Sheet1", IndexSet::from(0u32..3), &[0, 1, 2])] +#[case::single("col-projection.xlsx", "Sheet1", IndexSet::from(2u32), &[2])] +#[case::unsorted_dup("col-projection.xlsx", "Sheet1", IndexSet::from([2u32, 0, 0]), &[0, 2])] +#[case::xlsb("issues.xlsb", "issue2", IndexSet::from([1u32]), &[1])] +fn test_region_columns( + #[case] fixture: &str, + #[case] sheet: &str, + #[case] cols: IndexSet, + #[case] kept: &[u32], +) { + let path = test_path(fixture); + let mut full_wb = open_workbook_auto(&path).expect(&path); + let full = full_wb.worksheet_range_region_ref(sheet, .., ..).unwrap(); + let mut proj_wb = open_workbook_auto(&path).expect(&path); + let proj = proj_wb.worksheet_range_region_ref(sheet, cols, ..).unwrap(); + + // Bounding box spans the lowest..=highest kept column. Dropped interior + // columns are Empty, and every kept column matches the full sheet. + let (lo, hi) = (kept[0], *kept.last().unwrap()); + assert_eq!(proj.width() as u32, hi - lo + 1); + assert_eq!(proj.height(), full.height()); + for row in 0..proj.height() { + for c in lo..=hi { + let expected = if kept.contains(&c) { + full.get((row, c as usize)) + } else { + Some(&DataRef::Empty) + }; + assert_eq!( + proj.get((row, (c - lo) as usize)), + expected, + "row {row} col {c}" + ); + } + } +} + +/// Row projection reproduces the full sheet's selected row window. 
+#[rstest] +#[case::head("col-projection.xlsx", "Sheet1", IndexSet::from(0u32..2), 0, Some(1))] +#[case::tail("col-projection.xlsx", "Sheet1", IndexSet::from(1u32..), 1, None)] +#[case::xlsb("issues.xlsb", "issue2", IndexSet::from(0u32..2), 0, Some(1))] +fn test_region_rows( + #[case] fixture: &str, + #[case] sheet: &str, + #[case] rows: IndexSet, + #[case] start: u32, + #[case] end: Option, +) { + let path = test_path(fixture); + let mut x = open_workbook_auto(&path).expect(&path); + let full = x.worksheet_range_region(sheet, .., ..).unwrap(); + let proj = x.worksheet_range_region(sheet, .., rows).unwrap(); + + // `None` -> open-ended range + let last = end.unwrap_or_else(|| full.end().unwrap().0); + assert_eq!(proj.start().map(|(r, _)| r), Some(start)); + assert_eq!(proj.end().map(|(r, _)| r), Some(last)); + + // `get` is relative to each range's start; both fixtures start at column 0 + let full_start = full.start().unwrap().0; + for i in 0..proj.height() { + let full_row = (start as usize + i) - full_start as usize; + for c in 0..proj.width() { + assert_eq!( + proj.get((i, c)), + full.get((full_row, c)), + "rel row {i} col {c}" + ); + } + } +} + +#[test] +fn test_region_full_equals_plain() { + // The unprojected region (`.., ..`) matches `worksheet_range_ref`. 
+ let mut x: Xlsx<_> = wb("col-projection.xlsx"); + let full = x.worksheet_range_region_ref("Sheet1", .., ..).unwrap(); + let mut plain_wb: Xlsx<_> = wb("col-projection.xlsx"); + let plain = plain_wb.worksheet_range_ref("Sheet1").unwrap(); + assert_eq!(full, plain); + assert!( + full.width() > 3, + "fixture must be wider than projection subsets" + ); +} + +#[test] +fn test_region_both_axes() { + let mut x: Xlsx<_> = wb("col-projection.xlsx"); + let full = x.worksheet_range_region("Sheet1", .., ..).unwrap(); + let both = x.worksheet_range_region("Sheet1", 0..2, 0..2).unwrap(); + assert_eq!(both.start(), Some((0, 0))); + assert_eq!(both.end().map(|(r, _)| r), Some(1)); + assert_eq!(both.width(), 2); + for row in 0..2 { + for col in 0..2 { + assert_eq!(both.get((row, col)), full.get((row, col))); + } + } +} + +#[test] +fn test_region_overlaps_merge() { + // Overlapping/duplicate selections are valid and merge silently. + let mut x: Xlsx<_> = wb("col-projection.xlsx"); + let merged = x + .worksheet_range_region("Sheet1", [0..2, 1..3], ..) + .unwrap(); + let contiguous = x.worksheet_range_region("Sheet1", 0..3, ..).unwrap(); + assert_eq!(merged, contiguous); +} + +/// Column projection composes with header row; dropped cols read +/// Empty and the header floor is preserved for both header modes. 
+#[rstest] +#[case(HeaderRow::Row(8))] +#[case(HeaderRow::FirstNonEmptyRow)] +fn test_region_header_columns(#[case] header: HeaderRow) { + let mut x: Xlsx<_> = wb("header-row.xlsx"); + x.with_header_row(header); + let full = x.worksheet_range_region("Sheet1", .., ..).unwrap(); + let proj = x.worksheet_range_region("Sheet1", [0, 2], ..).unwrap(); + assert_eq!(proj.start(), full.start()); + assert_eq!(proj.width(), 3); + for row in 0..proj.height() { + assert_eq!(proj.get((row, 0)), full.get((row, 0))); + assert_eq!(proj.get((row, 2)), full.get((row, 2))); + assert_eq!(proj.get((row, 1)), Some(&Empty)); + } +} + +#[test] +fn test_region_header_row_always_retained() { + // HeaderRow::Row(8) with a row selection that EXCLUDES row 8; header + // row should still be present, and rows above 8 must still be dropped. + let mut x: Xlsx<_> = wb("header-row.xlsx"); + let range = x + .with_header_row(HeaderRow::Row(8)) + .worksheet_range_region_ref("Sheet1", .., 9..10) + .unwrap(); + assert_eq!(range.start().map(|(r, _)| r), Some(8)); + assert_eq!(range.end().map(|(r, _)| r), Some(9)); +} + +#[test] +fn test_region_xlsb_with_header_row() { + let mut x: Xlsb<_> = wb("date.xlsb"); + x.with_header_row(HeaderRow::Row(1)); + let full = x.worksheet_range_region("Sheet1", .., ..).unwrap(); + let proj = x.worksheet_range_region("Sheet1", [1], ..).unwrap(); + assert_eq!(proj.start(), Some((1, 1))); + assert_eq!(proj.end(), Some((2, 1))); + assert_eq!(proj.width(), 1); + for row in 0..proj.height() { + assert_eq!(proj.get((row, 0)), full.get((row, 1))); + } +} + +#[test] +fn test_region_through_sheets_auto() { + let path = test_path("col-projection.xlsx"); + let mut sheets = open_workbook_auto(&path).expect(&path); + let dispatched = sheets + .worksheet_range_region_ref("Sheet1", [0, 2], ..) + .unwrap(); + + let mut direct: Xlsx<_> = wb("col-projection.xlsx"); + let expected = direct + .worksheet_range_region_ref("Sheet1", [0, 2], ..) 
+ .unwrap(); + assert_eq!(dispatched, expected); + + // Owned variant also dispatches through `Sheets`. + let mut owned = open_workbook_auto(&path).expect(&path); + assert_eq!( + owned + .worksheet_range_region("Sheet1", [0, 1, 2], ..) + .unwrap() + .width(), + 3 + ); +} + +#[test] +fn test_region_unsupported_for_xls_ods() { + for file in ["any_sheets.xls", "any_sheets.ods"] { + let path = test_path(file); + let mut sheets = open_workbook_auto(&path).expect(&path); + let name = sheets.sheet_names()[0].clone(); + assert!( + matches!( + sheets.worksheet_range_region_ref(&name, [0], ..), + Err(calamine::Error::Msg(_)) + ), + "{file}: expected Error::Msg, not a panic" + ); + assert!(matches!( + sheets.worksheet_range_ref(&name), + Err(calamine::Error::Msg(_)) + )); + } +}