diff --git a/datafusion-examples/examples/data_io/parquet_index.rs b/datafusion-examples/examples/data_io/parquet_index.rs index e11a303f442a4..515dad7a51e17 100644 --- a/datafusion-examples/examples/data_io/parquet_index.rs +++ b/datafusion-examples/examples/data_io/parquet_index.rs @@ -462,7 +462,7 @@ impl PruningStatistics for ParquetMetadataIndex { } /// return the row counts for each file - fn row_counts(&self, _column: &Column) -> Option { + fn row_counts(&self) -> Option { Some(self.row_counts_ref().clone()) } diff --git a/datafusion-examples/examples/query_planning/pruning.rs b/datafusion-examples/examples/query_planning/pruning.rs index 33f3f8428a77f..7fdc4a7952d68 100644 --- a/datafusion-examples/examples/query_planning/pruning.rs +++ b/datafusion-examples/examples/query_planning/pruning.rs @@ -174,7 +174,7 @@ impl PruningStatistics for MyCatalog { None } - fn row_counts(&self, _column: &Column) -> Option { + fn row_counts(&self) -> Option { // In this example, we know nothing about the number of rows in each file None } diff --git a/datafusion/common/src/pruning.rs b/datafusion/common/src/pruning.rs index 27148de59a544..ebae23f0723a1 100644 --- a/datafusion/common/src/pruning.rs +++ b/datafusion/common/src/pruning.rs @@ -95,15 +95,17 @@ pub trait PruningStatistics { /// [`UInt64Array`]: arrow::array::UInt64Array fn null_counts(&self, column: &Column) -> Option; - /// Return the number of rows for the named column in each container - /// as an [`UInt64Array`]. + /// Return the number of rows in each container as an [`UInt64Array`]. + /// + /// Row counts are container-level (not column-specific) — the value + /// is the same regardless of which column is being considered. /// /// See [`Self::min_values`] for when to return `None` and null values. /// /// Note: the returned array must contain [`Self::num_containers`] rows /// /// [`UInt64Array`]: arrow::array::UInt64Array - fn row_counts(&self, column: &Column) -> Option; + fn row_counts(&self) -> Option; /// Returns [`BooleanArray`] where each row represents information known /// about specific literal `values` in a column. @@ -265,7 +267,7 @@ impl PruningStatistics for PartitionPruningStatistics { None } - fn row_counts(&self, _column: &Column) -> Option { + fn row_counts(&self) -> Option { None } @@ -398,11 +400,7 @@ impl PruningStatistics for PrunableStatistics { } } - fn row_counts(&self, column: &Column) -> Option { - // If the column does not exist in the schema, return None - if self.schema.index_of(column.name()).is_err() { - return None; - } + fn row_counts(&self) -> Option { if self .statistics .iter() @@ -502,9 +500,9 @@ impl PruningStatistics for CompositePruningStatistics { None } - fn row_counts(&self, column: &Column) -> Option { + fn row_counts(&self) -> Option { for stats in &self.statistics { - if let Some(array) = stats.row_counts(column) { + if let Some(array) = stats.row_counts() { return Some(array); } } @@ -566,9 +564,9 @@ mod tests { // Partition values don't know anything about nulls or row counts assert!(partition_stats.null_counts(&column_a).is_none()); - assert!(partition_stats.row_counts(&column_a).is_none()); + assert!(partition_stats.row_counts().is_none()); assert!(partition_stats.null_counts(&column_b).is_none()); - assert!(partition_stats.row_counts(&column_b).is_none()); + assert!(partition_stats.row_counts().is_none()); // Min/max values are the same as the partition values let min_values_a = @@ -709,9 +707,9 @@ mod tests { // Partition values don't know anything about nulls or row counts assert!(partition_stats.null_counts(&column_a).is_none()); - assert!(partition_stats.row_counts(&column_a).is_none()); + assert!(partition_stats.row_counts().is_none()); assert!(partition_stats.null_counts(&column_b).is_none()); - assert!(partition_stats.row_counts(&column_b).is_none()); + assert!(partition_stats.row_counts().is_none()); // Min/max values are all missing assert!(partition_stats.min_values(&column_a).is_none()); @@ -814,13 +812,13 @@ mod tests { assert_eq!(null_counts_b, expected_null_counts_b); // Row counts are the same as the statistics - let row_counts_a = as_uint64_array(&pruning_stats.row_counts(&column_a).unwrap()) + let row_counts_a = as_uint64_array(&pruning_stats.row_counts().unwrap()) .unwrap() .into_iter() .collect::>(); let expected_row_counts_a = vec![Some(100), Some(200)]; assert_eq!(row_counts_a, expected_row_counts_a); - let row_counts_b = as_uint64_array(&pruning_stats.row_counts(&column_b).unwrap()) + let row_counts_b = as_uint64_array(&pruning_stats.row_counts().unwrap()) .unwrap() .into_iter() .collect::>(); @@ -845,7 +843,7 @@ mod tests { // This is debatable, personally I think `row_count` should not take a `Column` as an argument // at all since all columns should have the same number of rows. // But for now we just document the current behavior in this test. - let row_counts_c = as_uint64_array(&pruning_stats.row_counts(&column_c).unwrap()) + let row_counts_c = as_uint64_array(&pruning_stats.row_counts().unwrap()) .unwrap() .into_iter() .collect::>(); @@ -853,12 +851,13 @@ mod tests { assert_eq!(row_counts_c, expected_row_counts_c); assert!(pruning_stats.contained(&column_c, &values).is_none()); - // Test with a column that doesn't exist + // Test with a column that doesn't exist — column-specific stats + // return None, but row_counts is container-level and still available let column_d = Column::new_unqualified("d"); assert!(pruning_stats.min_values(&column_d).is_none()); assert!(pruning_stats.max_values(&column_d).is_none()); assert!(pruning_stats.null_counts(&column_d).is_none()); - assert!(pruning_stats.row_counts(&column_d).is_none()); + assert!(pruning_stats.row_counts().is_some()); assert!(pruning_stats.contained(&column_d, &values).is_none()); } @@ -886,8 +885,8 @@ mod tests { assert!(pruning_stats.null_counts(&column_b).is_none()); // Row counts are all missing - assert!(pruning_stats.row_counts(&column_a).is_none()); - assert!(pruning_stats.row_counts(&column_b).is_none()); + assert!(pruning_stats.row_counts().is_none()); + assert!(pruning_stats.row_counts().is_none()); // Contained values are all empty let values = HashSet::from([ScalarValue::from(1i32)]); @@ -1027,13 +1026,11 @@ mod tests { let expected_null_counts_col_x = vec![Some(0), Some(10)]; assert_eq!(null_counts_col_x, expected_null_counts_col_x); - // Test row counts - only available from file statistics - assert!(composite_stats.row_counts(&part_a).is_none()); - let row_counts_col_x = - as_uint64_array(&composite_stats.row_counts(&col_x).unwrap()) - .unwrap() - .into_iter() - .collect::>(); + // Test row counts — container-level, available from file statistics + let row_counts_col_x = as_uint64_array(&composite_stats.row_counts().unwrap()) + .unwrap() + .into_iter() + .collect::>(); let expected_row_counts = vec![Some(100), Some(200)]; assert_eq!(row_counts_col_x, expected_row_counts); @@ -1046,12 +1043,13 @@ mod tests { // File statistics don't implement contained assert!(composite_stats.contained(&col_x, &values).is_none()); - // Non-existent column should return None for everything + // Non-existent column should return None for column-specific stats, + // but row_counts is container-level and still available let non_existent = Column::new_unqualified("non_existent"); assert!(composite_stats.min_values(&non_existent).is_none()); assert!(composite_stats.max_values(&non_existent).is_none()); assert!(composite_stats.null_counts(&non_existent).is_none()); - assert!(composite_stats.row_counts(&non_existent).is_none()); + assert!(composite_stats.row_counts().is_some()); assert!(composite_stats.contained(&non_existent, &values).is_none()); // Verify num_containers matches @@ -1155,7 +1153,7 @@ mod tests { let expected_null_counts = vec![Some(0), Some(5)]; assert_eq!(null_counts, expected_null_counts); - let row_counts = as_uint64_array(&composite_stats.row_counts(&col_a).unwrap()) + let row_counts = as_uint64_array(&composite_stats.row_counts().unwrap()) .unwrap() .into_iter() .collect::>(); @@ -1195,11 +1193,10 @@ mod tests { let expected_null_counts = vec![Some(10), Some(20)]; assert_eq!(null_counts, expected_null_counts); - let row_counts = - as_uint64_array(&composite_stats_reversed.row_counts(&col_a).unwrap()) - .unwrap() - .into_iter() - .collect::>(); + let row_counts = as_uint64_array(&composite_stats_reversed.row_counts().unwrap()) + .unwrap() + .into_iter() + .collect::>(); let expected_row_counts = vec![Some(1000), Some(2000)]; assert_eq!(row_counts, expected_row_counts); } diff --git a/datafusion/datasource-parquet/src/page_filter.rs b/datafusion/datasource-parquet/src/page_filter.rs index 194e6e94fba3a..baef36ce147d4 100644 --- a/datafusion/datasource-parquet/src/page_filter.rs +++ b/datafusion/datasource-parquet/src/page_filter.rs @@ -509,7 +509,7 @@ impl PruningStatistics for PagesPruningStatistics<'_> { } } - fn row_counts(&self, _column: &datafusion_common::Column) -> Option { + fn row_counts(&self) -> Option { match self.converter.data_page_row_counts( self.offset_index, self.row_group_metadatas, diff --git a/datafusion/datasource-parquet/src/row_group_filter.rs b/datafusion/datasource-parquet/src/row_group_filter.rs index 7a2ed8f2777e3..3f254c9f55282 100644 --- a/datafusion/datasource-parquet/src/row_group_filter.rs +++ b/datafusion/datasource-parquet/src/row_group_filter.rs @@ -19,7 +19,7 @@ use std::collections::{HashMap, HashSet}; use std::sync::Arc; use super::{ParquetAccessPlan, ParquetFileMetrics}; -use arrow::array::{ArrayRef, BooleanArray}; +use arrow::array::{ArrayRef, BooleanArray, UInt64Array}; use arrow::datatypes::Schema; use datafusion_common::pruning::PruningStatistics; use datafusion_common::{Column, Result, ScalarValue}; @@ -536,7 +536,7 @@ impl PruningStatistics for BloomFilterStatistics { None } - fn row_counts(&self, _column: &Column) -> Option { + fn row_counts(&self) -> Option { None } @@ -626,13 +626,13 @@ impl PruningStatistics for RowGroupPruningStatistics<'_> { .map(|counts| Arc::new(counts) as ArrayRef) } - fn row_counts(&self, column: &Column) -> Option { - // row counts are the same for all columns in a row group - self.statistics_converter(column) - .and_then(|c| Ok(c.row_group_row_counts(self.metadata_iter())?)) - .ok() - .flatten() - .map(|counts| Arc::new(counts) as ArrayRef) + fn row_counts(&self) -> Option { + // Row counts are container-level — read directly from row group metadata. + let counts: UInt64Array = self + .metadata_iter() + .map(|rg| Some(rg.num_rows() as u64)) + .collect(); + Some(Arc::new(counts) as ArrayRef) } fn contained( diff --git a/datafusion/pruning/src/pruning_predicate.rs b/datafusion/pruning/src/pruning_predicate.rs index 978d79b1f2fb0..873e82a97303e 100644 --- a/datafusion/pruning/src/pruning_predicate.rs +++ b/datafusion/pruning/src/pruning_predicate.rs @@ -929,7 +929,7 @@ fn build_statistics_record_batch( StatisticsType::Min => statistics.min_values(&column), StatisticsType::Max => statistics.max_values(&column), StatisticsType::NullCount => statistics.null_counts(&column), - StatisticsType::RowCount => statistics.row_counts(&column), + StatisticsType::RowCount => statistics.row_counts(), }; let array = array.unwrap_or_else(|| new_null_array(data_type, num_containers)); @@ -2300,11 +2300,10 @@ mod tests { .unwrap_or(None) } - fn row_counts(&self, column: &Column) -> Option { + fn row_counts(&self) -> Option { self.stats - .get(column) - .map(|container_stats| container_stats.row_counts()) - .unwrap_or(None) + .values() + .find_map(|container_stats| container_stats.row_counts()) } fn contained( @@ -2342,7 +2341,7 @@ mod tests { None } - fn row_counts(&self, _column: &Column) -> Option { + fn row_counts(&self) -> Option { None } diff --git a/docs/source/library-user-guide/upgrading/54.0.0.md b/docs/source/library-user-guide/upgrading/54.0.0.md index 6dc08cc344e5f..4e6178345bcce 100644 --- a/docs/source/library-user-guide/upgrading/54.0.0.md +++ b/docs/source/library-user-guide/upgrading/54.0.0.md @@ -289,6 +289,41 @@ auto-derefs through the `Arc`). > always return `None`. Use the `downcast_ref` method above instead, or > dereference through the `Arc` first with `plan.as_ref() as &dyn Any`. +### `PruningStatistics::row_counts` no longer takes a `column` parameter + +The `row_counts` method on the `PruningStatistics` trait no longer takes a +`&Column` argument, since row counts are a container-level property (the same +for every column). + +**Before:** + +```rust,ignore +fn row_counts(&self, column: &Column) -> Option { + // ... +} +``` + +**After:** + +```rust,ignore +fn row_counts(&self) -> Option { + // ... +} +``` + +**Who is affected:** + +- Users who implement the `PruningStatistics` trait + +**Migration guide:** + +Remove the `column: &Column` parameter from your `row_counts` implementation +and any corresponding call sites. If your implementation was using the column +argument, note that row counts are identical for all columns in a container, so +the parameter was unnecessary. + +See [PR #21369](https://github.com/apache/datafusion/pull/21369) for details. + ### Avro API and timestamp decoding changes DataFusion has switched to use `arrow-avro` (see [#17861]) when reading avro files