Skip to content

Commit

Permalink
parquet Statistics - deprecate has_* APIs and add _opt functions …
Browse files Browse the repository at this point in the history
…that return `Option<T>` (#6216)

* update public api Statistics::min to return an option.

I first re-named the existing method to `min_unchecked` and made it
internal to the crate.

I then added a `pub min(&self) -> Opiton<&T>` method.

I figure we can first change the public API before deciding what to do
about internal usage.

Ref: #6093

* update public api Statistics::max to return an option.

I first re-named the existing method to `max_unchecked` and made it
internal to the crate.

I then added a `pub max(&self) -> Opiton<&T>` method.

I figure we can first change the public API before deciding what to do
about internal usage.

Ref: #6093

* cargo fmt

* remove Statistics::has_min_max_set from the public api

Ref: #6093

* update impl HeapSize for ValueStatistics to use new min and max api

* migrate all tests to new Statistics min and max api

* make Statistics::null_count return Option<u64>

This removes ambiguity around whether the between all values are non-null or just that the null count stat is missing

Ref: #6215

* update expected metadata memory size tests

Changing null_count from u64 to Option<u64> increases the memory size and layout of the metadata.

I included these tests as a separate commit to call extra attention to it.

* add TODO question on is_min_max_backwards_compatible

* Apply suggestions from code review

Co-authored-by: Andrew Lamb <[email protected]>

* update ValueStatistics::max docs

* rename new optional ValueStatistics::max to max_opt

Per PR review, we will deprecate the old API instead of introducing a brekaing change.

Ref: #6216 (review)

* rename new optional ValueStatistics::min to min_opt

* add Statistics:{min,max}_bytes_opt

This adds the API and migrates all of the test usage.
The old APIs will be deprecated next.

* update make_stats_iterator macro to use *_opt methods

* deprecate non *_opt Statistics and ValueStatistics methods

* remove stale TODO comments

* remove has_min_max_set check from make_decimal_stats_iterator

The check is unnecessary now that the stats funcs return Option<T> when unset.

* deprecate has_min_max_set

An internal version was also created because it is used so extensively in testing.

* switch to null_count_opt and reintroduce deprecated null_count and has_nulls

* remove redundant test assertions of stats._internal_has_min_max_set

This removes the assertion from any test that subsequently unwraps both
min_opt and max_opt.

* replace negated test assertions of stats._internal_has_mix_max_set with assertions on min_opt and max_opt

This removes all use of Statistics::_internal_has_min_max_set from the code base, and so it is also removed.

* Revert changes to parquet writing, update comments

---------

Co-authored-by: Andrew Lamb <[email protected]>
  • Loading branch information
Michael-J-Ward and alamb authored Aug 15, 2024
1 parent 2461a16 commit 69b17ad
Show file tree
Hide file tree
Showing 9 changed files with 521 additions and 319 deletions.
109 changes: 74 additions & 35 deletions parquet/src/arrow/arrow_reader/statistics.rs
Original file line number Diff line number Diff line change
Expand Up @@ -116,7 +116,7 @@ macro_rules! make_stats_iterator {
let next = self.iter.next();
next.map(|x| {
x.and_then(|stats| match stats {
$parquet_statistics_type(s) if stats.has_min_max_set() => Some(s.$func()),
$parquet_statistics_type(s) => s.$func(),
_ => None,
})
})
Expand All @@ -131,45 +131,85 @@ macro_rules! make_stats_iterator {

make_stats_iterator!(
MinBooleanStatsIterator,
min,
min_opt,
ParquetStatistics::Boolean,
bool
);
make_stats_iterator!(
MaxBooleanStatsIterator,
max,
max_opt,
ParquetStatistics::Boolean,
bool
);
make_stats_iterator!(MinInt32StatsIterator, min, ParquetStatistics::Int32, i32);
make_stats_iterator!(MaxInt32StatsIterator, max, ParquetStatistics::Int32, i32);
make_stats_iterator!(MinInt64StatsIterator, min, ParquetStatistics::Int64, i64);
make_stats_iterator!(MaxInt64StatsIterator, max, ParquetStatistics::Int64, i64);
make_stats_iterator!(MinFloatStatsIterator, min, ParquetStatistics::Float, f32);
make_stats_iterator!(MaxFloatStatsIterator, max, ParquetStatistics::Float, f32);
make_stats_iterator!(MinDoubleStatsIterator, min, ParquetStatistics::Double, f64);
make_stats_iterator!(MaxDoubleStatsIterator, max, ParquetStatistics::Double, f64);
make_stats_iterator!(
MinInt32StatsIterator,
min_opt,
ParquetStatistics::Int32,
i32
);
make_stats_iterator!(
MaxInt32StatsIterator,
max_opt,
ParquetStatistics::Int32,
i32
);
make_stats_iterator!(
MinInt64StatsIterator,
min_opt,
ParquetStatistics::Int64,
i64
);
make_stats_iterator!(
MaxInt64StatsIterator,
max_opt,
ParquetStatistics::Int64,
i64
);
make_stats_iterator!(
MinFloatStatsIterator,
min_opt,
ParquetStatistics::Float,
f32
);
make_stats_iterator!(
MaxFloatStatsIterator,
max_opt,
ParquetStatistics::Float,
f32
);
make_stats_iterator!(
MinDoubleStatsIterator,
min_opt,
ParquetStatistics::Double,
f64
);
make_stats_iterator!(
MaxDoubleStatsIterator,
max_opt,
ParquetStatistics::Double,
f64
);
make_stats_iterator!(
MinByteArrayStatsIterator,
min_bytes,
min_bytes_opt,
ParquetStatistics::ByteArray,
[u8]
);
make_stats_iterator!(
MaxByteArrayStatsIterator,
max_bytes,
max_bytes_opt,
ParquetStatistics::ByteArray,
[u8]
);
make_stats_iterator!(
MinFixedLenByteArrayStatsIterator,
min_bytes,
min_bytes_opt,
ParquetStatistics::FixedLenByteArray,
[u8]
);
make_stats_iterator!(
MaxFixedLenByteArrayStatsIterator,
max_bytes,
max_bytes_opt,
ParquetStatistics::FixedLenByteArray,
[u8]
);
Expand Down Expand Up @@ -218,19 +258,18 @@ macro_rules! make_decimal_stats_iterator {
fn next(&mut self) -> Option<Self::Item> {
let next = self.iter.next();
next.map(|x| {
x.and_then(|stats| {
if !stats.has_min_max_set() {
return None;
x.and_then(|stats| match stats {
ParquetStatistics::Int32(s) => {
s.$func().map(|x| $stat_value_type::from(*x))
}
match stats {
ParquetStatistics::Int32(s) => Some($stat_value_type::from(*s.$func())),
ParquetStatistics::Int64(s) => Some($stat_value_type::from(*s.$func())),
ParquetStatistics::ByteArray(s) => Some($convert_func(s.$bytes_func())),
ParquetStatistics::FixedLenByteArray(s) => {
Some($convert_func(s.$bytes_func()))
}
_ => None,
ParquetStatistics::Int64(s) => {
s.$func().map(|x| $stat_value_type::from(*x))
}
ParquetStatistics::ByteArray(s) => s.$bytes_func().map($convert_func),
ParquetStatistics::FixedLenByteArray(s) => {
s.$bytes_func().map($convert_func)
}
_ => None,
})
})
}
Expand All @@ -244,29 +283,29 @@ macro_rules! make_decimal_stats_iterator {

make_decimal_stats_iterator!(
MinDecimal128StatsIterator,
min,
min_bytes,
min_opt,
min_bytes_opt,
i128,
from_bytes_to_i128
);
make_decimal_stats_iterator!(
MaxDecimal128StatsIterator,
max,
max_bytes,
max_opt,
max_bytes_opt,
i128,
from_bytes_to_i128
);
make_decimal_stats_iterator!(
MinDecimal256StatsIterator,
min,
min_bytes,
min_opt,
min_bytes_opt,
i256,
from_bytes_to_i256
);
make_decimal_stats_iterator!(
MaxDecimal256StatsIterator,
max,
max_bytes,
max_opt,
max_bytes_opt,
i256,
from_bytes_to_i256
);
Expand Down Expand Up @@ -1347,7 +1386,7 @@ impl<'a> StatisticsConverter<'a> {
let null_counts = metadatas
.into_iter()
.map(|x| x.column(parquet_index).statistics())
.map(|s| s.map(|s| s.null_count()));
.map(|s| s.and_then(|s| s.null_count_opt()));
Ok(UInt64Array::from_iter(null_counts))
}

Expand Down
32 changes: 21 additions & 11 deletions parquet/src/arrow/arrow_writer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2541,10 +2541,15 @@ mod tests {
row_offset += column.num_values() as usize;

let stats = column.statistics().unwrap();
assert!(stats.has_min_max_set());
if let Statistics::Int32(stats) = stats {
assert_eq!(*stats.min() as u32, *src_slice.iter().min().unwrap());
assert_eq!(*stats.max() as u32, *src_slice.iter().max().unwrap());
assert_eq!(
*stats.min_opt().unwrap() as u32,
*src_slice.iter().min().unwrap()
);
assert_eq!(
*stats.max_opt().unwrap() as u32,
*src_slice.iter().max().unwrap()
);
} else {
panic!("Statistics::Int32 missing")
}
Expand Down Expand Up @@ -2582,10 +2587,15 @@ mod tests {
row_offset += column.num_values() as usize;

let stats = column.statistics().unwrap();
assert!(stats.has_min_max_set());
if let Statistics::Int64(stats) = stats {
assert_eq!(*stats.min() as u64, *src_slice.iter().min().unwrap());
assert_eq!(*stats.max() as u64, *src_slice.iter().max().unwrap());
assert_eq!(
*stats.min_opt().unwrap() as u64,
*src_slice.iter().min().unwrap()
);
assert_eq!(
*stats.max_opt().unwrap() as u64,
*src_slice.iter().max().unwrap()
);
} else {
panic!("Statistics::Int64 missing")
}
Expand All @@ -2608,7 +2618,7 @@ mod tests {
assert_eq!(row_group.num_columns(), 1);
let column = row_group.column(0);
let stats = column.statistics().unwrap();
assert_eq!(stats.null_count(), 2);
assert_eq!(stats.null_count_opt(), Some(2));
}
}

Expand Down Expand Up @@ -3069,8 +3079,8 @@ mod tests {

// Column chunk of column "a" should have chunk level statistics
if let Statistics::ByteArray(byte_array_stats) = a_col.statistics().unwrap() {
let min = byte_array_stats.min();
let max = byte_array_stats.max();
let min = byte_array_stats.min_opt().unwrap();
let max = byte_array_stats.max_opt().unwrap();

assert_eq!(min.as_bytes(), &[b'a']);
assert_eq!(max.as_bytes(), &[b'd']);
Expand Down Expand Up @@ -3141,8 +3151,8 @@ mod tests {

// Column chunk of column "a" should have chunk level statistics
if let Statistics::ByteArray(byte_array_stats) = a_col.statistics().unwrap() {
let min = byte_array_stats.min();
let max = byte_array_stats.max();
let min = byte_array_stats.min_opt().unwrap();
let max = byte_array_stats.max_opt().unwrap();

assert_eq!(min.as_bytes(), &[b'a']);
assert_eq!(max.as_bytes(), &[b'd']);
Expand Down
10 changes: 5 additions & 5 deletions parquet/src/column/page.rs
Original file line number Diff line number Diff line change
Expand Up @@ -370,15 +370,15 @@ mod tests {
encoding: Encoding::PLAIN,
def_level_encoding: Encoding::RLE,
rep_level_encoding: Encoding::RLE,
statistics: Some(Statistics::int32(Some(1), Some(2), None, 1, true)),
statistics: Some(Statistics::int32(Some(1), Some(2), None, Some(1), true)),
};
assert_eq!(data_page.page_type(), PageType::DATA_PAGE);
assert_eq!(data_page.buffer(), vec![0, 1, 2].as_slice());
assert_eq!(data_page.num_values(), 10);
assert_eq!(data_page.encoding(), Encoding::PLAIN);
assert_eq!(
data_page.statistics(),
Some(&Statistics::int32(Some(1), Some(2), None, 1, true))
Some(&Statistics::int32(Some(1), Some(2), None, Some(1), true))
);

let data_page_v2 = Page::DataPageV2 {
Expand All @@ -390,15 +390,15 @@ mod tests {
def_levels_byte_len: 30,
rep_levels_byte_len: 40,
is_compressed: false,
statistics: Some(Statistics::int32(Some(1), Some(2), None, 1, true)),
statistics: Some(Statistics::int32(Some(1), Some(2), None, Some(1), true)),
};
assert_eq!(data_page_v2.page_type(), PageType::DATA_PAGE_V2);
assert_eq!(data_page_v2.buffer(), vec![0, 1, 2].as_slice());
assert_eq!(data_page_v2.num_values(), 10);
assert_eq!(data_page_v2.encoding(), Encoding::PLAIN);
assert_eq!(
data_page_v2.statistics(),
Some(&Statistics::int32(Some(1), Some(2), None, 1, true))
Some(&Statistics::int32(Some(1), Some(2), None, Some(1), true))
);

let dict_page = Page::DictionaryPage {
Expand All @@ -422,7 +422,7 @@ mod tests {
encoding: Encoding::PLAIN,
def_level_encoding: Encoding::RLE,
rep_level_encoding: Encoding::RLE,
statistics: Some(Statistics::int32(Some(1), Some(2), None, 1, true)),
statistics: Some(Statistics::int32(Some(1), Some(2), None, Some(1), true)),
};

let cpage = CompressedPage::new(data_page, 5);
Expand Down
Loading

0 comments on commit 69b17ad

Please sign in to comment.