Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add extend_dictionary in dictionary builder for improved performance #6875

Merged
merged 7 commits into from
Dec 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
187 changes: 185 additions & 2 deletions arrow-array/src/builder/generic_bytes_dictionary_builder.rs
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@

use crate::builder::{ArrayBuilder, GenericByteBuilder, PrimitiveBuilder};
use crate::types::{ArrowDictionaryKeyType, ByteArrayType, GenericBinaryType, GenericStringType};
use crate::{Array, ArrayRef, DictionaryArray, GenericByteArray};
use crate::{Array, ArrayRef, DictionaryArray, GenericByteArray, TypedDictionaryArray};
use arrow_buffer::ArrowNativeType;
use arrow_schema::{ArrowError, DataType};
use hashbrown::HashTable;
Expand Down Expand Up @@ -305,6 +305,63 @@ where
};
}

/// Extends builder with an existing dictionary array.
///
/// This is the same as [`Self::extend`] but is faster as it translates
/// the dictionary values once rather than doing a lookup for each item in the iterator
///
/// when dictionary values are null (the actual mapped values) the keys are null
///
pub fn extend_dictionary(
&mut self,
dictionary: &TypedDictionaryArray<K, GenericByteArray<T>>,
) -> Result<(), ArrowError> {
let values = dictionary.values();

let v_len = values.len();
let k_len = dictionary.keys().len();
if v_len == 0 && k_len == 0 {
return Ok(());
}

// All nulls
if v_len == 0 {
self.append_nulls(k_len);
return Ok(());
}

if k_len == 0 {
return Err(ArrowError::InvalidArgumentError(
"Dictionary keys should not be empty when values are not empty".to_string(),
));
}

// Orphan values will be carried over to the new dictionary
let mapped_values = values
.iter()
// Dictionary values can technically be null, so we need to handle that
.map(|dict_value| {
dict_value
.map(|dict_value| self.get_or_insert_key(dict_value))
.transpose()
})
.collect::<Result<Vec<_>, _>>()?;

// Just insert the keys without additional lookups
dictionary.keys().iter().for_each(|key| match key {
None => self.append_null(),
Some(original_dict_index) => {
let index = original_dict_index.as_usize().min(v_len - 1);
match mapped_values[index] {
None => self.append_null(),
Some(mapped_value) => self.keys_builder.append_value(mapped_value),
}
}
});

Ok(())
}

/// Builds the `DictionaryArray` and reset this builder.
pub fn finish(&mut self) -> DictionaryArray<K> {
self.dedup.clear();
Expand Down Expand Up @@ -445,8 +502,9 @@ mod tests {
use super::*;

use crate::array::Int8Array;
use crate::cast::AsArray;
use crate::types::{Int16Type, Int32Type, Int8Type, Utf8Type};
use crate::{BinaryArray, StringArray};
use crate::{ArrowPrimitiveType, BinaryArray, StringArray};

fn test_bytes_dictionary_builder<T>(values: Vec<&T::Native>)
where
Expand Down Expand Up @@ -664,4 +722,129 @@ mod tests {
assert_eq!(dict.keys().values(), &[0, 1, 2, 0, 1, 2, 2, 3, 0]);
assert_eq!(dict.values().len(), 4);
}

#[test]
fn test_extend_dictionary() {
let some_dict = {
let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
builder.extend(["a", "b", "c", "a", "b", "c"].into_iter().map(Some));
builder.extend([None::<&str>]);
builder.extend(["c", "d", "a"].into_iter().map(Some));
builder.append_null();
builder.finish()
};

let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
builder.extend(["e", "e", "f", "e", "d"].into_iter().map(Some));
builder
.extend_dictionary(&some_dict.downcast_dict().unwrap())
.unwrap();
let dict = builder.finish();

assert_eq!(dict.values().len(), 6);

let values = dict
.downcast_dict::<GenericByteArray<Utf8Type>>()
.unwrap()
.into_iter()
.collect::<Vec<_>>();

assert_eq!(
values,
[
Some("e"),
Some("e"),
Some("f"),
Some("e"),
Some("d"),
Some("a"),
Some("b"),
Some("c"),
Some("a"),
Some("b"),
Some("c"),
None,
Some("c"),
Some("d"),
Some("a"),
None
]
);
}
#[test]
fn test_extend_dictionary_with_null_in_mapped_value() {
let some_dict = {
let mut values_builder = GenericByteBuilder::<Utf8Type>::new();
let mut keys_builder = PrimitiveBuilder::<Int32Type>::new();

// Manually build a dictionary values that the mapped values have null
values_builder.append_null();
keys_builder.append_value(0);
values_builder.append_value("I like worm hugs");
keys_builder.append_value(1);

let values = values_builder.finish();
let keys = keys_builder.finish();

let data_type = DataType::Dictionary(
Box::new(Int32Type::DATA_TYPE),
Box::new(Utf8Type::DATA_TYPE),
);

let builder = keys
.into_data()
.into_builder()
.data_type(data_type)
.child_data(vec![values.into_data()]);

DictionaryArray::from(unsafe { builder.build_unchecked() })
};

let some_dict_values = some_dict.values().as_string::<i32>();
assert_eq!(
some_dict_values.into_iter().collect::<Vec<_>>(),
&[None, Some("I like worm hugs")]
);

let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
builder
.extend_dictionary(&some_dict.downcast_dict().unwrap())
.unwrap();
let dict = builder.finish();

assert_eq!(dict.values().len(), 1);

let values = dict
.downcast_dict::<GenericByteArray<Utf8Type>>()
.unwrap()
.into_iter()
.collect::<Vec<_>>();

assert_eq!(values, [None, Some("I like worm hugs")]);
}

#[test]
fn test_extend_all_null_dictionary() {
let some_dict = {
let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
builder.append_nulls(2);
builder.finish()
};

let mut builder = GenericByteDictionaryBuilder::<Int32Type, Utf8Type>::new();
builder
.extend_dictionary(&some_dict.downcast_dict().unwrap())
.unwrap();
let dict = builder.finish();

assert_eq!(dict.values().len(), 0);

let values = dict
.downcast_dict::<GenericByteArray<Utf8Type>>()
.unwrap()
.into_iter()
.collect::<Vec<_>>();

assert_eq!(values, [None, None]);
}
}
Loading
Loading