From 82fb5b94ae1fe97029f54b9ba29517b3fec30280 Mon Sep 17 00:00:00 2001 From: Yongting You <2010youy01@gmail.com> Date: Sat, 7 Sep 2024 13:15:49 +0800 Subject: [PATCH] Faster `character_length()` string function for ASCII-only case (#12356) * character_length() benchmark * char_length() ascii fast path * use usize_as --- datafusion/functions/Cargo.toml | 5 + .../functions/benches/character_length.rs | 114 ++++++++++++++++++ datafusion/functions/src/string/common.rs | 11 ++ .../functions/src/unicode/character_length.rs | 20 ++- 4 files changed, 144 insertions(+), 6 deletions(-) create mode 100644 datafusion/functions/benches/character_length.rs diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 337379a74670..3c95c03896e2 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -166,3 +166,8 @@ required-features = ["math_expressions"] harness = false name = "substr" required-features = ["unicode_expressions"] + +[[bench]] +harness = false +name = "character_length" +required-features = ["unicode_expressions"] diff --git a/datafusion/functions/benches/character_length.rs b/datafusion/functions/benches/character_length.rs new file mode 100644 index 000000000000..17c4dd1f8912 --- /dev/null +++ b/datafusion/functions/benches/character_length.rs @@ -0,0 +1,114 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+extern crate criterion;
+
+use arrow::array::{StringArray, StringViewArray};
+use criterion::{black_box, criterion_group, criterion_main, Criterion};
+use datafusion_expr::ColumnarValue;
+use rand::distributions::Alphanumeric;
+use rand::{rngs::StdRng, Rng, SeedableRng};
+use std::sync::Arc;
+
+/// Builds a one-element argument list holding `n_rows` strings of `str_len_chars` chars
+/// each. A row is null with probability ~`null_density`, drawn from a mixed-width UTF-8
+/// corpus with probability ~`utf8_density`, else ASCII alphanumeric. RNG seed is fixed.
+fn gen_string_array(
+    n_rows: usize,
+    str_len_chars: usize,
+    null_density: f32,
+    utf8_density: f32,
+    is_string_view: bool, // false -> StringArray, true -> StringViewArray
+) -> Vec<ColumnarValue> {
+    let mut rng = StdRng::seed_from_u64(42);
+    let rng_ref = &mut rng;
+
+    let corpus = "DataFusionДатаФусион数据融合📊🔥"; // includes utf8 encoding with 1~4 bytes
+    let corpus_chars: Vec<char> = corpus.chars().collect(); // indexable, no per-char rescan
+
+    let mut output_string_vec: Vec<Option<String>> = Vec::with_capacity(n_rows);
+    for _ in 0..n_rows {
+        let rand_num = rng_ref.gen::<f32>(); // [0.0, 1.0)
+        if rand_num < null_density {
+            output_string_vec.push(None);
+        } else if rand_num < null_density + utf8_density {
+            // Generate random UTF8 string
+            let mut generated_string = String::with_capacity(str_len_chars);
+            for _ in 0..str_len_chars {
+                let idx = rng_ref.gen_range(0..corpus_chars.len());
+                generated_string.push(corpus_chars[idx]);
+            }
+            output_string_vec.push(Some(generated_string));
+        } else {
+            // Generate random ASCII-only string
+            let value = rng_ref
+                .sample_iter(&Alphanumeric)
+                .take(str_len_chars)
+                .collect();
+            let value = String::from_utf8(value).unwrap();
+            output_string_vec.push(Some(value));
+        }
+    }
+
+    if is_string_view {
+        let string_view_array: StringViewArray = output_string_vec.into_iter().collect();
+        vec![ColumnarValue::Array(Arc::new(string_view_array))]
+    } else {
+        let string_array: StringArray = output_string_vec.into_iter().collect();
+        vec![ColumnarValue::Array(Arc::new(string_array))]
+    }
+}
+
+/// Benchmarks `character_length` on StringArray/StringViewArray, ASCII-only and mixed.
+fn criterion_benchmark(c: &mut Criterion) {
+    // All benches are single batch run with 8192 rows
+    let character_length = datafusion_functions::unicode::character_length();
+
+    let n_rows = 8192;
+    for str_len in [8, 32, 128, 4096] {
+        // StringArray ASCII only
+        let args_string_ascii = gen_string_array(n_rows, str_len, 0.1, 0.0, false);
+        c.bench_function(
+            &format!("character_length_StringArray_ascii_str_len_{}", str_len),
+            |b| b.iter(|| black_box(character_length.invoke(&args_string_ascii))),
+        );
+
+        // StringArray UTF8
+        let args_string_utf8 = gen_string_array(n_rows, str_len, 0.1, 0.5, false);
+        c.bench_function(
+            &format!("character_length_StringArray_utf8_str_len_{}", str_len),
+            |b| b.iter(|| black_box(character_length.invoke(&args_string_utf8))),
+        );
+
+        // StringViewArray ASCII only
+        let args_string_view_ascii = gen_string_array(n_rows, str_len, 0.1, 0.0, true);
+        c.bench_function(
+            &format!("character_length_StringViewArray_ascii_str_len_{}", str_len),
+            |b| b.iter(|| black_box(character_length.invoke(&args_string_view_ascii))),
+        );
+
+        // StringViewArray UTF8
+        let args_string_view_utf8 = gen_string_array(n_rows, str_len, 0.1, 0.5, true);
+        c.bench_function(
+            &format!("character_length_StringViewArray_utf8_str_len_{}", str_len),
+            |b| b.iter(|| black_box(character_length.invoke(&args_string_view_utf8))),
+        );
+    }
+}
+
+criterion_group!(benches, criterion_benchmark);
+criterion_main!(benches);
diff --git a/datafusion/functions/src/string/common.rs
b/datafusion/functions/src/string/common.rs
index 6ebcc4ee6cd3..9365a6d83331 100644
--- a/datafusion/functions/src/string/common.rs
+++ b/datafusion/functions/src/string/common.rs
@@ -351,18 +351,29 @@ pub trait StringArrayType<'a>: ArrayAccessor<Item = &'a str> + Sized {
     ///
     /// This iterator iterates returns `Option<&str>` for each item in the array.
     fn iter(&self) -> ArrayIter<Self>;
+
+    /// Returns `true` if all string data in the array is ASCII.
+    fn is_ascii(&self) -> bool;
 }
 
 impl<'a, T: OffsetSizeTrait> StringArrayType<'a> for &'a GenericStringArray<T> {
     fn iter(&self) -> ArrayIter<Self> {
         GenericStringArray::<T>::iter(self)
     }
+
+    fn is_ascii(&self) -> bool {
+        GenericStringArray::<T>::is_ascii(self)
+    }
 }
 
 impl<'a> StringArrayType<'a> for &'a StringViewArray {
     fn iter(&self) -> ArrayIter<Self> {
         StringViewArray::iter(self)
     }
+
+    fn is_ascii(&self) -> bool {
+        StringViewArray::is_ascii(self)
+    }
 }
 
 /// Optimized version of the StringBuilder in Arrow that:
diff --git a/datafusion/functions/src/unicode/character_length.rs b/datafusion/functions/src/unicode/character_length.rs
index e46ee162ff12..c9dc96b2a935 100644
--- a/datafusion/functions/src/unicode/character_length.rs
+++ b/datafusion/functions/src/unicode/character_length.rs
@@ -15,10 +15,10 @@
 // specific language governing permissions and limitations
 // under the License.
+use crate::string::common::StringArrayType; use crate::utils::{make_scalar_function, utf8_to_int_type}; use arrow::array::{ - Array, ArrayAccessor, ArrayIter, ArrayRef, ArrowPrimitiveType, AsArray, - OffsetSizeTrait, PrimitiveArray, + Array, ArrayRef, ArrowPrimitiveType, AsArray, OffsetSizeTrait, PrimitiveArray, }; use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type}; use datafusion_common::Result; @@ -99,18 +99,26 @@ fn character_length(args: &[ArrayRef]) -> Result { } } -fn character_length_general<'a, T: ArrowPrimitiveType, V: ArrayAccessor>( +fn character_length_general<'a, T: ArrowPrimitiveType, V: StringArrayType<'a>>( array: V, ) -> Result where T::Native: OffsetSizeTrait, { - let iter = ArrayIter::new(array); + // String characters are variable length encoded in UTF-8, counting the + // number of chars requires expensive decoding, however checking if the + // string is ASCII only is relatively cheap. + // If strings are ASCII only, count bytes instead. + let is_array_ascii_only = array.is_ascii(); + let iter = array.iter(); let result = iter .map(|string| { string.map(|string: &str| { - T::Native::from_usize(string.chars().count()) - .expect("should not fail as string.chars will always return integer") + if is_array_ascii_only { + T::Native::usize_as(string.len()) + } else { + T::Native::usize_as(string.chars().count()) + } }) }) .collect::>();