Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
314 changes: 236 additions & 78 deletions datafusion/functions/benches/pad.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,20 +15,22 @@
// specific language governing permissions and limitations
// under the License.

use arrow::array::{ArrayRef, ArrowPrimitiveType, OffsetSizeTrait, PrimitiveArray};
extern crate criterion;

use arrow::array::{ArrowPrimitiveType, OffsetSizeTrait, PrimitiveArray};
use arrow::datatypes::{DataType, Field, Int64Type};
use arrow::util::bench_util::{
create_string_array_with_len, create_string_view_array_with_len,
};
use criterion::{BenchmarkId, Criterion, criterion_group, criterion_main};
use datafusion_common::DataFusionError;
use criterion::{Criterion, SamplingMode, criterion_group, criterion_main};
use datafusion_common::config::ConfigOptions;
use datafusion_expr::{ColumnarValue, ScalarFunctionArgs};
use datafusion_functions::unicode::{lpad, rpad};
use datafusion_functions::unicode;
use rand::Rng;
use rand::distr::{Distribution, Uniform};
use std::hint::black_box;
use std::sync::Arc;
use std::time::Duration;

struct Filter<Dist> {
dist: Dist,
Expand Down Expand Up @@ -67,104 +69,260 @@ where
.collect()
}

// NOTE(review): this span comes from a diff view, so lines of two versions of the
// same helper (`create_args` and `create_pad_args`) are interleaved here without
// +/- markers. Lines mentioning `force_view_types` or `as ArrayRef` appear to belong
// to the removed version; lines mentioning `target_len` / `use_string_view` appear
// to belong to the added one — confirm against the real file before reusing.
fn create_args<O: OffsetSizeTrait>(
/// Create args for pad benchmark
///
/// Returns three `ColumnarValue::Array` arguments in this order: the input
/// strings, an `Int64Type` length array, and the fill strings.
///
/// * `size` and `str_len` are forwarded to the string-array generators together
///   with the literal `0.1` (presumably a null density — confirm against the
///   arrow `bench_util` helpers).
/// * `target_len` is forwarded to `create_primitive_array` when building the
///   length array.
/// * `use_string_view` selects `create_string_view_array_with_len` for the string
///   and fill arrays; otherwise `create_string_array_with_len::<O>` is used.
fn create_pad_args<O: OffsetSizeTrait>(
size: usize,
str_len: usize,
force_view_types: bool,
target_len: usize,
use_string_view: bool,
) -> Vec<ColumnarValue> {
let length_array = Arc::new(create_primitive_array::<Int64Type>(size, 0.0, str_len));

if !force_view_types {
let string_array =
Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
let fill_array = Arc::new(create_string_array_with_len::<O>(size, 0.1, str_len));
let length_array =
Arc::new(create_primitive_array::<Int64Type>(size, 0.0, target_len));

if use_string_view {
let string_array = create_string_view_array_with_len(size, 0.1, str_len, false);
let fill_array = create_string_view_array_with_len(size, 0.1, str_len, false);
vec![
ColumnarValue::Array(string_array),
ColumnarValue::Array(Arc::clone(&length_array) as ArrayRef),
ColumnarValue::Array(fill_array),
ColumnarValue::Array(Arc::new(string_array)),
ColumnarValue::Array(length_array),
ColumnarValue::Array(Arc::new(fill_array)),
]
} else {
let string_array =
Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false));
let fill_array =
Arc::new(create_string_view_array_with_len(size, 0.1, str_len, false));

let string_array = create_string_array_with_len::<O>(size, 0.1, str_len);
let fill_array = create_string_array_with_len::<O>(size, 0.1, str_len);
vec![
ColumnarValue::Array(string_array),
ColumnarValue::Array(Arc::clone(&length_array) as ArrayRef),
ColumnarValue::Array(fill_array),
ColumnarValue::Array(Arc::new(string_array)),
ColumnarValue::Array(length_array),
ColumnarValue::Array(Arc::new(fill_array)),
]
}
}

/// Invokes `lpad` (when `left_pad` is true) or `rpad` with the given arguments
/// and returns the result of `invoke_with_args` unchanged.
///
/// One nullable `Field` named `arg_{idx}` is built per argument from that
/// argument's data type, a default `ConfigOptions` is created, and the return
/// field is always declared as a nullable `Utf8` field named `"f"`.
///
/// `args` is taken by value but only iterated and cloned here, which is why the
/// `needless_pass_by_value` lint is expected.
///
/// NOTE(review): this span comes from a diff view; this helper appears to be the
/// removed side of the change — confirm against the real file.
#[expect(clippy::needless_pass_by_value)]
fn invoke_pad_with_args(
args: Vec<ColumnarValue>,
number_rows: usize,
left_pad: bool,
) -> Result<ColumnarValue, DataFusionError> {
// Describe every argument with a field carrying its runtime data type.
let arg_fields = args
.iter()
.enumerate()
.map(|(idx, arg)| Field::new(format!("arg_{idx}"), arg.data_type(), true).into())
.collect::<Vec<_>>();
let config_options = Arc::new(ConfigOptions::default());

let scalar_args = ScalarFunctionArgs {
args: args.clone(),
arg_fields,
number_rows,
return_field: Field::new("f", DataType::Utf8, true).into(),
config_options: Arc::clone(&config_options),
};
// NOTE(review): the lines from here to the next blank line look like the opening
// of the added `criterion_benchmark`, interleaved by the diff view; they do not
// appear to be part of `invoke_pad_with_args`.
fn criterion_benchmark(c: &mut Criterion) {
for size in [1024, 4096] {
let mut group = c.benchmark_group(format!("lpad size={size}"));
group.sampling_mode(SamplingMode::Flat);
group.sample_size(10);
group.measurement_time(Duration::from_secs(10));

// Dispatch to the left- or right-padding function.
if left_pad {
lpad().invoke_with_args(scalar_args)
} else {
rpad().invoke_with_args(scalar_args)
}
}
// Utf8 type
let args = create_pad_args::<i32>(size, 5, 20, false);
let arg_fields = args
.iter()
.enumerate()
.map(|(idx, arg)| {
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
})
.collect::<Vec<_>>();
let config_options = Arc::new(ConfigOptions::default());

fn criterion_benchmark(c: &mut Criterion) {
for size in [1024, 2048] {
let mut group = c.benchmark_group("lpad function");
group.bench_function(
format!("lpad utf8 [size={size}, str_len=5, target=20]"),
|b| {
b.iter(|| {
let args_cloned = args.clone();
black_box(unicode::lpad().invoke_with_args(ScalarFunctionArgs {
args: args_cloned,
arg_fields: arg_fields.clone(),
number_rows: size,
return_field: Field::new("f", DataType::Utf8, true).into(),
config_options: Arc::clone(&config_options),
}))
})
},
);

let args = create_args::<i32>(size, 32, false);
// StringView type
let args = create_pad_args::<i32>(size, 5, 20, true);
let arg_fields = args
.iter()
.enumerate()
.map(|(idx, arg)| {
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
})
.collect::<Vec<_>>();

group.bench_function(BenchmarkId::new("utf8 type", size), |b| {
b.iter(|| black_box(invoke_pad_with_args(args.clone(), size, true).unwrap()))
});
group.bench_function(
format!("lpad stringview [size={size}, str_len=5, target=20]"),
|b| {
b.iter(|| {
let args_cloned = args.clone();
black_box(unicode::lpad().invoke_with_args(ScalarFunctionArgs {
args: args_cloned,
arg_fields: arg_fields.clone(),
number_rows: size,
return_field: Field::new("f", DataType::Utf8View, true).into(),
config_options: Arc::clone(&config_options),
}))
})
},
);

let args = create_args::<i64>(size, 32, false);
group.bench_function(BenchmarkId::new("largeutf8 type", size), |b| {
b.iter(|| black_box(invoke_pad_with_args(args.clone(), size, true).unwrap()))
});
// Utf8 type with longer strings
let args = create_pad_args::<i32>(size, 20, 50, false);
let arg_fields = args
.iter()
.enumerate()
.map(|(idx, arg)| {
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
})
.collect::<Vec<_>>();

let args = create_args::<i32>(size, 32, true);
group.bench_function(BenchmarkId::new("stringview type", size), |b| {
b.iter(|| black_box(invoke_pad_with_args(args.clone(), size, true).unwrap()))
});
group.bench_function(
format!("lpad utf8 [size={size}, str_len=20, target=50]"),
|b| {
b.iter(|| {
let args_cloned = args.clone();
black_box(unicode::lpad().invoke_with_args(ScalarFunctionArgs {
args: args_cloned,
arg_fields: arg_fields.clone(),
number_rows: size,
return_field: Field::new("f", DataType::Utf8, true).into(),
config_options: Arc::clone(&config_options),
}))
})
},
);

// StringView type with longer strings
let args = create_pad_args::<i32>(size, 20, 50, true);
let arg_fields = args
.iter()
.enumerate()
.map(|(idx, arg)| {
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
})
.collect::<Vec<_>>();

group.bench_function(
format!("lpad stringview [size={size}, str_len=20, target=50]"),
|b| {
b.iter(|| {
let args_cloned = args.clone();
black_box(unicode::lpad().invoke_with_args(ScalarFunctionArgs {
args: args_cloned,
arg_fields: arg_fields.clone(),
number_rows: size,
return_field: Field::new("f", DataType::Utf8View, true).into(),
config_options: Arc::clone(&config_options),
}))
})
},
);

group.finish();
}

for size in [1024, 4096] {
let mut group = c.benchmark_group(format!("rpad size={size}"));
group.sampling_mode(SamplingMode::Flat);
group.sample_size(10);
group.measurement_time(Duration::from_secs(10));

// Utf8 type
let args = create_pad_args::<i32>(size, 5, 20, false);
let arg_fields = args
.iter()
.enumerate()
.map(|(idx, arg)| {
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
})
.collect::<Vec<_>>();
let config_options = Arc::new(ConfigOptions::default());

group.bench_function(
format!("rpad utf8 [size={size}, str_len=5, target=20]"),
|b| {
b.iter(|| {
let args_cloned = args.clone();
black_box(unicode::rpad().invoke_with_args(ScalarFunctionArgs {
args: args_cloned,
arg_fields: arg_fields.clone(),
number_rows: size,
return_field: Field::new("f", DataType::Utf8, true).into(),
config_options: Arc::clone(&config_options),
}))
})
},
);

// StringView type
let args = create_pad_args::<i32>(size, 5, 20, true);
let arg_fields = args
.iter()
.enumerate()
.map(|(idx, arg)| {
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
})
.collect::<Vec<_>>();

group.bench_function(
format!("rpad stringview [size={size}, str_len=5, target=20]"),
|b| {
b.iter(|| {
let args_cloned = args.clone();
black_box(unicode::rpad().invoke_with_args(ScalarFunctionArgs {
args: args_cloned,
arg_fields: arg_fields.clone(),
number_rows: size,
return_field: Field::new("f", DataType::Utf8View, true).into(),
config_options: Arc::clone(&config_options),
}))
})
},
);

let mut group = c.benchmark_group("rpad function");
// Utf8 type with longer strings
let args = create_pad_args::<i32>(size, 20, 50, false);
let arg_fields = args
.iter()
.enumerate()
.map(|(idx, arg)| {
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
})
.collect::<Vec<_>>();

let args = create_args::<i32>(size, 32, false);
group.bench_function(BenchmarkId::new("utf8 type", size), |b| {
b.iter(|| black_box(invoke_pad_with_args(args.clone(), size, false).unwrap()))
});
group.bench_function(
format!("rpad utf8 [size={size}, str_len=20, target=50]"),
|b| {
b.iter(|| {
let args_cloned = args.clone();
black_box(unicode::rpad().invoke_with_args(ScalarFunctionArgs {
args: args_cloned,
arg_fields: arg_fields.clone(),
number_rows: size,
return_field: Field::new("f", DataType::Utf8, true).into(),
config_options: Arc::clone(&config_options),
}))
})
},
);

let args = create_args::<i64>(size, 32, false);
group.bench_function(BenchmarkId::new("largeutf8 type", size), |b| {
b.iter(|| black_box(invoke_pad_with_args(args.clone(), size, false).unwrap()))
});
// StringView type with longer strings
let args = create_pad_args::<i32>(size, 20, 50, true);
let arg_fields = args
.iter()
.enumerate()
.map(|(idx, arg)| {
Field::new(format!("arg_{idx}"), arg.data_type(), true).into()
})
.collect::<Vec<_>>();

// rpad for stringview type
let args = create_args::<i32>(size, 32, true);
group.bench_function(BenchmarkId::new("stringview type", size), |b| {
b.iter(|| black_box(invoke_pad_with_args(args.clone(), size, false).unwrap()))
});
group.bench_function(
format!("rpad stringview [size={size}, str_len=20, target=50]"),
|b| {
b.iter(|| {
let args_cloned = args.clone();
black_box(unicode::rpad().invoke_with_args(ScalarFunctionArgs {
args: args_cloned,
arg_fields: arg_fields.clone(),
number_rows: size,
return_field: Field::new("f", DataType::Utf8View, true).into(),
config_options: Arc::clone(&config_options),
}))
})
},
);

group.finish();
}
Expand Down
Loading