diff --git a/datafusion/functions/src/unicode/character_length.rs b/datafusion/functions/src/unicode/character_length.rs index 465b15ace1d10..23177ff9dbf55 100644 --- a/datafusion/functions/src/unicode/character_length.rs +++ b/datafusion/functions/src/unicode/character_length.rs @@ -17,8 +17,8 @@ use crate::utils::{make_scalar_function, utf8_to_int_type}; use arrow::array::{ - Array, ArrayRef, ArrowPrimitiveType, AsArray, OffsetSizeTrait, PrimitiveArray, - StringArrayType, + Array, ArrayRef, ArrowPrimitiveType, AsArray, GenericStringArray, OffsetSizeTrait, + PrimitiveArray, StringViewArray, }; use arrow::datatypes::{ArrowNativeType, DataType, Int32Type, Int64Type}; use datafusion_common::Result; @@ -104,65 +104,114 @@ fn character_length(args: &[ArrayRef]) -> Result { match args[0].data_type() { DataType::Utf8 => { let string_array = args[0].as_string::(); - character_length_general::(&string_array) + character_length_offsets::(string_array) } DataType::LargeUtf8 => { let string_array = args[0].as_string::(); - character_length_general::(&string_array) + character_length_offsets::(string_array) } DataType::Utf8View => { let string_array = args[0].as_string_view(); - character_length_general::(&string_array) + character_length_string_view::(string_array) } _ => unreachable!("CharacterLengthFunc"), } } -fn character_length_general<'a, T, V>(array: &V) -> Result +/// Optimized character_length for offset-based string arrays (Utf8/LargeUtf8). +/// For ASCII-only arrays, computes lengths directly from the offsets buffer +/// without touching the string data at all. +fn character_length_offsets(array: &GenericStringArray) -> Result where T: ArrowPrimitiveType, T::Native: OffsetSizeTrait, - V: StringArrayType<'a>, + O: OffsetSizeTrait, { - // String characters are variable length encoded in UTF-8, counting the - // number of chars requires expensive decoding, however checking if the - // string is ASCII only is relatively cheap. - // If strings are ASCII only, count bytes instead. - let is_array_ascii_only = array.is_ascii(); let nulls = array.nulls().cloned(); - let array = { - if is_array_ascii_only { - let values: Vec<_> = (0..array.len()) - .map(|i| { - // Safety: we are iterating with array.len() so the index is always valid + let offsets = array.offsets(); + + if array.is_ascii() { + // ASCII: byte length == char length, compute from offsets only + let values: Vec = offsets + .windows(2) + .map(|w| T::Native::usize_as((w[1] - w[0]).as_usize())) + .collect(); + Ok(Arc::new(PrimitiveArray::::new(values.into(), nulls))) + } else { + let values: Vec = (0..array.len()) + .map(|i| { + if array.is_null(i) { + T::default_value() + } else { + // Safety: i is within bounds and not null let value = unsafe { array.value_unchecked(i) }; - T::Native::usize_as(value.len()) - }) - .collect(); - PrimitiveArray::::new(values.into(), nulls) - } else { - let values: Vec<_> = (0..array.len()) - .map(|i| { - // Safety: we are iterating with array.len() so the index is always valid - if array.is_null(i) { - T::default_value() + if value.is_ascii() { + T::Native::usize_as(value.len()) } else { - let value = unsafe { array.value_unchecked(i) }; - if value.is_empty() { - T::default_value() - } else if value.is_ascii() { - T::Native::usize_as(value.len()) - } else { - T::Native::usize_as(value.chars().count()) - } + T::Native::usize_as(value.chars().count()) } - }) - .collect(); - PrimitiveArray::::new(values.into(), nulls) - } - }; + } + }) + .collect(); + Ok(Arc::new(PrimitiveArray::::new(values.into(), nulls))) + } +} + +/// Optimized character_length for StringViewArray. +/// For ASCII-only arrays, reads string lengths directly from the view metadata +/// without touching string data. +fn character_length_string_view(array: &StringViewArray) -> Result +where + T: ArrowPrimitiveType, + T::Native: OffsetSizeTrait, +{ + let nulls = array.nulls().cloned(); + let views = array.views(); - Ok(Arc::new(array)) + if array.is_ascii() { + // ASCII: byte length == char length, read length from view (first 4 bytes) + let values: Vec = views + .iter() + .map(|view| { + let len = (*view as u32) as usize; + T::Native::usize_as(len) + }) + .collect(); + Ok(Arc::new(PrimitiveArray::::new(values.into(), nulls))) + } else { + let values: Vec = views + .iter() + .enumerate() + .map(|(i, raw_view)| { + let len = (*raw_view as u32) as usize; + if len == 0 { + T::default_value() + } else if len <= 12 { + // Inlined string: count UTF-8 chars directly from the u128 view. + // Shift right 32 bits to get string bytes in low bits, then + // mask to only the valid `len` bytes (remaining bytes may be garbage). + let valid_mask = (1u128 << (len * 8)) - 1; + let data = (*raw_view >> 32) & valid_mask; + // Count non-continuation bytes: a UTF-8 continuation byte matches + // 10xxxxxx, so (byte | (byte >> 1)) & 0x80 is set for all + // non-continuation bytes (they have bit7=0 or bit6=1). + let not_continuation = + (data | (!data >> 1)) & 0x0080_0080_0080_0080_0080_0080u128; + T::Native::usize_as(not_continuation.count_ones() as usize) + } else { + // Non-inlined string: must access buffer data + // Safety: i is within bounds + let value = unsafe { array.value_unchecked(i) }; + if value.is_ascii() { + T::Native::usize_as(len) + } else { + T::Native::usize_as(value.chars().count()) + } + } + }) + .collect(); + Ok(Arc::new(PrimitiveArray::::new(values.into(), nulls))) + } } #[cfg(test)]