From de35dcf04db7396ace1e3c11ee72812f4de425a4 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Mon, 6 Apr 2026 08:26:44 -0400 Subject: [PATCH 1/6] . --- Cargo.lock | 1 - datafusion/functions/Cargo.toml | 4 +- datafusion/functions/src/unicode/common.rs | 18 ++- datafusion/functions/src/unicode/lpad.rs | 46 +++----- datafusion/functions/src/unicode/rpad.rs | 42 +++---- datafusion/functions/src/unicode/translate.rs | 108 ++++++++---------- .../test_files/string/string_literal.slt | 84 ++++++++++++++ 7 files changed, 175 insertions(+), 128 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 895b3059f50c1..e9b230dee88d7 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2259,7 +2259,6 @@ dependencies = [ "regex", "sha2", "tokio", - "unicode-segmentation", "uuid", ] diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 7503c337517ef..30f5da816b3ab 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -59,7 +59,7 @@ regex_expressions = ["regex"] # enable string functions string_expressions = ["uuid"] # enable unicode functions -unicode_expressions = ["unicode-segmentation"] +unicode_expressions = [] [lib] name = "datafusion_functions" @@ -87,7 +87,7 @@ num-traits = { workspace = true } rand = { workspace = true } regex = { workspace = true, optional = true } sha2 = { workspace = true, optional = true } -unicode-segmentation = { version = "^1.13.2", optional = true } + uuid = { workspace = true, features = ["v4"], optional = true } [dev-dependencies] diff --git a/datafusion/functions/src/unicode/common.rs b/datafusion/functions/src/unicode/common.rs index 002776e6c6538..89e614ab092b5 100644 --- a/datafusion/functions/src/unicode/common.rs +++ b/datafusion/functions/src/unicode/common.rs @@ -78,6 +78,16 @@ impl LeftRightSlicer for RightSlicer { } } +/// Returns the byte offset of the `n`th codepoint in `string`, +/// or `string.len()` if the string has fewer than `n` codepoints. +#[inline] +pub(crate) fn byte_offset_of_char(string: &str, n: usize) -> usize { + string + .char_indices() + .nth(n) + .map_or(string.len(), |(i, _)| i) +} + /// Calculate the byte length of the substring of `n` chars from string `string` #[inline] fn left_right_byte_length(string: &str, n: i64) -> usize { @@ -88,11 +98,9 @@ fn left_right_byte_length(string: &str, n: i64) -> usize { .map(|(index, _)| index) .unwrap_or(0), Ordering::Equal => 0, - Ordering::Greater => string - .char_indices() - .nth(n.unsigned_abs().min(usize::MAX as u64) as usize) - .map(|(index, _)| index) - .unwrap_or(string.len()), + Ordering::Greater => { + byte_offset_of_char(string, n.unsigned_abs().min(usize::MAX as u64) as usize) + } } } diff --git a/datafusion/functions/src/unicode/lpad.rs b/datafusion/functions/src/unicode/lpad.rs index d7487c385e84f..18cabf2367be8 100644 --- a/datafusion/functions/src/unicode/lpad.rs +++ b/datafusion/functions/src/unicode/lpad.rs @@ -24,7 +24,6 @@ use arrow::array::{ OffsetSizeTrait, StringArrayType, StringViewArray, }; use arrow::datatypes::DataType; -use unicode_segmentation::UnicodeSegmentation; use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::cast::as_int64_array; @@ -178,7 +177,7 @@ impl ScalarUDFImpl for LPadFunc { } } -use super::common::{try_as_scalar_i64, try_as_scalar_str}; +use super::common::{byte_offset_of_char, try_as_scalar_i64, try_as_scalar_str}; /// Optimized lpad for constant target_len and fill arguments. fn lpad_scalar_args<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>( @@ -270,22 +269,18 @@ fn lpad_scalar_unicode<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>( let data_capacity = string_array.len().saturating_mul(target_len * 4); let mut builder = GenericStringBuilder::::with_capacity(string_array.len(), data_capacity); - let mut graphemes_buf = Vec::new(); for maybe_string in string_array.iter() { match maybe_string { Some(string) => { - graphemes_buf.clear(); - graphemes_buf.extend(string.graphemes(true)); + let char_count = string.chars().count(); - if target_len < graphemes_buf.len() { - let end: usize = - graphemes_buf[..target_len].iter().map(|g| g.len()).sum(); - builder.append_value(&string[..end]); + if target_len < char_count { + builder.append_value(&string[..byte_offset_of_char(string, target_len)]); } else if fill_chars.is_empty() { builder.append_value(string); } else { - let pad_chars = target_len - graphemes_buf.len(); + let pad_chars = target_len - char_count; let pad_bytes = char_byte_offsets[pad_chars]; builder.write_str(&padding_buf[..pad_bytes])?; builder.append_value(string); @@ -378,7 +373,6 @@ where { let array = if let Some(fill_array) = fill_array { let mut builder: GenericStringBuilder = GenericStringBuilder::new(); - let mut graphemes_buf = Vec::new(); let mut fill_chars_buf = Vec::new(); for ((string, target_len), fill) in string_array @@ -407,8 +401,7 @@ where } if string.is_ascii() && fill.is_ascii() { - // ASCII fast path: byte length == character length, - // so we skip expensive grapheme segmentation. + // ASCII fast path: byte length == character length. let str_len = string.len(); if target_len < str_len { builder.append_value(&string[..target_len]); @@ -428,21 +421,17 @@ where builder.append_value(string); } } else { - // Reuse buffers by clearing and refilling - graphemes_buf.clear(); - graphemes_buf.extend(string.graphemes(true)); + let char_count = string.chars().count(); fill_chars_buf.clear(); fill_chars_buf.extend(fill.chars()); - if target_len < graphemes_buf.len() { - let end: usize = - graphemes_buf[..target_len].iter().map(|g| g.len()).sum(); - builder.append_value(&string[..end]); + if target_len < char_count { + builder.append_value(&string[..byte_offset_of_char(string, target_len)]); } else if fill_chars_buf.is_empty() { builder.append_value(string); } else { - for l in 0..target_len - graphemes_buf.len() { + for l in 0..target_len - char_count { let c = *fill_chars_buf.get(l % fill_chars_buf.len()).unwrap(); builder.write_char(c)?; @@ -458,7 +447,6 @@ where builder.finish() } else { let mut builder: GenericStringBuilder = GenericStringBuilder::new(); - let mut graphemes_buf = Vec::new(); for (string, target_len) in string_array.iter().zip(length_array.iter()) { if let (Some(string), Some(target_len)) = (string, target_len) { @@ -491,16 +479,12 @@ where builder.append_value(string); } } else { - // Reuse buffer by clearing and refilling - graphemes_buf.clear(); - graphemes_buf.extend(string.graphemes(true)); - - if target_len < graphemes_buf.len() { - let end: usize = - graphemes_buf[..target_len].iter().map(|g| g.len()).sum(); - builder.append_value(&string[..end]); + let char_count = string.chars().count(); + + if target_len < char_count { + builder.append_value(&string[..byte_offset_of_char(string, target_len)]); } else { - for _ in 0..(target_len - graphemes_buf.len()) { + for _ in 0..(target_len - char_count) { builder.write_str(" ")?; } builder.append_value(string); diff --git a/datafusion/functions/src/unicode/rpad.rs b/datafusion/functions/src/unicode/rpad.rs index 44ce4640422d6..72faffd3a7c9d 100644 --- a/datafusion/functions/src/unicode/rpad.rs +++ b/datafusion/functions/src/unicode/rpad.rs @@ -24,7 +24,6 @@ use arrow::array::{ OffsetSizeTrait, StringArrayType, StringViewArray, }; use arrow::datatypes::DataType; -use unicode_segmentation::UnicodeSegmentation; use crate::utils::{make_scalar_function, utf8_to_str_type}; use datafusion_common::cast::as_int64_array; @@ -178,7 +177,7 @@ impl ScalarUDFImpl for RPadFunc { } } -use super::common::{try_as_scalar_i64, try_as_scalar_str}; +use super::common::{byte_offset_of_char, try_as_scalar_i64, try_as_scalar_str}; /// Optimized rpad for constant target_len and fill arguments. fn rpad_scalar_args<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>( @@ -271,22 +270,18 @@ fn rpad_scalar_unicode<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>( let data_capacity = string_array.len().saturating_mul(target_len * 4); let mut builder = GenericStringBuilder::::with_capacity(string_array.len(), data_capacity); - let mut graphemes_buf = Vec::new(); for maybe_string in string_array.iter() { match maybe_string { Some(string) => { - graphemes_buf.clear(); - graphemes_buf.extend(string.graphemes(true)); + let char_count = string.chars().count(); - if target_len < graphemes_buf.len() { - let end: usize = - graphemes_buf[..target_len].iter().map(|g| g.len()).sum(); - builder.append_value(&string[..end]); + if target_len < char_count { + builder.append_value(&string[..byte_offset_of_char(string, target_len)]); } else if fill_chars.is_empty() { builder.append_value(string); } else { - let pad_chars = target_len - graphemes_buf.len(); + let pad_chars = target_len - char_count; let pad_bytes = char_byte_offsets[pad_chars]; builder.write_str(string)?; builder.write_str(&padding_buf[..pad_bytes])?; @@ -377,7 +372,6 @@ where { let array = if let Some(fill_array) = fill_array { let mut builder: GenericStringBuilder = GenericStringBuilder::new(); - let mut graphemes_buf = Vec::new(); let mut fill_chars_buf = Vec::new(); for ((string, target_len), fill) in string_array @@ -406,8 +400,7 @@ where } if string.is_ascii() && fill.is_ascii() { - // ASCII fast path: byte length == character length, - // so we skip expensive grapheme segmentation. + // ASCII fast path: byte length == character length. let str_len = string.len(); if target_len < str_len { builder.append_value(&string[..target_len]); @@ -428,21 +421,18 @@ where builder.append_value(""); } } else { - graphemes_buf.clear(); - graphemes_buf.extend(string.graphemes(true)); + let char_count = string.chars().count(); fill_chars_buf.clear(); fill_chars_buf.extend(fill.chars()); - if target_len < graphemes_buf.len() { - let end: usize = - graphemes_buf[..target_len].iter().map(|g| g.len()).sum(); - builder.append_value(&string[..end]); + if target_len < char_count { + builder.append_value(&string[..byte_offset_of_char(string, target_len)]); } else if fill_chars_buf.is_empty() { builder.append_value(string); } else { builder.write_str(string)?; - for l in 0..target_len - graphemes_buf.len() { + for l in 0..target_len - char_count { let c = *fill_chars_buf.get(l % fill_chars_buf.len()).unwrap(); builder.write_char(c)?; @@ -458,7 +448,6 @@ where builder.finish() } else { let mut builder: GenericStringBuilder = GenericStringBuilder::new(); - let mut graphemes_buf = Vec::new(); for (string, target_len) in string_array.iter().zip(length_array.iter()) { if let (Some(string), Some(target_len)) = (string, target_len) { @@ -492,16 +481,13 @@ where builder.append_value(""); } } else { - graphemes_buf.clear(); - graphemes_buf.extend(string.graphemes(true)); + let char_count = string.chars().count(); - if target_len < graphemes_buf.len() { - let end: usize = - graphemes_buf[..target_len].iter().map(|g| g.len()).sum(); - builder.append_value(&string[..end]); + if target_len < char_count { + builder.append_value(&string[..byte_offset_of_char(string, target_len)]); } else { builder.write_str(string)?; - for _ in 0..(target_len - graphemes_buf.len()) { + for _ in 0..(target_len - char_count) { builder.write_str(" ")?; } builder.append_value(""); diff --git a/datafusion/functions/src/unicode/translate.rs b/datafusion/functions/src/unicode/translate.rs index 5f95c095a644e..29dc660b86f62 100644 --- a/datafusion/functions/src/unicode/translate.rs +++ b/datafusion/functions/src/unicode/translate.rs @@ -21,7 +21,6 @@ use arrow::array::{ }; use arrow::datatypes::DataType; use datafusion_common::HashMap; -use unicode_segmentation::UnicodeSegmentation; use crate::utils::make_scalar_function; use datafusion_common::{Result, exec_err}; @@ -97,11 +96,10 @@ impl ScalarUDFImpl for TranslateFunc { try_as_scalar_str(&args.args[1]), try_as_scalar_str(&args.args[2]), ) { - let to_graphemes: Vec<&str> = to_str.graphemes(true).collect(); + let to_chars: Vec = to_str.chars().collect(); - let mut from_map: HashMap<&str, usize> = HashMap::new(); - for (index, c) in from_str.graphemes(true).enumerate() { - // Ignore characters that already exist in from_map + let mut from_map: HashMap = HashMap::new(); + for (index, c) in from_str.chars().enumerate() { from_map.entry(c).or_insert(index); } @@ -117,7 +115,7 @@ impl ScalarUDFImpl for TranslateFunc { translate_with_map( arr, &from_map, - &to_graphemes, + &to_chars, ascii_table.as_ref(), builder, ) @@ -129,7 +127,7 @@ impl ScalarUDFImpl for TranslateFunc { translate_with_map( arr, &from_map, - &to_graphemes, + &to_chars, ascii_table.as_ref(), builder, ) @@ -141,7 +139,7 @@ impl ScalarUDFImpl for TranslateFunc { translate_with_map( arr, &from_map, - &to_graphemes, + &to_chars, ascii_table.as_ref(), builder, ) @@ -215,48 +213,27 @@ where let from_array_iter = ArrayIter::new(from_array); let to_array_iter = ArrayIter::new(to_array); - // Reusable buffers to avoid allocating for each row - let mut from_map: HashMap<&str, usize> = HashMap::new(); - let mut from_graphemes: Vec<&str> = Vec::new(); - let mut to_graphemes: Vec<&str> = Vec::new(); - let mut string_graphemes: Vec<&str> = Vec::new(); - let mut result_graphemes: Vec<&str> = Vec::new(); + let mut from_map: HashMap = HashMap::new(); + let mut to_chars: Vec = Vec::new(); + let mut result_buf = String::new(); for ((string, from), to) in string_array_iter.zip(from_array_iter).zip(to_array_iter) { match (string, from, to) { (Some(string), Some(from), Some(to)) => { - // Clear and reuse buffers from_map.clear(); - from_graphemes.clear(); - to_graphemes.clear(); - string_graphemes.clear(); - result_graphemes.clear(); - - // Build from_map using reusable buffer - from_graphemes.extend(from.graphemes(true)); - for (index, c) in from_graphemes.iter().enumerate() { - // Ignore characters that already exist in from_map - from_map.entry(*c).or_insert(index); - } + to_chars.clear(); + result_buf.clear(); - // Build to_graphemes - to_graphemes.extend(to.graphemes(true)); - - // Process string and build result - string_graphemes.extend(string.graphemes(true)); - for c in &string_graphemes { - match from_map.get(*c) { - Some(n) => { - if let Some(replacement) = to_graphemes.get(*n) { - result_graphemes.push(*replacement); - } - } - None => result_graphemes.push(*c), - } + for (index, c) in from.chars().enumerate() { + from_map.entry(c).or_insert(index); } - builder.append_value(&result_graphemes.concat()); + to_chars.extend(to.chars()); + + translate_char_by_char(string, &from_map, &to_chars, &mut result_buf); + + builder.append_value(&result_buf); } _ => builder.append_null(), } @@ -265,6 +242,27 @@ where Ok(builder.finish()) } +/// Translate `input` character-by-character using `from_map` and `to_chars`, +/// appending the result to `buf`. +#[inline] +fn translate_char_by_char( + input: &str, + from_map: &HashMap, + to_chars: &[char], + buf: &mut String, +) { + for c in input.chars() { + match from_map.get(&c) { + Some(n) => { + if let Some(&replacement) = to_chars.get(*n) { + buf.push(replacement); + } + } + None => buf.push(c), + } + } +} + /// Sentinel value in the ASCII translate table indicating the character should /// be deleted (the `from` character has no corresponding `to` character). Any /// value > 127 works since valid ASCII is 0–127. @@ -301,11 +299,11 @@ fn build_ascii_translate_table(from: &str, to: &str) -> Option<[u8; 128]> { /// Optimized translate for constant `from` and `to` arguments: uses a pre-built /// translation map instead of rebuilding it for every row. When an ASCII byte /// lookup table is provided, ASCII input rows use the lookup table; non-ASCII -/// inputs fallback to using the map. +/// inputs fall back to the char-based map. fn translate_with_map<'a, V, O>( string_array: V, - from_map: &HashMap<&str, usize>, - to_graphemes: &[&str], + from_map: &HashMap, + to_chars: &[char], ascii_table: Option<&[u8; 128]>, mut builder: O, ) -> Result @@ -313,7 +311,7 @@ where V: ArrayAccessor, O: StringLikeArrayBuilder, { - let mut result_graphemes: Vec<&str> = Vec::new(); + let mut result_buf = String::new(); let mut ascii_buf: Vec = Vec::new(); for string in ArrayIter::new(string_array) { @@ -335,21 +333,9 @@ where std::str::from_utf8_unchecked(&ascii_buf) }); } else { - // Slow path: grapheme-based translation - result_graphemes.clear(); - - for c in s.graphemes(true) { - match from_map.get(c) { - Some(n) => { - if let Some(replacement) = to_graphemes.get(*n) { - result_graphemes.push(*replacement); - } - } - None => result_graphemes.push(c), - } - } - - builder.append_value(&result_graphemes.concat()); + result_buf.clear(); + translate_char_by_char(s, from_map, to_chars, &mut result_buf); + builder.append_value(&result_buf); } } None => builder.append_null(), @@ -445,7 +431,7 @@ mod tests { StringArray ); // Non-ASCII input with ASCII scalar from/to: exercises the - // grapheme fallback within translate_with_map. + // char-based fallback within translate_with_map. test_function!( TranslateFunc::new(), vec![ diff --git a/datafusion/sqllogictest/test_files/string/string_literal.slt b/datafusion/sqllogictest/test_files/string/string_literal.slt index d4fe8ee178719..542e2bd8b58fc 100644 --- a/datafusion/sqllogictest/test_files/string/string_literal.slt +++ b/datafusion/sqllogictest/test_files/string/string_literal.slt @@ -312,6 +312,36 @@ SELECT lpad(NULL, 5, 'xy') ---- NULL +# lpad counts Unicode codepoints, not grapheme clusters. +# chr(769) is U+0301 COMBINING ACUTE ACCENT — 'e' || chr(769) is 2 codepoints +# but renders as a single grapheme cluster. +# Use equality comparisons to avoid encoding ambiguity in expected output. + +# Input with combining character: 'e' + combining accent + 'x' = 3 codepoints. +# Padding to 4 means 1 space prepended. +query BII +SELECT lpad('e' || chr(769) || 'x', 4) = ' ' || 'e' || chr(769) || 'x', + character_length('e' || chr(769) || 'x'), + character_length(lpad('e' || chr(769) || 'x', 4)) +---- +true 3 4 + +# Truncating input with combining character: 'e' + combining accent + 'x' + 'y' +# = 4 codepoints. Truncating to 3 keeps first 3 codepoints: 'e' + combining accent + 'x'. +query BI +SELECT lpad('e' || chr(769) || 'xy', 3) = 'e' || chr(769) || 'x', + character_length(lpad('e' || chr(769) || 'xy', 3)) +---- +true 3 + +# Fill string with combining character: fill is 'e' + combining accent = 2 codepoints. +# Padding 'x' (1 codepoint) to length 5 means 4 fill codepoints = 2 cycles of fill. +query BI +SELECT lpad('x', 5, 'e' || chr(769)) = 'e' || chr(769) || 'e' || chr(769) || 'x', + character_length(lpad('x', 5, 'e' || chr(769))) +---- +true 5 + query T SELECT regexp_replace('foobar', 'bar', 'xx', 'gi') ---- @@ -583,6 +613,35 @@ SELECT rpad(arrow_cast(NULL, 'Utf8View'), 5, 'xy') ---- NULL +# rpad counts Unicode codepoints, not grapheme clusters. +# chr(769) is U+0301 COMBINING ACUTE ACCENT. +# Use equality comparisons to avoid encoding ambiguity in expected output. + +# Input with combining character: 'e' + combining accent + 'x' = 3 codepoints. +# Padding to 4 means 1 space appended. +query BII +SELECT rpad('e' || chr(769) || 'x', 4) = 'e' || chr(769) || 'x' || ' ', + character_length('e' || chr(769) || 'x'), + character_length(rpad('e' || chr(769) || 'x', 4)) +---- +true 3 4 + +# Truncating input with combining character: 'e' + combining accent + 'x' + 'y' +# = 4 codepoints. Truncating to 3 keeps first 3 codepoints: 'e' + combining accent + 'x'. +query BI +SELECT rpad('e' || chr(769) || 'xy', 3) = 'e' || chr(769) || 'x', + character_length(rpad('e' || chr(769) || 'xy', 3)) +---- +true 3 + +# Fill string with combining character: fill is 'e' + combining accent = 2 codepoints. +# Padding 'x' (1 codepoint) to length 5 means 4 fill codepoints = 2 cycles of fill. +query BI +SELECT rpad('x', 5, 'e' || chr(769)) = 'x' || 'e' || chr(769) || 'e' || chr(769), + character_length(rpad('x', 5, 'e' || chr(769))) +---- +true 5 + query I SELECT char_length('') ---- @@ -1829,3 +1888,28 @@ query T SELECT arrow_typeof(translate(arrow_cast('12345', 'Utf8View'), '143', 'ax')) ---- Utf8View + +# translate operates on Unicode codepoints, not grapheme clusters. +# chr(769) is U+0301 COMBINING ACUTE ACCENT. +# Use equality comparisons to avoid encoding ambiguity in expected output. + +# Replacing a combining accent (a single codepoint) with another character. +# 'e' || chr(769) is 2 codepoints; translating chr(769) → 'X' replaces just the accent. +query B +SELECT translate('e' || chr(769), chr(769), 'X') = 'eX' +---- +true + +# Replacing the base character but not the combining accent. +query B +SELECT translate('e' || chr(769) || 'y', 'e', 'a') = 'a' || chr(769) || 'y' +---- +true + +# Deleting a combining accent (from longer than to). +# 'e' || chr(769) || 'x' with chr(769) in from but no corresponding to entry → deleted. +query BI +SELECT translate('e' || chr(769) || 'x', chr(769), '') = 'ex', + character_length(translate('e' || chr(769) || 'x', chr(769), '')) +---- +true 2 From 2919ce7cdc2a3b42da96cc018f38fab743f8d317 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Mon, 6 Apr 2026 10:44:51 -0400 Subject: [PATCH 2/6] . --- datafusion/functions/Cargo.toml | 1 - datafusion/functions/src/unicode/common.rs | 4 ++-- datafusion/functions/src/unicode/lpad.rs | 11 ++++++++--- datafusion/functions/src/unicode/rpad.rs | 11 ++++++++--- .../sqllogictest/test_files/string/string_literal.slt | 3 --- 5 files changed, 18 insertions(+), 12 deletions(-) diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml index 30f5da816b3ab..02b8e842280bf 100644 --- a/datafusion/functions/Cargo.toml +++ b/datafusion/functions/Cargo.toml @@ -87,7 +87,6 @@ num-traits = { workspace = true } rand = { workspace = true } regex = { workspace = true, optional = true } sha2 = { workspace = true, optional = true } - uuid = { workspace = true, features = ["v4"], optional = true } [dev-dependencies] diff --git a/datafusion/functions/src/unicode/common.rs b/datafusion/functions/src/unicode/common.rs index 89e614ab092b5..bf80f550a65e4 100644 --- a/datafusion/functions/src/unicode/common.rs +++ b/datafusion/functions/src/unicode/common.rs @@ -78,8 +78,8 @@ impl LeftRightSlicer for RightSlicer { } } -/// Returns the byte offset of the `n`th codepoint in `string`, -/// or `string.len()` if the string has fewer than `n` codepoints. +/// Returns the byte offset of the `n`th codepoint in `string`, or +/// `string.len()` if the string has fewer than `n` codepoints. #[inline] pub(crate) fn byte_offset_of_char(string: &str, n: usize) -> usize { string diff --git a/datafusion/functions/src/unicode/lpad.rs b/datafusion/functions/src/unicode/lpad.rs index 18cabf2367be8..c7a29c671c1d6 100644 --- a/datafusion/functions/src/unicode/lpad.rs +++ b/datafusion/functions/src/unicode/lpad.rs @@ -276,7 +276,8 @@ fn lpad_scalar_unicode<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>( let char_count = string.chars().count(); if target_len < char_count { - builder.append_value(&string[..byte_offset_of_char(string, target_len)]); + builder + .append_value(&string[..byte_offset_of_char(string, target_len)]); } else if fill_chars.is_empty() { builder.append_value(string); } else { @@ -427,7 +428,9 @@ where fill_chars_buf.extend(fill.chars()); if target_len < char_count { - builder.append_value(&string[..byte_offset_of_char(string, target_len)]); + builder.append_value( + &string[..byte_offset_of_char(string, target_len)], + ); } else if fill_chars_buf.is_empty() { builder.append_value(string); } else { @@ -482,7 +485,9 @@ where let char_count = string.chars().count(); if target_len < char_count { - builder.append_value(&string[..byte_offset_of_char(string, target_len)]); + builder.append_value( + &string[..byte_offset_of_char(string, target_len)], + ); } else { for _ in 0..(target_len - char_count) { builder.write_str(" ")?; diff --git a/datafusion/functions/src/unicode/rpad.rs b/datafusion/functions/src/unicode/rpad.rs index 72faffd3a7c9d..32d1824eb7dd0 100644 --- a/datafusion/functions/src/unicode/rpad.rs +++ b/datafusion/functions/src/unicode/rpad.rs @@ -277,7 +277,8 @@ fn rpad_scalar_unicode<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>( let char_count = string.chars().count(); if target_len < char_count { - builder.append_value(&string[..byte_offset_of_char(string, target_len)]); + builder + .append_value(&string[..byte_offset_of_char(string, target_len)]); } else if fill_chars.is_empty() { builder.append_value(string); } else { @@ -427,7 +428,9 @@ where fill_chars_buf.extend(fill.chars()); if target_len < char_count { - builder.append_value(&string[..byte_offset_of_char(string, target_len)]); + builder.append_value( + &string[..byte_offset_of_char(string, target_len)], + ); } else if fill_chars_buf.is_empty() { builder.append_value(string); } else { @@ -484,7 +487,9 @@ where let char_count = string.chars().count(); if target_len < char_count { - builder.append_value(&string[..byte_offset_of_char(string, target_len)]); + builder.append_value( + &string[..byte_offset_of_char(string, target_len)], + ); } else { builder.write_str(string)?; for _ in 0..(target_len - char_count) { diff --git a/datafusion/sqllogictest/test_files/string/string_literal.slt b/datafusion/sqllogictest/test_files/string/string_literal.slt index 542e2bd8b58fc..e39bdc2ab4dd3 100644 --- a/datafusion/sqllogictest/test_files/string/string_literal.slt +++ b/datafusion/sqllogictest/test_files/string/string_literal.slt @@ -315,7 +315,6 @@ NULL # lpad counts Unicode codepoints, not grapheme clusters. # chr(769) is U+0301 COMBINING ACUTE ACCENT — 'e' || chr(769) is 2 codepoints # but renders as a single grapheme cluster. -# Use equality comparisons to avoid encoding ambiguity in expected output. # Input with combining character: 'e' + combining accent + 'x' = 3 codepoints. # Padding to 4 means 1 space prepended. @@ -615,7 +614,6 @@ NULL # rpad counts Unicode codepoints, not grapheme clusters. # chr(769) is U+0301 COMBINING ACUTE ACCENT. -# Use equality comparisons to avoid encoding ambiguity in expected output. # Input with combining character: 'e' + combining accent + 'x' = 3 codepoints. # Padding to 4 means 1 space appended. @@ -1891,7 +1889,6 @@ Utf8View # translate operates on Unicode codepoints, not grapheme clusters. # chr(769) is U+0301 COMBINING ACUTE ACCENT. -# Use equality comparisons to avoid encoding ambiguity in expected output. # Replacing a combining accent (a single codepoint) with another character. # 'e' || chr(769) is 2 codepoints; translating chr(769) → 'X' replaces just the accent. From c63800a56a1e2e03eadd95f6f0b5888f53e5ca8c Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Tue, 7 Apr 2026 09:04:51 -0400 Subject: [PATCH 3/6] Fixes per code review --- .../test_files/string/string_literal.slt | 2 +- docs/source/library-user-guide/upgrading/54.0.0.md | 12 ++++++++++++ 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/datafusion/sqllogictest/test_files/string/string_literal.slt b/datafusion/sqllogictest/test_files/string/string_literal.slt index e39bdc2ab4dd3..97f2a40c13fea 100644 --- a/datafusion/sqllogictest/test_files/string/string_literal.slt +++ b/datafusion/sqllogictest/test_files/string/string_literal.slt @@ -1904,7 +1904,7 @@ SELECT translate('e' || chr(769) || 'y', 'e', 'a') = 'a' || chr(769) || 'y' true # Deleting a combining accent (from longer than to). -# 'e' || chr(769) || 'x' with chr(769) in from but no corresponding to entry → deleted. +# 'e' || chr(769) || 'x' with chr(769) in `from` but no corresponding `to` entry → deleted. query BI SELECT translate('e' || chr(769) || 'x', chr(769), '') = 'ex', character_length(translate('e' || chr(769) || 'x', chr(769), '')) diff --git a/docs/source/library-user-guide/upgrading/54.0.0.md b/docs/source/library-user-guide/upgrading/54.0.0.md index 6dc08cc344e5f..24cbd113b62f7 100644 --- a/docs/source/library-user-guide/upgrading/54.0.0.md +++ b/docs/source/library-user-guide/upgrading/54.0.0.md @@ -320,4 +320,16 @@ which results in a few changes: `timestamp-*` is UTC timezone-aware, while `local-timestamp-*` is timezone-naive. +### `lpad`, `rpad`, and `translate` now operate on Unicode codepoints instead of grapheme clusters + +Previously, `lpad`, `rpad`, and `translate` used Unicode grapheme cluster +segmentation to measure and manipulate strings. They now use Unicode codepoints, +which is consistent with the SQL standard and most other SQL implementations. It +also matches the behavior of other string-related functions in DataFusion. + +The difference is only observable for strings containing combining characters +(e.g., U+0301 COMBINING ACUTE ACCENT) or other multi-codepoint grapheme +clusters (e.g., ZWJ emoji sequences). For ASCII and most common Unicode text, +behavior is unchanged. + [#17861]: https://github.com/apache/datafusion/pull/17861 From 82d8128910410e1014301dc8d17e2f6ab847ca3f Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Tue, 7 Apr 2026 10:11:18 -0400 Subject: [PATCH 4/6] Clean up repeated string iteration, per review --- datafusion/functions/src/unicode/common.rs | 27 +++++++- datafusion/functions/src/unicode/lpad.rs | 75 ++++++++++---------- datafusion/functions/src/unicode/rpad.rs | 81 +++++++++++----------- 3 files changed, 104 insertions(+), 79 deletions(-) diff --git a/datafusion/functions/src/unicode/common.rs b/datafusion/functions/src/unicode/common.rs index bf80f550a65e4..e3c634133cbf5 100644 --- a/datafusion/functions/src/unicode/common.rs +++ b/datafusion/functions/src/unicode/common.rs @@ -78,8 +78,8 @@ impl LeftRightSlicer for RightSlicer { } } -/// Returns the byte offset of the `n`th codepoint in `string`, or -/// `string.len()` if the string has fewer than `n` codepoints. +/// Returns the byte offset of the `n`th codepoint in `string`, +/// or `string.len()` if the string has fewer than `n` codepoints. #[inline] pub(crate) fn byte_offset_of_char(string: &str, n: usize) -> usize { string @@ -88,6 +88,29 @@ pub(crate) fn byte_offset_of_char(string: &str, n: usize) -> usize { .map_or(string.len(), |(i, _)| i) } +/// Measures the character length of `string` in a single pass, returning +/// early with the truncation byte offset if the string exceeds `n` chars. +#[inline] +pub(crate) fn measure_char_count(string: &str, n: usize) -> StringCharLen { + let mut count = 0; + for (byte_idx, _) in string.char_indices() { + if count == n { + return StringCharLen::TruncateAt(byte_idx); + } + count += 1; + } + StringCharLen::CharCount(count) +} + +/// Result of [`measure_char_count`]. +pub(crate) enum StringCharLen { + /// The string has more than `n` chars; contains the byte offset at the + /// `n`-th character boundary (suitable for slicing `&string[..offset]`). + TruncateAt(usize), + /// The string has `n` or fewer chars; contains the exact character count. + CharCount(usize), +} + /// Calculate the byte length of the substring of `n` chars from string `string` #[inline] fn left_right_byte_length(string: &str, n: i64) -> usize { diff --git a/datafusion/functions/src/unicode/lpad.rs b/datafusion/functions/src/unicode/lpad.rs index c7a29c671c1d6..3e3e80f48e28d 100644 --- a/datafusion/functions/src/unicode/lpad.rs +++ b/datafusion/functions/src/unicode/lpad.rs @@ -177,7 +177,9 @@ impl ScalarUDFImpl for LPadFunc { } } -use super::common::{byte_offset_of_char, try_as_scalar_i64, try_as_scalar_str}; +use super::common::{ + StringCharLen, measure_char_count, try_as_scalar_i64, try_as_scalar_str, +}; /// Optimized lpad for constant target_len and fill arguments. fn lpad_scalar_args<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>( @@ -273,18 +275,18 @@ fn lpad_scalar_unicode<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>( for maybe_string in string_array.iter() { match maybe_string { Some(string) => { - let char_count = string.chars().count(); - - if target_len < char_count { - builder - .append_value(&string[..byte_offset_of_char(string, target_len)]); - } else if fill_chars.is_empty() { - builder.append_value(string); - } else { - let pad_chars = target_len - char_count; - let pad_bytes = char_byte_offsets[pad_chars]; - builder.write_str(&padding_buf[..pad_bytes])?; - builder.append_value(string); + match measure_char_count(string, target_len) { + StringCharLen::TruncateAt(offset) => { + builder.append_value(&string[..offset]); + } + StringCharLen::CharCount(char_count) => { + if !fill_chars.is_empty() { + let pad_chars = target_len - char_count; + let pad_bytes = char_byte_offsets[pad_chars]; + builder.write_str(&padding_buf[..pad_bytes])?; + } + builder.append_value(string); + } } } None => builder.append_null(), @@ -422,24 +424,24 @@ where builder.append_value(string); } } else { - let char_count = string.chars().count(); - fill_chars_buf.clear(); fill_chars_buf.extend(fill.chars()); - if target_len < char_count { - builder.append_value( - &string[..byte_offset_of_char(string, target_len)], - ); - } else if fill_chars_buf.is_empty() { - builder.append_value(string); - } else { - for l in 0..target_len - char_count { - let c = - *fill_chars_buf.get(l % fill_chars_buf.len()).unwrap(); - builder.write_char(c)?; + match measure_char_count(string, target_len) { + StringCharLen::TruncateAt(offset) => { + builder.append_value(&string[..offset]); + } + StringCharLen::CharCount(char_count) => { + if !fill_chars_buf.is_empty() { + for l in 0..target_len - char_count { + let c = *fill_chars_buf + .get(l % fill_chars_buf.len()) + .unwrap(); + builder.write_char(c)?; + } + } + builder.append_value(string); } - builder.append_value(string); } } } else { @@ -482,17 +484,16 @@ where builder.append_value(string); } } else { - let char_count = string.chars().count(); - - if target_len < char_count { - builder.append_value( - &string[..byte_offset_of_char(string, target_len)], - ); - } else { - for _ in 0..(target_len - char_count) { - builder.write_str(" ")?; + match measure_char_count(string, target_len) { + StringCharLen::TruncateAt(offset) => { + builder.append_value(&string[..offset]); + } + StringCharLen::CharCount(char_count) => { + for _ in 0..(target_len - char_count) { + builder.write_str(" ")?; + } + builder.append_value(string); } - builder.append_value(string); } } } else { diff --git a/datafusion/functions/src/unicode/rpad.rs b/datafusion/functions/src/unicode/rpad.rs index 32d1824eb7dd0..a027085078a6a 100644 --- a/datafusion/functions/src/unicode/rpad.rs +++ b/datafusion/functions/src/unicode/rpad.rs @@ -177,7 +177,9 @@ impl ScalarUDFImpl for RPadFunc { } } -use super::common::{byte_offset_of_char, try_as_scalar_i64, try_as_scalar_str}; +use super::common::{ + StringCharLen, measure_char_count, try_as_scalar_i64, try_as_scalar_str, +}; /// Optimized rpad for constant target_len and fill arguments. fn rpad_scalar_args<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>( @@ -274,19 +276,19 @@ fn rpad_scalar_unicode<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>( for maybe_string in string_array.iter() { match maybe_string { Some(string) => { - let char_count = string.chars().count(); - - if target_len < char_count { - builder - .append_value(&string[..byte_offset_of_char(string, target_len)]); - } else if fill_chars.is_empty() { - builder.append_value(string); - } else { - let pad_chars = target_len - char_count; - let pad_bytes = char_byte_offsets[pad_chars]; - builder.write_str(string)?; - builder.write_str(&padding_buf[..pad_bytes])?; - builder.append_value(""); + match measure_char_count(string, target_len) { + StringCharLen::TruncateAt(offset) => { + builder.append_value(&string[..offset]); + } + StringCharLen::CharCount(char_count) => { + builder.write_str(string)?; + if !fill_chars.is_empty() { + let pad_chars = target_len - char_count; + let pad_bytes = char_byte_offsets[pad_chars]; + builder.write_str(&padding_buf[..pad_bytes])?; + } + builder.append_value(""); + } } } None => builder.append_null(), @@ -422,25 +424,25 @@ where builder.append_value(""); } } else { - let char_count = string.chars().count(); - fill_chars_buf.clear(); fill_chars_buf.extend(fill.chars()); - if target_len < char_count { - builder.append_value( - &string[..byte_offset_of_char(string, target_len)], - ); - } else if fill_chars_buf.is_empty() { - builder.append_value(string); - } else { - builder.write_str(string)?; - for l in 0..target_len - char_count { - let c = - *fill_chars_buf.get(l % fill_chars_buf.len()).unwrap(); - builder.write_char(c)?; + match measure_char_count(string, target_len) { + StringCharLen::TruncateAt(offset) => { + builder.append_value(&string[..offset]); + } + StringCharLen::CharCount(char_count) => { + builder.write_str(string)?; + if !fill_chars_buf.is_empty() { + for l in 0..target_len - char_count { + let c = *fill_chars_buf + .get(l % fill_chars_buf.len()) + .unwrap(); + builder.write_char(c)?; + } + } + builder.append_value(""); } - builder.append_value(""); } } } else { @@ -484,18 +486,17 @@ where builder.append_value(""); } } else { - let char_count = string.chars().count(); - - if target_len < char_count { - builder.append_value( - &string[..byte_offset_of_char(string, target_len)], - ); - } else { - builder.write_str(string)?; - for _ in 0..(target_len - char_count) { - builder.write_str(" ")?; + match measure_char_count(string, target_len) { + StringCharLen::TruncateAt(offset) => { + builder.append_value(&string[..offset]); + } + StringCharLen::CharCount(char_count) => { + builder.write_str(string)?; + for _ in 0..(target_len - char_count) { + builder.write_str(" ")?; + } + builder.append_value(""); } - builder.append_value(""); } } } else { From 7232c3411b435d26e678867d63ad958ac6b96475 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Tue, 7 Apr 2026 10:57:12 -0400 Subject: [PATCH 5/6] Improve function name --- datafusion/functions/src/unicode/common.rs | 18 +++++++++--------- datafusion/functions/src/unicode/lpad.rs | 14 +++++++------- datafusion/functions/src/unicode/rpad.rs | 14 +++++++------- 3 files changed, 23 insertions(+), 23 deletions(-) diff --git a/datafusion/functions/src/unicode/common.rs b/datafusion/functions/src/unicode/common.rs index e3c634133cbf5..0158325e98a19 100644 --- a/datafusion/functions/src/unicode/common.rs +++ b/datafusion/functions/src/unicode/common.rs @@ -88,26 +88,26 @@ pub(crate) fn byte_offset_of_char(string: &str, n: usize) -> usize { .map_or(string.len(), |(i, _)| i) } -/// Measures the character length of `string` in a single pass, returning -/// early with the truncation byte offset if the string exceeds `n` chars. +/// If `string` has more than `n` codepoints, returns the byte offset of +/// the `n`-th codepoint boundary. Otherwise returns the total codepoint count. #[inline] -pub(crate) fn measure_char_count(string: &str, n: usize) -> StringCharLen { +pub(crate) fn char_count_or_boundary(string: &str, n: usize) -> StringCharLen { let mut count = 0; for (byte_idx, _) in string.char_indices() { if count == n { - return StringCharLen::TruncateAt(byte_idx); + return StringCharLen::ByteOffset(byte_idx); } count += 1; } StringCharLen::CharCount(count) } -/// Result of [`measure_char_count`]. +/// Result of [`char_count_or_boundary`]. pub(crate) enum StringCharLen { - /// The string has more than `n` chars; contains the byte offset at the - /// `n`-th character boundary (suitable for slicing `&string[..offset]`). - TruncateAt(usize), - /// The string has `n` or fewer chars; contains the exact character count. + /// The string has more than `n` codepoints; contains the byte offset + /// at the `n`-th codepoint boundary. + ByteOffset(usize), + /// The string has `n` or fewer codepoints; contains the exact count. CharCount(usize), } diff --git a/datafusion/functions/src/unicode/lpad.rs b/datafusion/functions/src/unicode/lpad.rs index 3e3e80f48e28d..b61c3d3900fcc 100644 --- a/datafusion/functions/src/unicode/lpad.rs +++ b/datafusion/functions/src/unicode/lpad.rs @@ -178,7 +178,7 @@ impl ScalarUDFImpl for LPadFunc { } use super::common::{ - StringCharLen, measure_char_count, try_as_scalar_i64, try_as_scalar_str, + StringCharLen, char_count_or_boundary, try_as_scalar_i64, try_as_scalar_str, }; /// Optimized lpad for constant target_len and fill arguments. @@ -275,8 +275,8 @@ fn lpad_scalar_unicode<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>( for maybe_string in string_array.iter() { match maybe_string { Some(string) => { - match measure_char_count(string, target_len) { - StringCharLen::TruncateAt(offset) => { + match char_count_or_boundary(string, target_len) { + StringCharLen::ByteOffset(offset) => { builder.append_value(&string[..offset]); } StringCharLen::CharCount(char_count) => { @@ -427,8 +427,8 @@ where fill_chars_buf.clear(); fill_chars_buf.extend(fill.chars()); - match measure_char_count(string, target_len) { - StringCharLen::TruncateAt(offset) => { + match char_count_or_boundary(string, target_len) { + StringCharLen::ByteOffset(offset) => { builder.append_value(&string[..offset]); } StringCharLen::CharCount(char_count) => { @@ -484,8 +484,8 @@ where builder.append_value(string); } } else { - match measure_char_count(string, target_len) { - StringCharLen::TruncateAt(offset) => { + match char_count_or_boundary(string, target_len) { + StringCharLen::ByteOffset(offset) => { builder.append_value(&string[..offset]); } StringCharLen::CharCount(char_count) => { diff --git a/datafusion/functions/src/unicode/rpad.rs b/datafusion/functions/src/unicode/rpad.rs index a027085078a6a..6b0748aead413 100644 --- a/datafusion/functions/src/unicode/rpad.rs +++ b/datafusion/functions/src/unicode/rpad.rs @@ -178,7 +178,7 @@ impl ScalarUDFImpl for RPadFunc { } use super::common::{ - StringCharLen, measure_char_count, try_as_scalar_i64, try_as_scalar_str, + StringCharLen, char_count_or_boundary, try_as_scalar_i64, try_as_scalar_str, }; /// Optimized rpad for constant target_len and fill arguments. @@ -276,8 +276,8 @@ fn rpad_scalar_unicode<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>( for maybe_string in string_array.iter() { match maybe_string { Some(string) => { - match measure_char_count(string, target_len) { - StringCharLen::TruncateAt(offset) => { + match char_count_or_boundary(string, target_len) { + StringCharLen::ByteOffset(offset) => { builder.append_value(&string[..offset]); } StringCharLen::CharCount(char_count) => { @@ -427,8 +427,8 @@ where fill_chars_buf.clear(); fill_chars_buf.extend(fill.chars()); - match measure_char_count(string, target_len) { - StringCharLen::TruncateAt(offset) => { + match char_count_or_boundary(string, target_len) { + StringCharLen::ByteOffset(offset) => { builder.append_value(&string[..offset]); } StringCharLen::CharCount(char_count) => { @@ -486,8 +486,8 @@ where builder.append_value(""); } } else { - match measure_char_count(string, target_len) { - StringCharLen::TruncateAt(offset) => { + match char_count_or_boundary(string, target_len) { + StringCharLen::ByteOffset(offset) => { builder.append_value(&string[..offset]); } StringCharLen::CharCount(char_count) => { From 8a17d525165bb5c5089502b73578591c8f19bef9 Mon Sep 17 00:00:00 2001 From: Neil Conway Date: Tue, 7 Apr 2026 11:12:48 -0400 Subject: [PATCH 6/6] cargo fmt --- datafusion/functions/src/unicode/lpad.rs | 24 ++++++++++------------ datafusion/functions/src/unicode/rpad.rs | 26 +++++++++++------------- 2 files changed, 23 insertions(+), 27 deletions(-) diff --git a/datafusion/functions/src/unicode/lpad.rs b/datafusion/functions/src/unicode/lpad.rs index b61c3d3900fcc..d27bc8633e730 100644 --- a/datafusion/functions/src/unicode/lpad.rs +++ b/datafusion/functions/src/unicode/lpad.rs @@ -274,21 +274,19 @@ fn lpad_scalar_unicode<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>( for maybe_string in string_array.iter() { match maybe_string { - Some(string) => { - match char_count_or_boundary(string, target_len) { - StringCharLen::ByteOffset(offset) => { - builder.append_value(&string[..offset]); - } - StringCharLen::CharCount(char_count) => { - if !fill_chars.is_empty() { - let pad_chars = target_len - char_count; - let pad_bytes = char_byte_offsets[pad_chars]; - builder.write_str(&padding_buf[..pad_bytes])?; - } - builder.append_value(string); + Some(string) => match char_count_or_boundary(string, target_len) { + StringCharLen::ByteOffset(offset) => { + builder.append_value(&string[..offset]); + } + StringCharLen::CharCount(char_count) => { + if !fill_chars.is_empty() { + let pad_chars = target_len - char_count; + let pad_bytes = char_byte_offsets[pad_chars]; + builder.write_str(&padding_buf[..pad_bytes])?; } + builder.append_value(string); } - } + }, None => builder.append_null(), } } diff --git a/datafusion/functions/src/unicode/rpad.rs b/datafusion/functions/src/unicode/rpad.rs index 6b0748aead413..b3e14f93526ab 100644 --- a/datafusion/functions/src/unicode/rpad.rs +++ b/datafusion/functions/src/unicode/rpad.rs @@ -275,22 +275,20 @@ fn rpad_scalar_unicode<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>( for maybe_string in string_array.iter() { match maybe_string { - Some(string) => { - match char_count_or_boundary(string, target_len) { - StringCharLen::ByteOffset(offset) => { - builder.append_value(&string[..offset]); - } - StringCharLen::CharCount(char_count) => { - builder.write_str(string)?; - if !fill_chars.is_empty() { - let pad_chars = target_len - char_count; - let pad_bytes = char_byte_offsets[pad_chars]; - builder.write_str(&padding_buf[..pad_bytes])?; - } - builder.append_value(""); + Some(string) => match char_count_or_boundary(string, target_len) { + StringCharLen::ByteOffset(offset) => { + builder.append_value(&string[..offset]); + } + StringCharLen::CharCount(char_count) => { + builder.write_str(string)?; + if !fill_chars.is_empty() { + let pad_chars = target_len - char_count; + let pad_bytes = char_byte_offsets[pad_chars]; + builder.write_str(&padding_buf[..pad_bytes])?; } + builder.append_value(""); } - } + }, None => builder.append_null(), } }