From de35dcf04db7396ace1e3c11ee72812f4de425a4 Mon Sep 17 00:00:00 2001
From: Neil Conway <neil.conway@gmail.com>
Date: Mon, 6 Apr 2026 08:26:44 -0400
Subject: [PATCH 1/6] .

---
 Cargo.lock                                    |   1 -
 datafusion/functions/Cargo.toml               |   4 +-
 datafusion/functions/src/unicode/common.rs    |  18 ++-
 datafusion/functions/src/unicode/lpad.rs      |  46 +++-----
 datafusion/functions/src/unicode/rpad.rs      |  42 +++----
 datafusion/functions/src/unicode/translate.rs | 108 ++++++++----------
 .../test_files/string/string_literal.slt      |  84 ++++++++++++++
 7 files changed, 175 insertions(+), 128 deletions(-)
diff --git a/Cargo.lock b/Cargo.lock
index 895b3059f50c1..e9b230dee88d7 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -2259,7 +2259,6 @@ dependencies = [
  "regex",
  "sha2",
  "tokio",
- "unicode-segmentation",
  "uuid",
 ]
 
diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml
index 7503c337517ef..30f5da816b3ab 100644
--- a/datafusion/functions/Cargo.toml
+++ b/datafusion/functions/Cargo.toml
@@ -59,7 +59,7 @@ regex_expressions = ["regex"]
 # enable string functions
 string_expressions = ["uuid"]
 # enable unicode functions
-unicode_expressions = ["unicode-segmentation"]
+unicode_expressions = []
 
 [lib]
 name = "datafusion_functions"
@@ -87,7 +87,7 @@ num-traits = { workspace = true }
 rand = { workspace = true }
 regex = { workspace = true, optional = true }
 sha2 = { workspace = true, optional = true }
-unicode-segmentation = { version = "^1.13.2", optional = true }
+
 uuid = { workspace = true, features = ["v4"], optional = true }
 
 [dev-dependencies]
diff --git a/datafusion/functions/src/unicode/common.rs b/datafusion/functions/src/unicode/common.rs
index 002776e6c6538..89e614ab092b5 100644
--- a/datafusion/functions/src/unicode/common.rs
+++ b/datafusion/functions/src/unicode/common.rs
@@ -78,6 +78,16 @@ impl LeftRightSlicer for RightSlicer {
     }
 }
 
+/// Returns the byte offset of the `n`th codepoint in `string`,
+/// or `string.len()` if the string has fewer than `n` codepoints.
+#[inline]
+pub(crate) fn byte_offset_of_char(string: &str, n: usize) -> usize {
+    string
+        .char_indices()
+        .nth(n)
+        .map_or(string.len(), |(i, _)| i)
+}
+
 /// Calculate the byte length of the substring of `n` chars from string `string`
 #[inline]
 fn left_right_byte_length(string: &str, n: i64) -> usize {
@@ -88,11 +98,9 @@ fn left_right_byte_length(string: &str, n: i64) -> usize {
             .map(|(index, _)| index)
             .unwrap_or(0),
         Ordering::Equal => 0,
-        Ordering::Greater => string
-            .char_indices()
-            .nth(n.unsigned_abs().min(usize::MAX as u64) as usize)
-            .map(|(index, _)| index)
-            .unwrap_or(string.len()),
+        Ordering::Greater => {
+            byte_offset_of_char(string, n.unsigned_abs().min(usize::MAX as u64) as usize)
+        }
     }
 }
 
diff --git a/datafusion/functions/src/unicode/lpad.rs b/datafusion/functions/src/unicode/lpad.rs
index d7487c385e84f..18cabf2367be8 100644
--- a/datafusion/functions/src/unicode/lpad.rs
+++ b/datafusion/functions/src/unicode/lpad.rs
@@ -24,7 +24,6 @@ use arrow::array::{
     OffsetSizeTrait, StringArrayType, StringViewArray,
 };
 use arrow::datatypes::DataType;
-use unicode_segmentation::UnicodeSegmentation;
 
 use crate::utils::{make_scalar_function, utf8_to_str_type};
 use datafusion_common::cast::as_int64_array;
@@ -178,7 +177,7 @@ impl ScalarUDFImpl for LPadFunc {
     }
 }
 
-use super::common::{try_as_scalar_i64, try_as_scalar_str};
+use super::common::{byte_offset_of_char, try_as_scalar_i64, try_as_scalar_str};
 
 /// Optimized lpad for constant target_len and fill arguments.
 fn lpad_scalar_args<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
@@ -270,22 +269,18 @@ fn lpad_scalar_unicode<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
     let data_capacity = string_array.len().saturating_mul(target_len * 4);
     let mut builder =
         GenericStringBuilder::<T>::with_capacity(string_array.len(), data_capacity);
-    let mut graphemes_buf = Vec::new();
 
     for maybe_string in string_array.iter() {
         match maybe_string {
             Some(string) => {
-                graphemes_buf.clear();
-                graphemes_buf.extend(string.graphemes(true));
+                let char_count = string.chars().count();
 
-                if target_len < graphemes_buf.len() {
-                    let end: usize =
-                        graphemes_buf[..target_len].iter().map(|g| g.len()).sum();
-                    builder.append_value(&string[..end]);
+                if target_len < char_count {
+                    builder.append_value(&string[..byte_offset_of_char(string, target_len)]);
                 } else if fill_chars.is_empty() {
                     builder.append_value(string);
                 } else {
-                    let pad_chars = target_len - graphemes_buf.len();
+                    let pad_chars = target_len - char_count;
                     let pad_bytes = char_byte_offsets[pad_chars];
                     builder.write_str(&padding_buf[..pad_bytes])?;
                     builder.append_value(string);
@@ -378,7 +373,6 @@ where
 {
     let array = if let Some(fill_array) = fill_array {
         let mut builder: GenericStringBuilder<T> = GenericStringBuilder::new();
-        let mut graphemes_buf = Vec::new();
         let mut fill_chars_buf = Vec::new();
 
         for ((string, target_len), fill) in string_array
@@ -407,8 +401,7 @@ where
                 }
 
                 if string.is_ascii() && fill.is_ascii() {
-                    // ASCII fast path: byte length == character length,
-                    // so we skip expensive grapheme segmentation.
+                    // ASCII fast path: byte length == character length.
                     let str_len = string.len();
                     if target_len < str_len {
                         builder.append_value(&string[..target_len]);
@@ -428,21 +421,17 @@ where
                         builder.append_value(string);
                     }
                 } else {
-                    // Reuse buffers by clearing and refilling
-                    graphemes_buf.clear();
-                    graphemes_buf.extend(string.graphemes(true));
+                    let char_count = string.chars().count();
 
                     fill_chars_buf.clear();
                     fill_chars_buf.extend(fill.chars());
 
-                    if target_len < graphemes_buf.len() {
-                        let end: usize =
-                            graphemes_buf[..target_len].iter().map(|g| g.len()).sum();
-                        builder.append_value(&string[..end]);
+                    if target_len < char_count {
+                        builder.append_value(&string[..byte_offset_of_char(string, target_len)]);
                     } else if fill_chars_buf.is_empty() {
                         builder.append_value(string);
                     } else {
-                        for l in 0..target_len - graphemes_buf.len() {
+                        for l in 0..target_len - char_count {
                             let c =
                                 *fill_chars_buf.get(l % fill_chars_buf.len()).unwrap();
                             builder.write_char(c)?;
@@ -458,7 +447,6 @@ where
         builder.finish()
     } else {
         let mut builder: GenericStringBuilder<T> = GenericStringBuilder::new();
-        let mut graphemes_buf = Vec::new();
 
         for (string, target_len) in string_array.iter().zip(length_array.iter()) {
             if let (Some(string), Some(target_len)) = (string, target_len) {
@@ -491,16 +479,12 @@ where
                         builder.append_value(string);
                     }
                 } else {
-                    // Reuse buffer by clearing and refilling
-                    graphemes_buf.clear();
-                    graphemes_buf.extend(string.graphemes(true));
-
-                    if target_len < graphemes_buf.len() {
-                        let end: usize =
-                            graphemes_buf[..target_len].iter().map(|g| g.len()).sum();
-                        builder.append_value(&string[..end]);
+                    let char_count = string.chars().count();
+
+                    if target_len < char_count {
+                        builder.append_value(&string[..byte_offset_of_char(string, target_len)]);
                     } else {
-                        for _ in 0..(target_len - graphemes_buf.len()) {
+                        for _ in 0..(target_len - char_count) {
                             builder.write_str(" ")?;
                         }
                         builder.append_value(string);
diff --git a/datafusion/functions/src/unicode/rpad.rs b/datafusion/functions/src/unicode/rpad.rs
index 44ce4640422d6..72faffd3a7c9d 100644
--- a/datafusion/functions/src/unicode/rpad.rs
+++ b/datafusion/functions/src/unicode/rpad.rs
@@ -24,7 +24,6 @@ use arrow::array::{
     OffsetSizeTrait, StringArrayType, StringViewArray,
 };
 use arrow::datatypes::DataType;
-use unicode_segmentation::UnicodeSegmentation;
 
 use crate::utils::{make_scalar_function, utf8_to_str_type};
 use datafusion_common::cast::as_int64_array;
@@ -178,7 +177,7 @@ impl ScalarUDFImpl for RPadFunc {
     }
 }
 
-use super::common::{try_as_scalar_i64, try_as_scalar_str};
+use super::common::{byte_offset_of_char, try_as_scalar_i64, try_as_scalar_str};
 
 /// Optimized rpad for constant target_len and fill arguments.
 fn rpad_scalar_args<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
@@ -271,22 +270,18 @@ fn rpad_scalar_unicode<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
     let data_capacity = string_array.len().saturating_mul(target_len * 4);
     let mut builder =
         GenericStringBuilder::<T>::with_capacity(string_array.len(), data_capacity);
-    let mut graphemes_buf = Vec::new();
 
     for maybe_string in string_array.iter() {
         match maybe_string {
             Some(string) => {
-                graphemes_buf.clear();
-                graphemes_buf.extend(string.graphemes(true));
+                let char_count = string.chars().count();
 
-                if target_len < graphemes_buf.len() {
-                    let end: usize =
-                        graphemes_buf[..target_len].iter().map(|g| g.len()).sum();
-                    builder.append_value(&string[..end]);
+                if target_len < char_count {
+                    builder.append_value(&string[..byte_offset_of_char(string, target_len)]);
                 } else if fill_chars.is_empty() {
                     builder.append_value(string);
                 } else {
-                    let pad_chars = target_len - graphemes_buf.len();
+                    let pad_chars = target_len - char_count;
                     let pad_bytes = char_byte_offsets[pad_chars];
                     builder.write_str(string)?;
                     builder.write_str(&padding_buf[..pad_bytes])?;
@@ -377,7 +372,6 @@ where
 {
     let array = if let Some(fill_array) = fill_array {
         let mut builder: GenericStringBuilder<T> = GenericStringBuilder::new();
-        let mut graphemes_buf = Vec::new();
         let mut fill_chars_buf = Vec::new();
 
         for ((string, target_len), fill) in string_array
@@ -406,8 +400,7 @@ where
                 }
 
                 if string.is_ascii() && fill.is_ascii() {
-                    // ASCII fast path: byte length == character length,
-                    // so we skip expensive grapheme segmentation.
+                    // ASCII fast path: byte length == character length.
                     let str_len = string.len();
                     if target_len < str_len {
                         builder.append_value(&string[..target_len]);
@@ -428,21 +421,18 @@ where
                         builder.append_value("");
                     }
                 } else {
-                    graphemes_buf.clear();
-                    graphemes_buf.extend(string.graphemes(true));
+                    let char_count = string.chars().count();
 
                     fill_chars_buf.clear();
                     fill_chars_buf.extend(fill.chars());
 
-                    if target_len < graphemes_buf.len() {
-                        let end: usize =
-                            graphemes_buf[..target_len].iter().map(|g| g.len()).sum();
-                        builder.append_value(&string[..end]);
+                    if target_len < char_count {
+                        builder.append_value(&string[..byte_offset_of_char(string, target_len)]);
                     } else if fill_chars_buf.is_empty() {
                         builder.append_value(string);
                     } else {
                         builder.write_str(string)?;
-                        for l in 0..target_len - graphemes_buf.len() {
+                        for l in 0..target_len - char_count {
                             let c =
                                 *fill_chars_buf.get(l % fill_chars_buf.len()).unwrap();
                             builder.write_char(c)?;
@@ -458,7 +448,6 @@ where
         builder.finish()
     } else {
         let mut builder: GenericStringBuilder<T> = GenericStringBuilder::new();
-        let mut graphemes_buf = Vec::new();
 
         for (string, target_len) in string_array.iter().zip(length_array.iter()) {
             if let (Some(string), Some(target_len)) = (string, target_len) {
@@ -492,16 +481,13 @@ where
                         builder.append_value("");
                     }
                 } else {
-                    graphemes_buf.clear();
-                    graphemes_buf.extend(string.graphemes(true));
+                    let char_count = string.chars().count();
 
-                    if target_len < graphemes_buf.len() {
-                        let end: usize =
-                            graphemes_buf[..target_len].iter().map(|g| g.len()).sum();
-                        builder.append_value(&string[..end]);
+                    if target_len < char_count {
+                        builder.append_value(&string[..byte_offset_of_char(string, target_len)]);
                     } else {
                         builder.write_str(string)?;
-                        for _ in 0..(target_len - graphemes_buf.len()) {
+                        for _ in 0..(target_len - char_count) {
                             builder.write_str(" ")?;
                         }
                         builder.append_value("");
diff --git a/datafusion/functions/src/unicode/translate.rs b/datafusion/functions/src/unicode/translate.rs
index 5f95c095a644e..29dc660b86f62 100644
--- a/datafusion/functions/src/unicode/translate.rs
+++ b/datafusion/functions/src/unicode/translate.rs
@@ -21,7 +21,6 @@ use arrow::array::{
 };
 use arrow::datatypes::DataType;
 use datafusion_common::HashMap;
-use unicode_segmentation::UnicodeSegmentation;
 
 use crate::utils::make_scalar_function;
 use datafusion_common::{Result, exec_err};
@@ -97,11 +96,10 @@ impl ScalarUDFImpl for TranslateFunc {
             try_as_scalar_str(&args.args[1]),
             try_as_scalar_str(&args.args[2]),
         ) {
-            let to_graphemes: Vec<&str> = to_str.graphemes(true).collect();
+            let to_chars: Vec<char> = to_str.chars().collect();
 
-            let mut from_map: HashMap<&str, usize> = HashMap::new();
-            for (index, c) in from_str.graphemes(true).enumerate() {
-                // Ignore characters that already exist in from_map
+            let mut from_map: HashMap<char, usize> = HashMap::new();
+            for (index, c) in from_str.chars().enumerate() {
                 from_map.entry(c).or_insert(index);
             }
 
@@ -117,7 +115,7 @@ impl ScalarUDFImpl for TranslateFunc {
                     translate_with_map(
                         arr,
                         &from_map,
-                        &to_graphemes,
+                        &to_chars,
                         ascii_table.as_ref(),
                         builder,
                     )
@@ -129,7 +127,7 @@ impl ScalarUDFImpl for TranslateFunc {
                     translate_with_map(
                         arr,
                         &from_map,
-                        &to_graphemes,
+                        &to_chars,
                         ascii_table.as_ref(),
                         builder,
                     )
@@ -141,7 +139,7 @@ impl ScalarUDFImpl for TranslateFunc {
                     translate_with_map(
                         arr,
                         &from_map,
-                        &to_graphemes,
+                        &to_chars,
                         ascii_table.as_ref(),
                         builder,
                     )
@@ -215,48 +213,27 @@ where
     let from_array_iter = ArrayIter::new(from_array);
     let to_array_iter = ArrayIter::new(to_array);
 
-    // Reusable buffers to avoid allocating for each row
-    let mut from_map: HashMap<&str, usize> = HashMap::new();
-    let mut from_graphemes: Vec<&str> = Vec::new();
-    let mut to_graphemes: Vec<&str> = Vec::new();
-    let mut string_graphemes: Vec<&str> = Vec::new();
-    let mut result_graphemes: Vec<&str> = Vec::new();
+    let mut from_map: HashMap<char, usize> = HashMap::new();
+    let mut to_chars: Vec<char> = Vec::new();
+    let mut result_buf = String::new();
 
     for ((string, from), to) in string_array_iter.zip(from_array_iter).zip(to_array_iter)
     {
         match (string, from, to) {
             (Some(string), Some(from), Some(to)) => {
-                // Clear and reuse buffers
                 from_map.clear();
-                from_graphemes.clear();
-                to_graphemes.clear();
-                string_graphemes.clear();
-                result_graphemes.clear();
-
-                // Build from_map using reusable buffer
-                from_graphemes.extend(from.graphemes(true));
-                for (index, c) in from_graphemes.iter().enumerate() {
-                    // Ignore characters that already exist in from_map
-                    from_map.entry(*c).or_insert(index);
-                }
+                to_chars.clear();
+                result_buf.clear();
 
-                // Build to_graphemes
-                to_graphemes.extend(to.graphemes(true));
-
-                // Process string and build result
-                string_graphemes.extend(string.graphemes(true));
-                for c in &string_graphemes {
-                    match from_map.get(*c) {
-                        Some(n) => {
-                            if let Some(replacement) = to_graphemes.get(*n) {
-                                result_graphemes.push(*replacement);
-                            }
-                        }
-                        None => result_graphemes.push(*c),
-                    }
+                for (index, c) in from.chars().enumerate() {
+                    from_map.entry(c).or_insert(index);
                 }
 
-                builder.append_value(&result_graphemes.concat());
+                to_chars.extend(to.chars());
+
+                translate_char_by_char(string, &from_map, &to_chars, &mut result_buf);
+
+                builder.append_value(&result_buf);
             }
             _ => builder.append_null(),
         }
@@ -265,6 +242,27 @@ where
     Ok(builder.finish())
 }
 
+/// Translate `input` character-by-character using `from_map` and `to_chars`,
+/// appending the result to `buf`.
+#[inline]
+fn translate_char_by_char(
+    input: &str,
+    from_map: &HashMap<char, usize>,
+    to_chars: &[char],
+    buf: &mut String,
+) {
+    for c in input.chars() {
+        match from_map.get(&c) {
+            Some(n) => {
+                if let Some(&replacement) = to_chars.get(*n) {
+                    buf.push(replacement);
+                }
+            }
+            None => buf.push(c),
+        }
+    }
+}
+
 /// Sentinel value in the ASCII translate table indicating the character should
 /// be deleted (the `from` character has no corresponding `to` character).  Any
 /// value > 127 works since valid ASCII is 0–127.
@@ -301,11 +299,11 @@ fn build_ascii_translate_table(from: &str, to: &str) -> Option<[u8; 128]> {
 /// Optimized translate for constant `from` and `to` arguments: uses a pre-built
 /// translation map instead of rebuilding it for every row.  When an ASCII byte
 /// lookup table is provided, ASCII input rows use the lookup table; non-ASCII
-/// inputs fallback to using the map.
+/// inputs fall back to the char-based map.
 fn translate_with_map<'a, V, O>(
     string_array: V,
-    from_map: &HashMap<&str, usize>,
-    to_graphemes: &[&str],
+    from_map: &HashMap<char, usize>,
+    to_chars: &[char],
     ascii_table: Option<&[u8; 128]>,
     mut builder: O,
 ) -> Result<ArrayRef>
@@ -313,7 +311,7 @@ where
     V: ArrayAccessor<Item = &'a str>,
     O: StringLikeArrayBuilder,
 {
-    let mut result_graphemes: Vec<&str> = Vec::new();
+    let mut result_buf = String::new();
     let mut ascii_buf: Vec<u8> = Vec::new();
 
     for string in ArrayIter::new(string_array) {
@@ -335,21 +333,9 @@ where
                         std::str::from_utf8_unchecked(&ascii_buf)
                     });
                 } else {
-                    // Slow path: grapheme-based translation
-                    result_graphemes.clear();
-
-                    for c in s.graphemes(true) {
-                        match from_map.get(c) {
-                            Some(n) => {
-                                if let Some(replacement) = to_graphemes.get(*n) {
-                                    result_graphemes.push(*replacement);
-                                }
-                            }
-                            None => result_graphemes.push(c),
-                        }
-                    }
-
-                    builder.append_value(&result_graphemes.concat());
+                    result_buf.clear();
+                    translate_char_by_char(s, from_map, to_chars, &mut result_buf);
+                    builder.append_value(&result_buf);
                 }
             }
             None => builder.append_null(),
@@ -445,7 +431,7 @@ mod tests {
             StringArray
         );
         // Non-ASCII input with ASCII scalar from/to: exercises the
-        // grapheme fallback within translate_with_map.
+        // char-based fallback within translate_with_map.
         test_function!(
             TranslateFunc::new(),
             vec![
diff --git a/datafusion/sqllogictest/test_files/string/string_literal.slt b/datafusion/sqllogictest/test_files/string/string_literal.slt
index d4fe8ee178719..542e2bd8b58fc 100644
--- a/datafusion/sqllogictest/test_files/string/string_literal.slt
+++ b/datafusion/sqllogictest/test_files/string/string_literal.slt
@@ -312,6 +312,36 @@ SELECT lpad(NULL, 5, 'xy')
 ----
 NULL
 
+# lpad counts Unicode codepoints, not grapheme clusters.
+# chr(769) is U+0301 COMBINING ACUTE ACCENT — 'e' || chr(769) is 2 codepoints
+# but renders as a single grapheme cluster.
+# Use equality comparisons to avoid encoding ambiguity in expected output.
+
+# Input with combining character: 'e' + combining accent + 'x' = 3 codepoints.
+# Padding to 4 means 1 space prepended.
+query BII
+SELECT lpad('e' || chr(769) || 'x', 4) = ' ' || 'e' || chr(769) || 'x',
+       character_length('e' || chr(769) || 'x'),
+       character_length(lpad('e' || chr(769) || 'x', 4))
+----
+true 3 4
+
+# Truncating input with combining character: 'e' + combining accent + 'x' + 'y'
+# = 4 codepoints. Truncating to 3 keeps first 3 codepoints: 'e' + combining accent + 'x'.
+query BI
+SELECT lpad('e' || chr(769) || 'xy', 3) = 'e' || chr(769) || 'x',
+       character_length(lpad('e' || chr(769) || 'xy', 3))
+----
+true 3
+
+# Fill string with combining character: fill is 'e' + combining accent = 2 codepoints.
+# Padding 'x' (1 codepoint) to length 5 means 4 fill codepoints = 2 cycles of fill.
+query BI
+SELECT lpad('x', 5, 'e' || chr(769)) = 'e' || chr(769) || 'e' || chr(769) || 'x',
+       character_length(lpad('x', 5, 'e' || chr(769)))
+----
+true 5
+
 query T
 SELECT regexp_replace('foobar', 'bar', 'xx', 'gi')
 ----
@@ -583,6 +613,35 @@ SELECT rpad(arrow_cast(NULL, 'Utf8View'), 5, 'xy')
 ----
 NULL
 
+# rpad counts Unicode codepoints, not grapheme clusters.
+# chr(769) is U+0301 COMBINING ACUTE ACCENT.
+# Use equality comparisons to avoid encoding ambiguity in expected output.
+
+# Input with combining character: 'e' + combining accent + 'x' = 3 codepoints.
+# Padding to 4 means 1 space appended.
+query BII
+SELECT rpad('e' || chr(769) || 'x', 4) = 'e' || chr(769) || 'x' || ' ',
+       character_length('e' || chr(769) || 'x'),
+       character_length(rpad('e' || chr(769) || 'x', 4))
+----
+true 3 4
+
+# Truncating input with combining character: 'e' + combining accent + 'x' + 'y'
+# = 4 codepoints. Truncating to 3 keeps first 3 codepoints: 'e' + combining accent + 'x'.
+query BI
+SELECT rpad('e' || chr(769) || 'xy', 3) = 'e' || chr(769) || 'x',
+       character_length(rpad('e' || chr(769) || 'xy', 3))
+----
+true 3
+
+# Fill string with combining character: fill is 'e' + combining accent = 2 codepoints.
+# Padding 'x' (1 codepoint) to length 5 means 4 fill codepoints = 2 cycles of fill.
+query BI
+SELECT rpad('x', 5, 'e' || chr(769)) = 'x' || 'e' || chr(769) || 'e' || chr(769),
+       character_length(rpad('x', 5, 'e' || chr(769)))
+----
+true 5
+
 query I
 SELECT char_length('')
 ----
@@ -1829,3 +1888,28 @@ query T
 SELECT arrow_typeof(translate(arrow_cast('12345', 'Utf8View'), '143', 'ax'))
 ----
 Utf8View
+
+# translate operates on Unicode codepoints, not grapheme clusters.
+# chr(769) is U+0301 COMBINING ACUTE ACCENT.
+# Use equality comparisons to avoid encoding ambiguity in expected output.
+
+# Replacing a combining accent (a single codepoint) with another character.
+# 'e' || chr(769) is 2 codepoints; translating chr(769) → 'X' replaces just the accent.
+query B
+SELECT translate('e' || chr(769), chr(769), 'X') = 'eX'
+----
+true
+
+# Replacing the base character but not the combining accent.
+query B
+SELECT translate('e' || chr(769) || 'y', 'e', 'a') = 'a' || chr(769) || 'y'
+----
+true
+
+# Deleting a combining accent (from longer than to).
+# 'e' || chr(769) || 'x' with chr(769) in from but no corresponding to entry → deleted.
+query BI
+SELECT translate('e' || chr(769) || 'x', chr(769), '') = 'ex',
+       character_length(translate('e' || chr(769) || 'x', chr(769), ''))
+----
+true 2

From 2919ce7cdc2a3b42da96cc018f38fab743f8d317 Mon Sep 17 00:00:00 2001
From: Neil Conway <neil.conway@gmail.com>
Date: Mon, 6 Apr 2026 10:44:51 -0400
Subject: [PATCH 2/6] .

---
 datafusion/functions/Cargo.toml                       |  1 -
 datafusion/functions/src/unicode/common.rs            |  4 ++--
 datafusion/functions/src/unicode/lpad.rs              | 11 ++++++++---
 datafusion/functions/src/unicode/rpad.rs              | 11 ++++++++---
 .../sqllogictest/test_files/string/string_literal.slt |  3 ---
 5 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/datafusion/functions/Cargo.toml b/datafusion/functions/Cargo.toml
index 30f5da816b3ab..02b8e842280bf 100644
--- a/datafusion/functions/Cargo.toml
+++ b/datafusion/functions/Cargo.toml
@@ -87,7 +87,6 @@ num-traits = { workspace = true }
 rand = { workspace = true }
 regex = { workspace = true, optional = true }
 sha2 = { workspace = true, optional = true }
-
 uuid = { workspace = true, features = ["v4"], optional = true }
 
 [dev-dependencies]
diff --git a/datafusion/functions/src/unicode/common.rs b/datafusion/functions/src/unicode/common.rs
index 89e614ab092b5..bf80f550a65e4 100644
--- a/datafusion/functions/src/unicode/common.rs
+++ b/datafusion/functions/src/unicode/common.rs
@@ -78,8 +78,8 @@ impl LeftRightSlicer for RightSlicer {
     }
 }
 
-/// Returns the byte offset of the `n`th codepoint in `string`,
-/// or `string.len()` if the string has fewer than `n` codepoints.
+/// Returns the byte offset of the `n`th codepoint in `string`, or
+/// `string.len()` if the string has fewer than `n` codepoints.
 #[inline]
 pub(crate) fn byte_offset_of_char(string: &str, n: usize) -> usize {
     string
diff --git a/datafusion/functions/src/unicode/lpad.rs b/datafusion/functions/src/unicode/lpad.rs
index 18cabf2367be8..c7a29c671c1d6 100644
--- a/datafusion/functions/src/unicode/lpad.rs
+++ b/datafusion/functions/src/unicode/lpad.rs
@@ -276,7 +276,8 @@ fn lpad_scalar_unicode<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
                 let char_count = string.chars().count();
 
                 if target_len < char_count {
-                    builder.append_value(&string[..byte_offset_of_char(string, target_len)]);
+                    builder
+                        .append_value(&string[..byte_offset_of_char(string, target_len)]);
                 } else if fill_chars.is_empty() {
                     builder.append_value(string);
                 } else {
@@ -427,7 +428,9 @@ where
                     fill_chars_buf.extend(fill.chars());
 
                     if target_len < char_count {
-                        builder.append_value(&string[..byte_offset_of_char(string, target_len)]);
+                        builder.append_value(
+                            &string[..byte_offset_of_char(string, target_len)],
+                        );
                     } else if fill_chars_buf.is_empty() {
                         builder.append_value(string);
                     } else {
@@ -482,7 +485,9 @@ where
                     let char_count = string.chars().count();
 
                     if target_len < char_count {
-                        builder.append_value(&string[..byte_offset_of_char(string, target_len)]);
+                        builder.append_value(
+                            &string[..byte_offset_of_char(string, target_len)],
+                        );
                     } else {
                         for _ in 0..(target_len - char_count) {
                             builder.write_str(" ")?;
diff --git a/datafusion/functions/src/unicode/rpad.rs b/datafusion/functions/src/unicode/rpad.rs
index 72faffd3a7c9d..32d1824eb7dd0 100644
--- a/datafusion/functions/src/unicode/rpad.rs
+++ b/datafusion/functions/src/unicode/rpad.rs
@@ -277,7 +277,8 @@ fn rpad_scalar_unicode<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
                 let char_count = string.chars().count();
 
                 if target_len < char_count {
-                    builder.append_value(&string[..byte_offset_of_char(string, target_len)]);
+                    builder
+                        .append_value(&string[..byte_offset_of_char(string, target_len)]);
                 } else if fill_chars.is_empty() {
                     builder.append_value(string);
                 } else {
@@ -427,7 +428,9 @@ where
                     fill_chars_buf.extend(fill.chars());
 
                     if target_len < char_count {
-                        builder.append_value(&string[..byte_offset_of_char(string, target_len)]);
+                        builder.append_value(
+                            &string[..byte_offset_of_char(string, target_len)],
+                        );
                     } else if fill_chars_buf.is_empty() {
                         builder.append_value(string);
                     } else {
@@ -484,7 +487,9 @@ where
                     let char_count = string.chars().count();
 
                     if target_len < char_count {
-                        builder.append_value(&string[..byte_offset_of_char(string, target_len)]);
+                        builder.append_value(
+                            &string[..byte_offset_of_char(string, target_len)],
+                        );
                     } else {
                         builder.write_str(string)?;
                         for _ in 0..(target_len - char_count) {
diff --git a/datafusion/sqllogictest/test_files/string/string_literal.slt b/datafusion/sqllogictest/test_files/string/string_literal.slt
index 542e2bd8b58fc..e39bdc2ab4dd3 100644
--- a/datafusion/sqllogictest/test_files/string/string_literal.slt
+++ b/datafusion/sqllogictest/test_files/string/string_literal.slt
@@ -315,7 +315,6 @@ NULL
 # lpad counts Unicode codepoints, not grapheme clusters.
 # chr(769) is U+0301 COMBINING ACUTE ACCENT — 'e' || chr(769) is 2 codepoints
 # but renders as a single grapheme cluster.
-# Use equality comparisons to avoid encoding ambiguity in expected output.
 
 # Input with combining character: 'e' + combining accent + 'x' = 3 codepoints.
 # Padding to 4 means 1 space prepended.
@@ -615,7 +614,6 @@ NULL
 
 # rpad counts Unicode codepoints, not grapheme clusters.
 # chr(769) is U+0301 COMBINING ACUTE ACCENT.
-# Use equality comparisons to avoid encoding ambiguity in expected output.
 
 # Input with combining character: 'e' + combining accent + 'x' = 3 codepoints.
 # Padding to 4 means 1 space appended.
@@ -1891,7 +1889,6 @@ Utf8View
 
 # translate operates on Unicode codepoints, not grapheme clusters.
 # chr(769) is U+0301 COMBINING ACUTE ACCENT.
-# Use equality comparisons to avoid encoding ambiguity in expected output.
 
 # Replacing a combining accent (a single codepoint) with another character.
 # 'e' || chr(769) is 2 codepoints; translating chr(769) → 'X' replaces just the accent.

From c63800a56a1e2e03eadd95f6f0b5888f53e5ca8c Mon Sep 17 00:00:00 2001
From: Neil Conway <neil.conway@gmail.com>
Date: Tue, 7 Apr 2026 09:04:51 -0400
Subject: [PATCH 3/6] Fixes per code review

---
 .../test_files/string/string_literal.slt             |  2 +-
 docs/source/library-user-guide/upgrading/54.0.0.md   | 12 ++++++++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/datafusion/sqllogictest/test_files/string/string_literal.slt b/datafusion/sqllogictest/test_files/string/string_literal.slt
index e39bdc2ab4dd3..97f2a40c13fea 100644
--- a/datafusion/sqllogictest/test_files/string/string_literal.slt
+++ b/datafusion/sqllogictest/test_files/string/string_literal.slt
@@ -1904,7 +1904,7 @@ SELECT translate('e' || chr(769) || 'y', 'e', 'a') = 'a' || chr(769) || 'y'
 true
 
 # Deleting a combining accent (from longer than to).
-# 'e' || chr(769) || 'x' with chr(769) in from but no corresponding to entry → deleted.
+# 'e' || chr(769) || 'x' with chr(769) in `from` but no corresponding `to` entry → deleted.
 query BI
 SELECT translate('e' || chr(769) || 'x', chr(769), '') = 'ex',
        character_length(translate('e' || chr(769) || 'x', chr(769), ''))
diff --git a/docs/source/library-user-guide/upgrading/54.0.0.md b/docs/source/library-user-guide/upgrading/54.0.0.md
index 6dc08cc344e5f..24cbd113b62f7 100644
--- a/docs/source/library-user-guide/upgrading/54.0.0.md
+++ b/docs/source/library-user-guide/upgrading/54.0.0.md
@@ -320,4 +320,16 @@ which results in a few changes:
   `timestamp-*` is UTC timezone-aware, while `local-timestamp-*` is
   timezone-naive.
 
+### `lpad`, `rpad`, and `translate` now operate on Unicode codepoints instead of grapheme clusters
+
+Previously, `lpad`, `rpad`, and `translate` used Unicode grapheme cluster
+segmentation to measure and manipulate strings. They now use Unicode codepoints,
+which is consistent with the SQL standard and most other SQL implementations. It
+also matches the behavior of other string-related functions in DataFusion.
+
+The difference is only observable for strings containing combining characters
+(e.g., U+0301 COMBINING ACUTE ACCENT) or other multi-codepoint grapheme
+clusters (e.g., ZWJ emoji sequences). For ASCII and most common Unicode text,
+behavior is unchanged.
+
 [#17861]: https://github.com/apache/datafusion/pull/17861

From 82d8128910410e1014301dc8d17e2f6ab847ca3f Mon Sep 17 00:00:00 2001
From: Neil Conway <neil.conway@gmail.com>
Date: Tue, 7 Apr 2026 10:11:18 -0400
Subject: [PATCH 4/6] Clean up repeated string iteration, per review

---
 datafusion/functions/src/unicode/common.rs | 27 +++++++-
 datafusion/functions/src/unicode/lpad.rs   | 75 ++++++++++----------
 datafusion/functions/src/unicode/rpad.rs   | 81 +++++++++++-----------
 3 files changed, 104 insertions(+), 79 deletions(-)

diff --git a/datafusion/functions/src/unicode/common.rs b/datafusion/functions/src/unicode/common.rs
index bf80f550a65e4..e3c634133cbf5 100644
--- a/datafusion/functions/src/unicode/common.rs
+++ b/datafusion/functions/src/unicode/common.rs
@@ -78,8 +78,8 @@ impl LeftRightSlicer for RightSlicer {
     }
 }
 
-/// Returns the byte offset of the `n`th codepoint in `string`, or
-/// `string.len()` if the string has fewer than `n` codepoints.
+/// Returns the byte offset of the `n`th codepoint in `string`,
+/// or `string.len()` if the string has fewer than `n` codepoints.
 #[inline]
 pub(crate) fn byte_offset_of_char(string: &str, n: usize) -> usize {
     string
@@ -88,6 +88,29 @@ pub(crate) fn byte_offset_of_char(string: &str, n: usize) -> usize {
         .map_or(string.len(), |(i, _)| i)
 }
 
+/// Measures the character length of `string` in a single pass, returning
+/// early with the truncation byte offset if the string exceeds `n` chars.
+#[inline]
+pub(crate) fn measure_char_count(string: &str, n: usize) -> StringCharLen {
+    let mut count = 0;
+    for (byte_idx, _) in string.char_indices() {
+        if count == n {
+            return StringCharLen::TruncateAt(byte_idx);
+        }
+        count += 1;
+    }
+    StringCharLen::CharCount(count)
+}
+
+/// Result of [`measure_char_count`].
+pub(crate) enum StringCharLen {
+    /// The string has more than `n` chars; contains the byte offset at the
+    /// `n`-th character boundary (suitable for slicing `&string[..offset]`).
+    TruncateAt(usize),
+    /// The string has `n` or fewer chars; contains the exact character count.
+    CharCount(usize),
+}
+
 /// Calculate the byte length of the substring of `n` chars from string `string`
 #[inline]
 fn left_right_byte_length(string: &str, n: i64) -> usize {
diff --git a/datafusion/functions/src/unicode/lpad.rs b/datafusion/functions/src/unicode/lpad.rs
index c7a29c671c1d6..3e3e80f48e28d 100644
--- a/datafusion/functions/src/unicode/lpad.rs
+++ b/datafusion/functions/src/unicode/lpad.rs
@@ -177,7 +177,9 @@ impl ScalarUDFImpl for LPadFunc {
     }
 }
 
-use super::common::{byte_offset_of_char, try_as_scalar_i64, try_as_scalar_str};
+use super::common::{
+    StringCharLen, measure_char_count, try_as_scalar_i64, try_as_scalar_str,
+};
 
 /// Optimized lpad for constant target_len and fill arguments.
 fn lpad_scalar_args<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
@@ -273,18 +275,18 @@ fn lpad_scalar_unicode<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
     for maybe_string in string_array.iter() {
         match maybe_string {
             Some(string) => {
-                let char_count = string.chars().count();
-
-                if target_len < char_count {
-                    builder
-                        .append_value(&string[..byte_offset_of_char(string, target_len)]);
-                } else if fill_chars.is_empty() {
-                    builder.append_value(string);
-                } else {
-                    let pad_chars = target_len - char_count;
-                    let pad_bytes = char_byte_offsets[pad_chars];
-                    builder.write_str(&padding_buf[..pad_bytes])?;
-                    builder.append_value(string);
+                match measure_char_count(string, target_len) {
+                    StringCharLen::TruncateAt(offset) => {
+                        builder.append_value(&string[..offset]);
+                    }
+                    StringCharLen::CharCount(char_count) => {
+                        if !fill_chars.is_empty() {
+                            let pad_chars = target_len - char_count;
+                            let pad_bytes = char_byte_offsets[pad_chars];
+                            builder.write_str(&padding_buf[..pad_bytes])?;
+                        }
+                        builder.append_value(string);
+                    }
                 }
             }
             None => builder.append_null(),
@@ -422,24 +424,24 @@ where
                         builder.append_value(string);
                     }
                 } else {
-                    let char_count = string.chars().count();
-
                     fill_chars_buf.clear();
                     fill_chars_buf.extend(fill.chars());
 
-                    if target_len < char_count {
-                        builder.append_value(
-                            &string[..byte_offset_of_char(string, target_len)],
-                        );
-                    } else if fill_chars_buf.is_empty() {
-                        builder.append_value(string);
-                    } else {
-                        for l in 0..target_len - char_count {
-                            let c =
-                                *fill_chars_buf.get(l % fill_chars_buf.len()).unwrap();
-                            builder.write_char(c)?;
+                    match measure_char_count(string, target_len) {
+                        StringCharLen::TruncateAt(offset) => {
+                            builder.append_value(&string[..offset]);
+                        }
+                        StringCharLen::CharCount(char_count) => {
+                            if !fill_chars_buf.is_empty() {
+                                for l in 0..target_len - char_count {
+                                    let c = *fill_chars_buf
+                                        .get(l % fill_chars_buf.len())
+                                        .unwrap();
+                                    builder.write_char(c)?;
+                                }
+                            }
+                            builder.append_value(string);
                         }
-                        builder.append_value(string);
                     }
                 }
             } else {
@@ -482,17 +484,16 @@ where
                         builder.append_value(string);
                     }
                 } else {
-                    let char_count = string.chars().count();
-
-                    if target_len < char_count {
-                        builder.append_value(
-                            &string[..byte_offset_of_char(string, target_len)],
-                        );
-                    } else {
-                        for _ in 0..(target_len - char_count) {
-                            builder.write_str(" ")?;
+                    match measure_char_count(string, target_len) {
+                        StringCharLen::TruncateAt(offset) => {
+                            builder.append_value(&string[..offset]);
+                        }
+                        StringCharLen::CharCount(char_count) => {
+                            for _ in 0..(target_len - char_count) {
+                                builder.write_str(" ")?;
+                            }
+                            builder.append_value(string);
                         }
-                        builder.append_value(string);
                     }
                 }
             } else {
diff --git a/datafusion/functions/src/unicode/rpad.rs b/datafusion/functions/src/unicode/rpad.rs
index 32d1824eb7dd0..a027085078a6a 100644
--- a/datafusion/functions/src/unicode/rpad.rs
+++ b/datafusion/functions/src/unicode/rpad.rs
@@ -177,7 +177,9 @@ impl ScalarUDFImpl for RPadFunc {
     }
 }
 
-use super::common::{byte_offset_of_char, try_as_scalar_i64, try_as_scalar_str};
+use super::common::{
+    StringCharLen, measure_char_count, try_as_scalar_i64, try_as_scalar_str,
+};
 
 /// Optimized rpad for constant target_len and fill arguments.
 fn rpad_scalar_args<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
@@ -274,19 +276,19 @@ fn rpad_scalar_unicode<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
     for maybe_string in string_array.iter() {
         match maybe_string {
             Some(string) => {
-                let char_count = string.chars().count();
-
-                if target_len < char_count {
-                    builder
-                        .append_value(&string[..byte_offset_of_char(string, target_len)]);
-                } else if fill_chars.is_empty() {
-                    builder.append_value(string);
-                } else {
-                    let pad_chars = target_len - char_count;
-                    let pad_bytes = char_byte_offsets[pad_chars];
-                    builder.write_str(string)?;
-                    builder.write_str(&padding_buf[..pad_bytes])?;
-                    builder.append_value("");
+                match measure_char_count(string, target_len) {
+                    StringCharLen::TruncateAt(offset) => {
+                        builder.append_value(&string[..offset]);
+                    }
+                    StringCharLen::CharCount(char_count) => {
+                        builder.write_str(string)?;
+                        if !fill_chars.is_empty() {
+                            let pad_chars = target_len - char_count;
+                            let pad_bytes = char_byte_offsets[pad_chars];
+                            builder.write_str(&padding_buf[..pad_bytes])?;
+                        }
+                        builder.append_value("");
+                    }
                 }
             }
             None => builder.append_null(),
@@ -422,25 +424,25 @@ where
                         builder.append_value("");
                     }
                 } else {
-                    let char_count = string.chars().count();
-
                     fill_chars_buf.clear();
                     fill_chars_buf.extend(fill.chars());
 
-                    if target_len < char_count {
-                        builder.append_value(
-                            &string[..byte_offset_of_char(string, target_len)],
-                        );
-                    } else if fill_chars_buf.is_empty() {
-                        builder.append_value(string);
-                    } else {
-                        builder.write_str(string)?;
-                        for l in 0..target_len - char_count {
-                            let c =
-                                *fill_chars_buf.get(l % fill_chars_buf.len()).unwrap();
-                            builder.write_char(c)?;
+                    match measure_char_count(string, target_len) {
+                        StringCharLen::TruncateAt(offset) => {
+                            builder.append_value(&string[..offset]);
+                        }
+                        StringCharLen::CharCount(char_count) => {
+                            builder.write_str(string)?;
+                            if !fill_chars_buf.is_empty() {
+                                for l in 0..target_len - char_count {
+                                    let c = *fill_chars_buf
+                                        .get(l % fill_chars_buf.len())
+                                        .unwrap();
+                                    builder.write_char(c)?;
+                                }
+                            }
+                            builder.append_value("");
                         }
-                        builder.append_value("");
                     }
                 }
             } else {
@@ -484,18 +486,17 @@ where
                         builder.append_value("");
                     }
                 } else {
-                    let char_count = string.chars().count();
-
-                    if target_len < char_count {
-                        builder.append_value(
-                            &string[..byte_offset_of_char(string, target_len)],
-                        );
-                    } else {
-                        builder.write_str(string)?;
-                        for _ in 0..(target_len - char_count) {
-                            builder.write_str(" ")?;
+                    match measure_char_count(string, target_len) {
+                        StringCharLen::TruncateAt(offset) => {
+                            builder.append_value(&string[..offset]);
+                        }
+                        StringCharLen::CharCount(char_count) => {
+                            builder.write_str(string)?;
+                            for _ in 0..(target_len - char_count) {
+                                builder.write_str(" ")?;
+                            }
+                            builder.append_value("");
                         }
-                        builder.append_value("");
                     }
                 }
             } else {

From 7232c3411b435d26e678867d63ad958ac6b96475 Mon Sep 17 00:00:00 2001
From: Neil Conway <neil.conway@gmail.com>
Date: Tue, 7 Apr 2026 10:57:12 -0400
Subject: [PATCH 5/6] Improve function name

---
 datafusion/functions/src/unicode/common.rs | 18 +++++++++---------
 datafusion/functions/src/unicode/lpad.rs   | 14 +++++++-------
 datafusion/functions/src/unicode/rpad.rs   | 14 +++++++-------
 3 files changed, 23 insertions(+), 23 deletions(-)

diff --git a/datafusion/functions/src/unicode/common.rs b/datafusion/functions/src/unicode/common.rs
index e3c634133cbf5..0158325e98a19 100644
--- a/datafusion/functions/src/unicode/common.rs
+++ b/datafusion/functions/src/unicode/common.rs
@@ -88,26 +88,26 @@ pub(crate) fn byte_offset_of_char(string: &str, n: usize) -> usize {
         .map_or(string.len(), |(i, _)| i)
 }
 
-/// Measures the character length of `string` in a single pass, returning
-/// early with the truncation byte offset if the string exceeds `n` chars.
+/// If `string` has more than `n` codepoints, returns the byte offset of
+/// the `n`-th codepoint boundary. Otherwise returns the total codepoint count.
 #[inline]
-pub(crate) fn measure_char_count(string: &str, n: usize) -> StringCharLen {
+pub(crate) fn char_count_or_boundary(string: &str, n: usize) -> StringCharLen {
     let mut count = 0;
     for (byte_idx, _) in string.char_indices() {
         if count == n {
-            return StringCharLen::TruncateAt(byte_idx);
+            return StringCharLen::ByteOffset(byte_idx);
         }
         count += 1;
     }
     StringCharLen::CharCount(count)
 }
 
-/// Result of [`measure_char_count`].
+/// Result of [`char_count_or_boundary`].
 pub(crate) enum StringCharLen {
-    /// The string has more than `n` chars; contains the byte offset at the
-    /// `n`-th character boundary (suitable for slicing `&string[..offset]`).
-    TruncateAt(usize),
-    /// The string has `n` or fewer chars; contains the exact character count.
+    /// The string has more than `n` codepoints; contains the byte offset
+    /// at the `n`-th codepoint boundary.
+    ByteOffset(usize),
+    /// The string has `n` or fewer codepoints; contains the exact count.
     CharCount(usize),
 }
 
diff --git a/datafusion/functions/src/unicode/lpad.rs b/datafusion/functions/src/unicode/lpad.rs
index 3e3e80f48e28d..b61c3d3900fcc 100644
--- a/datafusion/functions/src/unicode/lpad.rs
+++ b/datafusion/functions/src/unicode/lpad.rs
@@ -178,7 +178,7 @@ impl ScalarUDFImpl for LPadFunc {
 }
 
 use super::common::{
-    StringCharLen, measure_char_count, try_as_scalar_i64, try_as_scalar_str,
+    StringCharLen, char_count_or_boundary, try_as_scalar_i64, try_as_scalar_str,
 };
 
 /// Optimized lpad for constant target_len and fill arguments.
@@ -275,8 +275,8 @@ fn lpad_scalar_unicode<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
     for maybe_string in string_array.iter() {
         match maybe_string {
             Some(string) => {
-                match measure_char_count(string, target_len) {
-                    StringCharLen::TruncateAt(offset) => {
+                match char_count_or_boundary(string, target_len) {
+                    StringCharLen::ByteOffset(offset) => {
                         builder.append_value(&string[..offset]);
                     }
                     StringCharLen::CharCount(char_count) => {
@@ -427,8 +427,8 @@ where
                     fill_chars_buf.clear();
                     fill_chars_buf.extend(fill.chars());
 
-                    match measure_char_count(string, target_len) {
-                        StringCharLen::TruncateAt(offset) => {
+                    match char_count_or_boundary(string, target_len) {
+                        StringCharLen::ByteOffset(offset) => {
                             builder.append_value(&string[..offset]);
                         }
                         StringCharLen::CharCount(char_count) => {
@@ -484,8 +484,8 @@ where
                         builder.append_value(string);
                     }
                 } else {
-                    match measure_char_count(string, target_len) {
-                        StringCharLen::TruncateAt(offset) => {
+                    match char_count_or_boundary(string, target_len) {
+                        StringCharLen::ByteOffset(offset) => {
                             builder.append_value(&string[..offset]);
                         }
                         StringCharLen::CharCount(char_count) => {
diff --git a/datafusion/functions/src/unicode/rpad.rs b/datafusion/functions/src/unicode/rpad.rs
index a027085078a6a..6b0748aead413 100644
--- a/datafusion/functions/src/unicode/rpad.rs
+++ b/datafusion/functions/src/unicode/rpad.rs
@@ -178,7 +178,7 @@ impl ScalarUDFImpl for RPadFunc {
 }
 
 use super::common::{
-    StringCharLen, measure_char_count, try_as_scalar_i64, try_as_scalar_str,
+    StringCharLen, char_count_or_boundary, try_as_scalar_i64, try_as_scalar_str,
 };
 
 /// Optimized rpad for constant target_len and fill arguments.
@@ -276,8 +276,8 @@ fn rpad_scalar_unicode<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
     for maybe_string in string_array.iter() {
         match maybe_string {
             Some(string) => {
-                match measure_char_count(string, target_len) {
-                    StringCharLen::TruncateAt(offset) => {
+                match char_count_or_boundary(string, target_len) {
+                    StringCharLen::ByteOffset(offset) => {
                         builder.append_value(&string[..offset]);
                     }
                     StringCharLen::CharCount(char_count) => {
@@ -427,8 +427,8 @@ where
                     fill_chars_buf.clear();
                     fill_chars_buf.extend(fill.chars());
 
-                    match measure_char_count(string, target_len) {
-                        StringCharLen::TruncateAt(offset) => {
+                    match char_count_or_boundary(string, target_len) {
+                        StringCharLen::ByteOffset(offset) => {
                             builder.append_value(&string[..offset]);
                         }
                         StringCharLen::CharCount(char_count) => {
@@ -486,8 +486,8 @@ where
                         builder.append_value("");
                     }
                 } else {
-                    match measure_char_count(string, target_len) {
-                        StringCharLen::TruncateAt(offset) => {
+                    match char_count_or_boundary(string, target_len) {
+                        StringCharLen::ByteOffset(offset) => {
                             builder.append_value(&string[..offset]);
                         }
                         StringCharLen::CharCount(char_count) => {

From 8a17d525165bb5c5089502b73578591c8f19bef9 Mon Sep 17 00:00:00 2001
From: Neil Conway <neil.conway@gmail.com>
Date: Tue, 7 Apr 2026 11:12:48 -0400
Subject: [PATCH 6/6] cargo fmt

---
 datafusion/functions/src/unicode/lpad.rs | 24 ++++++++++------------
 datafusion/functions/src/unicode/rpad.rs | 26 +++++++++++-------------
 2 files changed, 23 insertions(+), 27 deletions(-)

diff --git a/datafusion/functions/src/unicode/lpad.rs b/datafusion/functions/src/unicode/lpad.rs
index b61c3d3900fcc..d27bc8633e730 100644
--- a/datafusion/functions/src/unicode/lpad.rs
+++ b/datafusion/functions/src/unicode/lpad.rs
@@ -274,21 +274,19 @@ fn lpad_scalar_unicode<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
 
     for maybe_string in string_array.iter() {
         match maybe_string {
-            Some(string) => {
-                match char_count_or_boundary(string, target_len) {
-                    StringCharLen::ByteOffset(offset) => {
-                        builder.append_value(&string[..offset]);
-                    }
-                    StringCharLen::CharCount(char_count) => {
-                        if !fill_chars.is_empty() {
-                            let pad_chars = target_len - char_count;
-                            let pad_bytes = char_byte_offsets[pad_chars];
-                            builder.write_str(&padding_buf[..pad_bytes])?;
-                        }
-                        builder.append_value(string);
+            Some(string) => match char_count_or_boundary(string, target_len) {
+                StringCharLen::ByteOffset(offset) => {
+                    builder.append_value(&string[..offset]);
+                }
+                StringCharLen::CharCount(char_count) => {
+                    if !fill_chars.is_empty() {
+                        let pad_chars = target_len - char_count;
+                        let pad_bytes = char_byte_offsets[pad_chars];
+                        builder.write_str(&padding_buf[..pad_bytes])?;
                     }
+                    builder.append_value(string);
                 }
-            }
+            },
             None => builder.append_null(),
         }
     }
diff --git a/datafusion/functions/src/unicode/rpad.rs b/datafusion/functions/src/unicode/rpad.rs
index 6b0748aead413..b3e14f93526ab 100644
--- a/datafusion/functions/src/unicode/rpad.rs
+++ b/datafusion/functions/src/unicode/rpad.rs
@@ -275,22 +275,20 @@ fn rpad_scalar_unicode<'a, V: StringArrayType<'a> + Copy, T: OffsetSizeTrait>(
 
     for maybe_string in string_array.iter() {
         match maybe_string {
-            Some(string) => {
-                match char_count_or_boundary(string, target_len) {
-                    StringCharLen::ByteOffset(offset) => {
-                        builder.append_value(&string[..offset]);
-                    }
-                    StringCharLen::CharCount(char_count) => {
-                        builder.write_str(string)?;
-                        if !fill_chars.is_empty() {
-                            let pad_chars = target_len - char_count;
-                            let pad_bytes = char_byte_offsets[pad_chars];
-                            builder.write_str(&padding_buf[..pad_bytes])?;
-                        }
-                        builder.append_value("");
+            Some(string) => match char_count_or_boundary(string, target_len) {
+                StringCharLen::ByteOffset(offset) => {
+                    builder.append_value(&string[..offset]);
+                }
+                StringCharLen::CharCount(char_count) => {
+                    builder.write_str(string)?;
+                    if !fill_chars.is_empty() {
+                        let pad_chars = target_len - char_count;
+                        let pad_bytes = char_byte_offsets[pad_chars];
+                        builder.write_str(&padding_buf[..pad_bytes])?;
                     }
+                    builder.append_value("");
                 }
-            }
+            },
             None => builder.append_null(),
         }
     }