Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
87 changes: 74 additions & 13 deletions datafusion/functions/src/regex/regexpreplace.rs
Original file line number Diff line number Diff line change
Expand Up @@ -199,6 +199,22 @@ fn regex_replace_posix_groups(replacement: &str) -> String {
.into_owned()
}

/// For anchored patterns like `^...(capture)....*$` where the replacement
/// is `\1`, build a shorter regex (stripping trailing `.*$`) and use
/// `captures_read` with `CaptureLocations` for direct extraction — no
/// `expand()`, no `String` allocation.
fn try_build_short_extract_regex(pattern: &str, replacement: &str) -> Option<Regex> {
if replacement != "${1}" || !pattern.starts_with('^') || !pattern.ends_with(".*$") {
return None;
}
let short = &pattern[..pattern.len() - 3];
let re = Regex::new(short).ok()?;
if re.captures_len() != 2 {
return None;
}
Some(re)
}

/// Replaces substring(s) matching a PCRE-like regular expression.
///
/// The full list of supported features and syntax can be found at
Expand Down Expand Up @@ -457,6 +473,14 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
// with rust ones.
let replacement = regex_replace_posix_groups(replacement);

// For anchored patterns like ^...(capture)....*$, build a shorter
// regex and use captures_read for direct extraction.
let short_re = if limit == 1 {
try_build_short_extract_regex(&pattern, &replacement)
} else {
None
};

let string_array_type = args[0].data_type();
match string_array_type {
DataType::Utf8 | DataType::LargeUtf8 => {
Expand All @@ -473,13 +497,31 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(
let mut new_offsets = BufferBuilder::<T>::new(string_array.len() + 1);
new_offsets.append(T::zero());

string_array.iter().for_each(|val| {
if let Some(val) = val {
let result = re.replacen(val, limit, replacement.as_str());
vals.append_slice(result.as_bytes());
}
new_offsets.append(T::from_usize(vals.len()).unwrap());
});
if let Some(ref short_re) = short_re {
let mut locs = short_re.capture_locations();
string_array.iter().for_each(|val| {
if let Some(val) = val {
if short_re.captures_read(&mut locs, val).is_some() {
if let Some((start, end)) = locs.get(1) {
vals.append_slice(&val.as_bytes()[start..end]);
} else {
vals.append_slice(val.as_bytes());
}
} else {
vals.append_slice(val.as_bytes());
}
}
new_offsets.append(T::from_usize(vals.len()).unwrap());
});
} else {
string_array.iter().for_each(|val| {
if let Some(val) = val {
let result = re.replacen(val, limit, replacement.as_str());
vals.append_slice(result.as_bytes());
}
new_offsets.append(T::from_usize(vals.len()).unwrap());
});
}

let data = ArrayDataBuilder::new(GenericStringArray::<T>::DATA_TYPE)
.len(string_array.len())
Expand All @@ -494,12 +536,31 @@ fn _regexp_replace_static_pattern_replace<T: OffsetSizeTrait>(

let mut builder = StringViewBuilder::with_capacity(string_view_array.len());

for val in string_view_array.iter() {
if let Some(val) = val {
let result = re.replacen(val, limit, replacement.as_str());
builder.append_value(result);
} else {
builder.append_null();
if let Some(ref short_re) = short_re {
let mut locs = short_re.capture_locations();
for val in string_view_array.iter() {
if let Some(val) = val {
if short_re.captures_read(&mut locs, val).is_some() {
if let Some((start, end)) = locs.get(1) {
builder.append_value(&val[start..end]);
} else {
builder.append_value(val);
}
} else {
builder.append_value(val);
}
} else {
builder.append_null();
}
}
} else {
for val in string_view_array.iter() {
if let Some(val) = val {
let result = re.replacen(val, limit, replacement.as_str());
builder.append_value(result);
} else {
builder.append_null();
}
}
}

Expand Down
15 changes: 15 additions & 0 deletions datafusion/sqllogictest/test_files/regexp/regexp_replace.slt
Original file line number Diff line number Diff line change
Expand Up @@ -127,3 +127,18 @@ from (values ('a'), ('b')) as tbl(col);
----
NULL NULL NULL
NULL NULL NULL

# Extract domain from URL using anchored pattern with trailing .*
# This tests that the full URL suffix is replaced, not just the matched prefix
query T
SELECT regexp_replace(url, '^https?://(?:www\.)?([^/]+)/.*$', '\1') FROM (VALUES
('https://www.example.com/path/to/page?q=1'),
('http://test.org/foo/bar'),
('https://example.com/'),
('not-a-url')
) AS t(url);
----
example.com
test.org
example.com
not-a-url
Loading