Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
32 changes: 22 additions & 10 deletions datafusion/optimizer/src/simplify_expressions/regex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -283,20 +283,23 @@ fn partial_anchored_literal_to_like(v: &[Hir]) -> Option<String> {

/// Extracts a string literal expression assuming that [`is_anchored_literal`]
/// returned true.
fn anchored_literal_to_expr(v: &[Hir]) -> Option<Expr> {
fn anchored_literal_to_expr(v: &[Hir], string_scalar: &StringScalar) -> Option<Expr> {
match v.len() {
2 => Some(lit("")),
2 => Some(string_scalar.to_expr("")),
3 => {
let HirKind::Literal(l) = v[1].kind() else {
return None;
};
like_str_from_literal(l).map(lit)
like_str_from_literal(l).map(|s| string_scalar.to_expr(s))
}
_ => None,
}
}

fn anchored_alternation_to_exprs(v: &[Hir]) -> Option<Vec<Expr>> {
fn anchored_alternation_to_exprs(
v: &[Hir],
string_scalar: &StringScalar,
) -> Option<Vec<Expr>> {
if 3 != v.len() {
return None;
}
Expand All @@ -308,7 +311,8 @@ fn anchored_alternation_to_exprs(v: &[Hir]) -> Option<Vec<Expr>> {
for hir in alters {
let mut is_safe = false;
if let HirKind::Literal(l) = hir.kind()
&& let Some(safe_literal) = str_from_literal(l).map(lit)
&& let Some(safe_literal) =
str_from_literal(l).map(|s| string_scalar.to_expr(s))
{
literals.push(safe_literal);
is_safe = true;
Expand All @@ -321,7 +325,9 @@ fn anchored_alternation_to_exprs(v: &[Hir]) -> Option<Vec<Expr>> {

return Some(literals);
} else if let HirKind::Literal(l) = sub.kind() {
if let Some(safe_literal) = str_from_literal(l).map(lit) {
if let Some(safe_literal) =
str_from_literal(l).map(|s| string_scalar.to_expr(s))
{
return Some(vec![safe_literal]);
}
return None;
Expand Down Expand Up @@ -351,12 +357,18 @@ fn lower_simple(
));
}
HirKind::Concat(inner) if is_anchored_literal(inner) => {
return anchored_literal_to_expr(inner).map(|right| {
mode.expr_matches_literal(Box::new(left.clone()), Box::new(right))
return anchored_literal_to_expr(inner, string_scalar).map(|right| {
if mode.i {
// Case-insensitive: use ILIKE for exact match (no wildcards)
mode.expr(Box::new(left.clone()), Box::new(right))
} else {
// Case-sensitive: use Eq / NotEq
mode.expr_matches_literal(Box::new(left.clone()), Box::new(right))
}
});
}
HirKind::Concat(inner) if is_anchored_capture(inner) => {
return anchored_alternation_to_exprs(inner)
HirKind::Concat(inner) if !mode.i && is_anchored_capture(inner) => {
return anchored_alternation_to_exprs(inner, string_scalar)
.map(|right| left.clone().in_list(right, mode.not));
}
HirKind::Concat(inner) => {
Expand Down
53 changes: 53 additions & 0 deletions datafusion/sqllogictest/test_files/predicates.slt
Original file line number Diff line number Diff line change
Expand Up @@ -204,12 +204,65 @@ SELECT * FROM test WHERE column1 ~ 'z'
----
Bazzz

query T
SELECT * FROM test WHERE column1 ~ '^Bazzz$'
----
Bazzz

query T
SELECT * FROM test WHERE column1 ~ '^(foo|Bazzz)$'
----
foo
Bazzz

statement ok
CREATE TABLE test_regex_utf8view(s VARCHAR) AS VALUES ('foo'), ('Bazzz');
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Question to educate myself: how are the values here Utf8View?
I'd expect some casting to be needed to achieve that.


query T
SELECT * FROM test_regex_utf8view WHERE s ~ '^Bazzz$'
----
Bazzz

query T
SELECT * FROM test_regex_utf8view WHERE s ~ '^(foo|Bazzz)$'
----
foo
Bazzz

# Case-insensitive anchored match over Utf8View: must be simplified to ILIKE
# (not a case-sensitive Eq) and must keep operand types as Utf8View.
query T
SELECT * FROM test_regex_utf8view WHERE s ~* '^bazzz$'
----
Bazzz
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

How does this assert the expected result?
Neither the optimization nor the type is asserted.
Maybe use EXPLAIN ... and assert its output instead?


# Case-insensitive anchored alternation over Utf8View
query T rowsort
SELECT * FROM test_regex_utf8view WHERE s ~* '^(foo|bazzz)$'
----
Bazzz
foo

statement ok
DROP TABLE test_regex_utf8view;

query T
SELECT * FROM test WHERE column1 ~* 'z'
----
Bazzz
ZZZZZ

query T
SELECT * FROM test WHERE column1 ~* '^barrr$'
----
Barrr

query T
SELECT * FROM test WHERE column1 ~* '^(barrr|bazzz)$'
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Tests that combine negation with regex matching (!~ and !~*) are missing.

----
Barrr
Bazzz

query T
SELECT * FROM test WHERE column1 !~ 'z'
----
Expand Down
Loading