Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

60 changes: 60 additions & 0 deletions misc/python/materialize/checks/all_checks/regex.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,66 @@
from materialize.checks.checks import Check


class RegexpExtractNonNullable(Check):
"""Capture groups that always participate in a match yield non-nullable
columns (database-issues#612). This exercises that such a narrowed,
persisted desc survives restart/upgrade: an MV created on the previous
release registers all-nullable columns, while the current build re-derives
the desc with non-nullable columns. (The persist schema is
nullability-invariant, see database-issues#2488, so this must not conflict.)
"""

def initialize(self) -> Testdrive:
return Testdrive(dedent("""
> CREATE TABLE regexp_extract_nn_table (f1 STRING);
> INSERT INTO regexp_extract_nn_table VALUES ('ab');
"""))

def manipulate(self) -> list[Testdrive]:
return [
Testdrive(dedent(s))
for s in [
"""
> CREATE MATERIALIZED VIEW regexp_extract_nn_view1 AS
SELECT g.a, g.b FROM regexp_extract_nn_table,
regexp_extract('(?P<a>[a-z])(?P<b>[a-z])', f1) AS g;
> INSERT INTO regexp_extract_nn_table VALUES ('kl');
""",
"""
> CREATE MATERIALIZED VIEW regexp_extract_nn_view2 AS
SELECT g.a, g.b FROM regexp_extract_nn_table,
regexp_extract('(?P<a>[a-z])(?P<b>[a-z])', f1) AS g;
> INSERT INTO regexp_extract_nn_table VALUES ('yz');
""",
]
]

def validate(self) -> Testdrive:
return Testdrive(dedent("""
> SELECT a, b FROM regexp_extract_nn_view1 ORDER BY a;
a b
k l
y z

> SELECT a, b FROM regexp_extract_nn_view2 ORDER BY a;
a b
k l
y z

> SELECT c.name, c.nullable FROM mz_columns c
JOIN mz_materialized_views v ON c.id = v.id
WHERE v.name = 'regexp_extract_nn_view1' ORDER BY c.position;
a false
b false

> SELECT c.name, c.nullable FROM mz_columns c
JOIN mz_materialized_views v ON c.id = v.id
WHERE v.name = 'regexp_extract_nn_view2' ORDER BY c.position;
a false
b false
"""))


class RegexpExtract(Check):
"""The regex from regexp_extract has its own ProtoAnalyzedRegex"""

Expand Down
8 changes: 5 additions & 3 deletions src/expr/src/relation/func.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3240,6 +3240,10 @@ pub struct AnalyzedRegex(ReprRegex, Vec<CaptureGroupDesc>, AnalyzedRegexOpts);
impl AnalyzedRegex {
pub fn new(s: &str, opts: AnalyzedRegexOpts) -> Result<Self, RegexCompilationError> {
let r = ReprRegex::new(s, opts.case_insensitive)?;
// A capture group that always participates in a match (when the regex
// matches at all) extracts a non-null value, so its column can be
// marked non-nullable.
let non_nullable = r.non_nullable_capture_groups();
// TODO(benesch): remove potentially dangerous usage of `as`.
#[allow(clippy::as_conversions)]
let descs: Vec<_> = r
Expand All @@ -3253,9 +3257,7 @@ impl AnalyzedRegex {
.map(|(i, name)| CaptureGroupDesc {
index: i as u32,
name: name.map(String::from),
// TODO -- we can do better.
// https://github.com/MaterializeInc/database-issues/issues/612
nullable: true,
nullable: !non_nullable.contains(&(i as u32)),
})
.collect();
Ok(Self(r, descs, opts))
Expand Down
1 change: 1 addition & 0 deletions src/repr/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -62,6 +62,7 @@ ordered-float.workspace = true
postgres-protocol.workspace = true
prost.workspace = true
regex.workspace = true
regex-syntax.workspace = true
ryu.workspace = true
serde.workspace = true
serde_json = { workspace = true, features = ["arbitrary_precision", "preserve_order"] }
Expand Down
139 changes: 139 additions & 0 deletions src/repr/src/adt/regex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

use std::borrow::Cow;
use std::cmp::Ordering;
use std::collections::BTreeSet;
use std::fmt;
use std::hash::{Hash, Hasher};
use std::ops::Deref;
Expand Down Expand Up @@ -107,6 +108,85 @@ impl Regex {
// and doesn't include any of the flags.
self.regex.as_str()
}

/// Returns the indices of the capture groups that are guaranteed to
/// participate in every successful match of this regex.
///
/// A capture group does *not* participate in a match when it is skipped
/// over by an optional construct: a `?` or `*` quantifier, a `{0,n}`-style
/// repetition, or an unmatched branch of an alternation. The indices
/// returned here are the complement of that set — capture groups that
/// always capture a value when the regex matches at all, and whose
/// extracted value is therefore non-nullable.
///
/// Capture group indices match those used by [`regex::Captures`], where
/// index 0 refers to the entire match and is always included (it
/// necessarily participates whenever the regex matches).
///
/// If the pattern cannot be parsed — which should not happen for a regex
/// that compiled successfully — an empty set is returned, conservatively
/// treating every capture group as nullable.
pub fn non_nullable_capture_groups(&self) -> BTreeSet<u32> {
let mut groups = BTreeSet::new();
if let Ok(ast) = regex_syntax::ast::parse::Parser::new().parse(self.pattern()) {
// The entire match (index 0) always participates in a match.
groups.insert(0);
collect_non_nullable_capture_groups(&ast, &mut groups);
}
groups
}
}

/// Walks the always-matched portion of `ast`, recording into `groups` the index
/// of every capture group that is guaranteed to participate in a match.
///
/// This only recurses through nodes that a match cannot skip over, so every
/// capture group it reaches is non-nullable. It stops descending at any node
/// that introduces an optional context — an optional repetition or an
/// alternation — since nothing beneath such a node is guaranteed to match.
///
/// The recursion depth is bounded by the AST nesting depth. This is safe
/// because the only caller obtains `ast` from `regex_syntax`'s parser, whose
/// default `nest_limit` (250) rejects more deeply nested patterns before we
/// ever get here.
fn collect_non_nullable_capture_groups(ast: &regex_syntax::ast::Ast, groups: &mut BTreeSet<u32>) {
use regex_syntax::ast::Ast;
match ast {
Ast::Group(group) => {
if let Some(index) = group.capture_index() {
groups.insert(index);
}
collect_non_nullable_capture_groups(&group.ast, groups);
}
// A required repetition (e.g. `+` or `{1,n}`) still always matches, so
// keep descending; an optional one is handled by the catch-all below.
Ast::Repetition(rep) if !repetition_is_optional(&rep.op.kind) => {
collect_non_nullable_capture_groups(&rep.ast, groups);
}
Ast::Concat(concat) => {
for ast in &concat.asts {
collect_non_nullable_capture_groups(ast, groups);
}
}
// Everything else — alternations (only one branch matches), optional
// repetitions, and leaf nodes — cannot contribute a non-nullable
// capture group, so we stop here.
_ => {}
}
}

/// Returns whether a repetition with the given kind may match zero times, in
/// which case the repeated subexpression might be skipped over by a match.
fn repetition_is_optional(kind: &regex_syntax::ast::RepetitionKind) -> bool {
use regex_syntax::ast::{RepetitionKind, RepetitionRange};
match kind {
RepetitionKind::ZeroOrOne | RepetitionKind::ZeroOrMore => true,
RepetitionKind::OneOrMore => false,
RepetitionKind::Range(range) => match range {
RepetitionRange::Exactly(min) | RepetitionRange::AtLeast(min) => *min == 0,
RepetitionRange::Bounded(min, _) => *min == 0,
},
}
}

/// Error type for regex compilation failures.
Expand Down Expand Up @@ -380,6 +460,65 @@ mod tests {
}
}

#[mz_ore::test]
fn regex_non_nullable_capture_groups() {
// `(idx, expected_non_nullable_indices)`. Index 0 (the whole match)
// always participates and is always reported.
let cases: &[(&str, &[u32])] = &[
// A plain capture group always participates.
("(a)(b)", &[0, 1, 2]),
// `?` and `*` make the group optional.
("(a)(b)?", &[0, 1]),
("(a)(b)*", &[0, 1]),
// `+` and `{1,n}` keep the group required.
("(a)(b)+", &[0, 1, 2]),
("(a)(b){1,3}", &[0, 1, 2]),
// `{0,n}` and `{0}` make the group optional.
("(a)(b){0,3}", &[0, 1]),
("(a)(b){0}", &[0, 1]),
// Both branches of a top-level alternation are optional.
("(a)|(b)", &[0]),
// A group wrapping an alternation still always participates, even
// though groups *inside* the branches do not.
("(a|b)", &[0, 1]),
("((a)|(b))", &[0, 1]),
// The soundness crux: a group that *wraps* an optional sub-expression
// still always captures (the empty string), so it is non-nullable —
// unlike a group that is *itself* made optional (`(a)?`, tested
// above). Note the difference in `?`/`*` position.
("(a*)", &[0, 1]),
("(a?)", &[0, 1]),
("(a*)(b)?", &[0, 1]),
// Nesting inside an optional construct is contagious.
("((a)(b))?", &[0]),
// A required group following an optional one is still required.
("(a)?(b)", &[0, 2]),
// Named groups are handled the same way.
("(?P<x>a)(?P<y>b)?", &[0, 1]),
// Non-capturing groups don't introduce indices.
("(?:a)(b)", &[0, 1]),
// Real-world patterns exercised by our test suite. The request-log
// regex used by `test/testdrive/regex-sources.td`: `ip` (1), `ts`
// (2), `path` (3), and `code` (6) always participate, while
// `search_kw` (4) and `product_detail_id` (5) live inside
// alternation branches and are therefore nullable.
(
r#"(?P<ip>\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(?P<ts>[^]]+)\] "(?P<path>(?:GET /search/\?kw=(?P<search_kw>[^ ]*) HTTP/\d\.\d)|(?:GET /detail/(?P<product_detail_id>[a-zA-Z0-9]+) HTTP/\d\.\d)|(?:[^"]+))" (?P<code>\d{3}) -"#,
&[0, 1, 2, 3, 6],
),
// The key/value regexes from
// `test/testdrive/kafka-include-key-sources.td`, where both groups
// always participate.
(r#"(?P<animal>[^,]+),(?P<food>\w+)"#, &[0, 1, 2]),
];
for (pattern, expected) in cases {
let regex = Regex::new(pattern, false).unwrap();
let got = regex.non_nullable_capture_groups();
let expected: BTreeSet<u32> = expected.iter().copied().collect();
assert_eq!(got, expected, "pattern: {pattern}");
}
}

#[mz_ore::test]
fn regex_serde_from_reader() {
let pattern = "A.*B";
Expand Down
42 changes: 25 additions & 17 deletions src/storage-types/src/sources/encoding.rs
Original file line number Diff line number Diff line change
Expand Up @@ -168,23 +168,31 @@ impl<C: ConnectionAccess> DataEncoding<C> {
desc.with_column(name, ty.clone())
})
.finish(),
Self::Regex(RegexEncoding { regex }) => regex
.capture_names()
.enumerate()
// The first capture is the entire matched string. This will
// often not be useful, so skip it. If people want it they can
// just surround their entire regex in an explicit capture
// group.
.skip(1)
.fold(RelationDesc::builder(), |desc, (i, name)| {
let name = match name {
None => format!("column{}", i),
Some(name) => name.to_owned(),
};
let ty = SqlScalarType::String.nullable(true);
desc.with_column(name, ty)
})
.finish(),
Self::Regex(RegexEncoding { regex }) => {
// A capture group that always participates in a match (when the
// regex matches at all) extracts a non-null value, so its column
// can be marked non-nullable.
let non_nullable = regex.non_nullable_capture_groups();
regex
.capture_names()
.enumerate()
// The first capture is the entire matched string. This will
// often not be useful, so skip it. If people want it they can
// just surround their entire regex in an explicit capture
// group.
.skip(1)
.fold(RelationDesc::builder(), |desc, (i, name)| {
let name = match name {
None => format!("column{}", i),
Some(name) => name.to_owned(),
};
// `capture_names` yields at most `u32::MAX` groups.
let nullable = !non_nullable.contains(&u32::try_from(i).unwrap());
let ty = SqlScalarType::String.nullable(nullable);
desc.with_column(name, ty)
})
.finish()
}
Self::Csv(CsvEncoding { columns, .. }) => match columns {
ColumnSpec::Count(n) => (1..=*n)
.fold(RelationDesc::builder(), |desc, i| {
Expand Down
20 changes: 19 additions & 1 deletion test/sqllogictest/regex.slt
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,25 @@ asdf asdf NULL
asdfjkl asdf NULL
jkl NULL jkl

# TODO - Test that the columns have the correct nullability, once they actually do (database-issues#612)
# Regression test for database-issues#612: capture groups that always
# participate in a match produce non-nullable columns, while those guarded by a
# `?`, `*`, or `{0,n}` quantifier or an alternation branch are nullable. The
# haystack is a column rather than a literal so that the table function isn't
# constant-folded into an (always non-nullable) empty constant.
statement ok
CREATE VIEW regex_nullability AS
SELECT r.* FROM data, regexp_extract('(?P<g1>a)(?P<g2>b)?(?P<g3>(?P<g4>c)|d)', data.input) AS r

query TT
SELECT c.name, c.nullable::text
FROM mz_columns c JOIN mz_views v ON c.id = v.id
WHERE v.name = 'regex_nullability'
ORDER BY c.position
----
g1 false
g2 true
g3 false
g4 true

# Standard regex matching.
query TTT
Expand Down
24 changes: 12 additions & 12 deletions test/testdrive-old-kafka-src-syntax/regex-sources.td
Original file line number Diff line number Diff line change
Expand Up @@ -30,12 +30,12 @@ $ kafka-ingest topic=request-log format=bytes
> SHOW COLUMNS FROM regex_source
name nullable type comment
------------------------------------------
ip true text ""
ts true text ""
path true text ""
ip false text ""
ts false text ""
path false text ""
search_kw true text ""
product_detail_id true text ""
code true text ""
code false text ""

> SELECT * FROM regex_source
ip ts path search_kw product_detail_id code
Expand All @@ -58,12 +58,12 @@ helper+ins+hennaed
> SHOW COLUMNS FROM regex_source_named_cols
name nullable type comment
------------------------------------------
ip true text ""
ts true text ""
path true text ""
ip false text ""
ts false text ""
path false text ""
search_kw true text ""
product_detail_id true text ""
code true text ""
code false text ""

# verify metadata column renaming
> CREATE CLUSTER regex_source_renamed_cols_cluster SIZE '${arg.default-storage-size}';
Expand All @@ -75,12 +75,12 @@ code true text ""
> SHOW COLUMNS FROM regex_source_renamed_cols
name nullable type comment
------------------------------------------
ip true text ""
ts true text ""
path true text ""
ip false text ""
ts false text ""
path false text ""
search_kw true text ""
product_detail_id true text ""
code true text ""
code false text ""

> SELECT * FROM regex_source_named_cols
ip ts path search_kw product_detail_id code
Expand Down
Loading
Loading