From d4e5aad2782c17e3f1d4a84e3954507afbb579bd Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 1 Jun 2026 13:10:39 +0000 Subject: [PATCH 1/4] expr: infer non-nullable columns from regex capture groups MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Columns derived from regex capture groups were unconditionally marked nullable. We can do better: a capture group that is not nested inside an optional construct — a `?`/`*` quantifier, a `{0,n}`-style repetition, or an alternation branch — always participates whenever the regex matches at all, so its extracted column is non-nullable. Add `Regex::non_nullable_capture_groups`, which parses the pattern AST and returns the indices of capture groups guaranteed to participate in a match (falling back to "all nullable" if the pattern somehow fails to parse). Use it in both `regexp_extract` and the `FORMAT REGEX` source encoding. This resolves the long-standing TODO referencing database-issues#612. --- Cargo.lock | 1 + src/expr/src/relation/func.rs | 8 +- src/repr/Cargo.toml | 1 + src/repr/src/adt/regex.rs | 134 ++++++++++++++++++ src/storage-types/src/sources/encoding.rs | 42 +++--- test/sqllogictest/regex.slt | 18 ++- .../regex-sources.td | 24 ++-- test/testdrive/regex-sources.td | 24 ++-- 8 files changed, 207 insertions(+), 45 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index ed98cb17a8f7f..9c8d6c278fa61 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -7571,6 +7571,7 @@ dependencies = [ "prost-build", "rand 0.9.4", "regex", + "regex-syntax", "ryu", "serde", "serde_json", diff --git a/src/expr/src/relation/func.rs b/src/expr/src/relation/func.rs index 63d624115f6ab..00bdd1d7cbdd8 100644 --- a/src/expr/src/relation/func.rs +++ b/src/expr/src/relation/func.rs @@ -3240,6 +3240,10 @@ pub struct AnalyzedRegex(ReprRegex, Vec, AnalyzedRegexOpts); impl AnalyzedRegex { pub fn new(s: &str, opts: AnalyzedRegexOpts) -> Result { let r = ReprRegex::new(s, opts.case_insensitive)?; + // A capture group that always participates in a match (when the regex + // matches at all) extracts a non-null value, so its column can be + // marked non-nullable. + let non_nullable = r.non_nullable_capture_groups(); // TODO(benesch): remove potentially dangerous usage of `as`. #[allow(clippy::as_conversions)] let descs: Vec<_> = r @@ -3253,9 +3257,7 @@ impl AnalyzedRegex { .map(|(i, name)| CaptureGroupDesc { index: i as u32, name: name.map(String::from), - // TODO -- we can do better. - // https://github.com/MaterializeInc/database-issues/issues/612 - nullable: true, + nullable: !non_nullable.contains(&(i as u32)), }) .collect(); Ok(Self(r, descs, opts)) diff --git a/src/repr/Cargo.toml b/src/repr/Cargo.toml index fcac5af5bd7e9..5b140663d11e8 100644 --- a/src/repr/Cargo.toml +++ b/src/repr/Cargo.toml @@ -62,6 +62,7 @@ ordered-float.workspace = true postgres-protocol.workspace = true prost.workspace = true regex.workspace = true +regex-syntax.workspace = true ryu.workspace = true serde.workspace = true serde_json = { workspace = true, features = ["arbitrary_precision", "preserve_order"] } diff --git a/src/repr/src/adt/regex.rs b/src/repr/src/adt/regex.rs index 5e4ac60955c26..922ec0754ee8b 100644 --- a/src/repr/src/adt/regex.rs +++ b/src/repr/src/adt/regex.rs @@ -11,6 +11,7 @@ use std::borrow::Cow; use std::cmp::Ordering; +use std::collections::BTreeSet; use std::fmt; use std::hash::{Hash, Hasher}; use std::ops::Deref; @@ -107,6 +108,87 @@ impl Regex { // and doesn't include any of the flags. self.regex.as_str() } + + /// Returns the indices of the capture groups that are guaranteed to + /// participate in every successful match of this regex. + /// + /// A capture group does *not* participate in a match when it is skipped + /// over by an optional construct: a `?` or `*` quantifier, a `{0,n}`-style + /// repetition, or an unmatched branch of an alternation. The indices + /// returned here are the complement of that set — capture groups that + /// always capture a value when the regex matches at all, and whose + /// extracted value is therefore non-nullable. + /// + /// Capture group indices match those used by [`regex::Captures`], where + /// index 0 refers to the entire match and is always included (it + /// necessarily participates whenever the regex matches). + /// + /// If the pattern cannot be parsed — which should not happen for a regex + /// that compiled successfully — an empty set is returned, conservatively + /// treating every capture group as nullable. + pub fn non_nullable_capture_groups(&self) -> BTreeSet { + let mut groups = BTreeSet::new(); + if let Ok(ast) = regex_syntax::ast::parse::Parser::new().parse(self.pattern()) { + // The entire match (index 0) always participates in a match. + groups.insert(0); + collect_non_nullable_capture_groups(&ast, false, &mut groups); + } + groups + } +} + +/// Walks `ast`, recording into `groups` the index of every capture group that +/// is guaranteed to participate in a match. `optional` tracks whether `ast` is +/// nested inside a construct that a match may skip over, in which case any +/// capture group found within is *not* guaranteed to participate. +fn collect_non_nullable_capture_groups( + ast: ®ex_syntax::ast::Ast, + optional: bool, + groups: &mut BTreeSet, +) { + use regex_syntax::ast::Ast; + match ast { + Ast::Repetition(rep) => { + let optional = optional || repetition_is_optional(&rep.op.kind); + collect_non_nullable_capture_groups(&rep.ast, optional, groups); + } + Ast::Group(group) => { + if !optional { + if let Some(index) = group.capture_index() { + groups.insert(index); + } + } + collect_non_nullable_capture_groups(&group.ast, optional, groups); + } + Ast::Alternation(alternation) => { + // Only one branch of an alternation matches, so any capture group + // inside a branch might be skipped over by a given match. + for ast in &alternation.asts { + collect_non_nullable_capture_groups(ast, true, groups); + } + } + Ast::Concat(concat) => { + for ast in &concat.asts { + collect_non_nullable_capture_groups(ast, optional, groups); + } + } + // The remaining AST node kinds cannot contain a capture group. + _ => {} + } +} + +/// Returns whether a repetition with the given kind may match zero times, in +/// which case the repeated subexpression might be skipped over by a match. +fn repetition_is_optional(kind: ®ex_syntax::ast::RepetitionKind) -> bool { + use regex_syntax::ast::{RepetitionKind, RepetitionRange}; + match kind { + RepetitionKind::ZeroOrOne | RepetitionKind::ZeroOrMore => true, + RepetitionKind::OneOrMore => false, + RepetitionKind::Range(range) => match range { + RepetitionRange::Exactly(min) | RepetitionRange::AtLeast(min) => *min == 0, + RepetitionRange::Bounded(min, _) => *min == 0, + }, + } } /// Error type for regex compilation failures. @@ -380,6 +462,58 @@ mod tests { } } + #[mz_ore::test] + fn regex_non_nullable_capture_groups() { + // `(idx, expected_non_nullable_indices)`. Index 0 (the whole match) + // always participates and is always reported. + let cases: &[(&str, &[u32])] = &[ + // A plain capture group always participates. + ("(a)(b)", &[0, 1, 2]), + // `?` and `*` make the group optional. + ("(a)(b)?", &[0, 1]), + ("(a)(b)*", &[0, 1]), + // `+` and `{1,n}` keep the group required. + ("(a)(b)+", &[0, 1, 2]), + ("(a)(b){1,3}", &[0, 1, 2]), + // `{0,n}` and `{0}` make the group optional. + ("(a)(b){0,3}", &[0, 1]), + ("(a)(b){0}", &[0, 1]), + // Both branches of a top-level alternation are optional. + ("(a)|(b)", &[0]), + // A group wrapping an alternation still always participates, even + // though groups *inside* the branches do not. + ("(a|b)", &[0, 1]), + ("((a)|(b))", &[0, 1]), + // Nesting inside an optional construct is contagious. + ("((a)(b))?", &[0]), + // A required group following an optional one is still required. + ("(a)?(b)", &[0, 2]), + // Named groups are handled the same way. + ("(?Pa)(?Pb)?", &[0, 1]), + // Non-capturing groups don't introduce indices. + ("(?:a)(b)", &[0, 1]), + // Real-world patterns exercised by our test suite. The request-log + // regex used by `test/testdrive/regex-sources.td`: `ip` (1), `ts` + // (2), `path` (3), and `code` (6) always participate, while + // `search_kw` (4) and `product_detail_id` (5) live inside + // alternation branches and are therefore nullable. + ( + r#"(?P\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}) - - \[(?P[^]]+)\] "(?P(?:GET /search/\?kw=(?P[^ ]*) HTTP/\d\.\d)|(?:GET /detail/(?P[a-zA-Z0-9]+) HTTP/\d\.\d)|(?:[^"]+))" (?P\d{3}) -"#, + &[0, 1, 2, 3, 6], + ), + // The key/value regexes from + // `test/testdrive/kafka-include-key-sources.td`, where both groups + // always participate. + (r#"(?P[^,]+),(?P\w+)"#, &[0, 1, 2]), + ]; + for (pattern, expected) in cases { + let regex = Regex::new(pattern, false).unwrap(); + let got = regex.non_nullable_capture_groups(); + let expected: BTreeSet = expected.iter().copied().collect(); + assert_eq!(got, expected, "pattern: {pattern}"); + } + } + #[mz_ore::test] fn regex_serde_from_reader() { let pattern = "A.*B"; diff --git a/src/storage-types/src/sources/encoding.rs b/src/storage-types/src/sources/encoding.rs index b4149972f2db1..051cb3503942e 100644 --- a/src/storage-types/src/sources/encoding.rs +++ b/src/storage-types/src/sources/encoding.rs @@ -168,23 +168,31 @@ impl DataEncoding { desc.with_column(name, ty.clone()) }) .finish(), - Self::Regex(RegexEncoding { regex }) => regex - .capture_names() - .enumerate() - // The first capture is the entire matched string. This will - // often not be useful, so skip it. If people want it they can - // just surround their entire regex in an explicit capture - // group. - .skip(1) - .fold(RelationDesc::builder(), |desc, (i, name)| { - let name = match name { - None => format!("column{}", i), - Some(name) => name.to_owned(), - }; - let ty = SqlScalarType::String.nullable(true); - desc.with_column(name, ty) - }) - .finish(), + Self::Regex(RegexEncoding { regex }) => { + // A capture group that always participates in a match (when the + // regex matches at all) extracts a non-null value, so its column + // can be marked non-nullable. + let non_nullable = regex.non_nullable_capture_groups(); + regex + .capture_names() + .enumerate() + // The first capture is the entire matched string. This will + // often not be useful, so skip it. If people want it they can + // just surround their entire regex in an explicit capture + // group. + .skip(1) + .fold(RelationDesc::builder(), |desc, (i, name)| { + let name = match name { + None => format!("column{}", i), + Some(name) => name.to_owned(), + }; + // `capture_names` yields at most `u32::MAX` groups. + let nullable = !non_nullable.contains(&u32::try_from(i).unwrap()); + let ty = SqlScalarType::String.nullable(nullable); + desc.with_column(name, ty) + }) + .finish() + } Self::Csv(CsvEncoding { columns, .. }) => match columns { ColumnSpec::Count(n) => (1..=*n) .fold(RelationDesc::builder(), |desc, i| { diff --git a/test/sqllogictest/regex.slt b/test/sqllogictest/regex.slt index 2b923e0c20b0d..53699da2aa5a1 100644 --- a/test/sqllogictest/regex.slt +++ b/test/sqllogictest/regex.slt @@ -30,7 +30,23 @@ asdf asdf NULL asdfjkl asdf NULL jkl NULL jkl -# TODO - Test that the columns have the correct nullability, once they actually do (database-issues#612) +# Regression test for database-issues#612: capture groups that always +# participate in a match produce non-nullable columns, while those guarded by a +# `?`, `*`, or `{0,n}` quantifier or an alternation branch are nullable. +statement ok +CREATE VIEW regex_nullability AS +SELECT * FROM regexp_extract('(?Pa)(?Pb)?(?P(?Pc)|d)', 'x') AS r + +query TT +SELECT c.name, c.nullable::text +FROM mz_columns c JOIN mz_views v ON c.id = v.id +WHERE v.name = 'regex_nullability' +ORDER BY c.position +---- +g1 false +g2 true +g3 false +g4 true # Standard regex matching. query TTT diff --git a/test/testdrive-old-kafka-src-syntax/regex-sources.td b/test/testdrive-old-kafka-src-syntax/regex-sources.td index a33a0601f6e64..0cba16505ed4d 100644 --- a/test/testdrive-old-kafka-src-syntax/regex-sources.td +++ b/test/testdrive-old-kafka-src-syntax/regex-sources.td @@ -30,12 +30,12 @@ $ kafka-ingest topic=request-log format=bytes > SHOW COLUMNS FROM regex_source name nullable type comment ------------------------------------------ -ip true text "" -ts true text "" -path true text "" +ip false text "" +ts false text "" +path false text "" search_kw true text "" product_detail_id true text "" -code true text "" +code false text "" > SELECT * FROM regex_source ip ts path search_kw product_detail_id code @@ -58,12 +58,12 @@ helper+ins+hennaed > SHOW COLUMNS FROM regex_source_named_cols name nullable type comment ------------------------------------------ -ip true text "" -ts true text "" -path true text "" +ip false text "" +ts false text "" +path false text "" search_kw true text "" product_detail_id true text "" -code true text "" +code false text "" # verify metadata column renaming > CREATE CLUSTER regex_source_renamed_cols_cluster SIZE '${arg.default-storage-size}'; @@ -75,12 +75,12 @@ code true text "" > SHOW COLUMNS FROM regex_source_renamed_cols name nullable type comment ------------------------------------------ -ip true text "" -ts true text "" -path true text "" +ip false text "" +ts false text "" +path false text "" search_kw true text "" product_detail_id true text "" -code true text "" +code false text "" > SELECT * FROM regex_source_named_cols ip ts path search_kw product_detail_id code diff --git a/test/testdrive/regex-sources.td b/test/testdrive/regex-sources.td index e36281dbd67a3..dd4de9ba6cfe0 100644 --- a/test/testdrive/regex-sources.td +++ b/test/testdrive/regex-sources.td @@ -30,12 +30,12 @@ $ kafka-ingest topic=request-log format=bytes > SHOW COLUMNS FROM regex_source_tbl name nullable type comment ------------------------------------------ -ip true text "" -ts true text "" -path true text "" +ip false text "" +ts false text "" +path false text "" search_kw true text "" product_detail_id true text "" -code true text "" +code false text "" > SELECT * FROM regex_source_tbl ip ts path search_kw product_detail_id code @@ -60,12 +60,12 @@ helper+ins+hennaed > SHOW COLUMNS FROM regex_source_named_cols_tbl name nullable type comment ------------------------------------------ -ip true text "" -ts true text "" -path true text "" +ip false text "" +ts false text "" +path false text "" search_kw true text "" product_detail_id true text "" -code true text "" +code false text "" # verify metadata column renaming > CREATE CLUSTER regex_source_renamed_cols_cluster SIZE '${arg.default-storage-size}'; @@ -79,12 +79,12 @@ code true text "" > SHOW COLUMNS FROM regex_source_renamed_cols_tbl name nullable type comment ------------------------------------------ -ip true text "" -ts true text "" -path true text "" +ip false text "" +ts false text "" +path false text "" search_kw true text "" product_detail_id true text "" -code true text "" +code false text "" > SELECT * FROM regex_source_named_cols_tbl ip ts path search_kw product_detail_id code From 26e8f10f17558a5843efb30eb2323d2c3819a476 Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 1 Jun 2026 16:52:48 +0000 Subject: [PATCH 2/4] repr: prune redundant traversal in capture-group analysis Once a subexpression is in an optional context the flag never resets, so no descendant can ever be non-nullable. Stop descending at optional repetitions and alternations instead of walking their subtrees, and drop the now-unnecessary `optional` parameter. --- src/repr/src/adt/regex.rs | 47 +++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 27 deletions(-) diff --git a/src/repr/src/adt/regex.rs b/src/repr/src/adt/regex.rs index 922ec0754ee8b..b5fc82c8593ab 100644 --- a/src/repr/src/adt/regex.rs +++ b/src/repr/src/adt/regex.rs @@ -131,48 +131,41 @@ impl Regex { if let Ok(ast) = regex_syntax::ast::parse::Parser::new().parse(self.pattern()) { // The entire match (index 0) always participates in a match. groups.insert(0); - collect_non_nullable_capture_groups(&ast, false, &mut groups); + collect_non_nullable_capture_groups(&ast, &mut groups); } groups } } -/// Walks `ast`, recording into `groups` the index of every capture group that -/// is guaranteed to participate in a match. `optional` tracks whether `ast` is -/// nested inside a construct that a match may skip over, in which case any -/// capture group found within is *not* guaranteed to participate. -fn collect_non_nullable_capture_groups( - ast: ®ex_syntax::ast::Ast, - optional: bool, - groups: &mut BTreeSet, -) { +/// Walks the always-matched portion of `ast`, recording into `groups` the index +/// of every capture group that is guaranteed to participate in a match. +/// +/// This only recurses through nodes that a match cannot skip over, so every +/// capture group it reaches is non-nullable. It stops descending at any node +/// that introduces an optional context — an optional repetition or an +/// alternation — since nothing beneath such a node is guaranteed to match. +fn collect_non_nullable_capture_groups(ast: ®ex_syntax::ast::Ast, groups: &mut BTreeSet) { use regex_syntax::ast::Ast; match ast { - Ast::Repetition(rep) => { - let optional = optional || repetition_is_optional(&rep.op.kind); - collect_non_nullable_capture_groups(&rep.ast, optional, groups); - } Ast::Group(group) => { - if !optional { - if let Some(index) = group.capture_index() { - groups.insert(index); - } + if let Some(index) = group.capture_index() { + groups.insert(index); } - collect_non_nullable_capture_groups(&group.ast, optional, groups); + collect_non_nullable_capture_groups(&group.ast, groups); } - Ast::Alternation(alternation) => { - // Only one branch of an alternation matches, so any capture group - // inside a branch might be skipped over by a given match. - for ast in &alternation.asts { - collect_non_nullable_capture_groups(ast, true, groups); - } + // A required repetition (e.g. `+` or `{1,n}`) still always matches, so + // keep descending; an optional one is handled by the catch-all below. + Ast::Repetition(rep) if !repetition_is_optional(&rep.op.kind) => { + collect_non_nullable_capture_groups(&rep.ast, groups); } Ast::Concat(concat) => { for ast in &concat.asts { - collect_non_nullable_capture_groups(ast, optional, groups); + collect_non_nullable_capture_groups(ast, groups); } } - // The remaining AST node kinds cannot contain a capture group. + // Everything else — alternations (only one branch matches), optional + // repetitions, and leaf nodes — cannot contribute a non-nullable + // capture group, so we stop here. _ => {} } } From c3935dda71cc90fda550e6caf2c7ed5b7d85a76f Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 1 Jun 2026 18:41:05 +0000 Subject: [PATCH 3/4] expr: harden regex capture-group nullability tests Address review feedback: - Fix the sqllogictest: a literal haystack let the optimizer constant-fold regexp_extract into an empty constant (typed all non-nullable), so the test never exercised the analysis. Use a column haystack so the FlatMap survives and the declared per-group nullability is observed. - Unit-test the soundness crux directly: a group that *wraps* an optional sub-expression ((a*), (a?)) still always captures (the empty string) and is non-nullable, unlike a group that is itself optional ((a)?). - Document that the AST recursion depth is bounded by regex-syntax's default nest_limit (250), which rejects deeper patterns before analysis. - Add a platform-check (RegexpExtractNonNullable) whose MV has always-present groups, so a desc registered all-nullable on the previous release must survive boot on the current build that re-derives it non-nullable. The persist schema is nullability-invariant (database-issues#2488), so this is safe; the check guards against regressions in that contract. --- .../materialize/checks/all_checks/regex.py | 88 +++++++++++++++++-- src/repr/src/adt/regex.rs | 12 +++ test/sqllogictest/regex.slt | 6 +- 3 files changed, 96 insertions(+), 10 deletions(-) diff --git a/misc/python/materialize/checks/all_checks/regex.py b/misc/python/materialize/checks/all_checks/regex.py index 17619998a05b3..ec0dead663944 100644 --- a/misc/python/materialize/checks/all_checks/regex.py +++ b/misc/python/materialize/checks/all_checks/regex.py @@ -12,14 +12,80 @@ from materialize.checks.checks import Check +class RegexpExtractNonNullable(Check): + """Capture groups that always participate in a match yield non-nullable + columns (database-issues#612). This exercises that such a narrowed, + persisted desc survives restart/upgrade: an MV created on the previous + release registers all-nullable columns, while the current build re-derives + the desc with non-nullable columns. (The persist schema is + nullability-invariant, see database-issues#2488, so this must not conflict.) + """ + + def initialize(self) -> Testdrive: + return Testdrive( + dedent(""" + > CREATE TABLE regexp_extract_nn_table (f1 STRING); + > INSERT INTO regexp_extract_nn_table VALUES ('ab'); + """) + ) + + def manipulate(self) -> list[Testdrive]: + return [ + Testdrive(dedent(s)) + for s in [ + """ + > CREATE MATERIALIZED VIEW regexp_extract_nn_view1 AS + SELECT g.a, g.b FROM regexp_extract_nn_table, + regexp_extract('(?P[a-z])(?P[a-z])', f1) AS g; + > INSERT INTO regexp_extract_nn_table VALUES ('kl'); + """, + """ + > CREATE MATERIALIZED VIEW regexp_extract_nn_view2 AS + SELECT g.a, g.b FROM regexp_extract_nn_table, + regexp_extract('(?P[a-z])(?P[a-z])', f1) AS g; + > INSERT INTO regexp_extract_nn_table VALUES ('yz'); + """, + ] + ] + + def validate(self) -> Testdrive: + return Testdrive( + dedent(""" + > SELECT a, b FROM regexp_extract_nn_view1 ORDER BY a; + a b + k l + y z + + > SELECT a, b FROM regexp_extract_nn_view2 ORDER BY a; + a b + k l + y z + + > SELECT c.name, c.nullable FROM mz_columns c + JOIN mz_materialized_views v ON c.id = v.id + WHERE v.name = 'regexp_extract_nn_view1' ORDER BY c.position; + a false + b false + + > SELECT c.name, c.nullable FROM mz_columns c + JOIN mz_materialized_views v ON c.id = v.id + WHERE v.name = 'regexp_extract_nn_view2' ORDER BY c.position; + a false + b false + """) + ) + + class RegexpExtract(Check): """The regex from regexp_extract has its own ProtoAnalyzedRegex""" def initialize(self) -> Testdrive: - return Testdrive(dedent(""" + return Testdrive( + dedent(""" > CREATE TABLE regexp_extract_table (f1 STRING); > INSERT INTO regexp_extract_table VALUES ('abc'); - """)) + """) + ) def manipulate(self) -> list[Testdrive]: return [ @@ -37,22 +103,26 @@ def manipulate(self) -> list[Testdrive]: ] def validate(self) -> Testdrive: - return Testdrive(dedent(""" + return Testdrive( + dedent(""" > SELECT c1::string FROM regexp_extract_view1; (,,,xyz,x,yz) (abc,a,bc,,,) > SELECT c1::string FROM regexp_extract_view2; (,,,xyz,x,yz) (abc,a,bc,,,) - """)) + """) + ) class Regex(Check): def initialize(self) -> Testdrive: - return Testdrive(dedent(""" + return Testdrive( + dedent(""" > CREATE TABLE regex_table (f1 STRING, f2 STRING); > INSERT INTO regex_table VALUES ('abc', 'abc'); - """)) + """) + ) def manipulate(self) -> list[Testdrive]: return [ @@ -70,7 +140,8 @@ def manipulate(self) -> list[Testdrive]: ] def validate(self) -> Testdrive: - return Testdrive(dedent(""" + return Testdrive( + dedent(""" > SELECT * FROM regex_view1; true true false false true true true true @@ -80,4 +151,5 @@ def validate(self) -> Testdrive: true true false false true true true true true true true true - """)) + """) + ) diff --git a/src/repr/src/adt/regex.rs b/src/repr/src/adt/regex.rs index b5fc82c8593ab..bec285656bb6b 100644 --- a/src/repr/src/adt/regex.rs +++ b/src/repr/src/adt/regex.rs @@ -144,6 +144,11 @@ impl Regex { /// capture group it reaches is non-nullable. It stops descending at any node /// that introduces an optional context — an optional repetition or an /// alternation — since nothing beneath such a node is guaranteed to match. +/// +/// The recursion depth is bounded by the AST nesting depth. This is safe +/// because the only caller obtains `ast` from `regex_syntax`'s parser, whose +/// default `nest_limit` (250) rejects more deeply nested patterns before we +/// ever get here. fn collect_non_nullable_capture_groups(ast: ®ex_syntax::ast::Ast, groups: &mut BTreeSet) { use regex_syntax::ast::Ast; match ast { @@ -477,6 +482,13 @@ mod tests { // though groups *inside* the branches do not. ("(a|b)", &[0, 1]), ("((a)|(b))", &[0, 1]), + // The soundness crux: a group that *wraps* an optional sub-expression + // still always captures (the empty string), so it is non-nullable — + // unlike a group that is *itself* made optional (`(a)?`, tested + // above). Note the difference in `?`/`*` position. + ("(a*)", &[0, 1]), + ("(a?)", &[0, 1]), + ("(a*)(b)?", &[0, 1]), // Nesting inside an optional construct is contagious. ("((a)(b))?", &[0]), // A required group following an optional one is still required. diff --git a/test/sqllogictest/regex.slt b/test/sqllogictest/regex.slt index 53699da2aa5a1..fdb4ded87c667 100644 --- a/test/sqllogictest/regex.slt +++ b/test/sqllogictest/regex.slt @@ -32,10 +32,12 @@ jkl NULL jkl # Regression test for database-issues#612: capture groups that always # participate in a match produce non-nullable columns, while those guarded by a -# `?`, `*`, or `{0,n}` quantifier or an alternation branch are nullable. +# `?`, `*`, or `{0,n}` quantifier or an alternation branch are nullable. The +# haystack is a column rather than a literal so that the table function isn't +# constant-folded into an (always non-nullable) empty constant. statement ok CREATE VIEW regex_nullability AS -SELECT * FROM regexp_extract('(?Pa)(?Pb)?(?P(?Pc)|d)', 'x') AS r +SELECT r.* FROM data, regexp_extract('(?Pa)(?Pb)?(?P(?Pc)|d)', data.input) AS r query TT SELECT c.name, c.nullable::text From 8468f784c1aa86f635048b8d60c78da74469576d Mon Sep 17 00:00:00 2001 From: Claude Date: Mon, 1 Jun 2026 19:20:09 +0000 Subject: [PATCH 4/4] checks: format regex.py with black bin/fmt formats Python with black, not `ruff format`; the two diverge on the dedent-in-call wrapping. Reformat to black's style. --- .../materialize/checks/all_checks/regex.py | 36 +++++++------------ 1 file changed, 12 insertions(+), 24 deletions(-) diff --git a/misc/python/materialize/checks/all_checks/regex.py b/misc/python/materialize/checks/all_checks/regex.py index ec0dead663944..0f4ca14feb025 100644 --- a/misc/python/materialize/checks/all_checks/regex.py +++ b/misc/python/materialize/checks/all_checks/regex.py @@ -22,12 +22,10 @@ class RegexpExtractNonNullable(Check): """ def initialize(self) -> Testdrive: - return Testdrive( - dedent(""" + return Testdrive(dedent(""" > CREATE TABLE regexp_extract_nn_table (f1 STRING); > INSERT INTO regexp_extract_nn_table VALUES ('ab'); - """) - ) + """)) def manipulate(self) -> list[Testdrive]: return [ @@ -49,8 +47,7 @@ def manipulate(self) -> list[Testdrive]: ] def validate(self) -> Testdrive: - return Testdrive( - dedent(""" + return Testdrive(dedent(""" > SELECT a, b FROM regexp_extract_nn_view1 ORDER BY a; a b k l @@ -72,20 +69,17 @@ def validate(self) -> Testdrive: WHERE v.name = 'regexp_extract_nn_view2' ORDER BY c.position; a false b false - """) - ) + """)) class RegexpExtract(Check): """The regex from regexp_extract has its own ProtoAnalyzedRegex""" def initialize(self) -> Testdrive: - return Testdrive( - dedent(""" + return Testdrive(dedent(""" > CREATE TABLE regexp_extract_table (f1 STRING); > INSERT INTO regexp_extract_table VALUES ('abc'); - """) - ) + """)) def manipulate(self) -> list[Testdrive]: return [ @@ -103,26 +97,22 @@ def manipulate(self) -> list[Testdrive]: ] def validate(self) -> Testdrive: - return Testdrive( - dedent(""" + return Testdrive(dedent(""" > SELECT c1::string FROM regexp_extract_view1; (,,,xyz,x,yz) (abc,a,bc,,,) > SELECT c1::string FROM regexp_extract_view2; (,,,xyz,x,yz) (abc,a,bc,,,) - """) - ) + """)) class Regex(Check): def initialize(self) -> Testdrive: - return Testdrive( - dedent(""" + return Testdrive(dedent(""" > CREATE TABLE regex_table (f1 STRING, f2 STRING); > INSERT INTO regex_table VALUES ('abc', 'abc'); - """) - ) + """)) def manipulate(self) -> list[Testdrive]: return [ @@ -140,8 +130,7 @@ def manipulate(self) -> list[Testdrive]: ] def validate(self) -> Testdrive: - return Testdrive( - dedent(""" + return Testdrive(dedent(""" > SELECT * FROM regex_view1; true true false false true true true true @@ -151,5 +140,4 @@ def validate(self) -> Testdrive: true true false false true true true true true true true true - """) - ) + """))