From 1ce54331ea7b174f013af5caac684e04e7b9860e Mon Sep 17 00:00:00 2001 From: tiancaiamao Date: Mon, 1 Jun 2026 20:49:00 +0800 Subject: [PATCH 1/2] expression: recognize utf8mb4_0900_bin as a binary collation in isBinCollation The isBinCollation function did not include utf8mb4_0900_bin (MySQL 8.0 binary collation, ID 309). When inferCollation aggregates a column using utf8mb4_0900_bin with another non-bin utf8mb4 collation, the result incorrectly degrades to CoercibilityNone instead of keeping the _bin collation. This causes subsequent aggregation with a binary/blob column to produce a from_binary() cast that fails on non-UTF-8 bytes (ERROR 3854). Fix: add charset.CollationUTF8MB40900Bin to isBinCollation and add regression tests covering the two-column and three-column cases. Closes #68845 --- pkg/expression/collation.go | 2 +- pkg/expression/collation_test.go | 30 ++++++++++++++++++++++++++++++ pkg/parser/charset/charset.go | 2 ++ 3 files changed, 33 insertions(+), 1 deletion(-) diff --git a/pkg/expression/collation.go b/pkg/expression/collation.go index 829bc5b184237..9c51aad329c3e 100644 --- a/pkg/expression/collation.go +++ b/pkg/expression/collation.go @@ -571,7 +571,7 @@ func isUnicodeCollation(ch string) bool { func isBinCollation(collate string) bool { return collate == charset.CollationASCII || collate == charset.CollationLatin1 || collate == charset.CollationUTF8 || collate == charset.CollationUTF8MB4 || - collate == charset.CollationGBKBin + collate == charset.CollationGBKBin || collate == charset.CollationUTF8MB40900Bin } // getBinCollation get binary collation by charset diff --git a/pkg/expression/collation_test.go b/pkg/expression/collation_test.go index 550efb969de63..75d8f8c3cbada 100644 --- a/pkg/expression/collation_test.go +++ b/pkg/expression/collation_test.go @@ -147,6 +147,36 @@ func TestInferCollation(t *testing.T) { false, &ExprCollation{CoercibilityImplicit, UNICODE, charset.CharsetUTF8MB4, charset.CollationUTF8MB4}, }, + // Regression 
test: utf8mb4_0900_bin is a binary collation and should win + // over non-bin collations at the same coercibility (same as utf8mb4_bin). + { + []Expression{ + newExpression(CoercibilityImplicit, UNICODE, charset.CharsetUTF8MB4, charset.CollationUTF8MB40900Bin), + newExpression(CoercibilityImplicit, UNICODE, charset.CharsetUTF8MB4, "utf8mb4_unicode_ci"), + }, + false, + &ExprCollation{CoercibilityImplicit, UNICODE, charset.CharsetUTF8MB4, charset.CollationUTF8MB40900Bin}, + }, + { + []Expression{ + newExpression(CoercibilityImplicit, UNICODE, charset.CharsetUTF8MB4, "utf8mb4_unicode_ci"), + newExpression(CoercibilityImplicit, UNICODE, charset.CharsetUTF8MB4, charset.CollationUTF8MB40900Bin), + }, + false, + &ExprCollation{CoercibilityImplicit, UNICODE, charset.CharsetUTF8MB4, charset.CollationUTF8MB40900Bin}, + }, + // Regression test: two utf8mb4 columns with utf8mb4_0900_bin + utf8mb4_unicode_ci + // combined with a binary blob. utf8mb4_0900_bin should be recognized as bin + // collation so binary wins without triggering from_binary() cast. + { + []Expression{ + newExpression(CoercibilityImplicit, UNICODE, charset.CharsetUTF8MB4, charset.CollationUTF8MB40900Bin), + newExpression(CoercibilityImplicit, UNICODE, charset.CharsetUTF8MB4, "utf8mb4_unicode_ci"), + newExpression(CoercibilityImplicit, UNICODE, charset.CharsetBin, charset.CollationBin), + }, + false, + &ExprCollation{CoercibilityImplicit, UNICODE, charset.CharsetBin, charset.CollationBin}, + }, // binary charset with non-binary charset. { []Expression{ diff --git a/pkg/parser/charset/charset.go b/pkg/parser/charset/charset.go index 6a133c4ad2308..05957dd3bed34 100644 --- a/pkg/parser/charset/charset.go +++ b/pkg/parser/charset/charset.go @@ -238,6 +238,8 @@ const ( CollationLatin1 = "latin1_bin" // CollationGBKBin is the default collation for CharsetGBK when new collation is disabled. 
CollationGBKBin = "gbk_bin" + // CollationUTF8MB40900Bin is the utf8mb4_0900_bin collation (MySQL 8.0 binary collation for utf8mb4). + CollationUTF8MB40900Bin = "utf8mb4_0900_bin" // CollationGBKChineseCI is the default collation for CharsetGBK when new collation is enabled. CollationGBKChineseCI = "gbk_chinese_ci" // CollationGB18030Bin is the default collation for CharsetGB18030 when new collation is disabled. From 593e5edbeb3da2313c79f51760d18b6259c6b588 Mon Sep 17 00:00:00 2001 From: tiancaiamao Date: Tue, 2 Jun 2026 11:43:03 +0800 Subject: [PATCH 2/2] address comment --- pkg/expression/collation.go | 13 +++++++++++++ pkg/util/collate/collate.go | 18 +++++++++++++----- 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/pkg/expression/collation.go b/pkg/expression/collation.go index 9c51aad329c3e..3b3ba1747400c 100644 --- a/pkg/expression/collation.go +++ b/pkg/expression/collation.go @@ -568,6 +568,19 @@ func isUnicodeCollation(ch string) bool { return ch == charset.CharsetUTF8 || ch == charset.CharsetUTF8MB4 } +// isBinCollation checks whether the collation has _bin semantics for coercibility +// derivation. In MySQL's aggregation rules, when two same-charset collations conflict +// with equal coercibility, the _bin collation wins over the non-_bin one (e.g. utf8mb4_0900_bin +// wins over utf8mb4_unicode_ci) instead of the result degrading to CoercibilityNone. +// +// This is DIFFERENT from collate.IsBinCollation which tests "sortkey == raw data" +// (a storage-level property). The two diverge on: +// - gbk_bin: is a _bin collation (listed here) but its Key() does UTF-8→GBK +// conversion, so sortkey ≠ data (NOT in collate.IsBinCollation). +// - "binary": sortkey == data (in collate.IsBinCollation) but belongs to +// charset=bin which takes a different coercibility path (NOT listed here). +// +// If you add a new _bin collation here, also check collate.IsBinCollation.
func isBinCollation(collate string) bool { return collate == charset.CollationASCII || collate == charset.CollationLatin1 || collate == charset.CollationUTF8 || collate == charset.CollationUTF8MB4 || diff --git a/pkg/util/collate/collate.go b/pkg/util/collate/collate.go index 049deb2216ecc..f85b7cba66af7 100644 --- a/pkg/util/collate/collate.go +++ b/pkg/util/collate/collate.go @@ -335,14 +335,22 @@ func ConvertAndGetBinCollator(collate string) Collator { return GetCollator(ConvertAndGetBinCollation(collate)) } -// IsBinCollation returns if the collation is 'xx_bin' or 'bin'. -// The function is to determine whether the sortkey of a char type of data under the collation is equal to the data itself, -// and both xx_bin and collationBin are satisfied. +// IsBinCollation returns whether the sortkey of a char/varchar under this collation +// equals the raw data itself. This is a STORAGE-LEVEL property used by: +// - tablecodec: deciding whether restore-data is needed +// - NeedRestoredData: padding optimization +// - ranger/selectivity: assuming sortkey == data for fast paths +// +// DO NOT use this for coercibility derivation (use expression.isBinCollation instead). +// The two concepts diverge on GBK: gbk_bin's Key() does UTF-8→GBK encoding conversion +// (sortkey ≠ data), but it IS still a _bin collation for coercibility purposes. +// +// Included: ascii_bin, latin1_bin, utf8_bin, utf8mb4_bin, binary, utf8mb4_0900_bin +// NOT included: gbk_bin (its Key() transforms data via encoding) func IsBinCollation(collate string) bool { return collate == charset.CollationASCII || collate == charset.CollationLatin1 || collate == charset.CollationUTF8 || collate == charset.CollationUTF8MB4 || - collate == charset.CollationBin || collate == "utf8mb4_0900_bin" - // TODO: define a constant to reference collations + collate == charset.CollationBin || collate == charset.CollationUTF8MB40900Bin } // IsPadSpaceCollation returns whether the collation is a PAD SPACE collation.