Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 14 additions & 1 deletion pkg/expression/collation.go
Original file line number Diff line number Diff line change
Expand Up @@ -568,10 +568,23 @@ func isUnicodeCollation(ch string) bool {
return ch == charset.CharsetUTF8 || ch == charset.CharsetUTF8MB4
}

// isBinCollation checks whether the collation has _bin semantics for coercibility
// derivation. In MySQL's aggregation rules, when two same-charset collations conflict
// with equal coercibility, a _bin collation yields to a non-_bin one (e.g. gbk_bin
// yields to gbk_chinese_ci) instead of degrading to CoercibilityNone.
//
// This is DIFFERENT from collate.IsBinCollation which tests "sortkey == raw data"
// (a storage-level property). The two diverge on:
// - gbk_bin: is a _bin collation (listed here) but its Key() does UTF-8→GBK
// conversion, so sortkey ≠ data (NOT in collate.IsBinCollation).
// - "binary": sortkey == data (in collate.IsBinCollation) but belongs to
// charset=bin which takes a different coercibility path (NOT listed here).
//
// If you add a new _bin collation here, also check collate.IsBinCollation.
func isBinCollation(collate string) bool {
	// Only _bin collations of non-binary charsets are listed. The "binary"
	// collation is deliberately absent: it belongs to charset=bin, which takes
	// a different coercibility path.
	switch collate {
	case charset.CollationASCII,
		charset.CollationLatin1,
		charset.CollationUTF8,
		charset.CollationUTF8MB4,
		charset.CollationGBKBin,
		charset.CollationUTF8MB40900Bin:
		return true
	default:
		return false
	}
}

// getBinCollation get binary collation by charset
Expand Down
30 changes: 30 additions & 0 deletions pkg/expression/collation_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -147,6 +147,36 @@ func TestInferCollation(t *testing.T) {
false,
&ExprCollation{CoercibilityImplicit, UNICODE, charset.CharsetUTF8MB4, charset.CollationUTF8MB4},
},
// Regression test: utf8mb4_0900_bin is a binary collation and should win
// over non-bin collations at the same coercibility (same as utf8mb4_bin).
{
[]Expression{
newExpression(CoercibilityImplicit, UNICODE, charset.CharsetUTF8MB4, charset.CollationUTF8MB40900Bin),
newExpression(CoercibilityImplicit, UNICODE, charset.CharsetUTF8MB4, "utf8mb4_unicode_ci"),
},
false,
&ExprCollation{CoercibilityImplicit, UNICODE, charset.CharsetUTF8MB4, charset.CollationUTF8MB40900Bin},
},
{
[]Expression{
newExpression(CoercibilityImplicit, UNICODE, charset.CharsetUTF8MB4, "utf8mb4_unicode_ci"),
newExpression(CoercibilityImplicit, UNICODE, charset.CharsetUTF8MB4, charset.CollationUTF8MB40900Bin),
},
false,
&ExprCollation{CoercibilityImplicit, UNICODE, charset.CharsetUTF8MB4, charset.CollationUTF8MB40900Bin},
},
// Regression test: two utf8mb4 columns with utf8mb4_0900_bin + utf8mb4_unicode_ci
// combined with a binary blob. utf8mb4_0900_bin should be recognized as bin
// collation so binary wins without triggering from_binary() cast.
{
[]Expression{
newExpression(CoercibilityImplicit, UNICODE, charset.CharsetUTF8MB4, charset.CollationUTF8MB40900Bin),
newExpression(CoercibilityImplicit, UNICODE, charset.CharsetUTF8MB4, "utf8mb4_unicode_ci"),
newExpression(CoercibilityImplicit, UNICODE, charset.CharsetBin, charset.CollationBin),
},
false,
&ExprCollation{CoercibilityImplicit, UNICODE, charset.CharsetBin, charset.CollationBin},
},
// binary charset with non-binary charset.
{
[]Expression{
Expand Down
2 changes: 2 additions & 0 deletions pkg/parser/charset/charset.go
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,8 @@ const (
CollationLatin1 = "latin1_bin"
// CollationGBKBin is the default collation for CharsetGBK when new collation is disabled.
CollationGBKBin = "gbk_bin"
// CollationUTF8MB40900Bin is the utf8mb4_0900_bin collation (MySQL 8.0 binary collation for utf8mb4).
CollationUTF8MB40900Bin = "utf8mb4_0900_bin"
// CollationGBKChineseCI is the default collation for CharsetGBK when new collation is enabled.
CollationGBKChineseCI = "gbk_chinese_ci"
// CollationGB18030Bin is the default collation for CharsetGB18030 when new collation is disabled.
Expand Down
18 changes: 13 additions & 5 deletions pkg/util/collate/collate.go
Original file line number Diff line number Diff line change
Expand Up @@ -335,14 +335,22 @@ func ConvertAndGetBinCollator(collate string) Collator {
return GetCollator(ConvertAndGetBinCollation(collate))
}

// IsBinCollation returns whether the sortkey of a char/varchar under this collation
// equals the raw data itself. This is a STORAGE-LEVEL property used by:
//   - tablecodec: deciding whether restore-data is needed
//   - NeedRestoredData: padding optimization
//   - ranger/selectivity: assuming sortkey == data for fast paths
//
// DO NOT use this for coercibility derivation (use expression.isBinCollation instead).
// The two concepts diverge on GBK: gbk_bin's Key() does UTF-8→GBK encoding conversion
// (sortkey ≠ data), but it IS still a _bin collation for coercibility purposes.
//
// Included: ascii_bin, latin1_bin, utf8_bin, utf8mb4_bin, binary, utf8mb4_0900_bin
// NOT included: gbk_bin (its Key() transforms data via encoding)
func IsBinCollation(collate string) bool {
	switch collate {
	case charset.CollationASCII,
		charset.CollationLatin1,
		charset.CollationUTF8,
		charset.CollationUTF8MB4,
		charset.CollationBin,
		charset.CollationUTF8MB40900Bin:
		return true
	default:
		return false
	}
}

// IsPadSpaceCollation returns whether the collation is a PAD SPACE collation.
Expand Down
Loading