tidb collate 源码

  • 2022-09-19
  • 浏览 (401)

tidb collate 代码

文件路径:/util/collate/collate.go

// Copyright 2020 PingCAP, Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package collate

import (
	"fmt"
	"sync/atomic"

	"github.com/pingcap/errors"
	"github.com/pingcap/tidb/parser/charset"
	"github.com/pingcap/tidb/parser/mysql"
	"github.com/pingcap/tidb/parser/terror"
	"github.com/pingcap/tidb/util/dbterror"
	"github.com/pingcap/tidb/util/logutil"
	"go.uber.org/zap"
	"golang.org/x/exp/slices"
)

var (
	// newCollatorMap maps a collation name to its Collator; it is populated in init().
	newCollatorMap      map[string]Collator
	// newCollatorIDMap maps a collation ID to its Collator; it is populated in init().
	newCollatorIDMap    map[int]Collator
	// newCollationEnabled is 1 when the new collations are enabled and 0 otherwise.
	// Outside of init() it is read and written through sync/atomic.
	newCollationEnabled int32

	// binCollatorInstance is a singleton used for all collations when newCollationEnabled is false.
	binCollatorInstance              = &binCollator{}
	// binCollatorInstanceSliceWithLen1 is a prebuilt one-element slice holding binCollatorInstance,
	// so that a request for a single binary collator does not need to allocate.
	binCollatorInstanceSliceWithLen1 = []Collator{binCollatorInstance}

	// ErrUnsupportedCollation is returned when an unsupported collation is specified.
	ErrUnsupportedCollation = dbterror.ClassDDL.NewStdErr(mysql.ErrUnknownCollation, mysql.Message("Unsupported collation when new collation is enabled: '%-.64s'", nil))
	// ErrIllegalMixCollation is returned for an illegal mix of collations.
	ErrIllegalMixCollation = dbterror.ClassExpression.NewStd(mysql.ErrCantAggregateNcollations)
	// ErrIllegalMix2Collation is returned for an illegal mix of 2 collations.
	ErrIllegalMix2Collation = dbterror.ClassExpression.NewStd(mysql.ErrCantAggregate2collations)
	// ErrIllegalMix3Collation is returned for an illegal mix of 3 collations.
	ErrIllegalMix3Collation = dbterror.ClassExpression.NewStd(mysql.ErrCantAggregate3collations)
)

const (
	// DefaultLen is the length set on a string datum whose length is not known.
	DefaultLen = 0
	// The masks below extract the payload bits of a UTF-8 byte; they are used by
	// the hand-written rune decoder in this file.

	// first byte of a 2-byte encoding starts 110 and carries 5 bits of data
	b2Mask = 0x1F // 0001 1111
	// first byte of a 3-byte encoding starts 1110 and carries 4 bits of data
	b3Mask = 0x0F // 0000 1111
	// first byte of a 4-byte encoding starts 11110 and carries 3 bits of data
	b4Mask = 0x07 // 0000 0111
	// non-first bytes start 10 and carry 6 bits of data
	mbMask = 0x3F // 0011 1111
)

// Collator provides functionality for comparing strings and building sort keys
// for a given collation order.
type Collator interface {
	// Compare returns an integer comparing the two strings.
	// The result is 0 if a == b, -1 if a < b, and +1 if a > b.
	Compare(a, b string) int
	// Key returns the collate key for str.
	// If the collation is padding, make sure the PadLen >= len(rune[]str) in opt.
	Key(str string) []byte
	// KeyWithoutTrimRightSpace returns the collate key for str.
	// Unlike Key, str is not trimmed before the key is built.
	KeyWithoutTrimRightSpace(str string) []byte
	// Pattern returns a collation-aware WildcardPattern.
	Pattern() WildcardPattern
}

// WildcardPattern is the interface used for wildcard pattern matching.
// Usage is two-step: Compile a pattern once, then call DoMatch for each candidate string.
type WildcardPattern interface {
	// Compile compiles patternStr using the specified escape character.
	Compile(patternStr string, escape byte)
	// DoMatch reports whether str matches the compiled pattern.
	// Compile must have been called before calling it.
	DoMatch(str string) bool
}

// SetNewCollationEnabledForTest turns the new collations on or off for tests.
// It first calls switchDefaultCollation(flag) and then atomically stores the
// flag (1 for enabled, 0 for disabled).
// Note: this mutates package-level state, so tests that call it must run serially.
func SetNewCollationEnabledForTest(flag bool) {
	switchDefaultCollation(flag)

	var enabled int32
	if flag {
		enabled = 1
	}
	atomic.StoreInt32(&newCollationEnabled, enabled)
}

// NewCollationEnabled reports whether the new collations are enabled.
// The flag is read atomically.
func NewCollationEnabled() bool {
	state := atomic.LoadInt32(&newCollationEnabled)
	return state == 1
}

// CompatibleCollate reports whether the two collations can be treated as the same.
//
// Two collations are compatible when their names are equal, or when both belong
// to one of these groups:
//   - utf8mb4_general_ci, utf8_general_ci
//   - utf8mb4_bin, utf8_bin, latin1_bin
//   - utf8mb4_unicode_ci, utf8_unicode_ci
//
// The relation is symmetric: swapping the arguments never changes the result.
// Previously latin1_bin was only recognized as the first argument, so
// ("latin1_bin", "utf8_bin") was compatible while ("utf8_bin", "latin1_bin") was not.
func CompatibleCollate(collate1, collate2 string) bool {
	if collate1 == collate2 {
		return true
	}

	// group returns a non-zero family number for collations that share
	// comparison semantics with other names, and 0 for everything else.
	group := func(name string) int {
		switch name {
		case "utf8mb4_general_ci", "utf8_general_ci":
			return 1
		case "utf8mb4_bin", "utf8_bin", "latin1_bin":
			return 2
		case "utf8mb4_unicode_ci", "utf8_unicode_ci":
			return 3
		}
		return 0
	}

	g := group(collate1)
	return g != 0 && g == group(collate2)
}

// RewriteNewCollationIDIfNeeded rewrites a collation id if the new collations are enabled.
// When new collations are enabled, the collation id is negated so that the other
// components of the cluster (for example, TiKV) are able to be aware of it without any
// change to the protocol definition. An id that is already negative is returned
// unchanged after logging a warning.
// When new collations are not enabled, the collation id remains the same.
func RewriteNewCollationIDIfNeeded(id int32) int32 {
	if atomic.LoadInt32(&newCollationEnabled) != 1 {
		return id
	}
	if id < 0 {
		logutil.BgLogger().Warn("Unexpected negative collation ID for rewrite.", zap.Int32("ID", id))
		return id
	}
	return -id
}

// RestoreCollationIDIfNeeded restores a collation id if the new collations are enabled.
// When new collations are enabled, a non-positive id is negated back to its
// original value; a positive id is returned unchanged after logging a warning.
// When new collations are not enabled, the collation id remains the same.
func RestoreCollationIDIfNeeded(id int32) int32 {
	if atomic.LoadInt32(&newCollationEnabled) != 1 {
		return id
	}
	if id > 0 {
		logutil.BgLogger().Warn("Unexpected positive collation ID for restore.", zap.Int32("ID", id))
		return id
	}
	return -id
}

// GetCollator returns the collator registered for the collation name collate.
// When new collations are disabled, the binary collator is always returned.
// When they are enabled and no collator is registered under collate, the collator
// registered for charset.CollationUTF8MB4 is returned instead; a warning is
// logged unless collate is the empty string.
func GetCollator(collate string) Collator {
	if atomic.LoadInt32(&newCollationEnabled) != 1 {
		return binCollatorInstance
	}
	if found, ok := newCollatorMap[collate]; ok {
		return found
	}
	// An empty name is an expected "unspecified" input, so only warn for real names.
	if collate != "" {
		logutil.BgLogger().Warn(
			"Unable to get collator by name, use binCollator instead.",
			zap.String("name", collate),
			zap.Stack("stack"))
	}
	return newCollatorMap[charset.CollationUTF8MB4]
}

// GetBinaryCollator returns the shared binary collator instance. It is meant for
// callers that want plain binary comparison regardless of the collation settings.
func GetBinaryCollator() Collator {
	var bin Collator = binCollatorInstance
	return bin
}

// GetBinaryCollatorSlice returns a slice of length n in which every element is
// the shared binary collator. For n == 1 a prebuilt package-level slice is
// returned instead of allocating a new one.
func GetBinaryCollatorSlice(n int) []Collator {
	if n == 1 {
		return binCollatorInstanceSliceWithLen1
	}
	result := make([]Collator, n)
	for idx := range result {
		result[idx] = binCollatorInstance
	}
	return result
}

// GetCollatorByID returns the collator registered for the collation ID id.
// When new collations are disabled, the binary collator is always returned.
// When they are enabled and no collator is registered under id, a warning is
// logged and the collator registered for "utf8mb4_bin" is returned instead.
func GetCollatorByID(id int) Collator {
	if atomic.LoadInt32(&newCollationEnabled) != 1 {
		return binCollatorInstance
	}
	if found, ok := newCollatorIDMap[id]; ok {
		return found
	}
	logutil.BgLogger().Warn(
		"Unable to get collator by ID, use binCollator instead.",
		zap.Int("ID", id),
		zap.Stack("stack"))
	return newCollatorMap["utf8mb4_bin"]
}

// CollationID2Name returns the collation name for the given id.
// If the lookup by id fails, the default collation name is returned.
func CollationID2Name(id int32) string {
	if collation, err := charset.GetCollationByID(int(id)); err == nil {
		return collation.Name
	}
	// TODO(bb7133): fix repeating logs when the following code is uncommented.
	// logutil.BgLogger().Warn(
	// 	"Unable to get collation name from ID, use default collation instead.",
	// 	zap.Int32("ID", id),
	// 	zap.Stack("stack"))
	return mysql.DefaultCollationName
}

// CollationName2ID returns the collation id for the given name.
// If the lookup by name fails, the default collation id is returned.
func CollationName2ID(name string) int {
	coll, err := charset.GetCollationByName(name)
	if err != nil {
		return mysql.DefaultCollationID
	}
	return coll.ID
}

// SubstituteMissingCollationToDefault returns co unchanged when GetCollationByName
// accepts it. Otherwise it logs a warning and returns the name of the collation
// looked up via charset.CollationUTF8MB4.
//
// If that fallback lookup itself fails, the error is logged and
// charset.CollationUTF8MB4 is returned directly. Previously this path
// dereferenced a nil *charset.Collation and panicked.
func SubstituteMissingCollationToDefault(co string) string {
	if _, err := GetCollationByName(co); err == nil {
		return co
	}
	logutil.BgLogger().Warn(fmt.Sprintf("The collation %s specified on connection is not supported when new collation is enabled, switch to the default collation: %s", co, mysql.DefaultCollationName))

	coll, err := GetCollationByName(charset.CollationUTF8MB4)
	if err != nil {
		// GetCollationByName returns a nil collation together with the error,
		// so coll must not be dereferenced here.
		logutil.BgLogger().Warn(err.Error())
		return charset.CollationUTF8MB4
	}
	return coll.Name
}

// GetCollationByName wraps charset.GetCollationByName and additionally checks
// that the collation is usable.
//
// It returns a traced error when charset does not know the name. When new
// collations are enabled and no collator is registered for the collation's ID,
// it returns ErrUnsupportedCollation. On any error the returned collation is nil.
func GetCollationByName(name string) (coll *charset.Collation, err error) {
	coll, err = charset.GetCollationByName(name)
	if err != nil {
		return nil, errors.Trace(err)
	}
	if atomic.LoadInt32(&newCollationEnabled) != 1 {
		return coll, nil
	}
	if _, supported := newCollatorIDMap[coll.ID]; !supported {
		return nil, ErrUnsupportedCollation.GenWithStackByArgs(name)
	}
	return coll, nil
}

// GetSupportedCollations returns information for all collations supported so far.
//
// When new collations are disabled, the list comes straight from charset.
// When they are enabled, the list is built from the registered collators
// (skipping utf8mb4_zh_pinyin_tidb_as_cs) and sorted by collation name.
func GetSupportedCollations() []*charset.Collation {
	if atomic.LoadInt32(&newCollationEnabled) != 1 {
		return charset.GetSupportedCollations()
	}

	supported := make([]*charset.Collation, 0, len(newCollatorMap))
	for collationName := range newCollatorMap {
		// utf8mb4_zh_pinyin_tidb_as_cs is still under development and should not be shown to users.
		if collationName == "utf8mb4_zh_pinyin_tidb_as_cs" {
			continue
		}
		info, err := charset.GetCollationByName(collationName)
		if err != nil {
			// Should never happen: every registered collator has a known name.
			terror.Log(err)
			continue
		}
		supported = append(supported, info)
	}
	// Map iteration order is random, so sort to give callers a stable result.
	slices.SortFunc(supported, func(a, b *charset.Collation) bool {
		return a.Name < b.Name
	})
	return supported
}

// truncateTailingSpace returns str with all trailing ASCII space characters
// (' ') removed. Other whitespace such as tabs is left untouched.
func truncateTailingSpace(str string) string {
	end := len(str)
	for end > 0 && str[end-1] == ' ' {
		end--
	}
	return str[:end]
}

// sign returns -1, 0 or 1 depending on whether i is negative, zero or positive.
func sign(i int) int {
	switch {
	case i > 0:
		return 1
	case i < 0:
		return -1
	default:
		return 0
	}
}

// decodeRune decodes, by hand, the UTF-8 sequence that starts at s[si].
// It returns the decoded rune and the index of the byte following the sequence.
//
// The sequence length is taken from the first byte via runeLen. No validation
// is performed: continuation bytes are not checked, and a sequence that runs
// past the end of s causes an index-out-of-range panic.
func decodeRune(s string, si int) (r rune, newIndex int) {
	lead := s[si]
	switch runeLen(lead) {
	case 1:
		return rune(lead), si + 1
	case 2:
		hi := rune(lead&b2Mask) << 6
		lo := rune(s[si+1] & mbMask)
		return hi | lo, si + 2
	case 3:
		hi := rune(lead&b3Mask) << 12
		mid := rune(s[si+1]&mbMask) << 6
		lo := rune(s[si+2] & mbMask)
		return hi | mid | lo, si + 3
	default:
		// Everything else is treated as a 4-byte sequence.
		b0 := rune(lead&b4Mask) << 18
		b1 := rune(s[si+1]&mbMask) << 12
		b2 := rune(s[si+2]&mbMask) << 6
		b3 := rune(s[si+3] & mbMask)
		return b0 | b1 | b2 | b3, si + 4
	}
}

// runeLen returns the byte length of a UTF-8 sequence based on its first byte b:
// 1 for b < 0x80, 2 for b < 0xE0, 3 for b < 0xF0 and 4 otherwise.
// The byte is not validated; continuation bytes (0x80-0xBF) also report 2.
func runeLen(b byte) int {
	switch {
	case b < 0x80:
		return 1
	case b < 0xE0:
		return 2
	case b < 0xF0:
		return 3
	default:
		return 4
	}
}

// IsCICollation reports whether the collation is case-insensitive.
// Only the names listed below are recognized; the match is exact and case-sensitive.
func IsCICollation(collate string) bool {
	switch collate {
	case "utf8_general_ci", "utf8mb4_general_ci",
		"utf8_unicode_ci", "utf8mb4_unicode_ci",
		"gbk_chinese_ci":
		return true
	default:
		return false
	}
}

// IsBinCollation reports whether the collation is 'xx_bin' or 'bin'.
// It is used to determine whether the sort key of char-type data under the
// collation equals the data itself; both the xx_bin collations and
// charset.CollationBin satisfy this.
func IsBinCollation(collate string) bool {
	switch {
	case collate == charset.CollationASCII,
		collate == charset.CollationLatin1,
		collate == charset.CollationUTF8,
		collate == charset.CollationUTF8MB4,
		collate == charset.CollationBin:
		return true
	default:
		return false
	}
}

// CollationToProto converts a collation name to the int32 ID used by the protocol.
// The ID is passed through RewriteNewCollationIDIfNeeded. If the name is unknown,
// a warning is logged and the (rewritten) default collation ID is returned.
func CollationToProto(c string) int32 {
	coll, err := charset.GetCollationByName(c)
	if err == nil {
		return RewriteNewCollationIDIfNeeded(int32(coll.ID))
	}

	fallbackID := RewriteNewCollationIDIfNeeded(int32(mysql.DefaultCollationID))
	logutil.BgLogger().Warn(
		"Unable to get collation ID by name, use ID of the default collation instead",
		zap.String("name", c),
		zap.Int32("default collation ID", fallbackID),
		zap.String("default collation", mysql.DefaultCollationName),
	)
	return fallbackID
}

// ProtoToCollation converts collation from int32(used by protocol) to string.
func ProtoToCollation(c int32) string {
	coll, err := charset.GetCollationByID(int(RestoreCollationIDIfNeeded(c)))
	if err == nil {
		return coll.Name
	}
	logutil.BgLogger().Warn(
		"Unable to get collation name from ID, use name of the default collation instead",
		zap.Int32("id", c),
		zap.Int("default collation ID", mysql.DefaultCollationID),
		zap.String("default collation", mysql.DefaultCollationName),
	)
	return mysql.DefaultCollationName
}

// init enables the new collations by default and registers every supported
// collator under both its name and its ID.
func init() {
	// Set it to 1 in init() to make sure the tests enable the new collation, it would be covered in bootstrap().
	newCollationEnabled = 1

	newCollatorMap = make(map[string]Collator)
	newCollatorIDMap = make(map[int]Collator)

	// Each entry carries a constructor rather than an instance, because the
	// name map and the ID map each receive their own collator value.
	registrations := []struct {
		name  string
		build func() Collator
	}{
		{"binary", func() Collator { return &binCollator{} }},
		{"ascii_bin", func() Collator { return &binPaddingCollator{} }},
		{"latin1_bin", func() Collator { return &binPaddingCollator{} }},
		{"utf8mb4_bin", func() Collator { return &binPaddingCollator{} }},
		{"utf8_bin", func() Collator { return &binPaddingCollator{} }},
		{"utf8mb4_general_ci", func() Collator { return &generalCICollator{} }},
		{"utf8_general_ci", func() Collator { return &generalCICollator{} }},
		{"utf8mb4_unicode_ci", func() Collator { return &unicodeCICollator{} }},
		{"utf8_unicode_ci", func() Collator { return &unicodeCICollator{} }},
		{"utf8mb4_zh_pinyin_tidb_as_cs", func() Collator { return &zhPinyinTiDBASCSCollator{} }},
		{charset.CollationGBKBin, func() Collator { return &gbkBinCollator{charset.NewCustomGBKEncoder()} }},
		{charset.CollationGBKChineseCI, func() Collator { return &gbkChineseCICollator{} }},
	}
	for _, reg := range registrations {
		newCollatorMap[reg.name] = reg.build()
		newCollatorIDMap[CollationName2ID(reg.name)] = reg.build()
	}
}

相关信息

tidb 源码目录

相关文章

tidb bin 源码

tidb charset 源码

tidb gbk_bin 源码

tidb gbk_chinese_ci 源码

tidb gbk_chinese_ci_data 源码

tidb general_ci 源码

tidb pinyin_tidb_as_cs 源码

tidb unicode_ci 源码

tidb unicode_ci_data 源码

0  赞