go normalize 源码

  • 2022-07-15
  • 浏览 (1027)

golang normalize 代码

文件路径:/src/vendor/golang.org/x/text/unicode/norm/normalize.go

// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

// Note: the file data_test.go that is generated should not be checked in.
//go:generate go run maketables.go triegen.go
//go:generate go test -tags test

// Package norm contains types and functions for normalizing Unicode strings.
package norm // import "golang.org/x/text/unicode/norm"

import (
	"unicode/utf8"

	"golang.org/x/text/transform"
)

// A Form denotes a canonical representation of Unicode code points.
// The Unicode-defined normalization and equivalence forms are:
//
//	NFC   Unicode Normalization Form C
//	NFD   Unicode Normalization Form D
//	NFKC  Unicode Normalization Form KC
//	NFKD  Unicode Normalization Form KD
//
// For a Form f, this documentation uses the notation f(x) to mean
// the bytes or string x converted to the given form.
// A position n in x is called a boundary if conversion to the form can
// proceed independently on both sides:
//
//	f(x) == append(f(x[0:n]), f(x[n:])...)
//
// References: https://unicode.org/reports/tr15/ and
// https://unicode.org/notes/tn5/.
type Form int

const (
	NFC Form = iota
	NFD
	NFKC
	NFKD
)

// Bytes returns f(b). May return b if f(b) = b.
func (f Form) Bytes(b []byte) []byte {
	src := inputBytes(b)
	ft := formTable[f]
	n, ok := ft.quickSpan(src, 0, len(b), true)
	if ok {
		return b
	}
	out := make([]byte, n, len(b))
	copy(out, b[0:n])
	rb := reorderBuffer{f: *ft, src: src, nsrc: len(b), out: out, flushF: appendFlush}
	return doAppendInner(&rb, n)
}

// String returns f(s).
func (f Form) String(s string) string {
	src := inputString(s)
	ft := formTable[f]
	n, ok := ft.quickSpan(src, 0, len(s), true)
	if ok {
		return s
	}
	out := make([]byte, n, len(s))
	copy(out, s[0:n])
	rb := reorderBuffer{f: *ft, src: src, nsrc: len(s), out: out, flushF: appendFlush}
	return string(doAppendInner(&rb, n))
}

// IsNormal returns true if b == f(b).
func (f Form) IsNormal(b []byte) bool {
	src := inputBytes(b)
	ft := formTable[f]
	bp, ok := ft.quickSpan(src, 0, len(b), true)
	if ok {
		return true
	}
	rb := reorderBuffer{f: *ft, src: src, nsrc: len(b)}
	rb.setFlusher(nil, cmpNormalBytes)
	for bp < len(b) {
		rb.out = b[bp:]
		if bp = decomposeSegment(&rb, bp, true); bp < 0 {
			return false
		}
		bp, _ = rb.f.quickSpan(rb.src, bp, len(b), true)
	}
	return true
}

func cmpNormalBytes(rb *reorderBuffer) bool {
	b := rb.out
	for i := 0; i < rb.nrune; i++ {
		info := rb.rune[i]
		if int(info.size) > len(b) {
			return false
		}
		p := info.pos
		pe := p + info.size
		for ; p < pe; p++ {
			if b[0] != rb.byte[p] {
				return false
			}
			b = b[1:]
		}
	}
	return true
}

// IsNormalString returns true if s == f(s).
func (f Form) IsNormalString(s string) bool {
	src := inputString(s)
	ft := formTable[f]
	bp, ok := ft.quickSpan(src, 0, len(s), true)
	if ok {
		return true
	}
	rb := reorderBuffer{f: *ft, src: src, nsrc: len(s)}
	rb.setFlusher(nil, func(rb *reorderBuffer) bool {
		for i := 0; i < rb.nrune; i++ {
			info := rb.rune[i]
			if bp+int(info.size) > len(s) {
				return false
			}
			p := info.pos
			pe := p + info.size
			for ; p < pe; p++ {
				if s[bp] != rb.byte[p] {
					return false
				}
				bp++
			}
		}
		return true
	})
	for bp < len(s) {
		if bp = decomposeSegment(&rb, bp, true); bp < 0 {
			return false
		}
		bp, _ = rb.f.quickSpan(rb.src, bp, len(s), true)
	}
	return true
}

// patchTail fixes a case where a rune may be incorrectly normalized
// if it is followed by illegal continuation bytes. It returns the
// patched buffer and whether the decomposition is still in progress.
func patchTail(rb *reorderBuffer) bool {
	info, p := lastRuneStart(&rb.f, rb.out)
	if p == -1 || info.size == 0 {
		return true
	}
	end := p + int(info.size)
	extra := len(rb.out) - end
	if extra > 0 {
		// Potentially allocating memory. However, this only
		// happens with ill-formed UTF-8.
		x := make([]byte, 0)
		x = append(x, rb.out[len(rb.out)-extra:]...)
		rb.out = rb.out[:end]
		decomposeToLastBoundary(rb)
		rb.doFlush()
		rb.out = append(rb.out, x...)
		return false
	}
	buf := rb.out[p:]
	rb.out = rb.out[:p]
	decomposeToLastBoundary(rb)
	if s := rb.ss.next(info); s == ssStarter {
		rb.doFlush()
		rb.ss.first(info)
	} else if s == ssOverflow {
		rb.doFlush()
		rb.insertCGJ()
		rb.ss = 0
	}
	rb.insertUnsafe(inputBytes(buf), 0, info)
	return true
}

func appendQuick(rb *reorderBuffer, i int) int {
	if rb.nsrc == i {
		return i
	}
	end, _ := rb.f.quickSpan(rb.src, i, rb.nsrc, true)
	rb.out = rb.src.appendSlice(rb.out, i, end)
	return end
}

// Append returns f(append(out, b...)).
// The buffer out must be nil, empty, or equal to f(out).
func (f Form) Append(out []byte, src ...byte) []byte {
	return f.doAppend(out, inputBytes(src), len(src))
}

func (f Form) doAppend(out []byte, src input, n int) []byte {
	if n == 0 {
		return out
	}
	ft := formTable[f]
	// Attempt to do a quickSpan first so we can avoid initializing the reorderBuffer.
	if len(out) == 0 {
		p, _ := ft.quickSpan(src, 0, n, true)
		out = src.appendSlice(out, 0, p)
		if p == n {
			return out
		}
		rb := reorderBuffer{f: *ft, src: src, nsrc: n, out: out, flushF: appendFlush}
		return doAppendInner(&rb, p)
	}
	rb := reorderBuffer{f: *ft, src: src, nsrc: n}
	return doAppend(&rb, out, 0)
}

func doAppend(rb *reorderBuffer, out []byte, p int) []byte {
	rb.setFlusher(out, appendFlush)
	src, n := rb.src, rb.nsrc
	doMerge := len(out) > 0
	if q := src.skipContinuationBytes(p); q > p {
		// Move leading non-starters to destination.
		rb.out = src.appendSlice(rb.out, p, q)
		p = q
		doMerge = patchTail(rb)
	}
	fd := &rb.f
	if doMerge {
		var info Properties
		if p < n {
			info = fd.info(src, p)
			if !info.BoundaryBefore() || info.nLeadingNonStarters() > 0 {
				if p == 0 {
					decomposeToLastBoundary(rb)
				}
				p = decomposeSegment(rb, p, true)
			}
		}
		if info.size == 0 {
			rb.doFlush()
			// Append incomplete UTF-8 encoding.
			return src.appendSlice(rb.out, p, n)
		}
		if rb.nrune > 0 {
			return doAppendInner(rb, p)
		}
	}
	p = appendQuick(rb, p)
	return doAppendInner(rb, p)
}

func doAppendInner(rb *reorderBuffer, p int) []byte {
	for n := rb.nsrc; p < n; {
		p = decomposeSegment(rb, p, true)
		p = appendQuick(rb, p)
	}
	return rb.out
}

// AppendString returns f(append(out, []byte(s))).
// The buffer out must be nil, empty, or equal to f(out).
func (f Form) AppendString(out []byte, src string) []byte {
	return f.doAppend(out, inputString(src), len(src))
}

// QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]).
// It is not guaranteed to return the largest such n.
func (f Form) QuickSpan(b []byte) int {
	n, _ := formTable[f].quickSpan(inputBytes(b), 0, len(b), true)
	return n
}

// Span implements transform.SpanningTransformer. It returns a boundary n such
// that b[0:n] == f(b[0:n]). It is not guaranteed to return the largest such n.
func (f Form) Span(b []byte, atEOF bool) (n int, err error) {
	n, ok := formTable[f].quickSpan(inputBytes(b), 0, len(b), atEOF)
	if n < len(b) {
		if !ok {
			err = transform.ErrEndOfSpan
		} else {
			err = transform.ErrShortSrc
		}
	}
	return n, err
}

// SpanString returns a boundary n such that s[0:n] == f(s[0:n]).
// It is not guaranteed to return the largest such n.
func (f Form) SpanString(s string, atEOF bool) (n int, err error) {
	n, ok := formTable[f].quickSpan(inputString(s), 0, len(s), atEOF)
	if n < len(s) {
		if !ok {
			err = transform.ErrEndOfSpan
		} else {
			err = transform.ErrShortSrc
		}
	}
	return n, err
}

// quickSpan returns a boundary n such that src[0:n] == f(src[0:n]) and
// whether any non-normalized parts were found. If atEOF is false, n will
// not point past the last segment if this segment might be become
// non-normalized by appending other runes.
func (f *formInfo) quickSpan(src input, i, end int, atEOF bool) (n int, ok bool) {
	var lastCC uint8
	ss := streamSafe(0)
	lastSegStart := i
	for n = end; i < n; {
		if j := src.skipASCII(i, n); i != j {
			i = j
			lastSegStart = i - 1
			lastCC = 0
			ss = 0
			continue
		}
		info := f.info(src, i)
		if info.size == 0 {
			if atEOF {
				// include incomplete runes
				return n, true
			}
			return lastSegStart, true
		}
		// This block needs to be before the next, because it is possible to
		// have an overflow for runes that are starters (e.g. with U+FF9E).
		switch ss.next(info) {
		case ssStarter:
			lastSegStart = i
		case ssOverflow:
			return lastSegStart, false
		case ssSuccess:
			if lastCC > info.ccc {
				return lastSegStart, false
			}
		}
		if f.composing {
			if !info.isYesC() {
				break
			}
		} else {
			if !info.isYesD() {
				break
			}
		}
		lastCC = info.ccc
		i += int(info.size)
	}
	if i == n {
		if !atEOF {
			n = lastSegStart
		}
		return n, true
	}
	return lastSegStart, false
}

// QuickSpanString returns a boundary n such that s[0:n] == f(s[0:n]).
// It is not guaranteed to return the largest such n.
func (f Form) QuickSpanString(s string) int {
	n, _ := formTable[f].quickSpan(inputString(s), 0, len(s), true)
	return n
}

// FirstBoundary returns the position i of the first boundary in b
// or -1 if b contains no boundary.
func (f Form) FirstBoundary(b []byte) int {
	return f.firstBoundary(inputBytes(b), len(b))
}

func (f Form) firstBoundary(src input, nsrc int) int {
	i := src.skipContinuationBytes(0)
	if i >= nsrc {
		return -1
	}
	fd := formTable[f]
	ss := streamSafe(0)
	// We should call ss.first here, but we can't as the first rune is
	// skipped already. This means FirstBoundary can't really determine
	// CGJ insertion points correctly. Luckily it doesn't have to.
	for {
		info := fd.info(src, i)
		if info.size == 0 {
			return -1
		}
		if s := ss.next(info); s != ssSuccess {
			return i
		}
		i += int(info.size)
		if i >= nsrc {
			if !info.BoundaryAfter() && !ss.isMax() {
				return -1
			}
			return nsrc
		}
	}
}

// FirstBoundaryInString returns the position i of the first boundary in s
// or -1 if s contains no boundary.
func (f Form) FirstBoundaryInString(s string) int {
	return f.firstBoundary(inputString(s), len(s))
}

// NextBoundary reports the index of the boundary between the first and next
// segment in b or -1 if atEOF is false and there are not enough bytes to
// determine this boundary.
func (f Form) NextBoundary(b []byte, atEOF bool) int {
	return f.nextBoundary(inputBytes(b), len(b), atEOF)
}

// NextBoundaryInString reports the index of the boundary between the first and
// next segment in b or -1 if atEOF is false and there are not enough bytes to
// determine this boundary.
func (f Form) NextBoundaryInString(s string, atEOF bool) int {
	return f.nextBoundary(inputString(s), len(s), atEOF)
}

func (f Form) nextBoundary(src input, nsrc int, atEOF bool) int {
	if nsrc == 0 {
		if atEOF {
			return 0
		}
		return -1
	}
	fd := formTable[f]
	info := fd.info(src, 0)
	if info.size == 0 {
		if atEOF {
			return 1
		}
		return -1
	}
	ss := streamSafe(0)
	ss.first(info)

	for i := int(info.size); i < nsrc; i += int(info.size) {
		info = fd.info(src, i)
		if info.size == 0 {
			if atEOF {
				return i
			}
			return -1
		}
		// TODO: Using streamSafe to determine the boundary isn't the same as
		// using BoundaryBefore. Determine which should be used.
		if s := ss.next(info); s != ssSuccess {
			return i
		}
	}
	if !atEOF && !info.BoundaryAfter() && !ss.isMax() {
		return -1
	}
	return nsrc
}

// LastBoundary returns the position i of the last boundary in b
// or -1 if b contains no boundary.
func (f Form) LastBoundary(b []byte) int {
	return lastBoundary(formTable[f], b)
}

func lastBoundary(fd *formInfo, b []byte) int {
	i := len(b)
	info, p := lastRuneStart(fd, b)
	if p == -1 {
		return -1
	}
	if info.size == 0 { // ends with incomplete rune
		if p == 0 { // starts with incomplete rune
			return -1
		}
		i = p
		info, p = lastRuneStart(fd, b[:i])
		if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter
			return i
		}
	}
	if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
		return i
	}
	if info.BoundaryAfter() {
		return i
	}
	ss := streamSafe(0)
	v := ss.backwards(info)
	for i = p; i >= 0 && v != ssStarter; i = p {
		info, p = lastRuneStart(fd, b[:i])
		if v = ss.backwards(info); v == ssOverflow {
			break
		}
		if p+int(info.size) != i {
			if p == -1 { // no boundary found
				return -1
			}
			return i // boundary after an illegal UTF-8 encoding
		}
	}
	return i
}

// decomposeSegment scans the first segment in src into rb. It inserts 0x034f
// (Grapheme Joiner) when it encounters a sequence of more than 30 non-starters
// and returns the number of bytes consumed from src or iShortDst or iShortSrc.
func decomposeSegment(rb *reorderBuffer, sp int, atEOF bool) int {
	// Force one character to be consumed.
	info := rb.f.info(rb.src, sp)
	if info.size == 0 {
		return 0
	}
	if s := rb.ss.next(info); s == ssStarter {
		// TODO: this could be removed if we don't support merging.
		if rb.nrune > 0 {
			goto end
		}
	} else if s == ssOverflow {
		rb.insertCGJ()
		goto end
	}
	if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
		return int(err)
	}
	for {
		sp += int(info.size)
		if sp >= rb.nsrc {
			if !atEOF && !info.BoundaryAfter() {
				return int(iShortSrc)
			}
			break
		}
		info = rb.f.info(rb.src, sp)
		if info.size == 0 {
			if !atEOF {
				return int(iShortSrc)
			}
			break
		}
		if s := rb.ss.next(info); s == ssStarter {
			break
		} else if s == ssOverflow {
			rb.insertCGJ()
			break
		}
		if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
			return int(err)
		}
	}
end:
	if !rb.doFlush() {
		return int(iShortDst)
	}
	return sp
}

// lastRuneStart returns the runeInfo and position of the last
// rune in buf or the zero runeInfo and -1 if no rune was found.
func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) {
	p := len(buf) - 1
	for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
	}
	if p < 0 {
		return Properties{}, -1
	}
	return fd.info(inputBytes(buf), p), p
}

// decomposeToLastBoundary finds an open segment at the end of the buffer
// and scans it into rb. Returns the buffer minus the last segment.
func decomposeToLastBoundary(rb *reorderBuffer) {
	fd := &rb.f
	info, i := lastRuneStart(fd, rb.out)
	if int(info.size) != len(rb.out)-i {
		// illegal trailing continuation bytes
		return
	}
	if info.BoundaryAfter() {
		return
	}
	var add [maxNonStarters + 1]Properties // stores runeInfo in reverse order
	padd := 0
	ss := streamSafe(0)
	p := len(rb.out)
	for {
		add[padd] = info
		v := ss.backwards(info)
		if v == ssOverflow {
			// Note that if we have an overflow, it the string we are appending to
			// is not correctly normalized. In this case the behavior is undefined.
			break
		}
		padd++
		p -= int(info.size)
		if v == ssStarter || p < 0 {
			break
		}
		info, i = lastRuneStart(fd, rb.out[:p])
		if int(info.size) != p-i {
			break
		}
	}
	rb.ss = ss
	// Copy bytes for insertion as we may need to overwrite rb.out.
	var buf [maxBufferSize * utf8.UTFMax]byte
	cp := buf[:copy(buf[:], rb.out[p:])]
	rb.out = rb.out[:p]
	for padd--; padd >= 0; padd-- {
		info = add[padd]
		rb.insertUnsafe(inputBytes(cp), 0, info)
		cp = cp[info.size:]
	}
}

相关信息

go 源码目录

相关文章

go composition 源码

go forminfo 源码

go input 源码

go iter 源码

go readwriter 源码

go tables10.0.0 源码

go tables11.0.0 源码

go tables12.0.0 源码

go tables13.0.0 源码

go tables9.0.0 源码

0  赞