go iter 源码

  • 2022-07-15
  • 浏览 (1117)

golang iter 代码

文件路径:/src/vendor/golang.org/x/text/unicode/norm/iter.go

// Copyright 2011 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.

package norm

import (
	"fmt"
	"unicode/utf8"
)

// MaxSegmentSize is the maximum size of a byte buffer needed to consider any
// sequence of starter and non-starter runes for the purpose of normalization.
const MaxSegmentSize = maxByteBufferSize

// An Iter iterates over a string or byte slice, while normalizing it
// to a given Form.
type Iter struct {
	rb     reorderBuffer
	buf    [maxByteBufferSize]byte
	info   Properties // first character saved from previous iteration
	next   iterFunc   // implementation of next depends on form
	asciiF iterFunc

	p        int    // current position in input source
	multiSeg []byte // remainder of multi-segment decomposition
}

type iterFunc func(*Iter) []byte

// Init initializes i to iterate over src after normalizing it to Form f.
func (i *Iter) Init(f Form, src []byte) {
	i.p = 0
	if len(src) == 0 {
		i.setDone()
		i.rb.nsrc = 0
		return
	}
	i.multiSeg = nil
	i.rb.init(f, src)
	i.next = i.rb.f.nextMain
	i.asciiF = nextASCIIBytes
	i.info = i.rb.f.info(i.rb.src, i.p)
	i.rb.ss.first(i.info)
}

// InitString initializes i to iterate over src after normalizing it to Form f.
func (i *Iter) InitString(f Form, src string) {
	i.p = 0
	if len(src) == 0 {
		i.setDone()
		i.rb.nsrc = 0
		return
	}
	i.multiSeg = nil
	i.rb.initString(f, src)
	i.next = i.rb.f.nextMain
	i.asciiF = nextASCIIString
	i.info = i.rb.f.info(i.rb.src, i.p)
	i.rb.ss.first(i.info)
}

// Seek sets the segment to be returned by the next call to Next to start
// at position p.  It is the responsibility of the caller to set p to the
// start of a segment.
func (i *Iter) Seek(offset int64, whence int) (int64, error) {
	var abs int64
	switch whence {
	case 0:
		abs = offset
	case 1:
		abs = int64(i.p) + offset
	case 2:
		abs = int64(i.rb.nsrc) + offset
	default:
		return 0, fmt.Errorf("norm: invalid whence")
	}
	if abs < 0 {
		return 0, fmt.Errorf("norm: negative position")
	}
	if int(abs) >= i.rb.nsrc {
		i.setDone()
		return int64(i.p), nil
	}
	i.p = int(abs)
	i.multiSeg = nil
	i.next = i.rb.f.nextMain
	i.info = i.rb.f.info(i.rb.src, i.p)
	i.rb.ss.first(i.info)
	return abs, nil
}

// returnSlice returns a slice of the underlying input type as a byte slice.
// If the underlying is of type []byte, it will simply return a slice.
// If the underlying is of type string, it will copy the slice to the buffer
// and return that.
func (i *Iter) returnSlice(a, b int) []byte {
	if i.rb.src.bytes == nil {
		return i.buf[:copy(i.buf[:], i.rb.src.str[a:b])]
	}
	return i.rb.src.bytes[a:b]
}

// Pos returns the byte position at which the next call to Next will commence processing.
func (i *Iter) Pos() int {
	return i.p
}

func (i *Iter) setDone() {
	i.next = nextDone
	i.p = i.rb.nsrc
}

// Done returns true if there is no more input to process.
func (i *Iter) Done() bool {
	return i.p >= i.rb.nsrc
}

// Next returns f(i.input[i.Pos():n]), where n is a boundary of i.input.
// For any input a and b for which f(a) == f(b), subsequent calls
// to Next will return the same segments.
// Modifying runes are grouped together with the preceding starter, if such a starter exists.
// Although not guaranteed, n will typically be the smallest possible n.
func (i *Iter) Next() []byte {
	return i.next(i)
}

func nextASCIIBytes(i *Iter) []byte {
	p := i.p + 1
	if p >= i.rb.nsrc {
		p0 := i.p
		i.setDone()
		return i.rb.src.bytes[p0:p]
	}
	if i.rb.src.bytes[p] < utf8.RuneSelf {
		p0 := i.p
		i.p = p
		return i.rb.src.bytes[p0:p]
	}
	i.info = i.rb.f.info(i.rb.src, i.p)
	i.next = i.rb.f.nextMain
	return i.next(i)
}

func nextASCIIString(i *Iter) []byte {
	p := i.p + 1
	if p >= i.rb.nsrc {
		i.buf[0] = i.rb.src.str[i.p]
		i.setDone()
		return i.buf[:1]
	}
	if i.rb.src.str[p] < utf8.RuneSelf {
		i.buf[0] = i.rb.src.str[i.p]
		i.p = p
		return i.buf[:1]
	}
	i.info = i.rb.f.info(i.rb.src, i.p)
	i.next = i.rb.f.nextMain
	return i.next(i)
}

func nextHangul(i *Iter) []byte {
	p := i.p
	next := p + hangulUTF8Size
	if next >= i.rb.nsrc {
		i.setDone()
	} else if i.rb.src.hangul(next) == 0 {
		i.rb.ss.next(i.info)
		i.info = i.rb.f.info(i.rb.src, i.p)
		i.next = i.rb.f.nextMain
		return i.next(i)
	}
	i.p = next
	return i.buf[:decomposeHangul(i.buf[:], i.rb.src.hangul(p))]
}

func nextDone(i *Iter) []byte {
	return nil
}

// nextMulti is used for iterating over multi-segment decompositions
// for decomposing normal forms.
func nextMulti(i *Iter) []byte {
	j := 0
	d := i.multiSeg
	// skip first rune
	for j = 1; j < len(d) && !utf8.RuneStart(d[j]); j++ {
	}
	for j < len(d) {
		info := i.rb.f.info(input{bytes: d}, j)
		if info.BoundaryBefore() {
			i.multiSeg = d[j:]
			return d[:j]
		}
		j += int(info.size)
	}
	// treat last segment as normal decomposition
	i.next = i.rb.f.nextMain
	return i.next(i)
}

// nextMultiNorm is used for iterating over multi-segment decompositions
// for composing normal forms.
func nextMultiNorm(i *Iter) []byte {
	j := 0
	d := i.multiSeg
	for j < len(d) {
		info := i.rb.f.info(input{bytes: d}, j)
		if info.BoundaryBefore() {
			i.rb.compose()
			seg := i.buf[:i.rb.flushCopy(i.buf[:])]
			i.rb.insertUnsafe(input{bytes: d}, j, info)
			i.multiSeg = d[j+int(info.size):]
			return seg
		}
		i.rb.insertUnsafe(input{bytes: d}, j, info)
		j += int(info.size)
	}
	i.multiSeg = nil
	i.next = nextComposed
	return doNormComposed(i)
}

// nextDecomposed is the implementation of Next for forms NFD and NFKD.
func nextDecomposed(i *Iter) (next []byte) {
	outp := 0
	inCopyStart, outCopyStart := i.p, 0
	for {
		if sz := int(i.info.size); sz <= 1 {
			i.rb.ss = 0
			p := i.p
			i.p++ // ASCII or illegal byte.  Either way, advance by 1.
			if i.p >= i.rb.nsrc {
				i.setDone()
				return i.returnSlice(p, i.p)
			} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
				i.next = i.asciiF
				return i.returnSlice(p, i.p)
			}
			outp++
		} else if d := i.info.Decomposition(); d != nil {
			// Note: If leading CCC != 0, then len(d) == 2 and last is also non-zero.
			// Case 1: there is a leftover to copy.  In this case the decomposition
			// must begin with a modifier and should always be appended.
			// Case 2: no leftover. Simply return d if followed by a ccc == 0 value.
			p := outp + len(d)
			if outp > 0 {
				i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
				// TODO: this condition should not be possible, but we leave it
				// in for defensive purposes.
				if p > len(i.buf) {
					return i.buf[:outp]
				}
			} else if i.info.multiSegment() {
				// outp must be 0 as multi-segment decompositions always
				// start a new segment.
				if i.multiSeg == nil {
					i.multiSeg = d
					i.next = nextMulti
					return nextMulti(i)
				}
				// We are in the last segment.  Treat as normal decomposition.
				d = i.multiSeg
				i.multiSeg = nil
				p = len(d)
			}
			prevCC := i.info.tccc
			if i.p += sz; i.p >= i.rb.nsrc {
				i.setDone()
				i.info = Properties{} // Force BoundaryBefore to succeed.
			} else {
				i.info = i.rb.f.info(i.rb.src, i.p)
			}
			switch i.rb.ss.next(i.info) {
			case ssOverflow:
				i.next = nextCGJDecompose
				fallthrough
			case ssStarter:
				if outp > 0 {
					copy(i.buf[outp:], d)
					return i.buf[:p]
				}
				return d
			}
			copy(i.buf[outp:], d)
			outp = p
			inCopyStart, outCopyStart = i.p, outp
			if i.info.ccc < prevCC {
				goto doNorm
			}
			continue
		} else if r := i.rb.src.hangul(i.p); r != 0 {
			outp = decomposeHangul(i.buf[:], r)
			i.p += hangulUTF8Size
			inCopyStart, outCopyStart = i.p, outp
			if i.p >= i.rb.nsrc {
				i.setDone()
				break
			} else if i.rb.src.hangul(i.p) != 0 {
				i.next = nextHangul
				return i.buf[:outp]
			}
		} else {
			p := outp + sz
			if p > len(i.buf) {
				break
			}
			outp = p
			i.p += sz
		}
		if i.p >= i.rb.nsrc {
			i.setDone()
			break
		}
		prevCC := i.info.tccc
		i.info = i.rb.f.info(i.rb.src, i.p)
		if v := i.rb.ss.next(i.info); v == ssStarter {
			break
		} else if v == ssOverflow {
			i.next = nextCGJDecompose
			break
		}
		if i.info.ccc < prevCC {
			goto doNorm
		}
	}
	if outCopyStart == 0 {
		return i.returnSlice(inCopyStart, i.p)
	} else if inCopyStart < i.p {
		i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
	}
	return i.buf[:outp]
doNorm:
	// Insert what we have decomposed so far in the reorderBuffer.
	// As we will only reorder, there will always be enough room.
	i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
	i.rb.insertDecomposed(i.buf[0:outp])
	return doNormDecomposed(i)
}

func doNormDecomposed(i *Iter) []byte {
	for {
		i.rb.insertUnsafe(i.rb.src, i.p, i.info)
		if i.p += int(i.info.size); i.p >= i.rb.nsrc {
			i.setDone()
			break
		}
		i.info = i.rb.f.info(i.rb.src, i.p)
		if i.info.ccc == 0 {
			break
		}
		if s := i.rb.ss.next(i.info); s == ssOverflow {
			i.next = nextCGJDecompose
			break
		}
	}
	// new segment or too many combining characters: exit normalization
	return i.buf[:i.rb.flushCopy(i.buf[:])]
}

func nextCGJDecompose(i *Iter) []byte {
	i.rb.ss = 0
	i.rb.insertCGJ()
	i.next = nextDecomposed
	i.rb.ss.first(i.info)
	buf := doNormDecomposed(i)
	return buf
}

// nextComposed is the implementation of Next for forms NFC and NFKC.
func nextComposed(i *Iter) []byte {
	outp, startp := 0, i.p
	var prevCC uint8
	for {
		if !i.info.isYesC() {
			goto doNorm
		}
		prevCC = i.info.tccc
		sz := int(i.info.size)
		if sz == 0 {
			sz = 1 // illegal rune: copy byte-by-byte
		}
		p := outp + sz
		if p > len(i.buf) {
			break
		}
		outp = p
		i.p += sz
		if i.p >= i.rb.nsrc {
			i.setDone()
			break
		} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
			i.rb.ss = 0
			i.next = i.asciiF
			break
		}
		i.info = i.rb.f.info(i.rb.src, i.p)
		if v := i.rb.ss.next(i.info); v == ssStarter {
			break
		} else if v == ssOverflow {
			i.next = nextCGJCompose
			break
		}
		if i.info.ccc < prevCC {
			goto doNorm
		}
	}
	return i.returnSlice(startp, i.p)
doNorm:
	// reset to start position
	i.p = startp
	i.info = i.rb.f.info(i.rb.src, i.p)
	i.rb.ss.first(i.info)
	if i.info.multiSegment() {
		d := i.info.Decomposition()
		info := i.rb.f.info(input{bytes: d}, 0)
		i.rb.insertUnsafe(input{bytes: d}, 0, info)
		i.multiSeg = d[int(info.size):]
		i.next = nextMultiNorm
		return nextMultiNorm(i)
	}
	i.rb.ss.first(i.info)
	i.rb.insertUnsafe(i.rb.src, i.p, i.info)
	return doNormComposed(i)
}

func doNormComposed(i *Iter) []byte {
	// First rune should already be inserted.
	for {
		if i.p += int(i.info.size); i.p >= i.rb.nsrc {
			i.setDone()
			break
		}
		i.info = i.rb.f.info(i.rb.src, i.p)
		if s := i.rb.ss.next(i.info); s == ssStarter {
			break
		} else if s == ssOverflow {
			i.next = nextCGJCompose
			break
		}
		i.rb.insertUnsafe(i.rb.src, i.p, i.info)
	}
	i.rb.compose()
	seg := i.buf[:i.rb.flushCopy(i.buf[:])]
	return seg
}

func nextCGJCompose(i *Iter) []byte {
	i.rb.ss = 0 // instead of first
	i.rb.insertCGJ()
	i.next = nextComposed
	// Note that we treat any rune with nLeadingNonStarters > 0 as a non-starter,
	// even if they are not. This is particularly dubious for U+FF9E and UFF9A.
	// If we ever change that, insert a check here.
	i.rb.ss.first(i.info)
	i.rb.insertUnsafe(i.rb.src, i.p, i.info)
	return doNormComposed(i)
}

相关信息

go 源码目录

相关文章

go composition 源码

go forminfo 源码

go input 源码

go normalize 源码

go readwriter 源码

go tables10.0.0 源码

go tables11.0.0 源码

go tables12.0.0 源码

go tables13.0.0 源码

go tables9.0.0 源码

0  赞