diff options
Diffstat (limited to 'gcc-4.8.1/libgo/go/exp/norm/normalize.go')
-rw-r--r-- | gcc-4.8.1/libgo/go/exp/norm/normalize.go | 478 |
1 files changed, 0 insertions, 478 deletions
diff --git a/gcc-4.8.1/libgo/go/exp/norm/normalize.go b/gcc-4.8.1/libgo/go/exp/norm/normalize.go deleted file mode 100644 index 1c3e49b77..000000000 --- a/gcc-4.8.1/libgo/go/exp/norm/normalize.go +++ /dev/null @@ -1,478 +0,0 @@ -// Copyright 2011 The Go Authors. All rights reserved. -// Use of this source code is governed by a BSD-style -// license that can be found in the LICENSE file. - -// Package norm contains types and functions for normalizing Unicode strings. -package norm - -import "unicode/utf8" - -// A Form denotes a canonical representation of Unicode code points. -// The Unicode-defined normalization and equivalence forms are: -// -// NFC Unicode Normalization Form C -// NFD Unicode Normalization Form D -// NFKC Unicode Normalization Form KC -// NFKD Unicode Normalization Form KD -// -// For a Form f, this documentation uses the notation f(x) to mean -// the bytes or string x converted to the given form. -// A position n in x is called a boundary if conversion to the form can -// proceed independently on both sides: -// f(x) == append(f(x[0:n]), f(x[n:])...) -// -// References: http://unicode.org/reports/tr15/ and -// http://unicode.org/notes/tn5/. -type Form int - -const ( - NFC Form = iota - NFD - NFKC - NFKD -) - -// Bytes returns f(b). May return b if f(b) = b. -func (f Form) Bytes(b []byte) []byte { - rb := reorderBuffer{} - rb.init(f, b) - n := quickSpan(&rb, 0) - if n == len(b) { - return b - } - out := make([]byte, n, len(b)) - copy(out, b[0:n]) - return doAppend(&rb, out, n) -} - -// String returns f(s). -func (f Form) String(s string) string { - rb := reorderBuffer{} - rb.initString(f, s) - n := quickSpan(&rb, 0) - if n == len(s) { - return s - } - out := make([]byte, n, len(s)) - copy(out, s[0:n]) - return string(doAppend(&rb, out, n)) -} - -// IsNormal returns true if b == f(b). 
func (f Form) IsNormal(b []byte) bool {
	// Fast path: quickSpan reports how far the input can be accepted
	// without normalizing; if that is all of b, b is already normal.
	rb := reorderBuffer{}
	rb.init(f, b)
	bp := quickSpan(&rb, 0)
	if bp == len(b) {
		return true
	}
	// Slow path: normalize one segment at a time into rb and compare the
	// result byte-for-byte with b, starting at offset bp.
	for bp < len(b) {
		// The return value is not needed here: bp is advanced by the
		// comparison loop below.
		decomposeSegment(&rb, bp)
		if rb.f.composing {
			rb.compose()
		}
		for i := 0; i < rb.nrune; i++ {
			info := rb.rune[i]
			// The normalized rune would extend past the end of b, so b
			// cannot be equal to its normalized form.
			if bp+int(info.size) > len(b) {
				return false
			}
			// info.pos and info.size delimit this rune's bytes in rb.byte.
			p := info.pos
			pe := p + info.size
			for ; p < pe; p++ {
				if b[bp] != rb.byte[p] {
					return false
				}
				bp++
			}
		}
		// Presumably empties rb for the next segment -- TODO confirm
		// against reorderBuffer.reset.
		rb.reset()
		// Skip the next stretch that quickSpan accepts as is.
		bp = quickSpan(&rb, bp)
	}
	return true
}

// IsNormalString returns true if s == f(s).
// It is the string counterpart of IsNormal and follows the same steps.
func (f Form) IsNormalString(s string) bool {
	// Fast path: quickSpan accepts the whole string.
	rb := reorderBuffer{}
	rb.initString(f, s)
	bp := quickSpan(&rb, 0)
	if bp == len(s) {
		return true
	}
	// Slow path: normalize segment by segment and compare with s at bp.
	for bp < len(s) {
		// The return value is not needed here: bp is advanced by the
		// comparison loop below.
		decomposeSegment(&rb, bp)
		if rb.f.composing {
			rb.compose()
		}
		for i := 0; i < rb.nrune; i++ {
			info := rb.rune[i]
			// The normalized rune would extend past the end of s.
			if bp+int(info.size) > len(s) {
				return false
			}
			// info.pos and info.size delimit this rune's bytes in rb.byte.
			p := info.pos
			pe := p + info.size
			for ; p < pe; p++ {
				if s[bp] != rb.byte[p] {
					return false
				}
				bp++
			}
		}
		// Presumably empties rb for the next segment -- TODO confirm
		// against reorderBuffer.reset.
		rb.reset()
		// Skip the next stretch that quickSpan accepts as is.
		bp = quickSpan(&rb, bp)
	}
	return true
}

// patchTail fixes a case where a rune may be incorrectly normalized
// if it is followed by illegal continuation bytes. It returns the
// patched buffer and whether there were trailing continuation bytes.
func patchTail(rb *reorderBuffer, buf []byte) ([]byte, bool) {
	info, p := lastRuneStart(&rb.f, buf)
	// No rune start in buf, or the last rune has size 0: nothing to patch.
	if p == -1 || info.size == 0 {
		return buf, false
	}
	// end is the offset just past the last rune; extra counts the bytes
	// that follow it.
	end := p + int(info.size)
	extra := len(buf) - end
	if extra > 0 {
		// Potentially allocating memory. However, this only
		// happens with ill-formed UTF-8.
		// The trailing bytes are copied out first because buf is
		// reassigned by the calls below.
		x := make([]byte, 0)
		x = append(x, buf[len(buf)-extra:]...)
		// Re-normalize the part of buf that ends at the last full rune,
		// then put the saved trailing bytes back behind it.
		buf = decomposeToLastBoundary(rb, buf[:end])
		if rb.f.composing {
			rb.compose()
		}
		buf = rb.flush(buf)
		return append(buf, x...), true
	}
	return buf, false
}

// appendQuick passes the range of rb.src from i up to the boundary returned
// by quickSpan(rb, i) to rb.src.appendSlice, and returns the resulting slice
// together with that boundary. If i already equals rb.nsrc, dst and i are
// returned unchanged.
func appendQuick(rb *reorderBuffer, dst []byte, i int) ([]byte, int) {
	if rb.nsrc == i {
		return dst, i
	}
	end := quickSpan(rb, i)
	return rb.src.appendSlice(dst, i, end), end
}

// Append returns f(append(out, src...)).
// The buffer out must be nil, empty, or equal to f(out).
func (f Form) Append(out []byte, src ...byte) []byte {
	// Nothing to append: out is returned untouched.
	if len(src) == 0 {
		return out
	}
	rb := reorderBuffer{}
	rb.init(f, src)
	return doAppend(&rb, out, 0)
}

// doAppend appends the normalized form of rb.src, starting at offset p, to
// out and returns the extended slice. When out is non-empty, its tail may
// first be pulled back into rb (via decomposeToLastBoundary) so that it is
// normalized together with the start of the source.
func doAppend(rb *reorderBuffer, out []byte, p int) []byte {
	src, n := rb.src, rb.nsrc
	// Merging with the existing contents is only considered when there
	// are existing contents.
	doMerge := len(out) > 0
	if q := src.skipNonStarter(p); q > p {
		// Move leading non-starters to destination.
		out = src.appendSlice(out, p, q)
		buf, endsInError := patchTail(rb, out)
		if endsInError {
			out = buf
			doMerge = false // no need to merge, ends with illegal UTF-8
		} else {
			out = decomposeToLastBoundary(rb, buf) // force decomposition
		}
		p = q
	}
	fd := &rb.f
	if doMerge {
		// info stays the zero Properties (size 0) when p is at the end
		// of the source.
		var info Properties
		if p < n {
			info = fd.info(src, p)
			// Only at the very start of the source: the first rune does
			// not begin a new segment, so reopen the tail of out.
			if p == 0 && !info.BoundaryBefore() {
				out = decomposeToLastBoundary(rb, out)
			}
		}
		// Whatever is pending in rb is complete: finish and emit it.
		if info.size == 0 || info.BoundaryBefore() {
			if fd.composing {
				rb.compose()
			}
			out = rb.flush(out)
			if info.size == 0 {
				// Append incomplete UTF-8 encoding.
				return src.appendSlice(out, p, n)
			}
		}
	}
	// With nothing pending in rb, the quick path can be taken right away.
	if rb.nrune == 0 {
		out, p = appendQuick(rb, out, p)
	}
	// Alternate between normalizing one segment through rb and copying
	// the following stretch accepted by quickSpan.
	for p < n {
		p = decomposeSegment(rb, p)
		if fd.composing {
			rb.compose()
		}
		out = rb.flush(out)
		out, p = appendQuick(rb, out, p)
	}
	return out
}

// AppendString returns f(append(out, []byte(src)...)).
// The buffer out must be nil, empty, or equal to f(out).
-func (f Form) AppendString(out []byte, src string) []byte { - if len(src) == 0 { - return out - } - rb := reorderBuffer{} - rb.initString(f, src) - return doAppend(&rb, out, 0) -} - -// QuickSpan returns a boundary n such that b[0:n] == f(b[0:n]). -// It is not guaranteed to return the largest such n. -func (f Form) QuickSpan(b []byte) int { - rb := reorderBuffer{} - rb.init(f, b) - n := quickSpan(&rb, 0) - return n -} - -func quickSpan(rb *reorderBuffer, i int) int { - var lastCC uint8 - var nc int - lastSegStart := i - src, n := rb.src, rb.nsrc - for i < n { - if j := src.skipASCII(i, n); i != j { - i = j - lastSegStart = i - 1 - lastCC = 0 - nc = 0 - continue - } - info := rb.f.info(src, i) - if info.size == 0 { - // include incomplete runes - return n - } - cc := info.ccc - if rb.f.composing { - if !info.isYesC() { - break - } - } else { - if !info.isYesD() { - break - } - } - if cc == 0 { - lastSegStart = i - nc = 0 - } else { - if nc >= maxCombiningChars { - lastSegStart = i - lastCC = cc - nc = 1 - } else { - if lastCC > cc { - return lastSegStart - } - nc++ - } - } - lastCC = cc - i += int(info.size) - } - if i == n { - return n - } - if rb.f.composing { - return lastSegStart - } - return i -} - -// QuickSpanString returns a boundary n such that b[0:n] == f(s[0:n]). -// It is not guaranteed to return the largest such n. -func (f Form) QuickSpanString(s string) int { - rb := reorderBuffer{} - rb.initString(f, s) - return quickSpan(&rb, 0) -} - -// FirstBoundary returns the position i of the first boundary in b -// or -1 if b contains no boundary. 
func (f Form) FirstBoundary(b []byte) int {
	// Wrap b in a reorderBuffer and delegate to the shared implementation.
	rb := reorderBuffer{}
	rb.init(f, b)
	return firstBoundary(&rb)
}

// firstBoundary returns the offset of the first boundary in rb.src, or -1
// if none is found. It is shared by FirstBoundary and FirstBoundaryInString.
func firstBoundary(rb *reorderBuffer) int {
	src, nsrc := rb.src, rb.nsrc
	i := src.skipNonStarter(0)
	// Nothing is left after skipNonStarter: no boundary.
	if i >= nsrc {
		return -1
	}
	fd := &rb.f
	info := fd.info(src, i)
	// Step over runes until one reports a boundary before it or has
	// size 0; n counts the runes stepped over.
	for n := 0; info.size != 0 && !info.BoundaryBefore(); {
		i += int(info.size)
		// After maxCombiningChars runes the current position is
		// returned as the boundary.
		if n++; n >= maxCombiningChars {
			return i
		}
		// End of input reached: the end only counts as a boundary if
		// the last rune reports a boundary after it.
		if i >= nsrc {
			if !info.BoundaryAfter() {
				return -1
			}
			return nsrc
		}
		info = fd.info(src, i)
	}
	// The scan stopped on a rune of size 0: no boundary is reported.
	if info.size == 0 {
		return -1
	}
	return i
}

// FirstBoundaryInString returns the position i of the first boundary in s
// or -1 if s contains no boundary.
func (f Form) FirstBoundaryInString(s string) int {
	// Wrap s in a reorderBuffer and delegate to the shared implementation.
	rb := reorderBuffer{}
	rb.initString(f, s)
	return firstBoundary(&rb)
}

// LastBoundary returns the position i of the last boundary in b
// or -1 if b contains no boundary.
func (f Form) LastBoundary(b []byte) int {
	// formTable is declared outside this file's visible part; it is
	// indexed by the Form to obtain the *formInfo lastBoundary expects.
	return lastBoundary(formTable[f], b)
}

// lastBoundary returns the offset of the last boundary in b for the form
// described by fd, or -1 if b contains no boundary. It scans backwards from
// the end of b using lastRuneStart.
func lastBoundary(fd *formInfo, b []byte) int {
	i := len(b)
	info, p := lastRuneStart(fd, b)
	// No rune start anywhere in b.
	if p == -1 {
		return -1
	}
	if info.size == 0 { // ends with incomplete rune
		if p == 0 { // starts with incomplete rune
			return -1
		}
		// Continue with the part of b before the incomplete rune.
		i = p
		info, p = lastRuneStart(fd, b[:i])
		if p == -1 { // incomplete UTF-8 encoding or non-starter bytes without a starter
			return i
		}
	}
	if p+int(info.size) != i { // trailing non-starter bytes: illegal UTF-8
		return i
	}
	if info.BoundaryAfter() {
		return i
	}
	// Walk backwards rune by rune until one reports a boundary before
	// it; n counts the runes visited.
	i = p
	for n := 0; i >= 0 && !info.BoundaryBefore(); {
		info, p = lastRuneStart(fd, b[:i])
		// After maxCombiningChars runes the end of b is returned as
		// the boundary.
		if n++; n >= maxCombiningChars {
			return len(b)
		}
		// The rune found does not end exactly at i.
		if p+int(info.size) != i {
			if p == -1 { // no boundary found
				return -1
			}
			return i // boundary after an illegal UTF-8 encoding
		}
		i = p
	}
	return i
}

// decomposeSegment scans the first segment in src, starting at sp, into rb.
// It returns the position in src just past the scanned part, or 0 if the
// rune at sp has size 0.
// TODO(mpvl): consider inserting U+034f (Combining Grapheme Joiner)
// when we detect a sequence of 30+ non-starter chars.
func decomposeSegment(rb *reorderBuffer, sp int) int {
	// Force one character to be consumed.
	info := rb.f.info(rb.src, sp)
	if info.size == 0 {
		return 0
	}
	// Keep inserting runes for as long as rb.insert reports true
	// (presumably false once rb is full -- TODO confirm), stopping at the
	// end of the source or at the next rune that starts a new segment.
	for rb.insert(rb.src, sp, info) {
		sp += int(info.size)
		if sp >= rb.nsrc {
			break
		}
		info = rb.f.info(rb.src, sp)
		bound := info.BoundaryBefore()
		// A boundary or a rune of size 0 ends the segment; that rune is
		// not inserted.
		if bound || info.size == 0 {
			break
		}
	}
	return sp
}

// lastRuneStart returns the Properties and position of the last
// rune in buf or the zero Properties and -1 if no rune was found.
func lastRuneStart(fd *formInfo, buf []byte) (Properties, int) {
	// Walk backwards to the last byte that can start a UTF-8 encoding.
	p := len(buf) - 1
	for ; p >= 0 && !utf8.RuneStart(buf[p]); p-- {
	}
	if p < 0 {
		return Properties{}, -1
	}
	return fd.info(inputBytes(buf), p), p
}

// decomposeToLastBoundary finds an open segment at the end of the buffer
// and scans it into rb. Returns the buffer minus the last segment.
// If buf ends in bytes that do not form a complete rune, or its last rune
// reports a boundary after it, buf is returned unchanged and nothing is
// inserted into rb.
func decomposeToLastBoundary(rb *reorderBuffer, buf []byte) []byte {
	fd := &rb.f
	info, i := lastRuneStart(fd, buf)
	// This also covers the case where no rune start was found (i == -1).
	if int(info.size) != len(buf)-i {
		// illegal trailing continuation bytes
		return buf
	}
	if info.BoundaryAfter() {
		return buf
	}
	var add [maxBackRunes]Properties // stores Properties in reverse order
	add[0] = info
	padd := 1 // number of entries used in add
	n := 1    // number of runes the collected segment would put into rb
	p := len(buf) - int(info.size)
	// Walk backwards, collecting runes until one reports a boundary
	// before it; p is the start of the earliest rune collected so far.
	for ; p >= 0 && !info.BoundaryBefore(); p -= int(info.size) {
		info, i = lastRuneStart(fd, buf[:p])
		// The rune found does not end exactly at p (or none was found).
		if int(info.size) != p-i {
			break
		}
		// Check that decomposition doesn't result in overflow.
		if info.hasDecomposition() {
			// NOTE(review): isHangul is given the whole buffer rather
			// than the rune at i -- confirm this is intended.
			if isHangul(buf) {
				// NOTE(review): this update of i is never read; i is
				// reassigned at the top of the next iteration.
				i += int(info.size)
				n++
			} else {
				// Count one rune per rune in the decomposition.
				dcomp := info.Decomposition()
				for i := 0; i < len(dcomp); {
					inf := rb.f.info(inputBytes(dcomp), i)
					i += int(inf.size)
					n++
				}
			}
		} else {
			n++
		}
		// Stop before collecting more than maxBackRunes runes.
		if n > maxBackRunes {
			break
		}
		add[padd] = info
		padd++
	}
	// Insert the collected runes into rb in source order, starting at p.
	pp := p
	for padd--; padd >= 0; padd-- {
		info = add[padd]
		rb.insert(inputBytes(buf), pp, info)
		pp += int(info.size)
	}
	return buf[:p]
}