vendor: Update everything

GitHub-Pull-Request: https://github.com/syncthing/syncthing/pull/4620
This commit is contained in:
Jakob Borg
2017-12-29 11:38:00 +00:00
parent 1296a22069
commit c24bf7ea55
1070 changed files with 294926 additions and 488191 deletions
+27
View File
@@ -0,0 +1,27 @@
Copyright (c) 2009 The Go Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+249
View File
@@ -0,0 +1,249 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:generate go run maketables.go
// Package charmap provides simple character encodings such as IBM Code Page 437
// and Windows 1252.
package charmap // import "golang.org/x/text/encoding/charmap"
import (
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// These encodings vary only in the way clients should interpret them. Their
// coded character set is identical and a single implementation can be shared.
var (
// ISO8859_6E is the ISO 8859-6E encoding.
ISO8859_6E encoding.Encoding = &iso8859_6E
// ISO8859_6I is the ISO 8859-6I encoding.
ISO8859_6I encoding.Encoding = &iso8859_6I
// ISO8859_8E is the ISO 8859-8E encoding.
ISO8859_8E encoding.Encoding = &iso8859_8E
// ISO8859_8I is the ISO 8859-8I encoding.
ISO8859_8I encoding.Encoding = &iso8859_8I
iso8859_6E = internal.Encoding{
Encoding: ISO8859_6,
Name: "ISO-8859-6E",
MIB: identifier.ISO88596E,
}
iso8859_6I = internal.Encoding{
Encoding: ISO8859_6,
Name: "ISO-8859-6I",
MIB: identifier.ISO88596I,
}
iso8859_8E = internal.Encoding{
Encoding: ISO8859_8,
Name: "ISO-8859-8E",
MIB: identifier.ISO88598E,
}
iso8859_8I = internal.Encoding{
Encoding: ISO8859_8,
Name: "ISO-8859-8I",
MIB: identifier.ISO88598I,
}
)
// All is a list of all defined encodings in this package.
var All []encoding.Encoding = listAll
// TODO: implement these encodings, in order of importance.
// ASCII, ISO8859_1: Rather common. Close to Windows 1252.
// ISO8859_9: Close to Windows 1254.
// utf8Enc holds a rune's UTF-8 encoding in data[:len].
type utf8Enc struct {
len uint8
data [3]byte
}
// Charmap is an 8-bit character set encoding.
type Charmap struct {
// name is the encoding's name.
name string
// mib is the encoding type of this encoder.
mib identifier.MIB
// asciiSuperset states whether the encoding is a superset of ASCII.
asciiSuperset bool
// low is the lower bound of the encoded byte for a non-ASCII rune. If
// Charmap.asciiSuperset is true then this will be 0x80, otherwise 0x00.
low uint8
// replacement is the encoded replacement character.
replacement byte
// decode is the map from encoded byte to UTF-8.
decode [256]utf8Enc
// encoding is the map from runes to encoded bytes. Each entry is a
// uint32: the high 8 bits are the encoded byte and the low 24 bits are
// the rune. The table entries are sorted by ascending rune.
encode [256]uint32
}
// NewDecoder implements the encoding.Encoding interface.
func (m *Charmap) NewDecoder() *encoding.Decoder {
return &encoding.Decoder{Transformer: charmapDecoder{charmap: m}}
}
// NewEncoder implements the encoding.Encoding interface.
func (m *Charmap) NewEncoder() *encoding.Encoder {
return &encoding.Encoder{Transformer: charmapEncoder{charmap: m}}
}
// String returns the Charmap's name.
func (m *Charmap) String() string {
return m.name
}
// ID implements an internal interface.
func (m *Charmap) ID() (mib identifier.MIB, other string) {
return m.mib, ""
}
// charmapDecoder implements transform.Transformer by decoding to UTF-8.
type charmapDecoder struct {
transform.NopResetter
charmap *Charmap
}
func (m charmapDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
for i, c := range src {
if m.charmap.asciiSuperset && c < utf8.RuneSelf {
if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = c
nDst++
nSrc = i + 1
continue
}
decode := &m.charmap.decode[c]
n := int(decode.len)
if nDst+n > len(dst) {
err = transform.ErrShortDst
break
}
// It's 15% faster to avoid calling copy for these tiny slices.
for j := 0; j < n; j++ {
dst[nDst] = decode.data[j]
nDst++
}
nSrc = i + 1
}
return nDst, nSrc, err
}
// DecodeByte returns the Charmap's rune decoding of the byte b.
func (m *Charmap) DecodeByte(b byte) rune {
switch x := &m.decode[b]; x.len {
case 1:
return rune(x.data[0])
case 2:
return rune(x.data[0]&0x1f)<<6 | rune(x.data[1]&0x3f)
default:
return rune(x.data[0]&0x0f)<<12 | rune(x.data[1]&0x3f)<<6 | rune(x.data[2]&0x3f)
}
}
// charmapEncoder implements transform.Transformer by encoding from UTF-8.
type charmapEncoder struct {
transform.NopResetter
charmap *Charmap
}
func (m charmapEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
loop:
for nSrc < len(src) {
if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
if m.charmap.asciiSuperset {
nSrc++
dst[nDst] = uint8(r)
nDst++
continue
}
size = 1
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
} else {
err = internal.RepertoireError(m.charmap.replacement)
}
break
}
}
// Binary search in [low, high) for that rune in the m.charmap.encode table.
for low, high := int(m.charmap.low), 0x100; ; {
if low >= high {
err = internal.RepertoireError(m.charmap.replacement)
break loop
}
mid := (low + high) / 2
got := m.charmap.encode[mid]
gotRune := rune(got & (1<<24 - 1))
if gotRune < r {
low = mid + 1
} else if gotRune > r {
high = mid
} else {
dst[nDst] = byte(got >> 24)
nDst++
break
}
}
nSrc += size
}
return nDst, nSrc, err
}
// EncodeRune returns the Charmap's byte encoding of the rune r. ok is whether
// r is in the Charmap's repertoire. If not, b is set to the Charmap's
// replacement byte. This is often the ASCII substitute character '\x1a'.
func (m *Charmap) EncodeRune(r rune) (b byte, ok bool) {
if r < utf8.RuneSelf && m.asciiSuperset {
return byte(r), true
}
for low, high := int(m.low), 0x100; ; {
if low >= high {
return m.replacement, false
}
mid := (low + high) / 2
got := m.encode[mid]
gotRune := rune(got & (1<<24 - 1))
if gotRune < r {
low = mid + 1
} else if gotRune > r {
high = mid
} else {
return byte(got >> 24), true
}
}
}
+556
View File
@@ -0,0 +1,556 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
import (
"bufio"
"fmt"
"log"
"net/http"
"sort"
"strings"
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/internal/gen"
)
const ascii = "\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f" +
"\x10\x11\x12\x13\x14\x15\x16\x17\x18\x19\x1a\x1b\x1c\x1d\x1e\x1f" +
` !"#$%&'()*+,-./0123456789:;<=>?` +
`@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_` +
"`abcdefghijklmnopqrstuvwxyz{|}~\u007f"
var encodings = []struct {
name string
mib string
comment string
varName string
replacement byte
mapping string
}{
{
"IBM Code Page 037",
"IBM037",
"",
"CodePage037",
0x3f,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM037-2.1.2.ucm",
},
{
"IBM Code Page 437",
"PC8CodePage437",
"",
"CodePage437",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM437-2.1.2.ucm",
},
{
"IBM Code Page 850",
"PC850Multilingual",
"",
"CodePage850",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM850-2.1.2.ucm",
},
{
"IBM Code Page 852",
"PCp852",
"",
"CodePage852",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM852-2.1.2.ucm",
},
{
"IBM Code Page 855",
"IBM855",
"",
"CodePage855",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM855-2.1.2.ucm",
},
{
"Windows Code Page 858", // PC latin1 with Euro
"IBM00858",
"",
"CodePage858",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/windows-858-2000.ucm",
},
{
"IBM Code Page 860",
"IBM860",
"",
"CodePage860",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM860-2.1.2.ucm",
},
{
"IBM Code Page 862",
"PC862LatinHebrew",
"",
"CodePage862",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM862-2.1.2.ucm",
},
{
"IBM Code Page 863",
"IBM863",
"",
"CodePage863",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM863-2.1.2.ucm",
},
{
"IBM Code Page 865",
"IBM865",
"",
"CodePage865",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM865-2.1.2.ucm",
},
{
"IBM Code Page 866",
"IBM866",
"",
"CodePage866",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-ibm866.txt",
},
{
"IBM Code Page 1047",
"IBM1047",
"",
"CodePage1047",
0x3f,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/glibc-IBM1047-2.1.2.ucm",
},
{
"IBM Code Page 1140",
"IBM01140",
"",
"CodePage1140",
0x3f,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/ibm-1140_P100-1997.ucm",
},
{
"ISO 8859-1",
"ISOLatin1",
"",
"ISO8859_1",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/iso-8859_1-1998.ucm",
},
{
"ISO 8859-2",
"ISOLatin2",
"",
"ISO8859_2",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-2.txt",
},
{
"ISO 8859-3",
"ISOLatin3",
"",
"ISO8859_3",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-3.txt",
},
{
"ISO 8859-4",
"ISOLatin4",
"",
"ISO8859_4",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-4.txt",
},
{
"ISO 8859-5",
"ISOLatinCyrillic",
"",
"ISO8859_5",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-5.txt",
},
{
"ISO 8859-6",
"ISOLatinArabic",
"",
"ISO8859_6,ISO8859_6E,ISO8859_6I",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-6.txt",
},
{
"ISO 8859-7",
"ISOLatinGreek",
"",
"ISO8859_7",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-7.txt",
},
{
"ISO 8859-8",
"ISOLatinHebrew",
"",
"ISO8859_8,ISO8859_8E,ISO8859_8I",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-8.txt",
},
{
"ISO 8859-9",
"ISOLatin5",
"",
"ISO8859_9",
encoding.ASCIISub,
"http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/iso-8859_9-1999.ucm",
},
{
"ISO 8859-10",
"ISOLatin6",
"",
"ISO8859_10",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-10.txt",
},
{
"ISO 8859-13",
"ISO885913",
"",
"ISO8859_13",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-13.txt",
},
{
"ISO 8859-14",
"ISO885914",
"",
"ISO8859_14",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-14.txt",
},
{
"ISO 8859-15",
"ISO885915",
"",
"ISO8859_15",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-15.txt",
},
{
"ISO 8859-16",
"ISO885916",
"",
"ISO8859_16",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-iso-8859-16.txt",
},
{
"KOI8-R",
"KOI8R",
"",
"KOI8R",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-koi8-r.txt",
},
{
"KOI8-U",
"KOI8U",
"",
"KOI8U",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-koi8-u.txt",
},
{
"Macintosh",
"Macintosh",
"",
"Macintosh",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-macintosh.txt",
},
{
"Macintosh Cyrillic",
"MacintoshCyrillic",
"",
"MacintoshCyrillic",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-x-mac-cyrillic.txt",
},
{
"Windows 874",
"Windows874",
"",
"Windows874",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-874.txt",
},
{
"Windows 1250",
"Windows1250",
"",
"Windows1250",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1250.txt",
},
{
"Windows 1251",
"Windows1251",
"",
"Windows1251",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1251.txt",
},
{
"Windows 1252",
"Windows1252",
"",
"Windows1252",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1252.txt",
},
{
"Windows 1253",
"Windows1253",
"",
"Windows1253",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1253.txt",
},
{
"Windows 1254",
"Windows1254",
"",
"Windows1254",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1254.txt",
},
{
"Windows 1255",
"Windows1255",
"",
"Windows1255",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1255.txt",
},
{
"Windows 1256",
"Windows1256",
"",
"Windows1256",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1256.txt",
},
{
"Windows 1257",
"Windows1257",
"",
"Windows1257",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1257.txt",
},
{
"Windows 1258",
"Windows1258",
"",
"Windows1258",
encoding.ASCIISub,
"http://encoding.spec.whatwg.org/index-windows-1258.txt",
},
{
"X-User-Defined",
"XUserDefined",
"It is defined at http://encoding.spec.whatwg.org/#x-user-defined",
"XUserDefined",
encoding.ASCIISub,
ascii +
"\uf780\uf781\uf782\uf783\uf784\uf785\uf786\uf787" +
"\uf788\uf789\uf78a\uf78b\uf78c\uf78d\uf78e\uf78f" +
"\uf790\uf791\uf792\uf793\uf794\uf795\uf796\uf797" +
"\uf798\uf799\uf79a\uf79b\uf79c\uf79d\uf79e\uf79f" +
"\uf7a0\uf7a1\uf7a2\uf7a3\uf7a4\uf7a5\uf7a6\uf7a7" +
"\uf7a8\uf7a9\uf7aa\uf7ab\uf7ac\uf7ad\uf7ae\uf7af" +
"\uf7b0\uf7b1\uf7b2\uf7b3\uf7b4\uf7b5\uf7b6\uf7b7" +
"\uf7b8\uf7b9\uf7ba\uf7bb\uf7bc\uf7bd\uf7be\uf7bf" +
"\uf7c0\uf7c1\uf7c2\uf7c3\uf7c4\uf7c5\uf7c6\uf7c7" +
"\uf7c8\uf7c9\uf7ca\uf7cb\uf7cc\uf7cd\uf7ce\uf7cf" +
"\uf7d0\uf7d1\uf7d2\uf7d3\uf7d4\uf7d5\uf7d6\uf7d7" +
"\uf7d8\uf7d9\uf7da\uf7db\uf7dc\uf7dd\uf7de\uf7df" +
"\uf7e0\uf7e1\uf7e2\uf7e3\uf7e4\uf7e5\uf7e6\uf7e7" +
"\uf7e8\uf7e9\uf7ea\uf7eb\uf7ec\uf7ed\uf7ee\uf7ef" +
"\uf7f0\uf7f1\uf7f2\uf7f3\uf7f4\uf7f5\uf7f6\uf7f7" +
"\uf7f8\uf7f9\uf7fa\uf7fb\uf7fc\uf7fd\uf7fe\uf7ff",
},
}
func getWHATWG(url string) string {
res, err := http.Get(url)
if err != nil {
log.Fatalf("%q: Get: %v", url, err)
}
defer res.Body.Close()
mapping := make([]rune, 128)
for i := range mapping {
mapping[i] = '\ufffd'
}
scanner := bufio.NewScanner(res.Body)
for scanner.Scan() {
s := strings.TrimSpace(scanner.Text())
if s == "" || s[0] == '#' {
continue
}
x, y := 0, 0
if _, err := fmt.Sscanf(s, "%d\t0x%x", &x, &y); err != nil {
log.Fatalf("could not parse %q", s)
}
if x < 0 || 128 <= x {
log.Fatalf("code %d is out of range", x)
}
if 0x80 <= y && y < 0xa0 {
// We diverge from the WHATWG spec by mapping control characters
// in the range [0x80, 0xa0) to U+FFFD.
continue
}
mapping[x] = rune(y)
}
return ascii + string(mapping)
}
func getUCM(url string) string {
res, err := http.Get(url)
if err != nil {
log.Fatalf("%q: Get: %v", url, err)
}
defer res.Body.Close()
mapping := make([]rune, 256)
for i := range mapping {
mapping[i] = '\ufffd'
}
charsFound := 0
scanner := bufio.NewScanner(res.Body)
for scanner.Scan() {
s := strings.TrimSpace(scanner.Text())
if s == "" || s[0] == '#' {
continue
}
var c byte
var r rune
if _, err := fmt.Sscanf(s, `<U%x> \x%x |0`, &r, &c); err != nil {
continue
}
mapping[c] = r
charsFound++
}
if charsFound < 200 {
log.Fatalf("%q: only %d characters found (wrong page format?)", url, charsFound)
}
return string(mapping)
}
func main() {
mibs := map[string]bool{}
all := []string{}
w := gen.NewCodeWriter()
defer w.WriteGoFile("tables.go", "charmap")
printf := func(s string, a ...interface{}) { fmt.Fprintf(w, s, a...) }
printf("import (\n")
printf("\t\"golang.org/x/text/encoding\"\n")
printf("\t\"golang.org/x/text/encoding/internal/identifier\"\n")
printf(")\n\n")
for _, e := range encodings {
varNames := strings.Split(e.varName, ",")
all = append(all, varNames...)
varName := varNames[0]
switch {
case strings.HasPrefix(e.mapping, "http://encoding.spec.whatwg.org/"):
e.mapping = getWHATWG(e.mapping)
case strings.HasPrefix(e.mapping, "http://source.icu-project.org/repos/icu/data/trunk/charset/data/ucm/"):
e.mapping = getUCM(e.mapping)
}
asciiSuperset, low := strings.HasPrefix(e.mapping, ascii), 0x00
if asciiSuperset {
low = 0x80
}
lvn := 1
if strings.HasPrefix(varName, "ISO") || strings.HasPrefix(varName, "KOI") {
lvn = 3
}
lowerVarName := strings.ToLower(varName[:lvn]) + varName[lvn:]
printf("// %s is the %s encoding.\n", varName, e.name)
if e.comment != "" {
printf("//\n// %s\n", e.comment)
}
printf("var %s *Charmap = &%s\n\nvar %s = Charmap{\nname: %q,\n",
varName, lowerVarName, lowerVarName, e.name)
if mibs[e.mib] {
log.Fatalf("MIB type %q declared multiple times.", e.mib)
}
printf("mib: identifier.%s,\n", e.mib)
printf("asciiSuperset: %t,\n", asciiSuperset)
printf("low: 0x%02x,\n", low)
printf("replacement: 0x%02x,\n", e.replacement)
printf("decode: [256]utf8Enc{\n")
i, backMapping := 0, map[rune]byte{}
for _, c := range e.mapping {
if _, ok := backMapping[c]; !ok && c != utf8.RuneError {
backMapping[c] = byte(i)
}
var buf [8]byte
n := utf8.EncodeRune(buf[:], c)
if n > 3 {
panic(fmt.Sprintf("rune %q (%U) is too long", c, c))
}
printf("{%d,[3]byte{0x%02x,0x%02x,0x%02x}},", n, buf[0], buf[1], buf[2])
if i%2 == 1 {
printf("\n")
}
i++
}
printf("},\n")
printf("encode: [256]uint32{\n")
encode := make([]uint32, 0, 256)
for c, i := range backMapping {
encode = append(encode, uint32(i)<<24|uint32(c))
}
sort.Sort(byRune(encode))
for len(encode) < cap(encode) {
encode = append(encode, encode[len(encode)-1])
}
for i, enc := range encode {
printf("0x%08x,", enc)
if i%8 == 7 {
printf("\n")
}
}
printf("},\n}\n")
// Add an estimate of the size of a single Charmap{} struct value, which
// includes two 256 elem arrays of 4 bytes and some extra fields, which
// align to 3 uint64s on 64-bit architectures.
w.Size += 2*4*256 + 3*8
}
// TODO: add proper line breaking.
printf("var listAll = []encoding.Encoding{\n%s,\n}\n\n", strings.Join(all, ",\n"))
}
type byRune []uint32
func (b byRune) Len() int { return len(b) }
func (b byRune) Less(i, j int) bool { return b[i]&0xffffff < b[j]&0xffffff }
func (b byRune) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
File diff suppressed because it is too large Load Diff
+335
View File
@@ -0,0 +1,335 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package encoding defines an interface for character encodings, such as Shift
// JIS and Windows 1252, that can convert to and from UTF-8.
//
// Encoding implementations are provided in other packages, such as
// golang.org/x/text/encoding/charmap and
// golang.org/x/text/encoding/japanese.
package encoding // import "golang.org/x/text/encoding"
import (
"errors"
"io"
"strconv"
"unicode/utf8"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// TODO:
// - There seems to be some inconsistency in when decoders return errors
// and when not. Also documentation seems to suggest they shouldn't return
// errors at all (except for UTF-16).
// - Encoders seem to rely on or at least benefit from the input being in NFC
// normal form. Perhaps add an example how users could prepare their output.
// Encoding is a character set encoding that can be transformed to and from
// UTF-8.
type Encoding interface {
// NewDecoder returns a Decoder.
NewDecoder() *Decoder
// NewEncoder returns an Encoder.
NewEncoder() *Encoder
}
// A Decoder converts bytes to UTF-8. It implements transform.Transformer.
//
// Transforming source bytes that are not of that encoding will not result in an
// error per se. Each byte that cannot be transcoded will be represented in the
// output by the UTF-8 encoding of '\uFFFD', the replacement rune.
type Decoder struct {
transform.Transformer
// This forces external creators of Decoders to use names in struct
// initializers, allowing for future extendibility without having to break
// code.
_ struct{}
}
// Bytes converts the given encoded bytes to UTF-8. It returns the converted
// bytes or nil, err if any error occurred.
func (d *Decoder) Bytes(b []byte) ([]byte, error) {
b, _, err := transform.Bytes(d, b)
if err != nil {
return nil, err
}
return b, nil
}
// String converts the given encoded string to UTF-8. It returns the converted
// string or "", err if any error occurred.
func (d *Decoder) String(s string) (string, error) {
s, _, err := transform.String(d, s)
if err != nil {
return "", err
}
return s, nil
}
// Reader wraps another Reader to decode its bytes.
//
// The Decoder may not be used for any other operation as long as the returned
// Reader is in use.
func (d *Decoder) Reader(r io.Reader) io.Reader {
return transform.NewReader(r, d)
}
// An Encoder converts bytes from UTF-8. It implements transform.Transformer.
//
// Each rune that cannot be transcoded will result in an error. In this case,
// the transform will consume all source byte up to, not including the offending
// rune. Transforming source bytes that are not valid UTF-8 will be replaced by
// `\uFFFD`. To return early with an error instead, use transform.Chain to
// preprocess the data with a UTF8Validator.
type Encoder struct {
transform.Transformer
// This forces external creators of Encoders to use names in struct
// initializers, allowing for future extendibility without having to break
// code.
_ struct{}
}
// Bytes converts bytes from UTF-8. It returns the converted bytes or nil, err if
// any error occurred.
func (e *Encoder) Bytes(b []byte) ([]byte, error) {
b, _, err := transform.Bytes(e, b)
if err != nil {
return nil, err
}
return b, nil
}
// String converts a string from UTF-8. It returns the converted string or
// "", err if any error occurred.
func (e *Encoder) String(s string) (string, error) {
s, _, err := transform.String(e, s)
if err != nil {
return "", err
}
return s, nil
}
// Writer wraps another Writer to encode its UTF-8 output.
//
// The Encoder may not be used for any other operation as long as the returned
// Writer is in use.
func (e *Encoder) Writer(w io.Writer) io.Writer {
return transform.NewWriter(w, e)
}
// ASCIISub is the ASCII substitute character, as recommended by
// http://unicode.org/reports/tr36/#Text_Comparison
const ASCIISub = '\x1a'
// Nop is the nop encoding. Its transformed bytes are the same as the source
// bytes; it does not replace invalid UTF-8 sequences.
var Nop Encoding = nop{}
type nop struct{}
func (nop) NewDecoder() *Decoder {
return &Decoder{Transformer: transform.Nop}
}
func (nop) NewEncoder() *Encoder {
return &Encoder{Transformer: transform.Nop}
}
// Replacement is the replacement encoding. Decoding from the replacement
// encoding yields a single '\uFFFD' replacement rune. Encoding from UTF-8 to
// the replacement encoding yields the same as the source bytes except that
// invalid UTF-8 is converted to '\uFFFD'.
//
// It is defined at http://encoding.spec.whatwg.org/#replacement
var Replacement Encoding = replacement{}
type replacement struct{}
func (replacement) NewDecoder() *Decoder {
return &Decoder{Transformer: replacementDecoder{}}
}
func (replacement) NewEncoder() *Encoder {
return &Encoder{Transformer: replacementEncoder{}}
}
func (replacement) ID() (mib identifier.MIB, other string) {
return identifier.Replacement, ""
}
type replacementDecoder struct{ transform.NopResetter }
func (replacementDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
if len(dst) < 3 {
return 0, 0, transform.ErrShortDst
}
if atEOF {
const fffd = "\ufffd"
dst[0] = fffd[0]
dst[1] = fffd[1]
dst[2] = fffd[2]
nDst = 3
}
return nDst, len(src), nil
}
type replacementEncoder struct{ transform.NopResetter }
func (replacementEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
for ; nSrc < len(src); nSrc += size {
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
size = 1
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
r = '\ufffd'
}
}
if nDst+utf8.RuneLen(r) > len(dst) {
err = transform.ErrShortDst
break
}
nDst += utf8.EncodeRune(dst[nDst:], r)
}
return nDst, nSrc, err
}
// HTMLEscapeUnsupported wraps encoders to replace source runes outside the
// repertoire of the destination encoding with HTML escape sequences.
//
// This wrapper exists to comply to URL and HTML forms requiring a
// non-terminating legacy encoder. The produced sequences may lead to data
// loss as they are indistinguishable from legitimate input. To avoid this
// issue, use UTF-8 encodings whenever possible.
func HTMLEscapeUnsupported(e *Encoder) *Encoder {
return &Encoder{Transformer: &errorHandler{e, errorToHTML}}
}
// ReplaceUnsupported wraps encoders to replace source runes outside the
// repertoire of the destination encoding with an encoding-specific
// replacement.
//
// This wrapper is only provided for backwards compatibility and legacy
// handling. Its use is strongly discouraged. Use UTF-8 whenever possible.
func ReplaceUnsupported(e *Encoder) *Encoder {
return &Encoder{Transformer: &errorHandler{e, errorToReplacement}}
}
type errorHandler struct {
*Encoder
handler func(dst []byte, r rune, err repertoireError) (n int, ok bool)
}
// TODO: consider making this error public in some form.
type repertoireError interface {
Replacement() byte
}
func (h errorHandler) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
nDst, nSrc, err = h.Transformer.Transform(dst, src, atEOF)
for err != nil {
rerr, ok := err.(repertoireError)
if !ok {
return nDst, nSrc, err
}
r, sz := utf8.DecodeRune(src[nSrc:])
n, ok := h.handler(dst[nDst:], r, rerr)
if !ok {
return nDst, nSrc, transform.ErrShortDst
}
err = nil
nDst += n
if nSrc += sz; nSrc < len(src) {
var dn, sn int
dn, sn, err = h.Transformer.Transform(dst[nDst:], src[nSrc:], atEOF)
nDst += dn
nSrc += sn
}
}
return nDst, nSrc, err
}
func errorToHTML(dst []byte, r rune, err repertoireError) (n int, ok bool) {
buf := [8]byte{}
b := strconv.AppendUint(buf[:0], uint64(r), 10)
if n = len(b) + len("&#;"); n >= len(dst) {
return 0, false
}
dst[0] = '&'
dst[1] = '#'
dst[copy(dst[2:], b)+2] = ';'
return n, true
}
func errorToReplacement(dst []byte, r rune, err repertoireError) (n int, ok bool) {
if len(dst) == 0 {
return 0, false
}
dst[0] = err.Replacement()
return 1, true
}
// ErrInvalidUTF8 means that a transformer encountered invalid UTF-8.
var ErrInvalidUTF8 = errors.New("encoding: invalid UTF-8")
// UTF8Validator is a transformer that returns ErrInvalidUTF8 on the first
// input byte that is not valid UTF-8.
var UTF8Validator transform.Transformer = utf8Validator{}
type utf8Validator struct{ transform.NopResetter }
func (utf8Validator) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
n := len(src)
if n > len(dst) {
n = len(dst)
}
for i := 0; i < n; {
if c := src[i]; c < utf8.RuneSelf {
dst[i] = c
i++
continue
}
_, size := utf8.DecodeRune(src[i:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
err = ErrInvalidUTF8
if !atEOF && !utf8.FullRune(src[i:]) {
err = transform.ErrShortSrc
}
return i, i, err
}
if i+size > len(dst) {
return i, i, transform.ErrShortDst
}
for ; size > 0; size-- {
dst[i] = src[i]
i++
}
}
if len(src) > len(dst) {
err = transform.ErrShortDst
}
return n, n, err
}
+173
View File
@@ -0,0 +1,173 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
import (
"bytes"
"encoding/json"
"fmt"
"log"
"strings"
"golang.org/x/text/internal/gen"
)
type group struct {
Encodings []struct {
Labels []string
Name string
}
}
func main() {
gen.Init()
r := gen.Open("https://encoding.spec.whatwg.org", "whatwg", "encodings.json")
var groups []group
if err := json.NewDecoder(r).Decode(&groups); err != nil {
log.Fatalf("Error reading encodings.json: %v", err)
}
w := &bytes.Buffer{}
fmt.Fprintln(w, "type htmlEncoding byte")
fmt.Fprintln(w, "const (")
for i, g := range groups {
for _, e := range g.Encodings {
key := strings.ToLower(e.Name)
name := consts[key]
if name == "" {
log.Fatalf("No const defined for %s.", key)
}
if i == 0 {
fmt.Fprintf(w, "%s htmlEncoding = iota\n", name)
} else {
fmt.Fprintf(w, "%s\n", name)
}
}
}
fmt.Fprintln(w, "numEncodings")
fmt.Fprint(w, ")\n\n")
fmt.Fprintln(w, "var canonical = [numEncodings]string{")
for _, g := range groups {
for _, e := range g.Encodings {
fmt.Fprintf(w, "%q,\n", strings.ToLower(e.Name))
}
}
fmt.Fprint(w, "}\n\n")
fmt.Fprintln(w, "var nameMap = map[string]htmlEncoding{")
for _, g := range groups {
for _, e := range g.Encodings {
for _, l := range e.Labels {
key := strings.ToLower(e.Name)
name := consts[key]
fmt.Fprintf(w, "%q: %s,\n", l, name)
}
}
}
fmt.Fprint(w, "}\n\n")
var tags []string
fmt.Fprintln(w, "var localeMap = []htmlEncoding{")
for _, loc := range locales {
tags = append(tags, loc.tag)
fmt.Fprintf(w, "%s, // %s \n", consts[loc.name], loc.tag)
}
fmt.Fprint(w, "}\n\n")
fmt.Fprintf(w, "const locales = %q\n", strings.Join(tags, " "))
gen.WriteGoFile("tables.go", "htmlindex", w.Bytes())
}
// consts maps canonical encoding name to internal constant.
var consts = map[string]string{
"utf-8": "utf8",
"ibm866": "ibm866",
"iso-8859-2": "iso8859_2",
"iso-8859-3": "iso8859_3",
"iso-8859-4": "iso8859_4",
"iso-8859-5": "iso8859_5",
"iso-8859-6": "iso8859_6",
"iso-8859-7": "iso8859_7",
"iso-8859-8": "iso8859_8",
"iso-8859-8-i": "iso8859_8I",
"iso-8859-10": "iso8859_10",
"iso-8859-13": "iso8859_13",
"iso-8859-14": "iso8859_14",
"iso-8859-15": "iso8859_15",
"iso-8859-16": "iso8859_16",
"koi8-r": "koi8r",
"koi8-u": "koi8u",
"macintosh": "macintosh",
"windows-874": "windows874",
"windows-1250": "windows1250",
"windows-1251": "windows1251",
"windows-1252": "windows1252",
"windows-1253": "windows1253",
"windows-1254": "windows1254",
"windows-1255": "windows1255",
"windows-1256": "windows1256",
"windows-1257": "windows1257",
"windows-1258": "windows1258",
"x-mac-cyrillic": "macintoshCyrillic",
"gbk": "gbk",
"gb18030": "gb18030",
// "hz-gb-2312": "hzgb2312", // Was removed from WhatWG
"big5": "big5",
"euc-jp": "eucjp",
"iso-2022-jp": "iso2022jp",
"shift_jis": "shiftJIS",
"euc-kr": "euckr",
"replacement": "replacement",
"utf-16be": "utf16be",
"utf-16le": "utf16le",
"x-user-defined": "xUserDefined",
}
// locales is taken from
// https://html.spec.whatwg.org/multipage/syntax.html#encoding-sniffing-algorithm.
var locales = []struct{ tag, name string }{
// The default value. Explicitly state latin to benefit from the exact
// script option, while still making 1252 the default encoding for languages
// written in Latin script.
{"und_Latn", "windows-1252"},
{"ar", "windows-1256"},
{"ba", "windows-1251"},
{"be", "windows-1251"},
{"bg", "windows-1251"},
{"cs", "windows-1250"},
{"el", "iso-8859-7"},
{"et", "windows-1257"},
{"fa", "windows-1256"},
{"he", "windows-1255"},
{"hr", "windows-1250"},
{"hu", "iso-8859-2"},
{"ja", "shift_jis"},
{"kk", "windows-1251"},
{"ko", "euc-kr"},
{"ku", "windows-1254"},
{"ky", "windows-1251"},
{"lt", "windows-1257"},
{"lv", "windows-1257"},
{"mk", "windows-1251"},
{"pl", "iso-8859-2"},
{"ru", "windows-1251"},
{"sah", "windows-1251"},
{"sk", "windows-1250"},
{"sl", "iso-8859-2"},
{"sr", "windows-1251"},
{"tg", "windows-1251"},
{"th", "windows-874"},
{"tr", "windows-1254"},
{"tt", "windows-1251"},
{"uk", "windows-1251"},
{"vi", "windows-1258"},
{"zh-hans", "gb18030"},
{"zh-hant", "big5"},
}
+86
View File
@@ -0,0 +1,86 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:generate go run gen.go
// Package htmlindex maps character set encoding names to Encodings as
// recommended by the W3C for use in HTML 5. See http://www.w3.org/TR/encoding.
package htmlindex
// TODO: perhaps have a "bare" version of the index (used by this package) that
// is not pre-loaded with all encodings. Global variables in encodings prevent
// the linker from being able to purge unneeded tables. This means that
// referencing all encodings, as this package does for the default index, links
// in all encodings unconditionally.
//
// This issue can be solved by either solving the linking issue (see
// https://github.com/golang/go/issues/6330) or refactoring the encoding tables
// (e.g. moving the tables to internal packages that do not use global
// variables).
// TODO: allow canonicalizing names
import (
"errors"
"strings"
"sync"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/language"
)
var (
errInvalidName = errors.New("htmlindex: invalid encoding name")
errUnknown = errors.New("htmlindex: unknown Encoding")
errUnsupported = errors.New("htmlindex: this encoding is not supported")
)
var (
matcherOnce sync.Once
matcher language.Matcher
)
// LanguageDefault returns the canonical name of the default encoding for a
// given language.
func LanguageDefault(tag language.Tag) string {
matcherOnce.Do(func() {
tags := []language.Tag{}
for _, t := range strings.Split(locales, " ") {
tags = append(tags, language.MustParse(t))
}
matcher = language.NewMatcher(tags, language.PreferSameScript(true))
})
_, i, _ := matcher.Match(tag)
return canonical[localeMap[i]] // Default is Windows-1252.
}
// Get returns an Encoding for one of the names listed in
// http://www.w3.org/TR/encoding using the Default Index. Matching is case-
// insensitive.
func Get(name string) (encoding.Encoding, error) {
x, ok := nameMap[strings.ToLower(strings.TrimSpace(name))]
if !ok {
return nil, errInvalidName
}
return encodings[x], nil
}
// Name reports the canonical name of the given Encoding. It will return
// an error if e is not associated with a supported encoding scheme.
func Name(e encoding.Encoding) (string, error) {
id, ok := e.(identifier.Interface)
if !ok {
return "", errUnknown
}
mib, _ := id.ID()
if mib == 0 {
return "", errUnknown
}
v, ok := mibMap[mib]
if !ok {
return "", errUnsupported
}
return canonical[v], nil
}
+105
View File
@@ -0,0 +1,105 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package htmlindex
import (
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/charmap"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/encoding/japanese"
"golang.org/x/text/encoding/korean"
"golang.org/x/text/encoding/simplifiedchinese"
"golang.org/x/text/encoding/traditionalchinese"
"golang.org/x/text/encoding/unicode"
)
// mibMap maps a MIB identifier to an htmlEncoding index.
var mibMap = map[identifier.MIB]htmlEncoding{
identifier.UTF8: utf8,
identifier.UTF16BE: utf16be,
identifier.UTF16LE: utf16le,
identifier.IBM866: ibm866,
identifier.ISOLatin2: iso8859_2,
identifier.ISOLatin3: iso8859_3,
identifier.ISOLatin4: iso8859_4,
identifier.ISOLatinCyrillic: iso8859_5,
identifier.ISOLatinArabic: iso8859_6,
identifier.ISOLatinGreek: iso8859_7,
identifier.ISOLatinHebrew: iso8859_8,
identifier.ISO88598I: iso8859_8I,
identifier.ISOLatin6: iso8859_10,
identifier.ISO885913: iso8859_13,
identifier.ISO885914: iso8859_14,
identifier.ISO885915: iso8859_15,
identifier.ISO885916: iso8859_16,
identifier.KOI8R: koi8r,
identifier.KOI8U: koi8u,
identifier.Macintosh: macintosh,
identifier.MacintoshCyrillic: macintoshCyrillic,
identifier.Windows874: windows874,
identifier.Windows1250: windows1250,
identifier.Windows1251: windows1251,
identifier.Windows1252: windows1252,
identifier.Windows1253: windows1253,
identifier.Windows1254: windows1254,
identifier.Windows1255: windows1255,
identifier.Windows1256: windows1256,
identifier.Windows1257: windows1257,
identifier.Windows1258: windows1258,
identifier.XUserDefined: xUserDefined,
identifier.GBK: gbk,
identifier.GB18030: gb18030,
identifier.Big5: big5,
identifier.EUCPkdFmtJapanese: eucjp,
identifier.ISO2022JP: iso2022jp,
identifier.ShiftJIS: shiftJIS,
identifier.EUCKR: euckr,
identifier.Replacement: replacement,
}
// encodings maps the internal htmlEncoding to an Encoding.
// TODO: consider using a reusable index in encoding/internal.
var encodings = [numEncodings]encoding.Encoding{
utf8: unicode.UTF8,
ibm866: charmap.CodePage866,
iso8859_2: charmap.ISO8859_2,
iso8859_3: charmap.ISO8859_3,
iso8859_4: charmap.ISO8859_4,
iso8859_5: charmap.ISO8859_5,
iso8859_6: charmap.ISO8859_6,
iso8859_7: charmap.ISO8859_7,
iso8859_8: charmap.ISO8859_8,
iso8859_8I: charmap.ISO8859_8I,
iso8859_10: charmap.ISO8859_10,
iso8859_13: charmap.ISO8859_13,
iso8859_14: charmap.ISO8859_14,
iso8859_15: charmap.ISO8859_15,
iso8859_16: charmap.ISO8859_16,
koi8r: charmap.KOI8R,
koi8u: charmap.KOI8U,
macintosh: charmap.Macintosh,
windows874: charmap.Windows874,
windows1250: charmap.Windows1250,
windows1251: charmap.Windows1251,
windows1252: charmap.Windows1252,
windows1253: charmap.Windows1253,
windows1254: charmap.Windows1254,
windows1255: charmap.Windows1255,
windows1256: charmap.Windows1256,
windows1257: charmap.Windows1257,
windows1258: charmap.Windows1258,
macintoshCyrillic: charmap.MacintoshCyrillic,
gbk: simplifiedchinese.GBK,
gb18030: simplifiedchinese.GB18030,
big5: traditionalchinese.Big5,
eucjp: japanese.EUCJP,
iso2022jp: japanese.ISO2022JP,
shiftJIS: japanese.ShiftJIS,
euckr: korean.EUCKR,
replacement: encoding.Replacement,
utf16be: unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM),
utf16le: unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM),
xUserDefined: charmap.XUserDefined,
}
+352
View File
@@ -0,0 +1,352 @@
// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
package htmlindex
type htmlEncoding byte
const (
utf8 htmlEncoding = iota
ibm866
iso8859_2
iso8859_3
iso8859_4
iso8859_5
iso8859_6
iso8859_7
iso8859_8
iso8859_8I
iso8859_10
iso8859_13
iso8859_14
iso8859_15
iso8859_16
koi8r
koi8u
macintosh
windows874
windows1250
windows1251
windows1252
windows1253
windows1254
windows1255
windows1256
windows1257
windows1258
macintoshCyrillic
gbk
gb18030
big5
eucjp
iso2022jp
shiftJIS
euckr
replacement
utf16be
utf16le
xUserDefined
numEncodings
)
var canonical = [numEncodings]string{
"utf-8",
"ibm866",
"iso-8859-2",
"iso-8859-3",
"iso-8859-4",
"iso-8859-5",
"iso-8859-6",
"iso-8859-7",
"iso-8859-8",
"iso-8859-8-i",
"iso-8859-10",
"iso-8859-13",
"iso-8859-14",
"iso-8859-15",
"iso-8859-16",
"koi8-r",
"koi8-u",
"macintosh",
"windows-874",
"windows-1250",
"windows-1251",
"windows-1252",
"windows-1253",
"windows-1254",
"windows-1255",
"windows-1256",
"windows-1257",
"windows-1258",
"x-mac-cyrillic",
"gbk",
"gb18030",
"big5",
"euc-jp",
"iso-2022-jp",
"shift_jis",
"euc-kr",
"replacement",
"utf-16be",
"utf-16le",
"x-user-defined",
}
var nameMap = map[string]htmlEncoding{
"unicode-1-1-utf-8": utf8,
"utf-8": utf8,
"utf8": utf8,
"866": ibm866,
"cp866": ibm866,
"csibm866": ibm866,
"ibm866": ibm866,
"csisolatin2": iso8859_2,
"iso-8859-2": iso8859_2,
"iso-ir-101": iso8859_2,
"iso8859-2": iso8859_2,
"iso88592": iso8859_2,
"iso_8859-2": iso8859_2,
"iso_8859-2:1987": iso8859_2,
"l2": iso8859_2,
"latin2": iso8859_2,
"csisolatin3": iso8859_3,
"iso-8859-3": iso8859_3,
"iso-ir-109": iso8859_3,
"iso8859-3": iso8859_3,
"iso88593": iso8859_3,
"iso_8859-3": iso8859_3,
"iso_8859-3:1988": iso8859_3,
"l3": iso8859_3,
"latin3": iso8859_3,
"csisolatin4": iso8859_4,
"iso-8859-4": iso8859_4,
"iso-ir-110": iso8859_4,
"iso8859-4": iso8859_4,
"iso88594": iso8859_4,
"iso_8859-4": iso8859_4,
"iso_8859-4:1988": iso8859_4,
"l4": iso8859_4,
"latin4": iso8859_4,
"csisolatincyrillic": iso8859_5,
"cyrillic": iso8859_5,
"iso-8859-5": iso8859_5,
"iso-ir-144": iso8859_5,
"iso8859-5": iso8859_5,
"iso88595": iso8859_5,
"iso_8859-5": iso8859_5,
"iso_8859-5:1988": iso8859_5,
"arabic": iso8859_6,
"asmo-708": iso8859_6,
"csiso88596e": iso8859_6,
"csiso88596i": iso8859_6,
"csisolatinarabic": iso8859_6,
"ecma-114": iso8859_6,
"iso-8859-6": iso8859_6,
"iso-8859-6-e": iso8859_6,
"iso-8859-6-i": iso8859_6,
"iso-ir-127": iso8859_6,
"iso8859-6": iso8859_6,
"iso88596": iso8859_6,
"iso_8859-6": iso8859_6,
"iso_8859-6:1987": iso8859_6,
"csisolatingreek": iso8859_7,
"ecma-118": iso8859_7,
"elot_928": iso8859_7,
"greek": iso8859_7,
"greek8": iso8859_7,
"iso-8859-7": iso8859_7,
"iso-ir-126": iso8859_7,
"iso8859-7": iso8859_7,
"iso88597": iso8859_7,
"iso_8859-7": iso8859_7,
"iso_8859-7:1987": iso8859_7,
"sun_eu_greek": iso8859_7,
"csiso88598e": iso8859_8,
"csisolatinhebrew": iso8859_8,
"hebrew": iso8859_8,
"iso-8859-8": iso8859_8,
"iso-8859-8-e": iso8859_8,
"iso-ir-138": iso8859_8,
"iso8859-8": iso8859_8,
"iso88598": iso8859_8,
"iso_8859-8": iso8859_8,
"iso_8859-8:1988": iso8859_8,
"visual": iso8859_8,
"csiso88598i": iso8859_8I,
"iso-8859-8-i": iso8859_8I,
"logical": iso8859_8I,
"csisolatin6": iso8859_10,
"iso-8859-10": iso8859_10,
"iso-ir-157": iso8859_10,
"iso8859-10": iso8859_10,
"iso885910": iso8859_10,
"l6": iso8859_10,
"latin6": iso8859_10,
"iso-8859-13": iso8859_13,
"iso8859-13": iso8859_13,
"iso885913": iso8859_13,
"iso-8859-14": iso8859_14,
"iso8859-14": iso8859_14,
"iso885914": iso8859_14,
"csisolatin9": iso8859_15,
"iso-8859-15": iso8859_15,
"iso8859-15": iso8859_15,
"iso885915": iso8859_15,
"iso_8859-15": iso8859_15,
"l9": iso8859_15,
"iso-8859-16": iso8859_16,
"cskoi8r": koi8r,
"koi": koi8r,
"koi8": koi8r,
"koi8-r": koi8r,
"koi8_r": koi8r,
"koi8-ru": koi8u,
"koi8-u": koi8u,
"csmacintosh": macintosh,
"mac": macintosh,
"macintosh": macintosh,
"x-mac-roman": macintosh,
"dos-874": windows874,
"iso-8859-11": windows874,
"iso8859-11": windows874,
"iso885911": windows874,
"tis-620": windows874,
"windows-874": windows874,
"cp1250": windows1250,
"windows-1250": windows1250,
"x-cp1250": windows1250,
"cp1251": windows1251,
"windows-1251": windows1251,
"x-cp1251": windows1251,
"ansi_x3.4-1968": windows1252,
"ascii": windows1252,
"cp1252": windows1252,
"cp819": windows1252,
"csisolatin1": windows1252,
"ibm819": windows1252,
"iso-8859-1": windows1252,
"iso-ir-100": windows1252,
"iso8859-1": windows1252,
"iso88591": windows1252,
"iso_8859-1": windows1252,
"iso_8859-1:1987": windows1252,
"l1": windows1252,
"latin1": windows1252,
"us-ascii": windows1252,
"windows-1252": windows1252,
"x-cp1252": windows1252,
"cp1253": windows1253,
"windows-1253": windows1253,
"x-cp1253": windows1253,
"cp1254": windows1254,
"csisolatin5": windows1254,
"iso-8859-9": windows1254,
"iso-ir-148": windows1254,
"iso8859-9": windows1254,
"iso88599": windows1254,
"iso_8859-9": windows1254,
"iso_8859-9:1989": windows1254,
"l5": windows1254,
"latin5": windows1254,
"windows-1254": windows1254,
"x-cp1254": windows1254,
"cp1255": windows1255,
"windows-1255": windows1255,
"x-cp1255": windows1255,
"cp1256": windows1256,
"windows-1256": windows1256,
"x-cp1256": windows1256,
"cp1257": windows1257,
"windows-1257": windows1257,
"x-cp1257": windows1257,
"cp1258": windows1258,
"windows-1258": windows1258,
"x-cp1258": windows1258,
"x-mac-cyrillic": macintoshCyrillic,
"x-mac-ukrainian": macintoshCyrillic,
"chinese": gbk,
"csgb2312": gbk,
"csiso58gb231280": gbk,
"gb2312": gbk,
"gb_2312": gbk,
"gb_2312-80": gbk,
"gbk": gbk,
"iso-ir-58": gbk,
"x-gbk": gbk,
"gb18030": gb18030,
"big5": big5,
"big5-hkscs": big5,
"cn-big5": big5,
"csbig5": big5,
"x-x-big5": big5,
"cseucpkdfmtjapanese": eucjp,
"euc-jp": eucjp,
"x-euc-jp": eucjp,
"csiso2022jp": iso2022jp,
"iso-2022-jp": iso2022jp,
"csshiftjis": shiftJIS,
"ms932": shiftJIS,
"ms_kanji": shiftJIS,
"shift-jis": shiftJIS,
"shift_jis": shiftJIS,
"sjis": shiftJIS,
"windows-31j": shiftJIS,
"x-sjis": shiftJIS,
"cseuckr": euckr,
"csksc56011987": euckr,
"euc-kr": euckr,
"iso-ir-149": euckr,
"korean": euckr,
"ks_c_5601-1987": euckr,
"ks_c_5601-1989": euckr,
"ksc5601": euckr,
"ksc_5601": euckr,
"windows-949": euckr,
"csiso2022kr": replacement,
"hz-gb-2312": replacement,
"iso-2022-cn": replacement,
"iso-2022-cn-ext": replacement,
"iso-2022-kr": replacement,
"utf-16be": utf16be,
"utf-16": utf16le,
"utf-16le": utf16le,
"x-user-defined": xUserDefined,
}
var localeMap = []htmlEncoding{
windows1252, // und_Latn
windows1256, // ar
windows1251, // ba
windows1251, // be
windows1251, // bg
windows1250, // cs
iso8859_7, // el
windows1257, // et
windows1256, // fa
windows1255, // he
windows1250, // hr
iso8859_2, // hu
shiftJIS, // ja
windows1251, // kk
euckr, // ko
windows1254, // ku
windows1251, // ky
windows1257, // lt
windows1257, // lv
windows1251, // mk
iso8859_2, // pl
windows1251, // ru
windows1251, // sah
windows1250, // sk
iso8859_2, // sl
windows1251, // sr
windows1251, // tg
windows874, // th
windows1254, // tr
windows1251, // tt
windows1251, // uk
windows1258, // vi
gb18030, // zh-hans
big5, // zh-hant
}
const locales = "und_Latn ar ba be bg cs el et fa he hr hu ja kk ko ku ky lt lv mk pl ru sah sk sl sr tg th tr tt uk vi zh-hans zh-hant"
+192
View File
@@ -0,0 +1,192 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
import (
"encoding/xml"
"fmt"
"io"
"log"
"sort"
"strconv"
"strings"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/internal/gen"
)
type registry struct {
XMLName xml.Name `xml:"registry"`
Updated string `xml:"updated"`
Registry []struct {
ID string `xml:"id,attr"`
Record []struct {
Name string `xml:"name"`
Xref []struct {
Type string `xml:"type,attr"`
Data string `xml:"data,attr"`
} `xml:"xref"`
Desc struct {
Data string `xml:",innerxml"`
} `xml:"description,"`
MIB string `xml:"value"`
Alias []string `xml:"alias"`
MIME string `xml:"preferred_alias"`
} `xml:"record"`
} `xml:"registry"`
}
func main() {
r := gen.OpenIANAFile("assignments/character-sets/character-sets.xml")
reg := &registry{}
if err := xml.NewDecoder(r).Decode(&reg); err != nil && err != io.EOF {
log.Fatalf("Error decoding charset registry: %v", err)
}
if len(reg.Registry) == 0 || reg.Registry[0].ID != "character-sets-1" {
log.Fatalf("Unexpected ID %s", reg.Registry[0].ID)
}
x := &indexInfo{}
for _, rec := range reg.Registry[0].Record {
mib := identifier.MIB(parseInt(rec.MIB))
x.addEntry(mib, rec.Name)
for _, a := range rec.Alias {
a = strings.Split(a, " ")[0] // strip comments.
x.addAlias(a, mib)
// MIB name aliases are prefixed with a "cs" (character set) in the
// registry to identify them as display names and to ensure that
// the name starts with a lowercase letter in case it is used as
// an identifier. We remove it to be left with a nice clean name.
if strings.HasPrefix(a, "cs") {
x.setName(2, a[2:])
}
}
if rec.MIME != "" {
x.addAlias(rec.MIME, mib)
x.setName(1, rec.MIME)
}
}
w := gen.NewCodeWriter()
fmt.Fprintln(w, `import "golang.org/x/text/encoding/internal/identifier"`)
writeIndex(w, x)
w.WriteGoFile("tables.go", "ianaindex")
}
type alias struct {
name string
mib identifier.MIB
}
type indexInfo struct {
// compacted index from code to MIB
codeToMIB []identifier.MIB
alias []alias
names [][3]string
}
func (ii *indexInfo) Len() int {
return len(ii.codeToMIB)
}
func (ii *indexInfo) Less(a, b int) bool {
return ii.codeToMIB[a] < ii.codeToMIB[b]
}
func (ii *indexInfo) Swap(a, b int) {
ii.codeToMIB[a], ii.codeToMIB[b] = ii.codeToMIB[b], ii.codeToMIB[a]
// Co-sort the names.
ii.names[a], ii.names[b] = ii.names[b], ii.names[a]
}
func (ii *indexInfo) setName(i int, name string) {
ii.names[len(ii.names)-1][i] = name
}
func (ii *indexInfo) addEntry(mib identifier.MIB, name string) {
ii.names = append(ii.names, [3]string{name, name, name})
ii.addAlias(name, mib)
ii.codeToMIB = append(ii.codeToMIB, mib)
}
func (ii *indexInfo) addAlias(name string, mib identifier.MIB) {
// Don't add duplicates for the same mib. Adding duplicate aliases for
// different MIBs will cause the compiler to barf on an invalid map: great!.
for i := len(ii.alias) - 1; i >= 0 && ii.alias[i].mib == mib; i-- {
if ii.alias[i].name == name {
return
}
}
ii.alias = append(ii.alias, alias{name, mib})
lower := strings.ToLower(name)
if lower != name {
ii.addAlias(lower, mib)
}
}
const maxMIMENameLen = '0' - 1 // officially 40, but we leave some buffer.
func writeIndex(w *gen.CodeWriter, x *indexInfo) {
sort.Stable(x)
// Write constants.
fmt.Fprintln(w, "const (")
for i, m := range x.codeToMIB {
if i == 0 {
fmt.Fprintf(w, "enc%d = iota\n", m)
} else {
fmt.Fprintf(w, "enc%d\n", m)
}
}
fmt.Fprintln(w, "numIANA")
fmt.Fprintln(w, ")")
w.WriteVar("ianaToMIB", x.codeToMIB)
var ianaNames, mibNames []string
for _, names := range x.names {
n := names[0]
if names[0] != names[1] {
// MIME names are mostly identical to IANA names. We share the
// tables by setting the first byte of the string to an index into
// the string itself (< maxMIMENameLen) to the IANA name. The MIME
// name immediately follows the index.
x := len(names[1]) + 1
if x > maxMIMENameLen {
log.Fatalf("MIME name length (%d) > %d", x, maxMIMENameLen)
}
n = string(x) + names[1] + names[0]
}
ianaNames = append(ianaNames, n)
mibNames = append(mibNames, names[2])
}
w.WriteVar("ianaNames", ianaNames)
w.WriteVar("mibNames", mibNames)
w.WriteComment(`
TODO: Instead of using a map, we could use binary search strings doing
on-the fly lower-casing per character. This allows to always avoid
allocation and will be considerably more compact.`)
fmt.Fprintln(w, "var ianaAliases = map[string]int{")
for _, a := range x.alias {
fmt.Fprintf(w, "%q: enc%d,\n", a.name, a.mib)
}
fmt.Fprintln(w, "}")
}
func parseInt(s string) int {
x, err := strconv.ParseInt(s, 10, 64)
if err != nil {
log.Fatalf("Could not parse integer: %v", err)
}
return int(x)
}
+209
View File
@@ -0,0 +1,209 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:generate go run gen.go
// Package ianaindex maps names to Encodings as specified by the IANA registry.
// This includes both the MIME and IANA names.
//
// See http://www.iana.org/assignments/character-sets/character-sets.xhtml for
// more details.
package ianaindex
import (
"errors"
"sort"
"strings"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/charmap"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/encoding/japanese"
"golang.org/x/text/encoding/korean"
"golang.org/x/text/encoding/simplifiedchinese"
"golang.org/x/text/encoding/traditionalchinese"
"golang.org/x/text/encoding/unicode"
)
// TODO: remove the "Status... incomplete" in the package doc comment.
// TODO: allow users to specify their own aliases?
// TODO: allow users to specify their own indexes?
// TODO: allow canonicalizing names
// NOTE: only use these top-level variables if we can get the linker to drop
// the indexes when they are not used. Make them a function or perhaps only
// support MIME otherwise.
var (
// MIME is an index to map MIME names.
MIME *Index = mime
// IANA is an index that supports all names and aliases using IANA names as
// the canonical identifier.
IANA *Index = iana
// MIB is an index that associates the MIB display name with an Encoding.
MIB *Index = mib
mime = &Index{mimeName, ianaToMIB, ianaAliases, encodings[:]}
iana = &Index{ianaName, ianaToMIB, ianaAliases, encodings[:]}
mib = &Index{mibName, ianaToMIB, ianaAliases, encodings[:]}
)
// Index maps names registered by IANA to Encodings.
// Currently different Indexes only differ in the names they return for
// encodings. In the future they may also differ in supported aliases.
type Index struct {
names func(i int) string
toMIB []identifier.MIB // Sorted slice of supported MIBs
alias map[string]int
enc []encoding.Encoding
}
var (
errInvalidName = errors.New("ianaindex: invalid encoding name")
errUnknown = errors.New("ianaindex: unknown Encoding")
errUnsupported = errors.New("ianaindex: unsupported Encoding")
)
// Encoding returns an Encoding for IANA-registered names. Matching is
// case-insensitive.
func (x *Index) Encoding(name string) (encoding.Encoding, error) {
name = strings.TrimSpace(name)
// First try without lowercasing (possibly creating an allocation).
i, ok := x.alias[name]
if !ok {
i, ok = x.alias[strings.ToLower(name)]
if !ok {
return nil, errInvalidName
}
}
return x.enc[i], nil
}
// Name reports the canonical name of the given Encoding. It will return an
// error if the e is not associated with a known encoding scheme.
func (x *Index) Name(e encoding.Encoding) (string, error) {
id, ok := e.(identifier.Interface)
if !ok {
return "", errUnknown
}
mib, _ := id.ID()
if mib == 0 {
return "", errUnknown
}
v := findMIB(x.toMIB, mib)
if v == -1 {
return "", errUnsupported
}
return x.names(v), nil
}
// TODO: the coverage of this index is rather spotty. Allowing users to set
// encodings would allow:
// - users to increase coverage
// - allow a partially loaded set of encodings in case the user doesn't need to
// them all.
// - write an OS-specific wrapper for supported encodings and set them.
// The exact definition of Set depends a bit on if and how we want to let users
// write their own Encoding implementations. Also, it is not possible yet to
// only partially load the encodings without doing some refactoring. Until this
// is solved, we might as well not support Set.
// // Set sets the e to be used for the encoding scheme identified by name. Only
// // canonical names may be used. An empty name assigns e to its internally
// // associated encoding scheme.
// func (x *Index) Set(name string, e encoding.Encoding) error {
// panic("TODO: implement")
// }
func findMIB(x []identifier.MIB, mib identifier.MIB) int {
i := sort.Search(len(x), func(i int) bool { return x[i] >= mib })
if i < len(x) && x[i] == mib {
return i
}
return -1
}
const maxMIMENameLen = '0' - 1 // officially 40, but we leave some buffer.
func mimeName(x int) string {
n := ianaNames[x]
// See gen.go for a description of the encoding.
if n[0] <= maxMIMENameLen {
return n[1:n[0]]
}
return n
}
func ianaName(x int) string {
n := ianaNames[x]
// See gen.go for a description of the encoding.
if n[0] <= maxMIMENameLen {
return n[n[0]:]
}
return n
}
func mibName(x int) string {
return mibNames[x]
}
var encodings = [numIANA]encoding.Encoding{
enc106: unicode.UTF8,
enc1015: unicode.UTF16(unicode.BigEndian, unicode.UseBOM),
enc1013: unicode.UTF16(unicode.BigEndian, unicode.IgnoreBOM),
enc1014: unicode.UTF16(unicode.LittleEndian, unicode.IgnoreBOM),
enc2028: charmap.CodePage037,
enc2011: charmap.CodePage437,
enc2009: charmap.CodePage850,
enc2010: charmap.CodePage852,
enc2046: charmap.CodePage855,
enc2089: charmap.CodePage858,
enc2048: charmap.CodePage860,
enc2013: charmap.CodePage862,
enc2050: charmap.CodePage863,
enc2052: charmap.CodePage865,
enc2086: charmap.CodePage866,
enc2102: charmap.CodePage1047,
enc2091: charmap.CodePage1140,
enc4: charmap.ISO8859_1,
enc5: charmap.ISO8859_2,
enc6: charmap.ISO8859_3,
enc7: charmap.ISO8859_4,
enc8: charmap.ISO8859_5,
enc9: charmap.ISO8859_6,
enc81: charmap.ISO8859_6E,
enc82: charmap.ISO8859_6I,
enc10: charmap.ISO8859_7,
enc11: charmap.ISO8859_8,
enc84: charmap.ISO8859_8E,
enc85: charmap.ISO8859_8I,
enc12: charmap.ISO8859_9,
enc13: charmap.ISO8859_10,
enc109: charmap.ISO8859_13,
enc110: charmap.ISO8859_14,
enc111: charmap.ISO8859_15,
enc112: charmap.ISO8859_16,
enc2084: charmap.KOI8R,
enc2088: charmap.KOI8U,
enc2027: charmap.Macintosh,
enc2109: charmap.Windows874,
enc2250: charmap.Windows1250,
enc2251: charmap.Windows1251,
enc2252: charmap.Windows1252,
enc2253: charmap.Windows1253,
enc2254: charmap.Windows1254,
enc2255: charmap.Windows1255,
enc2256: charmap.Windows1256,
enc2257: charmap.Windows1257,
enc2258: charmap.Windows1258,
enc18: japanese.EUCJP,
enc39: japanese.ISO2022JP,
enc17: japanese.ShiftJIS,
enc38: korean.EUCKR,
enc114: simplifiedchinese.GB18030,
enc113: simplifiedchinese.GBK,
enc2085: simplifiedchinese.HZGB2312,
enc2026: traditionalchinese.Big5,
}
File diff suppressed because it is too large Load Diff
+180
View File
@@ -0,0 +1,180 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package enctest
import (
"bytes"
"fmt"
"io"
"io/ioutil"
"strings"
"testing"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// Encoder or Decoder
type Transcoder interface {
transform.Transformer
Bytes([]byte) ([]byte, error)
String(string) (string, error)
}
func TestEncoding(t *testing.T, e encoding.Encoding, encoded, utf8, prefix, suffix string) {
for _, direction := range []string{"Decode", "Encode"} {
t.Run(fmt.Sprintf("%v/%s", e, direction), func(t *testing.T) {
var coder Transcoder
var want, src, wPrefix, sPrefix, wSuffix, sSuffix string
if direction == "Decode" {
coder, want, src = e.NewDecoder(), utf8, encoded
wPrefix, sPrefix, wSuffix, sSuffix = "", prefix, "", suffix
} else {
coder, want, src = e.NewEncoder(), encoded, utf8
wPrefix, sPrefix, wSuffix, sSuffix = prefix, "", suffix, ""
}
dst := make([]byte, len(wPrefix)+len(want)+len(wSuffix))
nDst, nSrc, err := coder.Transform(dst, []byte(sPrefix+src+sSuffix), true)
if err != nil {
t.Fatal(err)
}
if nDst != len(wPrefix)+len(want)+len(wSuffix) {
t.Fatalf("nDst got %d, want %d",
nDst, len(wPrefix)+len(want)+len(wSuffix))
}
if nSrc != len(sPrefix)+len(src)+len(sSuffix) {
t.Fatalf("nSrc got %d, want %d",
nSrc, len(sPrefix)+len(src)+len(sSuffix))
}
if got := string(dst); got != wPrefix+want+wSuffix {
t.Fatalf("\ngot %q\nwant %q", got, wPrefix+want+wSuffix)
}
for _, n := range []int{0, 1, 2, 10, 123, 4567} {
input := sPrefix + strings.Repeat(src, n) + sSuffix
g, err := coder.String(input)
if err != nil {
t.Fatalf("Bytes: n=%d: %v", n, err)
}
if len(g) == 0 && len(input) == 0 {
// If the input is empty then the output can be empty,
// regardless of whatever wPrefix is.
continue
}
got1, want1 := string(g), wPrefix+strings.Repeat(want, n)+wSuffix
if got1 != want1 {
t.Fatalf("ReadAll: n=%d\ngot %q\nwant %q",
n, trim(got1), trim(want1))
}
}
})
}
}
func TestFile(t *testing.T, e encoding.Encoding) {
for _, dir := range []string{"Decode", "Encode"} {
t.Run(fmt.Sprintf("%s/%s", e, dir), func(t *testing.T) {
dst, src, transformer, err := load(dir, e)
if err != nil {
t.Fatalf("load: %v", err)
}
buf, err := transformer.Bytes(src)
if err != nil {
t.Fatalf("transform: %v", err)
}
if !bytes.Equal(buf, dst) {
t.Error("transformed bytes did not match golden file")
}
})
}
}
func Benchmark(b *testing.B, enc encoding.Encoding) {
for _, direction := range []string{"Decode", "Encode"} {
b.Run(fmt.Sprintf("%s/%s", enc, direction), func(b *testing.B) {
_, src, transformer, err := load(direction, enc)
if err != nil {
b.Fatal(err)
}
b.SetBytes(int64(len(src)))
b.ResetTimer()
for i := 0; i < b.N; i++ {
r := transform.NewReader(bytes.NewReader(src), transformer)
io.Copy(ioutil.Discard, r)
}
})
}
}
// testdataFiles are files in testdata/*.txt.
var testdataFiles = []struct {
mib identifier.MIB
basename, ext string
}{
{identifier.Windows1252, "candide", "windows-1252"},
{identifier.EUCPkdFmtJapanese, "rashomon", "euc-jp"},
{identifier.ISO2022JP, "rashomon", "iso-2022-jp"},
{identifier.ShiftJIS, "rashomon", "shift-jis"},
{identifier.EUCKR, "unsu-joh-eun-nal", "euc-kr"},
{identifier.GBK, "sunzi-bingfa-simplified", "gbk"},
{identifier.HZGB2312, "sunzi-bingfa-gb-levels-1-and-2", "hz-gb2312"},
{identifier.Big5, "sunzi-bingfa-traditional", "big5"},
{identifier.UTF16LE, "candide", "utf-16le"},
{identifier.UTF8, "candide", "utf-8"},
{identifier.UTF32BE, "candide", "utf-32be"},
// GB18030 is a superset of GBK and is nominally a Simplified Chinese
// encoding, but it can also represent the entire Basic Multilingual
// Plane, including codepoints like 'â' that aren't encodable by GBK.
// GB18030 on Simplified Chinese should perform similarly to GBK on
// Simplified Chinese. GB18030 on "candide" is more interesting.
{identifier.GB18030, "candide", "gb18030"},
}
func load(direction string, enc encoding.Encoding) ([]byte, []byte, Transcoder, error) {
basename, ext, count := "", "", 0
for _, tf := range testdataFiles {
if mib, _ := enc.(identifier.Interface).ID(); tf.mib == mib {
basename, ext = tf.basename, tf.ext
count++
}
}
if count != 1 {
if count == 0 {
return nil, nil, nil, fmt.Errorf("no testdataFiles for %s", enc)
}
return nil, nil, nil, fmt.Errorf("too many testdataFiles for %s", enc)
}
dstFile := fmt.Sprintf("../testdata/%s-%s.txt", basename, ext)
srcFile := fmt.Sprintf("../testdata/%s-utf-8.txt", basename)
var coder Transcoder = encoding.ReplaceUnsupported(enc.NewEncoder())
if direction == "Decode" {
dstFile, srcFile = srcFile, dstFile
coder = enc.NewDecoder()
}
dst, err := ioutil.ReadFile(dstFile)
if err != nil {
if dst, err = ioutil.ReadFile("../" + dstFile); err != nil {
return nil, nil, nil, err
}
}
src, err := ioutil.ReadFile(srcFile)
if err != nil {
if src, err = ioutil.ReadFile("../" + srcFile); err != nil {
return nil, nil, nil, err
}
}
return dst, src, coder, nil
}
func trim(s string) string {
if len(s) < 120 {
return s
}
return s[:50] + "..." + s[len(s)-50:]
}
+137
View File
@@ -0,0 +1,137 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
import (
"bytes"
"encoding/xml"
"fmt"
"io"
"log"
"strings"
"golang.org/x/text/internal/gen"
)
type registry struct {
XMLName xml.Name `xml:"registry"`
Updated string `xml:"updated"`
Registry []struct {
ID string `xml:"id,attr"`
Record []struct {
Name string `xml:"name"`
Xref []struct {
Type string `xml:"type,attr"`
Data string `xml:"data,attr"`
} `xml:"xref"`
Desc struct {
Data string `xml:",innerxml"`
// Any []struct {
// Data string `xml:",chardata"`
// } `xml:",any"`
// Data string `xml:",chardata"`
} `xml:"description,"`
MIB string `xml:"value"`
Alias []string `xml:"alias"`
MIME string `xml:"preferred_alias"`
} `xml:"record"`
} `xml:"registry"`
}
func main() {
r := gen.OpenIANAFile("assignments/character-sets/character-sets.xml")
reg := &registry{}
if err := xml.NewDecoder(r).Decode(&reg); err != nil && err != io.EOF {
log.Fatalf("Error decoding charset registry: %v", err)
}
if len(reg.Registry) == 0 || reg.Registry[0].ID != "character-sets-1" {
log.Fatalf("Unexpected ID %s", reg.Registry[0].ID)
}
w := &bytes.Buffer{}
fmt.Fprintf(w, "const (\n")
for _, rec := range reg.Registry[0].Record {
constName := ""
for _, a := range rec.Alias {
if strings.HasPrefix(a, "cs") && strings.IndexByte(a, '-') == -1 {
// Some of the constant definitions have comments in them. Strip those.
constName = strings.Title(strings.SplitN(a[2:], "\n", 2)[0])
}
}
if constName == "" {
switch rec.MIB {
case "2085":
constName = "HZGB2312" // Not listed as alias for some reason.
default:
log.Fatalf("No cs alias defined for %s.", rec.MIB)
}
}
if rec.MIME != "" {
rec.MIME = fmt.Sprintf(" (MIME: %s)", rec.MIME)
}
fmt.Fprintf(w, "// %s is the MIB identifier with IANA name %s%s.\n//\n", constName, rec.Name, rec.MIME)
if len(rec.Desc.Data) > 0 {
fmt.Fprint(w, "// ")
d := xml.NewDecoder(strings.NewReader(rec.Desc.Data))
inElem := true
attr := ""
for {
t, err := d.Token()
if err != nil {
if err != io.EOF {
log.Fatal(err)
}
break
}
switch x := t.(type) {
case xml.CharData:
attr = "" // Don't need attribute info.
a := bytes.Split([]byte(x), []byte("\n"))
for i, b := range a {
if b = bytes.TrimSpace(b); len(b) != 0 {
if !inElem && i > 0 {
fmt.Fprint(w, "\n// ")
}
inElem = false
fmt.Fprintf(w, "%s ", string(b))
}
}
case xml.StartElement:
if x.Name.Local == "xref" {
inElem = true
use := false
for _, a := range x.Attr {
if a.Name.Local == "type" {
use = use || a.Value != "person"
}
if a.Name.Local == "data" && use {
attr = a.Value + " "
}
}
}
case xml.EndElement:
inElem = false
fmt.Fprint(w, attr)
}
}
fmt.Fprint(w, "\n")
}
for _, x := range rec.Xref {
switch x.Type {
case "rfc":
fmt.Fprintf(w, "// Reference: %s\n", strings.ToUpper(x.Data))
case "uri":
fmt.Fprintf(w, "// Reference: %s\n", x.Data)
}
}
fmt.Fprintf(w, "%s MIB = %s\n", constName, rec.MIB)
fmt.Fprintln(w)
}
fmt.Fprintln(w, ")")
gen.WriteGoFile("mib.go", "identifier", w.Bytes())
}
+81
View File
@@ -0,0 +1,81 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:generate go run gen.go
// Package identifier defines the contract between implementations of Encoding
// and Index by defining identifiers that uniquely identify standardized coded
// character sets (CCS) and character encoding schemes (CES), which we will
// together refer to as encodings, for which Encoding implementations provide
// converters to and from UTF-8. This package is typically only of concern to
// implementers of Indexes and Encodings.
//
// One part of the identifier is the MIB code, which is defined by IANA and
// uniquely identifies a CCS or CES. Each code is associated with data that
// references authorities, official documentation as well as aliases and MIME
// names.
//
// Not all CESs are covered by the IANA registry. The "other" string that is
// returned by ID can be used to identify other character sets or versions of
// existing ones.
//
// It is recommended that each package that provides a set of Encodings provide
// the All and Common variables to reference all supported encodings and
// commonly used subset. This allows Index implementations to include all
// available encodings without explicitly referencing or knowing about them.
package identifier
// Note: this package is internal, but could be made public if there is a need
// for writing third-party Indexes and Encodings.
// References:
// - http://source.icu-project.org/repos/icu/icu/trunk/source/data/mappings/convrtrs.txt
// - http://www.iana.org/assignments/character-sets/character-sets.xhtml
// - http://www.iana.org/assignments/ianacharset-mib/ianacharset-mib
// - http://www.ietf.org/rfc/rfc2978.txt
// - http://www.unicode.org/reports/tr22/
// - http://www.w3.org/TR/encoding/
// - https://encoding.spec.whatwg.org/
// - https://encoding.spec.whatwg.org/encodings.json
// - https://tools.ietf.org/html/rfc6657#section-5
// Interface can be implemented by Encodings to define the CCS or CES for which
// it implements conversions.
type Interface interface {
// ID returns an encoding identifier. Exactly one of the mib and other
// values should be non-zero.
//
// In the usual case it is only necessary to indicate the MIB code. The
// other string can be used to specify encodings for which there is no MIB,
// such as "x-mac-dingbat".
//
// The other string may only contain the characters a-z, A-Z, 0-9, - and _.
ID() (mib MIB, other string)
// NOTE: the restrictions on the encoding are to allow extending the syntax
// with additional information such as versions, vendors and other variants.
}
// A MIB identifies an encoding. It is derived from the IANA MIB codes and adds
// some identifiers for some encodings that are not covered by the IANA
// standard.
//
// See http://www.iana.org/assignments/ianacharset-mib.
type MIB uint16
// These additional MIB types are not defined in IANA. They are added because
// they are common and defined within the text repo.
const (
// Unofficial marks the start of encodings not registered by IANA.
Unofficial MIB = 10000 + iota
// Replacement is the WhatWG replacement encoding.
Replacement
// XUserDefined is the code for x-user-defined.
XUserDefined
// MacintoshCyrillic is the code for x-mac-cyrillic.
MacintoshCyrillic
)
File diff suppressed because it is too large Load Diff
+75
View File
@@ -0,0 +1,75 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package internal contains code that is shared among encoding implementations.
package internal
import (
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// Encoding is an implementation of the Encoding interface that adds the String
// and ID methods to an existing encoding.
type Encoding struct {
encoding.Encoding
Name string
MIB identifier.MIB
}
// _ verifies that Encoding implements identifier.Interface.
var _ identifier.Interface = (*Encoding)(nil)
func (e *Encoding) String() string {
return e.Name
}
func (e *Encoding) ID() (mib identifier.MIB, other string) {
return e.MIB, ""
}
// SimpleEncoding is an Encoding that combines two Transformers.
type SimpleEncoding struct {
Decoder transform.Transformer
Encoder transform.Transformer
}
func (e *SimpleEncoding) NewDecoder() *encoding.Decoder {
return &encoding.Decoder{Transformer: e.Decoder}
}
func (e *SimpleEncoding) NewEncoder() *encoding.Encoder {
return &encoding.Encoder{Transformer: e.Encoder}
}
// FuncEncoding is an Encoding that combines two functions returning a new
// Transformer.
type FuncEncoding struct {
Decoder func() transform.Transformer
Encoder func() transform.Transformer
}
func (e FuncEncoding) NewDecoder() *encoding.Decoder {
return &encoding.Decoder{Transformer: e.Decoder()}
}
func (e FuncEncoding) NewEncoder() *encoding.Encoder {
return &encoding.Encoder{Transformer: e.Encoder()}
}
// A RepertoireError indicates a rune is not in the repertoire of a destination
// encoding. It is associated with an encoding-specific suggested replacement
// byte.
type RepertoireError byte
// Error implements the error interrface.
func (r RepertoireError) Error() string {
return "encoding: rune not supported by encoding."
}
// Replacement returns the replacement string associated with this error.
func (r RepertoireError) Replacement() byte { return byte(r) }
var ErrASCIIReplacement = RepertoireError(encoding.ASCIISub)
+12
View File
@@ -0,0 +1,12 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package japanese
import (
"golang.org/x/text/encoding"
)
// All is a list of all defined encodings in this package.
var All = []encoding.Encoding{EUCJP, ISO2022JP, ShiftJIS}
+225
View File
@@ -0,0 +1,225 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package japanese
import (
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// EUCJP is the EUC-JP encoding.
var EUCJP encoding.Encoding = &eucJP
var eucJP = internal.Encoding{
&internal.SimpleEncoding{eucJPDecoder{}, eucJPEncoder{}},
"EUC-JP",
identifier.EUCPkdFmtJapanese,
}
type eucJPDecoder struct{ transform.NopResetter }
// See https://encoding.spec.whatwg.org/#euc-jp-decoder.
func (eucJPDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
loop:
for ; nSrc < len(src); nSrc += size {
switch c0 := src[nSrc]; {
case c0 < utf8.RuneSelf:
r, size = rune(c0), 1
case c0 == 0x8e:
if nSrc+1 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
r, size = utf8.RuneError, 1
break
}
c1 := src[nSrc+1]
switch {
case c1 < 0xa1:
r, size = utf8.RuneError, 1
case c1 > 0xdf:
r, size = utf8.RuneError, 2
if c1 == 0xff {
size = 1
}
default:
r, size = rune(c1)+(0xff61-0xa1), 2
}
case c0 == 0x8f:
if nSrc+2 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
r, size = utf8.RuneError, 1
if p := nSrc + 1; p < len(src) && 0xa1 <= src[p] && src[p] < 0xfe {
size = 2
}
break
}
c1 := src[nSrc+1]
if c1 < 0xa1 || 0xfe < c1 {
r, size = utf8.RuneError, 1
break
}
c2 := src[nSrc+2]
if c2 < 0xa1 || 0xfe < c2 {
r, size = utf8.RuneError, 2
break
}
r, size = utf8.RuneError, 3
if i := int(c1-0xa1)*94 + int(c2-0xa1); i < len(jis0212Decode) {
r = rune(jis0212Decode[i])
if r == 0 {
r = utf8.RuneError
}
}
case 0xa1 <= c0 && c0 <= 0xfe:
if nSrc+1 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
r, size = utf8.RuneError, 1
break
}
c1 := src[nSrc+1]
if c1 < 0xa1 || 0xfe < c1 {
r, size = utf8.RuneError, 1
break
}
r, size = utf8.RuneError, 2
if i := int(c0-0xa1)*94 + int(c1-0xa1); i < len(jis0208Decode) {
r = rune(jis0208Decode[i])
if r == 0 {
r = utf8.RuneError
}
}
default:
r, size = utf8.RuneError, 1
}
if nDst+utf8.RuneLen(r) > len(dst) {
err = transform.ErrShortDst
break loop
}
nDst += utf8.EncodeRune(dst[nDst:], r)
}
return nDst, nSrc, err
}
type eucJPEncoder struct{ transform.NopResetter }
func (eucJPEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
for ; nSrc < len(src); nSrc += size {
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
size = 1
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
}
// func init checks that the switch covers all tables.
switch {
case encode0Low <= r && r < encode0High:
if r = rune(encode0[r-encode0Low]); r != 0 {
goto write2or3
}
case encode1Low <= r && r < encode1High:
if r = rune(encode1[r-encode1Low]); r != 0 {
goto write2or3
}
case encode2Low <= r && r < encode2High:
if r = rune(encode2[r-encode2Low]); r != 0 {
goto write2or3
}
case encode3Low <= r && r < encode3High:
if r = rune(encode3[r-encode3Low]); r != 0 {
goto write2or3
}
case encode4Low <= r && r < encode4High:
if r = rune(encode4[r-encode4Low]); r != 0 {
goto write2or3
}
case encode5Low <= r && r < encode5High:
if 0xff61 <= r && r < 0xffa0 {
goto write2
}
if r = rune(encode5[r-encode5Low]); r != 0 {
goto write2or3
}
}
err = internal.ErrASCIIReplacement
break
}
if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = uint8(r)
nDst++
continue
write2or3:
if r>>tableShift == jis0208 {
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
} else {
if nDst+3 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = 0x8f
nDst++
}
dst[nDst+0] = 0xa1 + uint8(r>>codeShift)&codeMask
dst[nDst+1] = 0xa1 + uint8(r)&codeMask
nDst += 2
continue
write2:
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = 0x8e
dst[nDst+1] = uint8(r - (0xff61 - 0xa1))
nDst += 2
continue
}
return nDst, nSrc, err
}
func init() {
// Check that the hard-coded encode switch covers all tables.
if numEncodeTables != 6 {
panic("bad numEncodeTables")
}
}
+299
View File
@@ -0,0 +1,299 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package japanese
import (
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// ISO2022JP is the ISO-2022-JP encoding.
var ISO2022JP encoding.Encoding = &iso2022JP
var iso2022JP = internal.Encoding{
internal.FuncEncoding{iso2022JPNewDecoder, iso2022JPNewEncoder},
"ISO-2022-JP",
identifier.ISO2022JP,
}
func iso2022JPNewDecoder() transform.Transformer {
return new(iso2022JPDecoder)
}
func iso2022JPNewEncoder() transform.Transformer {
return new(iso2022JPEncoder)
}
const (
asciiState = iota
katakanaState
jis0208State
jis0212State
)
const asciiEsc = 0x1b
type iso2022JPDecoder int
func (d *iso2022JPDecoder) Reset() {
*d = asciiState
}
func (d *iso2022JPDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
for ; nSrc < len(src); nSrc += size {
c0 := src[nSrc]
if c0 >= utf8.RuneSelf {
r, size = '\ufffd', 1
goto write
}
if c0 == asciiEsc {
if nSrc+2 >= len(src) {
if !atEOF {
return nDst, nSrc, transform.ErrShortSrc
}
// TODO: is it correct to only skip 1??
r, size = '\ufffd', 1
goto write
}
size = 3
c1 := src[nSrc+1]
c2 := src[nSrc+2]
switch {
case c1 == '$' && (c2 == '@' || c2 == 'B'): // 0x24 {0x40, 0x42}
*d = jis0208State
continue
case c1 == '$' && c2 == '(': // 0x24 0x28
if nSrc+3 >= len(src) {
if !atEOF {
return nDst, nSrc, transform.ErrShortSrc
}
r, size = '\ufffd', 1
goto write
}
size = 4
if src[nSrc+3] == 'D' {
*d = jis0212State
continue
}
case c1 == '(' && (c2 == 'B' || c2 == 'J'): // 0x28 {0x42, 0x4A}
*d = asciiState
continue
case c1 == '(' && c2 == 'I': // 0x28 0x49
*d = katakanaState
continue
}
r, size = '\ufffd', 1
goto write
}
switch *d {
case asciiState:
r, size = rune(c0), 1
case katakanaState:
if c0 < 0x21 || 0x60 <= c0 {
r, size = '\ufffd', 1
goto write
}
r, size = rune(c0)+(0xff61-0x21), 1
default:
if c0 == 0x0a {
*d = asciiState
r, size = rune(c0), 1
goto write
}
if nSrc+1 >= len(src) {
if !atEOF {
return nDst, nSrc, transform.ErrShortSrc
}
r, size = '\ufffd', 1
goto write
}
size = 2
c1 := src[nSrc+1]
i := int(c0-0x21)*94 + int(c1-0x21)
if *d == jis0208State && i < len(jis0208Decode) {
r = rune(jis0208Decode[i])
} else if *d == jis0212State && i < len(jis0212Decode) {
r = rune(jis0212Decode[i])
} else {
r = '\ufffd'
goto write
}
if r == 0 {
r = '\ufffd'
}
}
write:
if nDst+utf8.RuneLen(r) > len(dst) {
return nDst, nSrc, transform.ErrShortDst
}
nDst += utf8.EncodeRune(dst[nDst:], r)
}
return nDst, nSrc, err
}
type iso2022JPEncoder int
func (e *iso2022JPEncoder) Reset() {
*e = asciiState
}
func (e *iso2022JPEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
for ; nSrc < len(src); nSrc += size {
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
size = 1
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
}
// func init checks that the switch covers all tables.
//
// http://encoding.spec.whatwg.org/#iso-2022-jp says that "the index jis0212
// is not used by the iso-2022-jp encoder due to lack of widespread support".
//
// TODO: do we have to special-case U+00A5 and U+203E, as per
// http://encoding.spec.whatwg.org/#iso-2022-jp
// Doing so would mean that "\u00a5" would not be preserved
// after an encode-decode round trip.
switch {
case encode0Low <= r && r < encode0High:
if r = rune(encode0[r-encode0Low]); r>>tableShift == jis0208 {
goto writeJIS
}
case encode1Low <= r && r < encode1High:
if r = rune(encode1[r-encode1Low]); r>>tableShift == jis0208 {
goto writeJIS
}
case encode2Low <= r && r < encode2High:
if r = rune(encode2[r-encode2Low]); r>>tableShift == jis0208 {
goto writeJIS
}
case encode3Low <= r && r < encode3High:
if r = rune(encode3[r-encode3Low]); r>>tableShift == jis0208 {
goto writeJIS
}
case encode4Low <= r && r < encode4High:
if r = rune(encode4[r-encode4Low]); r>>tableShift == jis0208 {
goto writeJIS
}
case encode5Low <= r && r < encode5High:
if 0xff61 <= r && r < 0xffa0 {
goto writeKatakana
}
if r = rune(encode5[r-encode5Low]); r>>tableShift == jis0208 {
goto writeJIS
}
}
// Switch back to ASCII state in case of error so that an ASCII
// replacement character can be written in the correct state.
if *e != asciiState {
if nDst+3 > len(dst) {
err = transform.ErrShortDst
break
}
*e = asciiState
dst[nDst+0] = asciiEsc
dst[nDst+1] = '('
dst[nDst+2] = 'B'
nDst += 3
}
err = internal.ErrASCIIReplacement
break
}
if *e != asciiState {
if nDst+4 > len(dst) {
err = transform.ErrShortDst
break
}
*e = asciiState
dst[nDst+0] = asciiEsc
dst[nDst+1] = '('
dst[nDst+2] = 'B'
nDst += 3
} else if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = uint8(r)
nDst++
continue
writeJIS:
if *e != jis0208State {
if nDst+5 > len(dst) {
err = transform.ErrShortDst
break
}
*e = jis0208State
dst[nDst+0] = asciiEsc
dst[nDst+1] = '$'
dst[nDst+2] = 'B'
nDst += 3
} else if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = 0x21 + uint8(r>>codeShift)&codeMask
dst[nDst+1] = 0x21 + uint8(r)&codeMask
nDst += 2
continue
writeKatakana:
if *e != katakanaState {
if nDst+4 > len(dst) {
err = transform.ErrShortDst
break
}
*e = katakanaState
dst[nDst+0] = asciiEsc
dst[nDst+1] = '('
dst[nDst+2] = 'I'
nDst += 3
} else if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = uint8(r - (0xff61 - 0x21))
nDst++
continue
}
if atEOF && err == nil && *e != asciiState {
if nDst+3 > len(dst) {
err = transform.ErrShortDst
} else {
*e = asciiState
dst[nDst+0] = asciiEsc
dst[nDst+1] = '('
dst[nDst+2] = 'B'
nDst += 3
}
}
return nDst, nSrc, err
}
+161
View File
@@ -0,0 +1,161 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
// This program generates tables.go:
// go run maketables.go | gofmt > tables.go
// TODO: Emoji extensions?
// http://www.unicode.org/faq/emoji_dingbats.html
// http://www.unicode.org/Public/UNIDATA/EmojiSources.txt
import (
"bufio"
"fmt"
"log"
"net/http"
"sort"
"strings"
)
type entry struct {
jisCode, table int
}
func main() {
fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n")
fmt.Printf("// Package japanese provides Japanese encodings such as EUC-JP and Shift JIS.\n")
fmt.Printf(`package japanese // import "golang.org/x/text/encoding/japanese"` + "\n\n")
reverse := [65536]entry{}
for i := range reverse {
reverse[i].table = -1
}
tables := []struct {
url string
name string
}{
{"http://encoding.spec.whatwg.org/index-jis0208.txt", "0208"},
{"http://encoding.spec.whatwg.org/index-jis0212.txt", "0212"},
}
for i, table := range tables {
res, err := http.Get(table.url)
if err != nil {
log.Fatalf("%q: Get: %v", table.url, err)
}
defer res.Body.Close()
mapping := [65536]uint16{}
scanner := bufio.NewScanner(res.Body)
for scanner.Scan() {
s := strings.TrimSpace(scanner.Text())
if s == "" || s[0] == '#' {
continue
}
x, y := 0, uint16(0)
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil {
log.Fatalf("%q: could not parse %q", table.url, s)
}
if x < 0 || 120*94 <= x {
log.Fatalf("%q: JIS code %d is out of range", table.url, x)
}
mapping[x] = y
if reverse[y].table == -1 {
reverse[y] = entry{jisCode: x, table: i}
}
}
if err := scanner.Err(); err != nil {
log.Fatalf("%q: scanner error: %v", table.url, err)
}
fmt.Printf("// jis%sDecode is the decoding table from JIS %s code to Unicode.\n// It is defined at %s\n",
table.name, table.name, table.url)
fmt.Printf("var jis%sDecode = [...]uint16{\n", table.name)
for i, m := range mapping {
if m != 0 {
fmt.Printf("\t%d: 0x%04X,\n", i, m)
}
}
fmt.Printf("}\n\n")
}
// Any run of at least separation continuous zero entries in the reverse map will
// be a separate encode table.
const separation = 1024
intervals := []interval(nil)
low, high := -1, -1
for i, v := range reverse {
if v.table == -1 {
continue
}
if low < 0 {
low = i
} else if i-high >= separation {
if high >= 0 {
intervals = append(intervals, interval{low, high})
}
low = i
}
high = i + 1
}
if high >= 0 {
intervals = append(intervals, interval{low, high})
}
sort.Sort(byDecreasingLength(intervals))
fmt.Printf("const (\n")
fmt.Printf("\tjis0208 = 1\n")
fmt.Printf("\tjis0212 = 2\n")
fmt.Printf("\tcodeMask = 0x7f\n")
fmt.Printf("\tcodeShift = 7\n")
fmt.Printf("\ttableShift = 14\n")
fmt.Printf(")\n\n")
fmt.Printf("const numEncodeTables = %d\n\n", len(intervals))
fmt.Printf("// encodeX are the encoding tables from Unicode to JIS code,\n")
fmt.Printf("// sorted by decreasing length.\n")
for i, v := range intervals {
fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high)
}
fmt.Printf("//\n")
fmt.Printf("// The high two bits of the value record whether the JIS code comes from the\n")
fmt.Printf("// JIS0208 table (high bits == 1) or the JIS0212 table (high bits == 2).\n")
fmt.Printf("// The low 14 bits are two 7-bit unsigned integers j1 and j2 that form the\n")
fmt.Printf("// JIS code (94*j1 + j2) within that table.\n")
fmt.Printf("\n")
for i, v := range intervals {
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
fmt.Printf("var encode%d = [...]uint16{\n", i)
for j := v.low; j < v.high; j++ {
x := reverse[j]
if x.table == -1 {
continue
}
fmt.Printf("\t%d - %d: jis%s<<14 | 0x%02X<<7 | 0x%02X,\n",
j, v.low, tables[x.table].name, x.jisCode/94, x.jisCode%94)
}
fmt.Printf("}\n\n")
}
}
// interval is a half-open interval [low, high).
type interval struct {
low, high int
}
func (i interval) len() int { return i.high - i.low }
// byDecreasingLength sorts intervals by decreasing length.
type byDecreasingLength []interval
func (b byDecreasingLength) Len() int { return len(b) }
func (b byDecreasingLength) Less(i, j int) bool { return b[i].len() > b[j].len() }
func (b byDecreasingLength) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
+189
View File
@@ -0,0 +1,189 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package japanese
import (
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// ShiftJIS is the Shift JIS encoding, also known as Code Page 932 and
// Windows-31J.
var ShiftJIS encoding.Encoding = &shiftJIS
var shiftJIS = internal.Encoding{
&internal.SimpleEncoding{shiftJISDecoder{}, shiftJISEncoder{}},
"Shift JIS",
identifier.ShiftJIS,
}
type shiftJISDecoder struct{ transform.NopResetter }
func (shiftJISDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
loop:
for ; nSrc < len(src); nSrc += size {
switch c0 := src[nSrc]; {
case c0 < utf8.RuneSelf:
r, size = rune(c0), 1
case 0xa1 <= c0 && c0 < 0xe0:
r, size = rune(c0)+(0xff61-0xa1), 1
case (0x81 <= c0 && c0 < 0xa0) || (0xe0 <= c0 && c0 < 0xfd):
if c0 <= 0x9f {
c0 -= 0x70
} else {
c0 -= 0xb0
}
c0 = 2*c0 - 0x21
if nSrc+1 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
r, size = '\ufffd', 1
goto write
}
c1 := src[nSrc+1]
switch {
case c1 < 0x40:
r, size = '\ufffd', 1 // c1 is ASCII so output on next round
goto write
case c1 < 0x7f:
c0--
c1 -= 0x40
case c1 == 0x7f:
r, size = '\ufffd', 1 // c1 is ASCII so output on next round
goto write
case c1 < 0x9f:
c0--
c1 -= 0x41
case c1 < 0xfd:
c1 -= 0x9f
default:
r, size = '\ufffd', 2
goto write
}
r, size = '\ufffd', 2
if i := int(c0)*94 + int(c1); i < len(jis0208Decode) {
r = rune(jis0208Decode[i])
if r == 0 {
r = '\ufffd'
}
}
case c0 == 0x80:
r, size = 0x80, 1
default:
r, size = '\ufffd', 1
}
write:
if nDst+utf8.RuneLen(r) > len(dst) {
err = transform.ErrShortDst
break loop
}
nDst += utf8.EncodeRune(dst[nDst:], r)
}
return nDst, nSrc, err
}
type shiftJISEncoder struct{ transform.NopResetter }
func (shiftJISEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
loop:
for ; nSrc < len(src); nSrc += size {
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
size = 1
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break loop
}
}
// func init checks that the switch covers all tables.
switch {
case encode0Low <= r && r < encode0High:
if r = rune(encode0[r-encode0Low]); r>>tableShift == jis0208 {
goto write2
}
case encode1Low <= r && r < encode1High:
if r = rune(encode1[r-encode1Low]); r>>tableShift == jis0208 {
goto write2
}
case encode2Low <= r && r < encode2High:
if r = rune(encode2[r-encode2Low]); r>>tableShift == jis0208 {
goto write2
}
case encode3Low <= r && r < encode3High:
if r = rune(encode3[r-encode3Low]); r>>tableShift == jis0208 {
goto write2
}
case encode4Low <= r && r < encode4High:
if r = rune(encode4[r-encode4Low]); r>>tableShift == jis0208 {
goto write2
}
case encode5Low <= r && r < encode5High:
if 0xff61 <= r && r < 0xffa0 {
r -= 0xff61 - 0xa1
goto write1
}
if r = rune(encode5[r-encode5Low]); r>>tableShift == jis0208 {
goto write2
}
}
err = internal.ErrASCIIReplacement
break
}
write1:
if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = uint8(r)
nDst++
continue
write2:
j1 := uint8(r>>codeShift) & codeMask
j2 := uint8(r) & codeMask
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break loop
}
if j1 <= 61 {
dst[nDst+0] = 129 + j1/2
} else {
dst[nDst+0] = 193 + j1/2
}
if j1&1 == 0 {
dst[nDst+1] = j2 + j2/63 + 64
} else {
dst[nDst+1] = j2 + 159
}
nDst += 2
continue
}
return nDst, nSrc, err
}
File diff suppressed because it is too large Load Diff
+177
View File
@@ -0,0 +1,177 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package korean
import (
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// All is a list of all defined encodings in this package.
var All = []encoding.Encoding{EUCKR}
// EUCKR is the EUC-KR encoding, also known as Code Page 949.
var EUCKR encoding.Encoding = &eucKR
var eucKR = internal.Encoding{
&internal.SimpleEncoding{eucKRDecoder{}, eucKREncoder{}},
"EUC-KR",
identifier.EUCKR,
}
type eucKRDecoder struct{ transform.NopResetter }
func (eucKRDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
loop:
for ; nSrc < len(src); nSrc += size {
switch c0 := src[nSrc]; {
case c0 < utf8.RuneSelf:
r, size = rune(c0), 1
case 0x81 <= c0 && c0 < 0xff:
if nSrc+1 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
r, size = utf8.RuneError, 1
break
}
c1 := src[nSrc+1]
size = 2
if c0 < 0xc7 {
r = 178 * rune(c0-0x81)
switch {
case 0x41 <= c1 && c1 < 0x5b:
r += rune(c1) - (0x41 - 0*26)
case 0x61 <= c1 && c1 < 0x7b:
r += rune(c1) - (0x61 - 1*26)
case 0x81 <= c1 && c1 < 0xff:
r += rune(c1) - (0x81 - 2*26)
default:
goto decError
}
} else if 0xa1 <= c1 && c1 < 0xff {
r = 178*(0xc7-0x81) + rune(c0-0xc7)*94 + rune(c1-0xa1)
} else {
goto decError
}
if int(r) < len(decode) {
r = rune(decode[r])
if r != 0 {
break
}
}
decError:
r = utf8.RuneError
if c1 < utf8.RuneSelf {
size = 1
}
default:
r, size = utf8.RuneError, 1
break
}
if nDst+utf8.RuneLen(r) > len(dst) {
err = transform.ErrShortDst
break
}
nDst += utf8.EncodeRune(dst[nDst:], r)
}
return nDst, nSrc, err
}
type eucKREncoder struct{ transform.NopResetter }
func (eucKREncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
for ; nSrc < len(src); nSrc += size {
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
size = 1
if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = uint8(r)
nDst++
continue
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
}
// func init checks that the switch covers all tables.
switch {
case encode0Low <= r && r < encode0High:
if r = rune(encode0[r-encode0Low]); r != 0 {
goto write2
}
case encode1Low <= r && r < encode1High:
if r = rune(encode1[r-encode1Low]); r != 0 {
goto write2
}
case encode2Low <= r && r < encode2High:
if r = rune(encode2[r-encode2Low]); r != 0 {
goto write2
}
case encode3Low <= r && r < encode3High:
if r = rune(encode3[r-encode3Low]); r != 0 {
goto write2
}
case encode4Low <= r && r < encode4High:
if r = rune(encode4[r-encode4Low]); r != 0 {
goto write2
}
case encode5Low <= r && r < encode5High:
if r = rune(encode5[r-encode5Low]); r != 0 {
goto write2
}
case encode6Low <= r && r < encode6High:
if r = rune(encode6[r-encode6Low]); r != 0 {
goto write2
}
}
err = internal.ErrASCIIReplacement
break
}
write2:
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = uint8(r >> 8)
dst[nDst+1] = uint8(r)
nDst += 2
continue
}
return nDst, nSrc, err
}
func init() {
// Check that the hard-coded encode switch covers all tables.
if numEncodeTables != 7 {
panic("bad numEncodeTables")
}
}
+143
View File
@@ -0,0 +1,143 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
// This program generates tables.go:
// go run maketables.go | gofmt > tables.go
import (
"bufio"
"fmt"
"log"
"net/http"
"sort"
"strings"
)
func main() {
fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n")
fmt.Printf("// Package korean provides Korean encodings such as EUC-KR.\n")
fmt.Printf(`package korean // import "golang.org/x/text/encoding/korean"` + "\n\n")
res, err := http.Get("http://encoding.spec.whatwg.org/index-euc-kr.txt")
if err != nil {
log.Fatalf("Get: %v", err)
}
defer res.Body.Close()
mapping := [65536]uint16{}
reverse := [65536]uint16{}
scanner := bufio.NewScanner(res.Body)
for scanner.Scan() {
s := strings.TrimSpace(scanner.Text())
if s == "" || s[0] == '#' {
continue
}
x, y := uint16(0), uint16(0)
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil {
log.Fatalf("could not parse %q", s)
}
if x < 0 || 178*(0xc7-0x81)+(0xfe-0xc7)*94+(0xff-0xa1) <= x {
log.Fatalf("EUC-KR code %d is out of range", x)
}
mapping[x] = y
if reverse[y] == 0 {
c0, c1 := uint16(0), uint16(0)
if x < 178*(0xc7-0x81) {
c0 = uint16(x/178) + 0x81
c1 = uint16(x % 178)
switch {
case c1 < 1*26:
c1 += 0x41
case c1 < 2*26:
c1 += 0x47
default:
c1 += 0x4d
}
} else {
x -= 178 * (0xc7 - 0x81)
c0 = uint16(x/94) + 0xc7
c1 = uint16(x%94) + 0xa1
}
reverse[y] = c0<<8 | c1
}
}
if err := scanner.Err(); err != nil {
log.Fatalf("scanner error: %v", err)
}
fmt.Printf("// decode is the decoding table from EUC-KR code to Unicode.\n")
fmt.Printf("// It is defined at http://encoding.spec.whatwg.org/index-euc-kr.txt\n")
fmt.Printf("var decode = [...]uint16{\n")
for i, v := range mapping {
if v != 0 {
fmt.Printf("\t%d: 0x%04X,\n", i, v)
}
}
fmt.Printf("}\n\n")
// Any run of at least separation continuous zero entries in the reverse map will
// be a separate encode table.
const separation = 1024
intervals := []interval(nil)
low, high := -1, -1
for i, v := range reverse {
if v == 0 {
continue
}
if low < 0 {
low = i
} else if i-high >= separation {
if high >= 0 {
intervals = append(intervals, interval{low, high})
}
low = i
}
high = i + 1
}
if high >= 0 {
intervals = append(intervals, interval{low, high})
}
sort.Sort(byDecreasingLength(intervals))
fmt.Printf("const numEncodeTables = %d\n\n", len(intervals))
fmt.Printf("// encodeX are the encoding tables from Unicode to EUC-KR code,\n")
fmt.Printf("// sorted by decreasing length.\n")
for i, v := range intervals {
fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high)
}
fmt.Printf("\n")
for i, v := range intervals {
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
fmt.Printf("var encode%d = [...]uint16{\n", i)
for j := v.low; j < v.high; j++ {
x := reverse[j]
if x == 0 {
continue
}
fmt.Printf("\t%d-%d: 0x%04X,\n", j, v.low, x)
}
fmt.Printf("}\n\n")
}
}
// interval is a half-open interval [low, high).
type interval struct {
low, high int
}
func (i interval) len() int { return i.high - i.low }
// byDecreasingLength sorts intervals by decreasing length.
type byDecreasingLength []interval
func (b byDecreasingLength) Len() int { return len(b) }
func (b byDecreasingLength) Less(i, j int) bool { return b[i].len() > b[j].len() }
func (b byDecreasingLength) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
+34152
View File
File diff suppressed because it is too large Load Diff
+12
View File
@@ -0,0 +1,12 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package simplifiedchinese
import (
"golang.org/x/text/encoding"
)
// All is a list of all defined encodings in this package.
var All = []encoding.Encoding{GB18030, GBK, HZGB2312}
+269
View File
@@ -0,0 +1,269 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package simplifiedchinese
import (
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
var (
// GB18030 is the GB18030 encoding.
GB18030 encoding.Encoding = &gbk18030
// GBK is the GBK encoding. It encodes an extension of the GB2312 character set
// and is also known as Code Page 936.
GBK encoding.Encoding = &gbk
)
var gbk = internal.Encoding{
&internal.SimpleEncoding{
gbkDecoder{gb18030: false},
gbkEncoder{gb18030: false},
},
"GBK",
identifier.GBK,
}
var gbk18030 = internal.Encoding{
&internal.SimpleEncoding{
gbkDecoder{gb18030: true},
gbkEncoder{gb18030: true},
},
"GB18030",
identifier.GB18030,
}
type gbkDecoder struct {
transform.NopResetter
gb18030 bool
}
func (d gbkDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
loop:
for ; nSrc < len(src); nSrc += size {
switch c0 := src[nSrc]; {
case c0 < utf8.RuneSelf:
r, size = rune(c0), 1
// Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC
// as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk
// says to treat "gbk" as Code Page 936.
case c0 == 0x80:
r, size = '€', 1
case c0 < 0xff:
if nSrc+1 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
r, size = utf8.RuneError, 1
goto write
}
c1 := src[nSrc+1]
switch {
case 0x40 <= c1 && c1 < 0x7f:
c1 -= 0x40
case 0x80 <= c1 && c1 < 0xff:
c1 -= 0x41
case d.gb18030 && 0x30 <= c1 && c1 < 0x40:
if nSrc+3 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
// The second byte here is always ASCII, so we can set size
// to 1 in all cases.
r, size = utf8.RuneError, 1
goto write
}
c2 := src[nSrc+2]
if c2 < 0x81 || 0xff <= c2 {
r, size = utf8.RuneError, 1
goto write
}
c3 := src[nSrc+3]
if c3 < 0x30 || 0x3a <= c3 {
r, size = utf8.RuneError, 1
goto write
}
size = 4
r = ((rune(c0-0x81)*10+rune(c1-0x30))*126+rune(c2-0x81))*10 + rune(c3-0x30)
if r < 39420 {
i, j := 0, len(gb18030)
for i < j {
h := i + (j-i)/2
if r >= rune(gb18030[h][0]) {
i = h + 1
} else {
j = h
}
}
dec := &gb18030[i-1]
r += rune(dec[1]) - rune(dec[0])
goto write
}
r -= 189000
if 0 <= r && r < 0x100000 {
r += 0x10000
} else {
r, size = utf8.RuneError, 1
}
goto write
default:
r, size = utf8.RuneError, 1
goto write
}
r, size = '\ufffd', 2
if i := int(c0-0x81)*190 + int(c1); i < len(decode) {
r = rune(decode[i])
if r == 0 {
r = '\ufffd'
}
}
default:
r, size = utf8.RuneError, 1
}
write:
if nDst+utf8.RuneLen(r) > len(dst) {
err = transform.ErrShortDst
break loop
}
nDst += utf8.EncodeRune(dst[nDst:], r)
}
return nDst, nSrc, err
}
type gbkEncoder struct {
transform.NopResetter
gb18030 bool
}
func (e gbkEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, r2, size := rune(0), rune(0), 0
for ; nSrc < len(src); nSrc += size {
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
size = 1
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
}
// func init checks that the switch covers all tables.
switch {
case encode0Low <= r && r < encode0High:
if r2 = rune(encode0[r-encode0Low]); r2 != 0 {
goto write2
}
case encode1Low <= r && r < encode1High:
// Microsoft's Code Page 936 extends GBK 1.0 to encode the euro sign U+20AC
// as 0x80. The HTML5 specification at http://encoding.spec.whatwg.org/#gbk
// says to treat "gbk" as Code Page 936.
if r == '€' {
r = 0x80
goto write1
}
if r2 = rune(encode1[r-encode1Low]); r2 != 0 {
goto write2
}
case encode2Low <= r && r < encode2High:
if r2 = rune(encode2[r-encode2Low]); r2 != 0 {
goto write2
}
case encode3Low <= r && r < encode3High:
if r2 = rune(encode3[r-encode3Low]); r2 != 0 {
goto write2
}
case encode4Low <= r && r < encode4High:
if r2 = rune(encode4[r-encode4Low]); r2 != 0 {
goto write2
}
}
if e.gb18030 {
if r < 0x10000 {
i, j := 0, len(gb18030)
for i < j {
h := i + (j-i)/2
if r >= rune(gb18030[h][1]) {
i = h + 1
} else {
j = h
}
}
dec := &gb18030[i-1]
r += rune(dec[0]) - rune(dec[1])
goto write4
} else if r < 0x110000 {
r += 189000 - 0x10000
goto write4
}
}
err = internal.ErrASCIIReplacement
break
}
write1:
if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = uint8(r)
nDst++
continue
write2:
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = uint8(r2 >> 8)
dst[nDst+1] = uint8(r2)
nDst += 2
continue
write4:
if nDst+4 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+3] = uint8(r%10 + 0x30)
r /= 10
dst[nDst+2] = uint8(r%126 + 0x81)
r /= 126
dst[nDst+1] = uint8(r%10 + 0x30)
r /= 10
dst[nDst+0] = uint8(r + 0x81)
nDst += 4
continue
}
return nDst, nSrc, err
}
func init() {
// Check that the hard-coded encode switch covers all tables.
if numEncodeTables != 5 {
panic("bad numEncodeTables")
}
}
+245
View File
@@ -0,0 +1,245 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package simplifiedchinese
import (
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// HZGB2312 is the HZ-GB2312 encoding.
var HZGB2312 encoding.Encoding = &hzGB2312
var hzGB2312 = internal.Encoding{
internal.FuncEncoding{hzGB2312NewDecoder, hzGB2312NewEncoder},
"HZ-GB2312",
identifier.HZGB2312,
}
func hzGB2312NewDecoder() transform.Transformer {
return new(hzGB2312Decoder)
}
func hzGB2312NewEncoder() transform.Transformer {
return new(hzGB2312Encoder)
}
const (
asciiState = iota
gbState
)
type hzGB2312Decoder int
func (d *hzGB2312Decoder) Reset() {
*d = asciiState
}
func (d *hzGB2312Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
loop:
for ; nSrc < len(src); nSrc += size {
c0 := src[nSrc]
if c0 >= utf8.RuneSelf {
r, size = utf8.RuneError, 1
goto write
}
if c0 == '~' {
if nSrc+1 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
r = utf8.RuneError
goto write
}
size = 2
switch src[nSrc+1] {
case '{':
*d = gbState
continue
case '}':
*d = asciiState
continue
case '~':
if nDst >= len(dst) {
err = transform.ErrShortDst
break loop
}
dst[nDst] = '~'
nDst++
continue
case '\n':
continue
default:
r = utf8.RuneError
goto write
}
}
if *d == asciiState {
r, size = rune(c0), 1
} else {
if nSrc+1 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
r, size = utf8.RuneError, 1
goto write
}
size = 2
c1 := src[nSrc+1]
if c0 < 0x21 || 0x7e <= c0 || c1 < 0x21 || 0x7f <= c1 {
// error
} else if i := int(c0-0x01)*190 + int(c1+0x3f); i < len(decode) {
r = rune(decode[i])
if r != 0 {
goto write
}
}
if c1 > utf8.RuneSelf {
// Be consistent and always treat non-ASCII as a single error.
size = 1
}
r = utf8.RuneError
}
write:
if nDst+utf8.RuneLen(r) > len(dst) {
err = transform.ErrShortDst
break loop
}
nDst += utf8.EncodeRune(dst[nDst:], r)
}
return nDst, nSrc, err
}
type hzGB2312Encoder int
func (d *hzGB2312Encoder) Reset() {
*d = asciiState
}
func (e *hzGB2312Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
for ; nSrc < len(src); nSrc += size {
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
size = 1
if r == '~' {
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = '~'
dst[nDst+1] = '~'
nDst += 2
continue
} else if *e != asciiState {
if nDst+3 > len(dst) {
err = transform.ErrShortDst
break
}
*e = asciiState
dst[nDst+0] = '~'
dst[nDst+1] = '}'
nDst += 2
} else if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = uint8(r)
nDst += 1
continue
}
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
}
// func init checks that the switch covers all tables.
switch {
case encode0Low <= r && r < encode0High:
if r = rune(encode0[r-encode0Low]); r != 0 {
goto writeGB
}
case encode1Low <= r && r < encode1High:
if r = rune(encode1[r-encode1Low]); r != 0 {
goto writeGB
}
case encode2Low <= r && r < encode2High:
if r = rune(encode2[r-encode2Low]); r != 0 {
goto writeGB
}
case encode3Low <= r && r < encode3High:
if r = rune(encode3[r-encode3Low]); r != 0 {
goto writeGB
}
case encode4Low <= r && r < encode4High:
if r = rune(encode4[r-encode4Low]); r != 0 {
goto writeGB
}
}
terminateInASCIIState:
// Switch back to ASCII state in case of error so that an ASCII
// replacement character can be written in the correct state.
if *e != asciiState {
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = '~'
dst[nDst+1] = '}'
nDst += 2
}
err = internal.ErrASCIIReplacement
break
writeGB:
c0 := uint8(r>>8) - 0x80
c1 := uint8(r) - 0x80
if c0 < 0x21 || 0x7e <= c0 || c1 < 0x21 || 0x7f <= c1 {
goto terminateInASCIIState
}
if *e == asciiState {
if nDst+4 > len(dst) {
err = transform.ErrShortDst
break
}
*e = gbState
dst[nDst+0] = '~'
dst[nDst+1] = '{'
nDst += 2
} else if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = c0
dst[nDst+1] = c1
nDst += 2
continue
}
// TODO: should one always terminate in ASCII state to make it safe to
// concatenate two HZ-GB2312-encoded strings?
return nDst, nSrc, err
}
+161
View File
@@ -0,0 +1,161 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
// This program generates tables.go:
// go run maketables.go | gofmt > tables.go
import (
"bufio"
"fmt"
"log"
"net/http"
"sort"
"strings"
)
func main() {
fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n")
fmt.Printf("// Package simplifiedchinese provides Simplified Chinese encodings such as GBK.\n")
fmt.Printf(`package simplifiedchinese // import "golang.org/x/text/encoding/simplifiedchinese"` + "\n\n")
printGB18030()
printGBK()
}
func printGB18030() {
res, err := http.Get("http://encoding.spec.whatwg.org/index-gb18030.txt")
if err != nil {
log.Fatalf("Get: %v", err)
}
defer res.Body.Close()
fmt.Printf("// gb18030 is the table from http://encoding.spec.whatwg.org/index-gb18030.txt\n")
fmt.Printf("var gb18030 = [...][2]uint16{\n")
scanner := bufio.NewScanner(res.Body)
for scanner.Scan() {
s := strings.TrimSpace(scanner.Text())
if s == "" || s[0] == '#' {
continue
}
x, y := uint32(0), uint32(0)
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil {
log.Fatalf("could not parse %q", s)
}
if x < 0x10000 && y < 0x10000 {
fmt.Printf("\t{0x%04x, 0x%04x},\n", x, y)
}
}
fmt.Printf("}\n\n")
}
func printGBK() {
res, err := http.Get("http://encoding.spec.whatwg.org/index-gbk.txt")
if err != nil {
log.Fatalf("Get: %v", err)
}
defer res.Body.Close()
mapping := [65536]uint16{}
reverse := [65536]uint16{}
scanner := bufio.NewScanner(res.Body)
for scanner.Scan() {
s := strings.TrimSpace(scanner.Text())
if s == "" || s[0] == '#' {
continue
}
x, y := uint16(0), uint16(0)
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil {
log.Fatalf("could not parse %q", s)
}
if x < 0 || 126*190 <= x {
log.Fatalf("GBK code %d is out of range", x)
}
mapping[x] = y
if reverse[y] == 0 {
c0, c1 := x/190, x%190
if c1 >= 0x3f {
c1++
}
reverse[y] = (0x81+c0)<<8 | (0x40 + c1)
}
}
if err := scanner.Err(); err != nil {
log.Fatalf("scanner error: %v", err)
}
fmt.Printf("// decode is the decoding table from GBK code to Unicode.\n")
fmt.Printf("// It is defined at http://encoding.spec.whatwg.org/index-gbk.txt\n")
fmt.Printf("var decode = [...]uint16{\n")
for i, v := range mapping {
if v != 0 {
fmt.Printf("\t%d: 0x%04X,\n", i, v)
}
}
fmt.Printf("}\n\n")
// Any run of at least separation continuous zero entries in the reverse map will
// be a separate encode table.
const separation = 1024
intervals := []interval(nil)
low, high := -1, -1
for i, v := range reverse {
if v == 0 {
continue
}
if low < 0 {
low = i
} else if i-high >= separation {
if high >= 0 {
intervals = append(intervals, interval{low, high})
}
low = i
}
high = i + 1
}
if high >= 0 {
intervals = append(intervals, interval{low, high})
}
sort.Sort(byDecreasingLength(intervals))
fmt.Printf("const numEncodeTables = %d\n\n", len(intervals))
fmt.Printf("// encodeX are the encoding tables from Unicode to GBK code,\n")
fmt.Printf("// sorted by decreasing length.\n")
for i, v := range intervals {
fmt.Printf("// encode%d: %5d entries for runes in [%5d, %5d).\n", i, v.len(), v.low, v.high)
}
fmt.Printf("\n")
for i, v := range intervals {
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
fmt.Printf("var encode%d = [...]uint16{\n", i)
for j := v.low; j < v.high; j++ {
x := reverse[j]
if x == 0 {
continue
}
fmt.Printf("\t%d-%d: 0x%04X,\n", j, v.low, x)
}
fmt.Printf("}\n\n")
}
}
// interval is a half-open interval [low, high).
type interval struct {
low, high int
}
func (i interval) len() int { return i.high - i.low }
// byDecreasingLength sorts intervals by decreasing length.
type byDecreasingLength []interval
func (b byDecreasingLength) Len() int { return len(b) }
func (b byDecreasingLength) Less(i, j int) bool { return b[i].len() > b[j].len() }
func (b byDecreasingLength) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
File diff suppressed because it is too large Load Diff
+199
View File
@@ -0,0 +1,199 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package traditionalchinese
import (
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// All is a list of all defined encodings in this package.
var All = []encoding.Encoding{Big5}
// Big5 is the Big5 encoding, also known as Code Page 950.
var Big5 encoding.Encoding = &big5
var big5 = internal.Encoding{
&internal.SimpleEncoding{big5Decoder{}, big5Encoder{}},
"Big5",
identifier.Big5,
}
type big5Decoder struct{ transform.NopResetter }
func (big5Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size, s := rune(0), 0, ""
loop:
for ; nSrc < len(src); nSrc += size {
switch c0 := src[nSrc]; {
case c0 < utf8.RuneSelf:
r, size = rune(c0), 1
case 0x81 <= c0 && c0 < 0xff:
if nSrc+1 >= len(src) {
if !atEOF {
err = transform.ErrShortSrc
break loop
}
r, size = utf8.RuneError, 1
goto write
}
c1 := src[nSrc+1]
switch {
case 0x40 <= c1 && c1 < 0x7f:
c1 -= 0x40
case 0xa1 <= c1 && c1 < 0xff:
c1 -= 0x62
case c1 < 0x40:
r, size = utf8.RuneError, 1
goto write
default:
r, size = utf8.RuneError, 2
goto write
}
r, size = '\ufffd', 2
if i := int(c0-0x81)*157 + int(c1); i < len(decode) {
if 1133 <= i && i < 1167 {
// The two-rune special cases for LATIN CAPITAL / SMALL E WITH CIRCUMFLEX
// AND MACRON / CARON are from http://encoding.spec.whatwg.org/#big5
switch i {
case 1133:
s = "\u00CA\u0304"
goto writeStr
case 1135:
s = "\u00CA\u030C"
goto writeStr
case 1164:
s = "\u00EA\u0304"
goto writeStr
case 1166:
s = "\u00EA\u030C"
goto writeStr
}
}
r = rune(decode[i])
if r == 0 {
r = '\ufffd'
}
}
default:
r, size = utf8.RuneError, 1
}
write:
if nDst+utf8.RuneLen(r) > len(dst) {
err = transform.ErrShortDst
break loop
}
nDst += utf8.EncodeRune(dst[nDst:], r)
continue loop
writeStr:
if nDst+len(s) > len(dst) {
err = transform.ErrShortDst
break loop
}
nDst += copy(dst[nDst:], s)
continue loop
}
return nDst, nSrc, err
}
type big5Encoder struct{ transform.NopResetter }
func (big5Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
r, size := rune(0), 0
for ; nSrc < len(src); nSrc += size {
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
size = 1
if nDst >= len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = uint8(r)
nDst++
continue
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
}
}
if r >= utf8.RuneSelf {
// func init checks that the switch covers all tables.
switch {
case encode0Low <= r && r < encode0High:
if r = rune(encode0[r-encode0Low]); r != 0 {
goto write2
}
case encode1Low <= r && r < encode1High:
if r = rune(encode1[r-encode1Low]); r != 0 {
goto write2
}
case encode2Low <= r && r < encode2High:
if r = rune(encode2[r-encode2Low]); r != 0 {
goto write2
}
case encode3Low <= r && r < encode3High:
if r = rune(encode3[r-encode3Low]); r != 0 {
goto write2
}
case encode4Low <= r && r < encode4High:
if r = rune(encode4[r-encode4Low]); r != 0 {
goto write2
}
case encode5Low <= r && r < encode5High:
if r = rune(encode5[r-encode5Low]); r != 0 {
goto write2
}
case encode6Low <= r && r < encode6High:
if r = rune(encode6[r-encode6Low]); r != 0 {
goto write2
}
case encode7Low <= r && r < encode7High:
if r = rune(encode7[r-encode7Low]); r != 0 {
goto write2
}
}
err = internal.ErrASCIIReplacement
break
}
write2:
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = uint8(r >> 8)
dst[nDst+1] = uint8(r)
nDst += 2
continue
}
return nDst, nSrc, err
}
func init() {
// Check that the hard-coded encode switch covers all tables.
if numEncodeTables != 8 {
panic("bad numEncodeTables")
}
}
+140
View File
@@ -0,0 +1,140 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
// This program generates tables.go:
// go run maketables.go | gofmt > tables.go
import (
"bufio"
"fmt"
"log"
"net/http"
"sort"
"strings"
)
func main() {
fmt.Printf("// generated by go run maketables.go; DO NOT EDIT\n\n")
fmt.Printf("// Package traditionalchinese provides Traditional Chinese encodings such as Big5.\n")
fmt.Printf(`package traditionalchinese // import "golang.org/x/text/encoding/traditionalchinese"` + "\n\n")
res, err := http.Get("http://encoding.spec.whatwg.org/index-big5.txt")
if err != nil {
log.Fatalf("Get: %v", err)
}
defer res.Body.Close()
mapping := [65536]uint32{}
reverse := [65536 * 4]uint16{}
scanner := bufio.NewScanner(res.Body)
for scanner.Scan() {
s := strings.TrimSpace(scanner.Text())
if s == "" || s[0] == '#' {
continue
}
x, y := uint16(0), uint32(0)
if _, err := fmt.Sscanf(s, "%d 0x%x", &x, &y); err != nil {
log.Fatalf("could not parse %q", s)
}
if x < 0 || 126*157 <= x {
log.Fatalf("Big5 code %d is out of range", x)
}
mapping[x] = y
// The WHATWG spec http://encoding.spec.whatwg.org/#indexes says that
// "The index pointer for code point in index is the first pointer
// corresponding to code point in index", which would normally mean
// that the code below should be guarded by "if reverse[y] == 0", but
// last instead of first seems to match the behavior of
// "iconv -f UTF-8 -t BIG5". For example, U+8005 者 occurs twice in
// http://encoding.spec.whatwg.org/index-big5.txt, as index 2148
// (encoded as "\x8e\xcd") and index 6543 (encoded as "\xaa\xcc")
// and "echo 者 | iconv -f UTF-8 -t BIG5 | xxd" gives "\xaa\xcc".
c0, c1 := x/157, x%157
if c1 < 0x3f {
c1 += 0x40
} else {
c1 += 0x62
}
reverse[y] = (0x81+c0)<<8 | c1
}
if err := scanner.Err(); err != nil {
log.Fatalf("scanner error: %v", err)
}
fmt.Printf("// decode is the decoding table from Big5 code to Unicode.\n")
fmt.Printf("// It is defined at http://encoding.spec.whatwg.org/index-big5.txt\n")
fmt.Printf("var decode = [...]uint32{\n")
for i, v := range mapping {
if v != 0 {
fmt.Printf("\t%d: 0x%08X,\n", i, v)
}
}
fmt.Printf("}\n\n")
// Any run of at least separation continuous zero entries in the reverse map will
// be a separate encode table.
const separation = 1024
intervals := []interval(nil)
low, high := -1, -1
for i, v := range reverse {
if v == 0 {
continue
}
if low < 0 {
low = i
} else if i-high >= separation {
if high >= 0 {
intervals = append(intervals, interval{low, high})
}
low = i
}
high = i + 1
}
if high >= 0 {
intervals = append(intervals, interval{low, high})
}
sort.Sort(byDecreasingLength(intervals))
fmt.Printf("const numEncodeTables = %d\n\n", len(intervals))
fmt.Printf("// encodeX are the encoding tables from Unicode to Big5 code,\n")
fmt.Printf("// sorted by decreasing length.\n")
for i, v := range intervals {
fmt.Printf("// encode%d: %5d entries for runes in [%6d, %6d).\n", i, v.len(), v.low, v.high)
}
fmt.Printf("\n")
for i, v := range intervals {
fmt.Printf("const encode%dLow, encode%dHigh = %d, %d\n\n", i, i, v.low, v.high)
fmt.Printf("var encode%d = [...]uint16{\n", i)
for j := v.low; j < v.high; j++ {
x := reverse[j]
if x == 0 {
continue
}
fmt.Printf("\t%d-%d: 0x%04X,\n", j, v.low, x)
}
fmt.Printf("}\n\n")
}
}
// interval is a half-open interval [low, high).
type interval struct {
low, high int
}
func (i interval) len() int { return i.high - i.low }
// byDecreasingLength sorts intervals by decreasing length.
type byDecreasingLength []interval
func (b byDecreasingLength) Len() int { return len(b) }
func (b byDecreasingLength) Less(i, j int) bool { return b[i].len() > b[j].len() }
func (b byDecreasingLength) Swap(i, j int) { b[i], b[j] = b[j], b[i] }
File diff suppressed because it is too large Load Diff
+82
View File
@@ -0,0 +1,82 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package unicode
import (
"golang.org/x/text/transform"
)
// BOMOverride returns a new decoder transformer that is identical to fallback,
// except that the presence of a Byte Order Mark at the start of the input
// causes it to switch to the corresponding Unicode decoding. It will only
// consider BOMs for UTF-8, UTF-16BE, and UTF-16LE.
//
// This differs from using ExpectBOM by allowing a BOM to switch to UTF-8, not
// just UTF-16 variants, and allowing falling back to any encoding scheme.
//
// This technique is recommended by the W3C for use in HTML 5: "For
// compatibility with deployed content, the byte order mark (also known as BOM)
// is considered more authoritative than anything else."
// http://www.w3.org/TR/encoding/#specification-hooks
//
// Using BOMOverride is mostly intended for use cases where the first characters
// of a fallback encoding are known to not be a BOM, for example, for valid HTML
// and most encodings.
func BOMOverride(fallback transform.Transformer) transform.Transformer {
// TODO: possibly allow a variadic argument of unicode encodings to allow
// specifying details of which fallbacks are supported as well as
// specifying the details of the implementations. This would also allow for
// support for UTF-32, which should not be supported by default.
return &bomOverride{fallback: fallback}
}
type bomOverride struct {
fallback transform.Transformer
current transform.Transformer
}
func (d *bomOverride) Reset() {
d.current = nil
d.fallback.Reset()
}
var (
// TODO: we could use decode functions here, instead of allocating a new
// decoder on every NewDecoder as IgnoreBOM decoders can be stateless.
utf16le = UTF16(LittleEndian, IgnoreBOM)
utf16be = UTF16(BigEndian, IgnoreBOM)
)
const utf8BOM = "\ufeff"
func (d *bomOverride) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
if d.current != nil {
return d.current.Transform(dst, src, atEOF)
}
if len(src) < 3 && !atEOF {
return 0, 0, transform.ErrShortSrc
}
d.current = d.fallback
bomSize := 0
if len(src) >= 2 {
if src[0] == 0xFF && src[1] == 0xFE {
d.current = utf16le.NewDecoder()
bomSize = 2
} else if src[0] == 0xFE && src[1] == 0xFF {
d.current = utf16be.NewDecoder()
bomSize = 2
} else if len(src) >= 3 &&
src[0] == utf8BOM[0] &&
src[1] == utf8BOM[1] &&
src[2] == utf8BOM[2] {
d.current = transform.Nop
bomSize = 3
}
}
if bomSize < len(src) {
nDst, nSrc, err = d.current.Transform(dst, src[bomSize:], atEOF)
}
return nDst, nSrc + bomSize, err
}
+434
View File
@@ -0,0 +1,434 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package unicode provides Unicode encodings such as UTF-16.
package unicode // import "golang.org/x/text/encoding/unicode"
import (
"errors"
"unicode/utf16"
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/internal/utf8internal"
"golang.org/x/text/runes"
"golang.org/x/text/transform"
)
// TODO: I think the Transformers really should return errors on unmatched
// surrogate pairs and odd numbers of bytes. This is not required by RFC 2781,
// which leaves it open, but is suggested by WhatWG. It will allow for all error
// modes as defined by WhatWG: fatal, HTML and Replacement. This would require
// the introduction of some kind of error type for conveying the erroneous code
// point.
// UTF8 is the UTF-8 encoding.
var UTF8 encoding.Encoding = utf8enc
var utf8enc = &internal.Encoding{
&internal.SimpleEncoding{utf8Decoder{}, runes.ReplaceIllFormed()},
"UTF-8",
identifier.UTF8,
}
type utf8Decoder struct{ transform.NopResetter }
func (utf8Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
var pSrc int // point from which to start copy in src
var accept utf8internal.AcceptRange
// The decoder can only make the input larger, not smaller.
n := len(src)
if len(dst) < n {
err = transform.ErrShortDst
n = len(dst)
atEOF = false
}
for nSrc < n {
c := src[nSrc]
if c < utf8.RuneSelf {
nSrc++
continue
}
first := utf8internal.First[c]
size := int(first & utf8internal.SizeMask)
if first == utf8internal.FirstInvalid {
goto handleInvalid // invalid starter byte
}
accept = utf8internal.AcceptRanges[first>>utf8internal.AcceptShift]
if nSrc+size > n {
if !atEOF {
// We may stop earlier than necessary here if the short sequence
// has invalid bytes. Not checking for this simplifies the code
// and may avoid duplicate computations in certain conditions.
if err == nil {
err = transform.ErrShortSrc
}
break
}
// Determine the maximal subpart of an ill-formed subsequence.
switch {
case nSrc+1 >= n || src[nSrc+1] < accept.Lo || accept.Hi < src[nSrc+1]:
size = 1
case nSrc+2 >= n || src[nSrc+2] < utf8internal.LoCB || utf8internal.HiCB < src[nSrc+2]:
size = 2
default:
size = 3 // As we are short, the maximum is 3.
}
goto handleInvalid
}
if c = src[nSrc+1]; c < accept.Lo || accept.Hi < c {
size = 1
goto handleInvalid // invalid continuation byte
} else if size == 2 {
} else if c = src[nSrc+2]; c < utf8internal.LoCB || utf8internal.HiCB < c {
size = 2
goto handleInvalid // invalid continuation byte
} else if size == 3 {
} else if c = src[nSrc+3]; c < utf8internal.LoCB || utf8internal.HiCB < c {
size = 3
goto handleInvalid // invalid continuation byte
}
nSrc += size
continue
handleInvalid:
// Copy the scanned input so far.
nDst += copy(dst[nDst:], src[pSrc:nSrc])
// Append RuneError to the destination.
const runeError = "\ufffd"
if nDst+len(runeError) > len(dst) {
return nDst, nSrc, transform.ErrShortDst
}
nDst += copy(dst[nDst:], runeError)
// Skip the maximal subpart of an ill-formed subsequence according to
// the W3C standard way instead of the Go way. This Transform is
// probably the only place in the text repo where it is warranted.
nSrc += size
pSrc = nSrc
// Recompute the maximum source length.
if sz := len(dst) - nDst; sz < len(src)-nSrc {
err = transform.ErrShortDst
n = nSrc + sz
atEOF = false
}
}
return nDst + copy(dst[nDst:], src[pSrc:nSrc]), nSrc, err
}
// UTF16 returns a UTF-16 Encoding for the given default endianness and byte
// order mark (BOM) policy.
//
// When decoding from UTF-16 to UTF-8, if the BOMPolicy is IgnoreBOM then
// neither BOMs U+FEFF nor noncharacters U+FFFE in the input stream will affect
// the endianness used for decoding, and will instead be output as their
// standard UTF-8 encodings: "\xef\xbb\xbf" and "\xef\xbf\xbe". If the BOMPolicy
// is UseBOM or ExpectBOM a staring BOM is not written to the UTF-8 output.
// Instead, it overrides the default endianness e for the remainder of the
// transformation. Any subsequent BOMs U+FEFF or noncharacters U+FFFE will not
// affect the endianness used, and will instead be output as their standard
// UTF-8 encodings. For UseBOM, if there is no starting BOM, it will proceed
// with the default Endianness. For ExpectBOM, in that case, the transformation
// will return early with an ErrMissingBOM error.
//
// When encoding from UTF-8 to UTF-16, a BOM will be inserted at the start of
// the output if the BOMPolicy is UseBOM or ExpectBOM. Otherwise, a BOM will not
// be inserted. The UTF-8 input does not need to contain a BOM.
//
// There is no concept of a 'native' endianness. If the UTF-16 data is produced
// and consumed in a greater context that implies a certain endianness, use
// IgnoreBOM. Otherwise, use ExpectBOM and always produce and consume a BOM.
//
// In the language of http://www.unicode.org/faq/utf_bom.html#bom10, IgnoreBOM
// corresponds to "Where the precise type of the data stream is known... the
// BOM should not be used" and ExpectBOM corresponds to "A particular
// protocol... may require use of the BOM".
func UTF16(e Endianness, b BOMPolicy) encoding.Encoding {
return utf16Encoding{config{e, b}, mibValue[e][b&bomMask]}
}
// mibValue maps Endianness and BOMPolicy settings to MIB constants. Note that
// some configurations map to the same MIB identifier. RFC 2781 has requirements
// and recommendations. Some of the "configurations" are merely recommendations,
// so multiple configurations could match.
var mibValue = map[Endianness][numBOMValues]identifier.MIB{
BigEndian: [numBOMValues]identifier.MIB{
IgnoreBOM: identifier.UTF16BE,
UseBOM: identifier.UTF16, // BigEnding default is preferred by RFC 2781.
// TODO: acceptBOM | strictBOM would map to UTF16BE as well.
},
LittleEndian: [numBOMValues]identifier.MIB{
IgnoreBOM: identifier.UTF16LE,
UseBOM: identifier.UTF16, // LittleEndian default is allowed and preferred on Windows.
// TODO: acceptBOM | strictBOM would map to UTF16LE as well.
},
// ExpectBOM is not widely used and has no valid MIB identifier.
}
// All lists a configuration for each IANA-defined UTF-16 variant.
var All = []encoding.Encoding{
UTF8,
UTF16(BigEndian, UseBOM),
UTF16(BigEndian, IgnoreBOM),
UTF16(LittleEndian, IgnoreBOM),
}
// BOMPolicy is a UTF-16 encoding's byte order mark policy.
type BOMPolicy uint8
const (
writeBOM BOMPolicy = 0x01
acceptBOM BOMPolicy = 0x02
requireBOM BOMPolicy = 0x04
bomMask BOMPolicy = 0x07
// HACK: numBOMValues == 8 triggers a bug in the 1.4 compiler (cannot have a
// map of an array of length 8 of a type that is also used as a key or value
// in another map). See golang.org/issue/11354.
// TODO: consider changing this value back to 8 if the use of 1.4.* has
// been minimized.
numBOMValues = 8 + 1
// IgnoreBOM means to ignore any byte order marks.
IgnoreBOM BOMPolicy = 0
// Common and RFC 2781-compliant interpretation for UTF-16BE/LE.
// UseBOM means that the UTF-16 form may start with a byte order mark, which
// will be used to override the default encoding.
UseBOM BOMPolicy = writeBOM | acceptBOM
// Common and RFC 2781-compliant interpretation for UTF-16.
// ExpectBOM means that the UTF-16 form must start with a byte order mark,
// which will be used to override the default encoding.
ExpectBOM BOMPolicy = writeBOM | acceptBOM | requireBOM
// Used in Java as Unicode (not to be confused with Java's UTF-16) and
// ICU's UTF-16,version=1. Not compliant with RFC 2781.
// TODO (maybe): strictBOM: BOM must match Endianness. This would allow:
// - UTF-16(B|L)E,version=1: writeBOM | acceptBOM | requireBOM | strictBOM
// (UnicodeBig and UnicodeLittle in Java)
// - RFC 2781-compliant, but less common interpretation for UTF-16(B|L)E:
// acceptBOM | strictBOM (e.g. assigned to CheckBOM).
// This addition would be consistent with supporting ExpectBOM.
)
// Endianness is a UTF-16 encoding's default endianness.
type Endianness bool
const (
// BigEndian is UTF-16BE.
BigEndian Endianness = false
// LittleEndian is UTF-16LE.
LittleEndian Endianness = true
)
// ErrMissingBOM means that decoding UTF-16 input with ExpectBOM did not find a
// starting byte order mark.
var ErrMissingBOM = errors.New("encoding: missing byte order mark")
type utf16Encoding struct {
config
mib identifier.MIB
}
type config struct {
endianness Endianness
bomPolicy BOMPolicy
}
func (u utf16Encoding) NewDecoder() *encoding.Decoder {
return &encoding.Decoder{Transformer: &utf16Decoder{
initial: u.config,
current: u.config,
}}
}
func (u utf16Encoding) NewEncoder() *encoding.Encoder {
return &encoding.Encoder{Transformer: &utf16Encoder{
endianness: u.endianness,
initialBOMPolicy: u.bomPolicy,
currentBOMPolicy: u.bomPolicy,
}}
}
func (u utf16Encoding) ID() (mib identifier.MIB, other string) {
return u.mib, ""
}
func (u utf16Encoding) String() string {
e, b := "B", ""
if u.endianness == LittleEndian {
e = "L"
}
switch u.bomPolicy {
case ExpectBOM:
b = "Expect"
case UseBOM:
b = "Use"
case IgnoreBOM:
b = "Ignore"
}
return "UTF-16" + e + "E (" + b + " BOM)"
}
type utf16Decoder struct {
initial config
current config
}
func (u *utf16Decoder) Reset() {
u.current = u.initial
}
func (u *utf16Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
if len(src) == 0 {
if atEOF && u.current.bomPolicy&requireBOM != 0 {
return 0, 0, ErrMissingBOM
}
return 0, 0, nil
}
if u.current.bomPolicy&acceptBOM != 0 {
if len(src) < 2 {
return 0, 0, transform.ErrShortSrc
}
switch {
case src[0] == 0xfe && src[1] == 0xff:
u.current.endianness = BigEndian
nSrc = 2
case src[0] == 0xff && src[1] == 0xfe:
u.current.endianness = LittleEndian
nSrc = 2
default:
if u.current.bomPolicy&requireBOM != 0 {
return 0, 0, ErrMissingBOM
}
}
u.current.bomPolicy = IgnoreBOM
}
var r rune
var dSize, sSize int
for nSrc < len(src) {
if nSrc+1 < len(src) {
x := uint16(src[nSrc+0])<<8 | uint16(src[nSrc+1])
if u.current.endianness == LittleEndian {
x = x>>8 | x<<8
}
r, sSize = rune(x), 2
if utf16.IsSurrogate(r) {
if nSrc+3 < len(src) {
x = uint16(src[nSrc+2])<<8 | uint16(src[nSrc+3])
if u.current.endianness == LittleEndian {
x = x>>8 | x<<8
}
// Save for next iteration if it is not a high surrogate.
if isHighSurrogate(rune(x)) {
r, sSize = utf16.DecodeRune(r, rune(x)), 4
}
} else if !atEOF {
err = transform.ErrShortSrc
break
}
}
if dSize = utf8.RuneLen(r); dSize < 0 {
r, dSize = utf8.RuneError, 3
}
} else if atEOF {
// Single trailing byte.
r, dSize, sSize = utf8.RuneError, 3, 1
} else {
err = transform.ErrShortSrc
break
}
if nDst+dSize > len(dst) {
err = transform.ErrShortDst
break
}
nDst += utf8.EncodeRune(dst[nDst:], r)
nSrc += sSize
}
return nDst, nSrc, err
}
func isHighSurrogate(r rune) bool {
return 0xDC00 <= r && r <= 0xDFFF
}
type utf16Encoder struct {
endianness Endianness
initialBOMPolicy BOMPolicy
currentBOMPolicy BOMPolicy
}
func (u *utf16Encoder) Reset() {
u.currentBOMPolicy = u.initialBOMPolicy
}
func (u *utf16Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
if u.currentBOMPolicy&writeBOM != 0 {
if len(dst) < 2 {
return 0, 0, transform.ErrShortDst
}
dst[0], dst[1] = 0xfe, 0xff
u.currentBOMPolicy = IgnoreBOM
nDst = 2
}
r, size := rune(0), 0
for nSrc < len(src) {
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
size = 1
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
}
}
if r <= 0xffff {
if nDst+2 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = uint8(r >> 8)
dst[nDst+1] = uint8(r)
nDst += 2
} else {
if nDst+4 > len(dst) {
err = transform.ErrShortDst
break
}
r1, r2 := utf16.EncodeRune(r)
dst[nDst+0] = uint8(r1 >> 8)
dst[nDst+1] = uint8(r1)
dst[nDst+2] = uint8(r2 >> 8)
dst[nDst+3] = uint8(r2)
nDst += 4
}
nSrc += size
}
if u.endianness == LittleEndian {
for i := 0; i < nDst; i += 2 {
dst[i], dst[i+1] = dst[i+1], dst[i]
}
}
return nDst, nSrc, err
}
+296
View File
@@ -0,0 +1,296 @@
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package utf32 provides the UTF-32 Unicode encoding.
//
// Please note that support for UTF-32 is discouraged as it is a rare and
// inefficient encoding, unfit for use as an interchange format. For use
// on the web, the W3C strongly discourages its use
// (https://www.w3.org/TR/html5/document-metadata.html#charset)
// while WHATWG directly prohibits supporting it
// (https://html.spec.whatwg.org/multipage/syntax.html#character-encodings).
package utf32 // import "golang.org/x/text/encoding/unicode/utf32"
import (
"errors"
"unicode/utf8"
"golang.org/x/text/encoding"
"golang.org/x/text/encoding/internal/identifier"
"golang.org/x/text/transform"
)
// All lists a configuration for each IANA-defined UTF-32 variant.
var All = []encoding.Encoding{
UTF32(BigEndian, UseBOM),
UTF32(BigEndian, IgnoreBOM),
UTF32(LittleEndian, IgnoreBOM),
}
// ErrMissingBOM means that decoding UTF-32 input with ExpectBOM did not
// find a starting byte order mark.
var ErrMissingBOM = errors.New("encoding: missing byte order mark")
// UTF32 returns a UTF-32 Encoding for the given default endianness and
// byte order mark (BOM) policy.
//
// When decoding from UTF-32 to UTF-8, if the BOMPolicy is IgnoreBOM then
// neither BOMs U+FEFF nor ill-formed code units 0xFFFE0000 in the input
// stream will affect the endianness used for decoding. Instead BOMs will
// be output as their standard UTF-8 encoding "\xef\xbb\xbf" while
// 0xFFFE0000 code units will be output as "\xef\xbf\xbd", the standard
// UTF-8 encoding for the Unicode replacement character. If the BOMPolicy
// is UseBOM or ExpectBOM a starting BOM is not written to the UTF-8
// output. Instead, it overrides the default endianness e for the remainder
// of the transformation. Any subsequent BOMs U+FEFF or ill-formed code
// units 0xFFFE0000 will not affect the endianness used, and will instead
// be output as their standard UTF-8 (replacement) encodings. For UseBOM,
// if there is no starting BOM, it will proceed with the default
// Endianness. For ExpectBOM, in that case, the transformation will return
// early with an ErrMissingBOM error.
//
// When encoding from UTF-8 to UTF-32, a BOM will be inserted at the start
// of the output if the BOMPolicy is UseBOM or ExpectBOM. Otherwise, a BOM
// will not be inserted. The UTF-8 input does not need to contain a BOM.
//
// There is no concept of a 'native' endianness. If the UTF-32 data is
// produced and consumed in a greater context that implies a certain
// endianness, use IgnoreBOM. Otherwise, use ExpectBOM and always produce
// and consume a BOM.
//
// In the language of http://www.unicode.org/faq/utf_bom.html#bom10,
// IgnoreBOM corresponds to "Where the precise type of the data stream is
// known... the BOM should not be used" and ExpectBOM corresponds to "A
// particular protocol... may require use of the BOM".
func UTF32(e Endianness, b BOMPolicy) encoding.Encoding {
return utf32Encoding{config{e, b}, mibValue[e][b&bomMask]}
}
// mibValue maps Endianness and BOMPolicy settings to MIB constants for UTF-32.
// Note that some configurations map to the same MIB identifier.
var mibValue = map[Endianness][numBOMValues]identifier.MIB{
BigEndian: [numBOMValues]identifier.MIB{
IgnoreBOM: identifier.UTF32BE,
UseBOM: identifier.UTF32,
},
LittleEndian: [numBOMValues]identifier.MIB{
IgnoreBOM: identifier.UTF32LE,
UseBOM: identifier.UTF32,
},
// ExpectBOM is not widely used and has no valid MIB identifier.
}
// BOMPolicy is a UTF-32 encodings's byte order mark policy.
type BOMPolicy uint8
const (
writeBOM BOMPolicy = 0x01
acceptBOM BOMPolicy = 0x02
requireBOM BOMPolicy = 0x04
bomMask BOMPolicy = 0x07
// HACK: numBOMValues == 8 triggers a bug in the 1.4 compiler (cannot have a
// map of an array of length 8 of a type that is also used as a key or value
// in another map). See golang.org/issue/11354.
// TODO: consider changing this value back to 8 if the use of 1.4.* has
// been minimized.
numBOMValues = 8 + 1
// IgnoreBOM means to ignore any byte order marks.
IgnoreBOM BOMPolicy = 0
// Unicode-compliant interpretation for UTF-32BE/LE.
// UseBOM means that the UTF-32 form may start with a byte order mark,
// which will be used to override the default encoding.
UseBOM BOMPolicy = writeBOM | acceptBOM
// Unicode-compliant interpretation for UTF-32.
// ExpectBOM means that the UTF-32 form must start with a byte order mark,
// which will be used to override the default encoding.
ExpectBOM BOMPolicy = writeBOM | acceptBOM | requireBOM
// Consistent with BOMPolicy definition in golang.org/x/text/encoding/unicode
)
// Endianness is a UTF-32 encoding's default endianness.
type Endianness bool
const (
// BigEndian is UTF-32BE.
BigEndian Endianness = false
// LittleEndian is UTF-32LE.
LittleEndian Endianness = true
)
type config struct {
endianness Endianness
bomPolicy BOMPolicy
}
type utf32Encoding struct {
config
mib identifier.MIB
}
func (u utf32Encoding) NewDecoder() *encoding.Decoder {
return &encoding.Decoder{Transformer: &utf32Decoder{
initial: u.config,
current: u.config,
}}
}
func (u utf32Encoding) NewEncoder() *encoding.Encoder {
return &encoding.Encoder{Transformer: &utf32Encoder{
endianness: u.endianness,
initialBOMPolicy: u.bomPolicy,
currentBOMPolicy: u.bomPolicy,
}}
}
func (u utf32Encoding) ID() (mib identifier.MIB, other string) {
return u.mib, ""
}
func (u utf32Encoding) String() string {
e, b := "B", ""
if u.endianness == LittleEndian {
e = "L"
}
switch u.bomPolicy {
case ExpectBOM:
b = "Expect"
case UseBOM:
b = "Use"
case IgnoreBOM:
b = "Ignore"
}
return "UTF-32" + e + "E (" + b + " BOM)"
}
type utf32Decoder struct {
initial config
current config
}
func (u *utf32Decoder) Reset() {
u.current = u.initial
}
func (u *utf32Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
if len(src) == 0 {
if atEOF && u.current.bomPolicy&requireBOM != 0 {
return 0, 0, ErrMissingBOM
}
return 0, 0, nil
}
if u.current.bomPolicy&acceptBOM != 0 {
if len(src) < 4 {
return 0, 0, transform.ErrShortSrc
}
switch {
case src[0] == 0x00 && src[1] == 0x00 && src[2] == 0xfe && src[3] == 0xff:
u.current.endianness = BigEndian
nSrc = 4
case src[0] == 0xff && src[1] == 0xfe && src[2] == 0x00 && src[3] == 0x00:
u.current.endianness = LittleEndian
nSrc = 4
default:
if u.current.bomPolicy&requireBOM != 0 {
return 0, 0, ErrMissingBOM
}
}
u.current.bomPolicy = IgnoreBOM
}
var r rune
var dSize, sSize int
for nSrc < len(src) {
if nSrc+3 < len(src) {
x := uint32(src[nSrc+0])<<24 | uint32(src[nSrc+1])<<16 |
uint32(src[nSrc+2])<<8 | uint32(src[nSrc+3])
if u.current.endianness == LittleEndian {
x = x>>24 | (x >> 8 & 0x0000FF00) | (x << 8 & 0x00FF0000) | x<<24
}
r, sSize = rune(x), 4
if dSize = utf8.RuneLen(r); dSize < 0 {
r, dSize = utf8.RuneError, 3
}
} else if atEOF {
// 1..3 trailing bytes.
r, dSize, sSize = utf8.RuneError, 3, len(src)-nSrc
} else {
err = transform.ErrShortSrc
break
}
if nDst+dSize > len(dst) {
err = transform.ErrShortDst
break
}
nDst += utf8.EncodeRune(dst[nDst:], r)
nSrc += sSize
}
return nDst, nSrc, err
}
type utf32Encoder struct {
endianness Endianness
initialBOMPolicy BOMPolicy
currentBOMPolicy BOMPolicy
}
func (u *utf32Encoder) Reset() {
u.currentBOMPolicy = u.initialBOMPolicy
}
func (u *utf32Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
if u.currentBOMPolicy&writeBOM != 0 {
if len(dst) < 4 {
return 0, 0, transform.ErrShortDst
}
dst[0], dst[1], dst[2], dst[3] = 0x00, 0x00, 0xfe, 0xff
u.currentBOMPolicy = IgnoreBOM
nDst = 4
}
r, size := rune(0), 0
for nSrc < len(src) {
r = rune(src[nSrc])
// Decode a 1-byte rune.
if r < utf8.RuneSelf {
size = 1
} else {
// Decode a multi-byte rune.
r, size = utf8.DecodeRune(src[nSrc:])
if size == 1 {
// All valid runes of size 1 (those below utf8.RuneSelf) were
// handled above. We have invalid UTF-8 or we haven't seen the
// full character yet.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
}
}
if nDst+4 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = uint8(r >> 24)
dst[nDst+1] = uint8(r >> 16)
dst[nDst+2] = uint8(r >> 8)
dst[nDst+3] = uint8(r)
nDst += 4
nSrc += size
}
if u.endianness == LittleEndian {
for i := 0; i < nDst; i += 4 {
dst[i], dst[i+1], dst[i+2], dst[i+3] = dst[i+3], dst[i+2], dst[i+1], dst[i]
}
}
return nDst, nSrc, err
}
+27
View File
@@ -0,0 +1,27 @@
Copyright (c) 2009 The Go Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+41
View File
@@ -0,0 +1,41 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package format contains types for defining language-specific formatting of
// values.
//
// This package is internal now, but will eventually be exposed after the API
// settles.
package format // import "golang.org/x/text/internal/format"
import (
"fmt"
"golang.org/x/text/language"
)
// State represents the printer state passed to custom formatters. It provides
// access to the fmt.State interface and the sentence and language-related
// context.
type State interface {
fmt.State
// Language reports the requested language in which to render a message.
Language() language.Tag
// TODO: consider this and removing rune from the Format method in the
// Formatter interface.
//
// Verb returns the format variant to render, analogous to the types used
// in fmt. Use 'v' for the default or only variant.
// Verb() rune
// TODO: more info:
// - sentence context such as linguistic features passed by the translator.
}
// Formatter is analogous to fmt.Formatter.
type Formatter interface {
Format(state State, verb rune)
}
+357
View File
@@ -0,0 +1,357 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package format
import (
"reflect"
"unicode/utf8"
)
// A Parser parses a format string. The result from the parse are set in the
// struct fields.
type Parser struct {
Verb rune
WidthPresent bool
PrecPresent bool
Minus bool
Plus bool
Sharp bool
Space bool
Zero bool
// For the formats %+v %#v, we set the plusV/sharpV flags
// and clear the plus/sharp flags since %+v and %#v are in effect
// different, flagless formats set at the top level.
PlusV bool
SharpV bool
HasIndex bool
Width int
Prec int // precision
// retain arguments across calls.
Args []interface{}
// retain current argument number across calls
ArgNum int
// reordered records whether the format string used argument reordering.
Reordered bool
// goodArgNum records whether the most recent reordering directive was valid.
goodArgNum bool
// position info
format string
startPos int
endPos int
Status Status
}
// Reset initializes a parser to scan format strings for the given args.
func (p *Parser) Reset(args []interface{}) {
p.Args = args
p.ArgNum = 0
p.startPos = 0
p.Reordered = false
}
// Text returns the part of the format string that was parsed by the last call
// to Scan. It returns the original substitution clause if the current scan
// parsed a substitution.
func (p *Parser) Text() string { return p.format[p.startPos:p.endPos] }
// SetFormat sets a new format string to parse. It does not reset the argument
// count.
func (p *Parser) SetFormat(format string) {
p.format = format
p.startPos = 0
p.endPos = 0
}
// Status indicates the result type of a call to Scan.
type Status int
const (
StatusText Status = iota
StatusSubstitution
StatusBadWidthSubstitution
StatusBadPrecSubstitution
StatusNoVerb
StatusBadArgNum
StatusMissingArg
)
// ClearFlags reset the parser to default behavior.
func (p *Parser) ClearFlags() {
p.WidthPresent = false
p.PrecPresent = false
p.Minus = false
p.Plus = false
p.Sharp = false
p.Space = false
p.Zero = false
p.PlusV = false
p.SharpV = false
p.HasIndex = false
}
// Scan scans the next part of the format string and sets the status to
// indicate whether it scanned a string literal, substitution or error.
func (p *Parser) Scan() bool {
p.Status = StatusText
format := p.format
end := len(format)
if p.endPos >= end {
return false
}
afterIndex := false // previous item in format was an index like [3].
p.startPos = p.endPos
p.goodArgNum = true
i := p.startPos
for i < end && format[i] != '%' {
i++
}
if i > p.startPos {
p.endPos = i
return true
}
// Process one verb
i++
p.Status = StatusSubstitution
// Do we have flags?
p.ClearFlags()
simpleFormat:
for ; i < end; i++ {
c := p.format[i]
switch c {
case '#':
p.Sharp = true
case '0':
p.Zero = !p.Minus // Only allow zero padding to the left.
case '+':
p.Plus = true
case '-':
p.Minus = true
p.Zero = false // Do not pad with zeros to the right.
case ' ':
p.Space = true
default:
// Fast path for common case of ascii lower case simple verbs
// without precision or width or argument indices.
if 'a' <= c && c <= 'z' && p.ArgNum < len(p.Args) {
if c == 'v' {
// Go syntax
p.SharpV = p.Sharp
p.Sharp = false
// Struct-field syntax
p.PlusV = p.Plus
p.Plus = false
}
p.Verb = rune(c)
p.ArgNum++
p.endPos = i + 1
return true
}
// Format is more complex than simple flags and a verb or is malformed.
break simpleFormat
}
}
// Do we have an explicit argument index?
i, afterIndex = p.updateArgNumber(format, i)
// Do we have width?
if i < end && format[i] == '*' {
i++
p.Width, p.WidthPresent = p.intFromArg()
if !p.WidthPresent {
p.Status = StatusBadWidthSubstitution
}
// We have a negative width, so take its value and ensure
// that the minus flag is set
if p.Width < 0 {
p.Width = -p.Width
p.Minus = true
p.Zero = false // Do not pad with zeros to the right.
}
afterIndex = false
} else {
p.Width, p.WidthPresent, i = parsenum(format, i, end)
if afterIndex && p.WidthPresent { // "%[3]2d"
p.goodArgNum = false
}
}
// Do we have precision?
if i+1 < end && format[i] == '.' {
i++
if afterIndex { // "%[3].2d"
p.goodArgNum = false
}
i, afterIndex = p.updateArgNumber(format, i)
if i < end && format[i] == '*' {
i++
p.Prec, p.PrecPresent = p.intFromArg()
// Negative precision arguments don't make sense
if p.Prec < 0 {
p.Prec = 0
p.PrecPresent = false
}
if !p.PrecPresent {
p.Status = StatusBadPrecSubstitution
}
afterIndex = false
} else {
p.Prec, p.PrecPresent, i = parsenum(format, i, end)
if !p.PrecPresent {
p.Prec = 0
p.PrecPresent = true
}
}
}
if !afterIndex {
i, afterIndex = p.updateArgNumber(format, i)
}
p.HasIndex = afterIndex
if i >= end {
p.endPos = i
p.Status = StatusNoVerb
return true
}
verb, w := utf8.DecodeRuneInString(format[i:])
p.endPos = i + w
p.Verb = verb
switch {
case verb == '%': // Percent does not absorb operands and ignores f.wid and f.prec.
p.startPos = p.endPos - 1
p.Status = StatusText
case !p.goodArgNum:
p.Status = StatusBadArgNum
case p.ArgNum >= len(p.Args): // No argument left over to print for the current verb.
p.Status = StatusMissingArg
case verb == 'v':
// Go syntax
p.SharpV = p.Sharp
p.Sharp = false
// Struct-field syntax
p.PlusV = p.Plus
p.Plus = false
fallthrough
default:
p.ArgNum++
}
return true
}
// intFromArg gets the ArgNumth element of Args. On return, isInt reports
// whether the argument has integer type.
func (p *Parser) intFromArg() (num int, isInt bool) {
if p.ArgNum < len(p.Args) {
arg := p.Args[p.ArgNum]
num, isInt = arg.(int) // Almost always OK.
if !isInt {
// Work harder.
switch v := reflect.ValueOf(arg); v.Kind() {
case reflect.Int, reflect.Int8, reflect.Int16, reflect.Int32, reflect.Int64:
n := v.Int()
if int64(int(n)) == n {
num = int(n)
isInt = true
}
case reflect.Uint, reflect.Uint8, reflect.Uint16, reflect.Uint32, reflect.Uint64, reflect.Uintptr:
n := v.Uint()
if int64(n) >= 0 && uint64(int(n)) == n {
num = int(n)
isInt = true
}
default:
// Already 0, false.
}
}
p.ArgNum++
if tooLarge(num) {
num = 0
isInt = false
}
}
return
}
// parseArgNumber returns the value of the bracketed number, minus 1
// (explicit argument numbers are one-indexed but we want zero-indexed).
// The opening bracket is known to be present at format[0].
// The returned values are the index, the number of bytes to consume
// up to the closing paren, if present, and whether the number parsed
// ok. The bytes to consume will be 1 if no closing paren is present.
func parseArgNumber(format string) (index int, wid int, ok bool) {
// There must be at least 3 bytes: [n].
if len(format) < 3 {
return 0, 1, false
}
// Find closing bracket.
for i := 1; i < len(format); i++ {
if format[i] == ']' {
width, ok, newi := parsenum(format, 1, i)
if !ok || newi != i {
return 0, i + 1, false
}
return width - 1, i + 1, true // arg numbers are one-indexed and skip paren.
}
}
return 0, 1, false
}
// updateArgNumber returns the next argument to evaluate, which is either the value of the passed-in
// argNum or the value of the bracketed integer that begins format[i:]. It also returns
// the new value of i, that is, the index of the next byte of the format to process.
func (p *Parser) updateArgNumber(format string, i int) (newi int, found bool) {
if len(format) <= i || format[i] != '[' {
return i, false
}
p.Reordered = true
index, wid, ok := parseArgNumber(format[i:])
if ok && 0 <= index && index < len(p.Args) {
p.ArgNum = index
return i + wid, true
}
p.goodArgNum = false
return i + wid, ok
}
// tooLarge reports whether the magnitude of the integer is
// too large to be used as a formatting width or precision.
func tooLarge(x int) bool {
const max int = 1e6
return x > max || x < -max
}
// parsenum converts ASCII to integer. num is 0 (and isnum is false) if no number present.
func parsenum(s string, start, end int) (num int, isnum bool, newi int) {
if start >= end {
return 0, false, end
}
for newi = start; newi < end && '0' <= s[newi] && s[newi] <= '9'; newi++ {
if tooLarge(num) {
return 0, false, end // Overflow; crazy long number most likely.
}
num = num*10 + int(s[newi]-'0')
isnum = true
}
return
}
+21 -3
View File
@@ -55,18 +55,36 @@ func (w *CodeWriter) WriteGoFile(filename, pkg string) {
log.Fatalf("Could not create file %s: %v", filename, err)
}
defer f.Close()
if _, err = w.WriteGo(f, pkg); err != nil {
if _, err = w.WriteGo(f, pkg, ""); err != nil {
log.Fatalf("Error writing file %s: %v", filename, err)
}
}
// WriteVersionedGoFile appends the buffer with the total size of all created
// structures and writes it as a Go file to the the given file with the given
// package name and build tags for the current Unicode version,
func (w *CodeWriter) WriteVersionedGoFile(filename, pkg string) {
tags := buildTags()
if tags != "" {
filename = insertVersion(filename, UnicodeVersion())
}
f, err := os.Create(filename)
if err != nil {
log.Fatalf("Could not create file %s: %v", filename, err)
}
defer f.Close()
if _, err = w.WriteGo(f, pkg, tags); err != nil {
log.Fatalf("Error writing file %s: %v", filename, err)
}
}
// WriteGo appends the buffer with the total size of all created structures and
// writes it as a Go file to the the given writer with the given package name.
func (w *CodeWriter) WriteGo(out io.Writer, pkg string) (n int, err error) {
func (w *CodeWriter) WriteGo(out io.Writer, pkg, tags string) (n int, err error) {
sz := w.Size
w.WriteComment("Total table size %d bytes (%dKiB); checksum: %X\n", sz, sz/1024, w.Hash.Sum32())
defer w.buf.Reset()
return WriteGo(out, pkg, w.buf.Bytes())
return WriteGo(out, pkg, tags, w.buf.Bytes())
}
func (w *CodeWriter) printf(f string, x ...interface{}) {
+58 -6
View File
@@ -31,6 +31,7 @@ import (
"os"
"path"
"path/filepath"
"strings"
"sync"
"unicode"
@@ -69,8 +70,6 @@ func Init() {
const header = `// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
package %s
`
// UnicodeVersion reports the requested Unicode version.
@@ -78,11 +77,33 @@ func UnicodeVersion() string {
return *unicodeVersion
}
// UnicodeVersion reports the requested CLDR version.
// CLDRVersion reports the requested CLDR version.
func CLDRVersion() string {
return *cldrVersion
}
var tags = []struct{ version, buildTags string }{
{"10.0.0", "go1.10"},
{"", "!go1.10"},
}
// buildTags reports the build tags used for the current Unicode version.
func buildTags() string {
v := UnicodeVersion()
for _, x := range tags {
// We should do a numeric comparison, but including the collate package
// would create an import cycle. We approximate it by assuming that
// longer version strings are later.
if len(x.version) <= len(v) {
return x.buildTags
}
if len(x.version) == len(v) && x.version <= v {
return x.buildTags
}
}
return tags[0].buildTags
}
// IsLocal reports whether data files are available locally.
func IsLocal() bool {
dir, err := localReadmeFile()
@@ -243,15 +264,46 @@ func WriteGoFile(filename, pkg string, b []byte) {
log.Fatalf("Could not create file %s: %v", filename, err)
}
defer w.Close()
if _, err = WriteGo(w, pkg, b); err != nil {
if _, err = WriteGo(w, pkg, "", b); err != nil {
log.Fatalf("Error writing file %s: %v", filename, err)
}
}
func insertVersion(filename, version string) string {
suffix := ".go"
if strings.HasSuffix(filename, "_test.go") {
suffix = "_test.go"
}
return fmt.Sprint(filename[:len(filename)-len(suffix)], version, suffix)
}
// WriteVersionedGoFile prepends a standard file comment, adds build tags to
// version the file for the current Unicode version, and package statement to
// the given bytes, applies gofmt, and writes them to a file with the given
// name. It will call log.Fatal if there are any errors.
func WriteVersionedGoFile(filename, pkg string, b []byte) {
tags := buildTags()
if tags != "" {
filename = insertVersion(filename, UnicodeVersion())
}
w, err := os.Create(filename)
if err != nil {
log.Fatalf("Could not create file %s: %v", filename, err)
}
defer w.Close()
if _, err = WriteGo(w, pkg, tags, b); err != nil {
log.Fatalf("Error writing file %s: %v", filename, err)
}
}
// WriteGo prepends a standard file comment and package statement to the given
// bytes, applies gofmt, and writes them to w.
func WriteGo(w io.Writer, pkg string, b []byte) (n int, err error) {
src := []byte(fmt.Sprintf(header, pkg))
func WriteGo(w io.Writer, pkg, tags string, b []byte) (n int, err error) {
src := []byte(header)
if tags != "" {
src = append(src, fmt.Sprintf("// +build %s\n\n", tags)...)
}
src = append(src, fmt.Sprintf("package %s\n\n", pkg)...)
src = append(src, b...)
formatted, err := format.Source(src)
if err != nil {
+27
View File
@@ -0,0 +1,27 @@
Copyright (c) 2009 The Go Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+100
View File
@@ -0,0 +1,100 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package tag contains functionality handling tags and related data.
package tag // import "golang.org/x/text/internal/tag"
import "sort"
// An Index converts tags to a compact numeric value.
//
// All elements are of size 4. Tags may be up to 4 bytes long. Excess bytes can
// be used to store additional information about the tag.
type Index string
// Elem returns the element data at the given index.
func (s Index) Elem(x int) string {
return string(s[x*4 : x*4+4])
}
// Index reports the index of the given key or -1 if it could not be found.
// Only the first len(key) bytes from the start of the 4-byte entries will be
// considered for the search and the first match in Index will be returned.
func (s Index) Index(key []byte) int {
n := len(key)
// search the index of the first entry with an equal or higher value than
// key in s.
index := sort.Search(len(s)/4, func(i int) bool {
return cmp(s[i*4:i*4+n], key) != -1
})
i := index * 4
if cmp(s[i:i+len(key)], key) != 0 {
return -1
}
return index
}
// Next finds the next occurrence of key after index x, which must have been
// obtained from a call to Index using the same key. It returns x+1 or -1.
func (s Index) Next(key []byte, x int) int {
if x++; x*4 < len(s) && cmp(s[x*4:x*4+len(key)], key) == 0 {
return x
}
return -1
}
// cmp returns an integer comparing a and b lexicographically.
func cmp(a Index, b []byte) int {
n := len(a)
if len(b) < n {
n = len(b)
}
for i, c := range b[:n] {
switch {
case a[i] > c:
return 1
case a[i] < c:
return -1
}
}
switch {
case len(a) < len(b):
return -1
case len(a) > len(b):
return 1
}
return 0
}
// Compare returns an integer comparing a and b lexicographically.
func Compare(a string, b []byte) int {
return cmp(Index(a), b)
}
// FixCase reformats b to the same pattern of cases as form.
// If returns false if string b is malformed.
func FixCase(form string, b []byte) bool {
if len(form) != len(b) {
return false
}
for i, c := range b {
if form[i] <= 'Z' {
if c >= 'a' {
c -= 'z' - 'Z'
}
if c < 'A' || 'Z' < c {
return false
}
} else {
if c <= 'Z' {
c += 'z' - 'Z'
}
if c < 'a' || 'z' < c {
return false
}
}
b[i] = c
}
return true
}
+55 -60
View File
@@ -11,8 +11,8 @@ package ucd // import "golang.org/x/text/internal/ucd"
import (
"bufio"
"bytes"
"errors"
"fmt"
"io"
"log"
"regexp"
@@ -92,10 +92,11 @@ type Parser struct {
keepRanges bool // Don't expand rune ranges in field 0.
err error
comment []byte
field [][]byte
comment string
field []string
// parsedRange is needed in case Range(0) is called more than once for one
// field. In some cases this requires scanning ahead.
line int
parsedRange bool
rangeStart, rangeEnd rune
@@ -103,15 +104,19 @@ type Parser struct {
commentHandler func(s string)
}
func (p *Parser) setError(err error) {
if p.err == nil {
p.err = err
func (p *Parser) setError(err error, msg string) {
if p.err == nil && err != nil {
if msg == "" {
p.err = fmt.Errorf("ucd:line:%d: %v", p.line, err)
} else {
p.err = fmt.Errorf("ucd:line:%d:%s: %v", p.line, msg, err)
}
}
}
func (p *Parser) getField(i int) []byte {
func (p *Parser) getField(i int) string {
if i >= len(p.field) {
return nil
return ""
}
return p.field[i]
}
@@ -139,65 +144,66 @@ func (p *Parser) Next() bool {
p.rangeStart++
return true
}
p.comment = nil
p.comment = ""
p.field = p.field[:0]
p.parsedRange = false
for p.scanner.Scan() {
b := p.scanner.Bytes()
if len(b) == 0 {
for p.scanner.Scan() && p.err == nil {
p.line++
s := p.scanner.Text()
if s == "" {
continue
}
if b[0] == '#' {
if s[0] == '#' {
if p.commentHandler != nil {
p.commentHandler(strings.TrimSpace(string(b[1:])))
p.commentHandler(strings.TrimSpace(s[1:]))
}
continue
}
// Parse line
if i := bytes.IndexByte(b, '#'); i != -1 {
p.comment = bytes.TrimSpace(b[i+1:])
b = b[:i]
if i := strings.IndexByte(s, '#'); i != -1 {
p.comment = strings.TrimSpace(s[i+1:])
s = s[:i]
}
if b[0] == '@' {
if s[0] == '@' {
if p.partHandler != nil {
p.field = append(p.field, bytes.TrimSpace(b[1:]))
p.field = append(p.field, strings.TrimSpace(s[1:]))
p.partHandler(p)
p.field = p.field[:0]
}
p.comment = nil
p.comment = ""
continue
}
for {
i := bytes.IndexByte(b, ';')
i := strings.IndexByte(s, ';')
if i == -1 {
p.field = append(p.field, bytes.TrimSpace(b))
p.field = append(p.field, strings.TrimSpace(s))
break
}
p.field = append(p.field, bytes.TrimSpace(b[:i]))
b = b[i+1:]
p.field = append(p.field, strings.TrimSpace(s[:i]))
s = s[i+1:]
}
if !p.keepRanges {
p.rangeStart, p.rangeEnd = p.getRange(0)
}
return true
}
p.setError(p.scanner.Err())
p.setError(p.scanner.Err(), "scanner failed")
return false
}
func parseRune(b []byte) (rune, error) {
func parseRune(b string) (rune, error) {
if len(b) > 2 && b[0] == 'U' && b[1] == '+' {
b = b[2:]
}
x, err := strconv.ParseUint(string(b), 16, 32)
x, err := strconv.ParseUint(b, 16, 32)
return rune(x), err
}
func (p *Parser) parseRune(b []byte) rune {
x, err := parseRune(b)
p.setError(err)
func (p *Parser) parseRune(s string) rune {
x, err := parseRune(s)
p.setError(err, "failed to parse rune")
return x
}
@@ -211,13 +217,13 @@ func (p *Parser) Rune(i int) rune {
// Runes interprets and returns field i as a sequence of runes.
func (p *Parser) Runes(i int) (runes []rune) {
add := func(b []byte) {
if b = bytes.TrimSpace(b); len(b) > 0 {
runes = append(runes, p.parseRune(b))
add := func(s string) {
if s = strings.TrimSpace(s); len(s) > 0 {
runes = append(runes, p.parseRune(s))
}
}
for b := p.getField(i); ; {
i := bytes.IndexByte(b, ' ')
i := strings.IndexByte(b, ' ')
if i == -1 {
add(b)
break
@@ -247,7 +253,7 @@ func (p *Parser) Range(i int) (first, last rune) {
func (p *Parser) getRange(i int) (first, last rune) {
b := p.getField(i)
if k := bytes.Index(b, []byte("..")); k != -1 {
if k := strings.Index(b, ".."); k != -1 {
return p.parseRune(b[:k]), p.parseRune(b[k+2:])
}
// The first field may not be a rune, in which case we may ignore any error
@@ -260,23 +266,24 @@ func (p *Parser) getRange(i int) (first, last rune) {
p.keepRanges = true
}
// Special case for UnicodeData that was retained for backwards compatibility.
if i == 0 && len(p.field) > 1 && bytes.HasSuffix(p.field[1], []byte("First>")) {
if i == 0 && len(p.field) > 1 && strings.HasSuffix(p.field[1], "First>") {
if p.parsedRange {
return p.rangeStart, p.rangeEnd
}
mf := reRange.FindStringSubmatch(p.scanner.Text())
p.line++
if mf == nil || !p.scanner.Scan() {
p.setError(errIncorrectLegacyRange)
p.setError(errIncorrectLegacyRange, "")
return x, x
}
// Using Bytes would be more efficient here, but Text is a lot easier
// and this is not a frequent case.
ml := reRange.FindStringSubmatch(p.scanner.Text())
if ml == nil || mf[2] != ml[2] || ml[3] != "Last" || mf[4] != ml[4] {
p.setError(errIncorrectLegacyRange)
p.setError(errIncorrectLegacyRange, "")
return x, x
}
p.rangeStart, p.rangeEnd = x, p.parseRune(p.scanner.Bytes()[:len(ml[1])])
p.rangeStart, p.rangeEnd = x, p.parseRune(p.scanner.Text()[:len(ml[1])])
p.parsedRange = true
return p.rangeStart, p.rangeEnd
}
@@ -298,34 +305,34 @@ var bools = map[string]bool{
// Bool parses and returns field i as a boolean value.
func (p *Parser) Bool(i int) bool {
b := p.getField(i)
f := p.getField(i)
for s, v := range bools {
if bstrEq(b, s) {
if f == s {
return v
}
}
p.setError(strconv.ErrSyntax)
p.setError(strconv.ErrSyntax, "error parsing bool")
return false
}
// Int parses and returns field i as an integer value.
func (p *Parser) Int(i int) int {
x, err := strconv.ParseInt(string(p.getField(i)), 10, 64)
p.setError(err)
p.setError(err, "error parsing int")
return int(x)
}
// Uint parses and returns field i as an unsigned integer value.
func (p *Parser) Uint(i int) uint {
x, err := strconv.ParseUint(string(p.getField(i)), 10, 64)
p.setError(err)
p.setError(err, "error parsing uint")
return uint(x)
}
// Float parses and returns field i as a decimal value.
func (p *Parser) Float(i int) float64 {
x, err := strconv.ParseFloat(string(p.getField(i)), 64)
p.setError(err)
p.setError(err, "error parsing float")
return x
}
@@ -353,24 +360,12 @@ var errUndefinedEnum = errors.New("ucd: undefined enum value")
// Enum interprets and returns field i as a value that must be one of the values
// in enum.
func (p *Parser) Enum(i int, enum ...string) string {
b := p.getField(i)
f := p.getField(i)
for _, s := range enum {
if bstrEq(b, s) {
if f == s {
return s
}
}
p.setError(errUndefinedEnum)
p.setError(errUndefinedEnum, "error parsing enum")
return ""
}
func bstrEq(b []byte, s string) bool {
if len(b) != len(s) {
return false
}
for i, c := range b {
if c != s[i] {
return false
}
}
return true
}
+27
View File
@@ -0,0 +1,27 @@
Copyright (c) 2009 The Go Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+87
View File
@@ -0,0 +1,87 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package utf8internal contains low-level utf8-related constants, tables, etc.
// that are used internally by the text package.
package utf8internal
// The default lowest and highest continuation byte.
const (
LoCB = 0x80 // 1000 0000
HiCB = 0xBF // 1011 1111
)
// Constants related to getting information of first bytes of UTF-8 sequences.
const (
// ASCII identifies a UTF-8 byte as ASCII.
ASCII = as
// FirstInvalid indicates a byte is invalid as a first byte of a UTF-8
// sequence.
FirstInvalid = xx
// SizeMask is a mask for the size bits. Use use x&SizeMask to get the size.
SizeMask = 7
// AcceptShift is the right-shift count for the first byte info byte to get
// the index into the AcceptRanges table. See AcceptRanges.
AcceptShift = 4
// The names of these constants are chosen to give nice alignment in the
// table below. The first nibble is an index into acceptRanges or F for
// special one-byte cases. The second nibble is the Rune length or the
// Status for the special one-byte case.
xx = 0xF1 // invalid: size 1
as = 0xF0 // ASCII: size 1
s1 = 0x02 // accept 0, size 2
s2 = 0x13 // accept 1, size 3
s3 = 0x03 // accept 0, size 3
s4 = 0x23 // accept 2, size 3
s5 = 0x34 // accept 3, size 4
s6 = 0x04 // accept 0, size 4
s7 = 0x44 // accept 4, size 4
)
// First is information about the first byte in a UTF-8 sequence.
var First = [256]uint8{
// 1 2 3 4 5 6 7 8 9 A B C D E F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
// 1 2 3 4 5 6 7 8 9 A B C D E F
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
}
// AcceptRange gives the range of valid values for the second byte in a UTF-8
// sequence for any value for First that is not ASCII or FirstInvalid.
type AcceptRange struct {
Lo uint8 // lowest value for second byte.
Hi uint8 // highest value for second byte.
}
// AcceptRanges is a slice of AcceptRange values. For a given byte sequence b
//
// AcceptRanges[First[b[0]]>>AcceptShift]
//
// will give the value of AcceptRange for the multi-byte UTF-8 sequence starting
// at b[0].
var AcceptRanges = [...]AcceptRange{
0: {LoCB, HiCB},
1: {0xA0, HiCB},
2: {LoCB, 0x9F},
3: {0x90, HiCB},
4: {LoCB, 0x8F},
}
+27
View File
@@ -0,0 +1,27 @@
Copyright (c) 2009 The Go Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+16
View File
@@ -0,0 +1,16 @@
// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
package language
// This file contains code common to the maketables.go and the package code.
// langAliasType is the type of an alias in langAliasMap.
type langAliasType int8
const (
langDeprecated langAliasType = iota
langMacro
langLegacy
langAliasTypeUnknown langAliasType = -1
)
+197
View File
@@ -0,0 +1,197 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package language
import (
"fmt"
"sort"
)
// The Coverage interface is used to define the level of coverage of an
// internationalization service. Note that not all types are supported by all
// services. As lists may be generated on the fly, it is recommended that users
// of a Coverage cache the results.
type Coverage interface {
// Tags returns the list of supported tags.
Tags() []Tag
// BaseLanguages returns the list of supported base languages.
BaseLanguages() []Base
// Scripts returns the list of supported scripts.
Scripts() []Script
// Regions returns the list of supported regions.
Regions() []Region
}
var (
// Supported defines a Coverage that lists all supported subtags. Tags
// always returns nil.
Supported Coverage = allSubtags{}
)
// TODO:
// - Support Variants, numbering systems.
// - CLDR coverage levels.
// - Set of common tags defined in this package.
type allSubtags struct{}
// Regions returns the list of supported regions. As all regions are in a
// consecutive range, it simply returns a slice of numbers in increasing order.
// The "undefined" region is not returned.
func (s allSubtags) Regions() []Region {
reg := make([]Region, numRegions)
for i := range reg {
reg[i] = Region{regionID(i + 1)}
}
return reg
}
// Scripts returns the list of supported scripts. As all scripts are in a
// consecutive range, it simply returns a slice of numbers in increasing order.
// The "undefined" script is not returned.
func (s allSubtags) Scripts() []Script {
scr := make([]Script, numScripts)
for i := range scr {
scr[i] = Script{scriptID(i + 1)}
}
return scr
}
// BaseLanguages returns the list of all supported base languages. It generates
// the list by traversing the internal structures.
func (s allSubtags) BaseLanguages() []Base {
base := make([]Base, 0, numLanguages)
for i := 0; i < langNoIndexOffset; i++ {
// We included "und" already for the value 0.
if i != nonCanonicalUnd {
base = append(base, Base{langID(i)})
}
}
i := langNoIndexOffset
for _, v := range langNoIndex {
for k := 0; k < 8; k++ {
if v&1 == 1 {
base = append(base, Base{langID(i)})
}
v >>= 1
i++
}
}
return base
}
// Tags always returns nil.
func (s allSubtags) Tags() []Tag {
return nil
}
// coverage is used used by NewCoverage which is used as a convenient way for
// creating Coverage implementations for partially defined data. Very often a
// package will only need to define a subset of slices. coverage provides a
// convenient way to do this. Moreover, packages using NewCoverage, instead of
// their own implementation, will not break if later new slice types are added.
type coverage struct {
tags func() []Tag
bases func() []Base
scripts func() []Script
regions func() []Region
}
func (s *coverage) Tags() []Tag {
if s.tags == nil {
return nil
}
return s.tags()
}
// bases implements sort.Interface and is used to sort base languages.
type bases []Base
func (b bases) Len() int {
return len(b)
}
func (b bases) Swap(i, j int) {
b[i], b[j] = b[j], b[i]
}
func (b bases) Less(i, j int) bool {
return b[i].langID < b[j].langID
}
// BaseLanguages returns the result from calling s.bases if it is specified or
// otherwise derives the set of supported base languages from tags.
func (s *coverage) BaseLanguages() []Base {
if s.bases == nil {
tags := s.Tags()
if len(tags) == 0 {
return nil
}
a := make([]Base, len(tags))
for i, t := range tags {
a[i] = Base{langID(t.lang)}
}
sort.Sort(bases(a))
k := 0
for i := 1; i < len(a); i++ {
if a[k] != a[i] {
k++
a[k] = a[i]
}
}
return a[:k+1]
}
return s.bases()
}
func (s *coverage) Scripts() []Script {
if s.scripts == nil {
return nil
}
return s.scripts()
}
func (s *coverage) Regions() []Region {
if s.regions == nil {
return nil
}
return s.regions()
}
// NewCoverage returns a Coverage for the given lists. It is typically used by
// packages providing internationalization services to define their level of
// coverage. A list may be of type []T or func() []T, where T is either Tag,
// Base, Script or Region. The returned Coverage derives the value for Bases
// from Tags if no func or slice for []Base is specified. For other unspecified
// types the returned Coverage will return nil for the respective methods.
func NewCoverage(list ...interface{}) Coverage {
s := &coverage{}
for _, x := range list {
switch v := x.(type) {
case func() []Base:
s.bases = v
case func() []Script:
s.scripts = v
case func() []Region:
s.regions = v
case func() []Tag:
s.tags = v
case []Base:
s.bases = func() []Base { return v }
case []Script:
s.scripts = func() []Script { return v }
case []Region:
s.regions = func() []Region { return v }
case []Tag:
s.tags = func() []Tag { return v }
default:
panic(fmt.Sprintf("language: unsupported set type %T", v))
}
}
return s
}
+92
View File
@@ -0,0 +1,92 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package display
// This file contains sets of data for specific languages. Users can use these
// to create smaller collections of supported languages and reduce total table
// size.
// The variable names defined here correspond to those in package language.
var (
Afrikaans *Dictionary = &af // af
Amharic *Dictionary = &am // am
Arabic *Dictionary = &ar // ar
ModernStandardArabic *Dictionary = Arabic // ar-001
Azerbaijani *Dictionary = &az // az
Bulgarian *Dictionary = &bg // bg
Bengali *Dictionary = &bn // bn
Catalan *Dictionary = &ca // ca
Czech *Dictionary = &cs // cs
Danish *Dictionary = &da // da
German *Dictionary = &de // de
Greek *Dictionary = &el // el
English *Dictionary = &en // en
AmericanEnglish *Dictionary = English // en-US
BritishEnglish *Dictionary = English // en-GB
Spanish *Dictionary = &es // es
EuropeanSpanish *Dictionary = Spanish // es-ES
LatinAmericanSpanish *Dictionary = Spanish // es-419
Estonian *Dictionary = &et // et
Persian *Dictionary = &fa // fa
Finnish *Dictionary = &fi // fi
Filipino *Dictionary = &fil // fil
French *Dictionary = &fr // fr
Gujarati *Dictionary = &gu // gu
Hebrew *Dictionary = &he // he
Hindi *Dictionary = &hi // hi
Croatian *Dictionary = &hr // hr
Hungarian *Dictionary = &hu // hu
Armenian *Dictionary = &hy // hy
Indonesian *Dictionary = &id // id
Icelandic *Dictionary = &is // is
Italian *Dictionary = &it // it
Japanese *Dictionary = &ja // ja
Georgian *Dictionary = &ka // ka
Kazakh *Dictionary = &kk // kk
Khmer *Dictionary = &km // km
Kannada *Dictionary = &kn // kn
Korean *Dictionary = &ko // ko
Kirghiz *Dictionary = &ky // ky
Lao *Dictionary = &lo // lo
Lithuanian *Dictionary = &lt // lt
Latvian *Dictionary = &lv // lv
Macedonian *Dictionary = &mk // mk
Malayalam *Dictionary = &ml // ml
Mongolian *Dictionary = &mn // mn
Marathi *Dictionary = &mr // mr
Malay *Dictionary = &ms // ms
Burmese *Dictionary = &my // my
Nepali *Dictionary = &ne // ne
Dutch *Dictionary = &nl // nl
Norwegian *Dictionary = &no // no
Punjabi *Dictionary = &pa // pa
Polish *Dictionary = &pl // pl
Portuguese *Dictionary = &pt // pt
BrazilianPortuguese *Dictionary = Portuguese // pt-BR
EuropeanPortuguese *Dictionary = &ptPT // pt-PT
Romanian *Dictionary = &ro // ro
Russian *Dictionary = &ru // ru
Sinhala *Dictionary = &si // si
Slovak *Dictionary = &sk // sk
Slovenian *Dictionary = &sl // sl
Albanian *Dictionary = &sq // sq
Serbian *Dictionary = &sr // sr
SerbianLatin *Dictionary = &srLatn // sr
Swedish *Dictionary = &sv // sv
Swahili *Dictionary = &sw // sw
Tamil *Dictionary = &ta // ta
Telugu *Dictionary = &te // te
Thai *Dictionary = &th // th
Turkish *Dictionary = &tr // tr
Ukrainian *Dictionary = &uk // uk
Urdu *Dictionary = &ur // ur
Uzbek *Dictionary = &uz // uz
Vietnamese *Dictionary = &vi // vi
Chinese *Dictionary = &zh // zh
SimplifiedChinese *Dictionary = Chinese // zh-Hans
TraditionalChinese *Dictionary = &zhHant // zh-Hant
Zulu *Dictionary = &zu // zu
)
+420
View File
@@ -0,0 +1,420 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:generate go run maketables.go -output tables.go
// Package display provides display names for languages, scripts and regions in
// a requested language.
//
// The data is based on CLDR's localeDisplayNames. It includes the names of the
// draft level "contributed" or "approved". The resulting tables are quite
// large. The display package is designed so that users can reduce the linked-in
// table sizes by cherry picking the languages one wishes to support. There is a
// Dictionary defined for a selected set of common languages for this purpose.
package display // import "golang.org/x/text/language/display"
import (
"fmt"
"strings"
"golang.org/x/text/internal/format"
"golang.org/x/text/language"
)
/*
TODO:
All fairly low priority at the moment:
- Include alternative and variants as an option (using func options).
- Option for returning the empty string for undefined values.
- Support variants, currencies, time zones, option names and other data
provided in CLDR.
- Do various optimizations:
- Reduce size of offset tables.
- Consider compressing infrequently used languages and decompress on demand.
*/
// A Formatter formats a tag in the current language. It is used in conjunction
// with the message package.
type Formatter struct {
lookup func(tag int, x interface{}) string
x interface{}
}
// Format implements "golang.org/x/text/internal/format".Formatter.
func (f Formatter) Format(state format.State, verb rune) {
// TODO: there are a lot of inefficiencies in this code. Fix it when we
// language.Tag has embedded compact tags.
t := state.Language()
_, index, _ := matcher.Match(t)
str := f.lookup(index, f.x)
if str == "" {
// TODO: use language-specific punctuation.
// TODO: use codePattern instead of language?
if unknown := f.lookup(index, language.Und); unknown != "" {
fmt.Fprintf(state, "%v (%v)", unknown, f.x)
} else {
fmt.Fprintf(state, "[language: %v]", f.x)
}
} else {
state.Write([]byte(str))
}
}
// Language returns a Formatter that renders the name for lang in the
// the current language. x may be a language.Base or a language.Tag.
// It renders lang in the default language if no translation for the current
// language is supported.
func Language(lang interface{}) Formatter {
return Formatter{langFunc, lang}
}
// Region returns a Formatter that renders the name for region in the current
// language. region may be a language.Region or a language.Tag.
// It renders region in the default language if no translation for the current
// language is supported.
func Region(region interface{}) Formatter {
return Formatter{regionFunc, region}
}
// Script returns a Formatter that renders the name for script in the current
// language. script may be a language.Script or a language.Tag.
// It renders script in the default language if no translation for the current
// language is supported.
func Script(script interface{}) Formatter {
return Formatter{scriptFunc, script}
}
// Script returns a Formatter that renders the name for tag in the current
// language. tag may be a language.Tag.
// It renders tag in the default language if no translation for the current
// language is supported.
func Tag(tag interface{}) Formatter {
return Formatter{tagFunc, tag}
}
// A Namer is used to get the name for a given value, such as a Tag, Language,
// Script or Region.
type Namer interface {
// Name returns a display string for the given value. A Namer returns an
// empty string for values it does not support. A Namer may support naming
// an unspecified value. For example, when getting the name for a region for
// a tag that does not have a defined Region, it may return the name for an
// unknown region. It is up to the user to filter calls to Name for values
// for which one does not want to have a name string.
Name(x interface{}) string
}
var (
// Supported lists the languages for which names are defined.
Supported language.Coverage
// The set of all possible values for which names are defined. Note that not
// all Namer implementations will cover all the values of a given type.
// A Namer will return the empty string for unsupported values.
Values language.Coverage
matcher language.Matcher
)
func init() {
tags := make([]language.Tag, numSupported)
s := supported
for i := range tags {
p := strings.IndexByte(s, '|')
tags[i] = language.Raw.Make(s[:p])
s = s[p+1:]
}
matcher = language.NewMatcher(tags)
Supported = language.NewCoverage(tags)
Values = language.NewCoverage(langTagSet.Tags, supportedScripts, supportedRegions)
}
// Languages returns a Namer for naming languages. It returns nil if there is no
// data for the given tag. The type passed to Name must be either language.Base
// or language.Tag. Note that the result may differ between passing a tag or its
// base language. For example, for English, passing "nl-BE" would return Flemish
// whereas passing "nl" returns "Dutch".
func Languages(t language.Tag) Namer {
if _, index, conf := matcher.Match(t); conf != language.No {
return languageNamer(index)
}
return nil
}
type languageNamer int
func langFunc(i int, x interface{}) string {
return nameLanguage(languageNamer(i), x)
}
func (n languageNamer) name(i int) string {
return lookup(langHeaders[:], int(n), i)
}
// Name implements the Namer interface for language names.
func (n languageNamer) Name(x interface{}) string {
return nameLanguage(n, x)
}
// nonEmptyIndex walks up the parent chain until a non-empty header is found.
// It returns -1 if no index could be found.
func nonEmptyIndex(h []header, index int) int {
for ; index != -1 && h[index].data == ""; index = int(parents[index]) {
}
return index
}
// Scripts returns a Namer for naming scripts. It returns nil if there is no
// data for the given tag. The type passed to Name must be either a
// language.Script or a language.Tag. It will not attempt to infer a script for
// tags with an unspecified script.
func Scripts(t language.Tag) Namer {
if _, index, conf := matcher.Match(t); conf != language.No {
if index = nonEmptyIndex(scriptHeaders[:], index); index != -1 {
return scriptNamer(index)
}
}
return nil
}
type scriptNamer int
func scriptFunc(i int, x interface{}) string {
return nameScript(scriptNamer(i), x)
}
func (n scriptNamer) name(i int) string {
return lookup(scriptHeaders[:], int(n), i)
}
// Name implements the Namer interface for script names.
func (n scriptNamer) Name(x interface{}) string {
return nameScript(n, x)
}
// Regions returns a Namer for naming regions. It returns nil if there is no
// data for the given tag. The type passed to Name must be either a
// language.Region or a language.Tag. It will not attempt to infer a region for
// tags with an unspecified region.
func Regions(t language.Tag) Namer {
if _, index, conf := matcher.Match(t); conf != language.No {
if index = nonEmptyIndex(regionHeaders[:], index); index != -1 {
return regionNamer(index)
}
}
return nil
}
type regionNamer int
func regionFunc(i int, x interface{}) string {
return nameRegion(regionNamer(i), x)
}
func (n regionNamer) name(i int) string {
return lookup(regionHeaders[:], int(n), i)
}
// Name implements the Namer interface for region names.
func (n regionNamer) Name(x interface{}) string {
return nameRegion(n, x)
}
// Tags returns a Namer for giving a full description of a tag. The names of
// scripts and regions that are not already implied by the language name will
// in appended within parentheses. It returns nil if there is not data for the
// given tag. The type passed to Name must be a tag.
func Tags(t language.Tag) Namer {
if _, index, conf := matcher.Match(t); conf != language.No {
return tagNamer(index)
}
return nil
}
type tagNamer int
func tagFunc(i int, x interface{}) string {
return nameTag(languageNamer(i), scriptNamer(i), regionNamer(i), x)
}
// Name implements the Namer interface for tag names.
func (n tagNamer) Name(x interface{}) string {
return nameTag(languageNamer(n), scriptNamer(n), regionNamer(n), x)
}
// lookup finds the name for an entry in a global table, traversing the
// inheritance hierarchy if needed.
func lookup(table []header, dict, want int) string {
for dict != -1 {
if s := table[dict].name(want); s != "" {
return s
}
dict = int(parents[dict])
}
return ""
}
// A Dictionary holds a collection of Namers for a single language. One can
// reduce the amount of data linked in to a binary by only referencing
// Dictionaries for the languages one needs to support instead of using the
// generic Namer factories.
type Dictionary struct {
parent *Dictionary
lang header
script header
region header
}
// Tags returns a Namer for giving a full description of a tag. The names of
// scripts and regions that are not already implied by the language name will
// in appended within parentheses. It returns nil if there is not data for the
// given tag. The type passed to Name must be a tag.
func (d *Dictionary) Tags() Namer {
return dictTags{d}
}
type dictTags struct {
d *Dictionary
}
// Name implements the Namer interface for tag names.
func (n dictTags) Name(x interface{}) string {
return nameTag(dictLanguages{n.d}, dictScripts{n.d}, dictRegions{n.d}, x)
}
// Languages returns a Namer for naming languages. It returns nil if there is no
// data for the given tag. The type passed to Name must be either language.Base
// or language.Tag. Note that the result may differ between passing a tag or its
// base language. For example, for English, passing "nl-BE" would return Flemish
// whereas passing "nl" returns "Dutch".
func (d *Dictionary) Languages() Namer {
return dictLanguages{d}
}
type dictLanguages struct {
d *Dictionary
}
func (n dictLanguages) name(i int) string {
for d := n.d; d != nil; d = d.parent {
if s := d.lang.name(i); s != "" {
return s
}
}
return ""
}
// Name implements the Namer interface for language names.
func (n dictLanguages) Name(x interface{}) string {
return nameLanguage(n, x)
}
// Scripts returns a Namer for naming scripts. It returns nil if there is no
// data for the given tag. The type passed to Name must be either a
// language.Script or a language.Tag. It will not attempt to infer a script for
// tags with an unspecified script.
func (d *Dictionary) Scripts() Namer {
return dictScripts{d}
}
type dictScripts struct {
d *Dictionary
}
func (n dictScripts) name(i int) string {
for d := n.d; d != nil; d = d.parent {
if s := d.script.name(i); s != "" {
return s
}
}
return ""
}
// Name implements the Namer interface for script names.
func (n dictScripts) Name(x interface{}) string {
return nameScript(n, x)
}
// Regions returns a Namer for naming regions. It returns nil if there is no
// data for the given tag. The type passed to Name must be either a
// language.Region or a language.Tag. It will not attempt to infer a region for
// tags with an unspecified region.
func (d *Dictionary) Regions() Namer {
return dictRegions{d}
}
type dictRegions struct {
d *Dictionary
}
func (n dictRegions) name(i int) string {
for d := n.d; d != nil; d = d.parent {
if s := d.region.name(i); s != "" {
return s
}
}
return ""
}
// Name implements the Namer interface for region names.
func (n dictRegions) Name(x interface{}) string {
return nameRegion(n, x)
}
// A SelfNamer implements a Namer that returns the name of language in this same
// language. It provides a very compact mechanism to provide a comprehensive
// list of languages to users in their native language.
type SelfNamer struct {
// Supported defines the values supported by this Namer.
Supported language.Coverage
}
var (
// Self is a shared instance of a SelfNamer.
Self *SelfNamer = &self
self = SelfNamer{language.NewCoverage(selfTagSet.Tags)}
)
// Name returns the name of a given language tag in the language identified by
// this tag. It supports both the language.Base and language.Tag types.
func (n SelfNamer) Name(x interface{}) string {
t, _ := language.All.Compose(x)
base, scr, reg := t.Raw()
baseScript := language.Script{}
if (scr == language.Script{} && reg != language.Region{}) {
// For looking up in the self dictionary, we need to select the
// maximized script. This is even the case if the script isn't
// specified.
s1, _ := t.Script()
if baseScript = getScript(base); baseScript != s1 {
scr = s1
}
}
i, scr, reg := selfTagSet.index(base, scr, reg)
if i == -1 {
return ""
}
// Only return the display name if the script matches the expected script.
if (scr != language.Script{}) {
if (baseScript == language.Script{}) {
baseScript = getScript(base)
}
if baseScript != scr {
return ""
}
}
return selfHeaders[0].name(i)
}
// getScript returns the maximized script for a base language.
func getScript(b language.Base) language.Script {
tag, _ := language.Raw.Compose(b)
scr, _ := tag.Script()
return scr
}
+251
View File
@@ -0,0 +1,251 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package display
// This file contains common lookup code that is shared between the various
// implementations of Namer and Dictionaries.
import (
"fmt"
"sort"
"strings"
"golang.org/x/text/language"
)
type namer interface {
// name gets the string for the given index. It should walk the
// inheritance chain if a value is not present in the base index.
name(idx int) string
}
func nameLanguage(n namer, x interface{}) string {
t, _ := language.All.Compose(x)
for {
i, _, _ := langTagSet.index(t.Raw())
if s := n.name(i); s != "" {
return s
}
if t = t.Parent(); t == language.Und {
return ""
}
}
}
func nameScript(n namer, x interface{}) string {
t, _ := language.DeprecatedScript.Compose(x)
_, s, _ := t.Raw()
return n.name(scriptIndex.index(s.String()))
}
func nameRegion(n namer, x interface{}) string {
t, _ := language.DeprecatedRegion.Compose(x)
_, _, r := t.Raw()
return n.name(regionIndex.index(r.String()))
}
func nameTag(langN, scrN, regN namer, x interface{}) string {
t, ok := x.(language.Tag)
if !ok {
return ""
}
const form = language.All &^ language.SuppressScript
if c, err := form.Canonicalize(t); err == nil {
t = c
}
_, sRaw, rRaw := t.Raw()
i, scr, reg := langTagSet.index(t.Raw())
for i != -1 {
if str := langN.name(i); str != "" {
if hasS, hasR := (scr != language.Script{}), (reg != language.Region{}); hasS || hasR {
ss, sr := "", ""
if hasS {
ss = scrN.name(scriptIndex.index(scr.String()))
}
if hasR {
sr = regN.name(regionIndex.index(reg.String()))
}
// TODO: use patterns in CLDR or at least confirm they are the
// same for all languages.
if ss != "" && sr != "" {
return fmt.Sprintf("%s (%s, %s)", str, ss, sr)
}
if ss != "" || sr != "" {
return fmt.Sprintf("%s (%s%s)", str, ss, sr)
}
}
return str
}
scr, reg = sRaw, rRaw
if t = t.Parent(); t == language.Und {
return ""
}
i, _, _ = langTagSet.index(t.Raw())
}
return ""
}
// header contains the data and indexes for a single namer.
// data contains a series of strings concatenated into one. index contains the
// offsets for a string in data. For example, consider a header that defines
// strings for the languages de, el, en, fi, and nl:
//
// header{
// data: "GermanGreekEnglishDutch",
// index: []uint16{ 0, 6, 11, 18, 18, 23 },
// }
//
// For a language with index i, the string is defined by
// data[index[i]:index[i+1]]. So the number of elements in index is always one
// greater than the number of languages for which header defines a value.
// A string for a language may be empty, which means the name is undefined. In
// the above example, the name for fi (Finnish) is undefined.
type header struct {
data string
index []uint16
}
// name looks up the name for a tag in the dictionary, given its index.
func (h *header) name(i int) string {
if 0 <= i && i < len(h.index)-1 {
return h.data[h.index[i]:h.index[i+1]]
}
return ""
}
// tagSet is used to find the index of a language in a set of tags.
type tagSet struct {
single tagIndex
long []string
}
var (
langTagSet = tagSet{
single: langIndex,
long: langTagsLong,
}
// selfTagSet is used for indexing the language strings in their own
// language.
selfTagSet = tagSet{
single: selfIndex,
long: selfTagsLong,
}
zzzz = language.MustParseScript("Zzzz")
zz = language.MustParseRegion("ZZ")
)
// index returns the index of the tag for the given base, script and region or
// its parent if the tag is not available. If the match is for a parent entry,
// the excess script and region are returned.
func (ts *tagSet) index(base language.Base, scr language.Script, reg language.Region) (int, language.Script, language.Region) {
lang := base.String()
index := -1
if (scr != language.Script{} || reg != language.Region{}) {
if scr == zzzz {
scr = language.Script{}
}
if reg == zz {
reg = language.Region{}
}
i := sort.SearchStrings(ts.long, lang)
// All entries have either a script or a region and not both.
scrStr, regStr := scr.String(), reg.String()
for ; i < len(ts.long) && strings.HasPrefix(ts.long[i], lang); i++ {
if s := ts.long[i][len(lang)+1:]; s == scrStr {
scr = language.Script{}
index = i + ts.single.len()
break
} else if s == regStr {
reg = language.Region{}
index = i + ts.single.len()
break
}
}
}
if index == -1 {
index = ts.single.index(lang)
}
return index, scr, reg
}
func (ts *tagSet) Tags() []language.Tag {
tags := make([]language.Tag, 0, ts.single.len()+len(ts.long))
ts.single.keys(func(s string) {
tags = append(tags, language.Raw.MustParse(s))
})
for _, s := range ts.long {
tags = append(tags, language.Raw.MustParse(s))
}
return tags
}
func supportedScripts() []language.Script {
scr := make([]language.Script, 0, scriptIndex.len())
scriptIndex.keys(func(s string) {
scr = append(scr, language.MustParseScript(s))
})
return scr
}
func supportedRegions() []language.Region {
reg := make([]language.Region, 0, regionIndex.len())
regionIndex.keys(func(s string) {
reg = append(reg, language.MustParseRegion(s))
})
return reg
}
// tagIndex holds a concatenated lists of subtags of length 2 to 4, one string
// for each length, which can be used in combination with binary search to get
// the index associated with a tag.
// For example, a tagIndex{
// "arenesfrruzh", // 6 2-byte tags.
// "barwae", // 2 3-byte tags.
// "",
// }
// would mean that the 2-byte tag "fr" had an index of 3, and the 3-byte tag
// "wae" had an index of 7.
type tagIndex [3]string
func (t *tagIndex) index(s string) int {
sz := len(s)
if sz < 2 || 4 < sz {
return -1
}
a := t[sz-2]
index := sort.Search(len(a)/sz, func(i int) bool {
p := i * sz
return a[p:p+sz] >= s
})
p := index * sz
if end := p + sz; end > len(a) || a[p:end] != s {
return -1
}
// Add the number of tags for smaller sizes.
for i := 0; i < sz-2; i++ {
index += len(t[i]) / (i + 2)
}
return index
}
// len returns the number of tags that are contained in the tagIndex.
func (t *tagIndex) len() (n int) {
for i, s := range t {
n += len(s) / (i + 2)
}
return n
}
// keys calls f for each tag.
func (t *tagIndex) keys(f func(key string)) {
for i, s := range *t {
for ; s != ""; s = s[i+2:] {
f(s[:i+2])
}
}
}
+602
View File
@@ -0,0 +1,602 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
// Generator for display name tables.
package main
import (
"bytes"
"flag"
"fmt"
"log"
"reflect"
"sort"
"strings"
"golang.org/x/text/internal/gen"
"golang.org/x/text/language"
"golang.org/x/text/unicode/cldr"
)
var (
test = flag.Bool("test", false,
"test existing tables; can be used to compare web data with package data.")
outputFile = flag.String("output", "tables.go", "output file")
stats = flag.Bool("stats", false, "prints statistics to stderr")
short = flag.Bool("short", false, `Use "short" alternatives, when available.`)
draft = flag.String("draft",
"contributed",
`Minimal draft requirements (approved, contributed, provisional, unconfirmed).`)
pkg = flag.String("package",
"display",
"the name of the package in which the generated file is to be included")
tags = newTagSet("tags",
[]language.Tag{},
"space-separated list of tags to include or empty for all")
dict = newTagSet("dict",
dictTags(),
"space-separated list or tags for which to include a Dictionary. "+
`"" means the common list from go.text/language.`)
)
func dictTags() (tag []language.Tag) {
// TODO: replace with language.Common.Tags() once supported.
const str = "af am ar ar-001 az bg bn ca cs da de el en en-US en-GB " +
"es es-ES es-419 et fa fi fil fr fr-CA gu he hi hr hu hy id is it ja " +
"ka kk km kn ko ky lo lt lv mk ml mn mr ms my ne nl no pa pl pt pt-BR " +
"pt-PT ro ru si sk sl sq sr sr-Latn sv sw ta te th tr uk ur uz vi " +
"zh zh-Hans zh-Hant zu"
for _, s := range strings.Split(str, " ") {
tag = append(tag, language.MustParse(s))
}
return tag
}
func main() {
gen.Init()
// Read the CLDR zip file.
r := gen.OpenCLDRCoreZip()
defer r.Close()
d := &cldr.Decoder{}
d.SetDirFilter("main", "supplemental")
d.SetSectionFilter("localeDisplayNames")
data, err := d.DecodeZip(r)
if err != nil {
log.Fatalf("DecodeZip: %v", err)
}
w := gen.NewCodeWriter()
defer w.WriteGoFile(*outputFile, "display")
gen.WriteCLDRVersion(w)
b := builder{
w: w,
data: data,
group: make(map[string]*group),
}
b.generate()
}
const tagForm = language.All
// tagSet is used to parse command line flags of tags. It implements the
// flag.Value interface.
type tagSet map[language.Tag]bool
func newTagSet(name string, tags []language.Tag, usage string) tagSet {
f := tagSet(make(map[language.Tag]bool))
for _, t := range tags {
f[t] = true
}
flag.Var(f, name, usage)
return f
}
// String implements the String method of the flag.Value interface.
func (f tagSet) String() string {
tags := []string{}
for t := range f {
tags = append(tags, t.String())
}
sort.Strings(tags)
return strings.Join(tags, " ")
}
// Set implements Set from the flag.Value interface.
func (f tagSet) Set(s string) error {
if s != "" {
for _, s := range strings.Split(s, " ") {
if s != "" {
tag, err := tagForm.Parse(s)
if err != nil {
return err
}
f[tag] = true
}
}
}
return nil
}
func (f tagSet) contains(t language.Tag) bool {
if len(f) == 0 {
return true
}
return f[t]
}
// builder is used to create all tables with display name information.
type builder struct {
w *gen.CodeWriter
data *cldr.CLDR
fromLocs []string
// destination tags for the current locale.
toTags []string
toTagIndex map[string]int
// list of supported tags
supported []language.Tag
// key-value pairs per group
group map[string]*group
// statistics
sizeIndex int // total size of all indexes of headers
sizeData int // total size of all data of headers
totalSize int
}
type group struct {
// Maps from a given language to the Namer data for this language.
lang map[language.Tag]keyValues
headers []header
toTags []string
threeStart int
fourPlusStart int
}
// set sets the typ to the name for locale loc.
func (g *group) set(t language.Tag, typ, name string) {
kv := g.lang[t]
if kv == nil {
kv = make(keyValues)
g.lang[t] = kv
}
if kv[typ] == "" {
kv[typ] = name
}
}
type keyValues map[string]string
type header struct {
tag language.Tag
data string
index []uint16
}
var versionInfo = `// Version is deprecated. Use CLDRVersion.
const Version = %#v
`
var self = language.MustParse("mul")
// generate builds and writes all tables.
func (b *builder) generate() {
fmt.Fprintf(b.w, versionInfo, cldr.Version)
b.filter()
b.setData("lang", func(g *group, loc language.Tag, ldn *cldr.LocaleDisplayNames) {
if ldn.Languages != nil {
for _, v := range ldn.Languages.Language {
lang := v.Type
if lang == "root" {
// We prefer the data from "und"
// TODO: allow both the data for root and und somehow.
continue
}
tag := tagForm.MustParse(lang)
if tags.contains(tag) {
g.set(loc, tag.String(), v.Data())
}
}
}
})
b.setData("script", func(g *group, loc language.Tag, ldn *cldr.LocaleDisplayNames) {
if ldn.Scripts != nil {
for _, v := range ldn.Scripts.Script {
code := language.MustParseScript(v.Type)
if code.IsPrivateUse() { // Qaaa..Qabx
// TODO: data currently appears to be very meager.
// Reconsider if we have data for English.
if loc == language.English {
log.Fatal("Consider including data for private use scripts.")
}
continue
}
g.set(loc, code.String(), v.Data())
}
}
})
b.setData("region", func(g *group, loc language.Tag, ldn *cldr.LocaleDisplayNames) {
if ldn.Territories != nil {
for _, v := range ldn.Territories.Territory {
g.set(loc, language.MustParseRegion(v.Type).String(), v.Data())
}
}
})
b.makeSupported()
b.writeParents()
b.writeGroup("lang")
b.writeGroup("script")
b.writeGroup("region")
b.w.WriteConst("numSupported", len(b.supported))
buf := bytes.Buffer{}
for _, tag := range b.supported {
fmt.Fprint(&buf, tag.String(), "|")
}
b.w.WriteConst("supported", buf.String())
b.writeDictionaries()
b.supported = []language.Tag{self}
// Compute the names of locales in their own language. Some of these names
// may be specified in their parent locales. We iterate the maximum depth
// of the parent three times to match successive parents of tags until a
// possible match is found.
for i := 0; i < 4; i++ {
b.setData("self", func(g *group, tag language.Tag, ldn *cldr.LocaleDisplayNames) {
parent := tag
if b, s, r := tag.Raw(); i > 0 && (s != language.Script{} && r == language.Region{}) {
parent, _ = language.Raw.Compose(b)
}
if ldn.Languages != nil {
for _, v := range ldn.Languages.Language {
key := tagForm.MustParse(v.Type)
saved := key
if key == parent {
g.set(self, tag.String(), v.Data())
}
for k := 0; k < i; k++ {
key = key.Parent()
}
if key == tag {
g.set(self, saved.String(), v.Data()) // set does not overwrite a value.
}
}
}
})
}
b.writeGroup("self")
}
func (b *builder) setData(name string, f func(*group, language.Tag, *cldr.LocaleDisplayNames)) {
b.sizeIndex = 0
b.sizeData = 0
b.toTags = nil
b.fromLocs = nil
b.toTagIndex = make(map[string]int)
g := b.group[name]
if g == nil {
g = &group{lang: make(map[language.Tag]keyValues)}
b.group[name] = g
}
for _, loc := range b.data.Locales() {
// We use RawLDML instead of LDML as we are managing our own inheritance
// in this implementation.
ldml := b.data.RawLDML(loc)
// We do not support the POSIX variant (it is not a supported BCP 47
// variant). This locale also doesn't happen to contain any data, so
// we'll skip it by checking for this.
tag, err := tagForm.Parse(loc)
if err != nil {
if ldml.LocaleDisplayNames != nil {
log.Fatalf("setData: %v", err)
}
continue
}
if ldml.LocaleDisplayNames != nil && tags.contains(tag) {
f(g, tag, ldml.LocaleDisplayNames)
}
}
}
func (b *builder) filter() {
filter := func(s *cldr.Slice) {
if *short {
s.SelectOnePerGroup("alt", []string{"short", ""})
} else {
s.SelectOnePerGroup("alt", []string{"stand-alone", ""})
}
d, err := cldr.ParseDraft(*draft)
if err != nil {
log.Fatalf("filter: %v", err)
}
s.SelectDraft(d)
}
for _, loc := range b.data.Locales() {
if ldn := b.data.RawLDML(loc).LocaleDisplayNames; ldn != nil {
if ldn.Languages != nil {
s := cldr.MakeSlice(&ldn.Languages.Language)
if filter(&s); len(ldn.Languages.Language) == 0 {
ldn.Languages = nil
}
}
if ldn.Scripts != nil {
s := cldr.MakeSlice(&ldn.Scripts.Script)
if filter(&s); len(ldn.Scripts.Script) == 0 {
ldn.Scripts = nil
}
}
if ldn.Territories != nil {
s := cldr.MakeSlice(&ldn.Territories.Territory)
if filter(&s); len(ldn.Territories.Territory) == 0 {
ldn.Territories = nil
}
}
}
}
}
// makeSupported creates a list of all supported locales.
func (b *builder) makeSupported() {
// tags across groups
for _, g := range b.group {
for t, _ := range g.lang {
b.supported = append(b.supported, t)
}
}
b.supported = b.supported[:unique(tagsSorter(b.supported))]
}
type tagsSorter []language.Tag
func (a tagsSorter) Len() int { return len(a) }
func (a tagsSorter) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a tagsSorter) Less(i, j int) bool { return a[i].String() < a[j].String() }
func (b *builder) writeGroup(name string) {
g := b.group[name]
for _, kv := range g.lang {
for t, _ := range kv {
g.toTags = append(g.toTags, t)
}
}
g.toTags = g.toTags[:unique(tagsBySize(g.toTags))]
// Allocate header per supported value.
g.headers = make([]header, len(b.supported))
for i, sup := range b.supported {
kv, ok := g.lang[sup]
if !ok {
g.headers[i].tag = sup
continue
}
data := []byte{}
index := make([]uint16, len(g.toTags), len(g.toTags)+1)
for j, t := range g.toTags {
index[j] = uint16(len(data))
data = append(data, kv[t]...)
}
index = append(index, uint16(len(data)))
// Trim the tail of the index.
// TODO: indexes can be reduced in size quite a bit more.
n := len(index)
for ; n >= 2 && index[n-2] == index[n-1]; n-- {
}
index = index[:n]
// Workaround for a bug in CLDR 26.
// See http://unicode.org/cldr/trac/ticket/8042.
if cldr.Version == "26" && sup.String() == "hsb" {
data = bytes.Replace(data, []byte{'"'}, nil, 1)
}
g.headers[i] = header{sup, string(data), index}
}
g.writeTable(b.w, name)
}
type tagsBySize []string
func (l tagsBySize) Len() int { return len(l) }
func (l tagsBySize) Swap(i, j int) { l[i], l[j] = l[j], l[i] }
func (l tagsBySize) Less(i, j int) bool {
a, b := l[i], l[j]
// Sort single-tag entries based on size first. Otherwise alphabetic.
if len(a) != len(b) && (len(a) <= 4 || len(b) <= 4) {
return len(a) < len(b)
}
return a < b
}
// parentIndices returns slice a of len(tags) where tags[a[i]] is the parent
// of tags[i].
func parentIndices(tags []language.Tag) []int16 {
index := make(map[language.Tag]int16)
for i, t := range tags {
index[t] = int16(i)
}
// Construct default parents.
parents := make([]int16, len(tags))
for i, t := range tags {
parents[i] = -1
for t = t.Parent(); t != language.Und; t = t.Parent() {
if j, ok := index[t]; ok {
parents[i] = j
break
}
}
}
return parents
}
func (b *builder) writeParents() {
parents := parentIndices(b.supported)
fmt.Fprintf(b.w, "var parents = ")
b.w.WriteArray(parents)
}
// writeKeys writes keys to a special index used by the display package.
// tags are assumed to be sorted by length.
func writeKeys(w *gen.CodeWriter, name string, keys []string) {
w.Size += int(3 * reflect.TypeOf("").Size())
w.WriteComment("Number of keys: %d", len(keys))
fmt.Fprintf(w, "var (\n\t%sIndex = tagIndex{\n", name)
for i := 2; i <= 4; i++ {
sub := []string{}
for _, t := range keys {
if len(t) != i {
break
}
sub = append(sub, t)
}
s := strings.Join(sub, "")
w.WriteString(s)
fmt.Fprintf(w, ",\n")
keys = keys[len(sub):]
}
fmt.Fprintln(w, "\t}")
if len(keys) > 0 {
w.Size += int(reflect.TypeOf([]string{}).Size())
fmt.Fprintf(w, "\t%sTagsLong = ", name)
w.WriteSlice(keys)
}
fmt.Fprintln(w, ")\n")
}
// identifier creates an identifier from the given tag.
func identifier(t language.Tag) string {
return strings.Replace(t.String(), "-", "", -1)
}
func (h *header) writeEntry(w *gen.CodeWriter, name string) {
if len(dict) > 0 && dict.contains(h.tag) {
fmt.Fprintf(w, "\t{ // %s\n", h.tag)
fmt.Fprintf(w, "\t\t%[1]s%[2]sStr,\n\t\t%[1]s%[2]sIdx,\n", identifier(h.tag), name)
fmt.Fprintln(w, "\t},")
} else if len(h.data) == 0 {
fmt.Fprintln(w, "\t\t{}, //", h.tag)
} else {
fmt.Fprintf(w, "\t{ // %s\n", h.tag)
w.WriteString(h.data)
fmt.Fprintln(w, ",")
w.WriteSlice(h.index)
fmt.Fprintln(w, ",\n\t},")
}
}
// write the data for the given header as single entries. The size for this data
// was already accounted for in writeEntry.
func (h *header) writeSingle(w *gen.CodeWriter, name string) {
if len(dict) > 0 && dict.contains(h.tag) {
tag := identifier(h.tag)
w.WriteConst(tag+name+"Str", h.data)
// Note that we create a slice instead of an array. If we use an array
// we need to refer to it as a[:] in other tables, which will cause the
// array to always be included by the linker. See Issue 7651.
w.WriteVar(tag+name+"Idx", h.index)
}
}
// WriteTable writes an entry for a single Namer.
func (g *group) writeTable(w *gen.CodeWriter, name string) {
start := w.Size
writeKeys(w, name, g.toTags)
w.Size += len(g.headers) * int(reflect.ValueOf(g.headers[0]).Type().Size())
fmt.Fprintf(w, "var %sHeaders = [%d]header{\n", name, len(g.headers))
title := strings.Title(name)
for _, h := range g.headers {
h.writeEntry(w, title)
}
fmt.Fprintln(w, "}\n")
for _, h := range g.headers {
h.writeSingle(w, title)
}
n := w.Size - start
fmt.Fprintf(w, "// Total size for %s: %d bytes (%d KB)\n\n", name, n, n/1000)
}
func (b *builder) writeDictionaries() {
fmt.Fprintln(b.w, "// Dictionary entries of frequent languages")
fmt.Fprintln(b.w, "var (")
parents := parentIndices(b.supported)
for i, t := range b.supported {
if dict.contains(t) {
ident := identifier(t)
fmt.Fprintf(b.w, "\t%s = Dictionary{ // %s\n", ident, t)
if p := parents[i]; p == -1 {
fmt.Fprintln(b.w, "\t\tnil,")
} else {
fmt.Fprintf(b.w, "\t\t&%s,\n", identifier(b.supported[p]))
}
fmt.Fprintf(b.w, "\t\theader{%[1]sLangStr, %[1]sLangIdx},\n", ident)
fmt.Fprintf(b.w, "\t\theader{%[1]sScriptStr, %[1]sScriptIdx},\n", ident)
fmt.Fprintf(b.w, "\t\theader{%[1]sRegionStr, %[1]sRegionIdx},\n", ident)
fmt.Fprintln(b.w, "\t}")
}
}
fmt.Fprintln(b.w, ")")
var s string
var a []uint16
sz := reflect.TypeOf(s).Size()
sz += reflect.TypeOf(a).Size()
sz *= 3
sz += reflect.TypeOf(&a).Size()
n := int(sz) * len(dict)
fmt.Fprintf(b.w, "// Total size for %d entries: %d bytes (%d KB)\n\n", len(dict), n, n/1000)
b.w.Size += n
}
// unique sorts the given lists and removes duplicate entries by swapping them
// past position k, where k is the number of unique values. It returns k.
func unique(a sort.Interface) int {
if a.Len() == 0 {
return 0
}
sort.Sort(a)
k := 1
for i := 1; i < a.Len(); i++ {
if a.Less(k-1, i) {
if k != i {
a.Swap(k, i)
}
k++
}
}
return k
}
File diff suppressed because it is too large Load Diff
+102
View File
@@ -0,0 +1,102 @@
// Copyright 2017 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package language implements BCP 47 language tags and related functionality.
//
// The most important function of package language is to match a list of
// user-preferred languages to a list of supported languages.
// It alleviates the developer of dealing with the complexity of this process
// and provides the user with the best experience
// (see https://blog.golang.org/matchlang).
//
//
// Matching preferred against supported languages
//
// A Matcher for an application that supports English, Australian English,
// Danish, and standard Mandarin can be created as follows:
//
// var matcher = language.NewMatcher([]language.Tag{
// language.English, // The first language is used as fallback.
// language.MustParse("en-AU"),
// language.Danish,
// language.Chinese,
// })
//
// This list of supported languages is typically implied by the languages for
// which there exists translations of the user interface.
//
// User-preferred languages usually come as a comma-separated list of BCP 47
// language tags.
// The MatchString finds best matches for such strings:
//
// handler(w http.ResponseWriter, r *http.Request) {
// lang, _ := r.Cookie("lang")
// accept := r.Header.Get("Accept-Language")
// tag, _ := language.MatchStrings(matcher, lang.String(), accept)
//
// // tag should now be used for the initialization of any
// // locale-specific service.
// }
//
// The Matcher's Match method can be used to match Tags directly.
//
// Matchers are aware of the intricacies of equivalence between languages, such
// as deprecated subtags, legacy tags, macro languages, mutual
// intelligibility between scripts and languages, and transparently passing
// BCP 47 user configuration.
// For instance, it will know that a reader of Bokmål Danish can read Norwegian
// and will know that Cantonese ("yue") is a good match for "zh-HK".
//
//
// Using match results
//
// To guarantee a consistent user experience to the user it is important to
// use the same language tag for the selection of any locale-specific services.
// For example, it is utterly confusing to substitute spelled-out numbers
// or dates in one language in text of another language.
// More subtly confusing is using the wrong sorting order or casing
// algorithm for a certain language.
//
// All the packages in x/text that provide locale-specific services
// (e.g. collate, cases) should be initialized with the tag that was
// obtained at the start of an interaction with the user.
//
// Note that Tag that is returned by Match and MatchString may differ from any
// of the supported languages, as it may contain carried over settings from
// the user tags.
// This may be inconvenient when your application has some additional
// locale-specific data for your supported languages.
// Match and MatchString both return the index of the matched supported tag
// to simplify associating such data with the matched tag.
//
//
// Canonicalization
//
// If one uses the Matcher to compare languages one does not need to
// worry about canonicalization.
//
// The meaning of a Tag varies per application. The language package
// therefore delays canonicalization and preserves information as much
// as possible. The Matcher, however, will always take into account that
// two different tags may represent the same language.
//
// By default, only legacy and deprecated tags are converted into their
// canonical equivalent. All other information is preserved. This approach makes
// the confidence scores more accurate and allows matchers to distinguish
// between variants that are otherwise lost.
//
// As a consequence, two tags that should be treated as identical according to
// BCP 47 or CLDR, like "en-Latn" and "en", will be represented differently. The
// Matcher handles such distinctions, though, and is aware of the
// equivalence relations. The CanonType type can be used to alter the
// canonicalization form.
//
// References
//
// BCP 47 - Tags for Identifying Languages http://tools.ietf.org/html/bcp47
//
package language // import "golang.org/x/text/language"
// TODO: explanation on how to match languages for your own locale-specific
// service.
+1712
View File
File diff suppressed because it is too large Load Diff
+20
View File
@@ -0,0 +1,20 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
// This file contains code common to the maketables.go and the package code.
// langAliasType is the type of an alias in langAliasMap.
type langAliasType int8
const (
langDeprecated langAliasType = iota
langMacro
langLegacy
langAliasTypeUnknown langAliasType = -1
)
+162
View File
@@ -0,0 +1,162 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build ignore
package main
// This file generates derivative tables based on the language package itself.
import (
"bytes"
"flag"
"fmt"
"io/ioutil"
"log"
"reflect"
"sort"
"strings"
"golang.org/x/text/internal/gen"
"golang.org/x/text/language"
"golang.org/x/text/unicode/cldr"
)
var (
test = flag.Bool("test", false,
"test existing tables; can be used to compare web data with package data.")
draft = flag.String("draft",
"contributed",
`Minimal draft requirements (approved, contributed, provisional, unconfirmed).`)
)
func main() {
gen.Init()
// Read the CLDR zip file.
r := gen.OpenCLDRCoreZip()
defer r.Close()
d := &cldr.Decoder{}
data, err := d.DecodeZip(r)
if err != nil {
log.Fatalf("DecodeZip: %v", err)
}
w := gen.NewCodeWriter()
defer func() {
buf := &bytes.Buffer{}
if _, err = w.WriteGo(buf, "language", ""); err != nil {
log.Fatalf("Error formatting file index.go: %v", err)
}
// Since we're generating a table for our own package we need to rewrite
// doing the equivalent of go fmt -r 'language.b -> b'. Using
// bytes.Replace will do.
out := bytes.Replace(buf.Bytes(), []byte("language."), nil, -1)
if err := ioutil.WriteFile("index.go", out, 0600); err != nil {
log.Fatalf("Could not create file index.go: %v", err)
}
}()
m := map[language.Tag]bool{}
for _, lang := range data.Locales() {
// We include all locales unconditionally to be consistent with en_US.
// We want en_US, even though it has no data associated with it.
// TODO: put any of the languages for which no data exists at the end
// of the index. This allows all components based on ICU to use that
// as the cutoff point.
// if x := data.RawLDML(lang); false ||
// x.LocaleDisplayNames != nil ||
// x.Characters != nil ||
// x.Delimiters != nil ||
// x.Measurement != nil ||
// x.Dates != nil ||
// x.Numbers != nil ||
// x.Units != nil ||
// x.ListPatterns != nil ||
// x.Collations != nil ||
// x.Segmentations != nil ||
// x.Rbnf != nil ||
// x.Annotations != nil ||
// x.Metadata != nil {
// TODO: support POSIX natively, albeit non-standard.
tag := language.Make(strings.Replace(lang, "_POSIX", "-u-va-posix", 1))
m[tag] = true
// }
}
// Include locales for plural rules, which uses a different structure.
for _, plurals := range data.Supplemental().Plurals {
for _, rules := range plurals.PluralRules {
for _, lang := range strings.Split(rules.Locales, " ") {
m[language.Make(lang)] = true
}
}
}
var core, special []language.Tag
for t := range m {
if x := t.Extensions(); len(x) != 0 && fmt.Sprint(x) != "[u-va-posix]" {
log.Fatalf("Unexpected extension %v in %v", x, t)
}
if len(t.Variants()) == 0 && len(t.Extensions()) == 0 {
core = append(core, t)
} else {
special = append(special, t)
}
}
w.WriteComment(`
NumCompactTags is the number of common tags. The maximum tag is
NumCompactTags-1.`)
w.WriteConst("NumCompactTags", len(core)+len(special))
sort.Sort(byAlpha(special))
w.WriteVar("specialTags", special)
// TODO: order by frequency?
sort.Sort(byAlpha(core))
// Size computations are just an estimate.
w.Size += int(reflect.TypeOf(map[uint32]uint16{}).Size())
w.Size += len(core) * 6 // size of uint32 and uint16
fmt.Fprintln(w)
fmt.Fprintln(w, "var coreTags = map[uint32]uint16{")
fmt.Fprintln(w, "0x0: 0, // und")
i := len(special) + 1 // Und and special tags already written.
for _, t := range core {
if t == language.Und {
continue
}
fmt.Fprint(w.Hash, t, i)
b, s, r := t.Raw()
fmt.Fprintf(w, "0x%s%s%s: %d, // %s\n",
getIndex(b, 3), // 3 is enough as it is guaranteed to be a compact number
getIndex(s, 2),
getIndex(r, 3),
i, t)
i++
}
fmt.Fprintln(w, "}")
}
// getIndex prints the subtag type and extracts its index of size nibble.
// If the index is less than n nibbles, the result is prefixed with 0s.
func getIndex(x interface{}, n int) string {
s := fmt.Sprintf("%#v", x) // s is of form Type{typeID: 0x00}
s = s[strings.Index(s, "0x")+2 : len(s)-1]
return strings.Repeat("0", n-len(s)) + s
}
type byAlpha []language.Tag
func (a byAlpha) Len() int { return len(a) }
func (a byAlpha) Swap(i, j int) { a[i], a[j] = a[j], a[i] }
func (a byAlpha) Less(i, j int) bool { return a[i].String() < a[j].String() }
+38
View File
@@ -0,0 +1,38 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build !go1.2
package language
import "sort"
func sortStable(s sort.Interface) {
ss := stableSort{
s: s,
pos: make([]int, s.Len()),
}
for i := range ss.pos {
ss.pos[i] = i
}
sort.Sort(&ss)
}
type stableSort struct {
s sort.Interface
pos []int
}
func (s *stableSort) Len() int {
return len(s.pos)
}
func (s *stableSort) Less(i, j int) bool {
return s.s.Less(i, j) || !s.s.Less(j, i) && s.pos[i] < s.pos[j]
}
func (s *stableSort) Swap(i, j int) {
s.s.Swap(i, j)
s.pos[i], s.pos[j] = s.pos[j], s.pos[i]
}
+11
View File
@@ -0,0 +1,11 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// +build go1.2
package language
import "sort"
var sortStable = sort.Stable
+783
View File
@@ -0,0 +1,783 @@
// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
package language
// NumCompactTags is the number of common tags. The maximum tag is
// NumCompactTags-1.
const NumCompactTags = 768
var specialTags = []Tag{ // 2 elements
0: {lang: 0xd7, region: 0x6e, script: 0x0, pVariant: 0x5, pExt: 0xe, str: "ca-ES-valencia"},
1: {lang: 0x139, region: 0x135, script: 0x0, pVariant: 0x5, pExt: 0x5, str: "en-US-u-va-posix"},
} // Size: 72 bytes
var coreTags = map[uint32]uint16{
0x0: 0, // und
0x01600000: 3, // af
0x016000d2: 4, // af-NA
0x01600161: 5, // af-ZA
0x01c00000: 6, // agq
0x01c00052: 7, // agq-CM
0x02100000: 8, // ak
0x02100080: 9, // ak-GH
0x02700000: 10, // am
0x0270006f: 11, // am-ET
0x03a00000: 12, // ar
0x03a00001: 13, // ar-001
0x03a00023: 14, // ar-AE
0x03a00039: 15, // ar-BH
0x03a00062: 16, // ar-DJ
0x03a00067: 17, // ar-DZ
0x03a0006b: 18, // ar-EG
0x03a0006c: 19, // ar-EH
0x03a0006d: 20, // ar-ER
0x03a00097: 21, // ar-IL
0x03a0009b: 22, // ar-IQ
0x03a000a1: 23, // ar-JO
0x03a000a8: 24, // ar-KM
0x03a000ac: 25, // ar-KW
0x03a000b0: 26, // ar-LB
0x03a000b9: 27, // ar-LY
0x03a000ba: 28, // ar-MA
0x03a000c9: 29, // ar-MR
0x03a000e1: 30, // ar-OM
0x03a000ed: 31, // ar-PS
0x03a000f3: 32, // ar-QA
0x03a00108: 33, // ar-SA
0x03a0010b: 34, // ar-SD
0x03a00115: 35, // ar-SO
0x03a00117: 36, // ar-SS
0x03a0011c: 37, // ar-SY
0x03a00120: 38, // ar-TD
0x03a00128: 39, // ar-TN
0x03a0015e: 40, // ar-YE
0x04000000: 41, // ars
0x04300000: 42, // as
0x04300099: 43, // as-IN
0x04400000: 44, // asa
0x0440012f: 45, // asa-TZ
0x04800000: 46, // ast
0x0480006e: 47, // ast-ES
0x05800000: 48, // az
0x0581f000: 49, // az-Cyrl
0x0581f032: 50, // az-Cyrl-AZ
0x05857000: 51, // az-Latn
0x05857032: 52, // az-Latn-AZ
0x05e00000: 53, // bas
0x05e00052: 54, // bas-CM
0x07100000: 55, // be
0x07100047: 56, // be-BY
0x07500000: 57, // bem
0x07500162: 58, // bem-ZM
0x07900000: 59, // bez
0x0790012f: 60, // bez-TZ
0x07e00000: 61, // bg
0x07e00038: 62, // bg-BG
0x08200000: 63, // bh
0x0a000000: 64, // bm
0x0a0000c3: 65, // bm-ML
0x0a500000: 66, // bn
0x0a500035: 67, // bn-BD
0x0a500099: 68, // bn-IN
0x0a900000: 69, // bo
0x0a900053: 70, // bo-CN
0x0a900099: 71, // bo-IN
0x0b200000: 72, // br
0x0b200078: 73, // br-FR
0x0b500000: 74, // brx
0x0b500099: 75, // brx-IN
0x0b700000: 76, // bs
0x0b71f000: 77, // bs-Cyrl
0x0b71f033: 78, // bs-Cyrl-BA
0x0b757000: 79, // bs-Latn
0x0b757033: 80, // bs-Latn-BA
0x0d700000: 81, // ca
0x0d700022: 82, // ca-AD
0x0d70006e: 83, // ca-ES
0x0d700078: 84, // ca-FR
0x0d70009e: 85, // ca-IT
0x0db00000: 86, // ccp
0x0db00035: 87, // ccp-BD
0x0db00099: 88, // ccp-IN
0x0dc00000: 89, // ce
0x0dc00106: 90, // ce-RU
0x0df00000: 91, // cgg
0x0df00131: 92, // cgg-UG
0x0e500000: 93, // chr
0x0e500135: 94, // chr-US
0x0e900000: 95, // ckb
0x0e90009b: 96, // ckb-IQ
0x0e90009c: 97, // ckb-IR
0x0fa00000: 98, // cs
0x0fa0005e: 99, // cs-CZ
0x0fe00000: 100, // cu
0x0fe00106: 101, // cu-RU
0x10000000: 102, // cy
0x1000007b: 103, // cy-GB
0x10100000: 104, // da
0x10100063: 105, // da-DK
0x10100082: 106, // da-GL
0x10800000: 107, // dav
0x108000a4: 108, // dav-KE
0x10d00000: 109, // de
0x10d0002e: 110, // de-AT
0x10d00036: 111, // de-BE
0x10d0004e: 112, // de-CH
0x10d00060: 113, // de-DE
0x10d0009e: 114, // de-IT
0x10d000b2: 115, // de-LI
0x10d000b7: 116, // de-LU
0x11700000: 117, // dje
0x117000d4: 118, // dje-NE
0x11f00000: 119, // dsb
0x11f00060: 120, // dsb-DE
0x12400000: 121, // dua
0x12400052: 122, // dua-CM
0x12800000: 123, // dv
0x12b00000: 124, // dyo
0x12b00114: 125, // dyo-SN
0x12d00000: 126, // dz
0x12d00043: 127, // dz-BT
0x12f00000: 128, // ebu
0x12f000a4: 129, // ebu-KE
0x13000000: 130, // ee
0x13000080: 131, // ee-GH
0x13000122: 132, // ee-TG
0x13600000: 133, // el
0x1360005d: 134, // el-CY
0x13600087: 135, // el-GR
0x13900000: 136, // en
0x13900001: 137, // en-001
0x1390001a: 138, // en-150
0x13900025: 139, // en-AG
0x13900026: 140, // en-AI
0x1390002d: 141, // en-AS
0x1390002e: 142, // en-AT
0x1390002f: 143, // en-AU
0x13900034: 144, // en-BB
0x13900036: 145, // en-BE
0x1390003a: 146, // en-BI
0x1390003d: 147, // en-BM
0x13900042: 148, // en-BS
0x13900046: 149, // en-BW
0x13900048: 150, // en-BZ
0x13900049: 151, // en-CA
0x1390004a: 152, // en-CC
0x1390004e: 153, // en-CH
0x13900050: 154, // en-CK
0x13900052: 155, // en-CM
0x1390005c: 156, // en-CX
0x1390005d: 157, // en-CY
0x13900060: 158, // en-DE
0x13900061: 159, // en-DG
0x13900063: 160, // en-DK
0x13900064: 161, // en-DM
0x1390006d: 162, // en-ER
0x13900072: 163, // en-FI
0x13900073: 164, // en-FJ
0x13900074: 165, // en-FK
0x13900075: 166, // en-FM
0x1390007b: 167, // en-GB
0x1390007c: 168, // en-GD
0x1390007f: 169, // en-GG
0x13900080: 170, // en-GH
0x13900081: 171, // en-GI
0x13900083: 172, // en-GM
0x1390008a: 173, // en-GU
0x1390008c: 174, // en-GY
0x1390008d: 175, // en-HK
0x13900096: 176, // en-IE
0x13900097: 177, // en-IL
0x13900098: 178, // en-IM
0x13900099: 179, // en-IN
0x1390009a: 180, // en-IO
0x1390009f: 181, // en-JE
0x139000a0: 182, // en-JM
0x139000a4: 183, // en-KE
0x139000a7: 184, // en-KI
0x139000a9: 185, // en-KN
0x139000ad: 186, // en-KY
0x139000b1: 187, // en-LC
0x139000b4: 188, // en-LR
0x139000b5: 189, // en-LS
0x139000bf: 190, // en-MG
0x139000c0: 191, // en-MH
0x139000c6: 192, // en-MO
0x139000c7: 193, // en-MP
0x139000ca: 194, // en-MS
0x139000cb: 195, // en-MT
0x139000cc: 196, // en-MU
0x139000ce: 197, // en-MW
0x139000d0: 198, // en-MY
0x139000d2: 199, // en-NA
0x139000d5: 200, // en-NF
0x139000d6: 201, // en-NG
0x139000d9: 202, // en-NL
0x139000dd: 203, // en-NR
0x139000df: 204, // en-NU
0x139000e0: 205, // en-NZ
0x139000e6: 206, // en-PG
0x139000e7: 207, // en-PH
0x139000e8: 208, // en-PK
0x139000eb: 209, // en-PN
0x139000ec: 210, // en-PR
0x139000f0: 211, // en-PW
0x13900107: 212, // en-RW
0x13900109: 213, // en-SB
0x1390010a: 214, // en-SC
0x1390010b: 215, // en-SD
0x1390010c: 216, // en-SE
0x1390010d: 217, // en-SG
0x1390010e: 218, // en-SH
0x1390010f: 219, // en-SI
0x13900112: 220, // en-SL
0x13900117: 221, // en-SS
0x1390011b: 222, // en-SX
0x1390011d: 223, // en-SZ
0x1390011f: 224, // en-TC
0x13900125: 225, // en-TK
0x13900129: 226, // en-TO
0x1390012c: 227, // en-TT
0x1390012d: 228, // en-TV
0x1390012f: 229, // en-TZ
0x13900131: 230, // en-UG
0x13900133: 231, // en-UM
0x13900135: 232, // en-US
0x13900139: 233, // en-VC
0x1390013c: 234, // en-VG
0x1390013d: 235, // en-VI
0x1390013f: 236, // en-VU
0x13900142: 237, // en-WS
0x13900161: 238, // en-ZA
0x13900162: 239, // en-ZM
0x13900164: 240, // en-ZW
0x13c00000: 241, // eo
0x13c00001: 242, // eo-001
0x13e00000: 243, // es
0x13e0001f: 244, // es-419
0x13e0002c: 245, // es-AR
0x13e0003f: 246, // es-BO
0x13e00041: 247, // es-BR
0x13e00048: 248, // es-BZ
0x13e00051: 249, // es-CL
0x13e00054: 250, // es-CO
0x13e00056: 251, // es-CR
0x13e00059: 252, // es-CU
0x13e00065: 253, // es-DO
0x13e00068: 254, // es-EA
0x13e00069: 255, // es-EC
0x13e0006e: 256, // es-ES
0x13e00086: 257, // es-GQ
0x13e00089: 258, // es-GT
0x13e0008f: 259, // es-HN
0x13e00094: 260, // es-IC
0x13e000cf: 261, // es-MX
0x13e000d8: 262, // es-NI
0x13e000e2: 263, // es-PA
0x13e000e4: 264, // es-PE
0x13e000e7: 265, // es-PH
0x13e000ec: 266, // es-PR
0x13e000f1: 267, // es-PY
0x13e0011a: 268, // es-SV
0x13e00135: 269, // es-US
0x13e00136: 270, // es-UY
0x13e0013b: 271, // es-VE
0x14000000: 272, // et
0x1400006a: 273, // et-EE
0x14500000: 274, // eu
0x1450006e: 275, // eu-ES
0x14600000: 276, // ewo
0x14600052: 277, // ewo-CM
0x14800000: 278, // fa
0x14800024: 279, // fa-AF
0x1480009c: 280, // fa-IR
0x14e00000: 281, // ff
0x14e00052: 282, // ff-CM
0x14e00084: 283, // ff-GN
0x14e000c9: 284, // ff-MR
0x14e00114: 285, // ff-SN
0x15100000: 286, // fi
0x15100072: 287, // fi-FI
0x15300000: 288, // fil
0x153000e7: 289, // fil-PH
0x15800000: 290, // fo
0x15800063: 291, // fo-DK
0x15800076: 292, // fo-FO
0x15e00000: 293, // fr
0x15e00036: 294, // fr-BE
0x15e00037: 295, // fr-BF
0x15e0003a: 296, // fr-BI
0x15e0003b: 297, // fr-BJ
0x15e0003c: 298, // fr-BL
0x15e00049: 299, // fr-CA
0x15e0004b: 300, // fr-CD
0x15e0004c: 301, // fr-CF
0x15e0004d: 302, // fr-CG
0x15e0004e: 303, // fr-CH
0x15e0004f: 304, // fr-CI
0x15e00052: 305, // fr-CM
0x15e00062: 306, // fr-DJ
0x15e00067: 307, // fr-DZ
0x15e00078: 308, // fr-FR
0x15e0007a: 309, // fr-GA
0x15e0007e: 310, // fr-GF
0x15e00084: 311, // fr-GN
0x15e00085: 312, // fr-GP
0x15e00086: 313, // fr-GQ
0x15e00091: 314, // fr-HT
0x15e000a8: 315, // fr-KM
0x15e000b7: 316, // fr-LU
0x15e000ba: 317, // fr-MA
0x15e000bb: 318, // fr-MC
0x15e000be: 319, // fr-MF
0x15e000bf: 320, // fr-MG
0x15e000c3: 321, // fr-ML
0x15e000c8: 322, // fr-MQ
0x15e000c9: 323, // fr-MR
0x15e000cc: 324, // fr-MU
0x15e000d3: 325, // fr-NC
0x15e000d4: 326, // fr-NE
0x15e000e5: 327, // fr-PF
0x15e000ea: 328, // fr-PM
0x15e00102: 329, // fr-RE
0x15e00107: 330, // fr-RW
0x15e0010a: 331, // fr-SC
0x15e00114: 332, // fr-SN
0x15e0011c: 333, // fr-SY
0x15e00120: 334, // fr-TD
0x15e00122: 335, // fr-TG
0x15e00128: 336, // fr-TN
0x15e0013f: 337, // fr-VU
0x15e00140: 338, // fr-WF
0x15e0015f: 339, // fr-YT
0x16900000: 340, // fur
0x1690009e: 341, // fur-IT
0x16d00000: 342, // fy
0x16d000d9: 343, // fy-NL
0x16e00000: 344, // ga
0x16e00096: 345, // ga-IE
0x17e00000: 346, // gd
0x17e0007b: 347, // gd-GB
0x19000000: 348, // gl
0x1900006e: 349, // gl-ES
0x1a300000: 350, // gsw
0x1a30004e: 351, // gsw-CH
0x1a300078: 352, // gsw-FR
0x1a3000b2: 353, // gsw-LI
0x1a400000: 354, // gu
0x1a400099: 355, // gu-IN
0x1a900000: 356, // guw
0x1ab00000: 357, // guz
0x1ab000a4: 358, // guz-KE
0x1ac00000: 359, // gv
0x1ac00098: 360, // gv-IM
0x1b400000: 361, // ha
0x1b400080: 362, // ha-GH
0x1b4000d4: 363, // ha-NE
0x1b4000d6: 364, // ha-NG
0x1b800000: 365, // haw
0x1b800135: 366, // haw-US
0x1bc00000: 367, // he
0x1bc00097: 368, // he-IL
0x1be00000: 369, // hi
0x1be00099: 370, // hi-IN
0x1d100000: 371, // hr
0x1d100033: 372, // hr-BA
0x1d100090: 373, // hr-HR
0x1d200000: 374, // hsb
0x1d200060: 375, // hsb-DE
0x1d500000: 376, // hu
0x1d500092: 377, // hu-HU
0x1d700000: 378, // hy
0x1d700028: 379, // hy-AM
0x1e100000: 380, // id
0x1e100095: 381, // id-ID
0x1e700000: 382, // ig
0x1e7000d6: 383, // ig-NG
0x1ea00000: 384, // ii
0x1ea00053: 385, // ii-CN
0x1f500000: 386, // io
0x1f800000: 387, // is
0x1f80009d: 388, // is-IS
0x1f900000: 389, // it
0x1f90004e: 390, // it-CH
0x1f90009e: 391, // it-IT
0x1f900113: 392, // it-SM
0x1f900138: 393, // it-VA
0x1fa00000: 394, // iu
0x20000000: 395, // ja
0x200000a2: 396, // ja-JP
0x20300000: 397, // jbo
0x20700000: 398, // jgo
0x20700052: 399, // jgo-CM
0x20a00000: 400, // jmc
0x20a0012f: 401, // jmc-TZ
0x20e00000: 402, // jv
0x21000000: 403, // ka
0x2100007d: 404, // ka-GE
0x21200000: 405, // kab
0x21200067: 406, // kab-DZ
0x21600000: 407, // kaj
0x21700000: 408, // kam
0x217000a4: 409, // kam-KE
0x21f00000: 410, // kcg
0x22300000: 411, // kde
0x2230012f: 412, // kde-TZ
0x22700000: 413, // kea
0x2270005a: 414, // kea-CV
0x23400000: 415, // khq
0x234000c3: 416, // khq-ML
0x23900000: 417, // ki
0x239000a4: 418, // ki-KE
0x24200000: 419, // kk
0x242000ae: 420, // kk-KZ
0x24400000: 421, // kkj
0x24400052: 422, // kkj-CM
0x24500000: 423, // kl
0x24500082: 424, // kl-GL
0x24600000: 425, // kln
0x246000a4: 426, // kln-KE
0x24a00000: 427, // km
0x24a000a6: 428, // km-KH
0x25100000: 429, // kn
0x25100099: 430, // kn-IN
0x25400000: 431, // ko
0x254000aa: 432, // ko-KP
0x254000ab: 433, // ko-KR
0x25600000: 434, // kok
0x25600099: 435, // kok-IN
0x26a00000: 436, // ks
0x26a00099: 437, // ks-IN
0x26b00000: 438, // ksb
0x26b0012f: 439, // ksb-TZ
0x26d00000: 440, // ksf
0x26d00052: 441, // ksf-CM
0x26e00000: 442, // ksh
0x26e00060: 443, // ksh-DE
0x27400000: 444, // ku
0x28100000: 445, // kw
0x2810007b: 446, // kw-GB
0x28a00000: 447, // ky
0x28a000a5: 448, // ky-KG
0x29100000: 449, // lag
0x2910012f: 450, // lag-TZ
0x29500000: 451, // lb
0x295000b7: 452, // lb-LU
0x2a300000: 453, // lg
0x2a300131: 454, // lg-UG
0x2af00000: 455, // lkt
0x2af00135: 456, // lkt-US
0x2b500000: 457, // ln
0x2b50002a: 458, // ln-AO
0x2b50004b: 459, // ln-CD
0x2b50004c: 460, // ln-CF
0x2b50004d: 461, // ln-CG
0x2b800000: 462, // lo
0x2b8000af: 463, // lo-LA
0x2bf00000: 464, // lrc
0x2bf0009b: 465, // lrc-IQ
0x2bf0009c: 466, // lrc-IR
0x2c000000: 467, // lt
0x2c0000b6: 468, // lt-LT
0x2c200000: 469, // lu
0x2c20004b: 470, // lu-CD
0x2c400000: 471, // luo
0x2c4000a4: 472, // luo-KE
0x2c500000: 473, // luy
0x2c5000a4: 474, // luy-KE
0x2c700000: 475, // lv
0x2c7000b8: 476, // lv-LV
0x2d100000: 477, // mas
0x2d1000a4: 478, // mas-KE
0x2d10012f: 479, // mas-TZ
0x2e900000: 480, // mer
0x2e9000a4: 481, // mer-KE
0x2ed00000: 482, // mfe
0x2ed000cc: 483, // mfe-MU
0x2f100000: 484, // mg
0x2f1000bf: 485, // mg-MG
0x2f200000: 486, // mgh
0x2f2000d1: 487, // mgh-MZ
0x2f400000: 488, // mgo
0x2f400052: 489, // mgo-CM
0x2ff00000: 490, // mk
0x2ff000c2: 491, // mk-MK
0x30400000: 492, // ml
0x30400099: 493, // ml-IN
0x30b00000: 494, // mn
0x30b000c5: 495, // mn-MN
0x31b00000: 496, // mr
0x31b00099: 497, // mr-IN
0x31f00000: 498, // ms
0x31f0003e: 499, // ms-BN
0x31f000d0: 500, // ms-MY
0x31f0010d: 501, // ms-SG
0x32000000: 502, // mt
0x320000cb: 503, // mt-MT
0x32500000: 504, // mua
0x32500052: 505, // mua-CM
0x33100000: 506, // my
0x331000c4: 507, // my-MM
0x33a00000: 508, // mzn
0x33a0009c: 509, // mzn-IR
0x34100000: 510, // nah
0x34500000: 511, // naq
0x345000d2: 512, // naq-NA
0x34700000: 513, // nb
0x347000da: 514, // nb-NO
0x34700110: 515, // nb-SJ
0x34e00000: 516, // nd
0x34e00164: 517, // nd-ZW
0x35000000: 518, // nds
0x35000060: 519, // nds-DE
0x350000d9: 520, // nds-NL
0x35100000: 521, // ne
0x35100099: 522, // ne-IN
0x351000db: 523, // ne-NP
0x36700000: 524, // nl
0x36700030: 525, // nl-AW
0x36700036: 526, // nl-BE
0x36700040: 527, // nl-BQ
0x3670005b: 528, // nl-CW
0x367000d9: 529, // nl-NL
0x36700116: 530, // nl-SR
0x3670011b: 531, // nl-SX
0x36800000: 532, // nmg
0x36800052: 533, // nmg-CM
0x36a00000: 534, // nn
0x36a000da: 535, // nn-NO
0x36c00000: 536, // nnh
0x36c00052: 537, // nnh-CM
0x36f00000: 538, // no
0x37500000: 539, // nqo
0x37600000: 540, // nr
0x37a00000: 541, // nso
0x38000000: 542, // nus
0x38000117: 543, // nus-SS
0x38700000: 544, // ny
0x38900000: 545, // nyn
0x38900131: 546, // nyn-UG
0x39000000: 547, // om
0x3900006f: 548, // om-ET
0x390000a4: 549, // om-KE
0x39500000: 550, // or
0x39500099: 551, // or-IN
0x39800000: 552, // os
0x3980007d: 553, // os-GE
0x39800106: 554, // os-RU
0x39d00000: 555, // pa
0x39d05000: 556, // pa-Arab
0x39d050e8: 557, // pa-Arab-PK
0x39d33000: 558, // pa-Guru
0x39d33099: 559, // pa-Guru-IN
0x3a100000: 560, // pap
0x3b300000: 561, // pl
0x3b3000e9: 562, // pl-PL
0x3bd00000: 563, // prg
0x3bd00001: 564, // prg-001
0x3be00000: 565, // ps
0x3be00024: 566, // ps-AF
0x3c000000: 567, // pt
0x3c00002a: 568, // pt-AO
0x3c000041: 569, // pt-BR
0x3c00004e: 570, // pt-CH
0x3c00005a: 571, // pt-CV
0x3c000086: 572, // pt-GQ
0x3c00008b: 573, // pt-GW
0x3c0000b7: 574, // pt-LU
0x3c0000c6: 575, // pt-MO
0x3c0000d1: 576, // pt-MZ
0x3c0000ee: 577, // pt-PT
0x3c000118: 578, // pt-ST
0x3c000126: 579, // pt-TL
0x3c400000: 580, // qu
0x3c40003f: 581, // qu-BO
0x3c400069: 582, // qu-EC
0x3c4000e4: 583, // qu-PE
0x3d400000: 584, // rm
0x3d40004e: 585, // rm-CH
0x3d900000: 586, // rn
0x3d90003a: 587, // rn-BI
0x3dc00000: 588, // ro
0x3dc000bc: 589, // ro-MD
0x3dc00104: 590, // ro-RO
0x3de00000: 591, // rof
0x3de0012f: 592, // rof-TZ
0x3e200000: 593, // ru
0x3e200047: 594, // ru-BY
0x3e2000a5: 595, // ru-KG
0x3e2000ae: 596, // ru-KZ
0x3e2000bc: 597, // ru-MD
0x3e200106: 598, // ru-RU
0x3e200130: 599, // ru-UA
0x3e500000: 600, // rw
0x3e500107: 601, // rw-RW
0x3e600000: 602, // rwk
0x3e60012f: 603, // rwk-TZ
0x3eb00000: 604, // sah
0x3eb00106: 605, // sah-RU
0x3ec00000: 606, // saq
0x3ec000a4: 607, // saq-KE
0x3f300000: 608, // sbp
0x3f30012f: 609, // sbp-TZ
0x3fa00000: 610, // sd
0x3fa000e8: 611, // sd-PK
0x3fc00000: 612, // sdh
0x3fd00000: 613, // se
0x3fd00072: 614, // se-FI
0x3fd000da: 615, // se-NO
0x3fd0010c: 616, // se-SE
0x3ff00000: 617, // seh
0x3ff000d1: 618, // seh-MZ
0x40100000: 619, // ses
0x401000c3: 620, // ses-ML
0x40200000: 621, // sg
0x4020004c: 622, // sg-CF
0x40800000: 623, // shi
0x40857000: 624, // shi-Latn
0x408570ba: 625, // shi-Latn-MA
0x408dc000: 626, // shi-Tfng
0x408dc0ba: 627, // shi-Tfng-MA
0x40c00000: 628, // si
0x40c000b3: 629, // si-LK
0x41200000: 630, // sk
0x41200111: 631, // sk-SK
0x41600000: 632, // sl
0x4160010f: 633, // sl-SI
0x41c00000: 634, // sma
0x41d00000: 635, // smi
0x41e00000: 636, // smj
0x41f00000: 637, // smn
0x41f00072: 638, // smn-FI
0x42200000: 639, // sms
0x42300000: 640, // sn
0x42300164: 641, // sn-ZW
0x42900000: 642, // so
0x42900062: 643, // so-DJ
0x4290006f: 644, // so-ET
0x429000a4: 645, // so-KE
0x42900115: 646, // so-SO
0x43100000: 647, // sq
0x43100027: 648, // sq-AL
0x431000c2: 649, // sq-MK
0x4310014d: 650, // sq-XK
0x43200000: 651, // sr
0x4321f000: 652, // sr-Cyrl
0x4321f033: 653, // sr-Cyrl-BA
0x4321f0bd: 654, // sr-Cyrl-ME
0x4321f105: 655, // sr-Cyrl-RS
0x4321f14d: 656, // sr-Cyrl-XK
0x43257000: 657, // sr-Latn
0x43257033: 658, // sr-Latn-BA
0x432570bd: 659, // sr-Latn-ME
0x43257105: 660, // sr-Latn-RS
0x4325714d: 661, // sr-Latn-XK
0x43700000: 662, // ss
0x43a00000: 663, // ssy
0x43b00000: 664, // st
0x44400000: 665, // sv
0x44400031: 666, // sv-AX
0x44400072: 667, // sv-FI
0x4440010c: 668, // sv-SE
0x44500000: 669, // sw
0x4450004b: 670, // sw-CD
0x445000a4: 671, // sw-KE
0x4450012f: 672, // sw-TZ
0x44500131: 673, // sw-UG
0x44e00000: 674, // syr
0x45000000: 675, // ta
0x45000099: 676, // ta-IN
0x450000b3: 677, // ta-LK
0x450000d0: 678, // ta-MY
0x4500010d: 679, // ta-SG
0x46100000: 680, // te
0x46100099: 681, // te-IN
0x46400000: 682, // teo
0x464000a4: 683, // teo-KE
0x46400131: 684, // teo-UG
0x46700000: 685, // tg
0x46700124: 686, // tg-TJ
0x46b00000: 687, // th
0x46b00123: 688, // th-TH
0x46f00000: 689, // ti
0x46f0006d: 690, // ti-ER
0x46f0006f: 691, // ti-ET
0x47100000: 692, // tig
0x47600000: 693, // tk
0x47600127: 694, // tk-TM
0x48000000: 695, // tn
0x48200000: 696, // to
0x48200129: 697, // to-TO
0x48a00000: 698, // tr
0x48a0005d: 699, // tr-CY
0x48a0012b: 700, // tr-TR
0x48e00000: 701, // ts
0x49400000: 702, // tt
0x49400106: 703, // tt-RU
0x4a400000: 704, // twq
0x4a4000d4: 705, // twq-NE
0x4a900000: 706, // tzm
0x4a9000ba: 707, // tzm-MA
0x4ac00000: 708, // ug
0x4ac00053: 709, // ug-CN
0x4ae00000: 710, // uk
0x4ae00130: 711, // uk-UA
0x4b400000: 712, // ur
0x4b400099: 713, // ur-IN
0x4b4000e8: 714, // ur-PK
0x4bc00000: 715, // uz
0x4bc05000: 716, // uz-Arab
0x4bc05024: 717, // uz-Arab-AF
0x4bc1f000: 718, // uz-Cyrl
0x4bc1f137: 719, // uz-Cyrl-UZ
0x4bc57000: 720, // uz-Latn
0x4bc57137: 721, // uz-Latn-UZ
0x4be00000: 722, // vai
0x4be57000: 723, // vai-Latn
0x4be570b4: 724, // vai-Latn-LR
0x4bee3000: 725, // vai-Vaii
0x4bee30b4: 726, // vai-Vaii-LR
0x4c000000: 727, // ve
0x4c300000: 728, // vi
0x4c30013e: 729, // vi-VN
0x4c900000: 730, // vo
0x4c900001: 731, // vo-001
0x4cc00000: 732, // vun
0x4cc0012f: 733, // vun-TZ
0x4ce00000: 734, // wa
0x4cf00000: 735, // wae
0x4cf0004e: 736, // wae-CH
0x4e500000: 737, // wo
0x4e500114: 738, // wo-SN
0x4f200000: 739, // xh
0x4fb00000: 740, // xog
0x4fb00131: 741, // xog-UG
0x50900000: 742, // yav
0x50900052: 743, // yav-CM
0x51200000: 744, // yi
0x51200001: 745, // yi-001
0x51800000: 746, // yo
0x5180003b: 747, // yo-BJ
0x518000d6: 748, // yo-NG
0x51f00000: 749, // yue
0x51f38000: 750, // yue-Hans
0x51f38053: 751, // yue-Hans-CN
0x51f39000: 752, // yue-Hant
0x51f3908d: 753, // yue-Hant-HK
0x52800000: 754, // zgh
0x528000ba: 755, // zgh-MA
0x52900000: 756, // zh
0x52938000: 757, // zh-Hans
0x52938053: 758, // zh-Hans-CN
0x5293808d: 759, // zh-Hans-HK
0x529380c6: 760, // zh-Hans-MO
0x5293810d: 761, // zh-Hans-SG
0x52939000: 762, // zh-Hant
0x5293908d: 763, // zh-Hant-HK
0x529390c6: 764, // zh-Hant-MO
0x5293912e: 765, // zh-Hant-TW
0x52f00000: 766, // zu
0x52f00161: 767, // zu-ZA
}
// Total table size 4676 bytes (4KiB); checksum: 17BE3673
+907
View File
@@ -0,0 +1,907 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
//go:generate go run gen.go gen_common.go -output tables.go
//go:generate go run gen_index.go
package language
// TODO: Remove above NOTE after:
// - verifying that tables are dropped correctly (most notably matcher tables).
import (
"errors"
"fmt"
"strings"
)
const (
// maxCoreSize is the maximum size of a BCP 47 tag without variants and
// extensions. Equals max lang (3) + script (4) + max reg (3) + 2 dashes.
maxCoreSize = 12
// max99thPercentileSize is a somewhat arbitrary buffer size that presumably
// is large enough to hold at least 99% of the BCP 47 tags.
max99thPercentileSize = 32
// maxSimpleUExtensionSize is the maximum size of a -u extension with one
// key-type pair. Equals len("-u-") + key (2) + dash + max value (8).
maxSimpleUExtensionSize = 14
)
// Tag represents a BCP 47 language tag. It is used to specify an instance of a
// specific language or locale. All language tag values are guaranteed to be
// well-formed.
type Tag struct {
lang langID
region regionID
// TODO: we will soon run out of positions for script. Idea: instead of
// storing lang, region, and script codes, store only the compact index and
// have a lookup table from this code to its expansion. This greatly speeds
// up table lookup, speed up common variant cases.
// This will also immediately free up 3 extra bytes. Also, the pVariant
// field can now be moved to the lookup table, as the compact index uniquely
// determines the offset of a possible variant.
script scriptID
pVariant byte // offset in str, includes preceding '-'
pExt uint16 // offset of first extension, includes preceding '-'
// str is the string representation of the Tag. It will only be used if the
// tag has variants or extensions.
str string
}
// Make is a convenience wrapper for Parse that omits the error.
// In case of an error, a sensible default is returned.
func Make(s string) Tag {
return Default.Make(s)
}
// Make is a convenience wrapper for c.Parse that omits the error.
// In case of an error, a sensible default is returned.
func (c CanonType) Make(s string) Tag {
t, _ := c.Parse(s)
return t
}
// Raw returns the raw base language, script and region, without making an
// attempt to infer their values.
func (t Tag) Raw() (b Base, s Script, r Region) {
return Base{t.lang}, Script{t.script}, Region{t.region}
}
// equalTags compares language, script and region subtags only.
func (t Tag) equalTags(a Tag) bool {
return t.lang == a.lang && t.script == a.script && t.region == a.region
}
// IsRoot returns true if t is equal to language "und".
func (t Tag) IsRoot() bool {
if int(t.pVariant) < len(t.str) {
return false
}
return t.equalTags(und)
}
// private reports whether the Tag consists solely of a private use tag.
func (t Tag) private() bool {
return t.str != "" && t.pVariant == 0
}
// CanonType can be used to enable or disable various types of canonicalization.
type CanonType int
const (
// Replace deprecated base languages with their preferred replacements.
DeprecatedBase CanonType = 1 << iota
// Replace deprecated scripts with their preferred replacements.
DeprecatedScript
// Replace deprecated regions with their preferred replacements.
DeprecatedRegion
// Remove redundant scripts.
SuppressScript
// Normalize legacy encodings. This includes legacy languages defined in
// CLDR as well as bibliographic codes defined in ISO-639.
Legacy
// Map the dominant language of a macro language group to the macro language
// subtag. For example cmn -> zh.
Macro
// The CLDR flag should be used if full compatibility with CLDR is required.
// There are a few cases where language.Tag may differ from CLDR. To follow all
// of CLDR's suggestions, use All|CLDR.
CLDR
// Raw can be used to Compose or Parse without Canonicalization.
Raw CanonType = 0
// Replace all deprecated tags with their preferred replacements.
Deprecated = DeprecatedBase | DeprecatedScript | DeprecatedRegion
// All canonicalizations recommended by BCP 47.
BCP47 = Deprecated | SuppressScript
// All canonicalizations.
All = BCP47 | Legacy | Macro
// Default is the canonicalization used by Parse, Make and Compose. To
// preserve as much information as possible, canonicalizations that remove
// potentially valuable information are not included. The Matcher is
// designed to recognize similar tags that would be the same if
// they were canonicalized using All.
Default = Deprecated | Legacy
canonLang = DeprecatedBase | Legacy | Macro
// TODO: LikelyScript, LikelyRegion: suppress similar to ICU.
)
// canonicalize returns the canonicalized equivalent of the tag and
// whether there was any change.
func (t Tag) canonicalize(c CanonType) (Tag, bool) {
if c == Raw {
return t, false
}
changed := false
if c&SuppressScript != 0 {
if t.lang < langNoIndexOffset && uint8(t.script) == suppressScript[t.lang] {
t.script = 0
changed = true
}
}
if c&canonLang != 0 {
for {
if l, aliasType := normLang(t.lang); l != t.lang {
switch aliasType {
case langLegacy:
if c&Legacy != 0 {
if t.lang == _sh && t.script == 0 {
t.script = _Latn
}
t.lang = l
changed = true
}
case langMacro:
if c&Macro != 0 {
// We deviate here from CLDR. The mapping "nb" -> "no"
// qualifies as a typical Macro language mapping. However,
// for legacy reasons, CLDR maps "no", the macro language
// code for Norwegian, to the dominant variant "nb". This
// change is currently under consideration for CLDR as well.
// See http://unicode.org/cldr/trac/ticket/2698 and also
// http://unicode.org/cldr/trac/ticket/1790 for some of the
// practical implications. TODO: this check could be removed
// if CLDR adopts this change.
if c&CLDR == 0 || t.lang != _nb {
changed = true
t.lang = l
}
}
case langDeprecated:
if c&DeprecatedBase != 0 {
if t.lang == _mo && t.region == 0 {
t.region = _MD
}
t.lang = l
changed = true
// Other canonicalization types may still apply.
continue
}
}
} else if c&Legacy != 0 && t.lang == _no && c&CLDR != 0 {
t.lang = _nb
changed = true
}
break
}
}
if c&DeprecatedScript != 0 {
if t.script == _Qaai {
changed = true
t.script = _Zinh
}
}
if c&DeprecatedRegion != 0 {
if r := normRegion(t.region); r != 0 {
changed = true
t.region = r
}
}
return t, changed
}
// Canonicalize returns the canonicalized equivalent of the tag.
func (c CanonType) Canonicalize(t Tag) (Tag, error) {
t, changed := t.canonicalize(c)
if changed {
t.remakeString()
}
return t, nil
}
// Confidence indicates the level of certainty for a given return value.
// For example, Serbian may be written in Cyrillic or Latin script.
// The confidence level indicates whether a value was explicitly specified,
// whether it is typically the only possible value, or whether there is
// an ambiguity.
type Confidence int
const (
No Confidence = iota // full confidence that there was no match
Low // most likely value picked out of a set of alternatives
High // value is generally assumed to be the correct match
Exact // exact match or explicitly specified value
)
var confName = []string{"No", "Low", "High", "Exact"}
func (c Confidence) String() string {
return confName[c]
}
// remakeString is used to update t.str in case lang, script or region changed.
// It is assumed that pExt and pVariant still point to the start of the
// respective parts.
func (t *Tag) remakeString() {
if t.str == "" {
return
}
extra := t.str[t.pVariant:]
if t.pVariant > 0 {
extra = extra[1:]
}
if t.equalTags(und) && strings.HasPrefix(extra, "x-") {
t.str = extra
t.pVariant = 0
t.pExt = 0
return
}
var buf [max99thPercentileSize]byte // avoid extra memory allocation in most cases.
b := buf[:t.genCoreBytes(buf[:])]
if extra != "" {
diff := len(b) - int(t.pVariant)
b = append(b, '-')
b = append(b, extra...)
t.pVariant = uint8(int(t.pVariant) + diff)
t.pExt = uint16(int(t.pExt) + diff)
} else {
t.pVariant = uint8(len(b))
t.pExt = uint16(len(b))
}
t.str = string(b)
}
// genCoreBytes writes a string for the base languages, script and region tags
// to the given buffer and returns the number of bytes written. It will never
// write more than maxCoreSize bytes.
func (t *Tag) genCoreBytes(buf []byte) int {
n := t.lang.stringToBuf(buf[:])
if t.script != 0 {
n += copy(buf[n:], "-")
n += copy(buf[n:], t.script.String())
}
if t.region != 0 {
n += copy(buf[n:], "-")
n += copy(buf[n:], t.region.String())
}
return n
}
// String returns the canonical string representation of the language tag.
func (t Tag) String() string {
if t.str != "" {
return t.str
}
if t.script == 0 && t.region == 0 {
return t.lang.String()
}
buf := [maxCoreSize]byte{}
return string(buf[:t.genCoreBytes(buf[:])])
}
// MarshalText implements encoding.TextMarshaler.
func (t Tag) MarshalText() (text []byte, err error) {
if t.str != "" {
text = append(text, t.str...)
} else if t.script == 0 && t.region == 0 {
text = append(text, t.lang.String()...)
} else {
buf := [maxCoreSize]byte{}
text = buf[:t.genCoreBytes(buf[:])]
}
return text, nil
}
// UnmarshalText implements encoding.TextUnmarshaler.
func (t *Tag) UnmarshalText(text []byte) error {
tag, err := Raw.Parse(string(text))
*t = tag
return err
}
// Base returns the base language of the language tag. If the base language is
// unspecified, an attempt will be made to infer it from the context.
// It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
func (t Tag) Base() (Base, Confidence) {
if t.lang != 0 {
return Base{t.lang}, Exact
}
c := High
if t.script == 0 && !(Region{t.region}).IsCountry() {
c = Low
}
if tag, err := addTags(t); err == nil && tag.lang != 0 {
return Base{tag.lang}, c
}
return Base{0}, No
}
// Script infers the script for the language tag. If it was not explicitly given, it will infer
// a most likely candidate.
// If more than one script is commonly used for a language, the most likely one
// is returned with a low confidence indication. For example, it returns (Cyrl, Low)
// for Serbian.
// If a script cannot be inferred (Zzzz, No) is returned. We do not use Zyyy (undetermined)
// as one would suspect from the IANA registry for BCP 47. In a Unicode context Zyyy marks
// common characters (like 1, 2, 3, '.', etc.) and is therefore more like multiple scripts.
// See http://www.unicode.org/reports/tr24/#Values for more details. Zzzz is also used for
// unknown value in CLDR. (Zzzz, Exact) is returned if Zzzz was explicitly specified.
// Note that an inferred script is never guaranteed to be the correct one. Latin is
// almost exclusively used for Afrikaans, but Arabic has been used for some texts
// in the past. Also, the script that is commonly used may change over time.
// It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
func (t Tag) Script() (Script, Confidence) {
if t.script != 0 {
return Script{t.script}, Exact
}
sc, c := scriptID(_Zzzz), No
if t.lang < langNoIndexOffset {
if scr := scriptID(suppressScript[t.lang]); scr != 0 {
// Note: it is not always the case that a language with a suppress
// script value is only written in one script (e.g. kk, ms, pa).
if t.region == 0 {
return Script{scriptID(scr)}, High
}
sc, c = scr, High
}
}
if tag, err := addTags(t); err == nil {
if tag.script != sc {
sc, c = tag.script, Low
}
} else {
t, _ = (Deprecated | Macro).Canonicalize(t)
if tag, err := addTags(t); err == nil && tag.script != sc {
sc, c = tag.script, Low
}
}
return Script{sc}, c
}
// Region returns the region for the language tag. If it was not explicitly given, it will
// infer a most likely candidate from the context.
// It uses a variant of CLDR's Add Likely Subtags algorithm. This is subject to change.
func (t Tag) Region() (Region, Confidence) {
if t.region != 0 {
return Region{t.region}, Exact
}
if t, err := addTags(t); err == nil {
return Region{t.region}, Low // TODO: differentiate between high and low.
}
t, _ = (Deprecated | Macro).Canonicalize(t)
if tag, err := addTags(t); err == nil {
return Region{tag.region}, Low
}
return Region{_ZZ}, No // TODO: return world instead of undetermined?
}
// Variant returns the variants specified explicitly for this language tag.
// or nil if no variant was specified.
func (t Tag) Variants() []Variant {
v := []Variant{}
if int(t.pVariant) < int(t.pExt) {
for x, str := "", t.str[t.pVariant:t.pExt]; str != ""; {
x, str = nextToken(str)
v = append(v, Variant{x})
}
}
return v
}
// Parent returns the CLDR parent of t. In CLDR, missing fields in data for a
// specific language are substituted with fields from the parent language.
// The parent for a language may change for newer versions of CLDR.
func (t Tag) Parent() Tag {
if t.str != "" {
// Strip the variants and extensions.
t, _ = Raw.Compose(t.Raw())
if t.region == 0 && t.script != 0 && t.lang != 0 {
base, _ := addTags(Tag{lang: t.lang})
if base.script == t.script {
return Tag{lang: t.lang}
}
}
return t
}
if t.lang != 0 {
if t.region != 0 {
maxScript := t.script
if maxScript == 0 {
max, _ := addTags(t)
maxScript = max.script
}
for i := range parents {
if langID(parents[i].lang) == t.lang && scriptID(parents[i].maxScript) == maxScript {
for _, r := range parents[i].fromRegion {
if regionID(r) == t.region {
return Tag{
lang: t.lang,
script: scriptID(parents[i].script),
region: regionID(parents[i].toRegion),
}
}
}
}
}
// Strip the script if it is the default one.
base, _ := addTags(Tag{lang: t.lang})
if base.script != maxScript {
return Tag{lang: t.lang, script: maxScript}
}
return Tag{lang: t.lang}
} else if t.script != 0 {
// The parent for an base-script pair with a non-default script is
// "und" instead of the base language.
base, _ := addTags(Tag{lang: t.lang})
if base.script != t.script {
return und
}
return Tag{lang: t.lang}
}
}
return und
}
// returns token t and the rest of the string.
func nextToken(s string) (t, tail string) {
p := strings.Index(s[1:], "-")
if p == -1 {
return s[1:], ""
}
p++
return s[1:p], s[p:]
}
// Extension is a single BCP 47 extension.
type Extension struct {
s string
}
// String returns the string representation of the extension, including the
// type tag.
func (e Extension) String() string {
return e.s
}
// ParseExtension parses s as an extension and returns it on success.
func ParseExtension(s string) (e Extension, err error) {
scan := makeScannerString(s)
var end int
if n := len(scan.token); n != 1 {
return Extension{}, errSyntax
}
scan.toLower(0, len(scan.b))
end = parseExtension(&scan)
if end != len(s) {
return Extension{}, errSyntax
}
return Extension{string(scan.b)}, nil
}
// Type returns the one-byte extension type of e. It returns 0 for the zero
// exception.
func (e Extension) Type() byte {
if e.s == "" {
return 0
}
return e.s[0]
}
// Tokens returns the list of tokens of e.
func (e Extension) Tokens() []string {
return strings.Split(e.s, "-")
}
// Extension returns the extension of type x for tag t. It will return
// false for ok if t does not have the requested extension. The returned
// extension will be invalid in this case.
func (t Tag) Extension(x byte) (ext Extension, ok bool) {
for i := int(t.pExt); i < len(t.str)-1; {
var ext string
i, ext = getExtension(t.str, i)
if ext[0] == x {
return Extension{ext}, true
}
}
return Extension{}, false
}
// Extensions returns all extensions of t.
func (t Tag) Extensions() []Extension {
e := []Extension{}
for i := int(t.pExt); i < len(t.str)-1; {
var ext string
i, ext = getExtension(t.str, i)
e = append(e, Extension{ext})
}
return e
}
// TypeForKey returns the type associated with the given key, where key and type
// are of the allowed values defined for the Unicode locale extension ('u') in
// http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// TypeForKey will traverse the inheritance chain to get the correct value.
func (t Tag) TypeForKey(key string) string {
if start, end, _ := t.findTypeForKey(key); end != start {
return t.str[start:end]
}
return ""
}
var (
errPrivateUse = errors.New("cannot set a key on a private use tag")
errInvalidArguments = errors.New("invalid key or type")
)
// SetTypeForKey returns a new Tag with the key set to type, where key and type
// are of the allowed values defined for the Unicode locale extension ('u') in
// http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// An empty value removes an existing pair with the same key.
func (t Tag) SetTypeForKey(key, value string) (Tag, error) {
if t.private() {
return t, errPrivateUse
}
if len(key) != 2 {
return t, errInvalidArguments
}
// Remove the setting if value is "".
if value == "" {
start, end, _ := t.findTypeForKey(key)
if start != end {
// Remove key tag and leading '-'.
start -= 4
// Remove a possible empty extension.
if (end == len(t.str) || t.str[end+2] == '-') && t.str[start-2] == '-' {
start -= 2
}
if start == int(t.pVariant) && end == len(t.str) {
t.str = ""
t.pVariant, t.pExt = 0, 0
} else {
t.str = fmt.Sprintf("%s%s", t.str[:start], t.str[end:])
}
}
return t, nil
}
if len(value) < 3 || len(value) > 8 {
return t, errInvalidArguments
}
var (
buf [maxCoreSize + maxSimpleUExtensionSize]byte
uStart int // start of the -u extension.
)
// Generate the tag string if needed.
if t.str == "" {
uStart = t.genCoreBytes(buf[:])
buf[uStart] = '-'
uStart++
}
// Create new key-type pair and parse it to verify.
b := buf[uStart:]
copy(b, "u-")
copy(b[2:], key)
b[4] = '-'
b = b[:5+copy(b[5:], value)]
scan := makeScanner(b)
if parseExtensions(&scan); scan.err != nil {
return t, scan.err
}
// Assemble the replacement string.
if t.str == "" {
t.pVariant, t.pExt = byte(uStart-1), uint16(uStart-1)
t.str = string(buf[:uStart+len(b)])
} else {
s := t.str
start, end, hasExt := t.findTypeForKey(key)
if start == end {
if hasExt {
b = b[2:]
}
t.str = fmt.Sprintf("%s-%s%s", s[:start], b, s[end:])
} else {
t.str = fmt.Sprintf("%s%s%s", s[:start], value, s[end:])
}
}
return t, nil
}
// findKeyAndType returns the start and end position for the type corresponding
// to key or the point at which to insert the key-value pair if the type
// wasn't found. The hasExt return value reports whether an -u extension was present.
// Note: the extensions are typically very small and are likely to contain
// only one key-type pair.
func (t Tag) findTypeForKey(key string) (start, end int, hasExt bool) {
p := int(t.pExt)
if len(key) != 2 || p == len(t.str) || p == 0 {
return p, p, false
}
s := t.str
// Find the correct extension.
for p++; s[p] != 'u'; p++ {
if s[p] > 'u' {
p--
return p, p, false
}
if p = nextExtension(s, p); p == len(s) {
return len(s), len(s), false
}
}
// Proceed to the hyphen following the extension name.
p++
// curKey is the key currently being processed.
curKey := ""
// Iterate over keys until we get the end of a section.
for {
// p points to the hyphen preceding the current token.
if p3 := p + 3; s[p3] == '-' {
// Found a key.
// Check whether we just processed the key that was requested.
if curKey == key {
return start, p, true
}
// Set to the next key and continue scanning type tokens.
curKey = s[p+1 : p3]
if curKey > key {
return p, p, true
}
// Start of the type token sequence.
start = p + 4
// A type is at least 3 characters long.
p += 7 // 4 + 3
} else {
// Attribute or type, which is at least 3 characters long.
p += 4
}
// p points past the third character of a type or attribute.
max := p + 5 // maximum length of token plus hyphen.
if len(s) < max {
max = len(s)
}
for ; p < max && s[p] != '-'; p++ {
}
// Bail if we have exhausted all tokens or if the next token starts
// a new extension.
if p == len(s) || s[p+2] == '-' {
if curKey == key {
return start, p, true
}
return p, p, true
}
}
}
// CompactIndex returns an index, where 0 <= index < NumCompactTags, for tags
// for which data exists in the text repository. The index will change over time
// and should not be stored in persistent storage. Extensions, except for the
// 'va' type of the 'u' extension, are ignored. It will return 0, false if no
// compact tag exists, where 0 is the index for the root language (Und).
func CompactIndex(t Tag) (index int, ok bool) {
// TODO: perhaps give more frequent tags a lower index.
// TODO: we could make the indexes stable. This will excluded some
// possibilities for optimization, so don't do this quite yet.
b, s, r := t.Raw()
if len(t.str) > 0 {
if strings.HasPrefix(t.str, "x-") {
// We have no entries for user-defined tags.
return 0, false
}
if uint16(t.pVariant) != t.pExt {
// There are no tags with variants and an u-va type.
if t.TypeForKey("va") != "" {
return 0, false
}
t, _ = Raw.Compose(b, s, r, t.Variants())
} else if _, ok := t.Extension('u'); ok {
// Strip all but the 'va' entry.
variant := t.TypeForKey("va")
t, _ = Raw.Compose(b, s, r)
t, _ = t.SetTypeForKey("va", variant)
}
if len(t.str) > 0 {
// We have some variants.
for i, s := range specialTags {
if s == t {
return i + 1, true
}
}
return 0, false
}
}
// No variants specified: just compare core components.
// The key has the form lllssrrr, where l, s, and r are nibbles for
// respectively the langID, scriptID, and regionID.
key := uint32(b.langID) << (8 + 12)
key |= uint32(s.scriptID) << 12
key |= uint32(r.regionID)
x, ok := coreTags[key]
return int(x), ok
}
// Base is an ISO 639 language code, used for encoding the base language
// of a language tag.
type Base struct {
langID
}
// ParseBase parses a 2- or 3-letter ISO 639 code.
// It returns a ValueError if s is a well-formed but unknown language identifier
// or another error if another error occurred.
func ParseBase(s string) (Base, error) {
if n := len(s); n < 2 || 3 < n {
return Base{}, errSyntax
}
var buf [3]byte
l, err := getLangID(buf[:copy(buf[:], s)])
return Base{l}, err
}
// Script is a 4-letter ISO 15924 code for representing scripts.
// It is idiomatically represented in title case.
type Script struct {
scriptID
}
// ParseScript parses a 4-letter ISO 15924 code.
// It returns a ValueError if s is a well-formed but unknown script identifier
// or another error if another error occurred.
func ParseScript(s string) (Script, error) {
if len(s) != 4 {
return Script{}, errSyntax
}
var buf [4]byte
sc, err := getScriptID(script, buf[:copy(buf[:], s)])
return Script{sc}, err
}
// Region is an ISO 3166-1 or UN M.49 code for representing countries and regions.
type Region struct {
regionID
}
// EncodeM49 returns the Region for the given UN M.49 code.
// It returns an error if r is not a valid code.
func EncodeM49(r int) (Region, error) {
rid, err := getRegionM49(r)
return Region{rid}, err
}
// ParseRegion parses a 2- or 3-letter ISO 3166-1 or a UN M.49 code.
// It returns a ValueError if s is a well-formed but unknown region identifier
// or another error if another error occurred.
func ParseRegion(s string) (Region, error) {
if n := len(s); n < 2 || 3 < n {
return Region{}, errSyntax
}
var buf [3]byte
r, err := getRegionID(buf[:copy(buf[:], s)])
return Region{r}, err
}
// IsCountry returns whether this region is a country or autonomous area. This
// includes non-standard definitions from CLDR.
func (r Region) IsCountry() bool {
if r.regionID == 0 || r.IsGroup() || r.IsPrivateUse() && r.regionID != _XK {
return false
}
return true
}
// IsGroup returns whether this region defines a collection of regions. This
// includes non-standard definitions from CLDR.
func (r Region) IsGroup() bool {
if r.regionID == 0 {
return false
}
return int(regionInclusion[r.regionID]) < len(regionContainment)
}
// Contains returns whether Region c is contained by Region r. It returns true
// if c == r.
func (r Region) Contains(c Region) bool {
return r.regionID.contains(c.regionID)
}
func (r regionID) contains(c regionID) bool {
if r == c {
return true
}
g := regionInclusion[r]
if g >= nRegionGroups {
return false
}
m := regionContainment[g]
d := regionInclusion[c]
b := regionInclusionBits[d]
// A contained country may belong to multiple disjoint groups. Matching any
// of these indicates containment. If the contained region is a group, it
// must strictly be a subset.
if d >= nRegionGroups {
return b&m != 0
}
return b&^m == 0
}
var errNoTLD = errors.New("language: region is not a valid ccTLD")
// TLD returns the country code top-level domain (ccTLD). UK is returned for GB.
// In all other cases it returns either the region itself or an error.
//
// This method may return an error for a region for which there exists a
// canonical form with a ccTLD. To get that ccTLD canonicalize r first. The
// region will already be canonicalized it was obtained from a Tag that was
// obtained using any of the default methods.
func (r Region) TLD() (Region, error) {
// See http://en.wikipedia.org/wiki/Country_code_top-level_domain for the
// difference between ISO 3166-1 and IANA ccTLD.
if r.regionID == _GB {
r = Region{_UK}
}
if (r.typ() & ccTLD) == 0 {
return Region{}, errNoTLD
}
return r, nil
}
// Canonicalize returns the region or a possible replacement if the region is
// deprecated. It will not return a replacement for deprecated regions that
// are split into multiple regions.
func (r Region) Canonicalize() Region {
if cr := normRegion(r.regionID); cr != 0 {
return Region{cr}
}
return r
}
// Variant represents a registered variant of a language as defined by BCP 47.
type Variant struct {
variant string
}
// ParseVariant parses and returns a Variant. An error is returned if s is not
// a valid variant.
func ParseVariant(s string) (Variant, error) {
s = strings.ToLower(s)
if _, ok := variantIndex[s]; ok {
return Variant{s}, nil
}
return Variant{}, mkErrInvalid([]byte(s))
}
// String returns the string representation of the variant.
func (v Variant) String() string {
return v.variant
}
+396
View File
@@ -0,0 +1,396 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package language
import (
"bytes"
"fmt"
"sort"
"strconv"
"golang.org/x/text/internal/tag"
)
// findIndex tries to find the given tag in idx and returns a standardized error
// if it could not be found.
func findIndex(idx tag.Index, key []byte, form string) (index int, err error) {
if !tag.FixCase(form, key) {
return 0, errSyntax
}
i := idx.Index(key)
if i == -1 {
return 0, mkErrInvalid(key)
}
return i, nil
}
func searchUint(imap []uint16, key uint16) int {
return sort.Search(len(imap), func(i int) bool {
return imap[i] >= key
})
}
type langID uint16
// getLangID returns the langID of s if s is a canonical subtag
// or langUnknown if s is not a canonical subtag.
func getLangID(s []byte) (langID, error) {
if len(s) == 2 {
return getLangISO2(s)
}
return getLangISO3(s)
}
// mapLang returns the mapped langID of id according to mapping m.
func normLang(id langID) (langID, langAliasType) {
k := sort.Search(len(langAliasMap), func(i int) bool {
return langAliasMap[i].from >= uint16(id)
})
if k < len(langAliasMap) && langAliasMap[k].from == uint16(id) {
return langID(langAliasMap[k].to), langAliasTypes[k]
}
return id, langAliasTypeUnknown
}
// getLangISO2 returns the langID for the given 2-letter ISO language code
// or unknownLang if this does not exist.
func getLangISO2(s []byte) (langID, error) {
if !tag.FixCase("zz", s) {
return 0, errSyntax
}
if i := lang.Index(s); i != -1 && lang.Elem(i)[3] != 0 {
return langID(i), nil
}
return 0, mkErrInvalid(s)
}
const base = 'z' - 'a' + 1
func strToInt(s []byte) uint {
v := uint(0)
for i := 0; i < len(s); i++ {
v *= base
v += uint(s[i] - 'a')
}
return v
}
// converts the given integer to the original ASCII string passed to strToInt.
// len(s) must match the number of characters obtained.
func intToStr(v uint, s []byte) {
for i := len(s) - 1; i >= 0; i-- {
s[i] = byte(v%base) + 'a'
v /= base
}
}
// getLangISO3 returns the langID for the given 3-letter ISO language code
// or unknownLang if this does not exist.
func getLangISO3(s []byte) (langID, error) {
if tag.FixCase("und", s) {
// first try to match canonical 3-letter entries
for i := lang.Index(s[:2]); i != -1; i = lang.Next(s[:2], i) {
if e := lang.Elem(i); e[3] == 0 && e[2] == s[2] {
// We treat "und" as special and always translate it to "unspecified".
// Note that ZZ and Zzzz are private use and are not treated as
// unspecified by default.
id := langID(i)
if id == nonCanonicalUnd {
return 0, nil
}
return id, nil
}
}
if i := altLangISO3.Index(s); i != -1 {
return langID(altLangIndex[altLangISO3.Elem(i)[3]]), nil
}
n := strToInt(s)
if langNoIndex[n/8]&(1<<(n%8)) != 0 {
return langID(n) + langNoIndexOffset, nil
}
// Check for non-canonical uses of ISO3.
for i := lang.Index(s[:1]); i != -1; i = lang.Next(s[:1], i) {
if e := lang.Elem(i); e[2] == s[1] && e[3] == s[2] {
return langID(i), nil
}
}
return 0, mkErrInvalid(s)
}
return 0, errSyntax
}
// stringToBuf writes the string to b and returns the number of bytes
// written. cap(b) must be >= 3.
func (id langID) stringToBuf(b []byte) int {
if id >= langNoIndexOffset {
intToStr(uint(id)-langNoIndexOffset, b[:3])
return 3
} else if id == 0 {
return copy(b, "und")
}
l := lang[id<<2:]
if l[3] == 0 {
return copy(b, l[:3])
}
return copy(b, l[:2])
}
// String returns the BCP 47 representation of the langID.
// Use b as variable name, instead of id, to ensure the variable
// used is consistent with that of Base in which this type is embedded.
func (b langID) String() string {
if b == 0 {
return "und"
} else if b >= langNoIndexOffset {
b -= langNoIndexOffset
buf := [3]byte{}
intToStr(uint(b), buf[:])
return string(buf[:])
}
l := lang.Elem(int(b))
if l[3] == 0 {
return l[:3]
}
return l[:2]
}
// ISO3 returns the ISO 639-3 language code.
func (b langID) ISO3() string {
if b == 0 || b >= langNoIndexOffset {
return b.String()
}
l := lang.Elem(int(b))
if l[3] == 0 {
return l[:3]
} else if l[2] == 0 {
return altLangISO3.Elem(int(l[3]))[:3]
}
// This allocation will only happen for 3-letter ISO codes
// that are non-canonical BCP 47 language identifiers.
return l[0:1] + l[2:4]
}
// IsPrivateUse reports whether this language code is reserved for private use.
func (b langID) IsPrivateUse() bool {
return langPrivateStart <= b && b <= langPrivateEnd
}
type regionID uint16
// getRegionID returns the region id for s if s is a valid 2-letter region code
// or unknownRegion.
func getRegionID(s []byte) (regionID, error) {
if len(s) == 3 {
if isAlpha(s[0]) {
return getRegionISO3(s)
}
if i, err := strconv.ParseUint(string(s), 10, 10); err == nil {
return getRegionM49(int(i))
}
}
return getRegionISO2(s)
}
// getRegionISO2 returns the regionID for the given 2-letter ISO country code
// or unknownRegion if this does not exist.
func getRegionISO2(s []byte) (regionID, error) {
i, err := findIndex(regionISO, s, "ZZ")
if err != nil {
return 0, err
}
return regionID(i) + isoRegionOffset, nil
}
// getRegionISO3 returns the regionID for the given 3-letter ISO country code
// or unknownRegion if this does not exist.
func getRegionISO3(s []byte) (regionID, error) {
if tag.FixCase("ZZZ", s) {
for i := regionISO.Index(s[:1]); i != -1; i = regionISO.Next(s[:1], i) {
if e := regionISO.Elem(i); e[2] == s[1] && e[3] == s[2] {
return regionID(i) + isoRegionOffset, nil
}
}
for i := 0; i < len(altRegionISO3); i += 3 {
if tag.Compare(altRegionISO3[i:i+3], s) == 0 {
return regionID(altRegionIDs[i/3]), nil
}
}
return 0, mkErrInvalid(s)
}
return 0, errSyntax
}
func getRegionM49(n int) (regionID, error) {
if 0 < n && n <= 999 {
const (
searchBits = 7
regionBits = 9
regionMask = 1<<regionBits - 1
)
idx := n >> searchBits
buf := fromM49[m49Index[idx]:m49Index[idx+1]]
val := uint16(n) << regionBits // we rely on bits shifting out
i := sort.Search(len(buf), func(i int) bool {
return buf[i] >= val
})
if r := fromM49[int(m49Index[idx])+i]; r&^regionMask == val {
return regionID(r & regionMask), nil
}
}
var e ValueError
fmt.Fprint(bytes.NewBuffer([]byte(e.v[:])), n)
return 0, e
}
// normRegion returns a region if r is deprecated or 0 otherwise.
// TODO: consider supporting BYS (-> BLR), CSK (-> 200 or CZ), PHI (-> PHL) and AFI (-> DJ).
// TODO: consider mapping split up regions to new most populous one (like CLDR).
func normRegion(r regionID) regionID {
m := regionOldMap
k := sort.Search(len(m), func(i int) bool {
return m[i].from >= uint16(r)
})
if k < len(m) && m[k].from == uint16(r) {
return regionID(m[k].to)
}
return 0
}
const (
iso3166UserAssigned = 1 << iota
ccTLD
bcp47Region
)
func (r regionID) typ() byte {
return regionTypes[r]
}
// String returns the BCP 47 representation for the region.
// It returns "ZZ" for an unspecified region.
func (r regionID) String() string {
if r < isoRegionOffset {
if r == 0 {
return "ZZ"
}
return fmt.Sprintf("%03d", r.M49())
}
r -= isoRegionOffset
return regionISO.Elem(int(r))[:2]
}
// ISO3 returns the 3-letter ISO code of r.
// Note that not all regions have a 3-letter ISO code.
// In such cases this method returns "ZZZ".
func (r regionID) ISO3() string {
if r < isoRegionOffset {
return "ZZZ"
}
r -= isoRegionOffset
reg := regionISO.Elem(int(r))
switch reg[2] {
case 0:
return altRegionISO3[reg[3]:][:3]
case ' ':
return "ZZZ"
}
return reg[0:1] + reg[2:4]
}
// M49 returns the UN M.49 encoding of r, or 0 if this encoding
// is not defined for r.
func (r regionID) M49() int {
return int(m49[r])
}
// IsPrivateUse reports whether r has the ISO 3166 User-assigned status. This
// may include private-use tags that are assigned by CLDR and used in this
// implementation. So IsPrivateUse and IsCountry can be simultaneously true.
func (r regionID) IsPrivateUse() bool {
return r.typ()&iso3166UserAssigned != 0
}
type scriptID uint8
// getScriptID returns the script id for string s. It assumes that s
// is of the format [A-Z][a-z]{3}.
func getScriptID(idx tag.Index, s []byte) (scriptID, error) {
i, err := findIndex(idx, s, "Zzzz")
return scriptID(i), err
}
// String returns the script code in title case.
// It returns "Zzzz" for an unspecified script.
func (s scriptID) String() string {
if s == 0 {
return "Zzzz"
}
return script.Elem(int(s))
}
// IsPrivateUse reports whether this script code is reserved for private use.
func (s scriptID) IsPrivateUse() bool {
return _Qaaa <= s && s <= _Qabx
}
const (
maxAltTaglen = len("en-US-POSIX")
maxLen = maxAltTaglen
)
var (
// grandfatheredMap holds a mapping from legacy and grandfathered tags to
// their base language or index to more elaborate tag.
grandfatheredMap = map[[maxLen]byte]int16{
[maxLen]byte{'a', 'r', 't', '-', 'l', 'o', 'j', 'b', 'a', 'n'}: _jbo, // art-lojban
[maxLen]byte{'i', '-', 'a', 'm', 'i'}: _ami, // i-ami
[maxLen]byte{'i', '-', 'b', 'n', 'n'}: _bnn, // i-bnn
[maxLen]byte{'i', '-', 'h', 'a', 'k'}: _hak, // i-hak
[maxLen]byte{'i', '-', 'k', 'l', 'i', 'n', 'g', 'o', 'n'}: _tlh, // i-klingon
[maxLen]byte{'i', '-', 'l', 'u', 'x'}: _lb, // i-lux
[maxLen]byte{'i', '-', 'n', 'a', 'v', 'a', 'j', 'o'}: _nv, // i-navajo
[maxLen]byte{'i', '-', 'p', 'w', 'n'}: _pwn, // i-pwn
[maxLen]byte{'i', '-', 't', 'a', 'o'}: _tao, // i-tao
[maxLen]byte{'i', '-', 't', 'a', 'y'}: _tay, // i-tay
[maxLen]byte{'i', '-', 't', 's', 'u'}: _tsu, // i-tsu
[maxLen]byte{'n', 'o', '-', 'b', 'o', 'k'}: _nb, // no-bok
[maxLen]byte{'n', 'o', '-', 'n', 'y', 'n'}: _nn, // no-nyn
[maxLen]byte{'s', 'g', 'n', '-', 'b', 'e', '-', 'f', 'r'}: _sfb, // sgn-BE-FR
[maxLen]byte{'s', 'g', 'n', '-', 'b', 'e', '-', 'n', 'l'}: _vgt, // sgn-BE-NL
[maxLen]byte{'s', 'g', 'n', '-', 'c', 'h', '-', 'd', 'e'}: _sgg, // sgn-CH-DE
[maxLen]byte{'z', 'h', '-', 'g', 'u', 'o', 'y', 'u'}: _cmn, // zh-guoyu
[maxLen]byte{'z', 'h', '-', 'h', 'a', 'k', 'k', 'a'}: _hak, // zh-hakka
[maxLen]byte{'z', 'h', '-', 'm', 'i', 'n', '-', 'n', 'a', 'n'}: _nan, // zh-min-nan
[maxLen]byte{'z', 'h', '-', 'x', 'i', 'a', 'n', 'g'}: _hsn, // zh-xiang
// Grandfathered tags with no modern replacement will be converted as
// follows:
[maxLen]byte{'c', 'e', 'l', '-', 'g', 'a', 'u', 'l', 'i', 's', 'h'}: -1, // cel-gaulish
[maxLen]byte{'e', 'n', '-', 'g', 'b', '-', 'o', 'e', 'd'}: -2, // en-GB-oed
[maxLen]byte{'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'}: -3, // i-default
[maxLen]byte{'i', '-', 'e', 'n', 'o', 'c', 'h', 'i', 'a', 'n'}: -4, // i-enochian
[maxLen]byte{'i', '-', 'm', 'i', 'n', 'g', 'o'}: -5, // i-mingo
[maxLen]byte{'z', 'h', '-', 'm', 'i', 'n'}: -6, // zh-min
// CLDR-specific tag.
[maxLen]byte{'r', 'o', 'o', 't'}: 0, // root
[maxLen]byte{'e', 'n', '-', 'u', 's', '-', 'p', 'o', 's', 'i', 'x'}: -7, // en_US_POSIX"
}
altTagIndex = [...]uint8{0, 17, 31, 45, 61, 74, 86, 102}
altTags = "xtg-x-cel-gaulishen-GB-oxendicten-x-i-defaultund-x-i-enochiansee-x-i-mingonan-x-zh-minen-US-u-va-posix"
)
func grandfathered(s [maxAltTaglen]byte) (t Tag, ok bool) {
if v, ok := grandfatheredMap[s]; ok {
if v < 0 {
return Make(altTags[altTagIndex[-v-1]:altTagIndex[-v]]), true
}
t.lang = langID(v)
return t, true
}
return t, false
}
+933
View File
@@ -0,0 +1,933 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package language
import "errors"
// A MatchOption configures a Matcher.
type MatchOption func(*matcher)
// PreferSameScript will, in the absence of a match, result in the first
// preferred tag with the same script as a supported tag to match this supported
// tag. The default is currently true, but this may change in the future.
func PreferSameScript(preferSame bool) MatchOption {
return func(m *matcher) { m.preferSameScript = preferSame }
}
// TODO(v1.0.0): consider making Matcher a concrete type, instead of interface.
// There doesn't seem to be too much need for multiple types.
// Making it a concrete type allows MatchStrings to be a method, which will
// improve its discoverability.
// MatchStrings parses and matches the given strings until one of them matches
// the language in the Matcher. A string may be an Accept-Language header as
// handled by ParseAcceptLanguage. The default language is returned if no
// other language matched.
func MatchStrings(m Matcher, lang ...string) (tag Tag, index int) {
for _, accept := range lang {
desired, _, err := ParseAcceptLanguage(accept)
if err != nil {
continue
}
if tag, index, conf := m.Match(desired...); conf != No {
return tag, index
}
}
tag, index, _ = m.Match()
return
}
// Matcher is the interface that wraps the Match method.
//
// Match returns the best match for any of the given tags, along with
// a unique index associated with the returned tag and a confidence
// score.
type Matcher interface {
Match(t ...Tag) (tag Tag, index int, c Confidence)
}
// Comprehends reports the confidence score for a speaker of a given language
// to being able to comprehend the written form of an alternative language.
func Comprehends(speaker, alternative Tag) Confidence {
_, _, c := NewMatcher([]Tag{alternative}).Match(speaker)
return c
}
// NewMatcher returns a Matcher that matches an ordered list of preferred tags
// against a list of supported tags based on written intelligibility, closeness
// of dialect, equivalence of subtags and various other rules. It is initialized
// with the list of supported tags. The first element is used as the default
// value in case no match is found.
//
// Its Match method matches the first of the given Tags to reach a certain
// confidence threshold. The tags passed to Match should therefore be specified
// in order of preference. Extensions are ignored for matching.
//
// The index returned by the Match method corresponds to the index of the
// matched tag in t, but is augmented with the Unicode extension ('u')of the
// corresponding preferred tag. This allows user locale options to be passed
// transparently.
func NewMatcher(t []Tag, options ...MatchOption) Matcher {
return newMatcher(t, options)
}
func (m *matcher) Match(want ...Tag) (t Tag, index int, c Confidence) {
match, w, c := m.getBest(want...)
if match != nil {
t, index = match.tag, match.index
} else {
// TODO: this should be an option
t = m.default_.tag
if m.preferSameScript {
outer:
for _, w := range want {
script, _ := w.Script()
if script.scriptID == 0 {
// Don't do anything if there is no script, such as with
// private subtags.
continue
}
for i, h := range m.supported {
if script.scriptID == h.maxScript {
t, index = h.tag, i
break outer
}
}
}
}
// TODO: select first language tag based on script.
}
if w.region != 0 && t.region != 0 && t.region.contains(w.region) {
t, _ = Raw.Compose(t, Region{w.region})
}
// Copy options from the user-provided tag into the result tag. This is hard
// to do after the fact, so we do it here.
// TODO: add in alternative variants to -u-va-.
// TODO: add preferred region to -u-rg-.
if e := w.Extensions(); len(e) > 0 {
t, _ = Raw.Compose(t, e)
}
return t, index, c
}
type scriptRegionFlags uint8
const (
isList = 1 << iota
scriptInFrom
regionInFrom
)
func (t *Tag) setUndefinedLang(id langID) {
if t.lang == 0 {
t.lang = id
}
}
func (t *Tag) setUndefinedScript(id scriptID) {
if t.script == 0 {
t.script = id
}
}
func (t *Tag) setUndefinedRegion(id regionID) {
if t.region == 0 || t.region.contains(id) {
t.region = id
}
}
// ErrMissingLikelyTagsData indicates no information was available
// to compute likely values of missing tags.
var ErrMissingLikelyTagsData = errors.New("missing likely tags data")
// addLikelySubtags sets subtags to their most likely value, given the locale.
// In most cases this means setting fields for unknown values, but in some
// cases it may alter a value. It returns an ErrMissingLikelyTagsData error
// if the given locale cannot be expanded.
func (t Tag) addLikelySubtags() (Tag, error) {
id, err := addTags(t)
if err != nil {
return t, err
} else if id.equalTags(t) {
return t, nil
}
id.remakeString()
return id, nil
}
// specializeRegion attempts to specialize a group region.
func specializeRegion(t *Tag) bool {
if i := regionInclusion[t.region]; i < nRegionGroups {
x := likelyRegionGroup[i]
if langID(x.lang) == t.lang && scriptID(x.script) == t.script {
t.region = regionID(x.region)
}
return true
}
return false
}
func addTags(t Tag) (Tag, error) {
// We leave private use identifiers alone.
if t.private() {
return t, nil
}
if t.script != 0 && t.region != 0 {
if t.lang != 0 {
// already fully specified
specializeRegion(&t)
return t, nil
}
// Search matches for und-script-region. Note that for these cases
// region will never be a group so there is no need to check for this.
list := likelyRegion[t.region : t.region+1]
if x := list[0]; x.flags&isList != 0 {
list = likelyRegionList[x.lang : x.lang+uint16(x.script)]
}
for _, x := range list {
// Deviating from the spec. See match_test.go for details.
if scriptID(x.script) == t.script {
t.setUndefinedLang(langID(x.lang))
return t, nil
}
}
}
if t.lang != 0 {
// Search matches for lang-script and lang-region, where lang != und.
if t.lang < langNoIndexOffset {
x := likelyLang[t.lang]
if x.flags&isList != 0 {
list := likelyLangList[x.region : x.region+uint16(x.script)]
if t.script != 0 {
for _, x := range list {
if scriptID(x.script) == t.script && x.flags&scriptInFrom != 0 {
t.setUndefinedRegion(regionID(x.region))
return t, nil
}
}
} else if t.region != 0 {
count := 0
goodScript := true
tt := t
for _, x := range list {
// We visit all entries for which the script was not
// defined, including the ones where the region was not
// defined. This allows for proper disambiguation within
// regions.
if x.flags&scriptInFrom == 0 && t.region.contains(regionID(x.region)) {
tt.region = regionID(x.region)
tt.setUndefinedScript(scriptID(x.script))
goodScript = goodScript && tt.script == scriptID(x.script)
count++
}
}
if count == 1 {
return tt, nil
}
// Even if we fail to find a unique Region, we might have
// an unambiguous script.
if goodScript {
t.script = tt.script
}
}
}
}
} else {
// Search matches for und-script.
if t.script != 0 {
x := likelyScript[t.script]
if x.region != 0 {
t.setUndefinedRegion(regionID(x.region))
t.setUndefinedLang(langID(x.lang))
return t, nil
}
}
// Search matches for und-region. If und-script-region exists, it would
// have been found earlier.
if t.region != 0 {
if i := regionInclusion[t.region]; i < nRegionGroups {
x := likelyRegionGroup[i]
if x.region != 0 {
t.setUndefinedLang(langID(x.lang))
t.setUndefinedScript(scriptID(x.script))
t.region = regionID(x.region)
}
} else {
x := likelyRegion[t.region]
if x.flags&isList != 0 {
x = likelyRegionList[x.lang]
}
if x.script != 0 && x.flags != scriptInFrom {
t.setUndefinedLang(langID(x.lang))
t.setUndefinedScript(scriptID(x.script))
return t, nil
}
}
}
}
// Search matches for lang.
if t.lang < langNoIndexOffset {
x := likelyLang[t.lang]
if x.flags&isList != 0 {
x = likelyLangList[x.region]
}
if x.region != 0 {
t.setUndefinedScript(scriptID(x.script))
t.setUndefinedRegion(regionID(x.region))
}
specializeRegion(&t)
if t.lang == 0 {
t.lang = _en // default language
}
return t, nil
}
return t, ErrMissingLikelyTagsData
}
func (t *Tag) setTagsFrom(id Tag) {
t.lang = id.lang
t.script = id.script
t.region = id.region
}
// minimize removes the region or script subtags from t such that
// t.addLikelySubtags() == t.minimize().addLikelySubtags().
func (t Tag) minimize() (Tag, error) {
t, err := minimizeTags(t)
if err != nil {
return t, err
}
t.remakeString()
return t, nil
}
// minimizeTags mimics the behavior of the ICU 51 C implementation.
func minimizeTags(t Tag) (Tag, error) {
if t.equalTags(und) {
return t, nil
}
max, err := addTags(t)
if err != nil {
return t, err
}
for _, id := range [...]Tag{
{lang: t.lang},
{lang: t.lang, region: t.region},
{lang: t.lang, script: t.script},
} {
if x, err := addTags(id); err == nil && max.equalTags(x) {
t.setTagsFrom(id)
break
}
}
return t, nil
}
// Tag Matching
// CLDR defines an algorithm for finding the best match between two sets of language
// tags. The basic algorithm defines how to score a possible match and then find
// the match with the best score
// (see http://www.unicode.org/reports/tr35/#LanguageMatching).
// Using scoring has several disadvantages. The scoring obfuscates the importance of
// the various factors considered, making the algorithm harder to understand. Using
// scoring also requires the full score to be computed for each pair of tags.
//
// We will use a different algorithm which aims to have the following properties:
// - clarity on the precedence of the various selection factors, and
// - improved performance by allowing early termination of a comparison.
//
// Matching algorithm (overview)
// Input:
// - supported: a set of supported tags
// - default: the default tag to return in case there is no match
// - desired: list of desired tags, ordered by preference, starting with
// the most-preferred.
//
// Algorithm:
// 1) Set the best match to the lowest confidence level
// 2) For each tag in "desired":
// a) For each tag in "supported":
// 1) compute the match between the two tags.
// 2) if the match is better than the previous best match, replace it
// with the new match. (see next section)
// b) if the current best match is Exact and pin is true the result will be
// frozen to the language found thusfar, although better matches may
// still be found for the same language.
// 3) If the best match so far is below a certain threshold, return "default".
//
// Ranking:
// We use two phases to determine whether one pair of tags are a better match
// than another pair of tags. First, we determine a rough confidence level. If the
// levels are different, the one with the highest confidence wins.
// Second, if the rough confidence levels are identical, we use a set of tie-breaker
// rules.
//
// The confidence level of matching a pair of tags is determined by finding the
// lowest confidence level of any matches of the corresponding subtags (the
// result is deemed as good as its weakest link).
// We define the following levels:
// Exact - An exact match of a subtag, before adding likely subtags.
// MaxExact - An exact match of a subtag, after adding likely subtags.
// [See Note 2].
// High - High level of mutual intelligibility between different subtag
// variants.
// Low - Low level of mutual intelligibility between different subtag
// variants.
// No - No mutual intelligibility.
//
// The following levels can occur for each type of subtag:
// Base: Exact, MaxExact, High, Low, No
// Script: Exact, MaxExact [see Note 3], Low, No
// Region: Exact, MaxExact, High
// Variant: Exact, High
// Private: Exact, No
//
// Any result with a confidence level of Low or higher is deemed a possible match.
// Once a desired tag matches any of the supported tags with a level of MaxExact
// or higher, the next desired tag is not considered (see Step 2.b).
// Note that CLDR provides languageMatching data that defines close equivalence
// classes for base languages, scripts and regions.
//
// Tie-breaking
// If we get the same confidence level for two matches, we apply a sequence of
// tie-breaking rules. The first that succeeds defines the result. The rules are
// applied in the following order.
// 1) Original language was defined and was identical.
// 2) Original region was defined and was identical.
// 3) Distance between two maximized regions was the smallest.
// 4) Original script was defined and was identical.
// 5) Distance from want tag to have tag using the parent relation [see Note 5.]
// If there is still no winner after these rules are applied, the first match
// found wins.
//
// Notes:
// [2] In practice, as matching of Exact is done in a separate phase from
// matching the other levels, we reuse the Exact level to mean MaxExact in
// the second phase. As a consequence, we only need the levels defined by
// the Confidence type. The MaxExact confidence level is mapped to High in
// the public API.
// [3] We do not differentiate between maximized script values that were derived
// from suppressScript versus most likely tag data. We determined that in
// ranking the two, one ranks just after the other. Moreover, the two cannot
// occur concurrently. As a consequence, they are identical for practical
// purposes.
// [4] In case of deprecated, macro-equivalents and legacy mappings, we assign
// the MaxExact level to allow iw vs he to still be a closer match than
// en-AU vs en-US, for example.
// [5] In CLDR a locale inherits fields that are unspecified for this locale
// from its parent. Therefore, if a locale is a parent of another locale,
// it is a strong measure for closeness, especially when no other tie
// breaker rule applies. One could also argue it is inconsistent, for
// example, when pt-AO matches pt (which CLDR equates with pt-BR), even
// though its parent is pt-PT according to the inheritance rules.
//
// Implementation Details:
// There are several performance considerations worth pointing out. Most notably,
// we preprocess as much as possible (within reason) at the time of creation of a
// matcher. This includes:
// - creating a per-language map, which includes data for the raw base language
// and its canonicalized variant (if applicable),
// - expanding entries for the equivalence classes defined in CLDR's
// languageMatch data.
// The per-language map ensures that typically only a very small number of tags
// need to be considered. The pre-expansion of canonicalized subtags and
// equivalence classes reduces the amount of map lookups that need to be done at
// runtime.
// matcher keeps a set of supported language tags, indexed by language.
type matcher struct {
default_ *haveTag
supported []*haveTag
index map[langID]*matchHeader
passSettings bool
preferSameScript bool
}
// matchHeader has the lists of tags for exact matches and matches based on
// maximized and canonicalized tags for a given language.
type matchHeader struct {
haveTags []*haveTag
original bool
}
// haveTag holds a supported Tag and its maximized script and region. The maximized
// or canonicalized language is not stored as it is not needed during matching.
type haveTag struct {
tag Tag
// index of this tag in the original list of supported tags.
index int
// conf is the maximum confidence that can result from matching this haveTag.
// When conf < Exact this means it was inserted after applying a CLDR equivalence rule.
conf Confidence
// Maximized region and script.
maxRegion regionID
maxScript scriptID
// altScript may be checked as an alternative match to maxScript. If altScript
// matches, the confidence level for this match is Low. Theoretically there
// could be multiple alternative scripts. This does not occur in practice.
altScript scriptID
// nextMax is the index of the next haveTag with the same maximized tags.
nextMax uint16
}
func makeHaveTag(tag Tag, index int) (haveTag, langID) {
max := tag
if tag.lang != 0 || tag.region != 0 || tag.script != 0 {
max, _ = max.canonicalize(All)
max, _ = addTags(max)
max.remakeString()
}
return haveTag{tag, index, Exact, max.region, max.script, altScript(max.lang, max.script), 0}, max.lang
}
// altScript returns an alternative script that may match the given script with
// a low confidence. At the moment, the langMatch data allows for at most one
// script to map to another and we rely on this to keep the code simple.
func altScript(l langID, s scriptID) scriptID {
for _, alt := range matchScript {
// TODO: also match cases where language is not the same.
if (langID(alt.wantLang) == l || langID(alt.haveLang) == l) &&
scriptID(alt.haveScript) == s {
return scriptID(alt.wantScript)
}
}
return 0
}
// addIfNew adds a haveTag to the list of tags only if it is a unique tag.
// Tags that have the same maximized values are linked by index.
func (h *matchHeader) addIfNew(n haveTag, exact bool) {
h.original = h.original || exact
// Don't add new exact matches.
for _, v := range h.haveTags {
if v.tag.equalsRest(n.tag) {
return
}
}
// Allow duplicate maximized tags, but create a linked list to allow quickly
// comparing the equivalents and bail out.
for i, v := range h.haveTags {
if v.maxScript == n.maxScript &&
v.maxRegion == n.maxRegion &&
v.tag.variantOrPrivateTagStr() == n.tag.variantOrPrivateTagStr() {
for h.haveTags[i].nextMax != 0 {
i = int(h.haveTags[i].nextMax)
}
h.haveTags[i].nextMax = uint16(len(h.haveTags))
break
}
}
h.haveTags = append(h.haveTags, &n)
}
// header returns the matchHeader for the given language. It creates one if
// it doesn't already exist.
func (m *matcher) header(l langID) *matchHeader {
if h := m.index[l]; h != nil {
return h
}
h := &matchHeader{}
m.index[l] = h
return h
}
func toConf(d uint8) Confidence {
if d <= 10 {
return High
}
if d < 30 {
return Low
}
return No
}
// newMatcher builds an index for the given supported tags and returns it as
// a matcher. It also expands the index by considering various equivalence classes
// for a given tag.
func newMatcher(supported []Tag, options []MatchOption) *matcher {
m := &matcher{
index: make(map[langID]*matchHeader),
preferSameScript: true,
}
for _, o := range options {
o(m)
}
if len(supported) == 0 {
m.default_ = &haveTag{}
return m
}
// Add supported languages to the index. Add exact matches first to give
// them precedence.
for i, tag := range supported {
pair, _ := makeHaveTag(tag, i)
m.header(tag.lang).addIfNew(pair, true)
m.supported = append(m.supported, &pair)
}
m.default_ = m.header(supported[0].lang).haveTags[0]
// Keep these in two different loops to support the case that two equivalent
// languages are distinguished, such as iw and he.
for i, tag := range supported {
pair, max := makeHaveTag(tag, i)
if max != tag.lang {
m.header(max).addIfNew(pair, true)
}
}
// update is used to add indexes in the map for equivalent languages.
// update will only add entries to original indexes, thus not computing any
// transitive relations.
update := func(want, have uint16, conf Confidence) {
if hh := m.index[langID(have)]; hh != nil {
if !hh.original {
return
}
hw := m.header(langID(want))
for _, ht := range hh.haveTags {
v := *ht
if conf < v.conf {
v.conf = conf
}
v.nextMax = 0 // this value needs to be recomputed
if v.altScript != 0 {
v.altScript = altScript(langID(want), v.maxScript)
}
hw.addIfNew(v, conf == Exact && hh.original)
}
}
}
// Add entries for languages with mutual intelligibility as defined by CLDR's
// languageMatch data.
for _, ml := range matchLang {
update(ml.want, ml.have, toConf(ml.distance))
if !ml.oneway {
update(ml.have, ml.want, toConf(ml.distance))
}
}
// Add entries for possible canonicalizations. This is an optimization to
// ensure that only one map lookup needs to be done at runtime per desired tag.
// First we match deprecated equivalents. If they are perfect equivalents
// (their canonicalization simply substitutes a different language code, but
// nothing else), the match confidence is Exact, otherwise it is High.
for i, lm := range langAliasMap {
// If deprecated codes match and there is no fiddling with the script or
// or region, we consider it an exact match.
conf := Exact
if langAliasTypes[i] != langMacro {
if !isExactEquivalent(langID(lm.from)) {
conf = High
}
update(lm.to, lm.from, conf)
}
update(lm.from, lm.to, conf)
}
return m
}
// getBest gets the best matching tag in m for any of the given tags, taking into
// account the order of preference of the given tags.
func (m *matcher) getBest(want ...Tag) (got *haveTag, orig Tag, c Confidence) {
best := bestMatch{}
for i, w := range want {
var max Tag
// Check for exact match first.
h := m.index[w.lang]
if w.lang != 0 {
if h == nil {
continue
}
// Base language is defined.
max, _ = w.canonicalize(Legacy | Deprecated | Macro)
// A region that is added through canonicalization is stronger than
// a maximized region: set it in the original (e.g. mo -> ro-MD).
if w.region != max.region {
w.region = max.region
}
// TODO: should we do the same for scripts?
// See test case: en, sr, nl ; sh ; sr
max, _ = addTags(max)
} else {
// Base language is not defined.
if h != nil {
for i := range h.haveTags {
have := h.haveTags[i]
if have.tag.equalsRest(w) {
return have, w, Exact
}
}
}
if w.script == 0 && w.region == 0 {
// We skip all tags matching und for approximate matching, including
// private tags.
continue
}
max, _ = addTags(w)
if h = m.index[max.lang]; h == nil {
continue
}
}
pin := true
for _, t := range want[i+1:] {
if w.lang == t.lang {
pin = false
break
}
}
// Check for match based on maximized tag.
for i := range h.haveTags {
have := h.haveTags[i]
best.update(have, w, max.script, max.region, pin)
if best.conf == Exact {
for have.nextMax != 0 {
have = h.haveTags[have.nextMax]
best.update(have, w, max.script, max.region, pin)
}
return best.have, best.want, best.conf
}
}
}
if best.conf <= No {
if len(want) != 0 {
return nil, want[0], No
}
return nil, Tag{}, No
}
return best.have, best.want, best.conf
}
// bestMatch accumulates the best match so far.
type bestMatch struct {
have *haveTag
want Tag
conf Confidence
pinnedRegion regionID
pinLanguage bool
sameRegionGroup bool
// Cached results from applying tie-breaking rules.
origLang bool
origReg bool
paradigmReg bool
regGroupDist uint8
origScript bool
}
// update updates the existing best match if the new pair is considered to be a
// better match. To determine if the given pair is a better match, it first
// computes the rough confidence level. If this surpasses the current match, it
// will replace it and update the tie-breaker rule cache. If there is a tie, it
// proceeds with applying a series of tie-breaker rules. If there is no
// conclusive winner after applying the tie-breaker rules, it leaves the current
// match as the preferred match.
//
// If pin is true and have and tag are a strong match, it will henceforth only
// consider matches for this language. This corresponds to the nothing that most
// users have a strong preference for the first defined language. A user can
// still prefer a second language over a dialect of the preferred language by
// explicitly specifying dialects, e.g. "en, nl, en-GB". In this case pin should
// be false.
func (m *bestMatch) update(have *haveTag, tag Tag, maxScript scriptID, maxRegion regionID, pin bool) {
// Bail if the maximum attainable confidence is below that of the current best match.
c := have.conf
if c < m.conf {
return
}
// Don't change the language once we already have found an exact match.
if m.pinLanguage && tag.lang != m.want.lang {
return
}
// Pin the region group if we are comparing tags for the same language.
if tag.lang == m.want.lang && m.sameRegionGroup {
_, sameGroup := regionGroupDist(m.pinnedRegion, have.maxRegion, have.maxScript, m.want.lang)
if !sameGroup {
return
}
}
if c == Exact && have.maxScript == maxScript {
// If there is another language and then another entry of this language,
// don't pin anything, otherwise pin the language.
m.pinLanguage = pin
}
if have.tag.equalsRest(tag) {
} else if have.maxScript != maxScript {
// There is usually very little comprehension between different scripts.
// In a few cases there may still be Low comprehension. This possibility
// is pre-computed and stored in have.altScript.
if Low < m.conf || have.altScript != maxScript {
return
}
c = Low
} else if have.maxRegion != maxRegion {
if High < c {
// There is usually a small difference between languages across regions.
c = High
}
}
// We store the results of the computations of the tie-breaker rules along
// with the best match. There is no need to do the checks once we determine
// we have a winner, but we do still need to do the tie-breaker computations.
// We use "beaten" to keep track if we still need to do the checks.
beaten := false // true if the new pair defeats the current one.
if c != m.conf {
if c < m.conf {
return
}
beaten = true
}
// Tie-breaker rules:
// We prefer if the pre-maximized language was specified and identical.
origLang := have.tag.lang == tag.lang && tag.lang != 0
if !beaten && m.origLang != origLang {
if m.origLang {
return
}
beaten = true
}
// We prefer if the pre-maximized region was specified and identical.
origReg := have.tag.region == tag.region && tag.region != 0
if !beaten && m.origReg != origReg {
if m.origReg {
return
}
beaten = true
}
regGroupDist, sameGroup := regionGroupDist(have.maxRegion, maxRegion, maxScript, tag.lang)
if !beaten && m.regGroupDist != regGroupDist {
if regGroupDist > m.regGroupDist {
return
}
beaten = true
}
paradigmReg := isParadigmLocale(tag.lang, have.maxRegion)
if !beaten && m.paradigmReg != paradigmReg {
if !paradigmReg {
return
}
beaten = true
}
// Next we prefer if the pre-maximized script was specified and identical.
origScript := have.tag.script == tag.script && tag.script != 0
if !beaten && m.origScript != origScript {
if m.origScript {
return
}
beaten = true
}
// Update m to the newly found best match.
if beaten {
m.have = have
m.want = tag
m.conf = c
m.pinnedRegion = maxRegion
m.sameRegionGroup = sameGroup
m.origLang = origLang
m.origReg = origReg
m.paradigmReg = paradigmReg
m.origScript = origScript
m.regGroupDist = regGroupDist
}
}
func isParadigmLocale(lang langID, r regionID) bool {
for _, e := range paradigmLocales {
if langID(e[0]) == lang && (r == regionID(e[1]) || r == regionID(e[2])) {
return true
}
}
return false
}
// regionGroupDist computes the distance between two regions based on their
// CLDR grouping.
func regionGroupDist(a, b regionID, script scriptID, lang langID) (dist uint8, same bool) {
const defaultDistance = 4
aGroup := uint(regionToGroups[a]) << 1
bGroup := uint(regionToGroups[b]) << 1
for _, ri := range matchRegion {
if langID(ri.lang) == lang && (ri.script == 0 || scriptID(ri.script) == script) {
group := uint(1 << (ri.group &^ 0x80))
if 0x80&ri.group == 0 {
if aGroup&bGroup&group != 0 { // Both regions are in the group.
return ri.distance, ri.distance == defaultDistance
}
} else {
if (aGroup|bGroup)&group == 0 { // Both regions are not in the group.
return ri.distance, ri.distance == defaultDistance
}
}
}
}
return defaultDistance, true
}
func (t Tag) variants() string {
if t.pVariant == 0 {
return ""
}
return t.str[t.pVariant:t.pExt]
}
// variantOrPrivateTagStr returns variants or private use tags.
func (t Tag) variantOrPrivateTagStr() string {
if t.pExt > 0 {
return t.str[t.pVariant:t.pExt]
}
return t.str[t.pVariant:]
}
// equalsRest compares everything except the language.
func (a Tag) equalsRest(b Tag) bool {
// TODO: don't include extensions in this comparison. To do this efficiently,
// though, we should handle private tags separately.
return a.script == b.script && a.region == b.region && a.variantOrPrivateTagStr() == b.variantOrPrivateTagStr()
}
// isExactEquivalent returns true if canonicalizing the language will not alter
// the script or region of a tag.
func isExactEquivalent(l langID) bool {
for _, o := range notEquivalent {
if o == l {
return false
}
}
return true
}
var notEquivalent []langID
func init() {
// Create a list of all languages for which canonicalization may alter the
// script or region.
for _, lm := range langAliasMap {
tag := Tag{lang: langID(lm.from)}
if tag, _ = tag.canonicalize(All); tag.script != 0 || tag.region != 0 {
notEquivalent = append(notEquivalent, langID(lm.from))
}
}
// Maximize undefined regions of paradigm locales.
for i, v := range paradigmLocales {
max, _ := addTags(Tag{lang: langID(v[0])})
if v[1] == 0 {
paradigmLocales[i][1] = uint16(max.region)
}
if v[2] == 0 {
paradigmLocales[i][2] = uint16(max.region)
}
}
}
+859
View File
@@ -0,0 +1,859 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package language
import (
"bytes"
"errors"
"fmt"
"sort"
"strconv"
"strings"
"golang.org/x/text/internal/tag"
)
// isAlpha returns true if the byte is not a digit.
// b must be an ASCII letter or digit.
func isAlpha(b byte) bool {
return b > '9'
}
// isAlphaNum returns true if the string contains only ASCII letters or digits.
func isAlphaNum(s []byte) bool {
for _, c := range s {
if !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9') {
return false
}
}
return true
}
// errSyntax is returned by any of the parsing functions when the
// input is not well-formed, according to BCP 47.
// TODO: return the position at which the syntax error occurred?
var errSyntax = errors.New("language: tag is not well-formed")
// ValueError is returned by any of the parsing functions when the
// input is well-formed but the respective subtag is not recognized
// as a valid value.
type ValueError struct {
v [8]byte
}
func mkErrInvalid(s []byte) error {
var e ValueError
copy(e.v[:], s)
return e
}
func (e ValueError) tag() []byte {
n := bytes.IndexByte(e.v[:], 0)
if n == -1 {
n = 8
}
return e.v[:n]
}
// Error implements the error interface.
func (e ValueError) Error() string {
return fmt.Sprintf("language: subtag %q is well-formed but unknown", e.tag())
}
// Subtag returns the subtag for which the error occurred.
func (e ValueError) Subtag() string {
return string(e.tag())
}
// scanner is used to scan BCP 47 tokens, which are separated by _ or -.
type scanner struct {
b []byte
bytes [max99thPercentileSize]byte
token []byte
start int // start position of the current token
end int // end position of the current token
next int // next point for scan
err error
done bool
}
func makeScannerString(s string) scanner {
scan := scanner{}
if len(s) <= len(scan.bytes) {
scan.b = scan.bytes[:copy(scan.bytes[:], s)]
} else {
scan.b = []byte(s)
}
scan.init()
return scan
}
// makeScanner returns a scanner using b as the input buffer.
// b is not copied and may be modified by the scanner routines.
func makeScanner(b []byte) scanner {
scan := scanner{b: b}
scan.init()
return scan
}
func (s *scanner) init() {
for i, c := range s.b {
if c == '_' {
s.b[i] = '-'
}
}
s.scan()
}
// restToLower converts the string between start and end to lower case.
func (s *scanner) toLower(start, end int) {
for i := start; i < end; i++ {
c := s.b[i]
if 'A' <= c && c <= 'Z' {
s.b[i] += 'a' - 'A'
}
}
}
func (s *scanner) setError(e error) {
if s.err == nil || (e == errSyntax && s.err != errSyntax) {
s.err = e
}
}
// resizeRange shrinks or grows the array at position oldStart such that
// a new string of size newSize can fit between oldStart and oldEnd.
// Sets the scan point to after the resized range.
func (s *scanner) resizeRange(oldStart, oldEnd, newSize int) {
s.start = oldStart
if end := oldStart + newSize; end != oldEnd {
diff := end - oldEnd
if end < cap(s.b) {
b := make([]byte, len(s.b)+diff)
copy(b, s.b[:oldStart])
copy(b[end:], s.b[oldEnd:])
s.b = b
} else {
s.b = append(s.b[end:], s.b[oldEnd:]...)
}
s.next = end + (s.next - s.end)
s.end = end
}
}
// replace replaces the current token with repl.
func (s *scanner) replace(repl string) {
s.resizeRange(s.start, s.end, len(repl))
copy(s.b[s.start:], repl)
}
// gobble removes the current token from the input.
// Caller must call scan after calling gobble.
func (s *scanner) gobble(e error) {
s.setError(e)
if s.start == 0 {
s.b = s.b[:+copy(s.b, s.b[s.next:])]
s.end = 0
} else {
s.b = s.b[:s.start-1+copy(s.b[s.start-1:], s.b[s.end:])]
s.end = s.start - 1
}
s.next = s.start
}
// deleteRange removes the given range from s.b before the current token.
func (s *scanner) deleteRange(start, end int) {
s.setError(errSyntax)
s.b = s.b[:start+copy(s.b[start:], s.b[end:])]
diff := end - start
s.next -= diff
s.start -= diff
s.end -= diff
}
// scan parses the next token of a BCP 47 string. Tokens that are larger
// than 8 characters or include non-alphanumeric characters result in an error
// and are gobbled and removed from the output.
// It returns the end position of the last token consumed.
func (s *scanner) scan() (end int) {
end = s.end
s.token = nil
for s.start = s.next; s.next < len(s.b); {
i := bytes.IndexByte(s.b[s.next:], '-')
if i == -1 {
s.end = len(s.b)
s.next = len(s.b)
i = s.end - s.start
} else {
s.end = s.next + i
s.next = s.end + 1
}
token := s.b[s.start:s.end]
if i < 1 || i > 8 || !isAlphaNum(token) {
s.gobble(errSyntax)
continue
}
s.token = token
return end
}
if n := len(s.b); n > 0 && s.b[n-1] == '-' {
s.setError(errSyntax)
s.b = s.b[:len(s.b)-1]
}
s.done = true
return end
}
// acceptMinSize parses multiple tokens of the given size or greater.
// It returns the end position of the last token consumed.
func (s *scanner) acceptMinSize(min int) (end int) {
end = s.end
s.scan()
for ; len(s.token) >= min; s.scan() {
end = s.end
}
return end
}
// Parse parses the given BCP 47 string and returns a valid Tag. If parsing
// failed it returns an error and any part of the tag that could be parsed.
// If parsing succeeded but an unknown value was found, it returns
// ValueError. The Tag returned in this case is just stripped of the unknown
// value. All other values are preserved. It accepts tags in the BCP 47 format
// and extensions to this standard defined in
// http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// The resulting tag is canonicalized using the default canonicalization type.
func Parse(s string) (t Tag, err error) {
return Default.Parse(s)
}
// Parse parses the given BCP 47 string and returns a valid Tag. If parsing
// failed it returns an error and any part of the tag that could be parsed.
// If parsing succeeded but an unknown value was found, it returns
// ValueError. The Tag returned in this case is just stripped of the unknown
// value. All other values are preserved. It accepts tags in the BCP 47 format
// and extensions to this standard defined in
// http://www.unicode.org/reports/tr35/#Unicode_Language_and_Locale_Identifiers.
// The resulting tag is canonicalized using the the canonicalization type c.
func (c CanonType) Parse(s string) (t Tag, err error) {
// TODO: consider supporting old-style locale key-value pairs.
if s == "" {
return und, errSyntax
}
if len(s) <= maxAltTaglen {
b := [maxAltTaglen]byte{}
for i, c := range s {
// Generating invalid UTF-8 is okay as it won't match.
if 'A' <= c && c <= 'Z' {
c += 'a' - 'A'
} else if c == '_' {
c = '-'
}
b[i] = byte(c)
}
if t, ok := grandfathered(b); ok {
return t, nil
}
}
scan := makeScannerString(s)
t, err = parse(&scan, s)
t, changed := t.canonicalize(c)
if changed {
t.remakeString()
}
return t, err
}
func parse(scan *scanner, s string) (t Tag, err error) {
t = und
var end int
if n := len(scan.token); n <= 1 {
scan.toLower(0, len(scan.b))
if n == 0 || scan.token[0] != 'x' {
return t, errSyntax
}
end = parseExtensions(scan)
} else if n >= 4 {
return und, errSyntax
} else { // the usual case
t, end = parseTag(scan)
if n := len(scan.token); n == 1 {
t.pExt = uint16(end)
end = parseExtensions(scan)
} else if end < len(scan.b) {
scan.setError(errSyntax)
scan.b = scan.b[:end]
}
}
if int(t.pVariant) < len(scan.b) {
if end < len(s) {
s = s[:end]
}
if len(s) > 0 && tag.Compare(s, scan.b) == 0 {
t.str = s
} else {
t.str = string(scan.b)
}
} else {
t.pVariant, t.pExt = 0, 0
}
return t, scan.err
}
// parseTag parses language, script, region and variants.
// It returns a Tag and the end position in the input that was parsed.
func parseTag(scan *scanner) (t Tag, end int) {
var e error
// TODO: set an error if an unknown lang, script or region is encountered.
t.lang, e = getLangID(scan.token)
scan.setError(e)
scan.replace(t.lang.String())
langStart := scan.start
end = scan.scan()
for len(scan.token) == 3 && isAlpha(scan.token[0]) {
// From http://tools.ietf.org/html/bcp47, <lang>-<extlang> tags are equivalent
// to a tag of the form <extlang>.
lang, e := getLangID(scan.token)
if lang != 0 {
t.lang = lang
copy(scan.b[langStart:], lang.String())
scan.b[langStart+3] = '-'
scan.start = langStart + 4
}
scan.gobble(e)
end = scan.scan()
}
if len(scan.token) == 4 && isAlpha(scan.token[0]) {
t.script, e = getScriptID(script, scan.token)
if t.script == 0 {
scan.gobble(e)
}
end = scan.scan()
}
if n := len(scan.token); n >= 2 && n <= 3 {
t.region, e = getRegionID(scan.token)
if t.region == 0 {
scan.gobble(e)
} else {
scan.replace(t.region.String())
}
end = scan.scan()
}
scan.toLower(scan.start, len(scan.b))
t.pVariant = byte(end)
end = parseVariants(scan, end, t)
t.pExt = uint16(end)
return t, end
}
var separator = []byte{'-'}
// parseVariants scans tokens as long as each token is a valid variant string.
// Duplicate variants are removed.
func parseVariants(scan *scanner, end int, t Tag) int {
start := scan.start
varIDBuf := [4]uint8{}
variantBuf := [4][]byte{}
varID := varIDBuf[:0]
variant := variantBuf[:0]
last := -1
needSort := false
for ; len(scan.token) >= 4; scan.scan() {
// TODO: measure the impact of needing this conversion and redesign
// the data structure if there is an issue.
v, ok := variantIndex[string(scan.token)]
if !ok {
// unknown variant
// TODO: allow user-defined variants?
scan.gobble(mkErrInvalid(scan.token))
continue
}
varID = append(varID, v)
variant = append(variant, scan.token)
if !needSort {
if last < int(v) {
last = int(v)
} else {
needSort = true
// There is no legal combinations of more than 7 variants
// (and this is by no means a useful sequence).
const maxVariants = 8
if len(varID) > maxVariants {
break
}
}
}
end = scan.end
}
if needSort {
sort.Sort(variantsSort{varID, variant})
k, l := 0, -1
for i, v := range varID {
w := int(v)
if l == w {
// Remove duplicates.
continue
}
varID[k] = varID[i]
variant[k] = variant[i]
k++
l = w
}
if str := bytes.Join(variant[:k], separator); len(str) == 0 {
end = start - 1
} else {
scan.resizeRange(start, end, len(str))
copy(scan.b[scan.start:], str)
end = scan.end
}
}
return end
}
type variantsSort struct {
i []uint8
v [][]byte
}
func (s variantsSort) Len() int {
return len(s.i)
}
func (s variantsSort) Swap(i, j int) {
s.i[i], s.i[j] = s.i[j], s.i[i]
s.v[i], s.v[j] = s.v[j], s.v[i]
}
func (s variantsSort) Less(i, j int) bool {
return s.i[i] < s.i[j]
}
type bytesSort [][]byte
func (b bytesSort) Len() int {
return len(b)
}
func (b bytesSort) Swap(i, j int) {
b[i], b[j] = b[j], b[i]
}
func (b bytesSort) Less(i, j int) bool {
return bytes.Compare(b[i], b[j]) == -1
}
// parseExtensions parses and normalizes the extensions in the buffer.
// It returns the last position of scan.b that is part of any extension.
// It also trims scan.b to remove excess parts accordingly.
func parseExtensions(scan *scanner) int {
start := scan.start
exts := [][]byte{}
private := []byte{}
end := scan.end
for len(scan.token) == 1 {
extStart := scan.start
ext := scan.token[0]
end = parseExtension(scan)
extension := scan.b[extStart:end]
if len(extension) < 3 || (ext != 'x' && len(extension) < 4) {
scan.setError(errSyntax)
end = extStart
continue
} else if start == extStart && (ext == 'x' || scan.start == len(scan.b)) {
scan.b = scan.b[:end]
return end
} else if ext == 'x' {
private = extension
break
}
exts = append(exts, extension)
}
sort.Sort(bytesSort(exts))
if len(private) > 0 {
exts = append(exts, private)
}
scan.b = scan.b[:start]
if len(exts) > 0 {
scan.b = append(scan.b, bytes.Join(exts, separator)...)
} else if start > 0 {
// Strip trailing '-'.
scan.b = scan.b[:start-1]
}
return end
}
// parseExtension parses a single extension and returns the position of
// the extension end.
func parseExtension(scan *scanner) int {
start, end := scan.start, scan.end
switch scan.token[0] {
case 'u':
attrStart := end
scan.scan()
for last := []byte{}; len(scan.token) > 2; scan.scan() {
if bytes.Compare(scan.token, last) != -1 {
// Attributes are unsorted. Start over from scratch.
p := attrStart + 1
scan.next = p
attrs := [][]byte{}
for scan.scan(); len(scan.token) > 2; scan.scan() {
attrs = append(attrs, scan.token)
end = scan.end
}
sort.Sort(bytesSort(attrs))
copy(scan.b[p:], bytes.Join(attrs, separator))
break
}
last = scan.token
end = scan.end
}
var last, key []byte
for attrEnd := end; len(scan.token) == 2; last = key {
key = scan.token
keyEnd := scan.end
end = scan.acceptMinSize(3)
// TODO: check key value validity
if keyEnd == end || bytes.Compare(key, last) != 1 {
// We have an invalid key or the keys are not sorted.
// Start scanning keys from scratch and reorder.
p := attrEnd + 1
scan.next = p
keys := [][]byte{}
for scan.scan(); len(scan.token) == 2; {
keyStart, keyEnd := scan.start, scan.end
end = scan.acceptMinSize(3)
if keyEnd != end {
keys = append(keys, scan.b[keyStart:end])
} else {
scan.setError(errSyntax)
end = keyStart
}
}
sort.Sort(bytesSort(keys))
reordered := bytes.Join(keys, separator)
if e := p + len(reordered); e < end {
scan.deleteRange(e, end)
end = e
}
copy(scan.b[p:], bytes.Join(keys, separator))
break
}
}
case 't':
scan.scan()
if n := len(scan.token); n >= 2 && n <= 3 && isAlpha(scan.token[1]) {
_, end = parseTag(scan)
scan.toLower(start, end)
}
for len(scan.token) == 2 && !isAlpha(scan.token[1]) {
end = scan.acceptMinSize(3)
}
case 'x':
end = scan.acceptMinSize(1)
default:
end = scan.acceptMinSize(2)
}
return end
}
// Compose creates a Tag from individual parts, which may be of type Tag, Base,
// Script, Region, Variant, []Variant, Extension, []Extension or error. If a
// Base, Script or Region or slice of type Variant or Extension is passed more
// than once, the latter will overwrite the former. Variants and Extensions are
// accumulated, but if two extensions of the same type are passed, the latter
// will replace the former. A Tag overwrites all former values and typically
// only makes sense as the first argument. The resulting tag is returned after
// canonicalizing using the Default CanonType. If one or more errors are
// encountered, one of the errors is returned.
func Compose(part ...interface{}) (t Tag, err error) {
return Default.Compose(part...)
}
// Compose creates a Tag from individual parts, which may be of type Tag, Base,
// Script, Region, Variant, []Variant, Extension, []Extension or error. If a
// Base, Script or Region or slice of type Variant or Extension is passed more
// than once, the latter will overwrite the former. Variants and Extensions are
// accumulated, but if two extensions of the same type are passed, the latter
// will replace the former. A Tag overwrites all former values and typically
// only makes sense as the first argument. The resulting tag is returned after
// canonicalizing using CanonType c. If one or more errors are encountered,
// one of the errors is returned.
func (c CanonType) Compose(part ...interface{}) (t Tag, err error) {
var b builder
if err = b.update(part...); err != nil {
return und, err
}
t, _ = b.tag.canonicalize(c)
if len(b.ext) > 0 || len(b.variant) > 0 {
sort.Sort(sortVariant(b.variant))
sort.Strings(b.ext)
if b.private != "" {
b.ext = append(b.ext, b.private)
}
n := maxCoreSize + tokenLen(b.variant...) + tokenLen(b.ext...)
buf := make([]byte, n)
p := t.genCoreBytes(buf)
t.pVariant = byte(p)
p += appendTokens(buf[p:], b.variant...)
t.pExt = uint16(p)
p += appendTokens(buf[p:], b.ext...)
t.str = string(buf[:p])
} else if b.private != "" {
t.str = b.private
t.remakeString()
}
return
}
type builder struct {
tag Tag
private string // the x extension
ext []string
variant []string
err error
}
func (b *builder) addExt(e string) {
if e == "" {
} else if e[0] == 'x' {
b.private = e
} else {
b.ext = append(b.ext, e)
}
}
var errInvalidArgument = errors.New("invalid Extension or Variant")
func (b *builder) update(part ...interface{}) (err error) {
replace := func(l *[]string, s string, eq func(a, b string) bool) bool {
if s == "" {
b.err = errInvalidArgument
return true
}
for i, v := range *l {
if eq(v, s) {
(*l)[i] = s
return true
}
}
return false
}
for _, x := range part {
switch v := x.(type) {
case Tag:
b.tag.lang = v.lang
b.tag.region = v.region
b.tag.script = v.script
if v.str != "" {
b.variant = nil
for x, s := "", v.str[v.pVariant:v.pExt]; s != ""; {
x, s = nextToken(s)
b.variant = append(b.variant, x)
}
b.ext, b.private = nil, ""
for i, e := int(v.pExt), ""; i < len(v.str); {
i, e = getExtension(v.str, i)
b.addExt(e)
}
}
case Base:
b.tag.lang = v.langID
case Script:
b.tag.script = v.scriptID
case Region:
b.tag.region = v.regionID
case Variant:
if !replace(&b.variant, v.variant, func(a, b string) bool { return a == b }) {
b.variant = append(b.variant, v.variant)
}
case Extension:
if !replace(&b.ext, v.s, func(a, b string) bool { return a[0] == b[0] }) {
b.addExt(v.s)
}
case []Variant:
b.variant = nil
for _, x := range v {
b.update(x)
}
case []Extension:
b.ext, b.private = nil, ""
for _, e := range v {
b.update(e)
}
// TODO: support parsing of raw strings based on morphology or just extensions?
case error:
err = v
}
}
return
}
func tokenLen(token ...string) (n int) {
for _, t := range token {
n += len(t) + 1
}
return
}
func appendTokens(b []byte, token ...string) int {
p := 0
for _, t := range token {
b[p] = '-'
copy(b[p+1:], t)
p += 1 + len(t)
}
return p
}
type sortVariant []string
func (s sortVariant) Len() int {
return len(s)
}
func (s sortVariant) Swap(i, j int) {
s[j], s[i] = s[i], s[j]
}
func (s sortVariant) Less(i, j int) bool {
return variantIndex[s[i]] < variantIndex[s[j]]
}
func findExt(list []string, x byte) int {
for i, e := range list {
if e[0] == x {
return i
}
}
return -1
}
// getExtension returns the name, body and end position of the extension.
func getExtension(s string, p int) (end int, ext string) {
if s[p] == '-' {
p++
}
if s[p] == 'x' {
return len(s), s[p:]
}
end = nextExtension(s, p)
return end, s[p:end]
}
// nextExtension finds the next extension within the string, searching
// for the -<char>- pattern from position p.
// In the fast majority of cases, language tags will have at most
// one extension and extensions tend to be small.
func nextExtension(s string, p int) int {
for n := len(s) - 3; p < n; {
if s[p] == '-' {
if s[p+2] == '-' {
return p
}
p += 3
} else {
p++
}
}
return len(s)
}
var errInvalidWeight = errors.New("ParseAcceptLanguage: invalid weight")
// ParseAcceptLanguage parses the contents of an Accept-Language header as
// defined in http://www.ietf.org/rfc/rfc2616.txt and returns a list of Tags and
// a list of corresponding quality weights. It is more permissive than RFC 2616
// and may return non-nil slices even if the input is not valid.
// The Tags will be sorted by highest weight first and then by first occurrence.
// Tags with a weight of zero will be dropped. An error will be returned if the
// input could not be parsed.
func ParseAcceptLanguage(s string) (tag []Tag, q []float32, err error) {
var entry string
for s != "" {
if entry, s = split(s, ','); entry == "" {
continue
}
entry, weight := split(entry, ';')
// Scan the language.
t, err := Parse(entry)
if err != nil {
id, ok := acceptFallback[entry]
if !ok {
return nil, nil, err
}
t = Tag{lang: id}
}
// Scan the optional weight.
w := 1.0
if weight != "" {
weight = consume(weight, 'q')
weight = consume(weight, '=')
// consume returns the empty string when a token could not be
// consumed, resulting in an error for ParseFloat.
if w, err = strconv.ParseFloat(weight, 32); err != nil {
return nil, nil, errInvalidWeight
}
// Drop tags with a quality weight of 0.
if w <= 0 {
continue
}
}
tag = append(tag, t)
q = append(q, float32(w))
}
sortStable(&tagSort{tag, q})
return tag, q, nil
}
// consume removes a leading token c from s and returns the result or the empty
// string if there is no such token.
func consume(s string, c byte) string {
if s == "" || s[0] != c {
return ""
}
return strings.TrimSpace(s[1:])
}
func split(s string, c byte) (head, tail string) {
if i := strings.IndexByte(s, c); i >= 0 {
return strings.TrimSpace(s[:i]), strings.TrimSpace(s[i+1:])
}
return strings.TrimSpace(s), ""
}
// Add hack mapping to deal with a small number of cases that that occur
// in Accept-Language (with reasonable frequency).
var acceptFallback = map[string]langID{
"english": _en,
"deutsch": _de,
"italian": _it,
"french": _fr,
"*": _mul, // defined in the spec to match all languages.
}
type tagSort struct {
tag []Tag
q []float32
}
func (s *tagSort) Len() int {
return len(s.q)
}
func (s *tagSort) Less(i, j int) bool {
return s.q[i] > s.q[j]
}
func (s *tagSort) Swap(i, j int) {
s.tag[i], s.tag[j] = s.tag[j], s.tag[i]
s.q[i], s.q[j] = s.q[j], s.q[i]
}
+3686
View File
File diff suppressed because it is too large Load Diff
+143
View File
@@ -0,0 +1,143 @@
// Copyright 2013 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package language
// TODO: Various sets of commonly use tags and regions.
// MustParse is like Parse, but panics if the given BCP 47 tag cannot be parsed.
// It simplifies safe initialization of Tag values.
func MustParse(s string) Tag {
t, err := Parse(s)
if err != nil {
panic(err)
}
return t
}
// MustParse is like Parse, but panics if the given BCP 47 tag cannot be parsed.
// It simplifies safe initialization of Tag values.
func (c CanonType) MustParse(s string) Tag {
t, err := c.Parse(s)
if err != nil {
panic(err)
}
return t
}
// MustParseBase is like ParseBase, but panics if the given base cannot be parsed.
// It simplifies safe initialization of Base values.
func MustParseBase(s string) Base {
b, err := ParseBase(s)
if err != nil {
panic(err)
}
return b
}
// MustParseScript is like ParseScript, but panics if the given script cannot be
// parsed. It simplifies safe initialization of Script values.
func MustParseScript(s string) Script {
scr, err := ParseScript(s)
if err != nil {
panic(err)
}
return scr
}
// MustParseRegion is like ParseRegion, but panics if the given region cannot be
// parsed. It simplifies safe initialization of Region values.
func MustParseRegion(s string) Region {
r, err := ParseRegion(s)
if err != nil {
panic(err)
}
return r
}
var (
und = Tag{}
Und Tag = Tag{}
Afrikaans Tag = Tag{lang: _af} // af
Amharic Tag = Tag{lang: _am} // am
Arabic Tag = Tag{lang: _ar} // ar
ModernStandardArabic Tag = Tag{lang: _ar, region: _001} // ar-001
Azerbaijani Tag = Tag{lang: _az} // az
Bulgarian Tag = Tag{lang: _bg} // bg
Bengali Tag = Tag{lang: _bn} // bn
Catalan Tag = Tag{lang: _ca} // ca
Czech Tag = Tag{lang: _cs} // cs
Danish Tag = Tag{lang: _da} // da
German Tag = Tag{lang: _de} // de
Greek Tag = Tag{lang: _el} // el
English Tag = Tag{lang: _en} // en
AmericanEnglish Tag = Tag{lang: _en, region: _US} // en-US
BritishEnglish Tag = Tag{lang: _en, region: _GB} // en-GB
Spanish Tag = Tag{lang: _es} // es
EuropeanSpanish Tag = Tag{lang: _es, region: _ES} // es-ES
LatinAmericanSpanish Tag = Tag{lang: _es, region: _419} // es-419
Estonian Tag = Tag{lang: _et} // et
Persian Tag = Tag{lang: _fa} // fa
Finnish Tag = Tag{lang: _fi} // fi
Filipino Tag = Tag{lang: _fil} // fil
French Tag = Tag{lang: _fr} // fr
CanadianFrench Tag = Tag{lang: _fr, region: _CA} // fr-CA
Gujarati Tag = Tag{lang: _gu} // gu
Hebrew Tag = Tag{lang: _he} // he
Hindi Tag = Tag{lang: _hi} // hi
Croatian Tag = Tag{lang: _hr} // hr
Hungarian Tag = Tag{lang: _hu} // hu
Armenian Tag = Tag{lang: _hy} // hy
Indonesian Tag = Tag{lang: _id} // id
Icelandic Tag = Tag{lang: _is} // is
Italian Tag = Tag{lang: _it} // it
Japanese Tag = Tag{lang: _ja} // ja
Georgian Tag = Tag{lang: _ka} // ka
Kazakh Tag = Tag{lang: _kk} // kk
Khmer Tag = Tag{lang: _km} // km
Kannada Tag = Tag{lang: _kn} // kn
Korean Tag = Tag{lang: _ko} // ko
Kirghiz Tag = Tag{lang: _ky} // ky
Lao Tag = Tag{lang: _lo} // lo
Lithuanian Tag = Tag{lang: _lt} // lt
Latvian Tag = Tag{lang: _lv} // lv
Macedonian Tag = Tag{lang: _mk} // mk
Malayalam Tag = Tag{lang: _ml} // ml
Mongolian Tag = Tag{lang: _mn} // mn
Marathi Tag = Tag{lang: _mr} // mr
Malay Tag = Tag{lang: _ms} // ms
Burmese Tag = Tag{lang: _my} // my
Nepali Tag = Tag{lang: _ne} // ne
Dutch Tag = Tag{lang: _nl} // nl
Norwegian Tag = Tag{lang: _no} // no
Punjabi Tag = Tag{lang: _pa} // pa
Polish Tag = Tag{lang: _pl} // pl
Portuguese Tag = Tag{lang: _pt} // pt
BrazilianPortuguese Tag = Tag{lang: _pt, region: _BR} // pt-BR
EuropeanPortuguese Tag = Tag{lang: _pt, region: _PT} // pt-PT
Romanian Tag = Tag{lang: _ro} // ro
Russian Tag = Tag{lang: _ru} // ru
Sinhala Tag = Tag{lang: _si} // si
Slovak Tag = Tag{lang: _sk} // sk
Slovenian Tag = Tag{lang: _sl} // sl
Albanian Tag = Tag{lang: _sq} // sq
Serbian Tag = Tag{lang: _sr} // sr
SerbianLatin Tag = Tag{lang: _sr, script: _Latn} // sr-Latn
Swedish Tag = Tag{lang: _sv} // sv
Swahili Tag = Tag{lang: _sw} // sw
Tamil Tag = Tag{lang: _ta} // ta
Telugu Tag = Tag{lang: _te} // te
Thai Tag = Tag{lang: _th} // th
Turkish Tag = Tag{lang: _tr} // tr
Ukrainian Tag = Tag{lang: _uk} // uk
Urdu Tag = Tag{lang: _ur} // ur
Uzbek Tag = Tag{lang: _uz} // uz
Vietnamese Tag = Tag{lang: _vi} // vi
Chinese Tag = Tag{lang: _zh} // zh
SimplifiedChinese Tag = Tag{lang: _zh, script: _Hans} // zh-Hans
TraditionalChinese Tag = Tag{lang: _zh, script: _Hant} // zh-Hant
Zulu Tag = Tag{lang: _zu} // zu
)
+27
View File
@@ -0,0 +1,27 @@
Copyright (c) 2009 The Go Authors. All rights reserved.
Redistribution and use in source and binary forms, with or without
modification, are permitted provided that the following conditions are
met:
* Redistributions of source code must retain the above copyright
notice, this list of conditions and the following disclaimer.
* Redistributions in binary form must reproduce the above
copyright notice, this list of conditions and the following disclaimer
in the documentation and/or other materials provided with the
distribution.
* Neither the name of Google Inc. nor the names of its
contributors may be used to endorse or promote products derived from
this software without specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+187
View File
@@ -0,0 +1,187 @@
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
package runes
import (
"unicode/utf8"
"golang.org/x/text/transform"
)
// Note: below we pass invalid UTF-8 to the tIn and tNotIn transformers as is.
// This is done for various reasons:
// - To retain the semantics of the Nop transformer: if input is passed to a Nop
// one would expect it to be unchanged.
// - It would be very expensive to pass a converted RuneError to a transformer:
// a transformer might need more source bytes after RuneError, meaning that
// the only way to pass it safely is to create a new buffer and manage the
// intermingling of RuneErrors and normal input.
// - Many transformers leave ill-formed UTF-8 as is, so this is not
// inconsistent. Generally ill-formed UTF-8 is only replaced if it is a
// logical consequence of the operation (as for Map) or if it otherwise would
// pose security concerns (as for Remove).
// - An alternative would be to return an error on ill-formed UTF-8, but this
// would be inconsistent with other operations.
// If returns a transformer that applies tIn to consecutive runes for which
// s.Contains(r) and tNotIn to consecutive runes for which !s.Contains(r). Reset
// is called on tIn and tNotIn at the start of each run. A Nop transformer will
// substitute a nil value passed to tIn or tNotIn. Invalid UTF-8 is translated
// to RuneError to determine which transformer to apply, but is passed as is to
// the respective transformer.
func If(s Set, tIn, tNotIn transform.Transformer) Transformer {
if tIn == nil && tNotIn == nil {
return Transformer{transform.Nop}
}
if tIn == nil {
tIn = transform.Nop
}
if tNotIn == nil {
tNotIn = transform.Nop
}
sIn, ok := tIn.(transform.SpanningTransformer)
if !ok {
sIn = dummySpan{tIn}
}
sNotIn, ok := tNotIn.(transform.SpanningTransformer)
if !ok {
sNotIn = dummySpan{tNotIn}
}
a := &cond{
tIn: sIn,
tNotIn: sNotIn,
f: s.Contains,
}
a.Reset()
return Transformer{a}
}
type dummySpan struct{ transform.Transformer }
func (d dummySpan) Span(src []byte, atEOF bool) (n int, err error) {
return 0, transform.ErrEndOfSpan
}
type cond struct {
tIn, tNotIn transform.SpanningTransformer
f func(rune) bool
check func(rune) bool // current check to perform
t transform.SpanningTransformer // current transformer to use
}
// Reset implements transform.Transformer.
func (t *cond) Reset() {
t.check = t.is
t.t = t.tIn
t.t.Reset() // notIn will be reset on first usage.
}
func (t *cond) is(r rune) bool {
if t.f(r) {
return true
}
t.check = t.isNot
t.t = t.tNotIn
t.tNotIn.Reset()
return false
}
func (t *cond) isNot(r rune) bool {
if !t.f(r) {
return true
}
t.check = t.is
t.t = t.tIn
t.tIn.Reset()
return false
}
// This implementation of Span doesn't help all too much, but it needs to be
// there to satisfy this package's Transformer interface.
// TODO: there are certainly room for improvements, though. For example, if
// t.t == transform.Nop (which will a common occurrence) it will save a bundle
// to special-case that loop.
func (t *cond) Span(src []byte, atEOF bool) (n int, err error) {
p := 0
for n < len(src) && err == nil {
// Don't process too much at a time as the Spanner that will be
// called on this block may terminate early.
const maxChunk = 4096
max := len(src)
if v := n + maxChunk; v < max {
max = v
}
atEnd := false
size := 0
current := t.t
for ; p < max; p += size {
r := rune(src[p])
if r < utf8.RuneSelf {
size = 1
} else if r, size = utf8.DecodeRune(src[p:]); size == 1 {
if !atEOF && !utf8.FullRune(src[p:]) {
err = transform.ErrShortSrc
break
}
}
if !t.check(r) {
// The next rune will be the start of a new run.
atEnd = true
break
}
}
n2, err2 := current.Span(src[n:p], atEnd || (atEOF && p == len(src)))
n += n2
if err2 != nil {
return n, err2
}
// At this point either err != nil or t.check will pass for the rune at p.
p = n + size
}
return n, err
}
func (t *cond) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
p := 0
for nSrc < len(src) && err == nil {
// Don't process too much at a time, as the work might be wasted if the
// destination buffer isn't large enough to hold the result or a
// transform returns an error early.
const maxChunk = 4096
max := len(src)
if n := nSrc + maxChunk; n < len(src) {
max = n
}
atEnd := false
size := 0
current := t.t
for ; p < max; p += size {
r := rune(src[p])
if r < utf8.RuneSelf {
size = 1
} else if r, size = utf8.DecodeRune(src[p:]); size == 1 {
if !atEOF && !utf8.FullRune(src[p:]) {
err = transform.ErrShortSrc
break
}
}
if !t.check(r) {
// The next rune will be the start of a new run.
atEnd = true
break
}
}
nDst2, nSrc2, err2 := current.Transform(dst[nDst:], src[nSrc:p], atEnd || (atEOF && p == len(src)))
nDst += nDst2
nSrc += nSrc2
if err2 != nil {
return nDst, nSrc, err2
}
// At this point either err != nil or t.check will pass for the rune at p.
p = nSrc + size
}
return nDst, nSrc, err
}
+355
View File
@@ -0,0 +1,355 @@
// Copyright 2014 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
// Package runes provide transforms for UTF-8 encoded text.
package runes // import "golang.org/x/text/runes"
import (
"unicode"
"unicode/utf8"
"golang.org/x/text/transform"
)
// A Set is a collection of runes.
type Set interface {
// Contains returns true if r is contained in the set.
Contains(r rune) bool
}
type setFunc func(rune) bool
func (s setFunc) Contains(r rune) bool {
return s(r)
}
// Note: using funcs here instead of wrapping types result in cleaner
// documentation and a smaller API.
// In creates a Set with a Contains method that returns true for all runes in
// the given RangeTable.
func In(rt *unicode.RangeTable) Set {
return setFunc(func(r rune) bool { return unicode.Is(rt, r) })
}
// In creates a Set with a Contains method that returns true for all runes not
// in the given RangeTable.
func NotIn(rt *unicode.RangeTable) Set {
return setFunc(func(r rune) bool { return !unicode.Is(rt, r) })
}
// Predicate creates a Set with a Contains method that returns f(r).
func Predicate(f func(rune) bool) Set {
return setFunc(f)
}
// Transformer implements the transform.Transformer interface.
type Transformer struct {
t transform.SpanningTransformer
}
func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
return t.t.Transform(dst, src, atEOF)
}
func (t Transformer) Span(b []byte, atEOF bool) (n int, err error) {
return t.t.Span(b, atEOF)
}
func (t Transformer) Reset() { t.t.Reset() }
// Bytes returns a new byte slice with the result of converting b using t. It
// calls Reset on t. It returns nil if any error was found. This can only happen
// if an error-producing Transformer is passed to If.
func (t Transformer) Bytes(b []byte) []byte {
b, _, err := transform.Bytes(t, b)
if err != nil {
return nil
}
return b
}
// String returns a string with the result of converting s using t. It calls
// Reset on t. It returns the empty string if any error was found. This can only
// happen if an error-producing Transformer is passed to If.
func (t Transformer) String(s string) string {
s, _, err := transform.String(t, s)
if err != nil {
return ""
}
return s
}
// TODO:
// - Copy: copying strings and bytes in whole-rune units.
// - Validation (maybe)
// - Well-formed-ness (maybe)
const runeErrorString = string(utf8.RuneError)
// Remove returns a Transformer that removes runes r for which s.Contains(r).
// Illegal input bytes are replaced by RuneError before being passed to f.
func Remove(s Set) Transformer {
if f, ok := s.(setFunc); ok {
// This little trick cuts the running time of BenchmarkRemove for sets
// created by Predicate roughly in half.
// TODO: special-case RangeTables as well.
return Transformer{remove(f)}
}
return Transformer{remove(s.Contains)}
}
// TODO: remove transform.RemoveFunc.
type remove func(r rune) bool
func (remove) Reset() {}
// Span implements transform.Spanner.
func (t remove) Span(src []byte, atEOF bool) (n int, err error) {
for r, size := rune(0), 0; n < len(src); {
if r = rune(src[n]); r < utf8.RuneSelf {
size = 1
} else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
// Invalid rune.
if !atEOF && !utf8.FullRune(src[n:]) {
err = transform.ErrShortSrc
} else {
err = transform.ErrEndOfSpan
}
break
}
if t(r) {
err = transform.ErrEndOfSpan
break
}
n += size
}
return
}
// Transform implements transform.Transformer.
func (t remove) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
for r, size := rune(0), 0; nSrc < len(src); {
if r = rune(src[nSrc]); r < utf8.RuneSelf {
size = 1
} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
// Invalid rune.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
// We replace illegal bytes with RuneError. Not doing so might
// otherwise turn a sequence of invalid UTF-8 into valid UTF-8.
// The resulting byte sequence may subsequently contain runes
// for which t(r) is true that were passed unnoticed.
if !t(utf8.RuneError) {
if nDst+3 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = runeErrorString[0]
dst[nDst+1] = runeErrorString[1]
dst[nDst+2] = runeErrorString[2]
nDst += 3
}
nSrc++
continue
}
if t(r) {
nSrc += size
continue
}
if nDst+size > len(dst) {
err = transform.ErrShortDst
break
}
for i := 0; i < size; i++ {
dst[nDst] = src[nSrc]
nDst++
nSrc++
}
}
return
}
// Map returns a Transformer that maps the runes in the input using the given
// mapping. Illegal bytes in the input are converted to utf8.RuneError before
// being passed to the mapping func.
func Map(mapping func(rune) rune) Transformer {
return Transformer{mapper(mapping)}
}
type mapper func(rune) rune
func (mapper) Reset() {}
// Span implements transform.Spanner.
func (t mapper) Span(src []byte, atEOF bool) (n int, err error) {
for r, size := rune(0), 0; n < len(src); n += size {
if r = rune(src[n]); r < utf8.RuneSelf {
size = 1
} else if r, size = utf8.DecodeRune(src[n:]); size == 1 {
// Invalid rune.
if !atEOF && !utf8.FullRune(src[n:]) {
err = transform.ErrShortSrc
} else {
err = transform.ErrEndOfSpan
}
break
}
if t(r) != r {
err = transform.ErrEndOfSpan
break
}
}
return n, err
}
// Transform implements transform.Transformer.
func (t mapper) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
var replacement rune
var b [utf8.UTFMax]byte
for r, size := rune(0), 0; nSrc < len(src); {
if r = rune(src[nSrc]); r < utf8.RuneSelf {
if replacement = t(r); replacement < utf8.RuneSelf {
if nDst == len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = byte(replacement)
nDst++
nSrc++
continue
}
size = 1
} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 {
// Invalid rune.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
if replacement = t(utf8.RuneError); replacement == utf8.RuneError {
if nDst+3 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = runeErrorString[0]
dst[nDst+1] = runeErrorString[1]
dst[nDst+2] = runeErrorString[2]
nDst += 3
nSrc++
continue
}
} else if replacement = t(r); replacement == r {
if nDst+size > len(dst) {
err = transform.ErrShortDst
break
}
for i := 0; i < size; i++ {
dst[nDst] = src[nSrc]
nDst++
nSrc++
}
continue
}
n := utf8.EncodeRune(b[:], replacement)
if nDst+n > len(dst) {
err = transform.ErrShortDst
break
}
for i := 0; i < n; i++ {
dst[nDst] = b[i]
nDst++
}
nSrc += size
}
return
}
// ReplaceIllFormed returns a transformer that replaces all input bytes that are
// not part of a well-formed UTF-8 code sequence with utf8.RuneError.
func ReplaceIllFormed() Transformer {
return Transformer{&replaceIllFormed{}}
}
type replaceIllFormed struct{ transform.NopResetter }
func (t replaceIllFormed) Span(src []byte, atEOF bool) (n int, err error) {
for n < len(src) {
// ASCII fast path.
if src[n] < utf8.RuneSelf {
n++
continue
}
r, size := utf8.DecodeRune(src[n:])
// Look for a valid non-ASCII rune.
if r != utf8.RuneError || size != 1 {
n += size
continue
}
// Look for short source data.
if !atEOF && !utf8.FullRune(src[n:]) {
err = transform.ErrShortSrc
break
}
// We have an invalid rune.
err = transform.ErrEndOfSpan
break
}
return n, err
}
func (t replaceIllFormed) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
for nSrc < len(src) {
// ASCII fast path.
if r := src[nSrc]; r < utf8.RuneSelf {
if nDst == len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst] = r
nDst++
nSrc++
continue
}
// Look for a valid non-ASCII rune.
if _, size := utf8.DecodeRune(src[nSrc:]); size != 1 {
if size != copy(dst[nDst:], src[nSrc:nSrc+size]) {
err = transform.ErrShortDst
break
}
nDst += size
nSrc += size
continue
}
// Look for short source data.
if !atEOF && !utf8.FullRune(src[nSrc:]) {
err = transform.ErrShortSrc
break
}
// We have an invalid rune.
if nDst+3 > len(dst) {
err = transform.ErrShortDst
break
}
dst[nDst+0] = runeErrorString[0]
dst[nDst+1] = runeErrorString[1]
dst[nDst+2] = runeErrorString[2]
nDst += 3
nSrc++
}
return nDst, nSrc, err
}
+5
View File
@@ -62,6 +62,11 @@ func (e *Common) Default() string {
return ""
}
// Element returns the XML element name.
func (e *Common) Element() string {
return e.name
}
// GetCommon returns e. It is provided such that Common implements Elem.
func (e *Common) GetCommon() *Common {
return e
+1 -1
View File
@@ -47,7 +47,7 @@ type Loader interface {
Reader(i int) (io.ReadCloser, error)
}
var fileRe = regexp.MustCompile(".*/(.*)/(.*)\\.xml")
var fileRe = regexp.MustCompile(`.*[/\\](.*)[/\\](.*)\.xml`)
// Decode loads and decodes the files represented by l.
func (d *Decoder) Decode(l Loader) (cldr *CLDR, err error) {
+41 -3
View File
@@ -126,6 +126,7 @@ type SupplementalData struct {
Population string `xml:"population,attr"`
LanguagePopulation []*struct {
Common
LiteracyPercent string `xml:"literacyPercent,attr"`
WritingPercent string `xml:"writingPercent,attr"`
PopulationPercent string `xml:"populationPercent,attr"`
OfficialStatus string `xml:"officialStatus,attr"`
@@ -517,12 +518,22 @@ type SupplementalData struct {
Common
LanguageMatches []*struct {
Common
ParadigmLocales []*struct {
Common
Locales string `xml:"locales,attr"`
} `xml:"paradigmLocales"`
MatchVariable []*struct {
Common
Id string `xml:"id,attr"`
Value string `xml:"value,attr"`
} `xml:"matchVariable"`
LanguageMatch []*struct {
Common
Desired string `xml:"desired,attr"`
Oneway string `xml:"oneway,attr"`
Percent string `xml:"percent,attr"`
Supported string `xml:"supported,attr"`
Percent string `xml:"percent,attr"`
Distance string `xml:"distance,attr"`
Oneway string `xml:"oneway,attr"`
} `xml:"languageMatch"`
} `xml:"languageMatches"`
} `xml:"languageMatching"`
@@ -625,6 +636,13 @@ type SupplementalData struct {
Path string `xml:"path,attr"`
} `xml:"rgPath"`
} `xml:"rgScope"`
LanguageGroups *struct {
Common
LanguageGroup []*struct {
Common
Parent string `xml:"parent,attr"`
} `xml:"languageGroup"`
} `xml:"languageGroups"`
}
// LDML is the top-level type for locale-specific data.
@@ -695,6 +713,15 @@ type LDML struct {
Common
Registry string `xml:"registry,attr"`
} `xml:"mapping"`
ParseLenients []*struct {
Common
Scope string `xml:"scope,attr"`
Level string `xml:"level,attr"`
ParseLenient []*struct {
Common
Sample string `xml:"sample,attr"`
} `xml:"parseLenient"`
} `xml:"parseLenients"`
} `xml:"characters"`
Delimiters *struct {
Common
@@ -1450,7 +1477,18 @@ type Numbers struct {
Count string `xml:"count,attr"`
} `xml:"pattern"`
} `xml:"miscPatterns"`
MinimalPairs []*struct {
Common
PluralMinimalPairs []*struct {
Common
Count string `xml:"count,attr"`
} `xml:"pluralMinimalPairs"`
OrdinalMinimalPairs []*struct {
Common
Ordinal string `xml:"ordinal,attr"`
} `xml:"ordinalMinimalPairs"`
} `xml:"minimalPairs"`
}
// Version is the version of CLDR from which the XML definitions are generated.
const Version = "30"
const Version = "32"
+6 -12
View File
@@ -33,17 +33,9 @@ const (
// streamSafe implements the policy of when a CGJ should be inserted.
type streamSafe uint8
// mkStreamSafe is a shorthand for declaring a streamSafe var and calling
// first on it.
func mkStreamSafe(p Properties) streamSafe {
return streamSafe(p.nTrailingNonStarters())
}
// first inserts the first rune of a segment.
// first inserts the first rune of a segment. It is a faster version of next if
// it is known p represents the first rune in a segment.
func (ss *streamSafe) first(p Properties) {
if *ss != 0 {
panic("!= 0")
}
*ss = streamSafe(p.nTrailingNonStarters())
}
@@ -66,7 +58,7 @@ func (ss *streamSafe) next(p Properties) ssState {
// be a non-starter. Note that it always hold that if nLead > 0 then
// nLead == nTrail.
if n == 0 {
*ss = 0
*ss = streamSafe(p.nTrailingNonStarters())
return ssStarter
}
return ssSuccess
@@ -142,7 +134,6 @@ func (rb *reorderBuffer) setFlusher(out []byte, f func(*reorderBuffer) bool) {
func (rb *reorderBuffer) reset() {
rb.nrune = 0
rb.nbyte = 0
rb.ss = 0
}
func (rb *reorderBuffer) doFlush() bool {
@@ -257,6 +248,9 @@ func (rb *reorderBuffer) insertUnsafe(src input, i int, info Properties) {
// It flushes the buffer on each new segment start.
func (rb *reorderBuffer) insertDecomposed(dcomp []byte) insertErr {
rb.tmpBytes.setBytes(dcomp)
// As the streamSafe accounting already handles the counting for modifiers,
// we don't have to call next. However, we do need to keep the accounting
// intact when flushing the buffer.
for i := 0; i < len(dcomp); {
info := rb.f.info(rb.tmpBytes, i)
if info.BoundaryBefore() && rb.nrune > 0 && !rb.doFlush() {
+6 -2
View File
@@ -90,16 +90,20 @@ func (in *input) charinfoNFKC(p int) (uint16, int) {
}
func (in *input) hangul(p int) (r rune) {
var size int
if in.bytes == nil {
if !isHangulString(in.str[p:]) {
return 0
}
r, _ = utf8.DecodeRuneInString(in.str[p:])
r, size = utf8.DecodeRuneInString(in.str[p:])
} else {
if !isHangul(in.bytes[p:]) {
return 0
}
r, _ = utf8.DecodeRune(in.bytes[p:])
r, size = utf8.DecodeRune(in.bytes[p:])
}
if size != hangulUTF8Size {
return 0
}
return r
}
+19 -12
View File
@@ -41,6 +41,7 @@ func (i *Iter) Init(f Form, src []byte) {
i.next = i.rb.f.nextMain
i.asciiF = nextASCIIBytes
i.info = i.rb.f.info(i.rb.src, i.p)
i.rb.ss.first(i.info)
}
// InitString initializes i to iterate over src after normalizing it to Form f.
@@ -56,11 +57,12 @@ func (i *Iter) InitString(f Form, src string) {
i.next = i.rb.f.nextMain
i.asciiF = nextASCIIString
i.info = i.rb.f.info(i.rb.src, i.p)
i.rb.ss.first(i.info)
}
// Seek sets the segment to be returned by the next call to Next to start
// at position p. It is the responsibility of the caller to set p to the
// start of a UTF8 rune.
// start of a segment.
func (i *Iter) Seek(offset int64, whence int) (int64, error) {
var abs int64
switch whence {
@@ -84,6 +86,7 @@ func (i *Iter) Seek(offset int64, whence int) (int64, error) {
i.multiSeg = nil
i.next = i.rb.f.nextMain
i.info = i.rb.f.info(i.rb.src, i.p)
i.rb.ss.first(i.info)
return abs, nil
}
@@ -161,6 +164,7 @@ func nextHangul(i *Iter) []byte {
if next >= i.rb.nsrc {
i.setDone()
} else if i.rb.src.hangul(next) == 0 {
i.rb.ss.next(i.info)
i.info = i.rb.f.info(i.rb.src, i.p)
i.next = i.rb.f.nextMain
return i.next(i)
@@ -204,12 +208,10 @@ func nextMultiNorm(i *Iter) []byte {
if info.BoundaryBefore() {
i.rb.compose()
seg := i.buf[:i.rb.flushCopy(i.buf[:])]
i.rb.ss.first(info)
i.rb.insertUnsafe(input{bytes: d}, j, info)
i.multiSeg = d[j+int(info.size):]
return seg
}
i.rb.ss.next(info)
i.rb.insertUnsafe(input{bytes: d}, j, info)
j += int(info.size)
}
@@ -222,9 +224,9 @@ func nextMultiNorm(i *Iter) []byte {
func nextDecomposed(i *Iter) (next []byte) {
outp := 0
inCopyStart, outCopyStart := i.p, 0
ss := mkStreamSafe(i.info)
for {
if sz := int(i.info.size); sz <= 1 {
i.rb.ss = 0
p := i.p
i.p++ // ASCII or illegal byte. Either way, advance by 1.
if i.p >= i.rb.nsrc {
@@ -243,6 +245,8 @@ func nextDecomposed(i *Iter) (next []byte) {
p := outp + len(d)
if outp > 0 {
i.rb.src.copySlice(i.buf[outCopyStart:], inCopyStart, i.p)
// TODO: this condition should not be possible, but we leave it
// in for defensive purposes.
if p > len(i.buf) {
return i.buf[:outp]
}
@@ -266,7 +270,7 @@ func nextDecomposed(i *Iter) (next []byte) {
} else {
i.info = i.rb.f.info(i.rb.src, i.p)
}
switch ss.next(i.info) {
switch i.rb.ss.next(i.info) {
case ssOverflow:
i.next = nextCGJDecompose
fallthrough
@@ -309,7 +313,7 @@ func nextDecomposed(i *Iter) (next []byte) {
}
prevCC := i.info.tccc
i.info = i.rb.f.info(i.rb.src, i.p)
if v := ss.next(i.info); v == ssStarter {
if v := i.rb.ss.next(i.info); v == ssStarter {
break
} else if v == ssOverflow {
i.next = nextCGJDecompose
@@ -335,10 +339,6 @@ doNorm:
func doNormDecomposed(i *Iter) []byte {
for {
if s := i.rb.ss.next(i.info); s == ssOverflow {
i.next = nextCGJDecompose
break
}
i.rb.insertUnsafe(i.rb.src, i.p, i.info)
if i.p += int(i.info.size); i.p >= i.rb.nsrc {
i.setDone()
@@ -348,6 +348,10 @@ func doNormDecomposed(i *Iter) []byte {
if i.info.ccc == 0 {
break
}
if s := i.rb.ss.next(i.info); s == ssOverflow {
i.next = nextCGJDecompose
break
}
}
// new segment or too many combining characters: exit normalization
return i.buf[:i.rb.flushCopy(i.buf[:])]
@@ -357,6 +361,7 @@ func nextCGJDecompose(i *Iter) []byte {
i.rb.ss = 0
i.rb.insertCGJ()
i.next = nextDecomposed
i.rb.ss.first(i.info)
buf := doNormDecomposed(i)
return buf
}
@@ -365,7 +370,6 @@ func nextCGJDecompose(i *Iter) []byte {
func nextComposed(i *Iter) []byte {
outp, startp := 0, i.p
var prevCC uint8
ss := mkStreamSafe(i.info)
for {
if !i.info.isYesC() {
goto doNorm
@@ -385,11 +389,12 @@ func nextComposed(i *Iter) []byte {
i.setDone()
break
} else if i.rb.src._byte(i.p) < utf8.RuneSelf {
i.rb.ss = 0
i.next = i.asciiF
break
}
i.info = i.rb.f.info(i.rb.src, i.p)
if v := ss.next(i.info); v == ssStarter {
if v := i.rb.ss.next(i.info); v == ssStarter {
break
} else if v == ssOverflow {
i.next = nextCGJCompose
@@ -401,8 +406,10 @@ func nextComposed(i *Iter) []byte {
}
return i.returnSlice(startp, i.p)
doNorm:
// reset to start position
i.p = startp
i.info = i.rb.f.info(i.rb.src, i.p)
i.rb.ss.first(i.info)
if i.info.multiSegment() {
d := i.info.Decomposition()
info := i.rb.f.info(input{bytes: d}, 0)
+2 -2
View File
@@ -795,7 +795,7 @@ func makeTables() {
}
fmt.Fprintf(w, "// Total size of tables: %dKB (%d bytes)\n", (size+512)/1024, size)
gen.WriteGoFile("tables.go", "norm", w.Bytes())
gen.WriteVersionedGoFile("tables.go", "norm", w.Bytes())
}
func printChars() {
@@ -972,5 +972,5 @@ func printTestdata() {
}
}
fmt.Fprintln(w, "}")
gen.WriteGoFile("data_test.go", "norm", w.Bytes())
gen.WriteVersionedGoFile("data_test.go", "norm", w.Bytes())
}
+8 -8
View File
@@ -324,7 +324,6 @@ func (f *formInfo) quickSpan(src input, i, end int, atEOF bool) (n int, ok bool)
// have an overflow for runes that are starters (e.g. with U+FF9E).
switch ss.next(info) {
case ssStarter:
ss.first(info)
lastSegStart = i
case ssOverflow:
return lastSegStart, false
@@ -441,6 +440,8 @@ func (f Form) nextBoundary(src input, nsrc int, atEOF bool) int {
}
return -1
}
// TODO: Using streamSafe to determine the boundary isn't the same as
// using BoundaryBefore. Determine which should be used.
if s := ss.next(info); s != ssSuccess {
return i
}
@@ -505,15 +506,14 @@ func decomposeSegment(rb *reorderBuffer, sp int, atEOF bool) int {
if info.size == 0 {
return 0
}
if rb.nrune > 0 {
if s := rb.ss.next(info); s == ssStarter {
goto end
} else if s == ssOverflow {
rb.insertCGJ()
if s := rb.ss.next(info); s == ssStarter {
// TODO: this could be removed if we don't support merging.
if rb.nrune > 0 {
goto end
}
} else {
rb.ss.first(info)
} else if s == ssOverflow {
rb.insertCGJ()
goto end
}
if err := rb.insertFlush(rb.src, sp, info); err != iSuccess {
return int(err)
File diff suppressed because it is too large Load Diff
@@ -1,5 +1,7 @@
// Code generated by running "go generate" in golang.org/x/text. DO NOT EDIT.
// +build !go1.10
package norm
const (
+1 -1
View File
@@ -40,7 +40,7 @@ func (f Form) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error)
}
func flushTransform(rb *reorderBuffer) bool {
// Write out (must fully fit in dst, or else it is a ErrShortDst).
// Write out (must fully fit in dst, or else it is an ErrShortDst).
if len(rb.out) < rb.nrune*utf8.UTFMax {
return false
}