utf8.go raw
1 package characters
2
3 import (
4 "unicode/utf8"
5 )
6
// utf8Err pinpoints a character that failed validation inside a byte slice.
// Its zero value stands for "no error"; use Zero to test for that.
type utf8Err struct {
	// Index is the byte index of the invalid character.
	Index int
	// Size is the size in bytes of the invalid character; 0 means no error.
	Size int
}

// Zero reports whether u describes no error at all, which is the case exactly
// when its Size is 0. Index is not consulted.
func (u utf8Err) Zero() bool {
	hasError := u.Size != 0
	return !hasError
}
15
16 // Verified that a given string is only made of valid UTF-8 characters allowed
17 // by the TOML spec:
18 //
19 // Any Unicode character may be used except those that must be escaped:
20 // quotation mark, backslash, and the control characters other than tab (U+0000
21 // to U+0008, U+000A to U+001F, U+007F).
22 //
23 // It is a copy of the Go 1.17 utf8.Valid implementation, tweaked to exit early
24 // when a character is not allowed.
25 //
26 // The returned utf8Err is Zero() if the string is valid, or contains the byte
27 // index and size of the invalid character.
28 //
29 // quotation mark => already checked
30 // backslash => already checked
31 // 0-0x8 => invalid
32 // 0x9 => tab, ok
33 // 0xA - 0x1F => invalid
34 // 0x7F => invalid
35 func Utf8TomlValidAlreadyEscaped(p []byte) (err utf8Err) {
36 // Fast path. Check for and skip 8 bytes of ASCII characters per iteration.
37 offset := 0
38 for len(p) >= 8 {
39 // Combining two 32 bit loads allows the same code to be used
40 // for 32 and 64 bit platforms.
41 // The compiler can generate a 32bit load for first32 and second32
42 // on many platforms. See test/codegen/memcombine.go.
43 first32 := uint32(p[0]) | uint32(p[1])<<8 | uint32(p[2])<<16 | uint32(p[3])<<24
44 second32 := uint32(p[4]) | uint32(p[5])<<8 | uint32(p[6])<<16 | uint32(p[7])<<24
45 if (first32|second32)&0x80808080 != 0 {
46 // Found a non ASCII byte (>= RuneSelf).
47 break
48 }
49
50 for i, b := range p[:8] {
51 if InvalidAscii(b) {
52 err.Index = offset + i
53 err.Size = 1
54 return
55 }
56 }
57
58 p = p[8:]
59 offset += 8
60 }
61 n := len(p)
62 for i := 0; i < n; {
63 pi := p[i]
64 if pi < utf8.RuneSelf {
65 if InvalidAscii(pi) {
66 err.Index = offset + i
67 err.Size = 1
68 return
69 }
70 i++
71 continue
72 }
73 x := first[pi]
74 if x == xx {
75 // Illegal starter byte.
76 err.Index = offset + i
77 err.Size = 1
78 return
79 }
80 size := int(x & 7)
81 if i+size > n {
82 // Short or invalid.
83 err.Index = offset + i
84 err.Size = n - i
85 return
86 }
87 accept := acceptRanges[x>>4]
88 if c := p[i+1]; c < accept.lo || accept.hi < c {
89 err.Index = offset + i
90 err.Size = 2
91 return
92 } else if size == 2 {
93 } else if c := p[i+2]; c < locb || hicb < c {
94 err.Index = offset + i
95 err.Size = 3
96 return
97 } else if size == 3 {
98 } else if c := p[i+3]; c < locb || hicb < c {
99 err.Index = offset + i
100 err.Size = 4
101 return
102 }
103 i += size
104 }
105 return
106 }
107
108 // Return the size of the next rune if valid, 0 otherwise.
109 func Utf8ValidNext(p []byte) int {
110 c := p[0]
111
112 if c < utf8.RuneSelf {
113 if InvalidAscii(c) {
114 return 0
115 }
116 return 1
117 }
118
119 x := first[c]
120 if x == xx {
121 // Illegal starter byte.
122 return 0
123 }
124 size := int(x & 7)
125 if size > len(p) {
126 // Short or invalid.
127 return 0
128 }
129 accept := acceptRanges[x>>4]
130 if c := p[1]; c < accept.lo || accept.hi < c {
131 return 0
132 } else if size == 2 {
133 } else if c := p[2]; c < locb || hicb < c {
134 return 0
135 } else if size == 3 {
136 } else if c := p[3]; c < locb || hicb < c {
137 return 0
138 }
139
140 return size
141 }
142
// acceptRange holds the bounds of the values that the second byte of a UTF-8
// sequence is allowed to take.
type acceptRange struct {
	lo, hi uint8 // lowest and highest value for the second byte, in that order.
}
149
150 // acceptRanges has size 16 to avoid bounds checks in the code that uses it.
151 var acceptRanges = [16]acceptRange{
152 0: {locb, hicb},
153 1: {0xA0, hicb},
154 2: {locb, 0x9F},
155 3: {0x90, hicb},
156 4: {locb, 0x8F},
157 }
158
// first is information about the first byte in a UTF-8 sequence.
//
// It is indexed by the byte value. Each entry is one of the as/xx/s1..s7
// constants: the low bits (entry & 7) give the length of the sequence and the
// high nibble (entry >> 4) is the index into acceptRanges to use for the
// second byte. xx marks bytes that cannot start a sequence.
var first = [256]uint8{
	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x00-0x0F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x10-0x1F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x20-0x2F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x30-0x3F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x40-0x4F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x50-0x5F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x60-0x6F
	as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, as, // 0x70-0x7F
	//   1   2   3   4   5   6   7   8   9   A   B   C   D   E   F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x80-0x8F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0x90-0x9F
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xA0-0xAF
	xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xB0-0xBF
	xx, xx, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xC0-0xCF
	s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, s1, // 0xD0-0xDF
	s2, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s3, s4, s3, s3, // 0xE0-0xEF
	s5, s6, s6, s6, s7, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, xx, // 0xF0-0xFF
}
180
const (
	// The default lowest and highest continuation byte.
	locb = 0b10000000
	hicb = 0b10111111

	// The names of these constants are chosen to give nice alignment in the
	// first table. The first nibble is an index into acceptRanges or F for
	// special one-byte cases. The second nibble is the Rune length or the
	// Status for the special one-byte case.
	xx = 0xF1 // invalid: size 1
	as = 0xF0 // ASCII: size 1
	s1 = 0x02 // accept 0, size 2
	s2 = 0x13 // accept 1, size 3
	s3 = 0x03 // accept 0, size 3
	s4 = 0x23 // accept 2, size 3
	s5 = 0x34 // accept 3, size 4
	s6 = 0x04 // accept 0, size 4
	s7 = 0x44 // accept 4, size 4
)
200