1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 5 // Package bytes implements functions for the manipulation of byte slices.
6 // It is analogous to the facilities of the [strings] package.
7 package bytes
8 9 import (
10 "internal/bytealg"
11 "math/bits"
12 "unicode"
13 "unicode/utf8"
14 _ "unsafe" // for linkname
15 )
16 17 // Equal reports whether a and b
18 // are the same length and contain the same bytes.
19 // A nil argument is equivalent to an empty slice.
20 func Equal(a, b []byte) bool {
21 // Neither cmd/compile nor gccgo allocates for these string conversions.
22 return []byte(a) == []byte(b)
23 }
24 25 // Compare returns an integer comparing two byte slices lexicographically.
26 // The result will be 0 if a == b, -1 if a < b, and +1 if a > b.
27 // A nil argument is equivalent to an empty slice.
28 func Compare(a, b []byte) int {
29 return bytealg.Compare(a, b)
30 }
31 32 // explode splits s into a slice of UTF-8 sequences, one per Unicode code point (still slices of bytes),
33 // up to a maximum of n byte slices. Invalid UTF-8 sequences are chopped into individual bytes.
34 func explode(s []byte, n int) [][]byte {
35 if n <= 0 || n > len(s) {
36 n = len(s)
37 }
38 a := [][]byte{:n}
39 var size int
40 na := 0
41 for len(s) > 0 {
42 if na+1 >= n {
43 a[na] = s
44 na++
45 break
46 }
47 _, size = utf8.DecodeRune(s)
48 a[na] = s[0:size:size]
49 s = s[size:]
50 na++
51 }
52 return a[0:na]
53 }
54 55 // Count counts the number of non-overlapping instances of sep in s.
56 // If sep is an empty slice, Count returns 1 + the number of UTF-8-encoded code points in s.
57 func Count(s, sep []byte) int {
58 // special case
59 if len(sep) == 0 {
60 return utf8.RuneCount(s) + 1
61 }
62 if len(sep) == 1 {
63 return bytealg.Count(s, sep[0])
64 }
65 n := 0
66 for {
67 i := Index(s, sep)
68 if i == -1 {
69 return n
70 }
71 n++
72 s = s[i+len(sep):]
73 }
74 }
75 76 // Contains reports whether subslice is within b.
77 func Contains(b, subslice []byte) bool {
78 return Index(b, subslice) != -1
79 }
80 81 // ContainsAny reports whether any of the UTF-8-encoded code points in chars are within b.
82 func ContainsAny(b []byte, chars []byte) bool {
83 return IndexAny(b, chars) >= 0
84 }
85 86 // ContainsRune reports whether the rune is contained in the UTF-8-encoded byte slice b.
87 func ContainsRune(b []byte, r rune) bool {
88 return IndexRune(b, r) >= 0
89 }
90 91 // ContainsFunc reports whether any of the UTF-8-encoded code points r within b satisfy f(r).
92 func ContainsFunc(b []byte, f func(rune) bool) bool {
93 return IndexFunc(b, f) >= 0
94 }
95 96 // IndexByte returns the index of the first instance of c in b, or -1 if c is not present in b.
97 func IndexByte(b []byte, c byte) int {
98 return bytealg.IndexByte(b, c)
99 }
100 101 func indexBytePortable(s []byte, c byte) int {
102 for i, b := range s {
103 if b == c {
104 return i
105 }
106 }
107 return -1
108 }
109 110 // LastIndex returns the index of the last instance of sep in s, or -1 if sep is not present in s.
111 func LastIndex(s, sep []byte) int {
112 n := len(sep)
113 switch {
114 case n == 0:
115 return len(s)
116 case n == 1:
117 return bytealg.LastIndexByte(s, sep[0])
118 case n == len(s):
119 if Equal(s, sep) {
120 return 0
121 }
122 return -1
123 case n > len(s):
124 return -1
125 }
126 return bytealg.LastIndexRabinKarp(s, sep)
127 }
128 129 // LastIndexByte returns the index of the last instance of c in s, or -1 if c is not present in s.
130 func LastIndexByte(s []byte, c byte) int {
131 return bytealg.LastIndexByte(s, c)
132 }
133 134 // IndexRune interprets s as a sequence of UTF-8-encoded code points.
135 // It returns the byte index of the first occurrence in s of the given rune.
136 // It returns -1 if rune is not present in s.
137 // If r is [utf8.RuneError], it returns the first instance of any
138 // invalid UTF-8 byte sequence.
139 func IndexRune(s []byte, r rune) int {
140 const haveFastIndex = bytealg.MaxBruteForce > 0
141 switch {
142 case 0 <= r && r < utf8.RuneSelf:
143 return IndexByte(s, byte(r))
144 case r == utf8.RuneError:
145 for i := 0; i < len(s); {
146 r1, n := utf8.DecodeRune(s[i:])
147 if r1 == utf8.RuneError {
148 return i
149 }
150 i += n
151 }
152 return -1
153 case !utf8.ValidRune(r):
154 return -1
155 default:
156 // Search for rune r using the last byte of its UTF-8 encoded form.
157 // The distribution of the last byte is more uniform compared to the
158 // first byte which has a 78% chance of being [240, 243, 244].
159 var b [utf8.UTFMax]byte
160 n := utf8.EncodeRune(b[:], r)
161 last := n - 1
162 i := last
163 fails := 0
164 for i < len(s) {
165 if s[i] != b[last] {
166 o := IndexByte(s[i+1:], b[last])
167 if o < 0 {
168 return -1
169 }
170 i += o + 1
171 }
172 // Step backwards comparing bytes.
173 for j := 1; j < n; j++ {
174 if s[i-j] != b[last-j] {
175 goto next
176 }
177 }
178 return i - last
179 next:
180 fails++
181 i++
182 if (haveFastIndex && fails > bytealg.Cutover(i)) && i < len(s) ||
183 (!haveFastIndex && fails >= 4+i>>4 && i < len(s)) {
184 goto fallback
185 }
186 }
187 return -1
188 189 fallback:
190 // Switch to bytealg.Index, if available, or a brute force search when
191 // IndexByte returns too many false positives.
192 if haveFastIndex {
193 if j := bytealg.Index(s[i-last:], b[:n]); j >= 0 {
194 return i + j - last
195 }
196 } else {
197 // If bytealg.Index is not available a brute force search is
198 // ~1.5-3x faster than Rabin-Karp since n is small.
199 c0 := b[last]
200 c1 := b[last-1] // There are at least 2 chars to match
201 loop:
202 for ; i < len(s); i++ {
203 if s[i] == c0 && s[i-1] == c1 {
204 for k := 2; k < n; k++ {
205 if s[i-k] != b[last-k] {
206 continue loop
207 }
208 }
209 return i - last
210 }
211 }
212 }
213 return -1
214 }
215 }
216 217 // IndexAny interprets s as a sequence of UTF-8-encoded Unicode code points.
218 // It returns the byte index of the first occurrence in s of any of the Unicode
219 // code points in chars. It returns -1 if chars is empty or if there is no code
220 // point in common.
221 func IndexAny(s []byte, chars []byte) int {
222 if chars == "" {
223 // Avoid scanning all of s.
224 return -1
225 }
226 if len(s) == 1 {
227 r := rune(s[0])
228 if r >= utf8.RuneSelf {
229 // search utf8.RuneError.
230 for ci := 0; ci < len(chars); {
231 cr, cw := utf8.DecodeRune(chars[ci:])
232 if cr == utf8.RuneError {
233 return 0
234 }
235 ci += cw
236 }
237 return -1
238 }
239 if bytealg.IndexByteString(chars, s[0]) >= 0 {
240 return 0
241 }
242 return -1
243 }
244 if len(chars) == 1 {
245 r := rune(chars[0])
246 if r >= utf8.RuneSelf {
247 r = utf8.RuneError
248 }
249 return IndexRune(s, r)
250 }
251 if len(s) > 8 {
252 if as, isASCII := makeASCIISet(chars); isASCII {
253 for i, c := range s {
254 if as.contains(c) {
255 return i
256 }
257 }
258 return -1
259 }
260 }
261 var width int
262 for i := 0; i < len(s); i += width {
263 r := rune(s[i])
264 if r < utf8.RuneSelf {
265 if bytealg.IndexByteString(chars, s[i]) >= 0 {
266 return i
267 }
268 width = 1
269 continue
270 }
271 r, width = utf8.DecodeRune(s[i:])
272 if r != utf8.RuneError {
273 // r is 2 to 4 bytes
274 if len(chars) == width {
275 if Equal(chars, s[i:i+width]) {
276 return i
277 }
278 continue
279 }
280 // Use bytealg.IndexString for performance if available.
281 if bytealg.MaxLen >= width {
282 if bytealg.IndexString(chars, s[i:i+width]) >= 0 {
283 return i
284 }
285 continue
286 }
287 }
288 for ci := 0; ci < len(chars); {
289 ch, cw := utf8.DecodeRune(chars[ci:])
290 if r == ch {
291 return i
292 }
293 ci += cw
294 }
295 }
296 return -1
297 }
298 299 // LastIndexAny interprets s as a sequence of UTF-8-encoded Unicode code
300 // points. It returns the byte index of the last occurrence in s of any of
301 // the Unicode code points in chars. It returns -1 if chars is empty or if
302 // there is no code point in common.
303 func LastIndexAny(s []byte, chars []byte) int {
304 if chars == "" {
305 // Avoid scanning all of s.
306 return -1
307 }
308 if len(s) > 8 {
309 if as, isASCII := makeASCIISet(chars); isASCII {
310 for i := len(s) - 1; i >= 0; i-- {
311 if as.contains(s[i]) {
312 return i
313 }
314 }
315 return -1
316 }
317 }
318 if len(s) == 1 {
319 r := rune(s[0])
320 if r >= utf8.RuneSelf {
321 for ci := 0; ci < len(chars); {
322 cr, cw := utf8.DecodeRune(chars[ci:])
323 if cr == utf8.RuneError {
324 return 0
325 }
326 ci += cw
327 }
328 return -1
329 }
330 if bytealg.IndexByteString(chars, s[0]) >= 0 {
331 return 0
332 }
333 return -1
334 }
335 if len(chars) == 1 {
336 cr := rune(chars[0])
337 if cr >= utf8.RuneSelf {
338 cr = utf8.RuneError
339 }
340 for i := len(s); i > 0; {
341 r, size := utf8.DecodeLastRune(s[:i])
342 i -= size
343 if r == cr {
344 return i
345 }
346 }
347 return -1
348 }
349 for i := len(s); i > 0; {
350 r := rune(s[i-1])
351 if r < utf8.RuneSelf {
352 if bytealg.IndexByteString(chars, s[i-1]) >= 0 {
353 return i - 1
354 }
355 i--
356 continue
357 }
358 r, size := utf8.DecodeLastRune(s[:i])
359 i -= size
360 if r != utf8.RuneError {
361 // r is 2 to 4 bytes
362 if len(chars) == size {
363 if Equal(chars, s[i:i+size]) {
364 return i
365 }
366 continue
367 }
368 // Use bytealg.IndexString for performance if available.
369 if bytealg.MaxLen >= size {
370 if bytealg.IndexString(chars, s[i:i+size]) >= 0 {
371 return i
372 }
373 continue
374 }
375 }
376 for ci := 0; ci < len(chars); {
377 ch, cw := utf8.DecodeRune(chars[ci:])
378 if r == ch {
379 return i
380 }
381 ci += cw
382 }
383 }
384 return -1
385 }
386 387 // Generic split: splits after each instance of sep,
388 // including sepSave bytes of sep in the subslices.
389 func genSplit(s, sep []byte, sepSave, n int) [][]byte {
390 if n == 0 {
391 return nil
392 }
393 if len(sep) == 0 {
394 return explode(s, n)
395 }
396 if n < 0 {
397 n = Count(s, sep) + 1
398 }
399 if n > len(s)+1 {
400 n = len(s) + 1
401 }
402 403 a := [][]byte{:n}
404 n--
405 i := 0
406 for i < n {
407 m := Index(s, sep)
408 if m < 0 {
409 break
410 }
411 a[i] = s[: m+sepSave : m+sepSave]
412 s = s[m+len(sep):]
413 i++
414 }
415 a[i] = s
416 return a[:i+1]
417 }
418 419 // SplitN slices s into subslices separated by sep and returns a slice of
420 // the subslices between those separators.
421 // If sep is empty, SplitN splits after each UTF-8 sequence.
422 // The count determines the number of subslices to return:
423 // - n > 0: at most n subslices; the last subslice will be the unsplit remainder;
424 // - n == 0: the result is nil (zero subslices);
425 // - n < 0: all subslices.
426 //
427 // To split around the first instance of a separator, see [Cut].
428 func SplitN(s, sep []byte, n int) [][]byte { return genSplit(s, sep, 0, n) }
429 430 // SplitAfterN slices s into subslices after each instance of sep and
431 // returns a slice of those subslices.
432 // If sep is empty, SplitAfterN splits after each UTF-8 sequence.
433 // The count determines the number of subslices to return:
434 // - n > 0: at most n subslices; the last subslice will be the unsplit remainder;
435 // - n == 0: the result is nil (zero subslices);
436 // - n < 0: all subslices.
437 func SplitAfterN(s, sep []byte, n int) [][]byte {
438 return genSplit(s, sep, len(sep), n)
439 }
440 441 // Split slices s into all subslices separated by sep and returns a slice of
442 // the subslices between those separators.
443 // If sep is empty, Split splits after each UTF-8 sequence.
444 // It is equivalent to SplitN with a count of -1.
445 //
446 // To split around the first instance of a separator, see [Cut].
447 func Split(s, sep []byte) [][]byte { return genSplit(s, sep, 0, -1) }
448 449 // SplitAfter slices s into all subslices after each instance of sep and
450 // returns a slice of those subslices.
451 // If sep is empty, SplitAfter splits after each UTF-8 sequence.
452 // It is equivalent to SplitAfterN with a count of -1.
453 func SplitAfter(s, sep []byte) [][]byte {
454 return genSplit(s, sep, len(sep), -1)
455 }
456 457 var asciiSpace = [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1}
458 459 // Fields interprets s as a sequence of UTF-8-encoded code points.
460 // It splits the slice s around each instance of one or more consecutive white space
461 // characters, as defined by [unicode.IsSpace], returning a slice of subslices of s or an
462 // empty slice if s contains only white space. Every element of the returned slice is
463 // non-empty. Unlike [Split], leading and trailing runs of white space characters
464 // are discarded.
465 func Fields(s []byte) [][]byte {
466 // First count the fields.
467 // This is an exact count if s is ASCII, otherwise it is an approximation.
468 n := 0
469 wasSpace := 1
470 // setBits is used to track which bits are set in the bytes of s.
471 setBits := uint8(0)
472 for i := 0; i < len(s); i++ {
473 r := s[i]
474 setBits |= r
475 isSpace := int(asciiSpace[r])
476 n += wasSpace & ^isSpace
477 wasSpace = isSpace
478 }
479 480 if setBits >= utf8.RuneSelf {
481 // Some runes in the input slice are not ASCII.
482 return FieldsFunc(s, unicode.IsSpace)
483 }
484 485 // ASCII fast path
486 a := [][]byte{:n}
487 na := 0
488 fieldStart := 0
489 i := 0
490 // Skip spaces in the front of the input.
491 for i < len(s) && asciiSpace[s[i]] != 0 {
492 i++
493 }
494 fieldStart = i
495 for i < len(s) {
496 if asciiSpace[s[i]] == 0 {
497 i++
498 continue
499 }
500 a[na] = s[fieldStart:i:i]
501 na++
502 i++
503 // Skip spaces in between fields.
504 for i < len(s) && asciiSpace[s[i]] != 0 {
505 i++
506 }
507 fieldStart = i
508 }
509 if fieldStart < len(s) { // Last field might end at EOF.
510 a[na] = s[fieldStart:len(s):len(s)]
511 }
512 return a
513 }
514 515 // FieldsFunc interprets s as a sequence of UTF-8-encoded code points.
516 // It splits the slice s at each run of code points c satisfying f(c) and
517 // returns a slice of subslices of s. If all code points in s satisfy f(c), or
518 // len(s) == 0, an empty slice is returned. Every element of the returned slice is
519 // non-empty. Unlike [SplitFunc], leading and trailing runs of code points
520 // satisfying f(c) are discarded.
521 //
522 // FieldsFunc makes no guarantees about the order in which it calls f(c)
523 // and assumes that f always returns the same value for a given c.
524 func FieldsFunc(s []byte, f func(rune) bool) [][]byte {
525 // A span is used to record a slice of s of the form s[start:end].
526 // The start index is inclusive and the end index is exclusive.
527 type span struct {
528 start int
529 end int
530 }
531 spans := []span{:0:32}
532 533 // Find the field start and end indices.
534 // Doing this in a separate pass (rather than slicing the string s
535 // and collecting the result substrings right away) is significantly
536 // more efficient, possibly due to cache effects.
537 start := -1 // valid span start if >= 0
538 for i := 0; i < len(s); {
539 size := 1
540 r := rune(s[i])
541 if r >= utf8.RuneSelf {
542 r, size = utf8.DecodeRune(s[i:])
543 }
544 if f(r) {
545 if start >= 0 {
546 spans = append(spans, span{start, i})
547 start = -1
548 }
549 } else {
550 if start < 0 {
551 start = i
552 }
553 }
554 i += size
555 }
556 557 // Last field might end at EOF.
558 if start >= 0 {
559 spans = append(spans, span{start, len(s)})
560 }
561 562 // Create subslices from recorded field indices.
563 a := [][]byte{:len(spans)}
564 for i, span := range spans {
565 a[i] = s[span.start:span.end:span.end]
566 }
567 568 return a
569 }
570 571 // Join concatenates the elements of s to create a new byte slice. The separator
572 // sep is placed between elements in the resulting slice.
573 func Join(s [][]byte, sep []byte) []byte {
574 if len(s) == 0 {
575 return []byte{}
576 }
577 if len(s) == 1 {
578 // Just return a copy.
579 return append([]byte(nil), s[0]...)
580 }
581 582 var n int
583 if len(sep) > 0 {
584 if len(sep) >= maxInt/(len(s)-1) {
585 panic("bytes: Join output length overflow")
586 }
587 n += len(sep) * (len(s) - 1)
588 }
589 for _, v := range s {
590 if len(v) > maxInt-n {
591 panic("bytes: Join output length overflow")
592 }
593 n += len(v)
594 }
595 596 b := bytealg.MakeNoZero(n)[:n:n]
597 bp := copy(b, s[0])
598 for _, v := range s[1:] {
599 bp += copy(b[bp:], sep)
600 bp += copy(b[bp:], v)
601 }
602 return b
603 }
604 605 // HasPrefix reports whether the byte slice s begins with prefix.
606 func HasPrefix(s, prefix []byte) bool {
607 return len(s) >= len(prefix) && Equal(s[:len(prefix)], prefix)
608 }
609 610 // HasSuffix reports whether the byte slice s ends with suffix.
611 func HasSuffix(s, suffix []byte) bool {
612 return len(s) >= len(suffix) && Equal(s[len(s)-len(suffix):], suffix)
613 }
614 615 // Map returns a copy of the byte slice s with all its characters modified
616 // according to the mapping function. If mapping returns a negative value, the character is
617 // dropped from the byte slice with no replacement. The characters in s and the
618 // output are interpreted as UTF-8-encoded code points.
619 func Map(mapping func(r rune) rune, s []byte) []byte {
620 // In the worst case, the slice can grow when mapped, making
621 // things unpleasant. But it's so rare we barge in assuming it's
622 // fine. It could also shrink but that falls out naturally.
623 b := []byte{:0:len(s)}
624 for i := 0; i < len(s); {
625 wid := 1
626 r := rune(s[i])
627 if r >= utf8.RuneSelf {
628 r, wid = utf8.DecodeRune(s[i:])
629 }
630 r = mapping(r)
631 if r >= 0 {
632 b = utf8.AppendRune(b, r)
633 }
634 i += wid
635 }
636 return b
637 }
638 639 // Despite being an exported symbol,
640 // Repeat is linknamed by widely used packages.
641 // Notable members of the hall of shame include:
642 // - gitee.com/quant1x/num
643 //
644 // Do not remove or change the type signature.
645 // See go.dev/issue/67401.
646 //
647 // Note that this comment is not part of the doc comment.
648 //
649 //go:linkname Repeat
650 651 // Repeat returns a new byte slice consisting of count copies of b.
652 //
653 // It panics if count is negative or if the result of (len(b) * count)
654 // overflows.
655 func Repeat(b []byte, count int) []byte {
656 if count == 0 {
657 return []byte{}
658 }
659 660 // Since we cannot return an error on overflow,
661 // we should panic if the repeat will generate an overflow.
662 // See golang.org/issue/16237.
663 if count < 0 {
664 panic("bytes: negative Repeat count")
665 }
666 hi, lo := bits.Mul(uint(len(b)), uint(count))
667 if hi > 0 || lo > uint(maxInt) {
668 panic("bytes: Repeat output length overflow")
669 }
670 n := int(lo) // lo = len(b) * count
671 672 if len(b) == 0 {
673 return []byte{}
674 }
675 676 // Past a certain chunk size it is counterproductive to use
677 // larger chunks as the source of the write, as when the source
678 // is too large we are basically just thrashing the CPU D-cache.
679 // So if the result length is larger than an empirically-found
680 // limit (8KB), we stop growing the source string once the limit
681 // is reached and keep reusing the same source string - that
682 // should therefore be always resident in the L1 cache - until we
683 // have completed the construction of the result.
684 // This yields significant speedups (up to +100%) in cases where
685 // the result length is large (roughly, over L2 cache size).
686 const chunkLimit = 8 * 1024
687 chunkMax := n
688 if chunkMax > chunkLimit {
689 chunkMax = chunkLimit / len(b) * len(b)
690 if chunkMax == 0 {
691 chunkMax = len(b)
692 }
693 }
694 nb := bytealg.MakeNoZero(n)[:n:n]
695 bp := copy(nb, b)
696 for bp < n {
697 chunk := min(bp, chunkMax)
698 bp += copy(nb[bp:], nb[:chunk])
699 }
700 return nb
701 }
702 703 // ToUpper returns a copy of the byte slice s with all Unicode letters mapped to
704 // their upper case.
705 func ToUpper(s []byte) []byte {
706 isASCII, hasLower := true, false
707 for i := 0; i < len(s); i++ {
708 c := s[i]
709 if c >= utf8.RuneSelf {
710 isASCII = false
711 break
712 }
713 hasLower = hasLower || ('a' <= c && c <= 'z')
714 }
715 716 if isASCII { // optimize for ASCII-only byte slices.
717 if !hasLower {
718 // Just return a copy.
719 return append([]byte(""), s...)
720 }
721 b := bytealg.MakeNoZero(len(s))[:len(s):len(s)]
722 for i := 0; i < len(s); i++ {
723 c := s[i]
724 if 'a' <= c && c <= 'z' {
725 c -= 'a' - 'A'
726 }
727 b[i] = c
728 }
729 return b
730 }
731 return Map(unicode.ToUpper, s)
732 }
733 734 // ToLower returns a copy of the byte slice s with all Unicode letters mapped to
735 // their lower case.
736 func ToLower(s []byte) []byte {
737 isASCII, hasUpper := true, false
738 for i := 0; i < len(s); i++ {
739 c := s[i]
740 if c >= utf8.RuneSelf {
741 isASCII = false
742 break
743 }
744 hasUpper = hasUpper || ('A' <= c && c <= 'Z')
745 }
746 747 if isASCII { // optimize for ASCII-only byte slices.
748 if !hasUpper {
749 return append([]byte(""), s...)
750 }
751 b := bytealg.MakeNoZero(len(s))[:len(s):len(s)]
752 for i := 0; i < len(s); i++ {
753 c := s[i]
754 if 'A' <= c && c <= 'Z' {
755 c += 'a' - 'A'
756 }
757 b[i] = c
758 }
759 return b
760 }
761 return Map(unicode.ToLower, s)
762 }
763 764 // ToTitle treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their title case.
765 func ToTitle(s []byte) []byte { return Map(unicode.ToTitle, s) }
766 767 // ToUpperSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their
768 // upper case, giving priority to the special casing rules.
769 func ToUpperSpecial(c unicode.SpecialCase, s []byte) []byte {
770 return Map(c.ToUpper, s)
771 }
772 773 // ToLowerSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their
774 // lower case, giving priority to the special casing rules.
775 func ToLowerSpecial(c unicode.SpecialCase, s []byte) []byte {
776 return Map(c.ToLower, s)
777 }
778 779 // ToTitleSpecial treats s as UTF-8-encoded bytes and returns a copy with all the Unicode letters mapped to their
780 // title case, giving priority to the special casing rules.
781 func ToTitleSpecial(c unicode.SpecialCase, s []byte) []byte {
782 return Map(c.ToTitle, s)
783 }
784 785 // ToValidUTF8 treats s as UTF-8-encoded bytes and returns a copy with each run of bytes
786 // representing invalid UTF-8 replaced with the bytes in replacement, which may be empty.
787 func ToValidUTF8(s, replacement []byte) []byte {
788 b := []byte{:0:len(s)+len(replacement)}
789 invalid := false // previous byte was from an invalid UTF-8 sequence
790 for i := 0; i < len(s); {
791 c := s[i]
792 if c < utf8.RuneSelf {
793 i++
794 invalid = false
795 b = append(b, c)
796 continue
797 }
798 _, wid := utf8.DecodeRune(s[i:])
799 if wid == 1 {
800 i++
801 if !invalid {
802 invalid = true
803 b = append(b, replacement...)
804 }
805 continue
806 }
807 invalid = false
808 b = append(b, s[i:i+wid]...)
809 i += wid
810 }
811 return b
812 }
813 814 // isSeparator reports whether the rune could mark a word boundary.
815 // TODO: update when package unicode captures more of the properties.
816 func isSeparator(r rune) bool {
817 // ASCII alphanumerics and underscore are not separators
818 if r <= 0x7F {
819 switch {
820 case '0' <= r && r <= '9':
821 return false
822 case 'a' <= r && r <= 'z':
823 return false
824 case 'A' <= r && r <= 'Z':
825 return false
826 case r == '_':
827 return false
828 }
829 return true
830 }
831 // Letters and digits are not separators
832 if unicode.IsLetter(r) || unicode.IsDigit(r) {
833 return false
834 }
835 // Otherwise, all we can do for now is treat spaces as separators.
836 return unicode.IsSpace(r)
837 }
838 839 // Title treats s as UTF-8-encoded bytes and returns a copy with all Unicode letters that begin
840 // words mapped to their title case.
841 //
842 // Deprecated: The rule Title uses for word boundaries does not handle Unicode
843 // punctuation properly. Use golang.org/x/text/cases instead.
844 func Title(s []byte) []byte {
845 // Use a closure here to remember state.
846 // Hackish but effective. Depends on Map scanning in order and calling
847 // the closure once per rune.
848 prev := ' '
849 return Map(
850 func(r rune) rune {
851 if isSeparator(prev) {
852 prev = r
853 return unicode.ToTitle(r)
854 }
855 prev = r
856 return r
857 },
858 s)
859 }
860 861 // TrimLeftFunc treats s as UTF-8-encoded bytes and returns a subslice of s by slicing off
862 // all leading UTF-8-encoded code points c that satisfy f(c).
863 func TrimLeftFunc(s []byte, f func(r rune) bool) []byte {
864 i := indexFunc(s, f, false)
865 if i == -1 {
866 return nil
867 }
868 return s[i:]
869 }
870 871 // TrimRightFunc returns a subslice of s by slicing off all trailing
872 // UTF-8-encoded code points c that satisfy f(c).
873 func TrimRightFunc(s []byte, f func(r rune) bool) []byte {
874 i := lastIndexFunc(s, f, false)
875 if i >= 0 && s[i] >= utf8.RuneSelf {
876 _, wid := utf8.DecodeRune(s[i:])
877 i += wid
878 } else {
879 i++
880 }
881 return s[0:i]
882 }
883 884 // TrimFunc returns a subslice of s by slicing off all leading and trailing
885 // UTF-8-encoded code points c that satisfy f(c).
886 func TrimFunc(s []byte, f func(r rune) bool) []byte {
887 return TrimRightFunc(TrimLeftFunc(s, f), f)
888 }
889 890 // TrimPrefix returns s without the provided leading prefix string.
891 // If s doesn't start with prefix, s is returned unchanged.
892 func TrimPrefix(s, prefix []byte) []byte {
893 if HasPrefix(s, prefix) {
894 return s[len(prefix):]
895 }
896 return s
897 }
898 899 // TrimSuffix returns s without the provided trailing suffix string.
900 // If s doesn't end with suffix, s is returned unchanged.
901 func TrimSuffix(s, suffix []byte) []byte {
902 if HasSuffix(s, suffix) {
903 return s[:len(s)-len(suffix)]
904 }
905 return s
906 }
907 908 // IndexFunc interprets s as a sequence of UTF-8-encoded code points.
909 // It returns the byte index in s of the first Unicode
910 // code point satisfying f(c), or -1 if none do.
911 func IndexFunc(s []byte, f func(r rune) bool) int {
912 return indexFunc(s, f, true)
913 }
914 915 // LastIndexFunc interprets s as a sequence of UTF-8-encoded code points.
916 // It returns the byte index in s of the last Unicode
917 // code point satisfying f(c), or -1 if none do.
918 func LastIndexFunc(s []byte, f func(r rune) bool) int {
919 return lastIndexFunc(s, f, true)
920 }
921 922 // indexFunc is the same as IndexFunc except that if
923 // truth==false, the sense of the predicate function is
924 // inverted.
925 func indexFunc(s []byte, f func(r rune) bool, truth bool) int {
926 start := 0
927 for start < len(s) {
928 wid := 1
929 r := rune(s[start])
930 if r >= utf8.RuneSelf {
931 r, wid = utf8.DecodeRune(s[start:])
932 }
933 if f(r) == truth {
934 return start
935 }
936 start += wid
937 }
938 return -1
939 }
940 941 // lastIndexFunc is the same as LastIndexFunc except that if
942 // truth==false, the sense of the predicate function is
943 // inverted.
944 func lastIndexFunc(s []byte, f func(r rune) bool, truth bool) int {
945 for i := len(s); i > 0; {
946 r, size := rune(s[i-1]), 1
947 if r >= utf8.RuneSelf {
948 r, size = utf8.DecodeLastRune(s[0:i])
949 }
950 i -= size
951 if f(r) == truth {
952 return i
953 }
954 }
955 return -1
956 }
957 958 // asciiSet is a 32-byte value, where each bit represents the presence of a
959 // given ASCII character in the set. The 128-bits of the lower 16 bytes,
960 // starting with the least-significant bit of the lowest word to the
961 // most-significant bit of the highest word, map to the full range of all
962 // 128 ASCII characters. The 128-bits of the upper 16 bytes will be zeroed,
963 // ensuring that any non-ASCII character will be reported as not in the set.
964 // This allocates a total of 32 bytes even though the upper half
965 // is unused to avoid bounds checks in asciiSet.contains.
966 type asciiSet [8]uint32
967 968 // makeASCIISet creates a set of ASCII characters and reports whether all
969 // characters in chars are ASCII.
970 func makeASCIISet(chars []byte) (as asciiSet, ok bool) {
971 for i := 0; i < len(chars); i++ {
972 c := chars[i]
973 if c >= utf8.RuneSelf {
974 return as, false
975 }
976 as[c/32] |= 1 << (c % 32)
977 }
978 return as, true
979 }
980 981 // contains reports whether c is inside the set.
982 func (as *asciiSet) contains(c byte) bool {
983 return (as[c/32] & (1 << (c % 32))) != 0
984 }
985 986 // containsRune is a simplified version of strings.ContainsRune
987 // to avoid importing the strings package.
988 // We avoid bytes.ContainsRune to avoid allocating a temporary copy of s.
989 func containsRune(s []byte, r rune) bool {
990 for i := 0; i < len(s); {
991 c, w := utf8.DecodeRune(s[i:])
992 if c == r {
993 return true
994 }
995 i += w
996 }
997 return false
998 }
999 1000 // Trim returns a subslice of s by slicing off all leading and
1001 // trailing UTF-8-encoded code points contained in cutset.
1002 func Trim(s []byte, cutset []byte) []byte {
1003 if len(s) == 0 {
1004 // This is what we've historically done.
1005 return nil
1006 }
1007 if cutset == "" {
1008 return s
1009 }
1010 if len(cutset) == 1 && cutset[0] < utf8.RuneSelf {
1011 return trimLeftByte(trimRightByte(s, cutset[0]), cutset[0])
1012 }
1013 if as, ok := makeASCIISet(cutset); ok {
1014 return trimLeftASCII(trimRightASCII(s, &as), &as)
1015 }
1016 return trimLeftUnicode(trimRightUnicode(s, cutset), cutset)
1017 }
1018 1019 // TrimLeft returns a subslice of s by slicing off all leading
1020 // UTF-8-encoded code points contained in cutset.
1021 func TrimLeft(s []byte, cutset []byte) []byte {
1022 if len(s) == 0 {
1023 // This is what we've historically done.
1024 return nil
1025 }
1026 if cutset == "" {
1027 return s
1028 }
1029 if len(cutset) == 1 && cutset[0] < utf8.RuneSelf {
1030 return trimLeftByte(s, cutset[0])
1031 }
1032 if as, ok := makeASCIISet(cutset); ok {
1033 return trimLeftASCII(s, &as)
1034 }
1035 return trimLeftUnicode(s, cutset)
1036 }
1037 1038 func trimLeftByte(s []byte, c byte) []byte {
1039 for len(s) > 0 && s[0] == c {
1040 s = s[1:]
1041 }
1042 if len(s) == 0 {
1043 // This is what we've historically done.
1044 return nil
1045 }
1046 return s
1047 }
1048 1049 func trimLeftASCII(s []byte, as *asciiSet) []byte {
1050 for len(s) > 0 {
1051 if !as.contains(s[0]) {
1052 break
1053 }
1054 s = s[1:]
1055 }
1056 if len(s) == 0 {
1057 // This is what we've historically done.
1058 return nil
1059 }
1060 return s
1061 }
1062 1063 func trimLeftUnicode(s []byte, cutset []byte) []byte {
1064 for len(s) > 0 {
1065 r, n := rune(s[0]), 1
1066 if r >= utf8.RuneSelf {
1067 r, n = utf8.DecodeRune(s)
1068 }
1069 if !containsRune(cutset, r) {
1070 break
1071 }
1072 s = s[n:]
1073 }
1074 if len(s) == 0 {
1075 // This is what we've historically done.
1076 return nil
1077 }
1078 return s
1079 }
1080 1081 // TrimRight returns a subslice of s by slicing off all trailing
1082 // UTF-8-encoded code points that are contained in cutset.
1083 func TrimRight(s []byte, cutset []byte) []byte {
1084 if len(s) == 0 || cutset == "" {
1085 return s
1086 }
1087 if len(cutset) == 1 && cutset[0] < utf8.RuneSelf {
1088 return trimRightByte(s, cutset[0])
1089 }
1090 if as, ok := makeASCIISet(cutset); ok {
1091 return trimRightASCII(s, &as)
1092 }
1093 return trimRightUnicode(s, cutset)
1094 }
1095 1096 func trimRightByte(s []byte, c byte) []byte {
1097 for len(s) > 0 && s[len(s)-1] == c {
1098 s = s[:len(s)-1]
1099 }
1100 return s
1101 }
1102 1103 func trimRightASCII(s []byte, as *asciiSet) []byte {
1104 for len(s) > 0 {
1105 if !as.contains(s[len(s)-1]) {
1106 break
1107 }
1108 s = s[:len(s)-1]
1109 }
1110 return s
1111 }
1112 1113 func trimRightUnicode(s []byte, cutset []byte) []byte {
1114 for len(s) > 0 {
1115 r, n := rune(s[len(s)-1]), 1
1116 if r >= utf8.RuneSelf {
1117 r, n = utf8.DecodeLastRune(s)
1118 }
1119 if !containsRune(cutset, r) {
1120 break
1121 }
1122 s = s[:len(s)-n]
1123 }
1124 return s
1125 }
1126 1127 // TrimSpace returns a subslice of s by slicing off all leading and
1128 // trailing white space, as defined by Unicode.
1129 func TrimSpace(s []byte) []byte {
1130 // Fast path for ASCII: look for the first ASCII non-space byte
1131 start := 0
1132 for ; start < len(s); start++ {
1133 c := s[start]
1134 if c >= utf8.RuneSelf {
1135 // If we run into a non-ASCII byte, fall back to the
1136 // slower unicode-aware method on the remaining bytes
1137 return TrimFunc(s[start:], unicode.IsSpace)
1138 }
1139 if asciiSpace[c] == 0 {
1140 break
1141 }
1142 }
1143 1144 // Now look for the first ASCII non-space byte from the end
1145 stop := len(s)
1146 for ; stop > start; stop-- {
1147 c := s[stop-1]
1148 if c >= utf8.RuneSelf {
1149 return TrimFunc(s[start:stop], unicode.IsSpace)
1150 }
1151 if asciiSpace[c] == 0 {
1152 break
1153 }
1154 }
1155 1156 // At this point s[start:stop] starts and ends with an ASCII
1157 // non-space bytes, so we're done. Non-ASCII cases have already
1158 // been handled above.
1159 if start == stop {
1160 // Special case to preserve previous TrimLeftFunc behavior,
1161 // returning nil instead of empty slice if all spaces.
1162 return nil
1163 }
1164 return s[start:stop]
1165 }
1166 1167 // Runes interprets s as a sequence of UTF-8-encoded code points.
1168 // It returns a slice of runes (Unicode code points) equivalent to s.
1169 func Runes(s []byte) []rune {
1170 t := []rune{:utf8.RuneCount(s)}
1171 i := 0
1172 for len(s) > 0 {
1173 r, l := utf8.DecodeRune(s)
1174 t[i] = r
1175 i++
1176 s = s[l:]
1177 }
1178 return t
1179 }
1180 1181 // Replace returns a copy of the slice s with the first n
1182 // non-overlapping instances of old replaced by new.
1183 // If old is empty, it matches at the beginning of the slice
1184 // and after each UTF-8 sequence, yielding up to k+1 replacements
1185 // for a k-rune slice.
1186 // If n < 0, there is no limit on the number of replacements.
1187 func Replace(s, old, new []byte, n int) []byte {
1188 m := 0
1189 if n != 0 {
1190 // Compute number of replacements.
1191 m = Count(s, old)
1192 }
1193 if m == 0 {
1194 // Just return a copy.
1195 return append([]byte(nil), s...)
1196 }
1197 if n < 0 || m < n {
1198 n = m
1199 }
1200 1201 // Apply replacements to buffer.
1202 t := []byte{:len(s)+n*(len(new)-len(old))}
1203 w := 0
1204 start := 0
1205 if len(old) > 0 {
1206 for range n {
1207 j := start + Index(s[start:], old)
1208 w += copy(t[w:], s[start:j])
1209 w += copy(t[w:], new)
1210 start = j + len(old)
1211 }
1212 } else { // len(old) == 0
1213 w += copy(t[w:], new)
1214 for range n - 1 {
1215 _, wid := utf8.DecodeRune(s[start:])
1216 j := start + wid
1217 w += copy(t[w:], s[start:j])
1218 w += copy(t[w:], new)
1219 start = j
1220 }
1221 }
1222 w += copy(t[w:], s[start:])
1223 return t[0:w]
1224 }
1225 1226 // ReplaceAll returns a copy of the slice s with all
1227 // non-overlapping instances of old replaced by new.
1228 // If old is empty, it matches at the beginning of the slice
1229 // and after each UTF-8 sequence, yielding up to k+1 replacements
1230 // for a k-rune slice.
1231 func ReplaceAll(s, old, new []byte) []byte {
1232 return Replace(s, old, new, -1)
1233 }
1234 1235 // EqualFold reports whether s and t, interpreted as UTF-8 strings,
1236 // are equal under simple Unicode case-folding, which is a more general
1237 // form of case-insensitivity.
1238 func EqualFold(s, t []byte) bool {
1239 // ASCII fast path
1240 i := 0
1241 for n := min(len(s), len(t)); i < n; i++ {
1242 sr := s[i]
1243 tr := t[i]
1244 if sr|tr >= utf8.RuneSelf {
1245 goto hasUnicode
1246 }
1247 1248 // Easy case.
1249 if tr == sr {
1250 continue
1251 }
1252 1253 // Make sr < tr to simplify what follows.
1254 if tr < sr {
1255 tr, sr = sr, tr
1256 }
1257 // ASCII only, sr/tr must be upper/lower case
1258 if 'A' <= sr && sr <= 'Z' && tr == sr+'a'-'A' {
1259 continue
1260 }
1261 return false
1262 }
1263 // Check if we've exhausted both strings.
1264 return len(s) == len(t)
1265 1266 hasUnicode:
1267 s = s[i:]
1268 t = t[i:]
1269 for len(s) != 0 && len(t) != 0 {
1270 // Extract first rune from each.
1271 var sr, tr rune
1272 if s[0] < utf8.RuneSelf {
1273 sr, s = rune(s[0]), s[1:]
1274 } else {
1275 r, size := utf8.DecodeRune(s)
1276 sr, s = r, s[size:]
1277 }
1278 if t[0] < utf8.RuneSelf {
1279 tr, t = rune(t[0]), t[1:]
1280 } else {
1281 r, size := utf8.DecodeRune(t)
1282 tr, t = r, t[size:]
1283 }
1284 1285 // If they match, keep going; if not, return false.
1286 1287 // Easy case.
1288 if tr == sr {
1289 continue
1290 }
1291 1292 // Make sr < tr to simplify what follows.
1293 if tr < sr {
1294 tr, sr = sr, tr
1295 }
1296 // Fast check for ASCII.
1297 if tr < utf8.RuneSelf {
1298 // ASCII only, sr/tr must be upper/lower case
1299 if 'A' <= sr && sr <= 'Z' && tr == sr+'a'-'A' {
1300 continue
1301 }
1302 return false
1303 }
1304 1305 // General case. SimpleFold(x) returns the next equivalent rune > x
1306 // or wraps around to smaller values.
1307 r := unicode.SimpleFold(sr)
1308 for r != sr && r < tr {
1309 r = unicode.SimpleFold(r)
1310 }
1311 if r == tr {
1312 continue
1313 }
1314 return false
1315 }
1316 1317 // One string is empty. Are both?
1318 return len(s) == len(t)
1319 }
1320 1321 // Index returns the index of the first instance of sep in s, or -1 if sep is not present in s.
1322 func Index(s, sep []byte) int {
1323 n := len(sep)
1324 switch {
1325 case n == 0:
1326 return 0
1327 case n == 1:
1328 return IndexByte(s, sep[0])
1329 case n == len(s):
1330 if Equal(sep, s) {
1331 return 0
1332 }
1333 return -1
1334 case n > len(s):
1335 return -1
1336 case n <= bytealg.MaxLen:
1337 // Use brute force when s and sep both are small
1338 if len(s) <= bytealg.MaxBruteForce {
1339 return bytealg.Index(s, sep)
1340 }
1341 c0 := sep[0]
1342 c1 := sep[1]
1343 i := 0
1344 t := len(s) - n + 1
1345 fails := 0
1346 for i < t {
1347 if s[i] != c0 {
1348 // IndexByte is faster than bytealg.Index, so use it as long as
1349 // we're not getting lots of false positives.
1350 o := IndexByte(s[i+1:t], c0)
1351 if o < 0 {
1352 return -1
1353 }
1354 i += o + 1
1355 }
1356 if s[i+1] == c1 && Equal(s[i:i+n], sep) {
1357 return i
1358 }
1359 fails++
1360 i++
1361 // Switch to bytealg.Index when IndexByte produces too many false positives.
1362 if fails > bytealg.Cutover(i) {
1363 r := bytealg.Index(s[i:], sep)
1364 if r >= 0 {
1365 return r + i
1366 }
1367 return -1
1368 }
1369 }
1370 return -1
1371 }
1372 c0 := sep[0]
1373 c1 := sep[1]
1374 i := 0
1375 fails := 0
1376 t := len(s) - n + 1
1377 for i < t {
1378 if s[i] != c0 {
1379 o := IndexByte(s[i+1:t], c0)
1380 if o < 0 {
1381 break
1382 }
1383 i += o + 1
1384 }
1385 if s[i+1] == c1 && Equal(s[i:i+n], sep) {
1386 return i
1387 }
1388 i++
1389 fails++
1390 if fails >= 4+i>>4 && i < t {
1391 // Give up on IndexByte, it isn't skipping ahead
1392 // far enough to be better than Rabin-Karp.
1393 // Experiments (using IndexPeriodic) suggest
1394 // the cutover is about 16 byte skips.
1395 // TODO: if large prefixes of sep are matching
1396 // we should cutover at even larger average skips,
1397 // because Equal becomes that much more expensive.
1398 // This code does not take that effect into account.
1399 j := bytealg.IndexRabinKarp(s[i:], sep)
1400 if j < 0 {
1401 return -1
1402 }
1403 return i + j
1404 }
1405 }
1406 return -1
1407 }
1408 1409 // Cut slices s around the first instance of sep,
1410 // returning the text before and after sep.
1411 // The found result reports whether sep appears in s.
1412 // If sep does not appear in s, cut returns s, nil, false.
1413 //
1414 // Cut returns slices of the original slice s, not copies.
1415 func Cut(s, sep []byte) (before, after []byte, found bool) {
1416 if i := Index(s, sep); i >= 0 {
1417 return s[:i], s[i+len(sep):], true
1418 }
1419 return s, nil, false
1420 }
1421 1422 // Clone returns a copy of b[:len(b)].
1423 // The result may have additional unused capacity.
1424 // Clone(nil) returns nil.
1425 func Clone(b []byte) []byte {
1426 if b == nil {
1427 return nil
1428 }
1429 return append([]byte{}, b...)
1430 }
1431 1432 // CutPrefix returns s without the provided leading prefix byte slice
1433 // and reports whether it found the prefix.
1434 // If s doesn't start with prefix, CutPrefix returns s, false.
1435 // If prefix is the empty byte slice, CutPrefix returns s, true.
1436 //
1437 // CutPrefix returns slices of the original slice s, not copies.
1438 func CutPrefix(s, prefix []byte) (after []byte, found bool) {
1439 if !HasPrefix(s, prefix) {
1440 return s, false
1441 }
1442 return s[len(prefix):], true
1443 }
1444 1445 // CutSuffix returns s without the provided ending suffix byte slice
1446 // and reports whether it found the suffix.
1447 // If s doesn't end with suffix, CutSuffix returns s, false.
1448 // If suffix is the empty byte slice, CutSuffix returns s, true.
1449 //
1450 // CutSuffix returns slices of the original slice s, not copies.
1451 func CutSuffix(s, suffix []byte) (before []byte, found bool) {
1452 if !HasSuffix(s, suffix) {
1453 return s, false
1454 }
1455 return s[:len(s)-len(suffix)], true
1456 }
1457