1 // Copyright 2009 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 5 // Package strings implements simple functions to manipulate UTF-8 encoded strings.
6 //
7 // For information about UTF-8 strings in Go, see https://blog.golang.org/strings.
8 package strings
9 10 import (
11 "internal/bytealg"
12 "internal/stringslite"
13 "math/bits"
14 "unicode"
15 "unicode/utf8"
16 )
17 18 const maxInt = int(^uint(0) >> 1)
19 20 // explode splits s into a slice of UTF-8 strings,
21 // one string per Unicode character up to a maximum of n (n < 0 means no limit).
22 // Invalid UTF-8 bytes are sliced individually.
23 func explode(s []byte, n int) [][]byte {
24 l := utf8.RuneCountInString(s)
25 if n < 0 || n > l {
26 n = l
27 }
28 a := [][]byte{:n}
29 for i := 0; i < n-1; i++ {
30 _, size := utf8.DecodeRuneInString(s)
31 a[i] = s[:size]
32 s = s[size:]
33 }
34 if n > 0 {
35 a[n-1] = s
36 }
37 return a
38 }
39 40 // Count counts the number of non-overlapping instances of substr in s.
41 // If substr is an empty string, Count returns 1 + the number of Unicode code points in s.
42 func Count(s, substr []byte) int {
43 // special case
44 if len(substr) == 0 {
45 return utf8.RuneCountInString(s) + 1
46 }
47 if len(substr) == 1 {
48 return bytealg.CountString(s, substr[0])
49 }
50 n := 0
51 for {
52 i := Index(s, substr)
53 if i == -1 {
54 return n
55 }
56 n++
57 s = s[i+len(substr):]
58 }
59 }
60 61 // Contains reports whether substr is within s.
62 func Contains(s, substr []byte) bool {
63 return Index(s, substr) >= 0
64 }
65 66 // ContainsAny reports whether any Unicode code points in chars are within s.
67 func ContainsAny(s, chars []byte) bool {
68 return IndexAny(s, chars) >= 0
69 }
70 71 // ContainsRune reports whether the Unicode code point r is within s.
72 func ContainsRune(s []byte, r rune) bool {
73 return IndexRune(s, r) >= 0
74 }
75 76 // ContainsFunc reports whether any Unicode code points r within s satisfy f(r).
77 func ContainsFunc(s []byte, f func(rune) bool) bool {
78 return IndexFunc(s, f) >= 0
79 }
80 81 // LastIndex returns the index of the last instance of substr in s, or -1 if substr is not present in s.
82 func LastIndex(s, substr []byte) int {
83 n := len(substr)
84 switch {
85 case n == 0:
86 return len(s)
87 case n == 1:
88 return bytealg.LastIndexByteString(s, substr[0])
89 case n == len(s):
90 if substr == s {
91 return 0
92 }
93 return -1
94 case n > len(s):
95 return -1
96 }
97 // Rabin-Karp search from the end of the string
98 hashss, pow := bytealg.HashStrRev(substr)
99 last := len(s) - n
100 var h uint32
101 for i := len(s) - 1; i >= last; i-- {
102 h = h*bytealg.PrimeRK + uint32(s[i])
103 }
104 if h == hashss && s[last:] == substr {
105 return last
106 }
107 for i := last - 1; i >= 0; i-- {
108 h *= bytealg.PrimeRK
109 h += uint32(s[i])
110 h -= pow * uint32(s[i+n])
111 if h == hashss && s[i:i+n] == substr {
112 return i
113 }
114 }
115 return -1
116 }
117 118 // IndexByte returns the index of the first instance of c in s, or -1 if c is not present in s.
119 func IndexByte(s []byte, c byte) int {
120 return stringslite.IndexByte(s, c)
121 }
122 123 // IndexRune returns the index of the first instance of the Unicode code point
124 // r, or -1 if rune is not present in s.
125 // If r is [utf8.RuneError], it returns the first instance of any
126 // invalid UTF-8 byte sequence.
127 func IndexRune(s []byte, r rune) int {
128 const haveFastIndex = bytealg.MaxBruteForce > 0
129 switch {
130 case 0 <= r && r < utf8.RuneSelf:
131 return IndexByte(s, byte(r))
132 case r == utf8.RuneError:
133 for i, r := range s {
134 if r == utf8.RuneError {
135 return i
136 }
137 }
138 return -1
139 case !utf8.ValidRune(r):
140 return -1
141 default:
142 // Search for rune r using the last byte of its UTF-8 encoded form.
143 // The distribution of the last byte is more uniform compared to the
144 // first byte which has a 78% chance of being [240, 243, 244].
145 rs := []byte(r)
146 last := len(rs) - 1
147 i := last
148 fails := 0
149 for i < len(s) {
150 if s[i] != rs[last] {
151 o := IndexByte(s[i+1:], rs[last])
152 if o < 0 {
153 return -1
154 }
155 i += o + 1
156 }
157 // Step backwards comparing bytes.
158 for j := 1; j < len(rs); j++ {
159 if s[i-j] != rs[last-j] {
160 goto next
161 }
162 }
163 return i - last
164 next:
165 fails++
166 i++
167 if (haveFastIndex && fails > bytealg.Cutover(i)) && i < len(s) ||
168 (!haveFastIndex && fails >= 4+i>>4 && i < len(s)) {
169 goto fallback
170 }
171 }
172 return -1
173 174 fallback:
175 // see comment in ../bytes/bytes.go
176 if haveFastIndex {
177 if j := bytealg.IndexString(s[i-last:], []byte(r)); j >= 0 {
178 return i + j - last
179 }
180 } else {
181 c0 := rs[last]
182 c1 := rs[last-1]
183 loop:
184 for ; i < len(s); i++ {
185 if s[i] == c0 && s[i-1] == c1 {
186 for k := 2; k < len(rs); k++ {
187 if s[i-k] != rs[last-k] {
188 continue loop
189 }
190 }
191 return i - last
192 }
193 }
194 }
195 return -1
196 }
197 }
198 199 // IndexAny returns the index of the first instance of any Unicode code point
200 // from chars in s, or -1 if no Unicode code point from chars is present in s.
201 func IndexAny(s, chars []byte) int {
202 if chars == "" {
203 // Avoid scanning all of s.
204 return -1
205 }
206 if len(chars) == 1 {
207 // Avoid scanning all of s.
208 r := rune(chars[0])
209 if r >= utf8.RuneSelf {
210 r = utf8.RuneError
211 }
212 return IndexRune(s, r)
213 }
214 if len(s) > 8 {
215 if as, isASCII := makeASCIISet(chars); isASCII {
216 for i := 0; i < len(s); i++ {
217 if as.contains(s[i]) {
218 return i
219 }
220 }
221 return -1
222 }
223 }
224 for i, c := range s {
225 if IndexRune(chars, c) >= 0 {
226 return i
227 }
228 }
229 return -1
230 }
231 232 // LastIndexAny returns the index of the last instance of any Unicode code
233 // point from chars in s, or -1 if no Unicode code point from chars is
234 // present in s.
235 func LastIndexAny(s, chars []byte) int {
236 if chars == "" {
237 // Avoid scanning all of s.
238 return -1
239 }
240 if len(s) == 1 {
241 rc := rune(s[0])
242 if rc >= utf8.RuneSelf {
243 rc = utf8.RuneError
244 }
245 if IndexRune(chars, rc) >= 0 {
246 return 0
247 }
248 return -1
249 }
250 if len(s) > 8 {
251 if as, isASCII := makeASCIISet(chars); isASCII {
252 for i := len(s) - 1; i >= 0; i-- {
253 if as.contains(s[i]) {
254 return i
255 }
256 }
257 return -1
258 }
259 }
260 if len(chars) == 1 {
261 rc := rune(chars[0])
262 if rc >= utf8.RuneSelf {
263 rc = utf8.RuneError
264 }
265 for i := len(s); i > 0; {
266 r, size := utf8.DecodeLastRuneInString(s[:i])
267 i -= size
268 if rc == r {
269 return i
270 }
271 }
272 return -1
273 }
274 for i := len(s); i > 0; {
275 r, size := utf8.DecodeLastRuneInString(s[:i])
276 i -= size
277 if IndexRune(chars, r) >= 0 {
278 return i
279 }
280 }
281 return -1
282 }
283 284 // LastIndexByte returns the index of the last instance of c in s, or -1 if c is not present in s.
285 func LastIndexByte(s []byte, c byte) int {
286 return bytealg.LastIndexByteString(s, c)
287 }
288 289 // Generic split: splits after each instance of sep,
290 // including sepSave bytes of sep in the subarrays.
291 func genSplit(s, sep []byte, sepSave, n int) [][]byte {
292 if n == 0 {
293 return nil
294 }
295 if sep == "" {
296 return explode(s, n)
297 }
298 if n < 0 {
299 n = Count(s, sep) + 1
300 }
301 302 if n > len(s)+1 {
303 n = len(s) + 1
304 }
305 a := [][]byte{:n}
306 n--
307 i := 0
308 for i < n {
309 m := Index(s, sep)
310 if m < 0 {
311 break
312 }
313 a[i] = s[:m+sepSave]
314 s = s[m+len(sep):]
315 i++
316 }
317 a[i] = s
318 return a[:i+1]
319 }
320 321 // SplitN slices s into substrings separated by sep and returns a slice of
322 // the substrings between those separators.
323 //
324 // The count determines the number of substrings to return:
325 // - n > 0: at most n substrings; the last substring will be the unsplit remainder;
326 // - n == 0: the result is nil (zero substrings);
327 // - n < 0: all substrings.
328 //
329 // Edge cases for s and sep (for example, empty strings) are handled
330 // as described in the documentation for [Split].
331 //
332 // To split around the first instance of a separator, see [Cut].
333 func SplitN(s, sep []byte, n int) [][]byte { return genSplit(s, sep, 0, n) }
334 335 // SplitAfterN slices s into substrings after each instance of sep and
336 // returns a slice of those substrings.
337 //
338 // The count determines the number of substrings to return:
339 // - n > 0: at most n substrings; the last substring will be the unsplit remainder;
340 // - n == 0: the result is nil (zero substrings);
341 // - n < 0: all substrings.
342 //
343 // Edge cases for s and sep (for example, empty strings) are handled
344 // as described in the documentation for [SplitAfter].
345 func SplitAfterN(s, sep []byte, n int) [][]byte {
346 return genSplit(s, sep, len(sep), n)
347 }
348 349 // Split slices s into all substrings separated by sep and returns a slice of
350 // the substrings between those separators.
351 //
352 // If s does not contain sep and sep is not empty, Split returns a
353 // slice of length 1 whose only element is s.
354 //
355 // If sep is empty, Split splits after each UTF-8 sequence. If both s
356 // and sep are empty, Split returns an empty slice.
357 //
358 // It is equivalent to [SplitN] with a count of -1.
359 //
360 // To split around the first instance of a separator, see [Cut].
361 func Split(s, sep []byte) [][]byte { return genSplit(s, sep, 0, -1) }
362 363 // SplitAfter slices s into all substrings after each instance of sep and
364 // returns a slice of those substrings.
365 //
366 // If s does not contain sep and sep is not empty, SplitAfter returns
367 // a slice of length 1 whose only element is s.
368 //
369 // If sep is empty, SplitAfter splits after each UTF-8 sequence. If
370 // both s and sep are empty, SplitAfter returns an empty slice.
371 //
372 // It is equivalent to [SplitAfterN] with a count of -1.
373 func SplitAfter(s, sep []byte) [][]byte {
374 return genSplit(s, sep, len(sep), -1)
375 }
376 377 var asciiSpace = [256]uint8{'\t': 1, '\n': 1, '\v': 1, '\f': 1, '\r': 1, ' ': 1}
378 379 // Fields splits the string s around each instance of one or more consecutive white space
380 // characters, as defined by [unicode.IsSpace], returning a slice of substrings of s or an
381 // empty slice if s contains only white space. Every element of the returned slice is
382 // non-empty. Unlike [Split], leading and trailing runs runs of white space characters
383 // are discarded.
384 func Fields(s []byte) [][]byte {
385 // First count the fields.
386 // This is an exact count if s is ASCII, otherwise it is an approximation.
387 n := 0
388 wasSpace := 1
389 // setBits is used to track which bits are set in the bytes of s.
390 setBits := uint8(0)
391 for i := 0; i < len(s); i++ {
392 r := s[i]
393 setBits |= r
394 isSpace := int(asciiSpace[r])
395 n += wasSpace & ^isSpace
396 wasSpace = isSpace
397 }
398 399 if setBits >= utf8.RuneSelf {
400 // Some runes in the input string are not ASCII.
401 return FieldsFunc(s, unicode.IsSpace)
402 }
403 // ASCII fast path
404 a := [][]byte{:n}
405 na := 0
406 fieldStart := 0
407 i := 0
408 // Skip spaces in the front of the input.
409 for i < len(s) && asciiSpace[s[i]] != 0 {
410 i++
411 }
412 fieldStart = i
413 for i < len(s) {
414 if asciiSpace[s[i]] == 0 {
415 i++
416 continue
417 }
418 a[na] = s[fieldStart:i]
419 na++
420 i++
421 // Skip spaces in between fields.
422 for i < len(s) && asciiSpace[s[i]] != 0 {
423 i++
424 }
425 fieldStart = i
426 }
427 if fieldStart < len(s) { // Last field might end at EOF.
428 a[na] = s[fieldStart:]
429 }
430 return a
431 }
432 433 // FieldsFunc splits the string s at each run of Unicode code points c satisfying f(c)
434 // and returns an array of slices of s. If all code points in s satisfy f(c) or the
435 // string is empty, an empty slice is returned. Every element of the returned slice is
436 // non-empty. Unlike [SplitFunc], leading and trailing runs of code points satisfying f(c)
437 // are discarded.
438 //
439 // FieldsFunc makes no guarantees about the order in which it calls f(c)
440 // and assumes that f always returns the same value for a given c.
441 func FieldsFunc(s []byte, f func(rune) bool) [][]byte {
442 // A span is used to record a slice of s of the form s[start:end].
443 // The start index is inclusive and the end index is exclusive.
444 type span struct {
445 start int
446 end int
447 }
448 spans := []span{:0:32}
449 450 // Find the field start and end indices.
451 // Doing this in a separate pass (rather than slicing the string s
452 // and collecting the result substrings right away) is significantly
453 // more efficient, possibly due to cache effects.
454 start := -1 // valid span start if >= 0
455 for end, rune := range s {
456 if f(rune) {
457 if start >= 0 {
458 spans = append(spans, span{start, end})
459 // Set start to a negative value.
460 // Note: using -1 here consistently and reproducibly
461 // slows down this code by a several percent on amd64.
462 start = ^start
463 }
464 } else {
465 if start < 0 {
466 start = end
467 }
468 }
469 }
470 471 // Last field might end at EOF.
472 if start >= 0 {
473 spans = append(spans, span{start, len(s)})
474 }
475 476 // Create strings from recorded field indices.
477 a := [][]byte{:len(spans)}
478 for i, span := range spans {
479 a[i] = s[span.start:span.end]
480 }
481 482 return a
483 }
484 485 // Join concatenates the elements of its first argument to create a single string. The separator
486 // string sep is placed between elements in the resulting string.
487 func Join(elems [][]byte, sep []byte) []byte {
488 switch len(elems) {
489 case 0:
490 return ""
491 case 1:
492 return elems[0]
493 }
494 495 var n int
496 if len(sep) > 0 {
497 if len(sep) >= maxInt/(len(elems)-1) {
498 panic("strings: Join output length overflow")
499 }
500 n += len(sep) * (len(elems) - 1)
501 }
502 for _, elem := range elems {
503 if len(elem) > maxInt-n {
504 panic("strings: Join output length overflow")
505 }
506 n += len(elem)
507 }
508 509 var b Builder
510 b.Grow(n)
511 b.WriteString(elems[0])
512 for _, s := range elems[1:] {
513 b.WriteString(sep)
514 b.WriteString(s)
515 }
516 return b.String()
517 }
518 519 // HasPrefix reports whether the string s begins with prefix.
520 func HasPrefix(s, prefix []byte) bool {
521 return stringslite.HasPrefix(s, prefix)
522 }
523 524 // HasSuffix reports whether the string s ends with suffix.
525 func HasSuffix(s, suffix []byte) bool {
526 return stringslite.HasSuffix(s, suffix)
527 }
528 529 // Map returns a copy of the string s with all its characters modified
530 // according to the mapping function. If mapping returns a negative value, the character is
531 // dropped from the string with no replacement.
532 func Map(mapping func(rune) rune, s []byte) []byte {
533 // In the worst case, the string can grow when mapped, making
534 // things unpleasant. But it's so rare we barge in assuming it's
535 // fine. It could also shrink but that falls out naturally.
536 537 // The output buffer b is initialized on demand, the first
538 // time a character differs.
539 var b Builder
540 541 for i, c := range s {
542 r := mapping(c)
543 if r == c && c != utf8.RuneError {
544 continue
545 }
546 547 var width int
548 if c == utf8.RuneError {
549 c, width = utf8.DecodeRuneInString(s[i:])
550 if width != 1 && r == c {
551 continue
552 }
553 } else {
554 width = utf8.RuneLen(c)
555 }
556 557 b.Grow(len(s) + utf8.UTFMax)
558 b.WriteString(s[:i])
559 if r >= 0 {
560 b.WriteRune(r)
561 }
562 563 s = s[i+width:]
564 break
565 }
566 567 // Fast path for unchanged input
568 if b.Cap() == 0 { // didn't call b.Grow above
569 return s
570 }
571 572 for _, c := range s {
573 r := mapping(c)
574 575 if r >= 0 {
576 // common case
577 // Due to inlining, it is more performant to determine if WriteByte should be
578 // invoked rather than always call WriteRune
579 if r < utf8.RuneSelf {
580 b.WriteByte(byte(r))
581 } else {
582 // r is not an ASCII rune.
583 b.WriteRune(r)
584 }
585 }
586 }
587 588 return b.String()
589 }
590 591 // According to static analysis, spaces, dashes, zeros, equals, and tabs
592 // are the most commonly repeated string literal,
593 // often used for display on fixed-width terminal windows.
594 // Pre-declare constants for these for O(1) repetition in the common-case.
595 const (
596 repeatedSpaces = "" +
597 " " +
598 " "
599 repeatedDashes = "" +
600 "----------------------------------------------------------------" +
601 "----------------------------------------------------------------"
602 repeatedZeroes = "" +
603 "0000000000000000000000000000000000000000000000000000000000000000"
604 repeatedEquals = "" +
605 "================================================================" +
606 "================================================================"
607 repeatedTabs = "" +
608 "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t" +
609 "\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t\t"
610 )
611 612 // Repeat returns a new string consisting of count copies of the string s.
613 //
614 // It panics if count is negative or if the result of (len(s) * count)
615 // overflows.
616 func Repeat(s []byte, count int) []byte {
617 switch count {
618 case 0:
619 return ""
620 case 1:
621 return s
622 }
623 624 // Since we cannot return an error on overflow,
625 // we should panic if the repeat will generate an overflow.
626 // See golang.org/issue/16237.
627 if count < 0 {
628 panic("strings: negative Repeat count")
629 }
630 hi, lo := bits.Mul(uint(len(s)), uint(count))
631 if hi > 0 || lo > uint(maxInt) {
632 panic("strings: Repeat output length overflow")
633 }
634 n := int(lo) // lo = len(s) * count
635 636 if len(s) == 0 {
637 return ""
638 }
639 640 // Optimize for commonly repeated strings of relatively short length.
641 switch s[0] {
642 case ' ', '-', '0', '=', '\t':
643 switch {
644 case n <= len(repeatedSpaces) && HasPrefix(repeatedSpaces, s):
645 return repeatedSpaces[:n]
646 case n <= len(repeatedDashes) && HasPrefix(repeatedDashes, s):
647 return repeatedDashes[:n]
648 case n <= len(repeatedZeroes) && HasPrefix(repeatedZeroes, s):
649 return repeatedZeroes[:n]
650 case n <= len(repeatedEquals) && HasPrefix(repeatedEquals, s):
651 return repeatedEquals[:n]
652 case n <= len(repeatedTabs) && HasPrefix(repeatedTabs, s):
653 return repeatedTabs[:n]
654 }
655 }
656 657 // Past a certain chunk size it is counterproductive to use
658 // larger chunks as the source of the write, as when the source
659 // is too large we are basically just thrashing the CPU D-cache.
660 // So if the result length is larger than an empirically-found
661 // limit (8KB), we stop growing the source string once the limit
662 // is reached and keep reusing the same source string - that
663 // should therefore be always resident in the L1 cache - until we
664 // have completed the construction of the result.
665 // This yields significant speedups (up to +100%) in cases where
666 // the result length is large (roughly, over L2 cache size).
667 const chunkLimit = 8 * 1024
668 chunkMax := n
669 if n > chunkLimit {
670 chunkMax = chunkLimit / len(s) * len(s)
671 if chunkMax == 0 {
672 chunkMax = len(s)
673 }
674 }
675 676 var b Builder
677 b.Grow(n)
678 b.WriteString(s)
679 for b.Len() < n {
680 chunk := min(n-b.Len(), b.Len(), chunkMax)
681 b.WriteString(b.String()[:chunk])
682 }
683 return b.String()
684 }
685 686 // ToUpper returns s with all Unicode letters mapped to their upper case.
687 func ToUpper(s []byte) []byte {
688 isASCII, hasLower := true, false
689 for i := 0; i < len(s); i++ {
690 c := s[i]
691 if c >= utf8.RuneSelf {
692 isASCII = false
693 break
694 }
695 hasLower = hasLower || ('a' <= c && c <= 'z')
696 }
697 698 if isASCII { // optimize for ASCII-only strings.
699 if !hasLower {
700 return s
701 }
702 var (
703 b Builder
704 pos int
705 )
706 b.Grow(len(s))
707 for i := 0; i < len(s); i++ {
708 c := s[i]
709 if 'a' <= c && c <= 'z' {
710 c -= 'a' - 'A'
711 if pos < i {
712 b.WriteString(s[pos:i])
713 }
714 b.WriteByte(c)
715 pos = i + 1
716 }
717 }
718 if pos < len(s) {
719 b.WriteString(s[pos:])
720 }
721 return b.String()
722 }
723 return Map(unicode.ToUpper, s)
724 }
725 726 // ToLower returns s with all Unicode letters mapped to their lower case.
727 func ToLower(s []byte) []byte {
728 isASCII, hasUpper := true, false
729 for i := 0; i < len(s); i++ {
730 c := s[i]
731 if c >= utf8.RuneSelf {
732 isASCII = false
733 break
734 }
735 hasUpper = hasUpper || ('A' <= c && c <= 'Z')
736 }
737 738 if isASCII { // optimize for ASCII-only strings.
739 if !hasUpper {
740 return s
741 }
742 var (
743 b Builder
744 pos int
745 )
746 b.Grow(len(s))
747 for i := 0; i < len(s); i++ {
748 c := s[i]
749 if 'A' <= c && c <= 'Z' {
750 c += 'a' - 'A'
751 if pos < i {
752 b.WriteString(s[pos:i])
753 }
754 b.WriteByte(c)
755 pos = i + 1
756 }
757 }
758 if pos < len(s) {
759 b.WriteString(s[pos:])
760 }
761 return b.String()
762 }
763 return Map(unicode.ToLower, s)
764 }
765 766 // ToTitle returns a copy of the string s with all Unicode letters mapped to
767 // their Unicode title case.
768 func ToTitle(s []byte) []byte { return Map(unicode.ToTitle, s) }
769 770 // ToUpperSpecial returns a copy of the string s with all Unicode letters mapped to their
771 // upper case using the case mapping specified by c.
772 func ToUpperSpecial(c unicode.SpecialCase, s []byte) []byte {
773 return Map(c.ToUpper, s)
774 }
775 776 // ToLowerSpecial returns a copy of the string s with all Unicode letters mapped to their
777 // lower case using the case mapping specified by c.
778 func ToLowerSpecial(c unicode.SpecialCase, s []byte) []byte {
779 return Map(c.ToLower, s)
780 }
781 782 // ToTitleSpecial returns a copy of the string s with all Unicode letters mapped to their
783 // Unicode title case, giving priority to the special casing rules.
784 func ToTitleSpecial(c unicode.SpecialCase, s []byte) []byte {
785 return Map(c.ToTitle, s)
786 }
787 788 // ToValidUTF8 returns a copy of the string s with each run of invalid UTF-8 byte sequences
789 // replaced by the replacement string, which may be empty.
790 func ToValidUTF8(s, replacement []byte) []byte {
791 var b Builder
792 793 for i, c := range s {
794 if c != utf8.RuneError {
795 continue
796 }
797 798 _, wid := utf8.DecodeRuneInString(s[i:])
799 if wid == 1 {
800 b.Grow(len(s) + len(replacement))
801 b.WriteString(s[:i])
802 s = s[i:]
803 break
804 }
805 }
806 807 // Fast path for unchanged input
808 if b.Cap() == 0 { // didn't call b.Grow above
809 return s
810 }
811 812 invalid := false // previous byte was from an invalid UTF-8 sequence
813 for i := 0; i < len(s); {
814 c := s[i]
815 if c < utf8.RuneSelf {
816 i++
817 invalid = false
818 b.WriteByte(c)
819 continue
820 }
821 _, wid := utf8.DecodeRuneInString(s[i:])
822 if wid == 1 {
823 i++
824 if !invalid {
825 invalid = true
826 b.WriteString(replacement)
827 }
828 continue
829 }
830 invalid = false
831 b.WriteString(s[i : i+wid])
832 i += wid
833 }
834 835 return b.String()
836 }
837 838 // isSeparator reports whether the rune could mark a word boundary.
839 // TODO: update when package unicode captures more of the properties.
840 func isSeparator(r rune) bool {
841 // ASCII alphanumerics and underscore are not separators
842 if r <= 0x7F {
843 switch {
844 case '0' <= r && r <= '9':
845 return false
846 case 'a' <= r && r <= 'z':
847 return false
848 case 'A' <= r && r <= 'Z':
849 return false
850 case r == '_':
851 return false
852 }
853 return true
854 }
855 // Letters and digits are not separators
856 if unicode.IsLetter(r) || unicode.IsDigit(r) {
857 return false
858 }
859 // Otherwise, all we can do for now is treat spaces as separators.
860 return unicode.IsSpace(r)
861 }
862 863 // Title returns a copy of the string s with all Unicode letters that begin words
864 // mapped to their Unicode title case.
865 //
866 // Deprecated: The rule Title uses for word boundaries does not handle Unicode
867 // punctuation properly. Use golang.org/x/text/cases instead.
868 func Title(s []byte) []byte {
869 // Use a closure here to remember state.
870 // Hackish but effective. Depends on Map scanning in order and calling
871 // the closure once per rune.
872 prev := ' '
873 return Map(
874 func(r rune) rune {
875 if isSeparator(prev) {
876 prev = r
877 return unicode.ToTitle(r)
878 }
879 prev = r
880 return r
881 },
882 s)
883 }
884 885 // TrimLeftFunc returns a slice of the string s with all leading
886 // Unicode code points c satisfying f(c) removed.
887 func TrimLeftFunc(s []byte, f func(rune) bool) []byte {
888 i := indexFunc(s, f, false)
889 if i == -1 {
890 return ""
891 }
892 return s[i:]
893 }
894 895 // TrimRightFunc returns a slice of the string s with all trailing
896 // Unicode code points c satisfying f(c) removed.
897 func TrimRightFunc(s []byte, f func(rune) bool) []byte {
898 i := lastIndexFunc(s, f, false)
899 if i >= 0 && s[i] >= utf8.RuneSelf {
900 _, wid := utf8.DecodeRuneInString(s[i:])
901 i += wid
902 } else {
903 i++
904 }
905 return s[0:i]
906 }
907 908 // TrimFunc returns a slice of the string s with all leading
909 // and trailing Unicode code points c satisfying f(c) removed.
910 func TrimFunc(s []byte, f func(rune) bool) []byte {
911 return TrimRightFunc(TrimLeftFunc(s, f), f)
912 }
913 914 // IndexFunc returns the index into s of the first Unicode
915 // code point satisfying f(c), or -1 if none do.
916 func IndexFunc(s []byte, f func(rune) bool) int {
917 return indexFunc(s, f, true)
918 }
919 920 // LastIndexFunc returns the index into s of the last
921 // Unicode code point satisfying f(c), or -1 if none do.
922 func LastIndexFunc(s []byte, f func(rune) bool) int {
923 return lastIndexFunc(s, f, true)
924 }
925 926 // indexFunc is the same as IndexFunc except that if
927 // truth==false, the sense of the predicate function is
928 // inverted.
929 func indexFunc(s []byte, f func(rune) bool, truth bool) int {
930 for i, r := range s {
931 if f(r) == truth {
932 return i
933 }
934 }
935 return -1
936 }
937 938 // lastIndexFunc is the same as LastIndexFunc except that if
939 // truth==false, the sense of the predicate function is
940 // inverted.
941 func lastIndexFunc(s []byte, f func(rune) bool, truth bool) int {
942 for i := len(s); i > 0; {
943 r, size := utf8.DecodeLastRuneInString(s[0:i])
944 i -= size
945 if f(r) == truth {
946 return i
947 }
948 }
949 return -1
950 }
951 952 // asciiSet is a 32-byte value, where each bit represents the presence of a
953 // given ASCII character in the set. The 128-bits of the lower 16 bytes,
954 // starting with the least-significant bit of the lowest word to the
955 // most-significant bit of the highest word, map to the full range of all
956 // 128 ASCII characters. The 128-bits of the upper 16 bytes will be zeroed,
957 // ensuring that any non-ASCII character will be reported as not in the set.
958 // This allocates a total of 32 bytes even though the upper half
959 // is unused to avoid bounds checks in asciiSet.contains.
960 type asciiSet [8]uint32
961 962 // makeASCIISet creates a set of ASCII characters and reports whether all
963 // characters in chars are ASCII.
964 func makeASCIISet(chars []byte) (as asciiSet, ok bool) {
965 for i := 0; i < len(chars); i++ {
966 c := chars[i]
967 if c >= utf8.RuneSelf {
968 return as, false
969 }
970 as[c/32] |= 1 << (c % 32)
971 }
972 return as, true
973 }
974 975 // contains reports whether c is inside the set.
976 func (as *asciiSet) contains(c byte) bool {
977 return (as[c/32] & (1 << (c % 32))) != 0
978 }
979 980 // Trim returns a slice of the string s with all leading and
981 // trailing Unicode code points contained in cutset removed.
982 func Trim(s, cutset []byte) []byte {
983 if s == "" || cutset == "" {
984 return s
985 }
986 if len(cutset) == 1 && cutset[0] < utf8.RuneSelf {
987 return trimLeftByte(trimRightByte(s, cutset[0]), cutset[0])
988 }
989 if as, ok := makeASCIISet(cutset); ok {
990 return trimLeftASCII(trimRightASCII(s, &as), &as)
991 }
992 return trimLeftUnicode(trimRightUnicode(s, cutset), cutset)
993 }
994 995 // TrimLeft returns a slice of the string s with all leading
996 // Unicode code points contained in cutset removed.
997 //
998 // To remove a prefix, use [TrimPrefix] instead.
999 func TrimLeft(s, cutset []byte) []byte {
1000 if s == "" || cutset == "" {
1001 return s
1002 }
1003 if len(cutset) == 1 && cutset[0] < utf8.RuneSelf {
1004 return trimLeftByte(s, cutset[0])
1005 }
1006 if as, ok := makeASCIISet(cutset); ok {
1007 return trimLeftASCII(s, &as)
1008 }
1009 return trimLeftUnicode(s, cutset)
1010 }
1011 1012 func trimLeftByte(s []byte, c byte) []byte {
1013 for len(s) > 0 && s[0] == c {
1014 s = s[1:]
1015 }
1016 return s
1017 }
1018 1019 func trimLeftASCII(s []byte, as *asciiSet) []byte {
1020 for len(s) > 0 {
1021 if !as.contains(s[0]) {
1022 break
1023 }
1024 s = s[1:]
1025 }
1026 return s
1027 }
1028 1029 func trimLeftUnicode(s, cutset []byte) []byte {
1030 for len(s) > 0 {
1031 r, n := rune(s[0]), 1
1032 if r >= utf8.RuneSelf {
1033 r, n = utf8.DecodeRuneInString(s)
1034 }
1035 if !ContainsRune(cutset, r) {
1036 break
1037 }
1038 s = s[n:]
1039 }
1040 return s
1041 }
1042 1043 // TrimRight returns a slice of the string s, with all trailing
1044 // Unicode code points contained in cutset removed.
1045 //
1046 // To remove a suffix, use [TrimSuffix] instead.
1047 func TrimRight(s, cutset []byte) []byte {
1048 if s == "" || cutset == "" {
1049 return s
1050 }
1051 if len(cutset) == 1 && cutset[0] < utf8.RuneSelf {
1052 return trimRightByte(s, cutset[0])
1053 }
1054 if as, ok := makeASCIISet(cutset); ok {
1055 return trimRightASCII(s, &as)
1056 }
1057 return trimRightUnicode(s, cutset)
1058 }
1059 1060 func trimRightByte(s []byte, c byte) []byte {
1061 for len(s) > 0 && s[len(s)-1] == c {
1062 s = s[:len(s)-1]
1063 }
1064 return s
1065 }
1066 1067 func trimRightASCII(s []byte, as *asciiSet) []byte {
1068 for len(s) > 0 {
1069 if !as.contains(s[len(s)-1]) {
1070 break
1071 }
1072 s = s[:len(s)-1]
1073 }
1074 return s
1075 }
1076 1077 func trimRightUnicode(s, cutset []byte) []byte {
1078 for len(s) > 0 {
1079 r, n := rune(s[len(s)-1]), 1
1080 if r >= utf8.RuneSelf {
1081 r, n = utf8.DecodeLastRuneInString(s)
1082 }
1083 if !ContainsRune(cutset, r) {
1084 break
1085 }
1086 s = s[:len(s)-n]
1087 }
1088 return s
1089 }
1090 1091 // TrimSpace returns a slice of the string s, with all leading
1092 // and trailing white space removed, as defined by Unicode.
1093 func TrimSpace(s []byte) []byte {
1094 // Fast path for ASCII: look for the first ASCII non-space byte
1095 start := 0
1096 for ; start < len(s); start++ {
1097 c := s[start]
1098 if c >= utf8.RuneSelf {
1099 // If we run into a non-ASCII byte, fall back to the
1100 // slower unicode-aware method on the remaining bytes
1101 return TrimFunc(s[start:], unicode.IsSpace)
1102 }
1103 if asciiSpace[c] == 0 {
1104 break
1105 }
1106 }
1107 1108 // Now look for the first ASCII non-space byte from the end
1109 stop := len(s)
1110 for ; stop > start; stop-- {
1111 c := s[stop-1]
1112 if c >= utf8.RuneSelf {
1113 // start has been already trimmed above, should trim end only
1114 return TrimRightFunc(s[start:stop], unicode.IsSpace)
1115 }
1116 if asciiSpace[c] == 0 {
1117 break
1118 }
1119 }
1120 1121 // At this point s[start:stop] starts and ends with an ASCII
1122 // non-space bytes, so we're done. Non-ASCII cases have already
1123 // been handled above.
1124 return s[start:stop]
1125 }
1126 1127 // TrimPrefix returns s without the provided leading prefix string.
1128 // If s doesn't start with prefix, s is returned unchanged.
1129 func TrimPrefix(s, prefix []byte) []byte {
1130 return stringslite.TrimPrefix(s, prefix)
1131 }
1132 1133 // TrimSuffix returns s without the provided trailing suffix string.
1134 // If s doesn't end with suffix, s is returned unchanged.
1135 func TrimSuffix(s, suffix []byte) []byte {
1136 return stringslite.TrimSuffix(s, suffix)
1137 }
1138 1139 // Replace returns a copy of the string s with the first n
1140 // non-overlapping instances of old replaced by new.
1141 // If old is empty, it matches at the beginning of the string
1142 // and after each UTF-8 sequence, yielding up to k+1 replacements
1143 // for a k-rune string.
1144 // If n < 0, there is no limit on the number of replacements.
1145 func Replace(s, old, new []byte, n int) []byte {
1146 if old == new || n == 0 {
1147 return s // avoid allocation
1148 }
1149 1150 // Compute number of replacements.
1151 if m := Count(s, old); m == 0 {
1152 return s // avoid allocation
1153 } else if n < 0 || m < n {
1154 n = m
1155 }
1156 1157 // Apply replacements to buffer.
1158 var b Builder
1159 b.Grow(len(s) + n*(len(new)-len(old)))
1160 start := 0
1161 if len(old) > 0 {
1162 for range n {
1163 j := start + Index(s[start:], old)
1164 b.WriteString(s[start:j])
1165 b.WriteString(new)
1166 start = j + len(old)
1167 }
1168 } else { // len(old) == 0
1169 b.WriteString(new)
1170 for range n - 1 {
1171 _, wid := utf8.DecodeRuneInString(s[start:])
1172 j := start + wid
1173 b.WriteString(s[start:j])
1174 b.WriteString(new)
1175 start = j
1176 }
1177 }
1178 b.WriteString(s[start:])
1179 return b.String()
1180 }
1181 1182 // ReplaceAll returns a copy of the string s with all
1183 // non-overlapping instances of old replaced by new.
1184 // If old is empty, it matches at the beginning of the string
1185 // and after each UTF-8 sequence, yielding up to k+1 replacements
1186 // for a k-rune string.
1187 func ReplaceAll(s, old, new []byte) []byte {
1188 return Replace(s, old, new, -1)
1189 }
1190 1191 // EqualFold reports whether s and t, interpreted as UTF-8 strings,
1192 // are equal under simple Unicode case-folding, which is a more general
1193 // form of case-insensitivity.
1194 func EqualFold(s, t []byte) bool {
1195 // ASCII fast path
1196 i := 0
1197 for n := min(len(s), len(t)); i < n; i++ {
1198 sr := s[i]
1199 tr := t[i]
1200 if sr|tr >= utf8.RuneSelf {
1201 goto hasUnicode
1202 }
1203 1204 // Easy case.
1205 if tr == sr {
1206 continue
1207 }
1208 1209 // Make sr < tr to simplify what follows.
1210 if tr < sr {
1211 tr, sr = sr, tr
1212 }
1213 // ASCII only, sr/tr must be upper/lower case
1214 if 'A' <= sr && sr <= 'Z' && tr == sr+'a'-'A' {
1215 continue
1216 }
1217 return false
1218 }
1219 // Check if we've exhausted both strings.
1220 return len(s) == len(t)
1221 1222 hasUnicode:
1223 s = s[i:]
1224 t = t[i:]
1225 for _, sr := range s {
1226 // If t is exhausted the strings are not equal.
1227 if len(t) == 0 {
1228 return false
1229 }
1230 1231 // Extract first rune from second string.
1232 var tr rune
1233 if t[0] < utf8.RuneSelf {
1234 tr, t = rune(t[0]), t[1:]
1235 } else {
1236 r, size := utf8.DecodeRuneInString(t)
1237 tr, t = r, t[size:]
1238 }
1239 1240 // If they match, keep going; if not, return false.
1241 1242 // Easy case.
1243 if tr == sr {
1244 continue
1245 }
1246 1247 // Make sr < tr to simplify what follows.
1248 if tr < sr {
1249 tr, sr = sr, tr
1250 }
1251 // Fast check for ASCII.
1252 if tr < utf8.RuneSelf {
1253 // ASCII only, sr/tr must be upper/lower case
1254 if 'A' <= sr && sr <= 'Z' && tr == sr+'a'-'A' {
1255 continue
1256 }
1257 return false
1258 }
1259 1260 // General case. SimpleFold(x) returns the next equivalent rune > x
1261 // or wraps around to smaller values.
1262 r := unicode.SimpleFold(sr)
1263 for r != sr && r < tr {
1264 r = unicode.SimpleFold(r)
1265 }
1266 if r == tr {
1267 continue
1268 }
1269 return false
1270 }
1271 1272 // First string is empty, so check if the second one is also empty.
1273 return len(t) == 0
1274 }
1275 1276 // Index returns the index of the first instance of substr in s, or -1 if substr is not present in s.
1277 func Index(s, substr []byte) int {
1278 return stringslite.Index(s, substr)
1279 }
1280 1281 // Cut slices s around the first instance of sep,
1282 // returning the text before and after sep.
1283 // The found result reports whether sep appears in s.
1284 // If sep does not appear in s, cut returns s, "", false.
1285 func Cut(s, sep []byte) (before, after []byte, found bool) {
1286 return stringslite.Cut(s, sep)
1287 }
1288 1289 // CutPrefix returns s without the provided leading prefix string
1290 // and reports whether it found the prefix.
1291 // If s doesn't start with prefix, CutPrefix returns s, false.
1292 // If prefix is the empty string, CutPrefix returns s, true.
1293 func CutPrefix(s, prefix []byte) (after []byte, found bool) {
1294 return stringslite.CutPrefix(s, prefix)
1295 }
1296 1297 // CutSuffix returns s without the provided ending suffix string
1298 // and reports whether it found the suffix.
1299 // If s doesn't end with suffix, CutSuffix returns s, false.
1300 // If suffix is the empty string, CutSuffix returns s, true.
1301 func CutSuffix(s, suffix []byte) (before []byte, found bool) {
1302 return stringslite.CutSuffix(s, suffix)
1303 }
1304