utf8.go raw

   1  /*
   2   * Copyright 2022 ByteDance Inc.
   3   *
   4   * Licensed under the Apache License, Version 2.0 (the "License");
   5   * you may not use this file except in compliance with the License.
   6   * You may obtain a copy of the License at
   7   *
   8   *     http://www.apache.org/licenses/LICENSE-2.0
   9   *
  10   * Unless required by applicable law or agreed to in writing, software
  11   * distributed under the License is distributed on an "AS IS" BASIS,
  12   * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13   * See the License for the specific language governing permissions and
  14   * limitations under the License.
  15   */
  16  
  17  package utf8
  18  
  19  import (
  20  	`runtime`
  21  
  22      `github.com/bytedance/sonic/internal/rt`
  23      `github.com/bytedance/sonic/internal/native/types`
  24      `github.com/bytedance/sonic/internal/native`
  25  )
  26  
  27  // CorrectWith corrects the invalid utf8 byte with repl string.
  28  func CorrectWith(dst []byte, src []byte, repl string) []byte {
  29      sstr := rt.Mem2Str(src)
  30      sidx := 0
  31  
  32      /* state machine records the invalid positions */
  33      m := types.NewStateMachine()
  34      m.Sp = 0 // invalid utf8 numbers
  35  
  36      for sidx < len(sstr) {
  37          scur  := sidx
  38          ecode := native.ValidateUTF8(&sstr, &sidx, m)
  39  
  40          if m.Sp != 0 {
  41              if m.Sp > len(sstr) {
  42                  panic("numbers of invalid utf8 exceed the string len!")
  43              }
  44          }
  45          
  46          for i := 0; i < m.Sp; i++ {
  47              ipos := m.Vt[i] // invalid utf8 position
  48              dst  = append(dst, sstr[scur:ipos]...)
  49              dst  = append(dst, repl...)
  50              scur = m.Vt[i] + 1
  51          }
  52          /* append the remained valid utf8 bytes */
  53          dst = append(dst, sstr[scur:sidx]...)
  54  
  55          /* not enough space, reset and continue */
  56          if ecode != 0 {
  57              m.Sp = 0
  58          }
  59      }
  60  
  61      types.FreeStateMachine(m)
  62      return dst
  63  }
  64  
  65  // Validate is a simd-accelereated drop-in replacement for the standard library's utf8.Valid.
  66  func Validate(src []byte) bool {
  67  	if src == nil {
  68  		return true
  69  	}
  70      return ValidateString(rt.Mem2Str(src))
  71  }
  72  
  73  // ValidateString as Validate, but for string.
  74  func ValidateString(src string) bool {
  75  	if src == "" {
  76  		return true
  77  	}
  78      ret := native.ValidateUTF8Fast(&src) == 0
  79  	runtime.KeepAlive(src)
  80  	return ret
  81  }
  82