iter_str.go raw

   1  package jsoniter
   2  
   3  import (
   4  	"fmt"
   5  	"unicode/utf16"
   6  )
   7  
   8  // ReadString read string from iterator
   9  func (iter *Iterator) ReadString() (ret string) {
  10  	c := iter.nextToken()
  11  	if c == '"' {
  12  		for i := iter.head; i < iter.tail; i++ {
  13  			c := iter.buf[i]
  14  			if c == '"' {
  15  				ret = string(iter.buf[iter.head:i])
  16  				iter.head = i + 1
  17  				return ret
  18  			} else if c == '\\' {
  19  				break
  20  			} else if c < ' ' {
  21  				iter.ReportError("ReadString",
  22  					fmt.Sprintf(`invalid control character found: %d`, c))
  23  				return
  24  			}
  25  		}
  26  		return iter.readStringSlowPath()
  27  	} else if c == 'n' {
  28  		iter.skipThreeBytes('u', 'l', 'l')
  29  		return ""
  30  	}
  31  	iter.ReportError("ReadString", `expects " or n, but found `+string([]byte{c}))
  32  	return
  33  }
  34  
  35  func (iter *Iterator) readStringSlowPath() (ret string) {
  36  	var str []byte
  37  	var c byte
  38  	for iter.Error == nil {
  39  		c = iter.readByte()
  40  		if c == '"' {
  41  			return string(str)
  42  		}
  43  		if c == '\\' {
  44  			c = iter.readByte()
  45  			str = iter.readEscapedChar(c, str)
  46  		} else {
  47  			str = append(str, c)
  48  		}
  49  	}
  50  	iter.ReportError("readStringSlowPath", "unexpected end of input")
  51  	return
  52  }
  53  
  54  func (iter *Iterator) readEscapedChar(c byte, str []byte) []byte {
  55  	switch c {
  56  	case 'u':
  57  		r := iter.readU4()
  58  		if utf16.IsSurrogate(r) {
  59  			c = iter.readByte()
  60  			if iter.Error != nil {
  61  				return nil
  62  			}
  63  			if c != '\\' {
  64  				iter.unreadByte()
  65  				str = appendRune(str, r)
  66  				return str
  67  			}
  68  			c = iter.readByte()
  69  			if iter.Error != nil {
  70  				return nil
  71  			}
  72  			if c != 'u' {
  73  				str = appendRune(str, r)
  74  				return iter.readEscapedChar(c, str)
  75  			}
  76  			r2 := iter.readU4()
  77  			if iter.Error != nil {
  78  				return nil
  79  			}
  80  			combined := utf16.DecodeRune(r, r2)
  81  			if combined == '\uFFFD' {
  82  				str = appendRune(str, r)
  83  				str = appendRune(str, r2)
  84  			} else {
  85  				str = appendRune(str, combined)
  86  			}
  87  		} else {
  88  			str = appendRune(str, r)
  89  		}
  90  	case '"':
  91  		str = append(str, '"')
  92  	case '\\':
  93  		str = append(str, '\\')
  94  	case '/':
  95  		str = append(str, '/')
  96  	case 'b':
  97  		str = append(str, '\b')
  98  	case 'f':
  99  		str = append(str, '\f')
 100  	case 'n':
 101  		str = append(str, '\n')
 102  	case 'r':
 103  		str = append(str, '\r')
 104  	case 't':
 105  		str = append(str, '\t')
 106  	default:
 107  		iter.ReportError("readEscapedChar",
 108  			`invalid escape char after \`)
 109  		return nil
 110  	}
 111  	return str
 112  }
 113  
 114  // ReadStringAsSlice read string from iterator without copying into string form.
 115  // The []byte can not be kept, as it will change after next iterator call.
 116  func (iter *Iterator) ReadStringAsSlice() (ret []byte) {
 117  	c := iter.nextToken()
 118  	if c == '"' {
 119  		for i := iter.head; i < iter.tail; i++ {
 120  			// require ascii string and no escape
 121  			// for: field name, base64, number
 122  			if iter.buf[i] == '"' {
 123  				// fast path: reuse the underlying buffer
 124  				ret = iter.buf[iter.head:i]
 125  				iter.head = i + 1
 126  				return ret
 127  			}
 128  		}
 129  		readLen := iter.tail - iter.head
 130  		copied := make([]byte, readLen, readLen*2)
 131  		copy(copied, iter.buf[iter.head:iter.tail])
 132  		iter.head = iter.tail
 133  		for iter.Error == nil {
 134  			c := iter.readByte()
 135  			if c == '"' {
 136  				return copied
 137  			}
 138  			copied = append(copied, c)
 139  		}
 140  		return copied
 141  	}
 142  	iter.ReportError("ReadStringAsSlice", `expects " or n, but found `+string([]byte{c}))
 143  	return
 144  }
 145  
 146  func (iter *Iterator) readU4() (ret rune) {
 147  	for i := 0; i < 4; i++ {
 148  		c := iter.readByte()
 149  		if iter.Error != nil {
 150  			return
 151  		}
 152  		if c >= '0' && c <= '9' {
 153  			ret = ret*16 + rune(c-'0')
 154  		} else if c >= 'a' && c <= 'f' {
 155  			ret = ret*16 + rune(c-'a'+10)
 156  		} else if c >= 'A' && c <= 'F' {
 157  			ret = ret*16 + rune(c-'A'+10)
 158  		} else {
 159  			iter.ReportError("readU4", "expects 0~9 or a~f, but found "+string([]byte{c}))
 160  			return
 161  		}
 162  	}
 163  	return ret
 164  }
 165  
 166  const (
 167  	t1 = 0x00 // 0000 0000
 168  	tx = 0x80 // 1000 0000
 169  	t2 = 0xC0 // 1100 0000
 170  	t3 = 0xE0 // 1110 0000
 171  	t4 = 0xF0 // 1111 0000
 172  	t5 = 0xF8 // 1111 1000
 173  
 174  	maskx = 0x3F // 0011 1111
 175  	mask2 = 0x1F // 0001 1111
 176  	mask3 = 0x0F // 0000 1111
 177  	mask4 = 0x07 // 0000 0111
 178  
 179  	rune1Max = 1<<7 - 1
 180  	rune2Max = 1<<11 - 1
 181  	rune3Max = 1<<16 - 1
 182  
 183  	surrogateMin = 0xD800
 184  	surrogateMax = 0xDFFF
 185  
 186  	maxRune   = '\U0010FFFF' // Maximum valid Unicode code point.
 187  	runeError = '\uFFFD'     // the "error" Rune or "Unicode replacement character"
 188  )
 189  
 190  func appendRune(p []byte, r rune) []byte {
 191  	// Negative values are erroneous. Making it unsigned addresses the problem.
 192  	switch i := uint32(r); {
 193  	case i <= rune1Max:
 194  		p = append(p, byte(r))
 195  		return p
 196  	case i <= rune2Max:
 197  		p = append(p, t2|byte(r>>6))
 198  		p = append(p, tx|byte(r)&maskx)
 199  		return p
 200  	case i > maxRune, surrogateMin <= i && i <= surrogateMax:
 201  		r = runeError
 202  		fallthrough
 203  	case i <= rune3Max:
 204  		p = append(p, t3|byte(r>>12))
 205  		p = append(p, tx|byte(r>>6)&maskx)
 206  		p = append(p, tx|byte(r)&maskx)
 207  		return p
 208  	default:
 209  		p = append(p, t4|byte(r>>18))
 210  		p = append(p, tx|byte(r>>12)&maskx)
 211  		p = append(p, tx|byte(r>>6)&maskx)
 212  		p = append(p, tx|byte(r)&maskx)
 213  		return p
 214  	}
 215  }
 216