hzgb2312.go raw

   1  // Copyright 2013 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  package simplifiedchinese
   6  
   7  import (
   8  	"unicode/utf8"
   9  
  10  	"golang.org/x/text/encoding"
  11  	"golang.org/x/text/encoding/internal"
  12  	"golang.org/x/text/encoding/internal/identifier"
  13  	"golang.org/x/text/transform"
  14  )
  15  
// HZGB2312 is the HZ-GB2312 encoding.
//
// It points at the package-level hzGB2312 value and is exposed through the
// encoding.Encoding interface.
var HZGB2312 encoding.Encoding = &hzGB2312
  18  
// hzGB2312 ties the HZ-GB2312 decoder and encoder constructors to the
// encoding's name and its MIB identifier.
var hzGB2312 = internal.Encoding{
	Encoding: internal.FuncEncoding{Decoder: hzGB2312NewDecoder, Encoder: hzGB2312NewEncoder},
	Name:     "HZ-GB2312",
	MIB:      identifier.HZGB2312,
}
  24  
  25  func hzGB2312NewDecoder() transform.Transformer {
  26  	return new(hzGB2312Decoder)
  27  }
  28  
  29  func hzGB2312NewEncoder() transform.Transformer {
  30  	return new(hzGB2312Encoder)
  31  }
  32  
// Shift states tracked by hzGB2312Decoder and hzGB2312Encoder.
const (
	// asciiState is the zero value, so it is the state of a freshly
	// allocated decoder or encoder; Reset also returns to it.
	asciiState = iota
	// gbState is the two-byte mode bracketed by "~{" and "~}".
	gbState
)
  37  
  38  type hzGB2312Decoder int
  39  
  40  func (d *hzGB2312Decoder) Reset() {
  41  	*d = asciiState
  42  }
  43  
  44  func (d *hzGB2312Decoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  45  	r, size := rune(0), 0
  46  loop:
  47  	for ; nSrc < len(src); nSrc += size {
  48  		c0 := src[nSrc]
  49  		if c0 >= utf8.RuneSelf {
  50  			r, size = utf8.RuneError, 1
  51  			goto write
  52  		}
  53  
  54  		if c0 == '~' {
  55  			if nSrc+1 >= len(src) {
  56  				if !atEOF {
  57  					err = transform.ErrShortSrc
  58  					break loop
  59  				}
  60  				r, size = utf8.RuneError, 1
  61  				goto write
  62  			}
  63  			size = 2
  64  			switch src[nSrc+1] {
  65  			case '{':
  66  				*d = gbState
  67  				continue
  68  			case '}':
  69  				*d = asciiState
  70  				continue
  71  			case '~':
  72  				if nDst >= len(dst) {
  73  					err = transform.ErrShortDst
  74  					break loop
  75  				}
  76  				dst[nDst] = '~'
  77  				nDst++
  78  				continue
  79  			case '\n':
  80  				continue
  81  			default:
  82  				r = utf8.RuneError
  83  				goto write
  84  			}
  85  		}
  86  
  87  		if *d == asciiState {
  88  			r, size = rune(c0), 1
  89  		} else {
  90  			if nSrc+1 >= len(src) {
  91  				if !atEOF {
  92  					err = transform.ErrShortSrc
  93  					break loop
  94  				}
  95  				r, size = utf8.RuneError, 1
  96  				goto write
  97  			}
  98  			size = 2
  99  			c1 := src[nSrc+1]
 100  			if c0 < 0x21 || 0x7e <= c0 || c1 < 0x21 || 0x7f <= c1 {
 101  				// error
 102  			} else if i := int(c0-0x01)*190 + int(c1+0x3f); i < len(decode) {
 103  				r = rune(decode[i])
 104  				if r != 0 {
 105  					goto write
 106  				}
 107  			}
 108  			if c1 > utf8.RuneSelf {
 109  				// Be consistent and always treat non-ASCII as a single error.
 110  				size = 1
 111  			}
 112  			r = utf8.RuneError
 113  		}
 114  
 115  	write:
 116  		if nDst+utf8.RuneLen(r) > len(dst) {
 117  			err = transform.ErrShortDst
 118  			break loop
 119  		}
 120  		nDst += utf8.EncodeRune(dst[nDst:], r)
 121  	}
 122  	return nDst, nSrc, err
 123  }
 124  
 125  type hzGB2312Encoder int
 126  
 127  func (d *hzGB2312Encoder) Reset() {
 128  	*d = asciiState
 129  }
 130  
 131  func (e *hzGB2312Encoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 132  	r, size := rune(0), 0
 133  	for ; nSrc < len(src); nSrc += size {
 134  		r = rune(src[nSrc])
 135  
 136  		// Decode a 1-byte rune.
 137  		if r < utf8.RuneSelf {
 138  			size = 1
 139  			if r == '~' {
 140  				if nDst+2 > len(dst) {
 141  					err = transform.ErrShortDst
 142  					break
 143  				}
 144  				dst[nDst+0] = '~'
 145  				dst[nDst+1] = '~'
 146  				nDst += 2
 147  				continue
 148  			} else if *e != asciiState {
 149  				if nDst+3 > len(dst) {
 150  					err = transform.ErrShortDst
 151  					break
 152  				}
 153  				*e = asciiState
 154  				dst[nDst+0] = '~'
 155  				dst[nDst+1] = '}'
 156  				nDst += 2
 157  			} else if nDst >= len(dst) {
 158  				err = transform.ErrShortDst
 159  				break
 160  			}
 161  			dst[nDst] = uint8(r)
 162  			nDst += 1
 163  			continue
 164  
 165  		}
 166  
 167  		// Decode a multi-byte rune.
 168  		r, size = utf8.DecodeRune(src[nSrc:])
 169  		if size == 1 {
 170  			// All valid runes of size 1 (those below utf8.RuneSelf) were
 171  			// handled above. We have invalid UTF-8 or we haven't seen the
 172  			// full character yet.
 173  			if !atEOF && !utf8.FullRune(src[nSrc:]) {
 174  				err = transform.ErrShortSrc
 175  				break
 176  			}
 177  		}
 178  
 179  		// func init checks that the switch covers all tables.
 180  		switch {
 181  		case encode0Low <= r && r < encode0High:
 182  			if r = rune(encode0[r-encode0Low]); r != 0 {
 183  				goto writeGB
 184  			}
 185  		case encode1Low <= r && r < encode1High:
 186  			if r = rune(encode1[r-encode1Low]); r != 0 {
 187  				goto writeGB
 188  			}
 189  		case encode2Low <= r && r < encode2High:
 190  			if r = rune(encode2[r-encode2Low]); r != 0 {
 191  				goto writeGB
 192  			}
 193  		case encode3Low <= r && r < encode3High:
 194  			if r = rune(encode3[r-encode3Low]); r != 0 {
 195  				goto writeGB
 196  			}
 197  		case encode4Low <= r && r < encode4High:
 198  			if r = rune(encode4[r-encode4Low]); r != 0 {
 199  				goto writeGB
 200  			}
 201  		}
 202  
 203  	terminateInASCIIState:
 204  		// Switch back to ASCII state in case of error so that an ASCII
 205  		// replacement character can be written in the correct state.
 206  		if *e != asciiState {
 207  			if nDst+2 > len(dst) {
 208  				err = transform.ErrShortDst
 209  				break
 210  			}
 211  			dst[nDst+0] = '~'
 212  			dst[nDst+1] = '}'
 213  			nDst += 2
 214  		}
 215  		err = internal.ErrASCIIReplacement
 216  		break
 217  
 218  	writeGB:
 219  		c0 := uint8(r>>8) - 0x80
 220  		c1 := uint8(r) - 0x80
 221  		if c0 < 0x21 || 0x7e <= c0 || c1 < 0x21 || 0x7f <= c1 {
 222  			goto terminateInASCIIState
 223  		}
 224  		if *e == asciiState {
 225  			if nDst+4 > len(dst) {
 226  				err = transform.ErrShortDst
 227  				break
 228  			}
 229  			*e = gbState
 230  			dst[nDst+0] = '~'
 231  			dst[nDst+1] = '{'
 232  			nDst += 2
 233  		} else if nDst+2 > len(dst) {
 234  			err = transform.ErrShortDst
 235  			break
 236  		}
 237  		dst[nDst+0] = c0
 238  		dst[nDst+1] = c1
 239  		nDst += 2
 240  		continue
 241  	}
 242  	// TODO: should one always terminate in ASCII state to make it safe to
 243  	// concatenate two HZ-GB2312-encoded strings?
 244  	return nDst, nSrc, err
 245  }
 246