iso2022jp.go raw

   1  // Copyright 2013 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  package japanese
   6  
   7  import (
   8  	"unicode/utf8"
   9  
  10  	"golang.org/x/text/encoding"
  11  	"golang.org/x/text/encoding/internal"
  12  	"golang.org/x/text/encoding/internal/identifier"
  13  	"golang.org/x/text/transform"
  14  )
  15  
  16  // ISO2022JP is the ISO-2022-JP encoding.
  17  var ISO2022JP encoding.Encoding = &iso2022JP
  18  
  19  var iso2022JP = internal.Encoding{
  20  	Encoding: internal.FuncEncoding{Decoder: iso2022JPNewDecoder, Encoder: iso2022JPNewEncoder},
  21  	Name:     "ISO-2022-JP",
  22  	MIB:      identifier.ISO2022JP,
  23  }
  24  
  25  func iso2022JPNewDecoder() transform.Transformer {
  26  	return new(iso2022JPDecoder)
  27  }
  28  
  29  func iso2022JPNewEncoder() transform.Transformer {
  30  	return new(iso2022JPEncoder)
  31  }
  32  
  33  const (
  34  	asciiState = iota
  35  	katakanaState
  36  	jis0208State
  37  	jis0212State
  38  )
  39  
  40  const asciiEsc = 0x1b
  41  
  42  type iso2022JPDecoder int
  43  
  44  func (d *iso2022JPDecoder) Reset() {
  45  	*d = asciiState
  46  }
  47  
  48  func (d *iso2022JPDecoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
  49  	r, size := rune(0), 0
  50  	for ; nSrc < len(src); nSrc += size {
  51  		c0 := src[nSrc]
  52  		if c0 >= utf8.RuneSelf {
  53  			r, size = '\ufffd', 1
  54  			goto write
  55  		}
  56  
  57  		if c0 == asciiEsc {
  58  			if nSrc+2 >= len(src) {
  59  				if !atEOF {
  60  					return nDst, nSrc, transform.ErrShortSrc
  61  				}
  62  				// TODO: is it correct to only skip 1??
  63  				r, size = '\ufffd', 1
  64  				goto write
  65  			}
  66  			size = 3
  67  			c1 := src[nSrc+1]
  68  			c2 := src[nSrc+2]
  69  			switch {
  70  			case c1 == '$' && (c2 == '@' || c2 == 'B'): // 0x24 {0x40, 0x42}
  71  				*d = jis0208State
  72  				continue
  73  			case c1 == '$' && c2 == '(': // 0x24 0x28
  74  				if nSrc+3 >= len(src) {
  75  					if !atEOF {
  76  						return nDst, nSrc, transform.ErrShortSrc
  77  					}
  78  					r, size = '\ufffd', 1
  79  					goto write
  80  				}
  81  				size = 4
  82  				if src[nSrc+3] == 'D' {
  83  					*d = jis0212State
  84  					continue
  85  				}
  86  			case c1 == '(' && (c2 == 'B' || c2 == 'J'): // 0x28 {0x42, 0x4A}
  87  				*d = asciiState
  88  				continue
  89  			case c1 == '(' && c2 == 'I': // 0x28 0x49
  90  				*d = katakanaState
  91  				continue
  92  			}
  93  			r, size = '\ufffd', 1
  94  			goto write
  95  		}
  96  
  97  		switch *d {
  98  		case asciiState:
  99  			r, size = rune(c0), 1
 100  
 101  		case katakanaState:
 102  			if c0 < 0x21 || 0x60 <= c0 {
 103  				r, size = '\ufffd', 1
 104  				goto write
 105  			}
 106  			r, size = rune(c0)+(0xff61-0x21), 1
 107  
 108  		default:
 109  			if c0 == 0x0a {
 110  				*d = asciiState
 111  				r, size = rune(c0), 1
 112  				goto write
 113  			}
 114  			if nSrc+1 >= len(src) {
 115  				if !atEOF {
 116  					return nDst, nSrc, transform.ErrShortSrc
 117  				}
 118  				r, size = '\ufffd', 1
 119  				goto write
 120  			}
 121  			size = 2
 122  			c1 := src[nSrc+1]
 123  			i := int(c0-0x21)*94 + int(c1-0x21)
 124  			if *d == jis0208State && i < len(jis0208Decode) {
 125  				r = rune(jis0208Decode[i])
 126  			} else if *d == jis0212State && i < len(jis0212Decode) {
 127  				r = rune(jis0212Decode[i])
 128  			} else {
 129  				r = '\ufffd'
 130  				goto write
 131  			}
 132  			if r == 0 {
 133  				r = '\ufffd'
 134  			}
 135  		}
 136  
 137  	write:
 138  		if nDst+utf8.RuneLen(r) > len(dst) {
 139  			return nDst, nSrc, transform.ErrShortDst
 140  		}
 141  		nDst += utf8.EncodeRune(dst[nDst:], r)
 142  	}
 143  	return nDst, nSrc, err
 144  }
 145  
 146  type iso2022JPEncoder int
 147  
 148  func (e *iso2022JPEncoder) Reset() {
 149  	*e = asciiState
 150  }
 151  
 152  func (e *iso2022JPEncoder) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
 153  	r, size := rune(0), 0
 154  	for ; nSrc < len(src); nSrc += size {
 155  		r = rune(src[nSrc])
 156  
 157  		// Decode a 1-byte rune.
 158  		if r < utf8.RuneSelf {
 159  			size = 1
 160  
 161  		} else {
 162  			// Decode a multi-byte rune.
 163  			r, size = utf8.DecodeRune(src[nSrc:])
 164  			if size == 1 {
 165  				// All valid runes of size 1 (those below utf8.RuneSelf) were
 166  				// handled above. We have invalid UTF-8 or we haven't seen the
 167  				// full character yet.
 168  				if !atEOF && !utf8.FullRune(src[nSrc:]) {
 169  					err = transform.ErrShortSrc
 170  					break
 171  				}
 172  			}
 173  
 174  			// func init checks that the switch covers all tables.
 175  			//
 176  			// http://encoding.spec.whatwg.org/#iso-2022-jp says that "the index jis0212
 177  			// is not used by the iso-2022-jp encoder due to lack of widespread support".
 178  			//
 179  			// TODO: do we have to special-case U+00A5 and U+203E, as per
 180  			// http://encoding.spec.whatwg.org/#iso-2022-jp
 181  			// Doing so would mean that "\u00a5" would not be preserved
 182  			// after an encode-decode round trip.
 183  			switch {
 184  			case encode0Low <= r && r < encode0High:
 185  				if r = rune(encode0[r-encode0Low]); r>>tableShift == jis0208 {
 186  					goto writeJIS
 187  				}
 188  			case encode1Low <= r && r < encode1High:
 189  				if r = rune(encode1[r-encode1Low]); r>>tableShift == jis0208 {
 190  					goto writeJIS
 191  				}
 192  			case encode2Low <= r && r < encode2High:
 193  				if r = rune(encode2[r-encode2Low]); r>>tableShift == jis0208 {
 194  					goto writeJIS
 195  				}
 196  			case encode3Low <= r && r < encode3High:
 197  				if r = rune(encode3[r-encode3Low]); r>>tableShift == jis0208 {
 198  					goto writeJIS
 199  				}
 200  			case encode4Low <= r && r < encode4High:
 201  				if r = rune(encode4[r-encode4Low]); r>>tableShift == jis0208 {
 202  					goto writeJIS
 203  				}
 204  			case encode5Low <= r && r < encode5High:
 205  				if 0xff61 <= r && r < 0xffa0 {
 206  					goto writeKatakana
 207  				}
 208  				if r = rune(encode5[r-encode5Low]); r>>tableShift == jis0208 {
 209  					goto writeJIS
 210  				}
 211  			}
 212  
 213  			// Switch back to ASCII state in case of error so that an ASCII
 214  			// replacement character can be written in the correct state.
 215  			if *e != asciiState {
 216  				if nDst+3 > len(dst) {
 217  					err = transform.ErrShortDst
 218  					break
 219  				}
 220  				*e = asciiState
 221  				dst[nDst+0] = asciiEsc
 222  				dst[nDst+1] = '('
 223  				dst[nDst+2] = 'B'
 224  				nDst += 3
 225  			}
 226  			err = internal.ErrASCIIReplacement
 227  			break
 228  		}
 229  
 230  		if *e != asciiState {
 231  			if nDst+4 > len(dst) {
 232  				err = transform.ErrShortDst
 233  				break
 234  			}
 235  			*e = asciiState
 236  			dst[nDst+0] = asciiEsc
 237  			dst[nDst+1] = '('
 238  			dst[nDst+2] = 'B'
 239  			nDst += 3
 240  		} else if nDst >= len(dst) {
 241  			err = transform.ErrShortDst
 242  			break
 243  		}
 244  		dst[nDst] = uint8(r)
 245  		nDst++
 246  		continue
 247  
 248  	writeJIS:
 249  		if *e != jis0208State {
 250  			if nDst+5 > len(dst) {
 251  				err = transform.ErrShortDst
 252  				break
 253  			}
 254  			*e = jis0208State
 255  			dst[nDst+0] = asciiEsc
 256  			dst[nDst+1] = '$'
 257  			dst[nDst+2] = 'B'
 258  			nDst += 3
 259  		} else if nDst+2 > len(dst) {
 260  			err = transform.ErrShortDst
 261  			break
 262  		}
 263  		dst[nDst+0] = 0x21 + uint8(r>>codeShift)&codeMask
 264  		dst[nDst+1] = 0x21 + uint8(r)&codeMask
 265  		nDst += 2
 266  		continue
 267  
 268  	writeKatakana:
 269  		if *e != katakanaState {
 270  			if nDst+4 > len(dst) {
 271  				err = transform.ErrShortDst
 272  				break
 273  			}
 274  			*e = katakanaState
 275  			dst[nDst+0] = asciiEsc
 276  			dst[nDst+1] = '('
 277  			dst[nDst+2] = 'I'
 278  			nDst += 3
 279  		} else if nDst >= len(dst) {
 280  			err = transform.ErrShortDst
 281  			break
 282  		}
 283  		dst[nDst] = uint8(r - (0xff61 - 0x21))
 284  		nDst++
 285  		continue
 286  	}
 287  	if atEOF && err == nil && *e != asciiState {
 288  		if nDst+3 > len(dst) {
 289  			err = transform.ErrShortDst
 290  		} else {
 291  			*e = asciiState
 292  			dst[nDst+0] = asciiEsc
 293  			dst[nDst+1] = '('
 294  			dst[nDst+2] = 'B'
 295  			nDst += 3
 296  		}
 297  	}
 298  	return nDst, nSrc, err
 299  }
 300