utfbom.go raw

// Package utfbom detects a BOM (Unicode Byte Order Mark) at the start of a stream and removes it when present.
// It wraps an io.Reader object, creating another object (Reader) that also implements the io.Reader
// interface but automatically checks for a BOM and strips it.
   4  package utfbom
   5  
   6  import (
   7  	"errors"
   8  	"io"
   9  )
  10  
  11  // Encoding is type alias for detected UTF encoding.
  12  type Encoding int
  13  
  14  // Constants to identify detected UTF encodings.
  15  const (
  16  	// Unknown encoding, returned when no BOM was detected
  17  	Unknown Encoding = iota
  18  
  19  	// UTF8, BOM bytes: EF BB BF
  20  	UTF8
  21  
  22  	// UTF-16, big-endian, BOM bytes: FE FF
  23  	UTF16BigEndian
  24  
  25  	// UTF-16, little-endian, BOM bytes: FF FE
  26  	UTF16LittleEndian
  27  
  28  	// UTF-32, big-endian, BOM bytes: 00 00 FE FF
  29  	UTF32BigEndian
  30  
  31  	// UTF-32, little-endian, BOM bytes: FF FE 00 00
  32  	UTF32LittleEndian
  33  )
  34  
  35  // String returns a user-friendly string representation of the encoding. Satisfies fmt.Stringer interface.
  36  func (e Encoding) String() string {
  37  	switch e {
  38  	case UTF8:
  39  		return "UTF8"
  40  	case UTF16BigEndian:
  41  		return "UTF16BigEndian"
  42  	case UTF16LittleEndian:
  43  		return "UTF16LittleEndian"
  44  	case UTF32BigEndian:
  45  		return "UTF32BigEndian"
  46  	case UTF32LittleEndian:
  47  		return "UTF32LittleEndian"
  48  	default:
  49  		return "Unknown"
  50  	}
  51  }
  52  
// maxConsecutiveEmptyReads is how many consecutive zero-byte reads readBOM
// tolerates before it gives up and reports io.ErrNoProgress.
const maxConsecutiveEmptyReads = 100
  54  
  55  // Skip creates Reader which automatically detects BOM (Unicode Byte Order Mark) and removes it as necessary.
  56  // It also returns the encoding detected by the BOM.
  57  // If the detected encoding is not needed, you can call the SkipOnly function.
  58  func Skip(rd io.Reader) (*Reader, Encoding) {
  59  	// Is it already a Reader?
  60  	b, ok := rd.(*Reader)
  61  	if ok {
  62  		return b, Unknown
  63  	}
  64  
  65  	enc, left, err := detectUtf(rd)
  66  	return &Reader{
  67  		rd:  rd,
  68  		buf: left,
  69  		err: err,
  70  	}, enc
  71  }
  72  
  73  // SkipOnly creates Reader which automatically detects BOM (Unicode Byte Order Mark) and removes it as necessary.
  74  func SkipOnly(rd io.Reader) *Reader {
  75  	r, _ := Skip(rd)
  76  	return r
  77  }
  78  
// Reader implements automatic BOM (Unicode Byte Order Mark) checking and
// removing as necessary for an io.Reader object.
//
// Instances are created by Skip or SkipOnly, which probe the wrapped reader
// for a BOM up front and store the outcome in the fields below.
type Reader struct {
	rd  io.Reader // reader provided by the client
	buf []byte    // bytes read while probing for the BOM that are not part of it and not yet handed out by Read; nil once drained
	err error     // error from the BOM probe; Read returns it once after buf is drained, then clears it
}
  86  
  87  // Read is an implementation of io.Reader interface.
  88  // The bytes are taken from the underlying Reader, but it checks for BOMs, removing them as necessary.
  89  func (r *Reader) Read(p []byte) (n int, err error) {
  90  	if len(p) == 0 {
  91  		return 0, nil
  92  	}
  93  
  94  	if r.buf == nil {
  95  		if r.err != nil {
  96  			return 0, r.readErr()
  97  		}
  98  
  99  		return r.rd.Read(p)
 100  	}
 101  
 102  	// copy as much as we can
 103  	n = copy(p, r.buf)
 104  	r.buf = nilIfEmpty(r.buf[n:])
 105  	return n, nil
 106  }
 107  
 108  func (r *Reader) readErr() error {
 109  	err := r.err
 110  	r.err = nil
 111  	return err
 112  }
 113  
// errNegativeRead is the value readBOM panics with when the wrapped reader's
// Read reports a negative byte count.
var errNegativeRead = errors.New("utfbom: reader returned negative count from Read")
 115  
 116  func detectUtf(rd io.Reader) (enc Encoding, buf []byte, err error) {
 117  	buf, err = readBOM(rd)
 118  
 119  	if len(buf) >= 4 {
 120  		if isUTF32BigEndianBOM4(buf) {
 121  			return UTF32BigEndian, nilIfEmpty(buf[4:]), err
 122  		}
 123  		if isUTF32LittleEndianBOM4(buf) {
 124  			return UTF32LittleEndian, nilIfEmpty(buf[4:]), err
 125  		}
 126  	}
 127  
 128  	if len(buf) > 2 && isUTF8BOM3(buf) {
 129  		return UTF8, nilIfEmpty(buf[3:]), err
 130  	}
 131  
 132  	if (err != nil && err != io.EOF) || (len(buf) < 2) {
 133  		return Unknown, nilIfEmpty(buf), err
 134  	}
 135  
 136  	if isUTF16BigEndianBOM2(buf) {
 137  		return UTF16BigEndian, nilIfEmpty(buf[2:]), err
 138  	}
 139  	if isUTF16LittleEndianBOM2(buf) {
 140  		return UTF16LittleEndian, nilIfEmpty(buf[2:]), err
 141  	}
 142  
 143  	return Unknown, nilIfEmpty(buf), err
 144  }
 145  
 146  func readBOM(rd io.Reader) (buf []byte, err error) {
 147  	const maxBOMSize = 4
 148  	var bom [maxBOMSize]byte // used to read BOM
 149  
 150  	// read as many bytes as possible
 151  	for nEmpty, n := 0, 0; err == nil && len(buf) < maxBOMSize; buf = bom[:len(buf)+n] {
 152  		if n, err = rd.Read(bom[len(buf):]); n < 0 {
 153  			panic(errNegativeRead)
 154  		}
 155  		if n > 0 {
 156  			nEmpty = 0
 157  		} else {
 158  			nEmpty++
 159  			if nEmpty >= maxConsecutiveEmptyReads {
 160  				err = io.ErrNoProgress
 161  			}
 162  		}
 163  	}
 164  	return
 165  }
 166  
 167  func isUTF32BigEndianBOM4(buf []byte) bool {
 168  	return buf[0] == 0x00 && buf[1] == 0x00 && buf[2] == 0xFE && buf[3] == 0xFF
 169  }
 170  
 171  func isUTF32LittleEndianBOM4(buf []byte) bool {
 172  	return buf[0] == 0xFF && buf[1] == 0xFE && buf[2] == 0x00 && buf[3] == 0x00
 173  }
 174  
 175  func isUTF8BOM3(buf []byte) bool {
 176  	return buf[0] == 0xEF && buf[1] == 0xBB && buf[2] == 0xBF
 177  }
 178  
 179  func isUTF16BigEndianBOM2(buf []byte) bool {
 180  	return buf[0] == 0xFE && buf[1] == 0xFF
 181  }
 182  
 183  func isUTF16LittleEndianBOM2(buf []byte) bool {
 184  	return buf[0] == 0xFF && buf[1] == 0xFE
 185  }
 186  
 187  func nilIfEmpty(buf []byte) (res []byte) {
 188  	if len(buf) > 0 {
 189  		res = buf
 190  	}
 191  	return
 192  }
 193