utfbom.go raw
// Package utfbom detects a BOM (Unicode Byte Order Mark) at the start of a stream and removes it when present.
// It wraps an io.Reader object, creating another object (Reader) that also implements the io.Reader
// interface but automatically checks for a BOM and strips it as necessary.
4 package utfbom
5
6 import (
7 "errors"
8 "io"
9 )
10
// Encoding identifies the UTF encoding indicated by a detected BOM.
type Encoding int

// Encodings that can be reported by BOM detection.
const (
	// Unknown is reported when no BOM was detected.
	Unknown Encoding = iota

	// UTF8 corresponds to the BOM bytes EF BB BF.
	UTF8

	// UTF16BigEndian corresponds to the BOM bytes FE FF.
	UTF16BigEndian

	// UTF16LittleEndian corresponds to the BOM bytes FF FE.
	UTF16LittleEndian

	// UTF32BigEndian corresponds to the BOM bytes 00 00 FE FF.
	UTF32BigEndian

	// UTF32LittleEndian corresponds to the BOM bytes FF FE 00 00.
	UTF32LittleEndian
)

// String returns the name of the encoding, which matches the name of the
// corresponding constant. Any value that is not one of the declared
// encodings is rendered as "Unknown". It satisfies fmt.Stringer.
func (e Encoding) String() string {
	// Indexed by Encoding value; slot 0 (Unknown) is intentionally left
	// empty and handled by the fallback below.
	names := [...]string{
		UTF8:              "UTF8",
		UTF16BigEndian:    "UTF16BigEndian",
		UTF16LittleEndian: "UTF16LittleEndian",
		UTF32BigEndian:    "UTF32BigEndian",
		UTF32LittleEndian: "UTF32LittleEndian",
	}
	if e > Unknown && int(e) < len(names) {
		return names[e]
	}
	return "Unknown"
}
52
// maxConsecutiveEmptyReads is the number of consecutive zero-byte reads
// after which readBOM stops retrying and reports io.ErrNoProgress.
const maxConsecutiveEmptyReads = 100
54
55 // Skip creates Reader which automatically detects BOM (Unicode Byte Order Mark) and removes it as necessary.
56 // It also returns the encoding detected by the BOM.
57 // If the detected encoding is not needed, you can call the SkipOnly function.
58 func Skip(rd io.Reader) (*Reader, Encoding) {
59 // Is it already a Reader?
60 b, ok := rd.(*Reader)
61 if ok {
62 return b, Unknown
63 }
64
65 enc, left, err := detectUtf(rd)
66 return &Reader{
67 rd: rd,
68 buf: left,
69 err: err,
70 }, enc
71 }
72
73 // SkipOnly creates Reader which automatically detects BOM (Unicode Byte Order Mark) and removes it as necessary.
74 func SkipOnly(rd io.Reader) *Reader {
75 r, _ := Skip(rd)
76 return r
77 }
78
// Reader is an io.Reader wrapper that serves the bytes of the underlying
// reader with a leading BOM (Unicode Byte Order Mark) removed.
type Reader struct {
	rd  io.Reader // underlying reader supplied by the client
	buf []byte    // bytes already read from rd but not yet handed out; nil when drained
	err error     // error saved for later delivery; cleared once reported
}

// Read implements io.Reader.
//
// Buffered bytes are served first. Once the buffer is drained, a saved
// error (if any) is returned exactly once with n == 0; after that every
// call is forwarded to the underlying reader. A zero-length p always
// yields (0, nil).
func (r *Reader) Read(p []byte) (n int, err error) {
	switch {
	case len(p) == 0:
		return 0, nil

	case r.buf != nil:
		// Hand out as much buffered data as fits, and drop the buffer
		// entirely once nothing is left so later calls skip this branch.
		n = copy(p, r.buf)
		if rest := r.buf[n:]; len(rest) > 0 {
			r.buf = rest
		} else {
			r.buf = nil
		}
		return n, nil

	case r.err != nil:
		return 0, r.readErr()

	default:
		return r.rd.Read(p)
	}
}

// readErr returns the saved error and clears it, so it is reported only once.
func (r *Reader) readErr() error {
	var pending error
	pending, r.err = r.err, nil
	return pending
}
113
// errNegativeRead is the value readBOM panics with when the wrapped reader
// reports a negative byte count from Read.
var errNegativeRead = errors.New("utfbom: reader returned negative count from Read")
115
116 func detectUtf(rd io.Reader) (enc Encoding, buf []byte, err error) {
117 buf, err = readBOM(rd)
118
119 if len(buf) >= 4 {
120 if isUTF32BigEndianBOM4(buf) {
121 return UTF32BigEndian, nilIfEmpty(buf[4:]), err
122 }
123 if isUTF32LittleEndianBOM4(buf) {
124 return UTF32LittleEndian, nilIfEmpty(buf[4:]), err
125 }
126 }
127
128 if len(buf) > 2 && isUTF8BOM3(buf) {
129 return UTF8, nilIfEmpty(buf[3:]), err
130 }
131
132 if (err != nil && err != io.EOF) || (len(buf) < 2) {
133 return Unknown, nilIfEmpty(buf), err
134 }
135
136 if isUTF16BigEndianBOM2(buf) {
137 return UTF16BigEndian, nilIfEmpty(buf[2:]), err
138 }
139 if isUTF16LittleEndianBOM2(buf) {
140 return UTF16LittleEndian, nilIfEmpty(buf[2:]), err
141 }
142
143 return Unknown, nilIfEmpty(buf), err
144 }
145
146 func readBOM(rd io.Reader) (buf []byte, err error) {
147 const maxBOMSize = 4
148 var bom [maxBOMSize]byte // used to read BOM
149
150 // read as many bytes as possible
151 for nEmpty, n := 0, 0; err == nil && len(buf) < maxBOMSize; buf = bom[:len(buf)+n] {
152 if n, err = rd.Read(bom[len(buf):]); n < 0 {
153 panic(errNegativeRead)
154 }
155 if n > 0 {
156 nEmpty = 0
157 } else {
158 nEmpty++
159 if nEmpty >= maxConsecutiveEmptyReads {
160 err = io.ErrNoProgress
161 }
162 }
163 }
164 return
165 }
166
// isUTF32BigEndianBOM4 reports whether buf starts with the UTF-32
// big-endian BOM (00 00 FE FF). The caller must ensure buf holds at least
// four bytes.
func isUTF32BigEndianBOM4(buf []byte) bool {
	for i, want := range [...]byte{0x00, 0x00, 0xFE, 0xFF} {
		if buf[i] != want {
			return false
		}
	}
	return true
}
170
// isUTF32LittleEndianBOM4 reports whether buf starts with the UTF-32
// little-endian BOM (FF FE 00 00). The caller must ensure buf holds at
// least four bytes.
func isUTF32LittleEndianBOM4(buf []byte) bool {
	for i, want := range [...]byte{0xFF, 0xFE, 0x00, 0x00} {
		if buf[i] != want {
			return false
		}
	}
	return true
}
174
// isUTF8BOM3 reports whether buf starts with the UTF-8 BOM (EF BB BF).
// The caller must ensure buf holds at least three bytes.
func isUTF8BOM3(buf []byte) bool {
	for i, want := range [...]byte{0xEF, 0xBB, 0xBF} {
		if buf[i] != want {
			return false
		}
	}
	return true
}
178
// isUTF16BigEndianBOM2 reports whether buf starts with the UTF-16
// big-endian BOM (FE FF). The caller must ensure buf holds at least two
// bytes.
func isUTF16BigEndianBOM2(buf []byte) bool {
	if buf[0] != 0xFE {
		return false
	}
	return buf[1] == 0xFF
}
182
// isUTF16LittleEndianBOM2 reports whether buf starts with the UTF-16
// little-endian BOM (FF FE). The caller must ensure buf holds at least two
// bytes.
func isUTF16LittleEndianBOM2(buf []byte) bool {
	if buf[0] != 0xFF {
		return false
	}
	return buf[1] == 0xFE
}
186
// nilIfEmpty returns buf unchanged when it contains at least one byte and
// nil otherwise, so that an empty slice and a nil slice are treated alike.
func nilIfEmpty(buf []byte) []byte {
	if len(buf) == 0 {
		return nil
	}
	return buf
}
193