1 // Copyright 2011 The Go Authors. All rights reserved.
2 // Use of this source code is governed by a BSD-style
3 // license that can be found in the LICENSE file.
4 5 // Package bzip2 implements bzip2 decompression.
6 package bzip2
7 8 import "io"
9 10 // There's no RFC for bzip2. I used the Wikipedia page for reference and a lot
11 // of guessing: https://en.wikipedia.org/wiki/Bzip2
12 // The source code to pyflate was useful for debugging:
13 // http://www.paul.sladen.org/projects/pyflate
14 15 // A StructuralError is returned when the bzip2 data is found to be
16 // syntactically invalid.
17 type StructuralError string
18 19 func (s StructuralError) Error() string {
20 return "bzip2 data invalid: " + string(s)
21 }
22 23 // A reader decompresses bzip2 compressed data.
24 type reader struct {
25 br bitReader
26 fileCRC uint32
27 blockCRC uint32
28 wantBlockCRC uint32
29 setupDone bool // true if we have parsed the bzip2 header.
30 eof bool
31 blockSize int // blockSize in bytes, i.e. 900 * 1000.
32 c [256]uint // the ``C'' array for the inverse BWT.
33 tt []uint32 // mirrors the ``tt'' array in the bzip2 source and contains the P array in the upper 24 bits.
34 tPos uint32 // Index of the next output byte in tt.
35 36 preRLE []uint32 // contains the RLE data still to be processed.
37 preRLEUsed int // number of entries of preRLE used.
38 lastByte int // the last byte value seen.
39 byteRepeats uint // the number of repeats of lastByte seen.
40 repeats uint // the number of copies of lastByte to output.
41 }
42 43 // NewReader returns an [io.Reader] which decompresses bzip2 data from r.
44 // If r does not also implement [io.ByteReader],
45 // the decompressor may read more data than necessary from r.
46 func NewReader(r io.Reader) io.Reader {
47 bz2 := &reader{}
48 bz2.br = newBitReader(r)
49 return bz2
50 }
51 52 const bzip2FileMagic = 0x425a // "BZ"
53 const bzip2BlockMagic = 0x314159265359
54 const bzip2FinalMagic = 0x177245385090
55 56 // setup parses the bzip2 header.
57 func (bz2 *reader) setup(needMagic bool) error {
58 br := &bz2.br
59 60 if needMagic {
61 magic := br.ReadBits(16)
62 if magic != bzip2FileMagic {
63 return StructuralError("bad magic value")
64 }
65 }
66 67 t := br.ReadBits(8)
68 if t != 'h' {
69 return StructuralError("non-Huffman entropy encoding")
70 }
71 72 level := br.ReadBits(8)
73 if level < '1' || level > '9' {
74 return StructuralError("invalid compression level")
75 }
76 77 bz2.fileCRC = 0
78 bz2.blockSize = 100 * 1000 * (level - '0')
79 if bz2.blockSize > len(bz2.tt) {
80 bz2.tt = []uint32{:bz2.blockSize}
81 }
82 return nil
83 }
84 85 func (bz2 *reader) Read(buf []byte) (n int, err error) {
86 if bz2.eof {
87 return 0, io.EOF
88 }
89 90 if !bz2.setupDone {
91 err = bz2.setup(true)
92 brErr := bz2.br.Err()
93 if brErr != nil {
94 err = brErr
95 }
96 if err != nil {
97 return 0, err
98 }
99 bz2.setupDone = true
100 }
101 102 n, err = bz2.read(buf)
103 brErr := bz2.br.Err()
104 if brErr != nil {
105 err = brErr
106 }
107 return
108 }
109 110 func (bz2 *reader) readFromBlock(buf []byte) int {
111 // bzip2 is a block based compressor, except that it has a run-length
112 // preprocessing step. The block based nature means that we can
113 // preallocate fixed-size buffers and reuse them. However, the RLE
114 // preprocessing would require allocating huge buffers to store the
115 // maximum expansion. Thus we process blocks all at once, except for
116 // the RLE which we decompress as required.
117 n := 0
118 for (bz2.repeats > 0 || bz2.preRLEUsed < len(bz2.preRLE)) && n < len(buf) {
119 // We have RLE data pending.
120 121 // The run-length encoding works like this:
122 // Any sequence of four equal bytes is followed by a length
123 // byte which contains the number of repeats of that byte to
124 // include. (The number of repeats can be zero.) Because we are
125 // decompressing on-demand our state is kept in the reader
126 // object.
127 128 if bz2.repeats > 0 {
129 buf[n] = byte(bz2.lastByte)
130 n++
131 bz2.repeats--
132 if bz2.repeats == 0 {
133 bz2.lastByte = -1
134 }
135 continue
136 }
137 138 bz2.tPos = bz2.preRLE[bz2.tPos]
139 b := byte(bz2.tPos)
140 bz2.tPos >>= 8
141 bz2.preRLEUsed++
142 143 if bz2.byteRepeats == 3 {
144 bz2.repeats = uint(b)
145 bz2.byteRepeats = 0
146 continue
147 }
148 149 if bz2.lastByte == int(b) {
150 bz2.byteRepeats++
151 } else {
152 bz2.byteRepeats = 0
153 }
154 bz2.lastByte = int(b)
155 156 buf[n] = b
157 n++
158 }
159 160 return n
161 }
162 163 func (bz2 *reader) read(buf []byte) (int, error) {
164 for {
165 n := bz2.readFromBlock(buf)
166 if n > 0 || len(buf) == 0 {
167 bz2.blockCRC = updateCRC(bz2.blockCRC, buf[:n])
168 return n, nil
169 }
170 171 // End of block. Check CRC.
172 if bz2.blockCRC != bz2.wantBlockCRC {
173 bz2.br.err = StructuralError("block checksum mismatch")
174 return 0, bz2.br.err
175 }
176 177 // Find next block.
178 br := &bz2.br
179 switch br.ReadBits64(48) {
180 default:
181 return 0, StructuralError("bad magic value found")
182 183 case bzip2BlockMagic:
184 // Start of block.
185 err := bz2.readBlock()
186 if err != nil {
187 return 0, err
188 }
189 190 case bzip2FinalMagic:
191 // Check end-of-file CRC.
192 wantFileCRC := uint32(br.ReadBits64(32))
193 if br.err != nil {
194 return 0, br.err
195 }
196 if bz2.fileCRC != wantFileCRC {
197 br.err = StructuralError("file checksum mismatch")
198 return 0, br.err
199 }
200 201 // Skip ahead to byte boundary.
202 // Is there a file concatenated to this one?
203 // It would start with BZ.
204 if br.bits%8 != 0 {
205 br.ReadBits(br.bits % 8)
206 }
207 b, err := br.r.ReadByte()
208 if err == io.EOF {
209 br.err = io.EOF
210 bz2.eof = true
211 return 0, io.EOF
212 }
213 if err != nil {
214 br.err = err
215 return 0, err
216 }
217 z, err := br.r.ReadByte()
218 if err != nil {
219 if err == io.EOF {
220 err = io.ErrUnexpectedEOF
221 }
222 br.err = err
223 return 0, err
224 }
225 if b != 'B' || z != 'Z' {
226 return 0, StructuralError("bad magic value in continuation file")
227 }
228 if err := bz2.setup(false); err != nil {
229 return 0, err
230 }
231 }
232 }
233 }
234 235 // readBlock reads a bzip2 block. The magic number should already have been consumed.
236 func (bz2 *reader) readBlock() (err error) {
237 br := &bz2.br
238 bz2.wantBlockCRC = uint32(br.ReadBits64(32)) // skip checksum. TODO: check it if we can figure out what it is.
239 bz2.blockCRC = 0
240 bz2.fileCRC = (bz2.fileCRC<<1 | bz2.fileCRC>>31) ^ bz2.wantBlockCRC
241 randomized := br.ReadBits(1)
242 if randomized != 0 {
243 return StructuralError("deprecated randomized files")
244 }
245 origPtr := uint(br.ReadBits(24))
246 247 // If not every byte value is used in the block (i.e., it's text) then
248 // the symbol set is reduced. The symbols used are stored as a
249 // two-level, 16x16 bitmap.
250 symbolRangeUsedBitmap := br.ReadBits(16)
251 symbolPresent := []bool{:256}
252 numSymbols := 0
253 for symRange := uint(0); symRange < 16; symRange++ {
254 if symbolRangeUsedBitmap&(1<<(15-symRange)) != 0 {
255 bits := br.ReadBits(16)
256 for symbol := uint(0); symbol < 16; symbol++ {
257 if bits&(1<<(15-symbol)) != 0 {
258 symbolPresent[16*symRange+symbol] = true
259 numSymbols++
260 }
261 }
262 }
263 }
264 265 if numSymbols == 0 {
266 // There must be an EOF symbol.
267 return StructuralError("no symbols in input")
268 }
269 270 // A block uses between two and six different Huffman trees.
271 numHuffmanTrees := br.ReadBits(3)
272 if numHuffmanTrees < 2 || numHuffmanTrees > 6 {
273 return StructuralError("invalid number of Huffman trees")
274 }
275 276 // The Huffman tree can switch every 50 symbols so there's a list of
277 // tree indexes telling us which tree to use for each 50 symbol block.
278 numSelectors := br.ReadBits(15)
279 treeIndexes := []uint8{:numSelectors}
280 281 // The tree indexes are move-to-front transformed and stored as unary
282 // numbers.
283 mtfTreeDecoder := newMTFDecoderWithRange(numHuffmanTrees)
284 for i := range treeIndexes {
285 c := 0
286 for {
287 inc := br.ReadBits(1)
288 if inc == 0 {
289 break
290 }
291 c++
292 }
293 if c >= numHuffmanTrees {
294 return StructuralError("tree index too large")
295 }
296 treeIndexes[i] = mtfTreeDecoder.Decode(c)
297 }
298 299 // The list of symbols for the move-to-front transform is taken from
300 // the previously decoded symbol bitmap.
301 symbols := []byte{:numSymbols}
302 nextSymbol := 0
303 for i := 0; i < 256; i++ {
304 if symbolPresent[i] {
305 symbols[nextSymbol] = byte(i)
306 nextSymbol++
307 }
308 }
309 mtf := newMTFDecoder(symbols)
310 311 numSymbols += 2 // to account for RUNA and RUNB symbols
312 huffmanTrees := []huffmanTree{:numHuffmanTrees}
313 314 // Now we decode the arrays of code-lengths for each tree.
315 lengths := []uint8{:numSymbols}
316 for i := range huffmanTrees {
317 // The code lengths are delta encoded from a 5-bit base value.
318 length := br.ReadBits(5)
319 for j := range lengths {
320 for {
321 if length < 1 || length > 20 {
322 return StructuralError("Huffman length out of range")
323 }
324 if !br.ReadBit() {
325 break
326 }
327 if br.ReadBit() {
328 length--
329 } else {
330 length++
331 }
332 }
333 lengths[j] = uint8(length)
334 }
335 huffmanTrees[i], err = newHuffmanTree(lengths)
336 if err != nil {
337 return err
338 }
339 }
340 341 selectorIndex := 1 // the next tree index to use
342 if len(treeIndexes) == 0 {
343 return StructuralError("no tree selectors given")
344 }
345 if int(treeIndexes[0]) >= len(huffmanTrees) {
346 return StructuralError("tree selector out of range")
347 }
348 currentHuffmanTree := huffmanTrees[treeIndexes[0]]
349 bufIndex := 0 // indexes bz2.buf, the output buffer.
350 // The output of the move-to-front transform is run-length encoded and
351 // we merge the decoding into the Huffman parsing loop. These two
352 // variables accumulate the repeat count. See the Wikipedia page for
353 // details.
354 repeat := 0
355 repeatPower := 0
356 357 // The `C' array (used by the inverse BWT) needs to be zero initialized.
358 clear(bz2.c[:])
359 360 decoded := 0 // counts the number of symbols decoded by the current tree.
361 for {
362 if decoded == 50 {
363 if selectorIndex >= numSelectors {
364 return StructuralError("insufficient selector indices for number of symbols")
365 }
366 if int(treeIndexes[selectorIndex]) >= len(huffmanTrees) {
367 return StructuralError("tree selector out of range")
368 }
369 currentHuffmanTree = huffmanTrees[treeIndexes[selectorIndex]]
370 selectorIndex++
371 decoded = 0
372 }
373 374 v := currentHuffmanTree.Decode(br)
375 decoded++
376 377 if v < 2 {
378 // This is either the RUNA or RUNB symbol.
379 if repeat == 0 {
380 repeatPower = 1
381 }
382 repeat += repeatPower << v
383 repeatPower <<= 1
384 385 // This limit of 2 million comes from the bzip2 source
386 // code. It prevents repeat from overflowing.
387 if repeat > 2*1024*1024 {
388 return StructuralError("repeat count too large")
389 }
390 continue
391 }
392 393 if repeat > 0 {
394 // We have decoded a complete run-length so we need to
395 // replicate the last output symbol.
396 if repeat > bz2.blockSize-bufIndex {
397 return StructuralError("repeats past end of block")
398 }
399 for i := 0; i < repeat; i++ {
400 b := mtf.First()
401 bz2.tt[bufIndex] = uint32(b)
402 bz2.c[b]++
403 bufIndex++
404 }
405 repeat = 0
406 }
407 408 if int(v) == numSymbols-1 {
409 // This is the EOF symbol. Because it's always at the
410 // end of the move-to-front list, and never gets moved
411 // to the front, it has this unique value.
412 break
413 }
414 415 // Since two metasymbols (RUNA and RUNB) have values 0 and 1,
416 // one would expect |v-2| to be passed to the MTF decoder.
417 // However, the front of the MTF list is never referenced as 0,
418 // it's always referenced with a run-length of 1. Thus 0
419 // doesn't need to be encoded and we have |v-1| in the next
420 // line.
421 b := mtf.Decode(int(v - 1))
422 if bufIndex >= bz2.blockSize {
423 return StructuralError("data exceeds block size")
424 }
425 bz2.tt[bufIndex] = uint32(b)
426 bz2.c[b]++
427 bufIndex++
428 }
429 430 if origPtr >= uint(bufIndex) {
431 return StructuralError("origPtr out of bounds")
432 }
433 434 // We have completed the entropy decoding. Now we can perform the
435 // inverse BWT and setup the RLE buffer.
436 bz2.preRLE = bz2.tt[:bufIndex]
437 bz2.preRLEUsed = 0
438 bz2.tPos = inverseBWT(bz2.preRLE, origPtr, bz2.c[:])
439 bz2.lastByte = -1
440 bz2.byteRepeats = 0
441 bz2.repeats = 0
442 443 return nil
444 }
445 446 // inverseBWT implements the inverse Burrows-Wheeler transform as described in
447 // http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2.
448 // In that document, origPtr is called “I” and c is the “C” array after the
449 // first pass over the data. It's an argument here because we merge the first
450 // pass with the Huffman decoding.
451 //
452 // This also implements the “single array” method from the bzip2 source code
453 // which leaves the output, still shuffled, in the bottom 8 bits of tt with the
454 // index of the next byte in the top 24-bits. The index of the first byte is
455 // returned.
456 func inverseBWT(tt []uint32, origPtr uint, c []uint) uint32 {
457 sum := uint(0)
458 for i := 0; i < 256; i++ {
459 sum += c[i]
460 c[i] = sum - c[i]
461 }
462 463 for i := range tt {
464 b := tt[i] & 0xff
465 tt[c[b]] |= uint32(i) << 8
466 c[b]++
467 }
468 469 return tt[origPtr] >> 8
470 }
471 472 // This is a standard CRC32 like in hash/crc32 except that all the shifts are reversed,
473 // causing the bits in the input to be processed in the reverse of the usual order.
474 475 var crctab [256]uint32
476 477 func init() {
478 const poly = 0x04C11DB7
479 for i := range crctab {
480 crc := uint32(i) << 24
481 for j := 0; j < 8; j++ {
482 if crc&0x80000000 != 0 {
483 crc = (crc << 1) ^ poly
484 } else {
485 crc <<= 1
486 }
487 }
488 crctab[i] = crc
489 }
490 }
491 492 // updateCRC updates the crc value to incorporate the data in b.
493 // The initial value is 0.
494 func updateCRC(val uint32, b []byte) uint32 {
495 crc := ^val
496 for _, v := range b {
497 crc = crctab[byte(crc>>24)^v] ^ (crc << 8)
498 }
499 return ^crc
500 }
501