bzip2.mx raw

   1  // Copyright 2011 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  // Package bzip2 implements bzip2 decompression.
   6  package bzip2
   7  
   8  import "io"
   9  
  10  // There's no RFC for bzip2. I used the Wikipedia page for reference and a lot
  11  // of guessing: https://en.wikipedia.org/wiki/Bzip2
  12  // The source code to pyflate was useful for debugging:
  13  // http://www.paul.sladen.org/projects/pyflate
  14  
  15  // A StructuralError is returned when the bzip2 data is found to be
  16  // syntactically invalid.
  17  type StructuralError string
  18  
  19  func (s StructuralError) Error() string {
  20  	return "bzip2 data invalid: " + string(s)
  21  }
  22  
  23  // A reader decompresses bzip2 compressed data.
  24  type reader struct {
  25  	br           bitReader
  26  	fileCRC      uint32
  27  	blockCRC     uint32
  28  	wantBlockCRC uint32
  29  	setupDone    bool // true if we have parsed the bzip2 header.
  30  	eof          bool
  31  	blockSize    int       // blockSize in bytes, i.e. 900 * 1000.
  32  	c            [256]uint // the ``C'' array for the inverse BWT.
  33  	tt           []uint32  // mirrors the ``tt'' array in the bzip2 source and contains the P array in the upper 24 bits.
  34  	tPos         uint32    // Index of the next output byte in tt.
  35  
  36  	preRLE      []uint32 // contains the RLE data still to be processed.
  37  	preRLEUsed  int      // number of entries of preRLE used.
  38  	lastByte    int      // the last byte value seen.
  39  	byteRepeats uint     // the number of repeats of lastByte seen.
  40  	repeats     uint     // the number of copies of lastByte to output.
  41  }
  42  
  43  // NewReader returns an [io.Reader] which decompresses bzip2 data from r.
  44  // If r does not also implement [io.ByteReader],
  45  // the decompressor may read more data than necessary from r.
  46  func NewReader(r io.Reader) io.Reader {
  47  	bz2 := &reader{}
  48  	bz2.br = newBitReader(r)
  49  	return bz2
  50  }
  51  
  52  const bzip2FileMagic = 0x425a // "BZ"
  53  const bzip2BlockMagic = 0x314159265359
  54  const bzip2FinalMagic = 0x177245385090
  55  
  56  // setup parses the bzip2 header.
  57  func (bz2 *reader) setup(needMagic bool) error {
  58  	br := &bz2.br
  59  
  60  	if needMagic {
  61  		magic := br.ReadBits(16)
  62  		if magic != bzip2FileMagic {
  63  			return StructuralError("bad magic value")
  64  		}
  65  	}
  66  
  67  	t := br.ReadBits(8)
  68  	if t != 'h' {
  69  		return StructuralError("non-Huffman entropy encoding")
  70  	}
  71  
  72  	level := br.ReadBits(8)
  73  	if level < '1' || level > '9' {
  74  		return StructuralError("invalid compression level")
  75  	}
  76  
  77  	bz2.fileCRC = 0
  78  	bz2.blockSize = 100 * 1000 * (level - '0')
  79  	if bz2.blockSize > len(bz2.tt) {
  80  		bz2.tt = []uint32{:bz2.blockSize}
  81  	}
  82  	return nil
  83  }
  84  
  85  func (bz2 *reader) Read(buf []byte) (n int, err error) {
  86  	if bz2.eof {
  87  		return 0, io.EOF
  88  	}
  89  
  90  	if !bz2.setupDone {
  91  		err = bz2.setup(true)
  92  		brErr := bz2.br.Err()
  93  		if brErr != nil {
  94  			err = brErr
  95  		}
  96  		if err != nil {
  97  			return 0, err
  98  		}
  99  		bz2.setupDone = true
 100  	}
 101  
 102  	n, err = bz2.read(buf)
 103  	brErr := bz2.br.Err()
 104  	if brErr != nil {
 105  		err = brErr
 106  	}
 107  	return
 108  }
 109  
 110  func (bz2 *reader) readFromBlock(buf []byte) int {
 111  	// bzip2 is a block based compressor, except that it has a run-length
 112  	// preprocessing step. The block based nature means that we can
 113  	// preallocate fixed-size buffers and reuse them. However, the RLE
 114  	// preprocessing would require allocating huge buffers to store the
 115  	// maximum expansion. Thus we process blocks all at once, except for
 116  	// the RLE which we decompress as required.
 117  	n := 0
 118  	for (bz2.repeats > 0 || bz2.preRLEUsed < len(bz2.preRLE)) && n < len(buf) {
 119  		// We have RLE data pending.
 120  
 121  		// The run-length encoding works like this:
 122  		// Any sequence of four equal bytes is followed by a length
 123  		// byte which contains the number of repeats of that byte to
 124  		// include. (The number of repeats can be zero.) Because we are
 125  		// decompressing on-demand our state is kept in the reader
 126  		// object.
 127  
 128  		if bz2.repeats > 0 {
 129  			buf[n] = byte(bz2.lastByte)
 130  			n++
 131  			bz2.repeats--
 132  			if bz2.repeats == 0 {
 133  				bz2.lastByte = -1
 134  			}
 135  			continue
 136  		}
 137  
 138  		bz2.tPos = bz2.preRLE[bz2.tPos]
 139  		b := byte(bz2.tPos)
 140  		bz2.tPos >>= 8
 141  		bz2.preRLEUsed++
 142  
 143  		if bz2.byteRepeats == 3 {
 144  			bz2.repeats = uint(b)
 145  			bz2.byteRepeats = 0
 146  			continue
 147  		}
 148  
 149  		if bz2.lastByte == int(b) {
 150  			bz2.byteRepeats++
 151  		} else {
 152  			bz2.byteRepeats = 0
 153  		}
 154  		bz2.lastByte = int(b)
 155  
 156  		buf[n] = b
 157  		n++
 158  	}
 159  
 160  	return n
 161  }
 162  
 163  func (bz2 *reader) read(buf []byte) (int, error) {
 164  	for {
 165  		n := bz2.readFromBlock(buf)
 166  		if n > 0 || len(buf) == 0 {
 167  			bz2.blockCRC = updateCRC(bz2.blockCRC, buf[:n])
 168  			return n, nil
 169  		}
 170  
 171  		// End of block. Check CRC.
 172  		if bz2.blockCRC != bz2.wantBlockCRC {
 173  			bz2.br.err = StructuralError("block checksum mismatch")
 174  			return 0, bz2.br.err
 175  		}
 176  
 177  		// Find next block.
 178  		br := &bz2.br
 179  		switch br.ReadBits64(48) {
 180  		default:
 181  			return 0, StructuralError("bad magic value found")
 182  
 183  		case bzip2BlockMagic:
 184  			// Start of block.
 185  			err := bz2.readBlock()
 186  			if err != nil {
 187  				return 0, err
 188  			}
 189  
 190  		case bzip2FinalMagic:
 191  			// Check end-of-file CRC.
 192  			wantFileCRC := uint32(br.ReadBits64(32))
 193  			if br.err != nil {
 194  				return 0, br.err
 195  			}
 196  			if bz2.fileCRC != wantFileCRC {
 197  				br.err = StructuralError("file checksum mismatch")
 198  				return 0, br.err
 199  			}
 200  
 201  			// Skip ahead to byte boundary.
 202  			// Is there a file concatenated to this one?
 203  			// It would start with BZ.
 204  			if br.bits%8 != 0 {
 205  				br.ReadBits(br.bits % 8)
 206  			}
 207  			b, err := br.r.ReadByte()
 208  			if err == io.EOF {
 209  				br.err = io.EOF
 210  				bz2.eof = true
 211  				return 0, io.EOF
 212  			}
 213  			if err != nil {
 214  				br.err = err
 215  				return 0, err
 216  			}
 217  			z, err := br.r.ReadByte()
 218  			if err != nil {
 219  				if err == io.EOF {
 220  					err = io.ErrUnexpectedEOF
 221  				}
 222  				br.err = err
 223  				return 0, err
 224  			}
 225  			if b != 'B' || z != 'Z' {
 226  				return 0, StructuralError("bad magic value in continuation file")
 227  			}
 228  			if err := bz2.setup(false); err != nil {
 229  				return 0, err
 230  			}
 231  		}
 232  	}
 233  }
 234  
 235  // readBlock reads a bzip2 block. The magic number should already have been consumed.
 236  func (bz2 *reader) readBlock() (err error) {
 237  	br := &bz2.br
 238  	bz2.wantBlockCRC = uint32(br.ReadBits64(32)) // skip checksum. TODO: check it if we can figure out what it is.
 239  	bz2.blockCRC = 0
 240  	bz2.fileCRC = (bz2.fileCRC<<1 | bz2.fileCRC>>31) ^ bz2.wantBlockCRC
 241  	randomized := br.ReadBits(1)
 242  	if randomized != 0 {
 243  		return StructuralError("deprecated randomized files")
 244  	}
 245  	origPtr := uint(br.ReadBits(24))
 246  
 247  	// If not every byte value is used in the block (i.e., it's text) then
 248  	// the symbol set is reduced. The symbols used are stored as a
 249  	// two-level, 16x16 bitmap.
 250  	symbolRangeUsedBitmap := br.ReadBits(16)
 251  	symbolPresent := []bool{:256}
 252  	numSymbols := 0
 253  	for symRange := uint(0); symRange < 16; symRange++ {
 254  		if symbolRangeUsedBitmap&(1<<(15-symRange)) != 0 {
 255  			bits := br.ReadBits(16)
 256  			for symbol := uint(0); symbol < 16; symbol++ {
 257  				if bits&(1<<(15-symbol)) != 0 {
 258  					symbolPresent[16*symRange+symbol] = true
 259  					numSymbols++
 260  				}
 261  			}
 262  		}
 263  	}
 264  
 265  	if numSymbols == 0 {
 266  		// There must be an EOF symbol.
 267  		return StructuralError("no symbols in input")
 268  	}
 269  
 270  	// A block uses between two and six different Huffman trees.
 271  	numHuffmanTrees := br.ReadBits(3)
 272  	if numHuffmanTrees < 2 || numHuffmanTrees > 6 {
 273  		return StructuralError("invalid number of Huffman trees")
 274  	}
 275  
 276  	// The Huffman tree can switch every 50 symbols so there's a list of
 277  	// tree indexes telling us which tree to use for each 50 symbol block.
 278  	numSelectors := br.ReadBits(15)
 279  	treeIndexes := []uint8{:numSelectors}
 280  
 281  	// The tree indexes are move-to-front transformed and stored as unary
 282  	// numbers.
 283  	mtfTreeDecoder := newMTFDecoderWithRange(numHuffmanTrees)
 284  	for i := range treeIndexes {
 285  		c := 0
 286  		for {
 287  			inc := br.ReadBits(1)
 288  			if inc == 0 {
 289  				break
 290  			}
 291  			c++
 292  		}
 293  		if c >= numHuffmanTrees {
 294  			return StructuralError("tree index too large")
 295  		}
 296  		treeIndexes[i] = mtfTreeDecoder.Decode(c)
 297  	}
 298  
 299  	// The list of symbols for the move-to-front transform is taken from
 300  	// the previously decoded symbol bitmap.
 301  	symbols := []byte{:numSymbols}
 302  	nextSymbol := 0
 303  	for i := 0; i < 256; i++ {
 304  		if symbolPresent[i] {
 305  			symbols[nextSymbol] = byte(i)
 306  			nextSymbol++
 307  		}
 308  	}
 309  	mtf := newMTFDecoder(symbols)
 310  
 311  	numSymbols += 2 // to account for RUNA and RUNB symbols
 312  	huffmanTrees := []huffmanTree{:numHuffmanTrees}
 313  
 314  	// Now we decode the arrays of code-lengths for each tree.
 315  	lengths := []uint8{:numSymbols}
 316  	for i := range huffmanTrees {
 317  		// The code lengths are delta encoded from a 5-bit base value.
 318  		length := br.ReadBits(5)
 319  		for j := range lengths {
 320  			for {
 321  				if length < 1 || length > 20 {
 322  					return StructuralError("Huffman length out of range")
 323  				}
 324  				if !br.ReadBit() {
 325  					break
 326  				}
 327  				if br.ReadBit() {
 328  					length--
 329  				} else {
 330  					length++
 331  				}
 332  			}
 333  			lengths[j] = uint8(length)
 334  		}
 335  		huffmanTrees[i], err = newHuffmanTree(lengths)
 336  		if err != nil {
 337  			return err
 338  		}
 339  	}
 340  
 341  	selectorIndex := 1 // the next tree index to use
 342  	if len(treeIndexes) == 0 {
 343  		return StructuralError("no tree selectors given")
 344  	}
 345  	if int(treeIndexes[0]) >= len(huffmanTrees) {
 346  		return StructuralError("tree selector out of range")
 347  	}
 348  	currentHuffmanTree := huffmanTrees[treeIndexes[0]]
 349  	bufIndex := 0 // indexes bz2.buf, the output buffer.
 350  	// The output of the move-to-front transform is run-length encoded and
 351  	// we merge the decoding into the Huffman parsing loop. These two
 352  	// variables accumulate the repeat count. See the Wikipedia page for
 353  	// details.
 354  	repeat := 0
 355  	repeatPower := 0
 356  
 357  	// The `C' array (used by the inverse BWT) needs to be zero initialized.
 358  	clear(bz2.c[:])
 359  
 360  	decoded := 0 // counts the number of symbols decoded by the current tree.
 361  	for {
 362  		if decoded == 50 {
 363  			if selectorIndex >= numSelectors {
 364  				return StructuralError("insufficient selector indices for number of symbols")
 365  			}
 366  			if int(treeIndexes[selectorIndex]) >= len(huffmanTrees) {
 367  				return StructuralError("tree selector out of range")
 368  			}
 369  			currentHuffmanTree = huffmanTrees[treeIndexes[selectorIndex]]
 370  			selectorIndex++
 371  			decoded = 0
 372  		}
 373  
 374  		v := currentHuffmanTree.Decode(br)
 375  		decoded++
 376  
 377  		if v < 2 {
 378  			// This is either the RUNA or RUNB symbol.
 379  			if repeat == 0 {
 380  				repeatPower = 1
 381  			}
 382  			repeat += repeatPower << v
 383  			repeatPower <<= 1
 384  
 385  			// This limit of 2 million comes from the bzip2 source
 386  			// code. It prevents repeat from overflowing.
 387  			if repeat > 2*1024*1024 {
 388  				return StructuralError("repeat count too large")
 389  			}
 390  			continue
 391  		}
 392  
 393  		if repeat > 0 {
 394  			// We have decoded a complete run-length so we need to
 395  			// replicate the last output symbol.
 396  			if repeat > bz2.blockSize-bufIndex {
 397  				return StructuralError("repeats past end of block")
 398  			}
 399  			for i := 0; i < repeat; i++ {
 400  				b := mtf.First()
 401  				bz2.tt[bufIndex] = uint32(b)
 402  				bz2.c[b]++
 403  				bufIndex++
 404  			}
 405  			repeat = 0
 406  		}
 407  
 408  		if int(v) == numSymbols-1 {
 409  			// This is the EOF symbol. Because it's always at the
 410  			// end of the move-to-front list, and never gets moved
 411  			// to the front, it has this unique value.
 412  			break
 413  		}
 414  
 415  		// Since two metasymbols (RUNA and RUNB) have values 0 and 1,
 416  		// one would expect |v-2| to be passed to the MTF decoder.
 417  		// However, the front of the MTF list is never referenced as 0,
 418  		// it's always referenced with a run-length of 1. Thus 0
 419  		// doesn't need to be encoded and we have |v-1| in the next
 420  		// line.
 421  		b := mtf.Decode(int(v - 1))
 422  		if bufIndex >= bz2.blockSize {
 423  			return StructuralError("data exceeds block size")
 424  		}
 425  		bz2.tt[bufIndex] = uint32(b)
 426  		bz2.c[b]++
 427  		bufIndex++
 428  	}
 429  
 430  	if origPtr >= uint(bufIndex) {
 431  		return StructuralError("origPtr out of bounds")
 432  	}
 433  
 434  	// We have completed the entropy decoding. Now we can perform the
 435  	// inverse BWT and setup the RLE buffer.
 436  	bz2.preRLE = bz2.tt[:bufIndex]
 437  	bz2.preRLEUsed = 0
 438  	bz2.tPos = inverseBWT(bz2.preRLE, origPtr, bz2.c[:])
 439  	bz2.lastByte = -1
 440  	bz2.byteRepeats = 0
 441  	bz2.repeats = 0
 442  
 443  	return nil
 444  }
 445  
 446  // inverseBWT implements the inverse Burrows-Wheeler transform as described in
 447  // http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2.
 448  // In that document, origPtr is called “I” and c is the “C” array after the
 449  // first pass over the data. It's an argument here because we merge the first
 450  // pass with the Huffman decoding.
 451  //
 452  // This also implements the “single array” method from the bzip2 source code
 453  // which leaves the output, still shuffled, in the bottom 8 bits of tt with the
 454  // index of the next byte in the top 24-bits. The index of the first byte is
 455  // returned.
 456  func inverseBWT(tt []uint32, origPtr uint, c []uint) uint32 {
 457  	sum := uint(0)
 458  	for i := 0; i < 256; i++ {
 459  		sum += c[i]
 460  		c[i] = sum - c[i]
 461  	}
 462  
 463  	for i := range tt {
 464  		b := tt[i] & 0xff
 465  		tt[c[b]] |= uint32(i) << 8
 466  		c[b]++
 467  	}
 468  
 469  	return tt[origPtr] >> 8
 470  }
 471  
 472  // This is a standard CRC32 like in hash/crc32 except that all the shifts are reversed,
 473  // causing the bits in the input to be processed in the reverse of the usual order.
 474  
 475  var crctab [256]uint32
 476  
 477  func init() {
 478  	const poly = 0x04C11DB7
 479  	for i := range crctab {
 480  		crc := uint32(i) << 24
 481  		for j := 0; j < 8; j++ {
 482  			if crc&0x80000000 != 0 {
 483  				crc = (crc << 1) ^ poly
 484  			} else {
 485  				crc <<= 1
 486  			}
 487  		}
 488  		crctab[i] = crc
 489  	}
 490  }
 491  
 492  // updateCRC updates the crc value to incorporate the data in b.
 493  // The initial value is 0.
 494  func updateCRC(val uint32, b []byte) uint32 {
 495  	crc := ^val
 496  	for _, v := range b {
 497  		crc = crctab[byte(crc>>24)^v] ^ (crc << 8)
 498  	}
 499  	return ^crc
 500  }
 501