huffman.mx raw

   1  // Copyright 2011 The Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  package bzip2
   6  
   7  import (
   8  	"cmp"
   9  	"slices"
  10  )
  11  
  12  // A huffmanTree is a binary tree which is navigated, bit-by-bit to reach a
  13  // symbol.
  14  type huffmanTree struct {
  15  	// nodes contains all the non-leaf nodes in the tree. nodes[0] is the
  16  	// root of the tree and nextNode contains the index of the next element
  17  	// of nodes to use when the tree is being constructed.
  18  	nodes    []huffmanNode
  19  	nextNode int
  20  }
  21  
  22  // A huffmanNode is a node in the tree. left and right contain indexes into the
  23  // nodes slice of the tree. If left or right is invalidNodeValue then the child
  24  // is a left node and its value is in leftValue/rightValue.
  25  //
  26  // The symbols are uint16s because bzip2 encodes not only MTF indexes in the
  27  // tree, but also two magic values for run-length encoding and an EOF symbol.
  28  // Thus there are more than 256 possible symbols.
  29  type huffmanNode struct {
  30  	left, right           uint16
  31  	leftValue, rightValue uint16
  32  }
  33  
  34  // invalidNodeValue is an invalid index which marks a leaf node in the tree.
  35  const invalidNodeValue = 0xffff
  36  
  37  // Decode reads bits from the given bitReader and navigates the tree until a
  38  // symbol is found.
  39  func (t *huffmanTree) Decode(br *bitReader) (v uint16) {
  40  	nodeIndex := uint16(0) // node 0 is the root of the tree.
  41  
  42  	for {
  43  		node := &t.nodes[nodeIndex]
  44  
  45  		var bit uint16
  46  		if br.bits > 0 {
  47  			// Get next bit - fast path.
  48  			br.bits--
  49  			bit = uint16(br.n>>(br.bits&63)) & 1
  50  		} else {
  51  			// Get next bit - slow path.
  52  			// Use ReadBits to retrieve a single bit
  53  			// from the underling io.ByteReader.
  54  			bit = uint16(br.ReadBits(1))
  55  		}
  56  
  57  		// Trick a compiler into generating conditional move instead of branch,
  58  		// by making both loads unconditional.
  59  		l, r := node.left, node.right
  60  
  61  		if bit == 1 {
  62  			nodeIndex = l
  63  		} else {
  64  			nodeIndex = r
  65  		}
  66  
  67  		if nodeIndex == invalidNodeValue {
  68  			// We found a leaf. Use the value of bit to decide
  69  			// whether is a left or a right value.
  70  			l, r := node.leftValue, node.rightValue
  71  			if bit == 1 {
  72  				v = l
  73  			} else {
  74  				v = r
  75  			}
  76  			return
  77  		}
  78  	}
  79  }
  80  
  81  // newHuffmanTree builds a Huffman tree from a slice containing the code
  82  // lengths of each symbol. The maximum code length is 32 bits.
  83  func newHuffmanTree(lengths []uint8) (huffmanTree, error) {
  84  	// There are many possible trees that assign the same code length to
  85  	// each symbol (consider reflecting a tree down the middle, for
  86  	// example). Since the code length assignments determine the
  87  	// efficiency of the tree, each of these trees is equally good. In
  88  	// order to minimize the amount of information needed to build a tree
  89  	// bzip2 uses a canonical tree so that it can be reconstructed given
  90  	// only the code length assignments.
  91  
  92  	if len(lengths) < 2 {
  93  		panic("newHuffmanTree: too few symbols")
  94  	}
  95  
  96  	var t huffmanTree
  97  
  98  	// First we sort the code length assignments by ascending code length,
  99  	// using the symbol value to break ties.
 100  	pairs := []huffmanSymbolLengthPair{:len(lengths)}
 101  	for i, length := range lengths {
 102  		pairs[i].value = uint16(i)
 103  		pairs[i].length = length
 104  	}
 105  
 106  	slices.SortFunc(pairs, func(a, b huffmanSymbolLengthPair) int {
 107  		if c := cmp.Compare(a.length, b.length); c != 0 {
 108  			return c
 109  		}
 110  		return cmp.Compare(a.value, b.value)
 111  	})
 112  
 113  	// Now we assign codes to the symbols, starting with the longest code.
 114  	// We keep the codes packed into a uint32, at the most-significant end.
 115  	// So branches are taken from the MSB downwards. This makes it easy to
 116  	// sort them later.
 117  	code := uint32(0)
 118  	length := uint8(32)
 119  
 120  	codes := []huffmanCode{:len(lengths)}
 121  	for i := len(pairs) - 1; i >= 0; i-- {
 122  		if length > pairs[i].length {
 123  			length = pairs[i].length
 124  		}
 125  		codes[i].code = code
 126  		codes[i].codeLen = length
 127  		codes[i].value = pairs[i].value
 128  		// We need to 'increment' the code, which means treating |code|
 129  		// like a |length| bit number.
 130  		code += 1 << (32 - length)
 131  	}
 132  
 133  	// Now we can sort by the code so that the left half of each branch are
 134  	// grouped together, recursively.
 135  	slices.SortFunc(codes, func(a, b huffmanCode) int {
 136  		return cmp.Compare(a.code, b.code)
 137  	})
 138  
 139  	t.nodes = []huffmanNode{:len(codes)}
 140  	_, err := buildHuffmanNode(&t, codes, 0)
 141  	return t, err
 142  }
 143  
 144  // huffmanSymbolLengthPair contains a symbol and its code length.
 145  type huffmanSymbolLengthPair struct {
 146  	value  uint16
 147  	length uint8
 148  }
 149  
 150  // huffmanCode contains a symbol, its code and code length.
 151  type huffmanCode struct {
 152  	code    uint32
 153  	codeLen uint8
 154  	value   uint16
 155  }
 156  
 157  // buildHuffmanNode takes a slice of sorted huffmanCodes and builds a node in
 158  // the Huffman tree at the given level. It returns the index of the newly
 159  // constructed node.
 160  func buildHuffmanNode(t *huffmanTree, codes []huffmanCode, level uint32) (nodeIndex uint16, err error) {
 161  	test := uint32(1) << (31 - level)
 162  
 163  	// We have to search the list of codes to find the divide between the left and right sides.
 164  	firstRightIndex := len(codes)
 165  	for i, code := range codes {
 166  		if code.code&test != 0 {
 167  			firstRightIndex = i
 168  			break
 169  		}
 170  	}
 171  
 172  	left := codes[:firstRightIndex]
 173  	right := codes[firstRightIndex:]
 174  
 175  	if len(left) == 0 || len(right) == 0 {
 176  		// There is a superfluous level in the Huffman tree indicating
 177  		// a bug in the encoder. However, this bug has been observed in
 178  		// the wild so we handle it.
 179  
 180  		// If this function was called recursively then we know that
 181  		// len(codes) >= 2 because, otherwise, we would have hit the
 182  		// "leaf node" case, below, and not recurred.
 183  		//
 184  		// However, for the initial call it's possible that len(codes)
 185  		// is zero or one. Both cases are invalid because a zero length
 186  		// tree cannot encode anything and a length-1 tree can only
 187  		// encode EOF and so is superfluous. We reject both.
 188  		if len(codes) < 2 {
 189  			return 0, StructuralError("empty Huffman tree")
 190  		}
 191  
 192  		// In this case the recursion doesn't always reduce the length
 193  		// of codes so we need to ensure termination via another
 194  		// mechanism.
 195  		if level == 31 {
 196  			// Since len(codes) >= 2 the only way that the values
 197  			// can match at all 32 bits is if they are equal, which
 198  			// is invalid. This ensures that we never enter
 199  			// infinite recursion.
 200  			return 0, StructuralError("equal symbols in Huffman tree")
 201  		}
 202  
 203  		if len(left) == 0 {
 204  			return buildHuffmanNode(t, right, level+1)
 205  		}
 206  		return buildHuffmanNode(t, left, level+1)
 207  	}
 208  
 209  	nodeIndex = uint16(t.nextNode)
 210  	node := &t.nodes[t.nextNode]
 211  	t.nextNode++
 212  
 213  	if len(left) == 1 {
 214  		// leaf node
 215  		node.left = invalidNodeValue
 216  		node.leftValue = left[0].value
 217  	} else {
 218  		node.left, err = buildHuffmanNode(t, left, level+1)
 219  	}
 220  
 221  	if err != nil {
 222  		return
 223  	}
 224  
 225  	if len(right) == 1 {
 226  		// leaf node
 227  		node.right = invalidNodeValue
 228  		node.rightValue = right[0].value
 229  	} else {
 230  		node.right, err = buildHuffmanNode(t, right, level+1)
 231  	}
 232  
 233  	return
 234  }
 235