fse.go raw

   1  // Copyright 2018 Klaus Post. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  // Based on work Copyright (c) 2013, Yann Collet, released under BSD License.
   5  
   6  // Package fse provides Finite State Entropy encoding and decoding.
   7  //
   8  // Finite State Entropy encoding provides a fast near-optimal symbol encoding/decoding
   9  // for byte blocks as implemented in zstd.
  10  //
  11  // See https://github.com/klauspost/compress/tree/master/fse for more information.
  12  package fse
  13  
  14  import (
  15  	"errors"
  16  	"fmt"
  17  	"math/bits"
  18  )
  19  
  20  const (
  21  	/*!MEMORY_USAGE :
  22  	 *  Memory usage formula : N->2^N Bytes (examples : 10 -> 1KB; 12 -> 4KB ; 16 -> 64KB; 20 -> 1MB; etc.)
  23  	 *  Increasing memory usage improves compression ratio
  24  	 *  Reduced memory usage can improve speed, due to cache effect
  25  	 *  Recommended max value is 14, for 16KB, which nicely fits into Intel x86 L1 cache */
  26  	maxMemoryUsage     = 14
  27  	defaultMemoryUsage = 13
  28  
  29  	maxTableLog     = maxMemoryUsage - 2
  30  	maxTablesize    = 1 << maxTableLog
  31  	defaultTablelog = defaultMemoryUsage - 2
  32  	minTablelog     = 5
  33  	maxSymbolValue  = 255
  34  )
  35  
  36  var (
  37  	// ErrIncompressible is returned when input is judged to be too hard to compress.
  38  	ErrIncompressible = errors.New("input is not compressible")
  39  
  40  	// ErrUseRLE is returned from the compressor when the input is a single byte value repeated.
  41  	ErrUseRLE = errors.New("input is single value repeated")
  42  )
  43  
  44  // Scratch provides temporary storage for compression and decompression.
  45  type Scratch struct {
  46  	// Private
  47  	count    [maxSymbolValue + 1]uint32
  48  	norm     [maxSymbolValue + 1]int16
  49  	br       byteReader
  50  	bits     bitReader
  51  	bw       bitWriter
  52  	ct       cTable      // Compression tables.
  53  	decTable []decSymbol // Decompression table.
  54  	maxCount int         // count of the most probable symbol
  55  
  56  	// Per block parameters.
  57  	// These can be used to override compression parameters of the block.
  58  	// Do not touch, unless you know what you are doing.
  59  
  60  	// Out is output buffer.
  61  	// If the scratch is re-used before the caller is done processing the output,
  62  	// set this field to nil.
  63  	// Otherwise the output buffer will be re-used for next Compression/Decompression step
  64  	// and allocation will be avoided.
  65  	Out []byte
  66  
  67  	// DecompressLimit limits the maximum decoded size acceptable.
  68  	// If > 0 decompression will stop when approximately this many bytes
  69  	// has been decoded.
  70  	// If 0, maximum size will be 2GB.
  71  	DecompressLimit int
  72  
  73  	symbolLen      uint16 // Length of active part of the symbol table.
  74  	actualTableLog uint8  // Selected tablelog.
  75  	zeroBits       bool   // no bits has prob > 50%.
  76  	clearCount     bool   // clear count
  77  
  78  	// MaxSymbolValue will override the maximum symbol value of the next block.
  79  	MaxSymbolValue uint8
  80  
  81  	// TableLog will attempt to override the tablelog for the next block.
  82  	TableLog uint8
  83  }
  84  
  85  // Histogram allows to populate the histogram and skip that step in the compression,
  86  // It otherwise allows to inspect the histogram when compression is done.
  87  // To indicate that you have populated the histogram call HistogramFinished
  88  // with the value of the highest populated symbol, as well as the number of entries
  89  // in the most populated entry. These are accepted at face value.
  90  // The returned slice will always be length 256.
  91  func (s *Scratch) Histogram() []uint32 {
  92  	return s.count[:]
  93  }
  94  
  95  // HistogramFinished can be called to indicate that the histogram has been populated.
  96  // maxSymbol is the index of the highest set symbol of the next data segment.
  97  // maxCount is the number of entries in the most populated entry.
  98  // These are accepted at face value.
  99  func (s *Scratch) HistogramFinished(maxSymbol uint8, maxCount int) {
 100  	s.maxCount = maxCount
 101  	s.symbolLen = uint16(maxSymbol) + 1
 102  	s.clearCount = maxCount != 0
 103  }
 104  
 105  // prepare will prepare and allocate scratch tables used for both compression and decompression.
 106  func (s *Scratch) prepare(in []byte) (*Scratch, error) {
 107  	if s == nil {
 108  		s = &Scratch{}
 109  	}
 110  	if s.MaxSymbolValue == 0 {
 111  		s.MaxSymbolValue = 255
 112  	}
 113  	if s.TableLog == 0 {
 114  		s.TableLog = defaultTablelog
 115  	}
 116  	if s.TableLog > maxTableLog {
 117  		return nil, fmt.Errorf("tableLog (%d) > maxTableLog (%d)", s.TableLog, maxTableLog)
 118  	}
 119  	if cap(s.Out) == 0 {
 120  		s.Out = make([]byte, 0, len(in))
 121  	}
 122  	if s.clearCount && s.maxCount == 0 {
 123  		for i := range s.count {
 124  			s.count[i] = 0
 125  		}
 126  		s.clearCount = false
 127  	}
 128  	s.br.init(in)
 129  	if s.DecompressLimit == 0 {
 130  		// Max size 2GB.
 131  		s.DecompressLimit = (2 << 30) - 1
 132  	}
 133  
 134  	return s, nil
 135  }
 136  
 137  // tableStep returns the next table index.
 138  func tableStep(tableSize uint32) uint32 {
 139  	return (tableSize >> 1) + (tableSize >> 3) + 3
 140  }
 141  
 142  func highBits(val uint32) (n uint32) {
 143  	return uint32(bits.Len32(val) - 1)
 144  }
 145