compressible.go raw

   1  package compress
   2  
   3  import "math"
   4  
   5  // Estimate returns a normalized compressibility estimate of block b.
   6  // Values close to zero are likely uncompressible.
   7  // Values above 0.1 are likely to be compressible.
   8  // Values above 0.5 are very compressible.
   9  // Very small lengths will return 0.
  10  func Estimate(b []byte) float64 {
  11  	if len(b) < 16 {
  12  		return 0
  13  	}
  14  
  15  	// Correctly predicted order 1
  16  	hits := 0
  17  	lastMatch := false
  18  	var o1 [256]byte
  19  	var hist [256]int
  20  	c1 := byte(0)
  21  	for _, c := range b {
  22  		if c == o1[c1] {
  23  			// We only count a hit if there was two correct predictions in a row.
  24  			if lastMatch {
  25  				hits++
  26  			}
  27  			lastMatch = true
  28  		} else {
  29  			lastMatch = false
  30  		}
  31  		o1[c1] = c
  32  		c1 = c
  33  		hist[c]++
  34  	}
  35  
  36  	// Use x^0.6 to give better spread
  37  	prediction := math.Pow(float64(hits)/float64(len(b)), 0.6)
  38  
  39  	// Calculate histogram distribution
  40  	variance := float64(0)
  41  	avg := float64(len(b)) / 256
  42  
  43  	for _, v := range hist {
  44  		Δ := float64(v) - avg
  45  		variance += Δ * Δ
  46  	}
  47  
  48  	stddev := math.Sqrt(float64(variance)) / float64(len(b))
  49  	exp := math.Sqrt(1 / float64(len(b)))
  50  
  51  	// Subtract expected stddev
  52  	stddev -= exp
  53  	if stddev < 0 {
  54  		stddev = 0
  55  	}
  56  	stddev *= 1 + exp
  57  
  58  	// Use x^0.4 to give better spread
  59  	entropy := math.Pow(stddev, 0.4)
  60  
  61  	// 50/50 weight between prediction and histogram distribution
  62  	return math.Pow((prediction+entropy)/2, 0.9)
  63  }
  64  
  65  // ShannonEntropyBits returns the number of bits minimum required to represent
  66  // an entropy encoding of the input bytes.
  67  // https://en.wiktionary.org/wiki/Shannon_entropy
  68  func ShannonEntropyBits(b []byte) int {
  69  	if len(b) == 0 {
  70  		return 0
  71  	}
  72  	var hist [256]int
  73  	for _, c := range b {
  74  		hist[c]++
  75  	}
  76  	shannon := float64(0)
  77  	invTotal := 1.0 / float64(len(b))
  78  	for _, v := range hist[:] {
  79  		if v > 0 {
  80  			n := float64(v)
  81  			shannon += math.Ceil(-math.Log2(n*invTotal) * n)
  82  		}
  83  	}
  84  	return int(math.Ceil(shannon))
  85  }
  86