bloom.go raw

   1  // Copyright 2013 The LevelDB-Go Authors. All rights reserved.
   2  // Use of this source code is governed by a BSD-style
   3  // license that can be found in the LICENSE file.
   4  
   5  package y
   6  
   7  import "math"
   8  
   9  // Filter is an encoded set of []byte keys.
  10  type Filter []byte
  11  
  12  func (f Filter) MayContainKey(k []byte) bool {
  13  	return f.MayContain(Hash(k))
  14  }
  15  
  16  // MayContain returns whether the filter may contain given key. False positives
  17  // are possible, where it returns true for keys not in the original set.
  18  func (f Filter) MayContain(h uint32) bool {
  19  	if len(f) < 2 {
  20  		return false
  21  	}
  22  	k := f[len(f)-1]
  23  	if k > 30 {
  24  		// This is reserved for potentially new encodings for short Bloom filters.
  25  		// Consider it a match.
  26  		return true
  27  	}
  28  	nBits := uint32(8 * (len(f) - 1))
  29  	delta := h>>17 | h<<15
  30  	for j := uint8(0); j < k; j++ {
  31  		bitPos := h % nBits
  32  		if f[bitPos/8]&(1<<(bitPos%8)) == 0 {
  33  			return false
  34  		}
  35  		h += delta
  36  	}
  37  	return true
  38  }
  39  
  40  // NewFilter returns a new Bloom filter that encodes a set of []byte keys with
  41  // the given number of bits per key, approximately.
  42  //
  43  // A good bitsPerKey value is 10, which yields a filter with ~ 1% false
  44  // positive rate.
  45  func NewFilter(keys []uint32, bitsPerKey int) Filter {
  46  	return Filter(appendFilter(nil, keys, bitsPerKey))
  47  }
  48  
  49  // BloomBitsPerKey returns the bits per key required by bloomfilter based on
  50  // the false positive rate.
  51  func BloomBitsPerKey(numEntries int, fp float64) int {
  52  	size := -1 * float64(numEntries) * math.Log(fp) / math.Pow(float64(0.69314718056), 2)
  53  	locs := math.Ceil(float64(0.69314718056) * size / float64(numEntries))
  54  	return int(locs)
  55  }
  56  
  57  func appendFilter(buf []byte, keys []uint32, bitsPerKey int) []byte {
  58  	if bitsPerKey < 0 {
  59  		bitsPerKey = 0
  60  	}
  61  	// 0.69 is approximately ln(2).
  62  	k := uint32(float64(bitsPerKey) * 0.69)
  63  	if k < 1 {
  64  		k = 1
  65  	}
  66  	if k > 30 {
  67  		k = 30
  68  	}
  69  
  70  	nBits := len(keys) * bitsPerKey
  71  	// For small len(keys), we can see a very high false positive rate. Fix it
  72  	// by enforcing a minimum bloom filter length.
  73  	if nBits < 64 {
  74  		nBits = 64
  75  	}
  76  	nBytes := (nBits + 7) / 8
  77  	nBits = nBytes * 8
  78  	buf, filter := extend(buf, nBytes+1)
  79  
  80  	for _, h := range keys {
  81  		delta := h>>17 | h<<15
  82  		for j := uint32(0); j < k; j++ {
  83  			bitPos := h % uint32(nBits)
  84  			filter[bitPos/8] |= 1 << (bitPos % 8)
  85  			h += delta
  86  		}
  87  	}
  88  	filter[nBytes] = uint8(k)
  89  
  90  	return buf
  91  }
  92  
  93  // extend appends n zero bytes to b. It returns the overall slice (of length
  94  // n+len(originalB)) and the slice of n trailing zeroes.
  95  func extend(b []byte, n int) (overall, trailer []byte) {
  96  	want := n + len(b)
  97  	if want <= cap(b) {
  98  		overall = b[:want]
  99  		trailer = overall[len(b):]
 100  		for i := range trailer {
 101  			trailer[i] = 0
 102  		}
 103  	} else {
 104  		// Grow the capacity exponentially, with a 1KiB minimum.
 105  		c := 1024
 106  		for c < want {
 107  			c += c / 4
 108  		}
 109  		overall = make([]byte, want, c)
 110  		trailer = overall[len(b):]
 111  		copy(overall, b)
 112  	}
 113  	return overall, trailer
 114  }
 115  
 116  // hash implements a hashing algorithm similar to the Murmur hash.
 117  func Hash(b []byte) uint32 {
 118  	const (
 119  		seed = 0xbc9f1d34
 120  		m    = 0xc6a4a793
 121  	)
 122  	h := uint32(seed) ^ uint32(len(b))*m
 123  	for ; len(b) >= 4; b = b[4:] {
 124  		h += uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24
 125  		h *= m
 126  		h ^= h >> 16
 127  	}
 128  	switch len(b) {
 129  	case 3:
 130  		h += uint32(b[2]) << 16
 131  		fallthrough
 132  	case 2:
 133  		h += uint32(b[1]) << 8
 134  		fallthrough
 135  	case 1:
 136  		h += uint32(b[0])
 137  		h *= m
 138  		h ^= h >> 24
 139  	}
 140  	return h
 141  }
 142  
 143  // FilterPolicy implements the db.FilterPolicy interface from the leveldb/db
 144  // package.
 145  //
 146  // The integer value is the approximate number of bits used per key. A good
 147  // value is 10, which yields a filter with ~ 1% false positive rate.
 148  //
 149  // It is valid to use the other API in this package (leveldb/bloom) without
 150  // using this type or the leveldb/db package.
 151  
 152  // type FilterPolicy int
 153  
 154  // // Name implements the db.FilterPolicy interface.
 155  // func (p FilterPolicy) Name() string {
 156  // 	// This string looks arbitrary, but its value is written to LevelDB .ldb
 157  // 	// files, and should be this exact value to be compatible with those files
 158  // 	// and with the C++ LevelDB code.
 159  // 	return "leveldb.BuiltinBloomFilter2"
 160  // }
 161  
 162  // // AppendFilter implements the db.FilterPolicy interface.
 163  // func (p FilterPolicy) AppendFilter(dst []byte, keys [][]byte) []byte {
 164  // 	return appendFilter(dst, keys, int(p))
 165  // }
 166  
 167  // // MayContain implements the db.FilterPolicy interface.
 168  // func (p FilterPolicy) MayContain(filter, key []byte) bool {
 169  // 	return Filter(filter).MayContain(key)
 170  // }
 171