bbloom.go raw

   1  // The MIT License (MIT)
   2  // Copyright (c) 2014 Andreas Briese, eduToolbox@Bri-C GmbH, Sarstedt
   3  
   4  // Permission is hereby granted, free of charge, to any person obtaining a copy of
   5  // this software and associated documentation files (the "Software"), to deal in
   6  // the Software without restriction, including without limitation the rights to
   7  // use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
   8  // the Software, and to permit persons to whom the Software is furnished to do so,
   9  // subject to the following conditions:
  10  
  11  // The above copyright notice and this permission notice shall be included in all
  12  // copies or substantial portions of the Software.
  13  
  14  // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15  // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
  16  // FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
  17  // COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
  18  // IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  19  // CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
  20  
  21  package z
  22  
  23  import (
  24  	"bytes"
  25  	"encoding/json"
  26  	"log"
  27  	"math"
  28  	"unsafe"
  29  )
  30  
  31  // helper
  32  var mask = []uint8{1, 2, 4, 8, 16, 32, 64, 128}
  33  
  34  func getSize(ui64 uint64) (size uint64, exponent uint64) {
  35  	if ui64 < uint64(512) {
  36  		ui64 = uint64(512)
  37  	}
  38  	size = uint64(1)
  39  	for size < ui64 {
  40  		size <<= 1
  41  		exponent++
  42  	}
  43  	return size, exponent
  44  }
  45  
  46  func calcSizeByWrongPositives(numEntries, wrongs float64) (uint64, uint64) {
  47  	size := -1 * numEntries * math.Log(wrongs) / math.Pow(float64(0.69314718056), 2)
  48  	locs := math.Ceil(float64(0.69314718056) * size / numEntries)
  49  	return uint64(size), uint64(locs)
  50  }
  51  
  52  // NewBloomFilter returns a new bloomfilter.
  53  func NewBloomFilter(params ...float64) (bloomfilter *Bloom) {
  54  	var entries, locs uint64
  55  	if len(params) == 2 {
  56  		if params[1] < 1 {
  57  			entries, locs = calcSizeByWrongPositives(params[0], params[1])
  58  		} else {
  59  			entries, locs = uint64(params[0]), uint64(params[1])
  60  		}
  61  	} else {
  62  		log.Fatal("usage: New(float64(number_of_entries), float64(number_of_hashlocations))" +
  63  			" i.e. New(float64(1000), float64(3)) or New(float64(number_of_entries)," +
  64  			" float64(number_of_hashlocations)) i.e. New(float64(1000), float64(0.03))")
  65  	}
  66  	size, exponent := getSize(entries)
  67  	bloomfilter = &Bloom{
  68  		sizeExp: exponent,
  69  		size:    size - 1,
  70  		setLocs: locs,
  71  		shift:   64 - exponent,
  72  	}
  73  	bloomfilter.Size(size)
  74  	return bloomfilter
  75  }
  76  
  77  // Bloom filter
  78  type Bloom struct {
  79  	bitset  []uint64
  80  	ElemNum uint64
  81  	sizeExp uint64
  82  	size    uint64
  83  	setLocs uint64
  84  	shift   uint64
  85  }
  86  
  87  // <--- http://www.cse.yorku.ca/~oz/hash.html
  88  // modified Berkeley DB Hash (32bit)
  89  // hash is casted to l, h = 16bit fragments
  90  // func (bl Bloom) absdbm(b *[]byte) (l, h uint64) {
  91  // 	hash := uint64(len(*b))
  92  // 	for _, c := range *b {
  93  // 		hash = uint64(c) + (hash << 6) + (hash << bl.sizeExp) - hash
  94  // 	}
  95  // 	h = hash >> bl.shift
  96  // 	l = hash << bl.shift >> bl.shift
  97  // 	return l, h
  98  // }
  99  
 100  // Add adds hash of a key to the bloomfilter.
 101  func (bl *Bloom) Add(hash uint64) {
 102  	h := hash >> bl.shift
 103  	l := hash << bl.shift >> bl.shift
 104  	for i := uint64(0); i < bl.setLocs; i++ {
 105  		bl.Set((h + i*l) & bl.size)
 106  		bl.ElemNum++
 107  	}
 108  }
 109  
 110  // Has checks if bit(s) for entry hash is/are set,
 111  // returns true if the hash was added to the Bloom Filter.
 112  func (bl Bloom) Has(hash uint64) bool {
 113  	h := hash >> bl.shift
 114  	l := hash << bl.shift >> bl.shift
 115  	for i := uint64(0); i < bl.setLocs; i++ {
 116  		if !bl.IsSet((h + i*l) & bl.size) {
 117  			return false
 118  		}
 119  	}
 120  	return true
 121  }
 122  
 123  // AddIfNotHas only Adds hash, if it's not present in the bloomfilter.
 124  // Returns true if hash was added.
 125  // Returns false if hash was already registered in the bloomfilter.
 126  func (bl *Bloom) AddIfNotHas(hash uint64) bool {
 127  	if bl.Has(hash) {
 128  		return false
 129  	}
 130  	bl.Add(hash)
 131  	return true
 132  }
 133  
 134  // TotalSize returns the total size of the bloom filter.
 135  func (bl *Bloom) TotalSize() int {
 136  	// The bl struct has 5 members and each one is 8 byte. The bitset is a
 137  	// uint64 byte slice.
 138  	return len(bl.bitset)*8 + 5*8
 139  }
 140  
 141  // Size makes Bloom filter with as bitset of size sz.
 142  func (bl *Bloom) Size(sz uint64) {
 143  	bl.bitset = make([]uint64, sz>>6)
 144  }
 145  
 146  // Clear resets the Bloom filter.
 147  func (bl *Bloom) Clear() {
 148  	for i := range bl.bitset {
 149  		bl.bitset[i] = 0
 150  	}
 151  }
 152  
 153  // Set sets the bit[idx] of bitset.
 154  func (bl *Bloom) Set(idx uint64) {
 155  	ptr := unsafe.Pointer(uintptr(unsafe.Pointer(&bl.bitset[idx>>6])) + uintptr((idx%64)>>3))
 156  	*(*uint8)(ptr) |= mask[idx%8]
 157  }
 158  
 159  // IsSet checks if bit[idx] of bitset is set, returns true/false.
 160  func (bl *Bloom) IsSet(idx uint64) bool {
 161  	ptr := unsafe.Pointer(uintptr(unsafe.Pointer(&bl.bitset[idx>>6])) + uintptr((idx%64)>>3))
 162  	r := ((*(*uint8)(ptr)) >> (idx % 8)) & 1
 163  	return r == 1
 164  }
 165  
 166  // bloomJSONImExport
 167  // Im/Export structure used by JSONMarshal / JSONUnmarshal
 168  type bloomJSONImExport struct {
 169  	FilterSet []byte
 170  	SetLocs   uint64
 171  }
 172  
 173  // NewWithBoolset takes a []byte slice and number of locs per entry,
 174  // returns the bloomfilter with a bitset populated according to the input []byte.
 175  func newWithBoolset(bs *[]byte, locs uint64) *Bloom {
 176  	bloomfilter := NewBloomFilter(float64(len(*bs)<<3), float64(locs))
 177  	for i, b := range *bs {
 178  		*(*uint8)(unsafe.Pointer(uintptr(unsafe.Pointer(&bloomfilter.bitset[0])) + uintptr(i))) = b
 179  	}
 180  	return bloomfilter
 181  }
 182  
 183  // JSONUnmarshal takes JSON-Object (type bloomJSONImExport) as []bytes
 184  // returns bloom32 / bloom64 object.
 185  func JSONUnmarshal(dbData []byte) (*Bloom, error) {
 186  	bloomImEx := bloomJSONImExport{}
 187  	if err := json.Unmarshal(dbData, &bloomImEx); err != nil {
 188  		return nil, err
 189  	}
 190  	buf := bytes.NewBuffer(bloomImEx.FilterSet)
 191  	bs := buf.Bytes()
 192  	bf := newWithBoolset(&bs, bloomImEx.SetLocs)
 193  	return bf, nil
 194  }
 195  
 196  // JSONMarshal returns JSON-object (type bloomJSONImExport) as []byte.
 197  func (bl Bloom) JSONMarshal() []byte {
 198  	bloomImEx := bloomJSONImExport{}
 199  	bloomImEx.SetLocs = bl.setLocs
 200  	bloomImEx.FilterSet = make([]byte, len(bl.bitset)<<3)
 201  	for i := range bloomImEx.FilterSet {
 202  		bloomImEx.FilterSet[i] = *(*byte)(unsafe.Pointer(uintptr(unsafe.Pointer(&bl.bitset[0])) +
 203  			uintptr(i)))
 204  	}
 205  	data, err := json.Marshal(bloomImEx)
 206  	if err != nil {
 207  		log.Fatal("json.Marshal failed: ", err)
 208  	}
 209  	return data
 210  }
 211