hll.go raw

   1  package hyperloglog
   2  
   3  import (
   4  	"encoding/binary"
   5  	"encoding/hex"
   6  	"fmt"
   7  )
   8  
   9  // Everything is hardcoded to use precision 8, i.e. 256 registers.
  10  type HyperLogLog struct {
  11  	offset    int
  12  	registers []uint8
  13  }
  14  
  15  func New(offset int) *HyperLogLog {
  16  	if offset < 0 || offset > 32-8 {
  17  		panic(fmt.Errorf("invalid offset %d", offset))
  18  	}
  19  
  20  	// precision is always 8
  21  	// the number of registers is always 256 (1<<8)
  22  	hll := &HyperLogLog{offset: offset}
  23  	hll.registers = make([]uint8, 256)
  24  	return hll
  25  }
  26  
  27  func NewWithRegisters(registers []byte, offset int) *HyperLogLog {
  28  	if offset < 0 || offset > 32-8 {
  29  		panic(fmt.Errorf("invalid offset %d", offset))
  30  	}
  31  	if len(registers) != 256 {
  32  		panic(fmt.Errorf("invalid number of registers %d", len(registers)))
  33  	}
  34  	return &HyperLogLog{registers: registers, offset: offset}
  35  }
  36  
  37  func (hll *HyperLogLog) GetRegisters() []byte    { return hll.registers }
  38  func (hll *HyperLogLog) SetRegisters(enc []byte) { hll.registers = enc }
  39  func (hll *HyperLogLog) MergeRegisters(other []byte) {
  40  	for i, v := range other {
  41  		if v > hll.registers[i] {
  42  			hll.registers[i] = v
  43  		}
  44  	}
  45  }
  46  
  47  func (hll *HyperLogLog) Clear() {
  48  	for i := range hll.registers {
  49  		hll.registers[i] = 0
  50  	}
  51  }
  52  
  53  // Add takes a Nostr event pubkey which will be used as the item "key" (that combined with the offset)
  54  func (hll *HyperLogLog) Add(pubkey string) {
  55  	x, _ := hex.DecodeString(pubkey[hll.offset*2 : hll.offset*2+8*2])
  56  	j := x[0] // register address (first 8 bits, i.e. first byte)
  57  
  58  	w := binary.BigEndian.Uint64(x) // number that we will use
  59  	zeroBits := clz56(w) + 1        // count zeroes (skip the first byte, so only use 56 bits)
  60  
  61  	if zeroBits > hll.registers[j] {
  62  		hll.registers[j] = zeroBits
  63  	}
  64  }
  65  
  66  // AddBytes is like Add, but takes pubkey as bytes instead of as string
  67  func (hll *HyperLogLog) AddBytes(pubkey []byte) {
  68  	x := pubkey[hll.offset : hll.offset+8]
  69  	j := x[0] // register address (first 8 bits, i.e. first byte)
  70  
  71  	w := binary.BigEndian.Uint64(x) // number that we will use
  72  	zeroBits := clz56(w) + 1        // count zeroes (skip the first byte, so only use 56 bits)
  73  
  74  	if zeroBits > hll.registers[j] {
  75  		hll.registers[j] = zeroBits
  76  	}
  77  }
  78  
  79  func (hll *HyperLogLog) Merge(other *HyperLogLog) {
  80  	for i, v := range other.registers {
  81  		if v > hll.registers[i] {
  82  			hll.registers[i] = v
  83  		}
  84  	}
  85  }
  86  
  87  func (hll *HyperLogLog) Count() uint64 {
  88  	v := countZeros(hll.registers)
  89  
  90  	if v != 0 {
  91  		lc := linearCounting(256 /* nregisters */, v)
  92  
  93  		if lc <= 220 /* threshold */ {
  94  			return uint64(lc)
  95  		}
  96  	}
  97  
  98  	est := hll.calculateEstimate()
  99  	if est <= 256 /* nregisters */ *3 {
 100  		if v != 0 {
 101  			return uint64(linearCounting(256 /* nregisters */, v))
 102  		}
 103  	}
 104  
 105  	return uint64(est)
 106  }
 107  
 108  func (hll HyperLogLog) calculateEstimate() float64 {
 109  	sum := 0.0
 110  	for _, val := range hll.registers {
 111  		sum += 1.0 / float64(uint64(1)<<val) // this is the same as 2^(-val)
 112  	}
 113  
 114  	return 0.7182725932495458 /* alpha for 256 registers */ * 256 /* nregisters */ * 256 /* nregisters */ / sum
 115  }
 116