lattice.mx raw

   1  package lattice
   2  
   3  import (
   4  	"crypto/siphash"
   5  	"fmt"
   6  )
   7  
   8  type Branch uint8
   9  
  10  // C=8: 8 branches, one per semantic axis.
  11  // Bnoun/Bverb/Bmodifier are preserved as aliases for the primary axes
  12  // so existing transdb code compiles without changes during migration.
  13  const (
  14  	Bsemantic   Branch = 0 // axis 0: ontological category (animacy, abstractness)
  15  	Bgrammatical Branch = 1 // axis 1: syntactic role (nominal/verbal/function)
  16  	Bcooccur    Branch = 2 // axis 2: PVN co-occurrence context
  17  	Bmorphology Branch = 3 // axis 3: morphological state (tense/polarity/aspect/…)
  18  	Bpragmatic  Branch = 4 // axis 4: domain/register context
  19  	Bphonology  Branch = 5 // axis 5: phonological (dormant — all coordinates 0)
  20  	Bvalency    Branch = 6 // axis 6: argument count (intransitive/transitive/…)
  21  	Bregister   Branch = 7 // axis 7: social register (neutral/polite/honorific/…)
  22  
  23  	// Aliases for backward compatibility with C=3 code.
  24  	Bnoun     = Bgrammatical // grammatical nominal → was Bnoun
  25  	Bverb     = Bmorphology  // morphological verbal → was Bverb
  26  	Bmodifier = Bpragmatic   // pragmatic context → was Bmodifier
  27  )
  28  
  29  const C = 8 // coordination number — node holds up to C keys
  30  
  31  const NullIndex uint32 = 0xFFFFFFFF
  32  const NullRec   uint32 = 0xFFFFFFFF
  33  
  34  // Key is a 128-bit SipHash key.
  35  type Key [2]uint64
  36  
  37  // KeyZero is the zero key (used as NullKey sentinel in comparisons).
  38  var KeyZero = Key{0, 0}
  39  
  40  func keyLess(a, b Key) bool {
  41  	if a[0] != b[0] {
  42  		return a[0] < b[0]
  43  	}
  44  	return a[1] < b[1]
  45  }
  46  
  47  func keyEqual(a, b Key) bool {
  48  	return a[0] == b[0] && a[1] == b[1]
  49  }
  50  
  51  // Node is 200 bytes. Coordination number C=8: 8 × 128-bit keys, 9 children.
  52  // Flags byte: bit 0 = deleted, bits 1-3 = branch (0-7), bits 4-7 = key count.
  53  type Node struct {
  54  	Keys     [C]Key
  55  	RecPtrs  [C]uint32
  56  	Children [C + 1]uint32
  57  	Mult     uint8
  58  	Flags    uint8
  59  	_pad     [2]byte
  60  }
  61  
  62  const (
  63  	flagDeleted    uint8 = 0x01
  64  	flagBranchMask uint8 = 0x0E // bits 1-3 encode branch 0-7
  65  )
  66  
  67  func (n *Node) KeyCount() int      { return int(n.Flags >> 4) }
  68  func (n *Node) SetCount(k int)     { n.Flags = (n.Flags & 0x0F) | uint8(k<<4) }
  69  func (n *Node) IsDeleted() bool    { return n.Flags&flagDeleted != 0 }
  70  func (n *Node) SetDeleted()        { n.Flags |= flagDeleted }
  71  func (n *Node) GetBranch() Branch  { return Branch((n.Flags & flagBranchMask) >> 1) }
  72  func (n *Node) SetBranch(b Branch) { n.Flags = (n.Flags &^ flagBranchMask) | uint8(b<<1) }
  73  
  74  func (n *Node) IsLeaf() bool {
  75  	for i := 0; i <= n.KeyCount(); i++ {
  76  		if n.Children[i] != NullIndex {
  77  			return false
  78  		}
  79  	}
  80  	return true
  81  }
  82  
  83  func initNodeChildren(n *Node) {
  84  	for i := range n.Children {
  85  		n.Children[i] = NullIndex
  86  	}
  87  	for i := range n.RecPtrs {
  88  		n.RecPtrs[i] = NullRec
  89  	}
  90  }
  91  
  92  // Record is 48 bytes. Cross-branch links live here (stable across B-tree splits).
  93  type Record struct {
  94  	DataFile uint32
  95  	DataOff  uint32
  96  	DataLen  uint32
  97  	Link     [2]uint32
  98  	Branch   uint8
  99  	_rpad    [3]byte
 100  	Inline   [24]byte
 101  }
 102  
 103  func (r *Record) IsInline() bool { return r.DataFile == 0 && r.DataLen <= 24 }
 104  
 105  func (r *Record) SetInline(data []byte) {
 106  	r.DataFile = 0
 107  	r.DataOff = 0
 108  	r.DataLen = uint32(len(data))
 109  	copy(r.Inline[:], data)
 110  }
 111  
 112  func (r *Record) SetOverflow(file, off, length uint32) {
 113  	r.DataFile = file
 114  	r.DataOff = off
 115  	r.DataLen = length
 116  }
 117  
 118  func (r *Record) InlineData() []byte {
 119  	if r.DataLen > 24 {
 120  		return nil
 121  	}
 122  	return r.Inline[:r.DataLen]
 123  }
 124  
 125  func (r *Record) initLinks() {
 126  	r.Link[0] = NullRec
 127  	r.Link[1] = NullRec
 128  }
 129  
 130  type PendingExp struct {
 131  	Branch Branch
 132  	Depth  uint16
 133  	Factor uint8
 134  }
 135  
 136  type Tree struct {
 137  	Roots     [C]uint32
 138  	FreeHead  uint32
 139  	FreeCount uint32
 140  	nCount    uint32
 141  	rCount    uint32
 142  	Pending   []PendingExp
 143  	RecKey    map[uint32]Key
 144  	cache     *PageCache
 145  	Nodes     []Node
 146  	Records   []Record
 147  }
 148  
 149  func (t *Tree) RootIdx(branch Branch) uint32 { return t.Roots[branch] }
 150  
 151  func (t *Tree) GetRecord(idx uint32) *Record { return t.getRecord(idx) }
 152  func (t *Tree) GetNode(idx uint32) *Node     { return t.getNode(idx) }
 153  
 154  // IsMemory returns true if the tree has no disk backing (created with NewTree).
 155  func (t *Tree) IsMemory() bool { return t.cache == nil }
 156  
 157  // AllocTree creates a tree with pre-allocated node and record slices for
 158  // deserialization from a flat binary file. roots is indexed by Branch constant.
 159  func AllocTree(nCount, rCount uint32, roots [C]uint32) *Tree {
 160  	t := &Tree{
 161  		Nodes:    []Node{:int(nCount):int(nCount)},
 162  		Records:  []Record{:int(rCount):int(rCount)},
 163  		FreeHead: NullIndex,
 164  		nCount:   nCount,
 165  		rCount:   rCount,
 166  		RecKey:   map[uint32]Key{},
 167  		Roots:    roots,
 168  	}
 169  	return t
 170  }
 171  
 172  // WalkPath returns the node indices visited from root to the target key
 173  // in branch's tree. Returns nil if key is not found.
 174  func (t *Tree) WalkPath(branch Branch, key Key) []uint32 {
 175  	path := []uint32{:0:8}
 176  	idx := t.Roots[branch]
 177  	for {
 178  		path = append(path, idx)
 179  		node := t.getNode(idx)
 180  		k := node.KeyCount()
 181  		p := 0
 182  		for p < k && keyLess(node.Keys[p], key) {
 183  			p++
 184  		}
 185  		if p < k && keyEqual(key, node.Keys[p]) {
 186  			return path
 187  		}
 188  		if node.IsLeaf() {
 189  			return nil
 190  		}
 191  		child := node.Children[p]
 192  		if child == NullIndex {
 193  			return nil
 194  		}
 195  		idx = child
 196  	}
 197  }
 198  
 199  func (t *Tree) getNode(idx uint32) *Node {
 200  	if t.cache != nil {
 201  		return t.cache.GetNode(idx)
 202  	}
 203  	return &t.Nodes[idx]
 204  }
 205  
 206  func (t *Tree) getRecord(idx uint32) *Record {
 207  	if t.cache != nil {
 208  		return t.cache.GetRecord(idx)
 209  	}
 210  	return &t.Records[idx]
 211  }
 212  
 213  func (t *Tree) markNodeDirty(idx uint32) {
 214  	if t.cache != nil {
 215  		t.cache.MarkNodeDirty(idx)
 216  	}
 217  }
 218  
 219  func (t *Tree) markRecDirty(idx uint32) {
 220  	if t.cache != nil {
 221  		t.cache.MarkRecDirty(idx)
 222  	}
 223  }
 224  
 225  func NewTree(cap int) *Tree {
 226  	if cap < 16 {
 227  		cap = 16
 228  	}
 229  	t := &Tree{
 230  		Nodes:    []Node{:C:cap},
 231  		Records:  []Record{:0:cap},
 232  		FreeHead: NullIndex,
 233  		nCount:   C,
 234  		RecKey:   map[uint32]Key{},
 235  	}
 236  	for i := 0; i < C; i++ {
 237  		t.Roots[i] = uint32(i)
 238  		initNodeChildren(&t.Nodes[i])
 239  		t.Nodes[i].SetBranch(Branch(i))
 240  		t.Nodes[i].Mult = 1
 241  	}
 242  	return t
 243  }
 244  
 245  func (t *Tree) allocNode() uint32 {
 246  	if t.FreeHead != NullIndex {
 247  		idx := t.FreeHead
 248  		node := t.getNode(idx)
 249  		t.FreeHead = node.Children[0]
 250  		t.FreeCount--
 251  		*node = Node{}
 252  		initNodeChildren(node)
 253  		t.markNodeDirty(idx)
 254  		return idx
 255  	}
 256  	if t.cache != nil {
 257  		return t.cache.AllocNode(&t.nCount)
 258  	}
 259  	idx := t.nCount
 260  	t.nCount++
 261  	if int(idx) < len(t.Nodes) {
 262  		t.Nodes[idx] = Node{}
 263  		initNodeChildren(&t.Nodes[idx])
 264  		return idx
 265  	}
 266  	var n Node
 267  	initNodeChildren(&n)
 268  	t.Nodes = append(t.Nodes, n)
 269  	return idx
 270  }
 271  
 272  func (t *Tree) allocRecord() uint32 {
 273  	if t.cache != nil {
 274  		return t.cache.AllocRecord(&t.rCount)
 275  	}
 276  	idx := t.rCount
 277  	t.rCount++
 278  	if int(idx) < len(t.Records) {
 279  		t.Records[idx] = Record{}
 280  		t.Records[idx].Link[0] = NullRec
 281  		t.Records[idx].Link[1] = NullRec
 282  		return idx
 283  	}
 284  	t.Records = append(t.Records, Record{})
 285  	t.Records[idx].Link[0] = NullRec
 286  	t.Records[idx].Link[1] = NullRec
 287  	return idx
 288  }
 289  
 290  func (t *Tree) NodeCount() int   { return int(t.nCount) }
 291  func (t *Tree) RecordCount() int { return int(t.rCount) }
 292  
 293  func Create(path string) (*Tree, error) {
 294  	pc, err := CreateStore(path)
 295  	if err != nil {
 296  		return nil, err
 297  	}
 298  	t := &Tree{
 299  		FreeHead:  NullIndex,
 300  		FreeCount: 0,
 301  		nCount:    C,
 302  		RecKey:    map[uint32]Key{},
 303  		cache:     pc,
 304  	}
 305  	for i := 0; i < C; i++ {
 306  		t.Roots[i] = uint32(i)
 307  	}
 308  	return t, nil
 309  }
 310  
 311  func Open(path string) (*Tree, error) {
 312  	pc, hdr, err := OpenStore(path)
 313  	if err != nil {
 314  		return nil, err
 315  	}
 316  	t := &Tree{
 317  		Roots:     hdr.roots,
 318  		FreeHead:  hdr.freeHead,
 319  		FreeCount: hdr.freeCount,
 320  		nCount:    hdr.nCount,
 321  		rCount:    hdr.rCount,
 322  		RecKey:    map[uint32]Key{},
 323  		cache:     pc,
 324  	}
 325  	if hdr.pendCount > 0 {
 326  		pendOff := pc.RecOff + int64(hdr.rCount)*48
 327  		t.Pending = []PendingExp{:0:int(hdr.pendCount)}
 328  		for i := uint32(0); i < hdr.pendCount; i++ {
 329  			var buf [4]byte
 330  			pc.F.ReadAt(buf[:], pendOff+int64(i)*4)
 331  			t.Pending = append(t.Pending, PendingExp{
 332  				Branch: Branch(buf[0]),
 333  				Depth:  le.Uint16(buf[1:3]),
 334  				Factor: buf[3],
 335  			})
 336  		}
 337  	}
 338  	t.rebuildRecKey()
 339  	return t, nil
 340  }
 341  
 342  func (t *Tree) rebuildRecKey() {
 343  	for b := 0; b < C; b++ {
 344  		t.walkRecKey(t.Roots[b])
 345  	}
 346  }
 347  
 348  func (t *Tree) walkRecKey(idx uint32) {
 349  	if idx == NullIndex {
 350  		return
 351  	}
 352  	node := t.getNode(idx)
 353  	if node.IsDeleted() {
 354  		return
 355  	}
 356  	k := node.KeyCount()
 357  	for i := 0; i < k; i++ {
 358  		if node.RecPtrs[i] != NullRec {
 359  			t.RecKey[node.RecPtrs[i]] = node.Keys[i]
 360  		}
 361  	}
 362  	for i := 0; i <= k; i++ {
 363  		if node.Children[i] != NullIndex {
 364  			t.walkRecKey(node.Children[i])
 365  		}
 366  	}
 367  }
 368  
 369  func (t *Tree) Flush() error {
 370  	if t.cache == nil {
 371  		return nil
 372  	}
 373  	return t.cache.Flush(t)
 374  }
 375  
 376  func (t *Tree) Close() error {
 377  	if t.cache == nil {
 378  		return nil
 379  	}
 380  	return t.cache.Close(t)
 381  }
 382  
 383  // LookupRecIdx returns the record index for a key, or NullRec if not found.
 384  func (t *Tree) LookupRecIdx(branch Branch, key Key) uint32 {
 385  	idx, pos, found := t.Walk(branch, key)
 386  	if !found {
 387  		return NullRec
 388  	}
 389  	node := t.getNode(idx)
 390  	if node.IsDeleted() {
 391  		return NullRec
 392  	}
 393  	return node.RecPtrs[pos]
 394  }
 395  
 396  func (t *Tree) DumpNode(idx uint32) {
 397  	n := t.getNode(idx)
 398  	fmt.Printf("node[%d] keys=%d branch=%d leaf=%v children=[%x,%x,%x,%x] rec=[%x,%x,%x]\n",
 399  		idx, n.KeyCount(), n.GetBranch(), n.IsLeaf(),
 400  		n.Children[0], n.Children[1], n.Children[2], n.Children[3],
 401  		n.RecPtrs[0], n.RecPtrs[1], n.RecPtrs[2])
 402  }
 403  
 404  // HashKey returns a 128-bit SipHash key for data using the default seed.
 405  // Convenience wrapper for callers that don't need domain separation.
 406  // For domain-separated keys (e.g. EN vs JA in transdb), prepend a domain
 407  // byte to data before calling.
 408  func HashKey(data []byte) Key {
 409  	return Key(siphash.Sum128(siphash.DefaultKey, data))
 410  }
 411