package lattice import ( "crypto/siphash" "fmt" ) type Branch uint8 // C=8: 8 branches, one per semantic axis. // Bnoun/Bverb/Bmodifier are preserved as aliases for the primary axes // so existing transdb code compiles without changes during migration. const ( Bsemantic Branch = 0 // axis 0: ontological category (animacy, abstractness) Bgrammatical Branch = 1 // axis 1: syntactic role (nominal/verbal/function) Bcooccur Branch = 2 // axis 2: PVN co-occurrence context Bmorphology Branch = 3 // axis 3: morphological state (tense/polarity/aspect/…) Bpragmatic Branch = 4 // axis 4: domain/register context Bphonology Branch = 5 // axis 5: phonological (dormant — all coordinates 0) Bvalency Branch = 6 // axis 6: argument count (intransitive/transitive/…) Bregister Branch = 7 // axis 7: social register (neutral/polite/honorific/…) // Aliases for backward compatibility with C=3 code. Bnoun = Bgrammatical // grammatical nominal → was Bnoun Bverb = Bmorphology // morphological verbal → was Bverb Bmodifier = Bpragmatic // pragmatic context → was Bmodifier ) const C = 8 // coordination number — node holds up to C keys const NullIndex uint32 = 0xFFFFFFFF const NullRec uint32 = 0xFFFFFFFF // Key is a 128-bit SipHash key. type Key [2]uint64 // KeyZero is the zero key (used as NullKey sentinel in comparisons). var KeyZero = Key{0, 0} func keyLess(a, b Key) bool { if a[0] != b[0] { return a[0] < b[0] } return a[1] < b[1] } func keyEqual(a, b Key) bool { return a[0] == b[0] && a[1] == b[1] } // Node is 200 bytes. Coordination number C=8: 8 × 128-bit keys, 9 children. // Flags byte: bit 0 = deleted, bits 1-3 = branch (0-7), bits 4-7 = key count. type Node struct { Keys [C]Key RecPtrs [C]uint32 Children [C + 1]uint32 Mult uint8 Flags uint8 _pad [2]byte } const ( flagDeleted uint8 = 0x01 flagBranchMask uint8 = 0x0E // bits 1-3 encode branch 0-7 ) func (n *Node) KeyCount() int { return int(n.Flags >> 4) } func (n *Node) SetCount(k int) { n.Flags = (n.Flags & 0x0F) | uint8(k<<4) } func (n *Node) IsDeleted() bool { return n.Flags&flagDeleted != 0 } func (n *Node) SetDeleted() { n.Flags |= flagDeleted } func (n *Node) GetBranch() Branch { return Branch((n.Flags & flagBranchMask) >> 1) } func (n *Node) SetBranch(b Branch) { n.Flags = (n.Flags &^ flagBranchMask) | uint8(b<<1) } func (n *Node) IsLeaf() bool { for i := 0; i <= n.KeyCount(); i++ { if n.Children[i] != NullIndex { return false } } return true } func initNodeChildren(n *Node) { for i := range n.Children { n.Children[i] = NullIndex } for i := range n.RecPtrs { n.RecPtrs[i] = NullRec } } // Record is 48 bytes. Cross-branch links live here (stable across B-tree splits). type Record struct { DataFile uint32 DataOff uint32 DataLen uint32 Link [2]uint32 Branch uint8 _rpad [3]byte Inline [24]byte } func (r *Record) IsInline() bool { return r.DataFile == 0 && r.DataLen <= 24 } func (r *Record) SetInline(data []byte) { r.DataFile = 0 r.DataOff = 0 r.DataLen = uint32(len(data)) copy(r.Inline[:], data) } func (r *Record) SetOverflow(file, off, length uint32) { r.DataFile = file r.DataOff = off r.DataLen = length } func (r *Record) InlineData() []byte { if r.DataLen > 24 { return nil } return r.Inline[:r.DataLen] } func (r *Record) initLinks() { r.Link[0] = NullRec r.Link[1] = NullRec } type PendingExp struct { Branch Branch Depth uint16 Factor uint8 } type Tree struct { Roots [C]uint32 FreeHead uint32 FreeCount uint32 nCount uint32 rCount uint32 Pending []PendingExp RecKey map[uint32]Key cache *PageCache Nodes []Node Records []Record } func (t *Tree) RootIdx(branch Branch) uint32 { return t.Roots[branch] } func (t *Tree) GetRecord(idx uint32) *Record { return t.getRecord(idx) } func (t *Tree) GetNode(idx uint32) *Node { return t.getNode(idx) } // IsMemory returns true if the tree has no disk backing (created with NewTree). func (t *Tree) IsMemory() bool { return t.cache == nil } // AllocTree creates a tree with pre-allocated node and record slices for // deserialization from a flat binary file. roots is indexed by Branch constant. func AllocTree(nCount, rCount uint32, roots [C]uint32) *Tree { t := &Tree{ Nodes: []Node{:int(nCount):int(nCount)}, Records: []Record{:int(rCount):int(rCount)}, FreeHead: NullIndex, nCount: nCount, rCount: rCount, RecKey: map[uint32]Key{}, Roots: roots, } return t } // WalkPath returns the node indices visited from root to the target key // in branch's tree. Returns nil if key is not found. func (t *Tree) WalkPath(branch Branch, key Key) []uint32 { path := []uint32{:0:8} idx := t.Roots[branch] for { path = append(path, idx) node := t.getNode(idx) k := node.KeyCount() p := 0 for p < k && keyLess(node.Keys[p], key) { p++ } if p < k && keyEqual(key, node.Keys[p]) { return path } if node.IsLeaf() { return nil } child := node.Children[p] if child == NullIndex { return nil } idx = child } } func (t *Tree) getNode(idx uint32) *Node { if t.cache != nil { return t.cache.GetNode(idx) } return &t.Nodes[idx] } func (t *Tree) getRecord(idx uint32) *Record { if t.cache != nil { return t.cache.GetRecord(idx) } return &t.Records[idx] } func (t *Tree) markNodeDirty(idx uint32) { if t.cache != nil { t.cache.MarkNodeDirty(idx) } } func (t *Tree) markRecDirty(idx uint32) { if t.cache != nil { t.cache.MarkRecDirty(idx) } } func NewTree(cap int) *Tree { if cap < 16 { cap = 16 } t := &Tree{ Nodes: []Node{:C:cap}, Records: []Record{:0:cap}, FreeHead: NullIndex, nCount: C, RecKey: map[uint32]Key{}, } for i := 0; i < C; i++ { t.Roots[i] = uint32(i) initNodeChildren(&t.Nodes[i]) t.Nodes[i].SetBranch(Branch(i)) t.Nodes[i].Mult = 1 } return t } func (t *Tree) allocNode() uint32 { if t.FreeHead != NullIndex { idx := t.FreeHead node := t.getNode(idx) t.FreeHead = node.Children[0] t.FreeCount-- *node = Node{} initNodeChildren(node) t.markNodeDirty(idx) return idx } if t.cache != nil { return t.cache.AllocNode(&t.nCount) } idx := t.nCount t.nCount++ if int(idx) < len(t.Nodes) { t.Nodes[idx] = Node{} initNodeChildren(&t.Nodes[idx]) return idx } var n Node initNodeChildren(&n) t.Nodes = append(t.Nodes, n) return idx } func (t *Tree) allocRecord() uint32 { if t.cache != nil { return t.cache.AllocRecord(&t.rCount) } idx := t.rCount t.rCount++ if int(idx) < len(t.Records) { t.Records[idx] = Record{} t.Records[idx].Link[0] = NullRec t.Records[idx].Link[1] = NullRec return idx } t.Records = append(t.Records, Record{}) t.Records[idx].Link[0] = NullRec t.Records[idx].Link[1] = NullRec return idx } func (t *Tree) NodeCount() int { return int(t.nCount) } func (t *Tree) RecordCount() int { return int(t.rCount) } func Create(path string) (*Tree, error) { pc, err := CreateStore(path) if err != nil { return nil, err } t := &Tree{ FreeHead: NullIndex, FreeCount: 0, nCount: C, RecKey: map[uint32]Key{}, cache: pc, } for i := 0; i < C; i++ { t.Roots[i] = uint32(i) } return t, nil } func Open(path string) (*Tree, error) { pc, hdr, err := OpenStore(path) if err != nil { return nil, err } t := &Tree{ Roots: hdr.roots, FreeHead: hdr.freeHead, FreeCount: hdr.freeCount, nCount: hdr.nCount, rCount: hdr.rCount, RecKey: map[uint32]Key{}, cache: pc, } if hdr.pendCount > 0 { pendOff := pc.RecOff + int64(hdr.rCount)*48 t.Pending = []PendingExp{:0:int(hdr.pendCount)} for i := uint32(0); i < hdr.pendCount; i++ { var buf [4]byte pc.F.ReadAt(buf[:], pendOff+int64(i)*4) t.Pending = append(t.Pending, PendingExp{ Branch: Branch(buf[0]), Depth: le.Uint16(buf[1:3]), Factor: buf[3], }) } } t.rebuildRecKey() return t, nil } func (t *Tree) rebuildRecKey() { for b := 0; b < C; b++ { t.walkRecKey(t.Roots[b]) } } func (t *Tree) walkRecKey(idx uint32) { if idx == NullIndex { return } node := t.getNode(idx) if node.IsDeleted() { return } k := node.KeyCount() for i := 0; i < k; i++ { if node.RecPtrs[i] != NullRec { t.RecKey[node.RecPtrs[i]] = node.Keys[i] } } for i := 0; i <= k; i++ { if node.Children[i] != NullIndex { t.walkRecKey(node.Children[i]) } } } func (t *Tree) Flush() error { if t.cache == nil { return nil } return t.cache.Flush(t) } func (t *Tree) Close() error { if t.cache == nil { return nil } return t.cache.Close(t) } // LookupRecIdx returns the record index for a key, or NullRec if not found. func (t *Tree) LookupRecIdx(branch Branch, key Key) uint32 { idx, pos, found := t.Walk(branch, key) if !found { return NullRec } node := t.getNode(idx) if node.IsDeleted() { return NullRec } return node.RecPtrs[pos] } func (t *Tree) DumpNode(idx uint32) { n := t.getNode(idx) fmt.Printf("node[%d] keys=%d branch=%d leaf=%v children=[%x,%x,%x,%x] rec=[%x,%x,%x]\n", idx, n.KeyCount(), n.GetBranch(), n.IsLeaf(), n.Children[0], n.Children[1], n.Children[2], n.Children[3], n.RecPtrs[0], n.RecPtrs[1], n.RecPtrs[2]) } // HashKey returns a 128-bit SipHash key for data using the default seed. // Convenience wrapper for callers that don't need domain separation. // For domain-separated keys (e.g. EN vs JA in transdb), prepend a domain // byte to data before calling. func HashKey(data []byte) Key { return Key(siphash.Sum128(siphash.DefaultKey, data)) }