lattice.mx raw
1 package lattice
2
3 import (
4 "crypto/siphash"
5 "fmt"
6 )
7
8 type Branch uint8
9
10 // C=8: 8 branches, one per semantic axis.
11 // Bnoun/Bverb/Bmodifier are preserved as aliases for the primary axes
12 // so existing transdb code compiles without changes during migration.
13 const (
14 Bsemantic Branch = 0 // axis 0: ontological category (animacy, abstractness)
15 Bgrammatical Branch = 1 // axis 1: syntactic role (nominal/verbal/function)
16 Bcooccur Branch = 2 // axis 2: PVN co-occurrence context
17 Bmorphology Branch = 3 // axis 3: morphological state (tense/polarity/aspect/…)
18 Bpragmatic Branch = 4 // axis 4: domain/register context
19 Bphonology Branch = 5 // axis 5: phonological (dormant — all coordinates 0)
20 Bvalency Branch = 6 // axis 6: argument count (intransitive/transitive/…)
21 Bregister Branch = 7 // axis 7: social register (neutral/polite/honorific/…)
22
23 // Aliases for backward compatibility with C=3 code.
24 Bnoun = Bgrammatical // grammatical nominal → was Bnoun
25 Bverb = Bmorphology // morphological verbal → was Bverb
26 Bmodifier = Bpragmatic // pragmatic context → was Bmodifier
27 )
28
29 const C = 8 // coordination number — node holds up to C keys
30
31 const NullIndex uint32 = 0xFFFFFFFF
32 const NullRec uint32 = 0xFFFFFFFF
33
34 // Key is a 128-bit SipHash key.
35 type Key [2]uint64
36
37 // KeyZero is the zero key (used as NullKey sentinel in comparisons).
38 var KeyZero = Key{0, 0}
39
40 func keyLess(a, b Key) bool {
41 if a[0] != b[0] {
42 return a[0] < b[0]
43 }
44 return a[1] < b[1]
45 }
46
47 func keyEqual(a, b Key) bool {
48 return a[0] == b[0] && a[1] == b[1]
49 }
50
51 // Node is 200 bytes. Coordination number C=8: 8 × 128-bit keys, 9 children.
52 // Flags byte: bit 0 = deleted, bits 1-3 = branch (0-7), bits 4-7 = key count.
53 type Node struct {
54 Keys [C]Key
55 RecPtrs [C]uint32
56 Children [C + 1]uint32
57 Mult uint8
58 Flags uint8
59 _pad [2]byte
60 }
61
62 const (
63 flagDeleted uint8 = 0x01
64 flagBranchMask uint8 = 0x0E // bits 1-3 encode branch 0-7
65 )
66
67 func (n *Node) KeyCount() int { return int(n.Flags >> 4) }
68 func (n *Node) SetCount(k int) { n.Flags = (n.Flags & 0x0F) | uint8(k<<4) }
69 func (n *Node) IsDeleted() bool { return n.Flags&flagDeleted != 0 }
70 func (n *Node) SetDeleted() { n.Flags |= flagDeleted }
71 func (n *Node) GetBranch() Branch { return Branch((n.Flags & flagBranchMask) >> 1) }
72 func (n *Node) SetBranch(b Branch) { n.Flags = (n.Flags &^ flagBranchMask) | uint8(b<<1) }
73
74 func (n *Node) IsLeaf() bool {
75 for i := 0; i <= n.KeyCount(); i++ {
76 if n.Children[i] != NullIndex {
77 return false
78 }
79 }
80 return true
81 }
82
83 func initNodeChildren(n *Node) {
84 for i := range n.Children {
85 n.Children[i] = NullIndex
86 }
87 for i := range n.RecPtrs {
88 n.RecPtrs[i] = NullRec
89 }
90 }
91
92 // Record is 48 bytes. Cross-branch links live here (stable across B-tree splits).
93 type Record struct {
94 DataFile uint32
95 DataOff uint32
96 DataLen uint32
97 Link [2]uint32
98 Branch uint8
99 _rpad [3]byte
100 Inline [24]byte
101 }
102
103 func (r *Record) IsInline() bool { return r.DataFile == 0 && r.DataLen <= 24 }
104
105 func (r *Record) SetInline(data []byte) {
106 r.DataFile = 0
107 r.DataOff = 0
108 r.DataLen = uint32(len(data))
109 copy(r.Inline[:], data)
110 }
111
112 func (r *Record) SetOverflow(file, off, length uint32) {
113 r.DataFile = file
114 r.DataOff = off
115 r.DataLen = length
116 }
117
118 func (r *Record) InlineData() []byte {
119 if r.DataLen > 24 {
120 return nil
121 }
122 return r.Inline[:r.DataLen]
123 }
124
125 func (r *Record) initLinks() {
126 r.Link[0] = NullRec
127 r.Link[1] = NullRec
128 }
129
130 type PendingExp struct {
131 Branch Branch
132 Depth uint16
133 Factor uint8
134 }
135
136 type Tree struct {
137 Roots [C]uint32
138 FreeHead uint32
139 FreeCount uint32
140 nCount uint32
141 rCount uint32
142 Pending []PendingExp
143 RecKey map[uint32]Key
144 cache *PageCache
145 Nodes []Node
146 Records []Record
147 }
148
149 func (t *Tree) RootIdx(branch Branch) uint32 { return t.Roots[branch] }
150
151 func (t *Tree) GetRecord(idx uint32) *Record { return t.getRecord(idx) }
152 func (t *Tree) GetNode(idx uint32) *Node { return t.getNode(idx) }
153
154 // IsMemory returns true if the tree has no disk backing (created with NewTree).
155 func (t *Tree) IsMemory() bool { return t.cache == nil }
156
157 // AllocTree creates a tree with pre-allocated node and record slices for
158 // deserialization from a flat binary file. roots is indexed by Branch constant.
159 func AllocTree(nCount, rCount uint32, roots [C]uint32) *Tree {
160 t := &Tree{
161 Nodes: []Node{:int(nCount):int(nCount)},
162 Records: []Record{:int(rCount):int(rCount)},
163 FreeHead: NullIndex,
164 nCount: nCount,
165 rCount: rCount,
166 RecKey: map[uint32]Key{},
167 Roots: roots,
168 }
169 return t
170 }
171
172 // WalkPath returns the node indices visited from root to the target key
173 // in branch's tree. Returns nil if key is not found.
174 func (t *Tree) WalkPath(branch Branch, key Key) []uint32 {
175 path := []uint32{:0:8}
176 idx := t.Roots[branch]
177 for {
178 path = append(path, idx)
179 node := t.getNode(idx)
180 k := node.KeyCount()
181 p := 0
182 for p < k && keyLess(node.Keys[p], key) {
183 p++
184 }
185 if p < k && keyEqual(key, node.Keys[p]) {
186 return path
187 }
188 if node.IsLeaf() {
189 return nil
190 }
191 child := node.Children[p]
192 if child == NullIndex {
193 return nil
194 }
195 idx = child
196 }
197 }
198
199 func (t *Tree) getNode(idx uint32) *Node {
200 if t.cache != nil {
201 return t.cache.GetNode(idx)
202 }
203 return &t.Nodes[idx]
204 }
205
206 func (t *Tree) getRecord(idx uint32) *Record {
207 if t.cache != nil {
208 return t.cache.GetRecord(idx)
209 }
210 return &t.Records[idx]
211 }
212
213 func (t *Tree) markNodeDirty(idx uint32) {
214 if t.cache != nil {
215 t.cache.MarkNodeDirty(idx)
216 }
217 }
218
219 func (t *Tree) markRecDirty(idx uint32) {
220 if t.cache != nil {
221 t.cache.MarkRecDirty(idx)
222 }
223 }
224
225 func NewTree(cap int) *Tree {
226 if cap < 16 {
227 cap = 16
228 }
229 t := &Tree{
230 Nodes: []Node{:C:cap},
231 Records: []Record{:0:cap},
232 FreeHead: NullIndex,
233 nCount: C,
234 RecKey: map[uint32]Key{},
235 }
236 for i := 0; i < C; i++ {
237 t.Roots[i] = uint32(i)
238 initNodeChildren(&t.Nodes[i])
239 t.Nodes[i].SetBranch(Branch(i))
240 t.Nodes[i].Mult = 1
241 }
242 return t
243 }
244
245 func (t *Tree) allocNode() uint32 {
246 if t.FreeHead != NullIndex {
247 idx := t.FreeHead
248 node := t.getNode(idx)
249 t.FreeHead = node.Children[0]
250 t.FreeCount--
251 *node = Node{}
252 initNodeChildren(node)
253 t.markNodeDirty(idx)
254 return idx
255 }
256 if t.cache != nil {
257 return t.cache.AllocNode(&t.nCount)
258 }
259 idx := t.nCount
260 t.nCount++
261 if int(idx) < len(t.Nodes) {
262 t.Nodes[idx] = Node{}
263 initNodeChildren(&t.Nodes[idx])
264 return idx
265 }
266 var n Node
267 initNodeChildren(&n)
268 t.Nodes = append(t.Nodes, n)
269 return idx
270 }
271
272 func (t *Tree) allocRecord() uint32 {
273 if t.cache != nil {
274 return t.cache.AllocRecord(&t.rCount)
275 }
276 idx := t.rCount
277 t.rCount++
278 if int(idx) < len(t.Records) {
279 t.Records[idx] = Record{}
280 t.Records[idx].Link[0] = NullRec
281 t.Records[idx].Link[1] = NullRec
282 return idx
283 }
284 t.Records = append(t.Records, Record{})
285 t.Records[idx].Link[0] = NullRec
286 t.Records[idx].Link[1] = NullRec
287 return idx
288 }
289
290 func (t *Tree) NodeCount() int { return int(t.nCount) }
291 func (t *Tree) RecordCount() int { return int(t.rCount) }
292
293 func Create(path string) (*Tree, error) {
294 pc, err := CreateStore(path)
295 if err != nil {
296 return nil, err
297 }
298 t := &Tree{
299 FreeHead: NullIndex,
300 FreeCount: 0,
301 nCount: C,
302 RecKey: map[uint32]Key{},
303 cache: pc,
304 }
305 for i := 0; i < C; i++ {
306 t.Roots[i] = uint32(i)
307 }
308 return t, nil
309 }
310
311 func Open(path string) (*Tree, error) {
312 pc, hdr, err := OpenStore(path)
313 if err != nil {
314 return nil, err
315 }
316 t := &Tree{
317 Roots: hdr.roots,
318 FreeHead: hdr.freeHead,
319 FreeCount: hdr.freeCount,
320 nCount: hdr.nCount,
321 rCount: hdr.rCount,
322 RecKey: map[uint32]Key{},
323 cache: pc,
324 }
325 if hdr.pendCount > 0 {
326 pendOff := pc.RecOff + int64(hdr.rCount)*48
327 t.Pending = []PendingExp{:0:int(hdr.pendCount)}
328 for i := uint32(0); i < hdr.pendCount; i++ {
329 var buf [4]byte
330 pc.F.ReadAt(buf[:], pendOff+int64(i)*4)
331 t.Pending = append(t.Pending, PendingExp{
332 Branch: Branch(buf[0]),
333 Depth: le.Uint16(buf[1:3]),
334 Factor: buf[3],
335 })
336 }
337 }
338 t.rebuildRecKey()
339 return t, nil
340 }
341
342 func (t *Tree) rebuildRecKey() {
343 for b := 0; b < C; b++ {
344 t.walkRecKey(t.Roots[b])
345 }
346 }
347
348 func (t *Tree) walkRecKey(idx uint32) {
349 if idx == NullIndex {
350 return
351 }
352 node := t.getNode(idx)
353 if node.IsDeleted() {
354 return
355 }
356 k := node.KeyCount()
357 for i := 0; i < k; i++ {
358 if node.RecPtrs[i] != NullRec {
359 t.RecKey[node.RecPtrs[i]] = node.Keys[i]
360 }
361 }
362 for i := 0; i <= k; i++ {
363 if node.Children[i] != NullIndex {
364 t.walkRecKey(node.Children[i])
365 }
366 }
367 }
368
369 func (t *Tree) Flush() error {
370 if t.cache == nil {
371 return nil
372 }
373 return t.cache.Flush(t)
374 }
375
376 func (t *Tree) Close() error {
377 if t.cache == nil {
378 return nil
379 }
380 return t.cache.Close(t)
381 }
382
383 // LookupRecIdx returns the record index for a key, or NullRec if not found.
384 func (t *Tree) LookupRecIdx(branch Branch, key Key) uint32 {
385 idx, pos, found := t.Walk(branch, key)
386 if !found {
387 return NullRec
388 }
389 node := t.getNode(idx)
390 if node.IsDeleted() {
391 return NullRec
392 }
393 return node.RecPtrs[pos]
394 }
395
396 func (t *Tree) DumpNode(idx uint32) {
397 n := t.getNode(idx)
398 fmt.Printf("node[%d] keys=%d branch=%d leaf=%v children=[%x,%x,%x,%x] rec=[%x,%x,%x]\n",
399 idx, n.KeyCount(), n.GetBranch(), n.IsLeaf(),
400 n.Children[0], n.Children[1], n.Children[2], n.Children[3],
401 n.RecPtrs[0], n.RecPtrs[1], n.RecPtrs[2])
402 }
403
404 // HashKey returns a 128-bit SipHash key for data using the default seed.
405 // Convenience wrapper for callers that don't need domain separation.
406 // For domain-separated keys (e.g. EN vs JA in transdb), prepend a domain
407 // byte to data before calling.
408 func HashKey(data []byte) Key {
409 return Key(siphash.Sum128(siphash.DefaultKey, data))
410 }
411