atomidx.mx raw
1 package iskra
2
3 import (
4 "bufio"
5 "bytes"
6 "encoding/binary"
7 "io"
8 "os"
9 "sort"
10
11 "git.smesh.lol/iskradb/lattice"
12 )
13
14 // AtomIdxEntry is one row of the sidecar index. The lattice key it points
15 // at is reconstructible from these fields via AtomLinkKey, so we only
16 // store the components, not the hashed key itself.
17 //
18 // Sort order is: SrcLang, then SrcAtom, then ContextA, then DstLang,
19 // then DstAtom. This makes "all candidates for (srcLang, srcAtom)" and
20 // "all candidates for (srcLang, srcAtom, contextA)" both prefix scans.
21 type AtomIdxEntry struct {
22 SrcLang uint8
23 DstLang uint8
24 RoleA uint8
25 RoleB uint8
26 Gen uint8 // GenLegacy or GenContexted
27 RArchaic uint8 // source-corpus register coord: archaism axis
28 RDiscourse uint8 // source-corpus register coord: discourse-shape axis
29 Weight uint32 // observation count
30 SrcAtom string
31 ContextA string
32 DstAtom string
33 ContextB string
34 }
35
36 // AtomIdx is the in-memory sorted index loaded from the sidecar file.
37 // Built by BuildAtomIdx from a Tree's Bpragmatic records; serialized to
38 // disk for reuse without rescanning the lattice.
39 type AtomIdx struct {
40 Entries []AtomIdxEntry
41 }
42
43 // BuildAtomIdx scans every Bpragmatic record in the lattice, decodes
44 // each as an AtomIdxEntry (recognizing both GenLegacy '=' form and
45 // GenContexted '|' form), and returns a sorted index. The index is
46 // disposable: regenerate from the primary store at any time.
47 func BuildAtomIdx(t *Tree) *AtomIdx {
48 idx := &AtomIdx{}
49 for ri := range t.RecMeta {
50 rec := t.DB().GetRecord(uint32(ri))
51 if rec == nil || rec.Branch != uint8(lattice.Bpragmatic) {
52 continue
53 }
54 m := t.MetaAt(uint32(ri))
55 if m == nil {
56 continue
57 }
58 form := FormFromRecord(rec, t.Pool())
59 w := m.Count
60 if w == 0 {
61 continue
62 }
63 // StageTag low nibble = srcLang; high nibble = generation.
64 srcLang := m.StageTag & 0x0f
65 gen := (m.StageTag >> 4) & 0x0f
66
67 entry := AtomIdxEntry{
68 SrcLang: srcLang,
69 Gen: gen,
70 Weight: w,
71 RArchaic: m.Extra[2],
72 RDiscourse: m.Extra[3],
73 }
74 if gen == GenContexted {
75 // Form: "atomA|contextA|atomB|contextB"
76 parts := splitPipe4(form)
77 if parts == nil {
78 continue
79 }
80 entry.SrcAtom = parts[0]
81 entry.ContextA = parts[1]
82 entry.DstAtom = parts[2]
83 entry.ContextB = parts[3]
84 entry.RoleA = m.Extra[4]
85 entry.RoleB = m.Extra[0]
86 entry.DstLang = m.Extra[1]
87 } else if gen == GenDictionary {
88 // Form: "atomA=atomB" with role/lang in Extra.
89 eq := -1
90 for j := 0; j < len(form); j++ {
91 if form[j] == '=' {
92 eq = j
93 break
94 }
95 }
96 if eq < 0 {
97 continue
98 }
99 entry.SrcAtom = form[:eq]
100 entry.DstAtom = form[eq+1:]
101 entry.RoleA = m.Extra[4]
102 entry.RoleB = m.Extra[0]
103 entry.DstLang = m.Extra[1]
104 } else {
105 // Form: "atomA=atomB" (legacy bilateral, no context, no roles)
106 eq := -1
107 for j := 0; j < len(form); j++ {
108 if form[j] == '=' {
109 eq = j
110 break
111 }
112 }
113 if eq < 0 {
114 continue
115 }
116 entry.SrcAtom = form[:eq]
117 entry.DstAtom = form[eq+1:]
118 // Legacy bilateral: srcLang indicated by StageTag; dstLang
119 // implicit (the other side of the bilateral). For the JA-EN
120 // bilateral table that's the only existing pair, srcLang is
121 // in {1,2} and dstLang is the complement.
122 if srcLang == 1 {
123 entry.DstLang = 2
124 } else if srcLang == 2 {
125 entry.DstLang = 1
126 }
127 }
128 // Skip pathological entries: anything whose component strings
129 // wouldn't fit in uint16 length fields. These are typically
130 // lemmatizer artifacts from old ingest (whole sentences treated
131 // as atoms) - rare and not useful for lookup.
132 if len(entry.SrcAtom) > 0xffff || len(entry.ContextA) > 0xffff ||
133 len(entry.DstAtom) > 0xffff || len(entry.ContextB) > 0xffff {
134 continue
135 }
136 // Skip empty src/dst atoms - lemmatizer-collapsed records that
137 // won't be looked up by any meaningful query.
138 if entry.SrcAtom == "" || entry.DstAtom == "" {
139 continue
140 }
141 idx.Entries = append(idx.Entries, entry)
142 }
143 // Mirror every entry so lookups work in both directions.
144 // Records are ingested langA->langB only; without mirroring,
145 // the reverse direction finds zero candidates.
146 base := len(idx.Entries)
147 for i := 0; i < base; i++ {
148 e := &idx.Entries[i]
149 idx.Entries = append(idx.Entries, AtomIdxEntry{
150 SrcLang: e.DstLang,
151 DstLang: e.SrcLang,
152 RoleA: e.RoleB,
153 RoleB: e.RoleA,
154 Gen: e.Gen,
155 RArchaic: e.RArchaic,
156 RDiscourse: e.RDiscourse,
157 Weight: e.Weight,
158 SrcAtom: e.DstAtom,
159 ContextA: e.ContextB,
160 DstAtom: e.SrcAtom,
161 ContextB: e.ContextA,
162 })
163 }
164 idx.sortEntries()
165 return idx
166 }
167
168 // atomIdxSort implements sort.Interface for AtomIdxEntry slices, ordering
169 // by (SrcLang, SrcAtom, ContextA, DstLang, DstAtom). Used by sortEntries
170 // to produce the prefix-scannable order.
171 type atomIdxSort []AtomIdxEntry
172
173 func (s atomIdxSort) Len() int32 { return len(s) }
174 func (s atomIdxSort) Swap(i, j int32) {
175 s[i], s[j] = s[j], s[i]
176 }
177 func (s atomIdxSort) Less(i, j int32) bool {
178 a := &s[i]
179 b := &s[j]
180 if a.SrcLang != b.SrcLang {
181 return a.SrcLang < b.SrcLang
182 }
183 if a.SrcAtom != b.SrcAtom {
184 return a.SrcAtom < b.SrcAtom
185 }
186 if a.ContextA != b.ContextA {
187 return a.ContextA < b.ContextA
188 }
189 if a.DstLang != b.DstLang {
190 return a.DstLang < b.DstLang
191 }
192 return a.DstAtom < b.DstAtom
193 }
194
195 func (idx *AtomIdx) sortEntries() {
196 sort.Sort(atomIdxSort(idx.Entries))
197 }
198
199 // FindBySrc returns all entries with the given srcLang and srcAtom.
200 // Prefix scan on the sorted slice - O(log N) binary search + O(K) range
201 // where K is the number of matches.
202 func (idx *AtomIdx) FindBySrc(srcLang uint8, srcAtom string) []AtomIdxEntry {
203 // Binary-search for first entry >= (srcLang, srcAtom).
204 lo := sort.Search(len(idx.Entries), func(i int32) bool {
205 e := &idx.Entries[i]
206 if e.SrcLang != srcLang {
207 return e.SrcLang > srcLang
208 }
209 return e.SrcAtom >= srcAtom
210 })
211 var out []AtomIdxEntry
212 for k := lo; k < len(idx.Entries); k++ {
213 e := &idx.Entries[k]
214 if e.SrcLang != srcLang || e.SrcAtom != srcAtom {
215 break
216 }
217 out = append(out, *e)
218 }
219 return out
220 }
221
222 // FindBySrcContext returns entries that exactly match the full (srcLang,
223 // srcAtom, contextA) triple. Used by the lookup function's most-specific
224 // tier before relaxing context.
225 func (idx *AtomIdx) FindBySrcContext(srcLang uint8, srcAtom, contextA string) []AtomIdxEntry {
226 all := idx.FindBySrc(srcLang, srcAtom)
227 var out []AtomIdxEntry
228 for _, e := range all {
229 if e.ContextA == contextA {
230 out = append(out, e)
231 }
232 }
233 return out
234 }
235
236 // Save writes the index to a binary file. Format is a length-prefixed
237 // stream of entries; can be read back by Load.
238 func (idx *AtomIdx) Save(path string) error {
239 f, err := os.Create(path)
240 if err != nil {
241 return err
242 }
243 defer f.Close()
244 w := bufio.NewWriter(f)
245 // Magic + version header. v2 added per-record register coordinate.
246 if _, err := w.Write([]byte("ATIDX\x02")); err != nil {
247 return err
248 }
249 // Entry count.
250 var hdr [4]byte
251 binary.LittleEndian().PutUint32(hdr[:], uint32(len(idx.Entries)))
252 if _, err := w.Write(hdr[:]); err != nil {
253 return err
254 }
255 for _, e := range idx.Entries {
256 if err := writeEntry(w, &e); err != nil {
257 return err
258 }
259 }
260 return w.Flush()
261 }
262
263 // Load reads a sidecar index from disk.
264 func LoadAtomIdx(path string) (*AtomIdx, error) {
265 f, err := os.Open(path)
266 if err != nil {
267 return nil, err
268 }
269 defer f.Close()
270 r := bufio.NewReader(f)
271 magic := [6]byte{}
272 if _, err := io.ReadFull(r, magic[:]); err != nil {
273 return nil, err
274 }
275 if string(magic[:]) != "ATIDX\x02" {
276 return nil, ErrAtomIdxFormat
277 }
278 var hdr [4]byte
279 if _, err := io.ReadFull(r, hdr[:]); err != nil {
280 return nil, err
281 }
282 n := binary.LittleEndian().Uint32(hdr[:])
283 idx := &AtomIdx{Entries: []AtomIdxEntry{:0:int32(n)}}
284 for i := uint32(0); i < n; i++ {
285 e, err := readEntry(r)
286 if err != nil {
287 return nil, err
288 }
289 idx.Entries = append(idx.Entries, e)
290 }
291 return idx, nil
292 }
293
294 const ErrAtomIdxFormat = errorString("atomidx: bad magic / version")
295
296 type errorString string
297
298 func (e errorString) Error() string { return string(e) }
299
300 func writeEntry(w io.Writer, e *AtomIdxEntry) error {
301 var hdr [11]byte
302 hdr[0] = e.SrcLang
303 hdr[1] = e.DstLang
304 hdr[2] = e.RoleA
305 hdr[3] = e.RoleB
306 hdr[4] = e.Gen
307 hdr[5] = e.RArchaic
308 hdr[6] = e.RDiscourse
309 binary.LittleEndian().PutUint32(hdr[7:11], e.Weight)
310 if _, err := w.Write(hdr[:]); err != nil {
311 return err
312 }
313 var lens [8]byte
314 binary.LittleEndian().PutUint16(lens[0:2], uint16(len(e.SrcAtom)))
315 binary.LittleEndian().PutUint16(lens[2:4], uint16(len(e.ContextA)))
316 binary.LittleEndian().PutUint16(lens[4:6], uint16(len(e.DstAtom)))
317 binary.LittleEndian().PutUint16(lens[6:8], uint16(len(e.ContextB)))
318 if _, err := w.Write(lens[:]); err != nil {
319 return err
320 }
321 if _, err := w.Write([]byte(e.SrcAtom)); err != nil {
322 return err
323 }
324 if _, err := w.Write([]byte(e.ContextA)); err != nil {
325 return err
326 }
327 if _, err := w.Write([]byte(e.DstAtom)); err != nil {
328 return err
329 }
330 if _, err := w.Write([]byte(e.ContextB)); err != nil {
331 return err
332 }
333 return nil
334 }
335
336 func readEntry(r io.Reader) (AtomIdxEntry, error) {
337 var hdr [11]byte
338 if _, err := io.ReadFull(r, hdr[:]); err != nil {
339 return AtomIdxEntry{}, err
340 }
341 var lens [8]byte
342 if _, err := io.ReadFull(r, lens[:]); err != nil {
343 return AtomIdxEntry{}, err
344 }
345 srcLen := int32(binary.LittleEndian().Uint16(lens[0:2]))
346 ctxALen := int32(binary.LittleEndian().Uint16(lens[2:4]))
347 dstLen := int32(binary.LittleEndian().Uint16(lens[4:6]))
348 ctxBLen := int32(binary.LittleEndian().Uint16(lens[6:8]))
349 buf := []byte{:srcLen + ctxALen + dstLen + ctxBLen:srcLen + ctxALen + dstLen + ctxBLen}
350 if _, err := io.ReadFull(r, buf); err != nil {
351 return AtomIdxEntry{}, err
352 }
353 off := 0
354 srcAtom := string(buf[off : off+srcLen])
355 off += srcLen
356 ctxA := string(buf[off : off+ctxALen])
357 off += ctxALen
358 dstAtom := string(buf[off : off+dstLen])
359 off += dstLen
360 ctxB := string(buf[off : off+ctxBLen])
361 _ = off
362 return AtomIdxEntry{
363 SrcLang: hdr[0],
364 DstLang: hdr[1],
365 RoleA: hdr[2],
366 RoleB: hdr[3],
367 Gen: hdr[4],
368 RArchaic: hdr[5],
369 RDiscourse: hdr[6],
370 Weight: binary.LittleEndian().Uint32(hdr[7:11]),
371 SrcAtom: srcAtom,
372 ContextA: ctxA,
373 DstAtom: dstAtom,
374 ContextB: ctxB,
375 }, nil
376 }
377
378 // splitPipe4 splits a "a|b|c|d" form into [a, b, c, d]. Returns nil if
379 // the form has the wrong number of separators (defensive against legacy
380 // records that slipped through).
381 func splitPipe4(s string) []string {
382 parts := bytes.SplitN([]byte(s), []byte{'|'}, 4)
383 if len(parts) != 4 {
384 return nil
385 }
386 return []string{string(parts[0]), string(parts[1]), string(parts[2]), string(parts[3])}
387 }
388