package iskra import ( "bufio" "bytes" "encoding/binary" "io" "os" "sort" "git.smesh.lol/iskradb/lattice" ) // AtomIdxEntry is one row of the sidecar index. The lattice key it points // at is reconstructible from these fields via AtomLinkKey, so we only // store the components, not the hashed key itself. // // Sort order is: SrcLang, then SrcAtom, then ContextA, then DstLang, // then DstAtom. This makes "all candidates for (srcLang, srcAtom)" and // "all candidates for (srcLang, srcAtom, contextA)" both prefix scans. type AtomIdxEntry struct { SrcLang uint8 DstLang uint8 RoleA uint8 RoleB uint8 Gen uint8 // GenLegacy or GenContexted RArchaic uint8 // source-corpus register coord: archaism axis RDiscourse uint8 // source-corpus register coord: discourse-shape axis Weight uint32 // observation count SrcAtom string ContextA string DstAtom string ContextB string } // AtomIdx is the in-memory sorted index loaded from the sidecar file. // Built by BuildAtomIdx from a Tree's Bpragmatic records; serialized to // disk for reuse without rescanning the lattice. type AtomIdx struct { Entries []AtomIdxEntry } // BuildAtomIdx scans every Bpragmatic record in the lattice, decodes // each as an AtomIdxEntry (recognizing both GenLegacy '=' form and // GenContexted '|' form), and returns a sorted index. The index is // disposable: regenerate from the primary store at any time. func BuildAtomIdx(t *Tree) *AtomIdx { idx := &AtomIdx{} for ri := range t.RecMeta { rec := t.DB().GetRecord(uint32(ri)) if rec == nil || rec.Branch != uint8(lattice.Bpragmatic) { continue } m := t.MetaAt(uint32(ri)) if m == nil { continue } form := FormFromRecord(rec, t.Pool()) w := m.Count if w == 0 { continue } // StageTag low nibble = srcLang; high nibble = generation. srcLang := m.StageTag & 0x0f gen := (m.StageTag >> 4) & 0x0f entry := AtomIdxEntry{ SrcLang: srcLang, Gen: gen, Weight: w, RArchaic: m.Extra[2], RDiscourse: m.Extra[3], } if gen == GenContexted { // Form: "atomA|contextA|atomB|contextB" parts := splitPipe4(form) if parts == nil { continue } entry.SrcAtom = parts[0] entry.ContextA = parts[1] entry.DstAtom = parts[2] entry.ContextB = parts[3] entry.RoleA = m.Extra[4] entry.RoleB = m.Extra[0] entry.DstLang = m.Extra[1] } else if gen == GenDictionary { // Form: "atomA=atomB" with role/lang in Extra. eq := -1 for j := 0; j < len(form); j++ { if form[j] == '=' { eq = j break } } if eq < 0 { continue } entry.SrcAtom = form[:eq] entry.DstAtom = form[eq+1:] entry.RoleA = m.Extra[4] entry.RoleB = m.Extra[0] entry.DstLang = m.Extra[1] } else { // Form: "atomA=atomB" (legacy bilateral, no context, no roles) eq := -1 for j := 0; j < len(form); j++ { if form[j] == '=' { eq = j break } } if eq < 0 { continue } entry.SrcAtom = form[:eq] entry.DstAtom = form[eq+1:] // Legacy bilateral: srcLang indicated by StageTag; dstLang // implicit (the other side of the bilateral). For the JA-EN // bilateral table that's the only existing pair, srcLang is // in {1,2} and dstLang is the complement. if srcLang == 1 { entry.DstLang = 2 } else if srcLang == 2 { entry.DstLang = 1 } } // Skip pathological entries: anything whose component strings // wouldn't fit in uint16 length fields. These are typically // lemmatizer artifacts from old ingest (whole sentences treated // as atoms) - rare and not useful for lookup. if len(entry.SrcAtom) > 0xffff || len(entry.ContextA) > 0xffff || len(entry.DstAtom) > 0xffff || len(entry.ContextB) > 0xffff { continue } // Skip empty src/dst atoms - lemmatizer-collapsed records that // won't be looked up by any meaningful query. if entry.SrcAtom == "" || entry.DstAtom == "" { continue } idx.Entries = append(idx.Entries, entry) } // Mirror every entry so lookups work in both directions. // Records are ingested langA->langB only; without mirroring, // the reverse direction finds zero candidates. base := len(idx.Entries) for i := 0; i < base; i++ { e := &idx.Entries[i] idx.Entries = append(idx.Entries, AtomIdxEntry{ SrcLang: e.DstLang, DstLang: e.SrcLang, RoleA: e.RoleB, RoleB: e.RoleA, Gen: e.Gen, RArchaic: e.RArchaic, RDiscourse: e.RDiscourse, Weight: e.Weight, SrcAtom: e.DstAtom, ContextA: e.ContextB, DstAtom: e.SrcAtom, ContextB: e.ContextA, }) } idx.sortEntries() return idx } // atomIdxSort implements sort.Interface for AtomIdxEntry slices, ordering // by (SrcLang, SrcAtom, ContextA, DstLang, DstAtom). Used by sortEntries // to produce the prefix-scannable order. type atomIdxSort []AtomIdxEntry func (s atomIdxSort) Len() int32 { return len(s) } func (s atomIdxSort) Swap(i, j int32) { s[i], s[j] = s[j], s[i] } func (s atomIdxSort) Less(i, j int32) bool { a := &s[i] b := &s[j] if a.SrcLang != b.SrcLang { return a.SrcLang < b.SrcLang } if a.SrcAtom != b.SrcAtom { return a.SrcAtom < b.SrcAtom } if a.ContextA != b.ContextA { return a.ContextA < b.ContextA } if a.DstLang != b.DstLang { return a.DstLang < b.DstLang } return a.DstAtom < b.DstAtom } func (idx *AtomIdx) sortEntries() { sort.Sort(atomIdxSort(idx.Entries)) } // FindBySrc returns all entries with the given srcLang and srcAtom. // Prefix scan on the sorted slice - O(log N) binary search + O(K) range // where K is the number of matches. func (idx *AtomIdx) FindBySrc(srcLang uint8, srcAtom string) []AtomIdxEntry { // Binary-search for first entry >= (srcLang, srcAtom). lo := sort.Search(len(idx.Entries), func(i int32) bool { e := &idx.Entries[i] if e.SrcLang != srcLang { return e.SrcLang > srcLang } return e.SrcAtom >= srcAtom }) var out []AtomIdxEntry for k := lo; k < len(idx.Entries); k++ { e := &idx.Entries[k] if e.SrcLang != srcLang || e.SrcAtom != srcAtom { break } out = append(out, *e) } return out } // FindBySrcContext returns entries that exactly match the full (srcLang, // srcAtom, contextA) triple. Used by the lookup function's most-specific // tier before relaxing context. func (idx *AtomIdx) FindBySrcContext(srcLang uint8, srcAtom, contextA string) []AtomIdxEntry { all := idx.FindBySrc(srcLang, srcAtom) var out []AtomIdxEntry for _, e := range all { if e.ContextA == contextA { out = append(out, e) } } return out } // Save writes the index to a binary file. Format is a length-prefixed // stream of entries; can be read back by Load. func (idx *AtomIdx) Save(path string) error { f, err := os.Create(path) if err != nil { return err } defer f.Close() w := bufio.NewWriter(f) // Magic + version header. v2 added per-record register coordinate. if _, err := w.Write([]byte("ATIDX\x02")); err != nil { return err } // Entry count. var hdr [4]byte binary.LittleEndian().PutUint32(hdr[:], uint32(len(idx.Entries))) if _, err := w.Write(hdr[:]); err != nil { return err } for _, e := range idx.Entries { if err := writeEntry(w, &e); err != nil { return err } } return w.Flush() } // Load reads a sidecar index from disk. func LoadAtomIdx(path string) (*AtomIdx, error) { f, err := os.Open(path) if err != nil { return nil, err } defer f.Close() r := bufio.NewReader(f) magic := [6]byte{} if _, err := io.ReadFull(r, magic[:]); err != nil { return nil, err } if string(magic[:]) != "ATIDX\x02" { return nil, ErrAtomIdxFormat } var hdr [4]byte if _, err := io.ReadFull(r, hdr[:]); err != nil { return nil, err } n := binary.LittleEndian().Uint32(hdr[:]) idx := &AtomIdx{Entries: []AtomIdxEntry{:0:int32(n)}} for i := uint32(0); i < n; i++ { e, err := readEntry(r) if err != nil { return nil, err } idx.Entries = append(idx.Entries, e) } return idx, nil } const ErrAtomIdxFormat = errorString("atomidx: bad magic / version") type errorString string func (e errorString) Error() string { return string(e) } func writeEntry(w io.Writer, e *AtomIdxEntry) error { var hdr [11]byte hdr[0] = e.SrcLang hdr[1] = e.DstLang hdr[2] = e.RoleA hdr[3] = e.RoleB hdr[4] = e.Gen hdr[5] = e.RArchaic hdr[6] = e.RDiscourse binary.LittleEndian().PutUint32(hdr[7:11], e.Weight) if _, err := w.Write(hdr[:]); err != nil { return err } var lens [8]byte binary.LittleEndian().PutUint16(lens[0:2], uint16(len(e.SrcAtom))) binary.LittleEndian().PutUint16(lens[2:4], uint16(len(e.ContextA))) binary.LittleEndian().PutUint16(lens[4:6], uint16(len(e.DstAtom))) binary.LittleEndian().PutUint16(lens[6:8], uint16(len(e.ContextB))) if _, err := w.Write(lens[:]); err != nil { return err } if _, err := w.Write([]byte(e.SrcAtom)); err != nil { return err } if _, err := w.Write([]byte(e.ContextA)); err != nil { return err } if _, err := w.Write([]byte(e.DstAtom)); err != nil { return err } if _, err := w.Write([]byte(e.ContextB)); err != nil { return err } return nil } func readEntry(r io.Reader) (AtomIdxEntry, error) { var hdr [11]byte if _, err := io.ReadFull(r, hdr[:]); err != nil { return AtomIdxEntry{}, err } var lens [8]byte if _, err := io.ReadFull(r, lens[:]); err != nil { return AtomIdxEntry{}, err } srcLen := int32(binary.LittleEndian().Uint16(lens[0:2])) ctxALen := int32(binary.LittleEndian().Uint16(lens[2:4])) dstLen := int32(binary.LittleEndian().Uint16(lens[4:6])) ctxBLen := int32(binary.LittleEndian().Uint16(lens[6:8])) buf := []byte{:srcLen + ctxALen + dstLen + ctxBLen:srcLen + ctxALen + dstLen + ctxBLen} if _, err := io.ReadFull(r, buf); err != nil { return AtomIdxEntry{}, err } off := 0 srcAtom := string(buf[off : off+srcLen]) off += srcLen ctxA := string(buf[off : off+ctxALen]) off += ctxALen dstAtom := string(buf[off : off+dstLen]) off += dstLen ctxB := string(buf[off : off+ctxBLen]) _ = off return AtomIdxEntry{ SrcLang: hdr[0], DstLang: hdr[1], RoleA: hdr[2], RoleB: hdr[3], Gen: hdr[4], RArchaic: hdr[5], RDiscourse: hdr[6], Weight: binary.LittleEndian().Uint32(hdr[7:11]), SrcAtom: srcAtom, ContextA: ctxA, DstAtom: dstAtom, ContextB: ctxB, }, nil } // splitPipe4 splits a "a|b|c|d" form into [a, b, c, d]. Returns nil if // the form has the wrong number of separators (defensive against legacy // records that slipped through). func splitPipe4(s string) []string { parts := bytes.SplitN([]byte(s), []byte{'|'}, 4) if len(parts) != 4 { return nil } return []string{string(parts[0]), string(parts[1]), string(parts[2]), string(parts[3])} }