atomidx.mx raw

   1  package iskra
   2  
   3  import (
   4  	"bufio"
   5  	"bytes"
   6  	"encoding/binary"
   7  	"io"
   8  	"os"
   9  	"sort"
  10  
  11  	"git.smesh.lol/iskradb/lattice"
  12  )
  13  
  14  // AtomIdxEntry is one row of the sidecar index. The lattice key it points
  15  // at is reconstructible from these fields via AtomLinkKey, so we only
  16  // store the components, not the hashed key itself.
  17  //
  18  // Sort order is: SrcLang, then SrcAtom, then ContextA, then DstLang,
  19  // then DstAtom. This makes "all candidates for (srcLang, srcAtom)" and
  20  // "all candidates for (srcLang, srcAtom, contextA)" both prefix scans.
  21  type AtomIdxEntry struct {
  22  	SrcLang    uint8
  23  	DstLang    uint8
  24  	RoleA      uint8
  25  	RoleB      uint8
  26  	Gen        uint8  // GenLegacy or GenContexted
  27  	RArchaic   uint8  // source-corpus register coord: archaism axis
  28  	RDiscourse uint8  // source-corpus register coord: discourse-shape axis
  29  	Weight     uint32 // observation count
  30  	SrcAtom    string
  31  	ContextA   string
  32  	DstAtom    string
  33  	ContextB   string
  34  }
  35  
  36  // AtomIdx is the in-memory sorted index loaded from the sidecar file.
  37  // Built by BuildAtomIdx from a Tree's Bpragmatic records; serialized to
  38  // disk for reuse without rescanning the lattice.
  39  type AtomIdx struct {
  40  	Entries []AtomIdxEntry
  41  }
  42  
  43  // BuildAtomIdx scans every Bpragmatic record in the lattice, decodes
  44  // each as an AtomIdxEntry (recognizing both GenLegacy '=' form and
  45  // GenContexted '|' form), and returns a sorted index. The index is
  46  // disposable: regenerate from the primary store at any time.
  47  func BuildAtomIdx(t *Tree) *AtomIdx {
  48  	idx := &AtomIdx{}
  49  	for ri := range t.RecMeta {
  50  		rec := t.DB().GetRecord(uint32(ri))
  51  		if rec == nil || rec.Branch != uint8(lattice.Bpragmatic) {
  52  			continue
  53  		}
  54  		m := t.MetaAt(uint32(ri))
  55  		if m == nil {
  56  			continue
  57  		}
  58  		form := FormFromRecord(rec, t.Pool())
  59  		w := m.Count
  60  		if w == 0 {
  61  			continue
  62  		}
  63  		// StageTag low nibble = srcLang; high nibble = generation.
  64  		srcLang := m.StageTag & 0x0f
  65  		gen := (m.StageTag >> 4) & 0x0f
  66  
  67  		entry := AtomIdxEntry{
  68  			SrcLang:    srcLang,
  69  			Gen:        gen,
  70  			Weight:     w,
  71  			RArchaic:   m.Extra[2],
  72  			RDiscourse: m.Extra[3],
  73  		}
  74  		if gen == GenContexted {
  75  			// Form: "atomA|contextA|atomB|contextB"
  76  			parts := splitPipe4(form)
  77  			if parts == nil {
  78  				continue
  79  			}
  80  			entry.SrcAtom = parts[0]
  81  			entry.ContextA = parts[1]
  82  			entry.DstAtom = parts[2]
  83  			entry.ContextB = parts[3]
  84  			entry.RoleA = m.Extra[4]
  85  			entry.RoleB = m.Extra[0]
  86  			entry.DstLang = m.Extra[1]
  87  		} else if gen == GenDictionary {
  88  			// Form: "atomA=atomB" with role/lang in Extra.
  89  			eq := -1
  90  			for j := 0; j < len(form); j++ {
  91  				if form[j] == '=' {
  92  					eq = j
  93  					break
  94  				}
  95  			}
  96  			if eq < 0 {
  97  				continue
  98  			}
  99  			entry.SrcAtom = form[:eq]
 100  			entry.DstAtom = form[eq+1:]
 101  			entry.RoleA = m.Extra[4]
 102  			entry.RoleB = m.Extra[0]
 103  			entry.DstLang = m.Extra[1]
 104  		} else {
 105  			// Form: "atomA=atomB" (legacy bilateral, no context, no roles)
 106  			eq := -1
 107  			for j := 0; j < len(form); j++ {
 108  				if form[j] == '=' {
 109  					eq = j
 110  					break
 111  				}
 112  			}
 113  			if eq < 0 {
 114  				continue
 115  			}
 116  			entry.SrcAtom = form[:eq]
 117  			entry.DstAtom = form[eq+1:]
 118  			// Legacy bilateral: srcLang indicated by StageTag; dstLang
 119  			// implicit (the other side of the bilateral). For the JA-EN
 120  			// bilateral table that's the only existing pair, srcLang is
 121  			// in {1,2} and dstLang is the complement.
 122  			if srcLang == 1 {
 123  				entry.DstLang = 2
 124  			} else if srcLang == 2 {
 125  				entry.DstLang = 1
 126  			}
 127  		}
 128  		// Skip pathological entries: anything whose component strings
 129  		// wouldn't fit in uint16 length fields. These are typically
 130  		// lemmatizer artifacts from old ingest (whole sentences treated
 131  		// as atoms) - rare and not useful for lookup.
 132  		if len(entry.SrcAtom) > 0xffff || len(entry.ContextA) > 0xffff ||
 133  			len(entry.DstAtom) > 0xffff || len(entry.ContextB) > 0xffff {
 134  			continue
 135  		}
 136  		// Skip empty src/dst atoms - lemmatizer-collapsed records that
 137  		// won't be looked up by any meaningful query.
 138  		if entry.SrcAtom == "" || entry.DstAtom == "" {
 139  			continue
 140  		}
 141  		idx.Entries = append(idx.Entries, entry)
 142  	}
 143  	// Mirror every entry so lookups work in both directions.
 144  	// Records are ingested langA->langB only; without mirroring,
 145  	// the reverse direction finds zero candidates.
 146  	base := len(idx.Entries)
 147  	for i := 0; i < base; i++ {
 148  		e := &idx.Entries[i]
 149  		idx.Entries = append(idx.Entries, AtomIdxEntry{
 150  			SrcLang:    e.DstLang,
 151  			DstLang:    e.SrcLang,
 152  			RoleA:      e.RoleB,
 153  			RoleB:      e.RoleA,
 154  			Gen:        e.Gen,
 155  			RArchaic:   e.RArchaic,
 156  			RDiscourse: e.RDiscourse,
 157  			Weight:     e.Weight,
 158  			SrcAtom:    e.DstAtom,
 159  			ContextA:   e.ContextB,
 160  			DstAtom:    e.SrcAtom,
 161  			ContextB:   e.ContextA,
 162  		})
 163  	}
 164  	idx.sortEntries()
 165  	return idx
 166  }
 167  
 168  // atomIdxSort implements sort.Interface for AtomIdxEntry slices, ordering
 169  // by (SrcLang, SrcAtom, ContextA, DstLang, DstAtom). Used by sortEntries
 170  // to produce the prefix-scannable order.
 171  type atomIdxSort []AtomIdxEntry
 172  
 173  func (s atomIdxSort) Len() int32 { return len(s) }
 174  func (s atomIdxSort) Swap(i, j int32) {
 175  	s[i], s[j] = s[j], s[i]
 176  }
 177  func (s atomIdxSort) Less(i, j int32) bool {
 178  	a := &s[i]
 179  	b := &s[j]
 180  	if a.SrcLang != b.SrcLang {
 181  		return a.SrcLang < b.SrcLang
 182  	}
 183  	if a.SrcAtom != b.SrcAtom {
 184  		return a.SrcAtom < b.SrcAtom
 185  	}
 186  	if a.ContextA != b.ContextA {
 187  		return a.ContextA < b.ContextA
 188  	}
 189  	if a.DstLang != b.DstLang {
 190  		return a.DstLang < b.DstLang
 191  	}
 192  	return a.DstAtom < b.DstAtom
 193  }
 194  
 195  func (idx *AtomIdx) sortEntries() {
 196  	sort.Sort(atomIdxSort(idx.Entries))
 197  }
 198  
 199  // FindBySrc returns all entries with the given srcLang and srcAtom.
 200  // Prefix scan on the sorted slice - O(log N) binary search + O(K) range
 201  // where K is the number of matches.
 202  func (idx *AtomIdx) FindBySrc(srcLang uint8, srcAtom string) []AtomIdxEntry {
 203  	// Binary-search for first entry >= (srcLang, srcAtom).
 204  	lo := sort.Search(len(idx.Entries), func(i int32) bool {
 205  		e := &idx.Entries[i]
 206  		if e.SrcLang != srcLang {
 207  			return e.SrcLang > srcLang
 208  		}
 209  		return e.SrcAtom >= srcAtom
 210  	})
 211  	var out []AtomIdxEntry
 212  	for k := lo; k < len(idx.Entries); k++ {
 213  		e := &idx.Entries[k]
 214  		if e.SrcLang != srcLang || e.SrcAtom != srcAtom {
 215  			break
 216  		}
 217  		out = append(out, *e)
 218  	}
 219  	return out
 220  }
 221  
 222  // FindBySrcContext returns entries that exactly match the full (srcLang,
 223  // srcAtom, contextA) triple. Used by the lookup function's most-specific
 224  // tier before relaxing context.
 225  func (idx *AtomIdx) FindBySrcContext(srcLang uint8, srcAtom, contextA string) []AtomIdxEntry {
 226  	all := idx.FindBySrc(srcLang, srcAtom)
 227  	var out []AtomIdxEntry
 228  	for _, e := range all {
 229  		if e.ContextA == contextA {
 230  			out = append(out, e)
 231  		}
 232  	}
 233  	return out
 234  }
 235  
 236  // Save writes the index to a binary file. Format is a length-prefixed
 237  // stream of entries; can be read back by Load.
 238  func (idx *AtomIdx) Save(path string) error {
 239  	f, err := os.Create(path)
 240  	if err != nil {
 241  		return err
 242  	}
 243  	defer f.Close()
 244  	w := bufio.NewWriter(f)
 245  	// Magic + version header. v2 added per-record register coordinate.
 246  	if _, err := w.Write([]byte("ATIDX\x02")); err != nil {
 247  		return err
 248  	}
 249  	// Entry count.
 250  	var hdr [4]byte
 251  	binary.LittleEndian().PutUint32(hdr[:], uint32(len(idx.Entries)))
 252  	if _, err := w.Write(hdr[:]); err != nil {
 253  		return err
 254  	}
 255  	for _, e := range idx.Entries {
 256  		if err := writeEntry(w, &e); err != nil {
 257  			return err
 258  		}
 259  	}
 260  	return w.Flush()
 261  }
 262  
 263  // Load reads a sidecar index from disk.
 264  func LoadAtomIdx(path string) (*AtomIdx, error) {
 265  	f, err := os.Open(path)
 266  	if err != nil {
 267  		return nil, err
 268  	}
 269  	defer f.Close()
 270  	r := bufio.NewReader(f)
 271  	magic := [6]byte{}
 272  	if _, err := io.ReadFull(r, magic[:]); err != nil {
 273  		return nil, err
 274  	}
 275  	if string(magic[:]) != "ATIDX\x02" {
 276  		return nil, ErrAtomIdxFormat
 277  	}
 278  	var hdr [4]byte
 279  	if _, err := io.ReadFull(r, hdr[:]); err != nil {
 280  		return nil, err
 281  	}
 282  	n := binary.LittleEndian().Uint32(hdr[:])
 283  	idx := &AtomIdx{Entries: []AtomIdxEntry{:0:int32(n)}}
 284  	for i := uint32(0); i < n; i++ {
 285  		e, err := readEntry(r)
 286  		if err != nil {
 287  			return nil, err
 288  		}
 289  		idx.Entries = append(idx.Entries, e)
 290  	}
 291  	return idx, nil
 292  }
 293  
 294  const ErrAtomIdxFormat = errorString("atomidx: bad magic / version")
 295  
 296  type errorString string
 297  
 298  func (e errorString) Error() string { return string(e) }
 299  
 300  func writeEntry(w io.Writer, e *AtomIdxEntry) error {
 301  	var hdr [11]byte
 302  	hdr[0] = e.SrcLang
 303  	hdr[1] = e.DstLang
 304  	hdr[2] = e.RoleA
 305  	hdr[3] = e.RoleB
 306  	hdr[4] = e.Gen
 307  	hdr[5] = e.RArchaic
 308  	hdr[6] = e.RDiscourse
 309  	binary.LittleEndian().PutUint32(hdr[7:11], e.Weight)
 310  	if _, err := w.Write(hdr[:]); err != nil {
 311  		return err
 312  	}
 313  	var lens [8]byte
 314  	binary.LittleEndian().PutUint16(lens[0:2], uint16(len(e.SrcAtom)))
 315  	binary.LittleEndian().PutUint16(lens[2:4], uint16(len(e.ContextA)))
 316  	binary.LittleEndian().PutUint16(lens[4:6], uint16(len(e.DstAtom)))
 317  	binary.LittleEndian().PutUint16(lens[6:8], uint16(len(e.ContextB)))
 318  	if _, err := w.Write(lens[:]); err != nil {
 319  		return err
 320  	}
 321  	if _, err := w.Write([]byte(e.SrcAtom)); err != nil {
 322  		return err
 323  	}
 324  	if _, err := w.Write([]byte(e.ContextA)); err != nil {
 325  		return err
 326  	}
 327  	if _, err := w.Write([]byte(e.DstAtom)); err != nil {
 328  		return err
 329  	}
 330  	if _, err := w.Write([]byte(e.ContextB)); err != nil {
 331  		return err
 332  	}
 333  	return nil
 334  }
 335  
 336  func readEntry(r io.Reader) (AtomIdxEntry, error) {
 337  	var hdr [11]byte
 338  	if _, err := io.ReadFull(r, hdr[:]); err != nil {
 339  		return AtomIdxEntry{}, err
 340  	}
 341  	var lens [8]byte
 342  	if _, err := io.ReadFull(r, lens[:]); err != nil {
 343  		return AtomIdxEntry{}, err
 344  	}
 345  	srcLen := int32(binary.LittleEndian().Uint16(lens[0:2]))
 346  	ctxALen := int32(binary.LittleEndian().Uint16(lens[2:4]))
 347  	dstLen := int32(binary.LittleEndian().Uint16(lens[4:6]))
 348  	ctxBLen := int32(binary.LittleEndian().Uint16(lens[6:8]))
 349  	buf := []byte{:srcLen + ctxALen + dstLen + ctxBLen:srcLen + ctxALen + dstLen + ctxBLen}
 350  	if _, err := io.ReadFull(r, buf); err != nil {
 351  		return AtomIdxEntry{}, err
 352  	}
 353  	off := 0
 354  	srcAtom := string(buf[off : off+srcLen])
 355  	off += srcLen
 356  	ctxA := string(buf[off : off+ctxALen])
 357  	off += ctxALen
 358  	dstAtom := string(buf[off : off+dstLen])
 359  	off += dstLen
 360  	ctxB := string(buf[off : off+ctxBLen])
 361  	_ = off
 362  	return AtomIdxEntry{
 363  		SrcLang:    hdr[0],
 364  		DstLang:    hdr[1],
 365  		RoleA:      hdr[2],
 366  		RoleB:      hdr[3],
 367  		Gen:        hdr[4],
 368  		RArchaic:   hdr[5],
 369  		RDiscourse: hdr[6],
 370  		Weight:     binary.LittleEndian().Uint32(hdr[7:11]),
 371  		SrcAtom:    srcAtom,
 372  		ContextA:   ctxA,
 373  		DstAtom:    dstAtom,
 374  		ContextB:   ctxB,
 375  	}, nil
 376  }
 377  
 378  // splitPipe4 splits a "a|b|c|d" form into [a, b, c, d]. Returns nil if
 379  // the form has the wrong number of separators (defensive against legacy
 380  // records that slipped through).
 381  func splitPipe4(s string) []string {
 382  	parts := bytes.SplitN([]byte(s), []byte{'|'}, 4)
 383  	if len(parts) != 4 {
 384  		return nil
 385  	}
 386  	return []string{string(parts[0]), string(parts[1]), string(parts[2]), string(parts[3])}
 387  }
 388