package ingest

import (
	"fmt"
	"io"
	"os"

	"git.smesh.lol/iskradb/lattice"
	"git.smesh.lol/transdb"
)

// DB is the in-memory translation lattice with its string pool.
type DB struct {
	Tree       *lattice.Tree
	StringPool []byte
}

// NewDB creates an empty translation database. cap is forwarded to
// lattice.NewTree; the string pool starts empty with 64 KiB reserved.
func NewDB(cap int) *DB {
	return &DB{
		Tree:       lattice.NewTree(cap),
		StringPool: []byte{:0:65536},
	}
}

// LoadJMdict parses a JMdict XML file (or "-" for stdin) and inserts all
// entries into the database. Paths ending in ".gz" are NOT decompressed:
// openInput rejects them with an error, so decompress first and pass the
// plain file or pipe the data through stdin.
//
// It returns the number of entries inserted, the number skipped (entries for
// which insertJMEntry returned false), and any error from opening the input.
func LoadJMdict(db *DB, path string) (int, int, error) {
	r, cleanup, err := openInput(path)
	if err != nil {
		return 0, 0, err
	}
	defer cleanup()

	inserted := 0
	skipped := 0
	ParseJMdict(r, func(e JMEntry) {
		if insertJMEntry(db, e) {
			inserted++
		} else {
			skipped++
		}
	})
	return inserted, skipped, nil
}

// LoadKanjidic parses a KANJIDIC2 XML file and inserts all kanji.
// It returns the number of kanji inserted and any error from opening the
// input; entries rejected by insertKanjiEntry are not counted.
func LoadKanjidic(db *DB, path string) (int, error) {
	r, cleanup, err := openInput(path)
	if err != nil {
		return 0, err
	}
	defer cleanup()

	inserted := 0
	ParseKanjidic(r, func(e KanjiEntry) {
		if insertKanjiEntry(db, e) {
			inserted++
		}
	})
	return inserted, nil
}

// insertJMEntry inserts one JMdict entry into db: the JA primary form, alias
// records for its readings, EN gloss records, the links between them, the
// optional coord-specific aliases, and generated conjugated / surface forms.
//
// It returns false (inserting nothing) for function words and for entries
// with no primary form, and true otherwise.
func insertJMEntry(db *DB, e JMEntry) bool {
	// Particles, copulae, and auxiliaries are morphological fork labels —
	// they belong in the tree structure, not as content records.
	if e.IsFunction() {
		return false
	}
	form := e.PrimaryForm()
	if form == "" {
		return false
	}
	branch := e.Branch()

	// Build JA record.
	var jaRec lattice.Record
	transdb.SetFormOnRecord(&jaRec, form, &db.StringPool)
	reg, dom, spec := e.RegisterBits()
	jaRec.Branch = transdb.PackBranch(uint8(branch), reg, dom, spec)
	// DataOff/DataLen set by SetFormOnRecord; do not overwrite.

	// Compute entry coord from axes 1-4 (grammatical, pragmatic, valency, register).
	// Base forms are inserted at BOTH coord=0 (universal fallback) and the specific
	// coord when it differs. This enables precise lookup for context-aware translation.
	entryCoord := e.EntryCoord(branch)
	jaKey := transdb.MakeKey(transdb.LangJA, 0, form)

	// Collision check at coord=0.
	if ri := db.Tree.LookupRecIdx(branch, jaKey); ri != lattice.NullRec {
		existing := transdb.FormFromInline(db.Tree.GetRecord(ri), db.StringPool)
		if existing != form {
			fmt.Fprintf(os.Stderr, "collision: key=%016x form=%q existing=%q\n", jaKey, form, existing)
		}
		// NOTE(review): an earlier comment here said the insert is skipped for
		// duplicates/collisions, but control falls through to InsertRec below.
		// Whether that overwrites the existing record depends on InsertRec's
		// duplicate-key behaviour — confirm and either skip or reuse ri.
	}
	jaRI := db.Tree.InsertRec(branch, jaKey, jaRec)

	// Insert alias records for all readings that differ from the primary form.
	// Entries with a kanji primary form (e.g. 珈琲) have katakana readings
	// (e.g. コーヒー) that must be directly lookupable for tokenization and
	// JA→EN translation. Each alias points to the same EN translations.
	var readingRIs []uint32
	for _, reading := range e.Readings {
		if reading == "" || reading == form {
			continue
		}
		rKey := transdb.MakeKey(transdb.LangJA, 0, reading)
		// Reuse an already-present reading record instead of inserting again.
		if ri := db.Tree.LookupRecIdx(branch, rKey); ri != lattice.NullRec {
			readingRIs = append(readingRIs, ri)
			continue
		}
		var rRec lattice.Record
		transdb.SetFormOnRecord(&rRec, reading, &db.StringPool)
		rRec.Branch = uint8(branch)
		ri := db.Tree.InsertRec(branch, rKey, rRec)
		readingRIs = append(readingRIs, ri)
	}

	// Insert English glosses. Collect all EN record indices.
	// Do NOT cache record pointers across InsertRec calls - the Records slice
	// may be reallocated, invalidating pointers from earlier calls.
	firstEN := lattice.NullRec
	var enRIs []uint32
	for _, sense := range e.Senses {
		for _, gloss := range sense.Glosses {
			if gloss == "" {
				continue
			}
			var enRec lattice.Record
			transdb.SetFormOnRecord(&enRec, gloss, &db.StringPool)
			enRec.Branch = transdb.PackBranch(uint8(branch), reg, dom, spec)
			// DataOff/DataLen are used by SetFormOnRecord for overflow pool
			// references; do not overwrite for provenance. Use RecMeta instead.
			enKey := transdb.MakeKey(transdb.LangEN, 0, gloss)
			if ri := db.Tree.LookupRecIdx(branch, enKey); ri != lattice.NullRec {
				if firstEN == lattice.NullRec {
					firstEN = ri
				}
				enRIs = append(enRIs, ri)
				continue
			}
			enRI := db.Tree.InsertRec(branch, enKey, enRec)
			if firstEN == lattice.NullRec {
				firstEN = enRI
			}
			enRIs = append(enRIs, enRI)
		}
	}

	// Wire all links AFTER all inserts to use stable indices (not stale pointers).
	// Re-fetch record pointers fresh here; no more appends after this point.
	for _, enRI := range enRIs {
		if enR := db.Tree.GetRecord(enRI); enR != nil && enR.Link[0] == lattice.NullRec {
			enR.Link[0] = jaRI // EN → JA
		}
	}
	// Wire reading aliases to the primary EN translation.
	for _, rRI := range readingRIs {
		if rRec := db.Tree.GetRecord(rRI); rRec != nil && rRec.Link[0] == lattice.NullRec {
			rRec.Link[0] = firstEN
		}
	}
	if jRec := db.Tree.GetRecord(jaRI); jRec != nil {
		if firstEN != lattice.NullRec {
			jRec.Link[0] = firstEN // JA → EN (primary)
		}
		// Link[1] points to the first reading alias for JA→alt traversal.
		if len(readingRIs) > 0 && jRec.Link[1] == lattice.NullRec {
			jRec.Link[1] = readingRIs[0]
		}
	}

	// Insert specific-coord alias when entryCoord carries non-default axis values.
	// This enables context-aware lookup (vt/vi disambiguation, register selection).
	// The coord=0 record above is the universal fallback; this record is the precise hit.
	// firstEN != NullRec implies at least one sense with a non-empty gloss, so
	// e.Senses[0] below is in range.
	if entryCoord != 0 && firstEN != lattice.NullRec {
		jaKeyC := transdb.MakeKey(transdb.LangJA, entryCoord, form)
		if db.Tree.LookupRecIdx(branch, jaKeyC) == lattice.NullRec {
			var cRec lattice.Record
			transdb.SetFormOnRecord(&cRec, form, &db.StringPool)
			cRec.Branch = jaRec.Branch
			newRI := db.Tree.InsertRec(branch, jaKeyC, cRec)
			if r := db.Tree.GetRecord(newRI); r != nil {
				r.Link[0] = firstEN
			}
		}
		// Specific-coord EN alias → links to JA specific-coord record.
		for _, gloss := range e.Senses[0].Glosses {
			if gloss == "" {
				continue
			}
			enKeyC := transdb.MakeKey(transdb.LangEN, entryCoord, gloss)
			if db.Tree.LookupRecIdx(branch, enKeyC) == lattice.NullRec {
				var eRec lattice.Record
				transdb.SetFormOnRecord(&eRec, gloss, &db.StringPool)
				eRec.Branch = jaRec.Branch
				newRI := db.Tree.InsertRec(branch, enKeyC, eRec)
				// Wire: EN coord-specific → JA coord-specific
				jaCoordRI := db.Tree.LookupRecIdx(branch, jaKeyC)
				if r := db.Tree.GetRecord(newRI); r != nil && jaCoordRI != lattice.NullRec {
					r.Link[0] = jaCoordRI
				}
			}
		}
	}

	// Generate JA conjugated forms (食べた, 食べている, etc.) for verb entries.
	// Also register verb class in Bcooccur so InflectJAFromTree can compute
	// any form at runtime without requiring all forms to be pre-stored.
	if firstEN != lattice.NullRec {
		verbClass := e.VerbClass()
		classCode := transdb.VerbClassCode(verbClass)
		GenerateConjugations(db, form, verbClass, branch, jaRec.Branch, firstEN)
		transdb.RegisterVerbClass(db.Tree, &db.StringPool, transdb.LangJA, form, classCode)
		for _, reading := range e.Readings {
			// Same filter as the alias loop above: an empty reading has no
			// stem to conjugate and must not be registered.
			if reading == "" || reading == form {
				continue
			}
			GenerateConjugations(db, reading, verbClass, branch, jaRec.Branch, firstEN)
			transdb.RegisterVerbClass(db.Tree, &db.StringPool, transdb.LangJA, reading, classCode)
		}
	}

	// Generate EN surface forms (sang, singing, sings) for verb entries.
	// Each EN form links to the JA record so EN→JA can navigate the cluster.
	if jaRI != lattice.NullRec && branch == lattice.Bverb {
		enBranchByte := transdb.PackBranch(uint8(lattice.Bverb), reg, dom, spec)
		for _, sense := range e.Senses {
			for _, gloss := range sense.Glosses {
				// Empty glosses were never inserted as EN records; skip them
				// here as well.
				if gloss == "" {
					continue
				}
				GenerateENForms(db, gloss, enBranchByte, jaRI)
			}
		}
	}

	return true
}

// insertKanjiEntry inserts a single kanji literal as a noun-branch JA record,
// plus (when not already present) a record for its first ON reading and its
// first English meaning, and wires the links between them.
//
// It returns false when the literal is empty or its key already exists in the
// tree, and true otherwise. When the ON reading or the meaning key already
// exists, no record is inserted for it and no link to it is wired.
func insertKanjiEntry(db *DB, e KanjiEntry) bool {
	if e.Literal == "" {
		return false
	}
	var rec lattice.Record
	transdb.SetFormOnRecord(&rec, e.Literal, &db.StringPool)
	rec.Branch = uint8(lattice.Bnoun)

	key := transdb.MakeKey(transdb.LangJA, 0, e.Literal)
	if db.Tree.LookupRecIdx(lattice.Bnoun, key) != lattice.NullRec {
		return false // already in tree from JMdict
	}
	kaRI := db.Tree.InsertRec(lattice.Bnoun, key, rec)

	// Link ON reading as Link[1].
	if len(e.OnYomi) > 0 {
		var onRec lattice.Record
		transdb.SetFormOnRecord(&onRec, e.OnYomi[0], &db.StringPool)
		onRec.Branch = uint8(lattice.Bnoun)
		onKey := transdb.MakeKey(transdb.LangJA, 0, e.OnYomi[0])
		if db.Tree.LookupRecIdx(lattice.Bnoun, onKey) == lattice.NullRec {
			onRI := db.Tree.InsertRec(lattice.Bnoun, onKey, onRec)
			// Fetch the kanji record only after the insert so the pointer is fresh.
			kaRec := db.Tree.GetRecord(kaRI)
			if kaRec != nil {
				kaRec.Link[1] = onRI
			}
		}
	}

	// Insert first English meaning, wire link.
	if len(e.Meanings) > 0 {
		gloss := e.Meanings[0]
		var enRec lattice.Record
		transdb.SetFormOnRecord(&enRec, gloss, &db.StringPool)
		enRec.Branch = uint8(lattice.Bnoun)
		enKey := transdb.MakeKey(transdb.LangEN, 0, gloss)
		if db.Tree.LookupRecIdx(lattice.Bnoun, enKey) == lattice.NullRec {
			enRI := db.Tree.InsertRec(lattice.Bnoun, enKey, enRec)
			kaRec := db.Tree.GetRecord(kaRI)
			enRec2 := db.Tree.GetRecord(enRI)
			if kaRec != nil && enRI != lattice.NullRec {
				kaRec.Link[0] = enRI // kanji → EN
			}
			if enRec2 != nil {
				enRec2.Link[0] = kaRI // EN → kanji
			}
		}
	}
	return true
}

// openInput returns a reader for path.
// Use "-" for stdin (e.g. after: gunzip -c file.gz | transdb load -jmdict -).
// .gz files: Moxie cannot fork/exec a subprocess (runtime limitation), so
// pass stdin or a pre-decompressed path.
func openInput(path string) (io.Reader, func(), error) {
	// Cleanup used by every outcome that leaves nothing to release.
	noop := func() {}

	n := len(path)
	switch {
	case path == "-":
		// Stdin belongs to the process; the caller must not close it.
		return os.Stdin, noop, nil
	case n > 3 && path[n-3:] == ".gz":
		// Compressed input is refused up front with a hint on how to feed
		// the decompressed stream in via stdin.
		return nil, noop, fmt.Errorf(
			"%q is a .gz file; Moxie cannot spawn gunzip internally.\n"+
				"Decompress first: gunzip -c %s | transdb load -jmdict -",
			path, path)
	}

	// Plain file: hand back the handle plus a cleanup that closes it.
	file, err := os.Open(path)
	if err != nil {
		return nil, noop, err
	}
	closeFile := func() { file.Close() }
	return file, closeFile, nil
}