load.mx raw
1 package ingest
2
3 import (
4 "fmt"
5 "io"
6 "os"
7
8 "git.smesh.lol/iskradb/lattice"
9 "git.smesh.lol/transdb"
10 )
11
// DB is the in-memory translation lattice with its string pool.
type DB struct {
	// Tree is the lattice that the loaders in this file insert records into
	// and look records up from.
	Tree *lattice.Tree
	// StringPool is the byte pool passed to transdb.SetFormOnRecord (by
	// pointer) and transdb.FormFromInline together with a record's form.
	StringPool []byte
}
17
18 // NewDB creates an empty translation database.
19 func NewDB(cap int) *DB {
20 return &DB{
21 Tree: lattice.NewTree(cap),
22 StringPool: []byte{:0:65536},
23 }
24 }
25
26 // LoadJMdict parses a JMdict XML file (or "-" for stdin) and inserts all
27 // entries into the database. If path ends in ".gz", gunzip is used to decompress.
28 func LoadJMdict(db *DB, path string) (int, int, error) {
29 r, cleanup, err := openInput(path)
30 if err != nil {
31 return 0, 0, err
32 }
33 defer cleanup()
34
35 inserted := 0
36 skipped := 0
37
38 ParseJMdict(r, func(e JMEntry) {
39 if insertJMEntry(db, e) {
40 inserted++
41 } else {
42 skipped++
43 }
44 })
45
46 return inserted, skipped, nil
47 }
48
49 // LoadKanjidic parses a KANJIDIC2 XML file and inserts all kanji.
50 func LoadKanjidic(db *DB, path string) (int, error) {
51 r, cleanup, err := openInput(path)
52 if err != nil {
53 return 0, err
54 }
55 defer cleanup()
56
57 inserted := 0
58 ParseKanjidic(r, func(e KanjiEntry) {
59 if insertKanjiEntry(db, e) {
60 inserted++
61 }
62 })
63 return inserted, nil
64 }
65
66 func insertJMEntry(db *DB, e JMEntry) bool {
67 // Particles, copulae, and auxiliaries are morphological fork labels —
68 // they belong in the tree structure, not as content records.
69 if e.IsFunction() {
70 return false
71 }
72 form := e.PrimaryForm()
73 if form == "" {
74 return false
75 }
76 branch := e.Branch()
77
78 // Build JA record.
79 var jaRec lattice.Record
80 transdb.SetFormOnRecord(&jaRec, form, &db.StringPool)
81 reg, dom, spec := e.RegisterBits()
82 jaRec.Branch = transdb.PackBranch(uint8(branch), reg, dom, spec)
83 // DataOff/DataLen set by SetFormOnRecord; do not overwrite.
84
85 // Compute entry coord from axes 1-4 (grammatical, pragmatic, valency, register).
86 // Base forms are inserted at BOTH coord=0 (universal fallback) and the specific
87 // coord when it differs. This enables precise lookup for context-aware translation.
88 entryCoord := e.EntryCoord(branch)
89
90 jaKey := transdb.MakeKey(transdb.LangJA, 0, form)
91
92 // Collision check at coord=0.
93 if ri := db.Tree.LookupRecIdx(branch, jaKey); ri != lattice.NullRec {
94 existing := transdb.FormFromInline(db.Tree.GetRecord(ri), db.StringPool)
95 if existing != form {
96 fmt.Fprintf(os.Stderr, "collision: key=%016x form=%q existing=%q\n", jaKey, form, existing)
97 }
98 // Either duplicate or collision - skip insert to avoid overwriting.
99 }
100
101 jaRI := db.Tree.InsertRec(branch, jaKey, jaRec)
102
103 // Insert alias records for all readings that differ from the primary form.
104 // Entries with a kanji primary form (e.g. 珈琲) have katakana readings
105 // (e.g. コーヒー) that must be directly lookupable for tokenization and
106 // JA→EN translation. Each alias points to the same EN translations.
107 var readingRIs []uint32
108 for _, reading := range e.Readings {
109 if reading == "" || reading == form {
110 continue
111 }
112 rKey := transdb.MakeKey(transdb.LangJA, 0, reading)
113 if db.Tree.LookupRecIdx(branch, rKey) != lattice.NullRec {
114 if ri := db.Tree.LookupRecIdx(branch, rKey); ri != lattice.NullRec {
115 readingRIs = append(readingRIs, ri)
116 }
117 continue
118 }
119 var rRec lattice.Record
120 transdb.SetFormOnRecord(&rRec, reading, &db.StringPool)
121 rRec.Branch = uint8(branch)
122 ri := db.Tree.InsertRec(branch, rKey, rRec)
123 readingRIs = append(readingRIs, ri)
124 }
125
126 // Insert English glosses. Collect all EN record indices.
127 // Do NOT cache record pointers across InsertRec calls - the Records slice
128 // may be reallocated, invalidating pointers from earlier calls.
129 firstEN := lattice.NullRec
130 var enRIs []uint32
131 for _, sense := range e.Senses {
132 for _, gloss := range sense.Glosses {
133 if gloss == "" {
134 continue
135 }
136 var enRec lattice.Record
137 transdb.SetFormOnRecord(&enRec, gloss, &db.StringPool)
138 enRec.Branch = transdb.PackBranch(uint8(branch), reg, dom, spec)
139 // DataOff/DataLen are used by SetFormOnRecord for overflow pool
140 // references; do not overwrite for provenance. Use RecMeta instead.
141
142 enKey := transdb.MakeKey(transdb.LangEN, 0, gloss)
143
144 if ri := db.Tree.LookupRecIdx(branch, enKey); ri != lattice.NullRec {
145 if firstEN == lattice.NullRec {
146 firstEN = ri
147 }
148 enRIs = append(enRIs, ri)
149 continue
150 }
151
152 enRI := db.Tree.InsertRec(branch, enKey, enRec)
153 if firstEN == lattice.NullRec {
154 firstEN = enRI
155 }
156 enRIs = append(enRIs, enRI)
157 }
158 }
159
160 // Wire all links AFTER all inserts to use stable indices (not stale pointers).
161 // Re-fetch record pointers fresh here; no more appends after this point.
162 for _, enRI := range enRIs {
163 if enR := db.Tree.GetRecord(enRI); enR != nil && enR.Link[0] == lattice.NullRec {
164 enR.Link[0] = jaRI // EN → JA
165 }
166 }
167
168 // Wire reading aliases to the primary EN translation.
169 for _, rRI := range readingRIs {
170 if rRec := db.Tree.GetRecord(rRI); rRec != nil && rRec.Link[0] == lattice.NullRec {
171 rRec.Link[0] = firstEN
172 }
173 }
174
175 if jRec := db.Tree.GetRecord(jaRI); jRec != nil {
176 if firstEN != lattice.NullRec {
177 jRec.Link[0] = firstEN // JA → EN (primary)
178 }
179 // Link[1] points to the first reading alias for JA→alt traversal.
180 if len(readingRIs) > 0 && jRec.Link[1] == lattice.NullRec {
181 jRec.Link[1] = readingRIs[0]
182 }
183 }
184
185 // Insert specific-coord alias when entryCoord carries non-default axis values.
186 // This enables context-aware lookup (vt/vi disambiguation, register selection).
187 // The coord=0 record above is the universal fallback; this record is the precise hit.
188 if entryCoord != 0 && firstEN != lattice.NullRec {
189 jaKeyC := transdb.MakeKey(transdb.LangJA, entryCoord, form)
190 if db.Tree.LookupRecIdx(branch, jaKeyC) == lattice.NullRec {
191 var cRec lattice.Record
192 transdb.SetFormOnRecord(&cRec, form, &db.StringPool)
193 cRec.Branch = jaRec.Branch
194 newRI := db.Tree.InsertRec(branch, jaKeyC, cRec)
195 if r := db.Tree.GetRecord(newRI); r != nil {
196 r.Link[0] = firstEN
197 }
198 }
199 // Specific-coord EN alias → links to JA specific-coord record.
200 for _, gloss := range e.Senses[0].Glosses {
201 if gloss == "" {
202 continue
203 }
204 enKeyC := transdb.MakeKey(transdb.LangEN, entryCoord, gloss)
205 if db.Tree.LookupRecIdx(branch, enKeyC) == lattice.NullRec {
206 var eRec lattice.Record
207 transdb.SetFormOnRecord(&eRec, gloss, &db.StringPool)
208 eRec.Branch = jaRec.Branch
209 newRI := db.Tree.InsertRec(branch, enKeyC, eRec)
210 // Wire: EN coord-specific → JA coord-specific
211 jaCoordRI := db.Tree.LookupRecIdx(branch, jaKeyC)
212 if r := db.Tree.GetRecord(newRI); r != nil && jaCoordRI != lattice.NullRec {
213 r.Link[0] = jaCoordRI
214 }
215 }
216 }
217 }
218
219 // Generate JA conjugated forms (食べた, 食べている, etc.) for verb entries.
220 // Also register verb class in Bcooccur so InflectJAFromTree can compute
221 // any form at runtime without requiring all forms to be pre-stored.
222 if firstEN != lattice.NullRec {
223 verbClass := e.VerbClass()
224 classCode := transdb.VerbClassCode(verbClass)
225 GenerateConjugations(db, form, verbClass, branch, jaRec.Branch, firstEN)
226 transdb.RegisterVerbClass(db.Tree, &db.StringPool, transdb.LangJA, form, classCode)
227 for _, reading := range e.Readings {
228 if reading != form {
229 GenerateConjugations(db, reading, verbClass, branch, jaRec.Branch, firstEN)
230 transdb.RegisterVerbClass(db.Tree, &db.StringPool, transdb.LangJA, reading, classCode)
231 }
232 }
233 }
234
235 // Generate EN surface forms (sang, singing, sings) for verb entries.
236 // Each EN form links to the JA record so EN→JA can navigate the cluster.
237 if jaRI != lattice.NullRec && branch == lattice.Bverb {
238 enBranchByte := transdb.PackBranch(uint8(lattice.Bverb), reg, dom, spec)
239 for _, sense := range e.Senses {
240 for _, gloss := range sense.Glosses {
241 GenerateENForms(db, gloss, enBranchByte, jaRI)
242 }
243 }
244 }
245
246 return true
247 }
248
249 func insertKanjiEntry(db *DB, e KanjiEntry) bool {
250 if e.Literal == "" {
251 return false
252 }
253
254 var rec lattice.Record
255 transdb.SetFormOnRecord(&rec, e.Literal, &db.StringPool)
256 rec.Branch = uint8(lattice.Bnoun)
257
258 key := transdb.MakeKey(transdb.LangJA, 0, e.Literal)
259 if db.Tree.LookupRecIdx(lattice.Bnoun, key) != lattice.NullRec {
260 return false // already in tree from JMdict
261 }
262
263 kaRI := db.Tree.InsertRec(lattice.Bnoun, key, rec)
264
265 // Link ON reading as Link[1].
266 if len(e.OnYomi) > 0 {
267 var onRec lattice.Record
268 transdb.SetFormOnRecord(&onRec, e.OnYomi[0], &db.StringPool)
269 onRec.Branch = uint8(lattice.Bnoun)
270 onKey := transdb.MakeKey(transdb.LangJA, 0, e.OnYomi[0])
271 if db.Tree.LookupRecIdx(lattice.Bnoun, onKey) == lattice.NullRec {
272 onRI := db.Tree.InsertRec(lattice.Bnoun, onKey, onRec)
273 kaRec := db.Tree.GetRecord(kaRI)
274 if kaRec != nil {
275 kaRec.Link[1] = onRI
276 }
277 }
278 }
279
280 // Insert first English meaning, wire link.
281 if len(e.Meanings) > 0 {
282 gloss := e.Meanings[0]
283 var enRec lattice.Record
284 transdb.SetFormOnRecord(&enRec, gloss, &db.StringPool)
285 enRec.Branch = uint8(lattice.Bnoun)
286 enKey := transdb.MakeKey(transdb.LangEN, 0, gloss)
287 if db.Tree.LookupRecIdx(lattice.Bnoun, enKey) == lattice.NullRec {
288 enRI := db.Tree.InsertRec(lattice.Bnoun, enKey, enRec)
289 kaRec := db.Tree.GetRecord(kaRI)
290 enRec2 := db.Tree.GetRecord(enRI)
291 if kaRec != nil && enRI != lattice.NullRec {
292 kaRec.Link[0] = enRI
293 }
294 if enRec2 != nil {
295 enRec2.Link[0] = kaRI
296 }
297 }
298 }
299
300 return true
301 }
302
// openInput resolves path to a reader plus a cleanup function the caller must
// invoke when done. The cleanup function is never nil.
//
//   - "-" yields os.Stdin with a no-op cleanup (e.g. after:
//     gunzip -c file.gz | transdb load -jmdict -).
//   - A path longer than three bytes that ends in ".gz" yields an error:
//     Moxie cannot fork/exec a subprocess (runtime limitation), so the caller
//     has to pass stdin or a pre-decompressed path.
//   - Anything else is opened with os.Open; cleanup closes the file.
func openInput(path string) (io.Reader, func(), error) {
	nothing := func() {}

	switch {
	case path == "-":
		return os.Stdin, nothing, nil
	case len(path) > 3 && path[len(path)-3:] == ".gz":
		return nil, nothing, fmt.Errorf(
			"%q is a .gz file; Moxie cannot spawn gunzip internally.\n"+
				"Decompress first: gunzip -c %s | transdb load -jmdict -", path, path)
	}

	file, err := os.Open(path)
	if err != nil {
		return nil, nothing, err
	}
	closeFile := func() { file.Close() }
	return file, closeFile, nil
}
322