load.mx raw

   1  package ingest
   2  
   3  import (
   4  	"fmt"
   5  	"io"
   6  	"os"
   7  
   8  	"git.smesh.lol/iskradb/lattice"
   9  	"git.smesh.lol/transdb"
  10  )
  11  
// DB is the in-memory translation lattice with its string pool.
type DB struct {
	// Tree holds the lattice records. The loaders in this file insert records
	// into it and look them up by branch and key.
	Tree       *lattice.Tree
	// StringPool is the byte pool passed to transdb.SetFormOnRecord when a
	// record's form is stored and to transdb.FormFromInline when it is read
	// back.
	StringPool []byte
}
  17  
// NewDB creates an empty translation database.
//
// cap is forwarded unchanged to lattice.NewTree; its exact meaning is defined
// by that constructor.
func NewDB(cap int) *DB {
	return &DB{
		Tree:       lattice.NewTree(cap),
		// NOTE(review): `[]byte{:0:65536}` is not standard Go syntax;
		// presumably a dialect slice literal for length 0, capacity 65536 —
		// confirm against the toolchain documentation.
		StringPool: []byte{:0:65536},
	}
}
  25  
  26  // LoadJMdict parses a JMdict XML file (or "-" for stdin) and inserts all
  27  // entries into the database. If path ends in ".gz", gunzip is used to decompress.
  28  func LoadJMdict(db *DB, path string) (int, int, error) {
  29  	r, cleanup, err := openInput(path)
  30  	if err != nil {
  31  		return 0, 0, err
  32  	}
  33  	defer cleanup()
  34  
  35  	inserted := 0
  36  	skipped := 0
  37  
  38  	ParseJMdict(r, func(e JMEntry) {
  39  		if insertJMEntry(db, e) {
  40  			inserted++
  41  		} else {
  42  			skipped++
  43  		}
  44  	})
  45  
  46  	return inserted, skipped, nil
  47  }
  48  
  49  // LoadKanjidic parses a KANJIDIC2 XML file and inserts all kanji.
  50  func LoadKanjidic(db *DB, path string) (int, error) {
  51  	r, cleanup, err := openInput(path)
  52  	if err != nil {
  53  		return 0, err
  54  	}
  55  	defer cleanup()
  56  
  57  	inserted := 0
  58  	ParseKanjidic(r, func(e KanjiEntry) {
  59  		if insertKanjiEntry(db, e) {
  60  			inserted++
  61  		}
  62  	})
  63  	return inserted, nil
  64  }
  65  
  66  func insertJMEntry(db *DB, e JMEntry) bool {
  67  	// Particles, copulae, and auxiliaries are morphological fork labels —
  68  	// they belong in the tree structure, not as content records.
  69  	if e.IsFunction() {
  70  		return false
  71  	}
  72  	form := e.PrimaryForm()
  73  	if form == "" {
  74  		return false
  75  	}
  76  	branch := e.Branch()
  77  
  78  	// Build JA record.
  79  	var jaRec lattice.Record
  80  	transdb.SetFormOnRecord(&jaRec, form, &db.StringPool)
  81  	reg, dom, spec := e.RegisterBits()
  82  	jaRec.Branch = transdb.PackBranch(uint8(branch), reg, dom, spec)
  83  	// DataOff/DataLen set by SetFormOnRecord; do not overwrite.
  84  
  85  	// Compute entry coord from axes 1-4 (grammatical, pragmatic, valency, register).
  86  	// Base forms are inserted at BOTH coord=0 (universal fallback) and the specific
  87  	// coord when it differs. This enables precise lookup for context-aware translation.
  88  	entryCoord := e.EntryCoord(branch)
  89  
  90  	jaKey := transdb.MakeKey(transdb.LangJA, 0, form)
  91  
  92  	// Collision check at coord=0.
  93  	if ri := db.Tree.LookupRecIdx(branch, jaKey); ri != lattice.NullRec {
  94  		existing := transdb.FormFromInline(db.Tree.GetRecord(ri), db.StringPool)
  95  		if existing != form {
  96  			fmt.Fprintf(os.Stderr, "collision: key=%016x form=%q existing=%q\n", jaKey, form, existing)
  97  		}
  98  		// Either duplicate or collision - skip insert to avoid overwriting.
  99  	}
 100  
 101  	jaRI := db.Tree.InsertRec(branch, jaKey, jaRec)
 102  
 103  	// Insert alias records for all readings that differ from the primary form.
 104  	// Entries with a kanji primary form (e.g. 珈琲) have katakana readings
 105  	// (e.g. コーヒー) that must be directly lookupable for tokenization and
 106  	// JA→EN translation. Each alias points to the same EN translations.
 107  	var readingRIs []uint32
 108  	for _, reading := range e.Readings {
 109  		if reading == "" || reading == form {
 110  			continue
 111  		}
 112  		rKey := transdb.MakeKey(transdb.LangJA, 0, reading)
 113  		if db.Tree.LookupRecIdx(branch, rKey) != lattice.NullRec {
 114  			if ri := db.Tree.LookupRecIdx(branch, rKey); ri != lattice.NullRec {
 115  				readingRIs = append(readingRIs, ri)
 116  			}
 117  			continue
 118  		}
 119  		var rRec lattice.Record
 120  		transdb.SetFormOnRecord(&rRec, reading, &db.StringPool)
 121  		rRec.Branch = uint8(branch)
 122  		ri := db.Tree.InsertRec(branch, rKey, rRec)
 123  		readingRIs = append(readingRIs, ri)
 124  	}
 125  
 126  	// Insert English glosses. Collect all EN record indices.
 127  	// Do NOT cache record pointers across InsertRec calls - the Records slice
 128  	// may be reallocated, invalidating pointers from earlier calls.
 129  	firstEN := lattice.NullRec
 130  	var enRIs []uint32
 131  	for _, sense := range e.Senses {
 132  		for _, gloss := range sense.Glosses {
 133  			if gloss == "" {
 134  				continue
 135  			}
 136  			var enRec lattice.Record
 137  			transdb.SetFormOnRecord(&enRec, gloss, &db.StringPool)
 138  			enRec.Branch = transdb.PackBranch(uint8(branch), reg, dom, spec)
 139  			// DataOff/DataLen are used by SetFormOnRecord for overflow pool
 140  			// references; do not overwrite for provenance. Use RecMeta instead.
 141  
 142  			enKey := transdb.MakeKey(transdb.LangEN, 0, gloss)
 143  
 144  			if ri := db.Tree.LookupRecIdx(branch, enKey); ri != lattice.NullRec {
 145  				if firstEN == lattice.NullRec {
 146  					firstEN = ri
 147  				}
 148  				enRIs = append(enRIs, ri)
 149  				continue
 150  			}
 151  
 152  			enRI := db.Tree.InsertRec(branch, enKey, enRec)
 153  			if firstEN == lattice.NullRec {
 154  				firstEN = enRI
 155  			}
 156  			enRIs = append(enRIs, enRI)
 157  		}
 158  	}
 159  
 160  	// Wire all links AFTER all inserts to use stable indices (not stale pointers).
 161  	// Re-fetch record pointers fresh here; no more appends after this point.
 162  	for _, enRI := range enRIs {
 163  		if enR := db.Tree.GetRecord(enRI); enR != nil && enR.Link[0] == lattice.NullRec {
 164  			enR.Link[0] = jaRI // EN → JA
 165  		}
 166  	}
 167  
 168  	// Wire reading aliases to the primary EN translation.
 169  	for _, rRI := range readingRIs {
 170  		if rRec := db.Tree.GetRecord(rRI); rRec != nil && rRec.Link[0] == lattice.NullRec {
 171  			rRec.Link[0] = firstEN
 172  		}
 173  	}
 174  
 175  	if jRec := db.Tree.GetRecord(jaRI); jRec != nil {
 176  		if firstEN != lattice.NullRec {
 177  			jRec.Link[0] = firstEN // JA → EN (primary)
 178  		}
 179  		// Link[1] points to the first reading alias for JA→alt traversal.
 180  		if len(readingRIs) > 0 && jRec.Link[1] == lattice.NullRec {
 181  			jRec.Link[1] = readingRIs[0]
 182  		}
 183  	}
 184  
 185  	// Insert specific-coord alias when entryCoord carries non-default axis values.
 186  	// This enables context-aware lookup (vt/vi disambiguation, register selection).
 187  	// The coord=0 record above is the universal fallback; this record is the precise hit.
 188  	if entryCoord != 0 && firstEN != lattice.NullRec {
 189  		jaKeyC := transdb.MakeKey(transdb.LangJA, entryCoord, form)
 190  		if db.Tree.LookupRecIdx(branch, jaKeyC) == lattice.NullRec {
 191  			var cRec lattice.Record
 192  			transdb.SetFormOnRecord(&cRec, form, &db.StringPool)
 193  			cRec.Branch = jaRec.Branch
 194  			newRI := db.Tree.InsertRec(branch, jaKeyC, cRec)
 195  			if r := db.Tree.GetRecord(newRI); r != nil {
 196  				r.Link[0] = firstEN
 197  			}
 198  		}
 199  		// Specific-coord EN alias → links to JA specific-coord record.
 200  		for _, gloss := range e.Senses[0].Glosses {
 201  			if gloss == "" {
 202  				continue
 203  			}
 204  			enKeyC := transdb.MakeKey(transdb.LangEN, entryCoord, gloss)
 205  			if db.Tree.LookupRecIdx(branch, enKeyC) == lattice.NullRec {
 206  				var eRec lattice.Record
 207  				transdb.SetFormOnRecord(&eRec, gloss, &db.StringPool)
 208  				eRec.Branch = jaRec.Branch
 209  				newRI := db.Tree.InsertRec(branch, enKeyC, eRec)
 210  				// Wire: EN coord-specific → JA coord-specific
 211  				jaCoordRI := db.Tree.LookupRecIdx(branch, jaKeyC)
 212  				if r := db.Tree.GetRecord(newRI); r != nil && jaCoordRI != lattice.NullRec {
 213  					r.Link[0] = jaCoordRI
 214  				}
 215  			}
 216  		}
 217  	}
 218  
 219  	// Generate JA conjugated forms (食べた, 食べている, etc.) for verb entries.
 220  	// Also register verb class in Bcooccur so InflectJAFromTree can compute
 221  	// any form at runtime without requiring all forms to be pre-stored.
 222  	if firstEN != lattice.NullRec {
 223  		verbClass := e.VerbClass()
 224  		classCode := transdb.VerbClassCode(verbClass)
 225  		GenerateConjugations(db, form, verbClass, branch, jaRec.Branch, firstEN)
 226  		transdb.RegisterVerbClass(db.Tree, &db.StringPool, transdb.LangJA, form, classCode)
 227  		for _, reading := range e.Readings {
 228  			if reading != form {
 229  				GenerateConjugations(db, reading, verbClass, branch, jaRec.Branch, firstEN)
 230  				transdb.RegisterVerbClass(db.Tree, &db.StringPool, transdb.LangJA, reading, classCode)
 231  			}
 232  		}
 233  	}
 234  
 235  	// Generate EN surface forms (sang, singing, sings) for verb entries.
 236  	// Each EN form links to the JA record so EN→JA can navigate the cluster.
 237  	if jaRI != lattice.NullRec && branch == lattice.Bverb {
 238  		enBranchByte := transdb.PackBranch(uint8(lattice.Bverb), reg, dom, spec)
 239  		for _, sense := range e.Senses {
 240  			for _, gloss := range sense.Glosses {
 241  				GenerateENForms(db, gloss, enBranchByte, jaRI)
 242  			}
 243  		}
 244  	}
 245  
 246  	return true
 247  }
 248  
 249  func insertKanjiEntry(db *DB, e KanjiEntry) bool {
 250  	if e.Literal == "" {
 251  		return false
 252  	}
 253  
 254  	var rec lattice.Record
 255  	transdb.SetFormOnRecord(&rec, e.Literal, &db.StringPool)
 256  	rec.Branch = uint8(lattice.Bnoun)
 257  
 258  	key := transdb.MakeKey(transdb.LangJA, 0, e.Literal)
 259  	if db.Tree.LookupRecIdx(lattice.Bnoun, key) != lattice.NullRec {
 260  		return false // already in tree from JMdict
 261  	}
 262  
 263  	kaRI := db.Tree.InsertRec(lattice.Bnoun, key, rec)
 264  
 265  	// Link ON reading as Link[1].
 266  	if len(e.OnYomi) > 0 {
 267  		var onRec lattice.Record
 268  		transdb.SetFormOnRecord(&onRec, e.OnYomi[0], &db.StringPool)
 269  		onRec.Branch = uint8(lattice.Bnoun)
 270  		onKey := transdb.MakeKey(transdb.LangJA, 0, e.OnYomi[0])
 271  		if db.Tree.LookupRecIdx(lattice.Bnoun, onKey) == lattice.NullRec {
 272  			onRI := db.Tree.InsertRec(lattice.Bnoun, onKey, onRec)
 273  			kaRec := db.Tree.GetRecord(kaRI)
 274  			if kaRec != nil {
 275  				kaRec.Link[1] = onRI
 276  			}
 277  		}
 278  	}
 279  
 280  	// Insert first English meaning, wire link.
 281  	if len(e.Meanings) > 0 {
 282  		gloss := e.Meanings[0]
 283  		var enRec lattice.Record
 284  		transdb.SetFormOnRecord(&enRec, gloss, &db.StringPool)
 285  		enRec.Branch = uint8(lattice.Bnoun)
 286  		enKey := transdb.MakeKey(transdb.LangEN, 0, gloss)
 287  		if db.Tree.LookupRecIdx(lattice.Bnoun, enKey) == lattice.NullRec {
 288  			enRI := db.Tree.InsertRec(lattice.Bnoun, enKey, enRec)
 289  			kaRec := db.Tree.GetRecord(kaRI)
 290  			enRec2 := db.Tree.GetRecord(enRI)
 291  			if kaRec != nil && enRI != lattice.NullRec {
 292  				kaRec.Link[0] = enRI
 293  			}
 294  			if enRec2 != nil {
 295  				enRec2.Link[0] = kaRI
 296  			}
 297  		}
 298  	}
 299  
 300  	return true
 301  }
 302  
 303  // openInput returns a reader for path.
 304  // Use "-" for stdin (e.g. after: gunzip -c file.gz | transdb load -jmdict -).
 305  // .gz files: Moxie cannot fork/exec a subprocess (runtime limitation), so
 306  // pass stdin or a pre-decompressed path.
 307  func openInput(path string) (io.Reader, func(), error) {
 308  	if path == "-" {
 309  		return os.Stdin, func() {}, nil
 310  	}
 311  	if len(path) > 3 && path[len(path)-3:] == ".gz" {
 312  		return nil, func() {}, fmt.Errorf(
 313  			"%q is a .gz file; Moxie cannot spawn gunzip internally.\n"+
 314  				"Decompress first: gunzip -c %s | transdb load -jmdict -", path, path)
 315  	}
 316  	f, err := os.Open(path)
 317  	if err != nil {
 318  		return nil, func() {}, err
 319  	}
 320  	return f, func() { f.Close() }, nil
 321  }
 322