jmdict.mx raw

   1  package ingest
   2  
   3  import (
   4  	"io"
   5  
   6  	"git.smesh.lol/iskradb/lattice"
   7  	"git.smesh.lol/transdb"
   8  )
   9  
  10  // JMEntry is one parsed JMdict entry.
  11  type JMEntry struct {
  12  	Seq      uint32
  13  	Kanji    []string // <k_ele><keb> values
  14  	Readings []string // <r_ele><reb> values
  15  	Senses   []JMSense
  16  }
  17  
  18  // JMSense is one sense block within a JMdict entry.
  19  type JMSense struct {
  20  	POS     []string // bare POS entity names: "n", "v1", "adj-i", etc.
  21  	Misc    []string // register/usage markers: "col", "arch", "hon", "vulg", etc.
  22  	Field   []string // domain markers: "med", "law", "comp", etc.
  23  	Glosses []string // English gloss strings
  24  }
  25  
  26  // PrimaryForm returns the preferred surface form.
  27  func (e *JMEntry) PrimaryForm() string {
  28  	if len(e.Kanji) > 0 {
  29  		return e.Kanji[0]
  30  	}
  31  	if len(e.Readings) > 0 {
  32  		return e.Readings[0]
  33  	}
  34  	return ""
  35  }
  36  
  37  // RegisterBits returns the packed register/domain/special bits from the
  38  // first sense's Misc and Field lists, for packing into Record.Branch.
  39  func (e *JMEntry) RegisterBits() (reg, dom, spec uint8) {
  40  	for _, sense := range e.Senses {
  41  		for _, m := range sense.Misc {
  42  			r, s := transdb.MiscToRegSpec(m)
  43  			if r > reg {
  44  				reg = r
  45  			}
  46  			if s > spec {
  47  				spec = s
  48  			}
  49  		}
  50  		for _, f := range sense.Field {
  51  			if d := transdb.FieldToDomain(f); d > dom {
  52  				dom = d
  53  			}
  54  		}
  55  		break // use first sense only for entry-level annotation
  56  	}
  57  	return
  58  }
  59  
  60  // Valency returns the argument count for verb entries.
  61  // 0 = unspecified, 1 = intransitive, 2 = transitive, 3 = ditransitive.
  62  func (e *JMEntry) Valency() uint64 {
  63  	hasVT := false
  64  	hasVI := false
  65  	for _, sense := range e.Senses {
  66  		for _, pos := range sense.POS {
  67  			switch pos {
  68  			case "vt":
  69  				hasVT = true
  70  			case "vi":
  71  				hasVI = true
  72  			}
  73  		}
  74  	}
  75  	if hasVT && !hasVI {
  76  		return 2
  77  	}
  78  	if hasVI && !hasVT {
  79  		return 1
  80  	}
  81  	return 0
  82  }
  83  
  84  // EntryCoord builds the 22-bit coordinate for axes 1-4 from JMdict metadata.
  85  // Grammatical, pragmatic, valency, and register are packed. Morphological (0),
  86  // semantic (0), cooccurrence (0), and phonological (0) are left for other phases.
  87  func (e *JMEntry) EntryCoord(branch lattice.Branch) uint64 {
  88  	reg, dom, _ := e.RegisterBits()
  89  
  90  	// Grammatical axis is already encoded in the lattice branch — omit from coord
  91  	// to avoid doubling every neutral entry. Future intra-branch subdivision
  92  	// (common vs proper noun, action vs state verb) goes here when needed.
  93  	return transdb.PackCoord(
  94  		0,            // semantic: TBD
  95  		0,            // grammatical: encoded in branch, not coord key
  96  		0,            // cooccurrence: set at corpus extend time
  97  		0,            // morphological: 0 for base form
  98  		uint64(dom),  // pragmatic: domain from JMdict <field>
  99  		e.Valency(),  // valency: from vt/vi POS tags
 100  		uint64(reg),  // register: from JMdict <misc>
 101  	)
 102  }
 103  
 104  // VerbClass returns the JMdict verb class string (e.g. "v1", "v5k", "vk")
 105  // for the first verb POS found, or "" if the entry is not a verb.
 106  func (e *JMEntry) VerbClass() string {
 107  	for _, sense := range e.Senses {
 108  		for _, pos := range sense.POS {
 109  			switch pos {
 110  			case "v1", "v1-s", "v5k", "v5k-s", "v5g", "v5s", "v5m", "v5n",
 111  				"v5b", "v5r", "v5r-i", "v5t", "v5u", "v5u-s", "v5aru",
 112  				"vk", "vs", "vs-i", "vs-s", "vs-c":
 113  				return pos
 114  			}
 115  		}
 116  	}
 117  	return ""
 118  }
 119  
 120  // IsFunction returns true if every POS tag in every sense is a structural
 121  // function word (particle, copula, auxiliary). These are fork labels in the
 122  // morphological tree, not content entries — they should not be lattice records.
 123  func (e *JMEntry) IsFunction() bool {
 124  	hasPOS := false
 125  	for _, sense := range e.Senses {
 126  		for _, pos := range sense.POS {
 127  			hasPOS = true
 128  			if !isFunctionPOS(pos) {
 129  				return false
 130  			}
 131  		}
 132  	}
 133  	return hasPOS
 134  }
 135  
 136  func isFunctionPOS(pos string) bool {
 137  	switch pos {
 138  	case "prt",           // particles: は, が, を, に, で, と, も, や, か, etc.
 139  		"cop",            // copulae: だ, です
 140  		"aux", "aux-v",  // auxiliary verbs: ます, た, て forms
 141  		"aux-adj",        // auxiliary adjectives: ない as aux
 142  		"suf",            // suffixes
 143  		"pref":           // prefixes (debatable, but structural)
 144  		return true
 145  	}
 146  	return false
 147  }
 148  
 149  // Branch returns the iskradb branch for this entry's POS.
 150  func (e *JMEntry) Branch() lattice.Branch {
 151  	for _, sense := range e.Senses {
 152  		for _, pos := range sense.POS {
 153  			return posToBranch(pos)
 154  		}
 155  	}
 156  	return lattice.Bnoun
 157  }
 158  
 159  // posToBranch maps JMdict POS entity names to iskradb branches.
 160  // Entity names are the bare strings between & and ; (e.g. "n", "v1").
 161  func posToBranch(pos string) lattice.Branch {
 162  	switch pos {
 163  	case "v1", "v1-s", "v2a-s", "v4h", "v4r", "v5aru", "v5b", "v5g", "v5k",
 164  		"v5k-s", "v5m", "v5n", "v5r", "v5r-i", "v5s", "v5t", "v5u", "v5u-s",
 165  		"v5uru", "vi", "vk", "vn", "vr", "vs", "vs-c", "vs-i", "vs-s", "vt", "vz":
 166  		return lattice.Bverb
 167  	case "adj-f", "adj-i", "adj-ix", "adj-kari", "adj-ku", "adj-na", "adj-nari",
 168  		"adj-no", "adj-pn", "adj-shiku", "adj-t", "adv", "adv-to", "conj",
 169  		"int", "prt":
 170  		return lattice.Bmodifier
 171  	default:
 172  		return lattice.Bnoun
 173  	}
 174  }
 175  
 176  // ParseJMdict reads JMdict XML from r and calls emit for each complete entry.
 177  func ParseJMdict(r io.Reader, emit func(JMEntry)) {
 178  	sc := NewXMLScanner(r)
 179  	var ev XMLEvent
 180  
 181  	var (
 182  		entry                   JMEntry
 183  		curSense                JMSense
 184  		inEntry                 bool
 185  		inKEle, inREle, inSense bool
 186  		inKeb, inReb            bool
 187  		inEntSeq, inPos         bool
 188  		inMisc, inField         bool
 189  		inGloss                 bool
 190  		glossIsEng              bool
 191  	)
 192  
 193  	finishSense := func() {
 194  		if len(curSense.Glosses) > 0 || len(curSense.POS) > 0 {
 195  			entry.Senses = append(entry.Senses, curSense)
 196  		}
 197  		curSense = JMSense{}
 198  	}
 199  
 200  	attrLang := func(attrs string) string {
 201  		const needle = "xml:lang="
 202  		for i := 0; i+len(needle) < len(attrs); i++ {
 203  			if attrs[i:i+len(needle)] == needle {
 204  				i += len(needle)
 205  				if i < len(attrs) {
 206  					q := attrs[i]
 207  					if q == '"' || q == '\'' {
 208  						i++
 209  						start := i
 210  						for i < len(attrs) && byte(attrs[i]) != q {
 211  							i++
 212  						}
 213  						return attrs[start:i]
 214  					}
 215  				}
 216  			}
 217  		}
 218  		return "eng" // default to English when no lang attr
 219  	}
 220  
 221  	for sc.Next(&ev) {
 222  		switch ev.Kind {
 223  		case XMLStart:
 224  			switch ev.Name {
 225  			case "entry":
 226  				entry = JMEntry{}
 227  				curSense = JMSense{}
 228  				inEntry = true
 229  			case "ent_seq":
 230  				inEntSeq = true
 231  			case "k_ele":
 232  				inKEle = true
 233  			case "keb":
 234  				inKeb = true
 235  			case "r_ele":
 236  				inREle = true
 237  			case "reb":
 238  				inReb = true
 239  			case "sense":
 240  				if inEntry {
 241  					inSense = true
 242  				}
 243  			case "pos":
 244  				if inSense {
 245  					inPos = true
 246  				}
 247  			case "misc":
 248  				if inSense {
 249  					inMisc = true
 250  				}
 251  			case "field":
 252  				if inSense {
 253  					inField = true
 254  				}
 255  			case "gloss":
 256  				if inSense {
 257  					lang := attrLang(ev.Attrs)
 258  					glossIsEng = lang == "eng"
 259  					inGloss = true
 260  				}
 261  			}
 262  
 263  		case XMLEnd:
 264  			switch ev.Name {
 265  			case "entry":
 266  				if inSense {
 267  					finishSense()
 268  					inSense = false
 269  				}
 270  				if inEntry && entry.PrimaryForm() != "" {
 271  					emit(entry)
 272  				}
 273  				entry = JMEntry{}
 274  				inEntry = false
 275  				inKEle = false
 276  				inREle = false
 277  			case "ent_seq":
 278  				inEntSeq = false
 279  			case "k_ele":
 280  				inKEle = false
 281  			case "keb":
 282  				inKeb = false
 283  			case "r_ele":
 284  				inREle = false
 285  			case "reb":
 286  				inReb = false
 287  			case "sense":
 288  				if inSense {
 289  					finishSense()
 290  					inSense = false
 291  				}
 292  			case "pos":
 293  				inPos = false
 294  			case "misc":
 295  				inMisc = false
 296  			case "field":
 297  				inField = false
 298  			case "gloss":
 299  				inGloss = false
 300  				glossIsEng = false
 301  			}
 302  
 303  		case XMLText:
 304  			switch {
 305  			case inEntSeq:
 306  				entry.Seq = parseUint32(ev.Text)
 307  			case inKeb && inKEle:
 308  				entry.Kanji = append(entry.Kanji, ev.Text)
 309  			case inReb && inREle:
 310  				entry.Readings = append(entry.Readings, ev.Text)
 311  			case inPos && inSense:
 312  				curSense.POS = append(curSense.POS, ev.Text)
 313  			case inMisc && inSense:
 314  				curSense.Misc = append(curSense.Misc, ev.Text)
 315  			case inField && inSense:
 316  				curSense.Field = append(curSense.Field, ev.Text)
 317  			case inGloss && inSense && glossIsEng:
 318  				curSense.Glosses = append(curSense.Glosses, ev.Text)
 319  			}
 320  		}
 321  	}
 322  }
 323  
 324  func parseUint32(s string) uint32 {
 325  	var n uint32
 326  	for i := 0; i < len(s); i++ {
 327  		c := s[i]
 328  		if c >= '0' && c <= '9' {
 329  			n = n*10 + uint32(c-'0')
 330  		}
 331  	}
 332  	return n
 333  }
 334