package ingest import ( "io" "git.smesh.lol/iskradb/lattice" "git.smesh.lol/transdb" ) // JMEntry is one parsed JMdict entry. type JMEntry struct { Seq uint32 Kanji []string // values Readings []string // values Senses []JMSense } // JMSense is one sense block within a JMdict entry. type JMSense struct { POS []string // bare POS entity names: "n", "v1", "adj-i", etc. Misc []string // register/usage markers: "col", "arch", "hon", "vulg", etc. Field []string // domain markers: "med", "law", "comp", etc. Glosses []string // English gloss strings } // PrimaryForm returns the preferred surface form. func (e *JMEntry) PrimaryForm() string { if len(e.Kanji) > 0 { return e.Kanji[0] } if len(e.Readings) > 0 { return e.Readings[0] } return "" } // RegisterBits returns the packed register/domain/special bits from the // first sense's Misc and Field lists, for packing into Record.Branch. func (e *JMEntry) RegisterBits() (reg, dom, spec uint8) { for _, sense := range e.Senses { for _, m := range sense.Misc { r, s := transdb.MiscToRegSpec(m) if r > reg { reg = r } if s > spec { spec = s } } for _, f := range sense.Field { if d := transdb.FieldToDomain(f); d > dom { dom = d } } break // use first sense only for entry-level annotation } return } // Valency returns the argument count for verb entries. // 0 = unspecified, 1 = intransitive, 2 = transitive, 3 = ditransitive. func (e *JMEntry) Valency() uint64 { hasVT := false hasVI := false for _, sense := range e.Senses { for _, pos := range sense.POS { switch pos { case "vt": hasVT = true case "vi": hasVI = true } } } if hasVT && !hasVI { return 2 } if hasVI && !hasVT { return 1 } return 0 } // EntryCoord builds the 22-bit coordinate for axes 1-4 from JMdict metadata. // Grammatical, pragmatic, valency, and register are packed. Morphological (0), // semantic (0), cooccurrence (0), and phonological (0) are left for other phases. func (e *JMEntry) EntryCoord(branch lattice.Branch) uint64 { reg, dom, _ := e.RegisterBits() // Grammatical axis is already encoded in the lattice branch — omit from coord // to avoid doubling every neutral entry. Future intra-branch subdivision // (common vs proper noun, action vs state verb) goes here when needed. return transdb.PackCoord( 0, // semantic: TBD 0, // grammatical: encoded in branch, not coord key 0, // cooccurrence: set at corpus extend time 0, // morphological: 0 for base form uint64(dom), // pragmatic: domain from JMdict e.Valency(), // valency: from vt/vi POS tags uint64(reg), // register: from JMdict ) } // VerbClass returns the JMdict verb class string (e.g. "v1", "v5k", "vk") // for the first verb POS found, or "" if the entry is not a verb. func (e *JMEntry) VerbClass() string { for _, sense := range e.Senses { for _, pos := range sense.POS { switch pos { case "v1", "v1-s", "v5k", "v5k-s", "v5g", "v5s", "v5m", "v5n", "v5b", "v5r", "v5r-i", "v5t", "v5u", "v5u-s", "v5aru", "vk", "vs", "vs-i", "vs-s", "vs-c": return pos } } } return "" } // IsFunction returns true if every POS tag in every sense is a structural // function word (particle, copula, auxiliary). These are fork labels in the // morphological tree, not content entries — they should not be lattice records. func (e *JMEntry) IsFunction() bool { hasPOS := false for _, sense := range e.Senses { for _, pos := range sense.POS { hasPOS = true if !isFunctionPOS(pos) { return false } } } return hasPOS } func isFunctionPOS(pos string) bool { switch pos { case "prt", // particles: は, が, を, に, で, と, も, や, か, etc. "cop", // copulae: だ, です "aux", "aux-v", // auxiliary verbs: ます, た, て forms "aux-adj", // auxiliary adjectives: ない as aux "suf", // suffixes "pref": // prefixes (debatable, but structural) return true } return false } // Branch returns the iskradb branch for this entry's POS. func (e *JMEntry) Branch() lattice.Branch { for _, sense := range e.Senses { for _, pos := range sense.POS { return posToBranch(pos) } } return lattice.Bnoun } // posToBranch maps JMdict POS entity names to iskradb branches. // Entity names are the bare strings between & and ; (e.g. "n", "v1"). func posToBranch(pos string) lattice.Branch { switch pos { case "v1", "v1-s", "v2a-s", "v4h", "v4r", "v5aru", "v5b", "v5g", "v5k", "v5k-s", "v5m", "v5n", "v5r", "v5r-i", "v5s", "v5t", "v5u", "v5u-s", "v5uru", "vi", "vk", "vn", "vr", "vs", "vs-c", "vs-i", "vs-s", "vt", "vz": return lattice.Bverb case "adj-f", "adj-i", "adj-ix", "adj-kari", "adj-ku", "adj-na", "adj-nari", "adj-no", "adj-pn", "adj-shiku", "adj-t", "adv", "adv-to", "conj", "int", "prt": return lattice.Bmodifier default: return lattice.Bnoun } } // ParseJMdict reads JMdict XML from r and calls emit for each complete entry. func ParseJMdict(r io.Reader, emit func(JMEntry)) { sc := NewXMLScanner(r) var ev XMLEvent var ( entry JMEntry curSense JMSense inEntry bool inKEle, inREle, inSense bool inKeb, inReb bool inEntSeq, inPos bool inMisc, inField bool inGloss bool glossIsEng bool ) finishSense := func() { if len(curSense.Glosses) > 0 || len(curSense.POS) > 0 { entry.Senses = append(entry.Senses, curSense) } curSense = JMSense{} } attrLang := func(attrs string) string { const needle = "xml:lang=" for i := 0; i+len(needle) < len(attrs); i++ { if attrs[i:i+len(needle)] == needle { i += len(needle) if i < len(attrs) { q := attrs[i] if q == '"' || q == '\'' { i++ start := i for i < len(attrs) && byte(attrs[i]) != q { i++ } return attrs[start:i] } } } } return "eng" // default to English when no lang attr } for sc.Next(&ev) { switch ev.Kind { case XMLStart: switch ev.Name { case "entry": entry = JMEntry{} curSense = JMSense{} inEntry = true case "ent_seq": inEntSeq = true case "k_ele": inKEle = true case "keb": inKeb = true case "r_ele": inREle = true case "reb": inReb = true case "sense": if inEntry { inSense = true } case "pos": if inSense { inPos = true } case "misc": if inSense { inMisc = true } case "field": if inSense { inField = true } case "gloss": if inSense { lang := attrLang(ev.Attrs) glossIsEng = lang == "eng" inGloss = true } } case XMLEnd: switch ev.Name { case "entry": if inSense { finishSense() inSense = false } if inEntry && entry.PrimaryForm() != "" { emit(entry) } entry = JMEntry{} inEntry = false inKEle = false inREle = false case "ent_seq": inEntSeq = false case "k_ele": inKEle = false case "keb": inKeb = false case "r_ele": inREle = false case "reb": inReb = false case "sense": if inSense { finishSense() inSense = false } case "pos": inPos = false case "misc": inMisc = false case "field": inField = false case "gloss": inGloss = false glossIsEng = false } case XMLText: switch { case inEntSeq: entry.Seq = parseUint32(ev.Text) case inKeb && inKEle: entry.Kanji = append(entry.Kanji, ev.Text) case inReb && inREle: entry.Readings = append(entry.Readings, ev.Text) case inPos && inSense: curSense.POS = append(curSense.POS, ev.Text) case inMisc && inSense: curSense.Misc = append(curSense.Misc, ev.Text) case inField && inSense: curSense.Field = append(curSense.Field, ev.Text) case inGloss && inSense && glossIsEng: curSense.Glosses = append(curSense.Glosses, ev.Text) } } } } func parseUint32(s string) uint32 { var n uint32 for i := 0; i < len(s); i++ { c := s[i] if c >= '0' && c <= '9' { n = n*10 + uint32(c-'0') } } return n }