jmdict.mx raw
1 package ingest
2
3 import (
4 "io"
5
6 "git.smesh.lol/iskradb/lattice"
7 "git.smesh.lol/transdb"
8 )
9
10 // JMEntry is one parsed JMdict entry.
11 type JMEntry struct {
12 Seq uint32
13 Kanji []string // <k_ele><keb> values
14 Readings []string // <r_ele><reb> values
15 Senses []JMSense
16 }
17
// JMSense holds the tags and glosses of one <sense> block of a JMdict entry.
type JMSense struct {
	// POS holds bare part-of-speech entity names: "n", "v1", "adj-i", etc.
	POS []string
	// Misc holds register/usage markers: "col", "arch", "hon", "vulg", etc.
	Misc []string
	// Field holds domain markers: "med", "law", "comp", etc.
	Field []string
	// Glosses holds the English gloss strings.
	Glosses []string
}
25
26 // PrimaryForm returns the preferred surface form.
27 func (e *JMEntry) PrimaryForm() string {
28 if len(e.Kanji) > 0 {
29 return e.Kanji[0]
30 }
31 if len(e.Readings) > 0 {
32 return e.Readings[0]
33 }
34 return ""
35 }
36
37 // RegisterBits returns the packed register/domain/special bits from the
38 // first sense's Misc and Field lists, for packing into Record.Branch.
39 func (e *JMEntry) RegisterBits() (reg, dom, spec uint8) {
40 for _, sense := range e.Senses {
41 for _, m := range sense.Misc {
42 r, s := transdb.MiscToRegSpec(m)
43 if r > reg {
44 reg = r
45 }
46 if s > spec {
47 spec = s
48 }
49 }
50 for _, f := range sense.Field {
51 if d := transdb.FieldToDomain(f); d > dom {
52 dom = d
53 }
54 }
55 break // use first sense only for entry-level annotation
56 }
57 return
58 }
59
60 // Valency returns the argument count for verb entries.
61 // 0 = unspecified, 1 = intransitive, 2 = transitive, 3 = ditransitive.
62 func (e *JMEntry) Valency() uint64 {
63 hasVT := false
64 hasVI := false
65 for _, sense := range e.Senses {
66 for _, pos := range sense.POS {
67 switch pos {
68 case "vt":
69 hasVT = true
70 case "vi":
71 hasVI = true
72 }
73 }
74 }
75 if hasVT && !hasVI {
76 return 2
77 }
78 if hasVI && !hasVT {
79 return 1
80 }
81 return 0
82 }
83
84 // EntryCoord builds the 22-bit coordinate for axes 1-4 from JMdict metadata.
85 // Grammatical, pragmatic, valency, and register are packed. Morphological (0),
86 // semantic (0), cooccurrence (0), and phonological (0) are left for other phases.
87 func (e *JMEntry) EntryCoord(branch lattice.Branch) uint64 {
88 reg, dom, _ := e.RegisterBits()
89
90 // Grammatical axis is already encoded in the lattice branch — omit from coord
91 // to avoid doubling every neutral entry. Future intra-branch subdivision
92 // (common vs proper noun, action vs state verb) goes here when needed.
93 return transdb.PackCoord(
94 0, // semantic: TBD
95 0, // grammatical: encoded in branch, not coord key
96 0, // cooccurrence: set at corpus extend time
97 0, // morphological: 0 for base form
98 uint64(dom), // pragmatic: domain from JMdict <field>
99 e.Valency(), // valency: from vt/vi POS tags
100 uint64(reg), // register: from JMdict <misc>
101 )
102 }
103
104 // VerbClass returns the JMdict verb class string (e.g. "v1", "v5k", "vk")
105 // for the first verb POS found, or "" if the entry is not a verb.
106 func (e *JMEntry) VerbClass() string {
107 for _, sense := range e.Senses {
108 for _, pos := range sense.POS {
109 switch pos {
110 case "v1", "v1-s", "v5k", "v5k-s", "v5g", "v5s", "v5m", "v5n",
111 "v5b", "v5r", "v5r-i", "v5t", "v5u", "v5u-s", "v5aru",
112 "vk", "vs", "vs-i", "vs-s", "vs-c":
113 return pos
114 }
115 }
116 }
117 return ""
118 }
119
120 // IsFunction returns true if every POS tag in every sense is a structural
121 // function word (particle, copula, auxiliary). These are fork labels in the
122 // morphological tree, not content entries — they should not be lattice records.
123 func (e *JMEntry) IsFunction() bool {
124 hasPOS := false
125 for _, sense := range e.Senses {
126 for _, pos := range sense.POS {
127 hasPOS = true
128 if !isFunctionPOS(pos) {
129 return false
130 }
131 }
132 }
133 return hasPOS
134 }
135
// isFunctionPOS reports whether pos is one of the POS tags treated as a
// structural function word. The match is exact and case-sensitive.
func isFunctionPOS(pos string) bool {
	structural := [...]string{
		"prt",     // particles: は, が, を, に, で, と, も, や, か, etc.
		"cop",     // copulae: だ, です
		"aux",     // auxiliary verbs: ます, た, て forms
		"aux-v",   // auxiliary verbs (same group as "aux")
		"aux-adj", // auxiliary adjectives: ない as aux
		"suf",     // suffixes
		"pref",    // prefixes (debatable, but structural)
	}
	for _, tag := range structural {
		if pos == tag {
			return true
		}
	}
	return false
}
148
149 // Branch returns the iskradb branch for this entry's POS.
150 func (e *JMEntry) Branch() lattice.Branch {
151 for _, sense := range e.Senses {
152 for _, pos := range sense.POS {
153 return posToBranch(pos)
154 }
155 }
156 return lattice.Bnoun
157 }
158
159 // posToBranch maps JMdict POS entity names to iskradb branches.
160 // Entity names are the bare strings between & and ; (e.g. "n", "v1").
161 func posToBranch(pos string) lattice.Branch {
162 switch pos {
163 case "v1", "v1-s", "v2a-s", "v4h", "v4r", "v5aru", "v5b", "v5g", "v5k",
164 "v5k-s", "v5m", "v5n", "v5r", "v5r-i", "v5s", "v5t", "v5u", "v5u-s",
165 "v5uru", "vi", "vk", "vn", "vr", "vs", "vs-c", "vs-i", "vs-s", "vt", "vz":
166 return lattice.Bverb
167 case "adj-f", "adj-i", "adj-ix", "adj-kari", "adj-ku", "adj-na", "adj-nari",
168 "adj-no", "adj-pn", "adj-shiku", "adj-t", "adv", "adv-to", "conj",
169 "int", "prt":
170 return lattice.Bmodifier
171 default:
172 return lattice.Bnoun
173 }
174 }
175
176 // ParseJMdict reads JMdict XML from r and calls emit for each complete entry.
177 func ParseJMdict(r io.Reader, emit func(JMEntry)) {
178 sc := NewXMLScanner(r)
179 var ev XMLEvent
180
181 var (
182 entry JMEntry
183 curSense JMSense
184 inEntry bool
185 inKEle, inREle, inSense bool
186 inKeb, inReb bool
187 inEntSeq, inPos bool
188 inMisc, inField bool
189 inGloss bool
190 glossIsEng bool
191 )
192
193 finishSense := func() {
194 if len(curSense.Glosses) > 0 || len(curSense.POS) > 0 {
195 entry.Senses = append(entry.Senses, curSense)
196 }
197 curSense = JMSense{}
198 }
199
200 attrLang := func(attrs string) string {
201 const needle = "xml:lang="
202 for i := 0; i+len(needle) < len(attrs); i++ {
203 if attrs[i:i+len(needle)] == needle {
204 i += len(needle)
205 if i < len(attrs) {
206 q := attrs[i]
207 if q == '"' || q == '\'' {
208 i++
209 start := i
210 for i < len(attrs) && byte(attrs[i]) != q {
211 i++
212 }
213 return attrs[start:i]
214 }
215 }
216 }
217 }
218 return "eng" // default to English when no lang attr
219 }
220
221 for sc.Next(&ev) {
222 switch ev.Kind {
223 case XMLStart:
224 switch ev.Name {
225 case "entry":
226 entry = JMEntry{}
227 curSense = JMSense{}
228 inEntry = true
229 case "ent_seq":
230 inEntSeq = true
231 case "k_ele":
232 inKEle = true
233 case "keb":
234 inKeb = true
235 case "r_ele":
236 inREle = true
237 case "reb":
238 inReb = true
239 case "sense":
240 if inEntry {
241 inSense = true
242 }
243 case "pos":
244 if inSense {
245 inPos = true
246 }
247 case "misc":
248 if inSense {
249 inMisc = true
250 }
251 case "field":
252 if inSense {
253 inField = true
254 }
255 case "gloss":
256 if inSense {
257 lang := attrLang(ev.Attrs)
258 glossIsEng = lang == "eng"
259 inGloss = true
260 }
261 }
262
263 case XMLEnd:
264 switch ev.Name {
265 case "entry":
266 if inSense {
267 finishSense()
268 inSense = false
269 }
270 if inEntry && entry.PrimaryForm() != "" {
271 emit(entry)
272 }
273 entry = JMEntry{}
274 inEntry = false
275 inKEle = false
276 inREle = false
277 case "ent_seq":
278 inEntSeq = false
279 case "k_ele":
280 inKEle = false
281 case "keb":
282 inKeb = false
283 case "r_ele":
284 inREle = false
285 case "reb":
286 inReb = false
287 case "sense":
288 if inSense {
289 finishSense()
290 inSense = false
291 }
292 case "pos":
293 inPos = false
294 case "misc":
295 inMisc = false
296 case "field":
297 inField = false
298 case "gloss":
299 inGloss = false
300 glossIsEng = false
301 }
302
303 case XMLText:
304 switch {
305 case inEntSeq:
306 entry.Seq = parseUint32(ev.Text)
307 case inKeb && inKEle:
308 entry.Kanji = append(entry.Kanji, ev.Text)
309 case inReb && inREle:
310 entry.Readings = append(entry.Readings, ev.Text)
311 case inPos && inSense:
312 curSense.POS = append(curSense.POS, ev.Text)
313 case inMisc && inSense:
314 curSense.Misc = append(curSense.Misc, ev.Text)
315 case inField && inSense:
316 curSense.Field = append(curSense.Field, ev.Text)
317 case inGloss && inSense && glossIsEng:
318 curSense.Glosses = append(curSense.Glosses, ev.Text)
319 }
320 }
321 }
322 }
323
// parseUint32 folds the ASCII decimal digits of s into a uint32, reading
// left to right. Every byte that is not '0'..'9' is skipped rather than
// treated as an error, so surrounding whitespace is harmless and a string
// with no digits yields 0. There is no overflow check: values beyond the
// uint32 range wrap around.
func parseUint32(s string) uint32 {
	var value uint32
	for _, b := range []byte(s) {
		if b < '0' || b > '9' {
			continue
		}
		value = value*10 + uint32(b-'0')
	}
	return value
}
334