lemma_ja.mx raw
1 package iskra
2
3 // volClassFromSuffix returns the godan class implied by a volitional compound
4 // ending. Returns VClassIchidan for よう (and for unknown patterns).
5 func volClassFromSuffix(suffix []byte) uint8 {
6 switch string(suffix) {
7 case "\xe3\x81\x93\xe3\x81\x86":
8 return VClassGodanKu
9 case "\xe3\x81\x94\xe3\x81\x86":
10 return VClassGodanGu
11 case "\xe3\x81\x9d\xe3\x81\x86":
12 return VClassGodanSu
13 case "\xe3\x81\xa8\xe3\x81\x86":
14 return VClassGodanTsu
15 case "\xe3\x81\xae\xe3\x81\x86":
16 return VClassGodanNu
17 case "\xe3\x81\xbc\xe3\x81\x86":
18 return VClassGodanBu
19 case "\xe3\x82\x82\xe3\x81\x86":
20 return VClassGodanMu
21 case "\xe3\x82\x8d\xe3\x81\x86":
22 return VClassGodanRu
23 case "\xe3\x81\x8a\xe3\x81\x86":
24 return VClassGodanU
25 }
26 return VClassIchidan
27 }
28
29 // LemmatizeJA reduces a JA verb surface form to its bare stem + morph + class.
30 // The stem is consistent across all inflected forms of a verb (食べる/食べた/食べている → 食).
31 // The class identifies the conjugation pattern needed to reconstruct forms.
32 //
33 // Verb class detection:
34 // ichidan: stem ends in え/い-row hiragana (食べ, 起き). Dict-form: stem + る.
35 // godan-X: stem ends in kanji or contracted/i-stem kana. Class indicates u-row ending.
36 //
37 // Non-verbs (isVerb=false) pass through unchanged.
38 func LemmatizeJA(word string, isVerb bool) LemmaResult {
39 if word == UntranslatedMarker {
40 return LemmaResult{Lemma: word, Morph: 0, Class: 0}
41 }
42 if !isVerb {
43 if stem, morph, ok := stripJAIAdj([]byte(word)); ok {
44 return LemmaResult{Lemma: string(stem), Morph: morph, Class: VClassIAdj}
45 }
46 return LemmaResult{Lemma: word, Morph: 0, Class: 0}
47 }
48 b := []byte(word)
49 if len(b) < 3 {
50 return LemmaResult{Lemma: word, Morph: 0, Class: 0}
51 }
52
53 stem, morph, stripped := stripJAConjugation(b)
54 class := detectJAClass(b, stem, stripped, morph)
55 // Volitional: class can be inferred from the specific suffix matched
56 // (こう/もう/etc. for godan, よう for ichidan).
57 if morph&MetaMoodVol != 0 && stripped {
58 suffixLen := len(b) - len(stem)
59 if suffixLen >= 6 {
60 class = volClassFromSuffix(b[len(b)-suffixLen:])
61 }
62 }
63 // Single-kana stem after stripping (e.g. bare っ from った/って) is a
64 // tokenizer artifact, not a standalone verb. Pass through as-is.
65 // Multi-kana stems (6+ bytes, 2+ characters) are legitimate ichidan
66 // verb stems (くれ, つけ, かけ, etc.) and must not be rejected.
67 if stripped && len(stem) <= 3 && isBareSuffix(stem) {
68 return LemmaResult{Lemma: word, Morph: 0, Class: 0}
69 }
70 stem = canonicalizeJAStem(stem, stripped, class)
71
72 if len(stem) == 0 {
73 return LemmaResult{Lemma: word, Morph: morph, Class: class}
74 }
75 return LemmaResult{Lemma: string(stem), Morph: morph, Class: class}
76 }
77
78 // detectJAClass attempts to determine the verb class from the original surface
79 // form and what suffix was stripped.
80 func detectJAClass(orig, stem []byte, stripped bool, morph uint16) uint8 {
81 if !stripped {
82 // Dict form: look at last kana. 食べる → ichidan if pre-る is え/い-row.
83 return classFromDictForm(orig)
84 }
85 // Suffix was stripped. Use residual to infer class.
86 return classFromInflectedStem(stem, morph)
87 }
88
89 // classFromDictForm reads a dict-form verb (assumes no suffix stripping done).
90 // Returns the class based on the last 1-2 kana.
91 func classFromDictForm(b []byte) uint8 {
92 if len(b) < 3 {
93 return VClassNone
94 }
95 last := b[len(b)-3:]
96 ls := string(last)
97 switch ls {
98 case "\xe3\x82\x8b": // る
99 // Could be ichidan or godan-ru. Check preceding kana.
100 if len(b) >= 6 {
101 prev := string(b[len(b)-6 : len(b)-3])
102 if isERowOrIRowKana(prev) {
103 return VClassIchidan
104 }
105 }
106 return VClassGodanRu
107 case "\xe3\x81\x8f": // く
108 return VClassGodanKu
109 case "\xe3\x81\x90": // ぐ
110 return VClassGodanGu
111 case "\xe3\x81\x99": // す
112 return VClassGodanSu
113 case "\xe3\x81\xa4": // つ
114 return VClassGodanTsu
115 case "\xe3\x81\xac": // ぬ
116 return VClassGodanNu
117 case "\xe3\x81\xb6": // ぶ
118 return VClassGodanBu
119 case "\xe3\x82\x80": // む
120 return VClassGodanMu
121 case "\xe3\x81\x86": // う
122 return VClassGodanU
123 }
124 return VClassNone
125 }
126
127 // classFromInflectedStem guesses class from the stem after suffix removal.
128 func classFromInflectedStem(stem []byte, morph uint16) uint8 {
129 if len(stem) < 3 {
130 return VClassNone
131 }
132 last := stem[len(stem)-3:]
133 ls := string(last)
134
135 // Negative or passive/causative: stem ends in あ-row kana for godan, or
136 // the stem is a bare ichidan stem (え/い-row).
137 // Negative: 走らない (godan-ru: 走ら). Passive: 噛まれた (godan-mu: 噛ま).
138 // Causative: 待たせた (godan-tsu: 待た).
139 if morph&(MetaPolarNeg|MetaPassive|MetaCausative) != 0 {
140 if c := classFromARowKana(ls); c != VClassNone {
141 return c
142 }
143 if isERowKana(ls) || isIRowKana(ls) {
144 return VClassIchidan
145 }
146 return VClassNone
147 }
148
149 // Polite (ます-family stripped) leaves i-row kana for godan or bare stem for ichidan.
150 if morph&MetaFormalityPol != 0 {
151 if c := classFromIStem(ls); c != VClassIchidan {
152 return c
153 }
154 if isERowKana(ls) || isIRowKana(ls) {
155 return VClassIchidan
156 }
157 return VClassNone
158 }
159
160 // Te/ta/de/da stripped: contracted forms for godan, bare stem for ichidan.
161 // っ contraction is ambiguous: godan-u (買う→買っ), godan-tsu (持つ→持っ),
162 // godan-ru (知る→知っ). Disambiguate by looking up the kanji root in a
163 // small table; default to godan-ru (most common っ-contracting class).
164 if ls == "\xe3\x81\xa3" { // っ
165 return classTsuContract(stem)
166 }
167 if ls == "\xe3\x82\x93" { // ん - godan nu/bu/mu voiced contraction
168 return VClassGodanMu
169 }
170 if ls == "\xe3\x81\x84" { // い - godan ku/gu past
171 return VClassGodanKu
172 }
173 if ls == "\xe3\x81\x97" { // し - godan su past
174 return VClassGodanSu
175 }
176 // Bare stem ending in え/い-row → ichidan
177 if isERowKana(ls) || isIRowKana(ls) {
178 return VClassIchidan
179 }
180 return VClassNone
181 }
182
183 // classFromARowKana maps the あ-row kana found before ない to the godan class.
184 func classFromARowKana(ls string) uint8 {
185 switch ls {
186 case "\xe3\x81\x8b": // か
187 return VClassGodanKu
188 case "\xe3\x81\x8c": // が
189 return VClassGodanGu
190 case "\xe3\x81\x95": // さ
191 return VClassGodanSu
192 case "\xe3\x81\x9f": // た
193 return VClassGodanTsu
194 case "\xe3\x81\xaa": // な
195 return VClassGodanNu
196 case "\xe3\x81\xb0": // ば
197 return VClassGodanBu
198 case "\xe3\x81\xbe": // ま
199 return VClassGodanMu
200 case "\xe3\x82\x89": // ら
201 return VClassGodanRu
202 case "\xe3\x82\x8f": // わ
203 return VClassGodanU
204 }
205 return VClassNone
206 }
207
208 // classFromIStem maps an i-row kana to the corresponding godan class.
209 func classFromIStem(ls string) uint8 {
210 switch ls {
211 case "\xe3\x81\x8d": // き
212 return VClassGodanKu
213 case "\xe3\x81\x8e": // ぎ
214 return VClassGodanGu
215 case "\xe3\x81\x97": // し
216 return VClassGodanSu
217 case "\xe3\x81\xa1": // ち
218 return VClassGodanTsu
219 case "\xe3\x81\xab": // に
220 return VClassGodanNu
221 case "\xe3\x81\xb3": // び
222 return VClassGodanBu
223 case "\xe3\x81\xbf": // み
224 return VClassGodanMu
225 case "\xe3\x82\x8a": // り
226 return VClassGodanRu
227 case "\xe3\x81\x84": // い (from godan-う i-stem like 買い)
228 return VClassGodanU
229 }
230 return VClassIchidan
231 }
232
233 // classTsuContract disambiguates っ te-form contraction among godan-u, godan-tsu,
234 // and godan-ru by kanji root lookup. Default: godan-ru (most frequent).
235 func classTsuContract(stem []byte) uint8 {
236 // Strip trailing っ (3 bytes: E3 81 A3) to get kanji root.
237 root := stem
238 if len(root) >= 3 {
239 tail := root[len(root)-3:]
240 if tail[0] == 0xe3 && tail[1] == 0x81 && tail[2] == 0xa3 {
241 root = root[:len(root)-3]
242 }
243 }
244 if len(root) == 0 {
245 return VClassGodanRu
246 }
247 // Extract last rune (the kanji) for table lookup.
248 // CJK kanji are 3-byte UTF-8 sequences (E0-EF range).
249 var kanji string
250 if len(root) >= 3 && root[len(root)-3] >= 0xe0 {
251 kanji = string(root[len(root)-3:])
252 } else if len(root) >= 2 && root[len(root)-2] >= 0xc0 {
253 kanji = string(root[len(root)-2:])
254 } else {
255 kanji = string(root[len(root)-1:])
256 }
257 switch kanji {
258 // Godan-tsu (dict form ends in つ)
259 case "\xe6\x8c\x81": // 持 (motsu)
260 return VClassGodanTsu
261 case "\xe7\xab\x8b": // 立 (tatsu)
262 return VClassGodanTsu
263 case "\xe5\xbe\x85": // 待 (matsu)
264 return VClassGodanTsu
265 case "\xe6\x89\x93": // 打 (utsu)
266 return VClassGodanTsu
267 case "\xe5\x8b\x9d": // 勝 (katsu)
268 return VClassGodanTsu
269 case "\xe8\x82\xb2": // 育 (sodatsu)
270 return VClassGodanTsu
271 case "\xe7\xb5\x8c": // 経 (tatsu - time passes)
272 return VClassGodanTsu
273 case "\xe4\xbf\x9d": // 保 (motsu - to last)
274 return VClassGodanTsu
275 case "\xe7\x99\xba": // 発 (tatsu - depart)
276 return VClassGodanTsu
277 case "\xe5\xbb\xba": // 建 (tatsu - build)
278 return VClassGodanTsu
279 // Godan-u (dict form ends in う)
280 case "\xe8\xb2\xb7": // 買 (kau)
281 return VClassGodanU
282 case "\xe4\xbc\x9a": // 会 (au)
283 return VClassGodanU
284 case "\xe6\xad\x8c": // 歌 (utau)
285 return VClassGodanU
286 case "\xe8\xa8\x80": // 言 (iu)
287 return VClassGodanU
288 case "\xe4\xbd\xbf": // 使 (tsukau)
289 return VClassGodanU
290 case "\xe6\x89\x95": // 払 (harau)
291 return VClassGodanU
292 case "\xe6\xb4\x97": // 洗 (arau)
293 return VClassGodanU
294 case "\xe7\xac\x91": // 笑 (warau)
295 return VClassGodanU
296 case "\xe9\x81\x95": // 違 (chigau)
297 return VClassGodanU
298 case "\xe6\x80\x9d": // 思 (omou)
299 return VClassGodanU
300 case "\xe8\xbf\xbd": // 追 (ou)
301 return VClassGodanU
302 case "\xe8\xaa\x98": // 誘 (sasou)
303 return VClassGodanU
304 case "\xe6\x8b\xbe": // 拾 (hirou)
305 return VClassGodanU
306 case "\xe9\xa3\xbc": // 飼 (kau - keep/raise)
307 return VClassGodanU
308 case "\xe5\x90\xb8": // 吸 (suu)
309 return VClassGodanU
310 case "\xe5\x90\x88": // 合 (au - match)
311 return VClassGodanU
312 case "\xe6\x95\x91": // 救 (sukuu)
313 return VClassGodanU
314 case "\xe6\x8b\x89": // 拉 (hirau - not standard, but appears)
315 return VClassGodanU
316 }
317 // Default: godan-ru (知る, 取る, 送る, 作る, 乗る, 売る, etc.)
318 return VClassGodanRu
319 }
320
321 // canonicalizeJAStem normalizes the stem to its canonical form per class.
322 // For ichidan: keep the え/い-row kana (食べ stays 食べ).
323 // For godan: canonical form is the dict form (kanji root + u-row ending).
324 // Dict form 行く: keep as-is.
325 // Inflected 行った: strip to kanji root, re-append く.
326 // This preserves lexical identity: 行く (iku, go) vs 行う (okonau, do)
327 // produce distinct lemmas instead of both collapsing to 行.
328 func canonicalizeJAStem(stem []byte, stripped bool, class uint8) []byte {
329 if class == VClassIchidan {
330 if !stripped {
331 return stripTrailingHiragana(stem)
332 }
333 return stem // 食べ is already canonical for ichidan
334 }
335 // Godan: strip to kanji root, then append the dict-form ending.
336 root := stem
337 if !stripped {
338 root = stripTrailingHiragana(stem)
339 } else {
340 root = stripAllTrailingHiragana(stem)
341 }
342 suffix := godanDictSuffix(class)
343 if suffix == "" {
344 return root
345 }
346 out := []byte{:0:len(root) + 3}
347 out = append(out, root...)
348 out = append(out, []byte(suffix)...)
349 return out
350 }
351
352 func godanDictSuffix(class uint8) string {
353 switch class {
354 case VClassGodanKu:
355 return "\xe3\x81\x8f" // く
356 case VClassGodanGu:
357 return "\xe3\x81\x90" // ぐ
358 case VClassGodanSu:
359 return "\xe3\x81\x99" // す
360 case VClassGodanTsu:
361 return "\xe3\x81\xa4" // つ
362 case VClassGodanNu:
363 return "\xe3\x81\xac" // ぬ
364 case VClassGodanBu:
365 return "\xe3\x81\xb6" // ぶ
366 case VClassGodanMu:
367 return "\xe3\x82\x80" // む
368 case VClassGodanRu:
369 return "\xe3\x82\x8b" // る
370 case VClassGodanU:
371 return "\xe3\x81\x86" // う
372 }
373 return ""
374 }
375
// stripTrailingHiragana removes exactly one trailing hiragana character, if
// there is one, and returns the shortened sub-slice of b. Input that is
// shorter than 3 bytes or does not end in hiragana is returned unchanged.
func stripTrailingHiragana(b []byte) []byte {
	n := len(b)
	if n < 3 || b[n-3] != 0xe3 {
		return b
	}
	// Hiragana is U+3040-U+309F: E3 81 xx plus E3 82 80-9F. The rest of the
	// E3 82 row (A0-BF) is katakana ァ…タ and must not be stripped.
	if b[n-2] == 0x81 || (b[n-2] == 0x82 && b[n-1] <= 0x9f) {
		return b[:n-3]
	}
	return b
}
387
// stripAllTrailingHiragana removes trailing hiragana characters one at a time
// and returns the shortened sub-slice of b. It stops at the first trailing
// character that is not hiragana, and never shortens b to 3 bytes or fewer
// than it already is, so a kana-only input keeps its first character.
func stripAllTrailingHiragana(b []byte) []byte {
	for len(b) > 3 {
		n := len(b)
		// Hiragana is U+3040-U+309F: E3 81 xx plus E3 82 80-9F. The rest of
		// the E3 82 row (A0-BF) is katakana ァ…タ and ends the run.
		hiragana := b[n-3] == 0xe3 &&
			(b[n-2] == 0x81 || (b[n-2] == 0x82 && b[n-1] <= 0x9f))
		if !hiragana {
			break
		}
		b = b[:n-3]
	}
	return b
}
400
// isBareSuffix reports whether stem is non-empty and consists solely of
// 3-byte kana (hiragana U+3040-U+309F or katakana U+30A0-U+30FF), i.e. has no
// kanji root. Such stems come from tokenizer splits (思った -> 思 + った) and
// shouldn't be canonicalized as standalone verbs.
func isBareSuffix(stem []byte) bool {
	// Kana are exactly 3 bytes each, so any other length cannot be all-kana.
	if len(stem) == 0 || len(stem)%3 != 0 {
		return false
	}
	for pos := 0; pos < len(stem); pos += 3 {
		lead, second := stem[pos], stem[pos+1]
		if lead != 0xe3 || second < 0x81 || second > 0x83 {
			return false // non-kana character found = has kanji root
		}
	}
	return true
}
417
// isERowKana reports whether s is exactly one え-row hiragana:
// え け せ て ね へ め れ, or voiced/semi-voiced げ ぜ で べ ぺ.
// The archaic ゑ is not in the accepted set.
func isERowKana(s string) bool {
	if len(s) != 3 || s[0] != 0xe3 {
		return false
	}
	switch s[1] {
	case 0x81:
		switch s[2] {
		case 0x88, 0x91, 0x92, 0x9b, 0x9c, 0xa6, 0xa7, 0xad, 0xb8, 0xb9, 0xba:
			// え け げ せ ぜ て で ね へ べ ぺ
			return true
		}
	case 0x82:
		// め (E3 82 81) and れ (E3 82 8C)
		return s[2] == 0x81 || s[2] == 0x8c
	}
	return false
}
433
// isIRowKana reports whether s is exactly one い-row hiragana:
// い き し ち に ひ み り, or voiced/semi-voiced ぎ じ ぢ び ぴ.
func isIRowKana(s string) bool {
	if len(s) != 3 || s[0] != 0xe3 {
		return false
	}
	switch s[1] {
	case 0x81:
		switch s[2] {
		case 0x84, 0x8d, 0x8e, 0x97, 0x98, 0xa1, 0xa2, 0xab, 0xb2, 0xb3, 0xb4, 0xbf:
			// い き ぎ し じ ち ぢ に ひ び ぴ み
			return true
		}
	case 0x82:
		// り (E3 82 8A)
		return s[2] == 0x8a
	}
	return false
}
446
447 func isERowOrIRowKana(s string) bool {
448 return isERowKana(s) || isIRowKana(s)
449 }
450
451 // stripJACopula identifies sentence-final copula forms on a noun-final token
452 // (彼は学生だ, 学生でした). Returns (atom-with-copula-stripped, true, morph-bits)
453 // or (input, false, 0) if no copula form found.
454 //
455 // Copula forms (longest first to avoid premature short matches):
456 // でした → past + polite
457 // だった → past + plain
458 // です → non-past + polite
459 // だ → non-past + plain
460 //
461 // Caller must verify this is NOT a verb context (verb past-tense た/だ is a
462 // separate suffix on conjugated verbs; this only fires on noun-final tokens).
463 func stripJACopula(word string) (string, bool, uint16) {
464 b := []byte(word)
465 // でした = E3 81 A7 E3 81 97 E3 81 9F (9 bytes)
466 deshita := []byte("\xe3\x81\xa7\xe3\x81\x97\xe3\x81\x9f")
467 if len(b) > len(deshita) && string(b[len(b)-len(deshita):]) == string(deshita) {
468 return string(b[:len(b)-len(deshita)]), true, MetaTensePast | MetaFormalityPol
469 }
470 // だった = E3 81 A0 E3 81 A3 E3 81 9F (9 bytes)
471 datta := []byte("\xe3\x81\xa0\xe3\x81\xa3\xe3\x81\x9f")
472 if len(b) > len(datta) && string(b[len(b)-len(datta):]) == string(datta) {
473 return string(b[:len(b)-len(datta)]), true, MetaTensePast
474 }
475 // です = E3 81 A7 E3 81 99 (6 bytes)
476 desu := []byte("\xe3\x81\xa7\xe3\x81\x99")
477 if len(b) > len(desu) && string(b[len(b)-len(desu):]) == string(desu) {
478 return string(b[:len(b)-len(desu)]), true, MetaFormalityPol
479 }
480 // だ = E3 81 A0 (3 bytes). Bare だ is ambiguous with verb past contractions
481 // (噛んだ, 飛んだ, 死んだ for godan-mu/bu/nu). Only treat as copula if the
482 // preceding character is not a verb-contraction signal kana (ん or っ).
483 da := []byte("\xe3\x81\xa0")
484 if len(b) > len(da) && string(b[len(b)-len(da):]) == string(da) {
485 stem := b[:len(b)-len(da)]
486 if len(stem) >= 3 {
487 tail := stem[len(stem)-3:]
488 // ん is E3 82 93, っ is E3 81 A3 - verb past contractions.
489 if string(tail) == "\xe3\x82\x93" || string(tail) == "\xe3\x81\xa3" {
490 return word, false, 0
491 }
492 }
493 return string(stem), true, 0
494 }
495 return word, false, 0
496 }
497
498 // stripJAIAdj strips i-adjective inflection suffixes and returns the kanji
499 // stem. Returns (stem, morph, true) on success. Requires the stem to contain
500 // at least one kanji character to avoid false positives on pure-kana words.
501 //
502 // Forms (longest first):
503 // くなかった -> stem, neg+past (赤くなかった -> 赤)
504 // かった -> stem, past (赤かった -> 赤)
505 // くない -> stem, neg (赤くない -> 赤)
506 // ければ -> stem, conditional (赤ければ -> 赤)
507 // くて -> stem, te-form (赤くて -> 赤)
508 // く -> stem, adverbial (赤く -> 赤)
509 // さ -> stem, nominal (赤さ -> 赤)
510 // い -> stem, dict (赤い -> 赤)
511 func stripJAIAdj(b []byte) ([]byte, uint16, bool) {
512 var stem []byte
513 var morph uint16
514
515 // Only strip compound suffixes that are unambiguously i-adjective.
516 // Bare い/く/さ create false positives on godan-ku verbs (行く,書く)
517 // and other non-adjective words.
518
519 // くなかった = く(E3 81 8F) + な(E3 81 AA) + か(E3 81 8B) + っ(E3 81 A3) + た(E3 81 9F) = 15 bytes
520 if len(b) > 15 && hasSuffix(b, "\xe3\x81\x8f\xe3\x81\xaa\xe3\x81\x8b\xe3\x81\xa3\xe3\x81\x9f") {
521 stem = b[:len(b)-15]
522 morph = MetaPolarNeg | MetaTensePast
523 } else if len(b) > 12 && hasSuffix(b, "\xe3\x81\x8b\xe3\x81\xa3\xe3\x81\x9f") {
524 // かった = か(E3 81 8B) + っ(E3 81 A3) + た(E3 81 9F) = 9 bytes
525 stem = b[:len(b)-9]
526 morph = MetaTensePast
527 } else if len(b) > 9 && hasSuffix(b, "\xe3\x81\x8f\xe3\x81\xaa\xe3\x81\x84") {
528 // くない = く(E3 81 8F) + な(E3 81 AA) + い(E3 81 84) = 9 bytes
529 stem = b[:len(b)-9]
530 morph = MetaPolarNeg
531 } else if len(b) > 9 && hasSuffix(b, "\xe3\x81\x91\xe3\x82\x8c\xe3\x81\xb0") {
532 // ければ = け(E3 81 91) + れ(E3 82 8C) + ば(E3 81 B0) = 9 bytes
533 stem = b[:len(b)-9]
534 morph = 0
535 } else if len(b) > 6 && hasSuffix(b, "\xe3\x81\x8f\xe3\x81\xa6") {
536 // くて = く(E3 81 8F) + て(E3 81 A6) = 6 bytes
537 stem = b[:len(b)-6]
538 morph = MetaAspectProg
539 } else {
540 return nil, 0, false
541 }
542
543 if len(stem) < 3 {
544 return nil, 0, false
545 }
546 if !hasKanji(stem) {
547 return nil, 0, false
548 }
549 return stem, morph, true
550 }
551
// hasKanji reports whether b contains a 3-byte UTF-8 character whose lead
// byte is E4-E9 (code points U+4000-U+9FFF), used here as the test for
// "kanji". The scan steps by the width implied by each lead byte and only
// inspects positions that have at least two more bytes after them.
func hasKanji(b []byte) bool {
	for pos := 0; pos+2 < len(b); {
		switch lead := b[pos]; {
		case lead >= 0xe4 && lead <= 0xe9:
			return true
		case lead < 0x80: // ASCII
			pos++
		case lead < 0xe0: // 2-byte sequence
			pos += 2
		case lead < 0xf0: // other 3-byte sequence (kana, punctuation, ...)
			pos += 3
		default: // 4-byte sequence
			pos += 4
		}
	}
	return false
}
569
// endsInIKana reports whether s ends in the hiragana い (UTF-8 E3 81 84).
// Used as a heuristic for i-adjective detection (赤い, 楽しい).
// Known false positives: な-adjectives ending in い (きれい) and nouns whose
// kana spelling ends in い. The positional constraint at the call site
// (modifier immediately before a noun, no intervening particle) is expected
// to filter most of them.
func endsInIKana(s string) bool {
	const kanaI = "\xe3\x81\x84" // い
	return len(s) >= len(kanaI) && s[len(s)-len(kanaI):] == kanaI
}
584
// isJADitransitive reports whether lemma is a JA ditransitive verb taking a
// に-marked recipient and a を-marked patient: 彼に本をあげる (give him a
// book). When the clause's verb matches, に-marked slots flip from the
// ORGoal/ORLoc default to ORRecip. Closed set.
//
// Lemmas are matched in their post-lemmatization shape. Ichidan verbs are
// listed by stem (あげ). Godan verbs are listed both by bare kanji root (送)
// and by dictionary form (送る), because the godan canonical lemma produced
// by canonicalizeJAStem is the dictionary form.
func isJADitransitive(lemma string) bool {
	switch lemma {
	// Ichidan stems: あげ (give), くれ (give to me), 教え (teach/tell),
	// 見せ (show), 与え (give/grant).
	case "\xe3\x81\x82\xe3\x81\x92",
		"\xe3\x81\x8f\xe3\x82\x8c",
		"\xe6\x95\x99\xe3\x81\x88",
		"\xe8\xa6\x8b\xe3\x81\x9b",
		"\xe4\xb8\x8e\xe3\x81\x88":
		return true

	// Godan bare roots: や (やる), 渡 (hand), 送 (send), 売 (sell),
	// 買 (buy), 貸 (lend), 払 (pay).
	case "\xe3\x82\x84",
		"\xe6\xb8\xa1",
		"\xe9\x80\x81",
		"\xe5\xa3\xb2",
		"\xe8\xb2\xb7",
		"\xe8\xb2\xb8",
		"\xe6\x89\x95":
		return true

	// Godan dictionary forms: やる, 渡す, 送る, 売る, 買う, 貸す, 払う.
	case "\xe3\x82\x84\xe3\x82\x8b",
		"\xe6\xb8\xa1\xe3\x81\x99",
		"\xe9\x80\x81\xe3\x82\x8b",
		"\xe5\xa3\xb2\xe3\x82\x8b",
		"\xe8\xb2\xb7\xe3\x81\x86",
		"\xe8\xb2\xb8\xe3\x81\x99",
		"\xe6\x89\x95\xe3\x81\x86":
		return true
	}
	return false
}
610
// isJATemporalNoun reports whether s is a temporal noun that functions as a
// sentence-initial adverbial (today, yesterday, tomorrow, etc.). These
// surface as bare nouns without a particle but semantically modify the
// clause's verb. Closed set; treat as MKAdv adverbial during extraction.
func isJATemporalNoun(s string) bool {
	switch s {
	// Days and "now": 昨日 (yesterday), 今日 (today), 明日 (tomorrow), 今 (now).
	case "\xe6\x98\xa8\xe6\x97\xa5", "\xe4\xbb\x8a\xe6\x97\xa5", "\xe6\x98\x8e\xe6\x97\xa5", "\xe4\xbb\x8a":
		return true

	// Specific parts of a day: 昨夜 (last night), 今朝 (this morning),
	// 今晩 (tonight), 今夜 (tonight).
	case "\xe6\x98\xa8\xe5\xa4\x9c", "\xe4\xbb\x8a\xe6\x9c\x9d", "\xe4\xbb\x8a\xe6\x99\xa9", "\xe4\xbb\x8a\xe5\xa4\x9c":
		return true

	// Relative week/month/year: 来週 (next week), 先週 (last week),
	// 来月 (next month), 先月 (last month), 来年 (next year), 去年 (last year).
	case "\xe6\x9d\xa5\xe9\x80\xb1", "\xe5\x85\x88\xe9\x80\xb1",
		"\xe6\x9d\xa5\xe6\x9c\x88", "\xe5\x85\x88\xe6\x9c\x88",
		"\xe6\x9d\xa5\xe5\xb9\xb4", "\xe5\x8e\xbb\xe5\xb9\xb4":
		return true

	// Recurring: 毎日 (every day), 毎週 (every week), 毎月 (every month),
	// 毎年 (every year).
	case "\xe6\xaf\x8e\xe6\x97\xa5", "\xe6\xaf\x8e\xe9\x80\xb1", "\xe6\xaf\x8e\xe6\x9c\x88", "\xe6\xaf\x8e\xe5\xb9\xb4":
		return true

	// Generic parts of a day: 朝 (morning), 昼 (noon), 夜 (night), 夕方 (evening).
	case "\xe6\x9c\x9d", "\xe6\x98\xbc", "\xe5\xa4\x9c", "\xe5\xa4\x95\xe6\x96\xb9":
		return true

	// Interrogative: いつ (when).
	case "\xe3\x81\x84\xe3\x81\xa4":
		return true
	}
	return false
}
650
// isJARelationalNoun reports whether s is a relational noun used in the
// の + relational noun + に locative compound (箱の中に = "in the box").
// Closed set; expansions belong here.
func isJARelationalNoun(s string) bool {
	// Every member of the set is a single 3-byte kanji.
	if len(s) != 3 {
		return false
	}
	switch s {
	// Containment: 中 (inside), 内 (within), 外 (outside).
	case "\xe4\xb8\xad", "\xe5\x86\x85", "\xe5\xa4\x96":
		return true
	// Vertical: 上 (on/above), 下 (under).
	case "\xe4\xb8\x8a", "\xe4\xb8\x8b":
		return true
	// Horizontal: 前 (in front), 後 (behind), 横 (beside), 隣 (next to),
	// 間 (between).
	case "\xe5\x89\x8d", "\xe5\xbe\x8c", "\xe6\xa8\xaa", "\xe9\x9a\xa3", "\xe9\x96\x93":
		return true
	}
	return false
}
672
673 // jaRelationalNounToOblRole maps a relational noun to its semantic role.
674 // Most map to ORLoc with implicit "inside" semantics for the EN renderer;
675 // position-specific (on, under, in-front) would need a finer-grained
676 // representation. For now all locative-compounds collapse to ORLoc.
677 func jaRelationalNounToOblRole(s string) uint8 {
678 if isJARelationalNoun(s) {
679 return ORLoc
680 }
681 return ORNone
682 }
683
// endsInNaiSuffix reports whether s ends in ない (negative verb suffix,
// UTF-8 E3 81 AA E3 81 84). Used to exclude negative-verb forms from
// predicative-i-adjective detection, since ない ends in い and would
// otherwise false-match endsInIKana.
func endsInNaiSuffix(s string) bool {
	const nai = "\xe3\x81\xaa\xe3\x81\x84" // ない
	return len(s) >= len(nai) && s[len(s)-len(nai):] == nai
}
696
// endsInTaiSuffix reports whether s ends in たい (desiderative verb suffix,
// UTF-8 E3 81 9F E3 81 84). Used to exclude desiderative verb forms
// (食べたい / want to eat) from predicative-i-adjective detection.
func endsInTaiSuffix(s string) bool {
	const tai = "\xe3\x81\x9f\xe3\x81\x84" // たい
	return len(s) >= len(tai) && s[len(s)-len(tai):] == tai
}
709
// isJABareKanjiAdj reports whether s is exactly one of the single-kanji
// adjective roots 赤, 白, 黒, 大, 小, 新, 古, 長, 高, 低. These appear as
// bare-kanji ATTR modifiers in rendered JA when the い-suffix was not
// included.
func isJABareKanjiAdj(s string) bool {
	if len(s) != 3 {
		return false
	}
	switch s {
	case "\xe8\xb5\xa4", // 赤
		"\xe7\x99\xbd", // 白
		"\xe9\xbb\x92", // 黒
		"\xe5\xa4\xa7", // 大
		"\xe5\xb0\x8f", // 小
		"\xe6\x96\xb0", // 新
		"\xe5\x8f\xa4", // 古
		"\xe9\x95\xb7", // 長
		"\xe9\xab\x98", // 高
		"\xe4\xbd\x8e": // 低
		return true
	}
	return false
}
735
// endsInTaKana reports whether s ends in the hiragana た (UTF-8 E3 81 9F).
// Used to detect past-tense verb forms preceding a head noun (REL clause):
// 食べた猫 (the cat that ate), 見た本 (the book I saw). The morphological
// splitter exposes the boundary; this checks that the prior slot looks like a
// past-tense verb. False positives: nouns ending in た are uncommon.
func endsInTaKana(s string) bool {
	const kanaTa = "\xe3\x81\x9f" // た
	return len(s) >= len(kanaTa) && s[len(s)-len(kanaTa):] == kanaTa
}
750
// endsInKuKana reports whether s ends in the hiragana く (UTF-8 E3 81 8F).
// Used as a heuristic for く-form adverbial detection: 速く (hayaku=quickly),
// 高く (takaku=highly), 楽しく (tanoshiku=enjoyably). The く-form is the
// adverbial inflection of an i-adjective and binds to the following verb.
func endsInKuKana(s string) bool {
	const kanaKu = "\xe3\x81\x8f" // く
	return len(s) >= len(kanaKu) && s[len(s)-len(kanaKu):] == kanaKu
}
764
765 // stripJAConjugation matches the longest known conjugation suffix.
766 // Returns (remaining bytes, morph bits, stripped flag).
767 //
768 // Order matters: longest forms first. Passive (れる/られる) and causative
769 // (せる/させる) must be checked before simpler suffix endings to avoid
770 // premature shorter matches.
771 func stripJAConjugation(b []byte) ([]byte, uint16, bool) {
772 // Passive/causative past combinations.
773 // Guard: if the stem after stripping is bare kana (isBareSuffix), the
774 // match is likely an ichidan verb whose stem ends in れ/せ colliding
775 // with the passive/causative suffix (くれた = くれ+た, not く+れた).
776 // Reject and fall through to simpler suffixes.
777 if rest, ok := stripJASuffix(b, jaSeraretaPast); ok && !isBareSuffix(rest) {
778 return rest, MetaCausative | MetaPassive | MetaTensePast, true
779 }
780 if rest, ok := stripJASuffix(b, jaRaretaPast); ok && !isBareSuffix(rest) {
781 return rest, MetaPassive | MetaTensePast, true
782 }
783 if rest, ok := stripJASuffix(b, jaSasetaPast); ok && !isBareSuffix(rest) {
784 return rest, MetaCausative | MetaTensePast, true
785 }
786 // Passive/causative non-past
787 if rest, ok := stripJASuffix(b, jaSerareru); ok && !isBareSuffix(rest) {
788 return rest, MetaCausative | MetaPassive, true
789 }
790 if rest, ok := stripJASuffix(b, jaRareru); ok && !isBareSuffix(rest) {
791 return rest, MetaPassive, true
792 }
793 if rest, ok := stripJASuffix(b, jaSaseru); ok && !isBareSuffix(rest) {
794 return rest, MetaCausative, true
795 }
796 if rest, ok := stripJASuffix(b, jaReru); ok && !isBareSuffix(rest) {
797 return rest, MetaPassive, true
798 }
799 if rest, ok := stripJASuffix(b, jaSeru); ok && !isBareSuffix(rest) {
800 return rest, MetaCausative, true
801 }
802 if rest, ok := stripJASuffix(b, jaRetaPast); ok && !isBareSuffix(rest) {
803 return rest, MetaPassive | MetaTensePast, true
804 }
805 if rest, ok := stripJASuffix(b, jaSetaPast); ok && !isBareSuffix(rest) {
806 return rest, MetaCausative | MetaTensePast, true
807 }
808 if rest, ok := stripJASuffix(b, jaNakatta); ok {
809 return rest, MetaPolarNeg | MetaTensePast, true
810 }
811 // Desiderative (たい family) - before ない/た to avoid partial match.
812 if rest, ok := stripJASuffix(b, jaTakunai); ok {
813 return rest, MetaMoodVol | MetaPolarNeg, true
814 }
815 if rest, ok := stripJASuffix(b, jaTakatta); ok {
816 return rest, MetaMoodVol | MetaTensePast, true
817 }
818 if rest, ok := stripJASuffix(b, jaTai); ok {
819 return rest, MetaMoodVol, true
820 }
821 if rest, ok := stripJASuffix(b, jaMashita); ok {
822 return rest, MetaTensePast | MetaFormalityPol, true
823 }
824 if rest, ok := stripJASuffix(b, jaMasen); ok {
825 return rest, MetaPolarNeg | MetaFormalityPol, true
826 }
827 if rest, ok := stripJASuffix(b, jaTeiru); ok {
828 return rest, MetaAspectProg, true
829 }
830 if rest, ok := stripJASuffix(b, jaTeita); ok {
831 return rest, MetaAspectProg | MetaTensePast, true
832 }
833 if rest, ok := stripJASuffix(b, jaDeiru); ok {
834 return rest, MetaAspectProg, true
835 }
836 if rest, ok := stripJASuffix(b, jaDeita); ok {
837 return rest, MetaAspectProg | MetaTensePast, true
838 }
839 if rest, ok := stripJASuffix(b, jaMasu); ok {
840 return rest, MetaFormalityPol, true
841 }
842 if rest, ok := stripJASuffix(b, jaNai); ok {
843 return rest, MetaPolarNeg, true
844 }
845 // Godan volitional compounds (longest first; each is お-row kana + う).
846 // These must match before bare う suffixes or particles.
847 if rest, ok := stripJASuffix(b, jaKoU); ok {
848 return rest, MetaMoodVol, true
849 }
850 if rest, ok := stripJASuffix(b, jaGoU); ok {
851 return rest, MetaMoodVol, true
852 }
853 if rest, ok := stripJASuffix(b, jaSoU); ok {
854 return rest, MetaMoodVol, true
855 }
856 if rest, ok := stripJASuffix(b, jaToU); ok {
857 return rest, MetaMoodVol, true
858 }
859 if rest, ok := stripJASuffix(b, jaNoU); ok {
860 return rest, MetaMoodVol, true
861 }
862 if rest, ok := stripJASuffix(b, jaBoU); ok {
863 return rest, MetaMoodVol, true
864 }
865 if rest, ok := stripJASuffix(b, jaMoU); ok {
866 return rest, MetaMoodVol, true
867 }
868 if rest, ok := stripJASuffix(b, jaRoU); ok {
869 return rest, MetaMoodVol, true
870 }
871 if rest, ok := stripJASuffix(b, jaOU); ok {
872 return rest, MetaMoodVol, true
873 }
874 if rest, ok := stripJASuffix(b, jaYou); ok {
875 return rest, MetaMoodVol, true
876 }
877 if rest, ok := stripJASuffix(b, jaTe); ok {
878 return rest, MetaAspectProg, true
879 }
880 if rest, ok := stripJASuffix(b, jaDe); ok {
881 return rest, MetaAspectProg, true
882 }
883 if rest, ok := stripJASuffix(b, jaTa); ok {
884 return rest, MetaTensePast, true
885 }
886 if rest, ok := stripJASuffix(b, jaDa); ok {
887 return rest, MetaTensePast, true
888 }
889 if rest, ok := stripJASuffix(b, jaBa); ok {
890 // ば: provisional conditional. Strip from lemma; the ClauseIf
891 // relation lives at the discourse layer, not on the verb morph.
892 return rest, 0, true
893 }
894 return b, 0, false
895 }
896
// Conjugation suffixes matched by stripJAConjugation, spelled as raw UTF-8
// byte escapes (each kana is 3 bytes). The trailing comment on every line
// shows the kana the bytes encode.
const (
	// Progressive (て/で + いる) and its past.
	jaTeiru = "\xe3\x81\xa6\xe3\x81\x84\xe3\x82\x8b" // ている
	jaTeita = "\xe3\x81\xa6\xe3\x81\x84\xe3\x81\x9f" // ていた
	jaDeiru = "\xe3\x81\xa7\xe3\x81\x84\xe3\x82\x8b" // でいる
	jaDeita = "\xe3\x81\xa7\xe3\x81\x84\xe3\x81\x9f" // でいた
	// Desiderative (たい family).
	jaTai = "\xe3\x81\x9f\xe3\x81\x84" // たい
	jaTakatta = "\xe3\x81\x9f\xe3\x81\x8b\xe3\x81\xa3\xe3\x81\x9f" // たかった
	jaTakunai = "\xe3\x81\x9f\xe3\x81\x8f\xe3\x81\xaa\xe3\x81\x84" // たくない
	// Plain negative and its past.
	jaNai = "\xe3\x81\xaa\xe3\x81\x84" // ない
	jaNakatta = "\xe3\x81\xaa\xe3\x81\x8b\xe3\x81\xa3\xe3\x81\x9f" // なかった
	// Polite (ます family).
	jaMasu = "\xe3\x81\xbe\xe3\x81\x99" // ます
	jaMashita = "\xe3\x81\xbe\xe3\x81\x97\xe3\x81\x9f" // ました
	jaMasen = "\xe3\x81\xbe\xe3\x81\x9b\xe3\x82\x93" // ません
	// Ichidan volitional.
	jaYou = "\xe3\x82\x88\xe3\x81\x86" // よう
	// Single-kana endings, tried last.
	jaTe = "\xe3\x81\xa6" // て
	jaDe = "\xe3\x81\xa7" // で
	jaTa = "\xe3\x81\x9f" // た
	jaDa = "\xe3\x81\xa0" // だ
	jaBa = "\xe3\x81\xb0" // ば (conditional)

	// Passive: れる/られる
	jaRareru = "\xe3\x82\x89\xe3\x82\x8c\xe3\x82\x8b" // られる
	jaReru = "\xe3\x82\x8c\xe3\x82\x8b" // れる
	jaRaretaPast = "\xe3\x82\x89\xe3\x82\x8c\xe3\x81\x9f" // られた
	jaRetaPast = "\xe3\x82\x8c\xe3\x81\x9f" // れた

	// Causative: せる/させる
	jaSaseru = "\xe3\x81\x95\xe3\x81\x9b\xe3\x82\x8b" // させる
	jaSeru = "\xe3\x81\x9b\xe3\x82\x8b" // せる
	jaSasetaPast = "\xe3\x81\x95\xe3\x81\x9b\xe3\x81\x9f" // させた
	jaSetaPast = "\xe3\x81\x9b\xe3\x81\x9f" // せた

	// Causative-passive: させられる
	// (despite the "Sera..." names, both values carry the full させられ- sequence)
	jaSerareru = "\xe3\x81\x95\xe3\x81\x9b\xe3\x82\x89\xe3\x82\x8c\xe3\x82\x8b" // させられる
	jaSeraretaPast = "\xe3\x81\x95\xe3\x81\x9b\xe3\x82\x89\xe3\x82\x8c\xe3\x81\x9f" // させられた

	// Volitional compounds: お-row kana + う, one per godan class.
	jaKoU = "\xe3\x81\x93\xe3\x81\x86" // こう (godan-ku)
	jaGoU = "\xe3\x81\x94\xe3\x81\x86" // ごう (godan-gu)
	jaSoU = "\xe3\x81\x9d\xe3\x81\x86" // そう (godan-su)
	jaToU = "\xe3\x81\xa8\xe3\x81\x86" // とう (godan-tsu)
	jaNoU = "\xe3\x81\xae\xe3\x81\x86" // のう (godan-nu)
	jaBoU = "\xe3\x81\xbc\xe3\x81\x86" // ぼう (godan-bu)
	jaMoU = "\xe3\x82\x82\xe3\x81\x86" // もう (godan-mu)
	jaRoU = "\xe3\x82\x8d\xe3\x81\x86" // ろう (godan-ru)
	jaOU = "\xe3\x81\x8a\xe3\x81\x86" // おう (godan-u)
)
944
// stripJASuffix removes suffix from the end of word. It returns the leading
// part of word (a sub-slice, not a copy) and true when word ends in suffix
// and is strictly longer than it; otherwise it returns (nil, false).
func stripJASuffix(word, suffix []byte) ([]byte, bool) {
	cut := len(word) - len(suffix)
	// cut <= 0 means nothing would be left in front of the suffix.
	if cut <= 0 || string(word[cut:]) != string(suffix) {
		return nil, false
	}
	return word[:cut], true
}
955
956
957