render_ja.mx raw
1 package iskra
2
3 // RenderJA converts a Set (list of SetEntry) back to JA text.
4 // The renderer is deterministic: same Set always produces the same JA output.
5 //
6 // Strategy:
7 // - For non-verb slots: emit atom + particle (from Mark if preserved, else
8 // from Role default).
9 // - For verb slots: emit atom + conjugation suffix derived from Class + Morph.
10 // - Iteration order matches the Set order (preserves slot positions).
11 // RenderJADiscourse renders a multi-clause Discourse to JA text.
12 // Single-clause case renders identically to RenderJA(d[0].Set).
13 //
14 // Subordinating prefixes (もし for ClauseIf, なぜなら for ClauseBecause)
15 // attach to the subordinate clause itself. Peer connectives (、 for AND/OR/
16 // BUT) join adjacent clauses.
17 func RenderJADiscourse(d []Clause) string {
18 if len(d) == 0 {
19 return ""
20 }
21 out := ""
22 for i, c := range d {
23 if i > 0 {
24 switch c.Relation {
25 case ClauseAnd:
26 out = out | "、"
27 case ClauseOr:
28 out = out | "、または"
29 case ClauseBut:
30 out = out | "、しかし"
31 case ClauseIf, ClauseBecause:
32 out = out | "、"
33 default:
34 out = out | "、"
35 }
36 }
37 // Subordinating prefix on this clause.
38 switch c.Relation {
39 case ClauseIf:
40 out = out | "もし"
41 case ClauseBecause:
42 out = out | "なぜなら"
43 }
44 out = out | RenderJA(c.Set)
45 }
46 return out
47 }
48
// RenderJA emits the Set as JA text in SOV order with modifier-aware traversal.
//
// Four groups, emitted in order:
//  1. Top-level non-verb arguments (with their modifiers), except
//     subjects/topics that were paired with a predicate (groups 3 and 4).
//  2. Top-level verbs (with their modifiers).
//  3. Copular predicates (MKCop entries), each preceded by its paired
//     subject and rendered by emitJACopula.
//  4. Predicative adjectives (MKAdj entries), each preceded by its paired
//     subject and rendered by emitJAPredAdj.
//
// Any other entry whose Head is a valid index into set is recorded as a
// modifier of that head and is only emitted through the head's emit helper.
// MKCop/MKAdj entries are never recorded as modifiers; they render at the
// end of the clause.
func RenderJA(set []SetEntry) string {
	var topNonVerbs, topVerbs, copulas, adjs []int32
	mods := map[int32][]int32{} // non-COP/ADJ modifiers: headIdx -> [modIdx, ...]
	// Classify every entry: predicate (COP/ADJ), modifier of a head, top-level
	// verb, or top-level non-verb. The checks are ordered, so the first match wins.
	for i, e := range set {
		if e.ModKind == MKCop {
			copulas = append(copulas, i)
			continue
		}
		if e.ModKind == MKAdj {
			adjs = append(adjs, i)
			continue
		}
		// An in-range Head marks this entry as a modifier of set[Head].
		if e.Head >= 0 && int32(e.Head) < len(set) {
			mods[int32(e.Head)] = append(mods[int32(e.Head)], i)
			continue
		}
		if e.Role == HistVerb {
			topVerbs = append(topVerbs, i)
		} else {
			topNonVerbs = append(topNonVerbs, i)
		}
	}

	// Pair each copula/adj predicate with its nearest preceding subject.
	// Head indices are unreliable in flattened multi-clause sets, so we
	// assign subjects to predicates by proximity: each predicate claims
	// the closest unclaimed subject that precedes it in set order.
	// Copulas claim before adjectives because allPreds lists them first.
	predSubj := map[int32]int32{} // predicate idx -> subject idx
	usedSubj := map[int32]bool{}
	allPreds := []int32{:0:len(copulas)+len(adjs)}
	allPreds = append(allPreds, copulas...)
	allPreds = append(allPreds, adjs...)
	for _, pidx := range allPreds {
		best := -1
		// topNonVerbs is in ascending set order, so the last qualifying
		// candidate before pidx is the closest one.
		for _, sidx := range topNonVerbs {
			if sidx >= pidx {
				break
			}
			r := set[sidx].Role
			if (r == HistSubject || r == HistTopic) && !usedSubj[sidx] {
				best = sidx
			}
		}
		if best >= 0 {
			predSubj[pidx] = best
			usedSubj[best] = true
		}
	}

	var out []byte
	// sepIfNeeded inserts a single space between slots unless the output is
	// empty, already ends in a space, or ends with a marker-table entry.
	sepIfNeeded := func() {
		if len(out) > 0 && !endsWithJAParticle(out) && out[len(out)-1] != ' ' {
			out = append(out, ' ')
		}
	}
	// clauseSep separates consecutive predicates with a JA comma.
	clauseSep := func() {
		if len(out) > 0 {
			out = append(out, 0xe3, 0x80, 0x81) // 、
		}
	}
	// Group 1: non-verb arguments; subjects claimed by a predicate are
	// deferred so they render directly in front of that predicate.
	for _, idx := range topNonVerbs {
		if usedSubj[idx] {
			continue
		}
		sepIfNeeded()
		emitJANonVerbWithMods(&out, set, idx, mods)
	}
	// Group 2: verbs.
	for _, idx := range topVerbs {
		sepIfNeeded()
		emitJAVerbWithMods(&out, set, idx, mods)
	}
	// Group 3: copular predicates, each with its paired subject (if any).
	for i, idx := range copulas {
		if i > 0 {
			clauseSep()
		} else {
			sepIfNeeded()
		}
		if sidx, ok := predSubj[idx]; ok {
			emitJANonVerbWithMods(&out, set, sidx, mods)
		}
		emitJACopula(&out, set, idx, mods)
	}
	// Group 4: predicative adjectives; comma-separated from any preceding
	// copula or adjective predicate.
	for i, idx := range adjs {
		if i > 0 || len(copulas) > 0 {
			clauseSep()
		} else {
			sepIfNeeded()
		}
		if sidx, ok := predSubj[idx]; ok {
			emitJANonVerbWithMods(&out, set, sidx, mods)
		}
		emitJAPredAdj(&out, set, idx, mods)
	}
	return string(out)
}
154
155 // emitJAPredAdj emits a predicative i-adjective as the sentence-final
156 // predicate. The atom is normally the full i-adj form (面白い); when an
157 // EN→JA translation returns a stem without the い suffix (e.g. red→赤),
158 // append い to restore the predicative-adjective surface. Past/negative
159 // forms would replace い with かった/くない but only non-past is handled.
160 // Synthetic 3sg marker (◯) appended after the adjective.
161 func emitJAPredAdj(out *[]byte, set []SetEntry, idx int32, mods map[int32][]int32) {
162 emitJAModifiers(out, set, idx, mods)
163 e := set[idx]
164 *out = append(*out, []byte(e.Atom)...)
165 if !endsInIKana(e.Atom) {
166 *out = append(*out, 0xe3, 0x81, 0x84) // い
167 }
168 if e.Morph&Meta3Sg != 0 {
169 *out = append(*out, []byte(markerToJA()[Mk3Sg])...)
170 }
171 }
172
173 // emitJACopula emits a copular predicate: complement_atom + copula form +
174 // trailing morph markers. The morph markers MUST come AFTER the copula form,
175 // not between atom and copula, to avoid splitting the copula compound on
176 // re-extraction (学生◯だ would tokenize as [学生, ◯, だ] with だ as a
177 // separate slot; 学生だ◯ tokenizes as [学生だ, ◯] which strips cleanly).
178 //
179 // Locative variant: when the MKCop entry carries OblRole=ORLoc, emit as
180 // 〜の中にいる (existence-locative) instead of 〜だ (nominal copula).
181 func emitJACopula(out *[]byte, set []SetEntry, idx int32, mods map[int32][]int32) {
182 emitJAModifiers(out, set, idx, mods)
183 e := set[idx]
184 // Definiteness and plural markers belong with the noun (before copula).
185 *out = append(*out, []byte(e.Atom)...)
186 if e.Morph&MetaDefDef != 0 {
187 *out = append(*out, []byte(markerToJA()[MkDef])...)
188 }
189 if e.Morph&MetaNumPlural != 0 {
190 *out = append(*out, []byte(markerToJA()[MkPlural])...)
191 }
192 if e.OblRole == ORLoc {
193 // Locative-existence: 〜の中に + いる. 中 = E4 B8 AD, に = E3 81 AB,
194 // いる = E3 81 84 E3 82 8B.
195 *out = append(*out, 0xe3, 0x81, 0xae) // の
196 *out = append(*out, 0xe4, 0xb8, 0xad) // 中
197 *out = append(*out, 0xe3, 0x81, 0xab) // に
198 *out = append(*out, 0xe3, 0x81, 0x84, 0xe3, 0x82, 0x8b) // いる
199 } else {
200 *out = append(*out, []byte(jaCopulaForm(e.Morph))...)
201 }
202 // 3sg agreement marker AFTER the copula form so it doesn't split it.
203 if e.Morph&Meta3Sg != 0 {
204 *out = append(*out, []byte(markerToJA()[Mk3Sg])...)
205 }
206 }
207
208 func jaCopulaForm(morph uint16) string {
209 past := morph&MetaTensePast != 0
210 pol := morph&MetaFormalityPol != 0
211 switch {
212 case past && pol:
213 return "\xe3\x81\xa7\xe3\x81\x97\xe3\x81\x9f" // でした
214 case past:
215 return "\xe3\x81\xa0\xe3\x81\xa3\xe3\x81\x9f" // だった
216 case pol:
217 return "\xe3\x81\xa7\xe3\x81\x99" // です
218 default:
219 return "\xe3\x81\xa0" // だ
220 }
221 }
222
223 // emitJAModifiers emits pre-head modifiers (POSS, ATTR, ADV) of the entry at
224 // idx with the appropriate relation marker. MKCoord peers are emitted by the
225 // caller AFTER the head, so they're skipped here.
226 func emitJAModifiers(out *[]byte, set []SetEntry, idx int32, mods map[int32][]int32) {
227 for _, mIdx := range mods[idx] {
228 m := set[mIdx]
229 if m.ModKind == MKCoord {
230 continue
231 }
232 // MKRel: modifier is a verb forming a relative clause. JA renders the
233 // conjugated verb BEFORE the head noun with no particle between. The
234 // verb's morph drives the surface form; no synthetic morph markers
235 // (★/☆/◯) because those would split the conjugation boundary.
236 if m.ModKind == MKRel {
237 form := renderJAVerb(m.Atom, m.Class, m.Morph)
238 *out = append(*out, []byte(form)...)
239 continue
240 }
241 *out = append(*out, []byte(m.Atom)...)
242 emitMorphMarkersJA(out, m.Morph)
243 switch m.ModKind {
244 case MKPoss:
245 *out = append(*out, []byte(markerToJA()[MkNo])...) // の
246 case MKAttr:
247 // い-adjective sits directly before the head, no particle.
248 case MKAdv:
249 // Adverbial: space delimiter so tokenizer separates adv from verb.
250 *out = append(*out, ' ')
251 }
252 }
253 }
254
255 func emitJANonVerbWithMods(out *[]byte, set []SetEntry, idx int32, mods map[int32][]int32) {
256 emitJAModifiers(out, set, idx, mods)
257 e := set[idx]
258 *out = append(*out, []byte(e.Atom)...)
259 // Skip morph markers + particles when the atom is untranslated or
260 // foreign-language leak; those would be meaningless decorations on a
261 // placeholder. Particle is still emitted to preserve sentence shape.
262 if e.Atom == UntranslatedMarker || containsASCIIByte(e.Atom) {
263 // Still emit the particle so the sentence parses on round-trip.
264 particle := ""
265 if e.Mark != 0 && e.Mark <= 16 && e.Mark != MkTo {
266 particle = markerToJA()[e.Mark]
267 }
268 if particle == "" && e.OblRole != ORNone {
269 particle = oblRoleToJaParticle(e.OblRole)
270 }
271 if particle == "" {
272 particle = renderJAParticle(e.Role, e.Mark)
273 }
274 if particle != "" {
275 *out = append(*out, []byte(particle)...)
276 }
277 return
278 }
279 emitMorphMarkersJA(out, e.Morph)
280 // Coordination peers: emit と + peer-atom for each MKCoord modifier of
281 // this head. The peer's own morph markers stick to its atom; the head's
282 // particle is emitted AFTER all coord members so it scopes the whole group.
283 for _, mIdx := range mods[idx] {
284 if set[mIdx].ModKind != MKCoord {
285 continue
286 }
287 peer := set[mIdx]
288 *out = append(*out, []byte(markerToJA()[MkTo])...) // と
289 *out = append(*out, []byte(peer.Atom)...)
290 emitMorphMarkersJA(out, peer.Morph)
291 }
292 // Priority for particle selection: Mark first, then OblRole, then Role default.
293 // Mark==MkTo on the head is the coordination signal (consumed above), so
294 // fall through to OblRole or Role for the actual scope-particle.
295 var particle string
296 if e.Mark != 0 && e.Mark <= 16 && e.Mark != MkTo {
297 particle = markerToJA()[e.Mark]
298 }
299 if particle == "" && e.OblRole != ORNone {
300 particle = oblRoleToJaParticle(e.OblRole)
301 }
302 if particle == "" {
303 particle = renderJAParticle(e.Role, e.Mark)
304 }
305 if particle != "" {
306 *out = append(*out, []byte(particle)...)
307 }
308 }
309
310 func emitJAVerbWithMods(out *[]byte, set []SetEntry, idx int32, mods map[int32][]int32) {
311 emitJAModifiers(out, set, idx, mods)
312 e := set[idx]
313 form := renderJAVerb(e.Atom, e.Class, e.Morph)
314 *out = append(*out, []byte(form)...)
315 // 3sg marker emitted via emitMorphMarkersJA semantics, but verb-side
316 // places it AFTER the conjugated form rather than between atom and particle.
317 if e.Morph&Meta3Sg != 0 {
318 *out = append(*out, []byte(markerToJA()[Mk3Sg])...)
319 }
320 if isSentenceFinalParticle(e.Mark) {
321 *out = append(*out, []byte(markerToJAString(e.Mark))...)
322 }
323 }
324
325 // emitMorphMarkersJA appends synthetic morph markers (★/☆/◯) to the output
326 // for morph bits that have no native JA surface form. EN-only features like
327 // definiteness, plural, and 3sg agreement need explicit markers in JA to
328 // survive the round-trip.
329 func emitMorphMarkersJA(out *[]byte, morph uint16) {
330 if morph&MetaDefDef != 0 {
331 *out = append(*out, []byte(markerToJA()[MkDef])...)
332 }
333 if morph&MetaNumPlural != 0 {
334 *out = append(*out, []byte(markerToJA()[MkPlural])...)
335 }
336 if morph&Meta3Sg != 0 {
337 *out = append(*out, []byte(markerToJA()[Mk3Sg])...)
338 }
339 }
340
341 func isSentenceFinalParticle(mk uint8) bool {
342 switch mk {
343 case MkYo, MkNe, MkKa, MkYori, MkKedo, MkMo:
344 return true
345 }
346 return false
347 }
348
349 // renderJAParticle picks the JA particle for a slot.
350 // Priority: OblRole (semantic) > Mark (within-language preserved) > Role default.
351 func renderJAParticle(role int32, mark uint8) string {
352 if mark != 0 && mark <= 16 {
353 s := markerToJA()[mark]
354 if s != "" {
355 return s
356 }
357 }
358 switch role {
359 case HistTopic:
360 return "\xe3\x81\xaf" // は
361 case HistSubject:
362 return "\xe3\x81\x8c" // が
363 case HistObject:
364 return "\xe3\x82\x92" // を
365 case HistScope:
366 return "\xe3\x81\xab" // に
367 case HistModifier:
368 return "\xe3\x81\xa7" // で
369 case HistOperator:
370 return "\xe3\x81\xae" // の
371 case HistComplement:
372 return ""
373 }
374 return ""
375 }
376
377 // oblRoleToJaParticle maps an oblique semantic role to the canonical JA particle.
378 func oblRoleToJaParticle(or uint8) string {
379 switch or {
380 case ORGoal:
381 return "\xe3\x81\xab" // に (motion goal)
382 case ORLoc:
383 return "\xe3\x81\xa7" // で (location of action)
384 case ORSource:
385 return "\xe3\x81\x8b\xe3\x82\x89" // から
386 case ORLimit:
387 return "\xe3\x81\xbe\xe3\x81\xa7" // まで
388 case ORInstr:
389 return "\xe3\x81\xa7" // で (instrumental)
390 case ORComit:
391 return "\xe3\x81\xa8" // と
392 case ORBenef:
393 return "\xe3\x81\xab" // に (default; could be のために)
394 case ORAgent:
395 return "\xe3\x81\xab" // に (passive agent)
396 case ORRecip:
397 return "\xe3\x81\xab" // に (dative)
398 case ORPart:
399 return "\xe3\x81\xae" // の
400 case ORCompare:
401 return "\xe3\x82\x88\xe3\x82\x8a" // より
402 }
403 return ""
404 }
405
406 func endsWithJAParticle(b []byte) bool {
407 n := len(b)
408 if n < 3 {
409 return false
410 }
411 for i := 1; i < len(markerToJA()); i++ {
412 p := markerToJA()[i]
413 if p == "" {
414 continue
415 }
416 pb := []byte(p)
417 if n >= len(pb) && string(b[n-len(pb):]) == p {
418 return true
419 }
420 }
421 return false
422 }
423
424 func markerToJAString(mk uint8) string {
425 if mk >= 1 && mk <= 16 {
426 return markerToJA()[mk]
427 }
428 return ""
429 }
430
431 // renderJAVerb reconstructs the verb surface form from stem, class, and morph.
432 // Class=0 means "no class detected" - this happens routinely for atoms
433 // returned by cross-language translation (the lattice atom-link table
434 // carries the lemma but not its class). Recover by inferring class from
435 // the lemma's final character + a small lookup table of common verbs.
436 // Falling back to godan-ru as default for unknown kanji stems produces
437 // a parseable surface form that round-trips, even when the specific
438 // class is wrong.
439 func renderJAVerb(stem string, class uint8, morph uint16) string {
440 if stem == UntranslatedMarker {
441 return stem
442 }
443 if containsASCIIByte(stem) {
444 return stem
445 }
446 if class == 0 {
447 class = inferJAClass(stem)
448 }
449 if class == VClassBare {
450 b := []byte(stem)
451 if endsInIKana(stem) && len(b) > 3 && hasKanji(b) {
452 return renderJAIAdjVerbSlot(stem, morph)
453 }
454 return stem
455 }
456 if class == VClassSuru {
457 return conjugateSuru(stem, morph)
458 }
459 if class == VClassKuru {
460 return conjugateKuru(stem, morph)
461 }
462 // Godan dict-form lemmas (行く, 読む, etc.) include the u-row ending.
463 // Strip it before conjugating to avoid double-suffix (行く+く = 行くく).
464 stem = stripGodanDictEnding(stem, class)
465 connector, suffix := jaConjugate(class, morph)
466 return stem | connector | suffix
467 }
468
469 func renderJAIAdjVerbSlot(stem string, morph uint16) string {
470 b := []byte(stem)
471 root := string(b[:len(b)-3])
472 neg := morph&MetaPolarNeg != 0
473 past := morph&MetaTensePast != 0
474 if neg && past {
475 return root | "\xe3\x81\x8f\xe3\x81\xaa\xe3\x81\x8b\xe3\x81\xa3\xe3\x81\x9f" // くなかった
476 }
477 if neg {
478 return root | "\xe3\x81\x8f\xe3\x81\xaa\xe3\x81\x84" // くない
479 }
480 if past {
481 return root | "\xe3\x81\x8b\xe3\x81\xa3\xe3\x81\x9f" // かった
482 }
483 return stem
484 }
485
486 func conjugateSuru(stem string, morph uint16) string {
487 // stem is "する" for standalone, or "勉強する" for compounds.
488 // Strip する suffix to get the noun-stem (empty for standalone).
489 base := ""
490 sb := []byte(stem)
491 suruBytes := []byte("\xe3\x81\x99\xe3\x82\x8b") // する
492 if len(sb) > 6 && string(sb[len(sb)-6:]) == string(suruBytes) {
493 base = string(sb[:len(sb)-6])
494 }
495 past := morph&MetaTensePast != 0
496 prog := morph&MetaAspectProg != 0
497 neg := morph&MetaPolarNeg != 0
498 pol := morph&MetaFormalityPol != 0
499 pass := morph&MetaPassive != 0
500 caus := morph&MetaCausative != 0
501 if caus && pass {
502 if past {
503 return base | "\xe3\x81\x95\xe3\x81\x9b\xe3\x82\x89\xe3\x82\x8c\xe3\x81\x9f" // させられた
504 }
505 return base | "\xe3\x81\x95\xe3\x81\x9b\xe3\x82\x89\xe3\x82\x8c\xe3\x82\x8b" // させられる
506 }
507 if pass {
508 if past {
509 return base | "\xe3\x81\x95\xe3\x82\x8c\xe3\x81\x9f" // された
510 }
511 return base | "\xe3\x81\x95\xe3\x82\x8c\xe3\x82\x8b" // される
512 }
513 if caus {
514 if past {
515 return base | "\xe3\x81\x95\xe3\x81\x9b\xe3\x81\x9f" // させた
516 }
517 return base | "\xe3\x81\x95\xe3\x81\x9b\xe3\x82\x8b" // させる
518 }
519 if pol {
520 if past && neg {
521 return base | "\xe3\x81\x97\xe3\x81\xbe\xe3\x81\x9b\xe3\x82\x93\xe3\x81\xa7\xe3\x81\x97\xe3\x81\x9f" // しませんでした
522 }
523 if neg {
524 return base | "\xe3\x81\x97\xe3\x81\xbe\xe3\x81\x9b\xe3\x82\x93" // しません
525 }
526 if past {
527 return base | "\xe3\x81\x97\xe3\x81\xbe\xe3\x81\x97\xe3\x81\x9f" // しました
528 }
529 return base | "\xe3\x81\x97\xe3\x81\xbe\xe3\x81\x99" // します
530 }
531 if prog {
532 if past {
533 return base | "\xe3\x81\x97\xe3\x81\xa6\xe3\x81\x84\xe3\x81\x9f" // していた
534 }
535 return base | "\xe3\x81\x97\xe3\x81\xa6\xe3\x81\x84\xe3\x82\x8b" // している
536 }
537 if neg {
538 if past {
539 return base | "\xe3\x81\x97\xe3\x81\xaa\xe3\x81\x8b\xe3\x81\xa3\xe3\x81\x9f" // しなかった
540 }
541 return base | "\xe3\x81\x97\xe3\x81\xaa\xe3\x81\x84" // しない
542 }
543 if past {
544 return base | "\xe3\x81\x97\xe3\x81\x9f" // した
545 }
546 return base | "\xe3\x81\x99\xe3\x82\x8b" // する
547 }
548
549 func conjugateKuru(stem string, morph uint16) string {
550 past := morph&MetaTensePast != 0
551 prog := morph&MetaAspectProg != 0
552 neg := morph&MetaPolarNeg != 0
553 pol := morph&MetaFormalityPol != 0
554 if pol {
555 if past {
556 return "\xe6\x9d\xa5\xe3\x81\xbe\xe3\x81\x97\xe3\x81\x9f" // 来ました
557 }
558 if neg {
559 return "\xe6\x9d\xa5\xe3\x81\xbe\xe3\x81\x9b\xe3\x82\x93" // 来ません
560 }
561 return "\xe6\x9d\xa5\xe3\x81\xbe\xe3\x81\x99" // 来ます
562 }
563 if prog {
564 if past {
565 return "\xe6\x9d\xa5\xe3\x81\xa6\xe3\x81\x84\xe3\x81\x9f" // 来ていた
566 }
567 return "\xe6\x9d\xa5\xe3\x81\xa6\xe3\x81\x84\xe3\x82\x8b" // 来ている
568 }
569 if neg {
570 if past {
571 return "\xe6\x9d\xa5\xe3\x81\xaa\xe3\x81\x8b\xe3\x81\xa3\xe3\x81\x9f" // 来なかった
572 }
573 return "\xe6\x9d\xa5\xe3\x81\xaa\xe3\x81\x84" // 来ない
574 }
575 if past {
576 return "\xe6\x9d\xa5\xe3\x81\x9f" // 来た
577 }
578 return "\xe6\x9d\xa5\xe3\x82\x8b" // 来る
579 }
580
581 func stripGodanDictEnding(stem string, class uint8) string {
582 s := godanDictSuffix(class)
583 if s == "" {
584 return stem
585 }
586 b := []byte(stem)
587 sb := []byte(s)
588 if len(b) > len(sb) && string(b[len(b)-len(sb):]) == s {
589 return string(b[:len(b)-len(sb)])
590 }
591 return stem
592 }
593
// containsASCIIByte reports whether any byte of s lies in the ASCII range
// (below 0x80). The JA renderer uses it to refuse inflection on atoms that
// appear to be foreign (cross-language leakage); it is the JA-side
// counterpart of the EN renderer's isASCIIOnly guard. An empty string
// yields false.
func containsASCIIByte(s string) bool {
	for _, c := range []byte(s) {
		if c < 0x80 {
			return true
		}
	}
	return false
}
606
607 // inferJAClass guesses verb class from the lemma. Lemma is now in dict form
608 // for godan verbs (行く, 読む, etc.), so the trailing kana directly identifies
609 // the class. Ichidan lemmas keep the え/い-row stem (食べ, 見, 起き).
610 //
611 // Rules:
612 // 1. Ends in u-row kana (く,ぐ,す,つ,ぬ,ぶ,む,う) → godan of that class
613 // 2. Ends in る: preceding kana is え/い-row → ichidan, else → godan-ru
614 // 3. Ends in え/い-row (no る) → ichidan
615 // 4. Kanji-final → legacy bare-stem, check table or default godan-ru
616 func inferJAClass(lemma string) uint8 {
617 b := []byte(lemma)
618 if len(b) >= 6 && string(b[len(b)-6:]) == "\xe3\x81\x99\xe3\x82\x8b" { // ~する
619 return VClassSuru
620 }
621 if lemma == "\xe6\x9d\xa5\xe3\x82\x8b" { // 来る
622 return VClassKuru
623 }
624 if len(b) < 3 {
625 return VClassBare
626 }
627 last := string(b[len(b)-3:])
628 switch last {
629 case "\xe3\x81\x8f": // く
630 return VClassGodanKu
631 case "\xe3\x81\x90": // ぐ
632 return VClassGodanGu
633 case "\xe3\x81\x99": // す
634 return VClassGodanSu
635 case "\xe3\x81\xa4": // つ
636 return VClassGodanTsu
637 case "\xe3\x81\xac": // ぬ
638 return VClassGodanNu
639 case "\xe3\x81\xb6": // ぶ
640 return VClassGodanBu
641 case "\xe3\x82\x80": // む
642 return VClassGodanMu
643 case "\xe3\x81\x86": // う
644 return VClassGodanU
645 case "\xe3\x82\x8b": // る - ambiguous: ichidan or godan-ru
646 if len(b) >= 6 {
647 prev := string(b[len(b)-6 : len(b)-3])
648 if isERowOrIRowKana(prev) {
649 return VClassIchidan
650 }
651 }
652 return VClassGodanRu
653 }
654 if isERowOrIRowKana(last) {
655 return VClassIchidan
656 }
657 if c, ok := jaVerbClassTable()[lemma]; ok {
658 return c
659 }
660 return VClassBare
661 }
662
// jaVerbClassTable maps bare-kanji stems to verb class. Legacy fallback
// for atoms that reach inferJAClass without a dict-form ending (e.g.
// pre-existing data or non-verb tokens misclassified as verbs).
// The primary path now infers class from the lemma's trailing kana.
//
// Keys are single kanji written as UTF-8 byte escapes; the trailing comment
// on each line shows the character. Entries are grouped by class in the
// order ku, gu, su, tsu, nu, bu, mu, ru, u. A new map is built and returned
// on every call.
func jaVerbClassTable() map[string]uint8 {
	return map[string]uint8{
		"\xe8\xa1\x8c": VClassGodanKu, // 行
		"\xe6\x9b\xb8": VClassGodanKu, // 書
		"\xe8\x81\x9e": VClassGodanKu, // 聞
		"\xe9\x96\x8b": VClassGodanKu, // 開
		"\xe5\x83\x8d": VClassGodanKu, // 働
		"\xe7\xb6\x9a": VClassGodanKu, // 続
		"\xe6\xad\xa9": VClassGodanKu, // 歩
		"\xe7\x9d\x80": VClassGodanKu, // 着
		"\xe6\x8f\x8f": VClassGodanKu, // 描
		"\xe6\xb3\xb3": VClassGodanGu, // 泳
		"\xe6\x80\xa5": VClassGodanGu, // 急
		"\xe8\xa9\xb1": VClassGodanSu, // 話
		"\xe6\x8a\xbc": VClassGodanSu, // 押
		"\xe6\x8e\xa2": VClassGodanSu, // 探
		"\xe7\x9b\xb4": VClassGodanSu, // 直
		"\xe6\xb8\xa1": VClassGodanSu, // 渡
		"\xe6\xb6\x88": VClassGodanSu, // 消
		"\xe7\xab\x8b": VClassGodanTsu, // 立
		"\xe5\xbe\x85": VClassGodanTsu, // 待
		"\xe6\x8c\x81": VClassGodanTsu, // 持
		"\xe6\x89\x93": VClassGodanTsu, // 打
		"\xe5\x8b\x9d": VClassGodanTsu, // 勝
		"\xe8\x82\xb2": VClassGodanTsu, // 育
		"\xe6\xad\xbb": VClassGodanNu, // 死
		"\xe9\xa3\x9b": VClassGodanBu, // 飛
		"\xe5\x91\xbc": VClassGodanBu, // 呼
		"\xe9\x81\x8a": VClassGodanBu, // 遊
		"\xe7\xb5\x90": VClassGodanBu, // 結
		"\xe5\xad\xa6": VClassGodanBu, // 学
		"\xe5\x96\x9c": VClassGodanBu, // 喜
		"\xe8\xaa\xad": VClassGodanMu, // 読
		"\xe9\xa3\xb2": VClassGodanMu, // 飲
		"\xe4\xbd\x8f": VClassGodanMu, // 住
		"\xe9\x80\xb2": VClassGodanMu, // 進
		"\xe5\x8c\x85": VClassGodanMu, // 包
		"\xe4\xbc\x91": VClassGodanMu, // 休
		"\xe6\x9c\x9b": VClassGodanMu, // 望
		"\xe9\xa0\xbc": VClassGodanMu, // 頼
		"\xe6\x88\xbb": VClassGodanRu, // 戻
		"\xe7\x9f\xa5": VClassGodanRu, // 知
		"\xe8\xb5\xb0": VClassGodanRu, // 走
		"\xe5\x8f\x96": VClassGodanRu, // 取
		"\xe5\x88\x87": VClassGodanRu, // 切
		"\xe5\xa3\xb2": VClassGodanRu, // 売
		"\xe5\x85\xa5": VClassGodanRu, // 入
		"\xe5\xb8\xb0": VClassGodanRu, // 帰
		"\xe7\xb5\x82": VClassGodanRu, // 終
		"\xe5\xa7\x8b": VClassGodanRu, // 始
		"\xe6\xae\x8b": VClassGodanRu, // 残
		"\xe4\xb9\x97": VClassGodanRu, // 乗
		"\xe6\x8c\xaf": VClassGodanRu, // 振
		"\xe9\x80\x81": VClassGodanRu, // 送
		"\xe4\xbd\x9c": VClassGodanRu, // 作
		"\xe6\x80\x9d": VClassGodanU, // 思
		"\xe8\xa8\x80": VClassGodanU, // 言
		"\xe8\xb2\xb7": VClassGodanU, // 買
		"\xe4\xbd\xbf": VClassGodanU, // 使
		"\xe7\xac\x91": VClassGodanU, // 笑
		"\xe6\x89\x95": VClassGodanU, // 払
		"\xe9\xa1\x98": VClassGodanU, // 願
		"\xe6\xad\x8c": VClassGodanU, // 歌
		"\xe4\xbc\x9a": VClassGodanU, // 会
		"\xe8\xbf\xbd": VClassGodanU, // 追
		"\xe9\x80\x9a": VClassGodanU, // 通
		"\xe6\x95\x91": VClassGodanU, // 救
		"\xe9\x81\x95": VClassGodanU, // 違
		"\xe6\x88\xa6": VClassGodanU, // 戦
	}
}
738
739 // rawMorphSuffix emits a class-independent surface for morph bits.
740 // Used when class detection failed (class=0) - includes the case of a
741 // cross-language-translated atom that lost its JA class.
742 func rawMorphSuffix(morph uint16) string {
743 if morph&MetaMoodVol != 0 {
744 // Default to ichidan よう - the only class-neutral volitional form
745 // available without verb-class knowledge. Re-extraction recognizes
746 // よう as ichidan, so this round-trips even if not original-faithful.
747 return "\xe3\x82\x88\xe3\x81\x86" // よう
748 }
749 if morph&MetaPassive != 0 {
750 if morph&MetaTensePast != 0 {
751 return "\xe3\x82\x89\xe3\x82\x8c\xe3\x81\x9f" // られた
752 }
753 return "\xe3\x82\x89\xe3\x82\x8c\xe3\x82\x8b" // られる
754 }
755 if morph&MetaCausative != 0 {
756 if morph&MetaTensePast != 0 {
757 return "\xe3\x81\x95\xe3\x81\x9b\xe3\x81\x9f" // させた
758 }
759 return "\xe3\x81\x95\xe3\x81\x9b\xe3\x82\x8b" // させる
760 }
761 out := ""
762 if morph&MetaAspectProg != 0 {
763 if morph&MetaTensePast != 0 {
764 out = out | "\xe3\x81\xa6\xe3\x81\x84\xe3\x81\x9f" // ていた
765 } else {
766 out = out | "\xe3\x81\xa6\xe3\x81\x84\xe3\x82\x8b" // ている
767 }
768 return out
769 }
770 if morph&MetaPolarNeg != 0 {
771 if morph&MetaTensePast != 0 {
772 out = out | "\xe3\x81\xaa\xe3\x81\x8b\xe3\x81\xa3\xe3\x81\x9f" // なかった
773 } else {
774 out = out | "\xe3\x81\xaa\xe3\x81\x84" // ない
775 }
776 return out
777 }
778 if morph&MetaFormalityPol != 0 {
779 if morph&MetaTensePast != 0 {
780 return "\xe3\x81\xbe\xe3\x81\x97\xe3\x81\x9f" // ました
781 }
782 return "\xe3\x81\xbe\xe3\x81\x99" // ます
783 }
784 if morph&MetaTensePast != 0 {
785 return "\xe3\x81\x9f" // た
786 }
787 return ""
788 }
789
790 // jaConjugate returns (connector, suffix) for a verb of given class and morph.
791 // connector is the kana between stem and suffix (e.g. for ichidan: empty for past,
792 // for godan: i-row for polite, etc).
793 func jaConjugate(class uint8, morph uint16) (string, string) {
794 past := morph&MetaTensePast != 0
795 prog := morph&MetaAspectProg != 0
796 neg := morph&MetaPolarNeg != 0
797 pol := morph&MetaFormalityPol != 0
798 vol := morph&MetaMoodVol != 0
799 pass := morph&MetaPassive != 0
800 caus := morph&MetaCausative != 0
801
802 // Causative-passive (made to V): させられる
803 if caus && pass {
804 if past {
805 return jaCausPassPast(class)
806 }
807 return jaCausPass(class)
808 }
809 // Passive alone
810 if pass {
811 if past {
812 return jaPassivePast(class)
813 }
814 return jaPassive(class)
815 }
816 // Causative alone
817 if caus {
818 if past {
819 return jaCausativePast(class)
820 }
821 return jaCausative(class)
822 }
823
824 if vol {
825 return jaVolitional(class)
826 }
827
828 // Polite forms: ます, ました, ません
829 if pol {
830 istem := jaIStem(class)
831 if past && neg {
832 return istem, "\xe3\x81\xbe\xe3\x81\x9b\xe3\x82\x93\xe3\x81\xa7\xe3\x81\x97\xe3\x81\x9f" // ませんでした (rare; default to ません)
833 }
834 if neg {
835 return istem, "\xe3\x81\xbe\xe3\x81\x9b\xe3\x82\x93" // ません
836 }
837 if past {
838 return istem, "\xe3\x81\xbe\xe3\x81\x97\xe3\x81\x9f" // ました
839 }
840 return istem, "\xe3\x81\xbe\xe3\x81\x99" // ます
841 }
842
843 // Progressive: ている, ていた
844 if prog {
845 teconn, te := jaTeForm(class)
846 if past {
847 return teconn, te | "\xe3\x81\x84\xe3\x81\x9f" // ていた / でいた
848 }
849 return teconn, te | "\xe3\x81\x84\xe3\x82\x8b" // ている / でいる
850 }
851
852 // Negative
853 if neg {
854 nstem := jaNegStem(class)
855 if past {
856 return nstem, "\xe3\x81\xaa\xe3\x81\x8b\xe3\x81\xa3\xe3\x81\x9f" // なかった
857 }
858 return nstem, "\xe3\x81\xaa\xe3\x81\x84" // ない
859 }
860
861 // Past (plain)
862 if past {
863 return jaPastForm(class)
864 }
865
866 // Dict form (non-past affirmative plain)
867 return jaDictForm(class)
868 }
869
870 // jaDictForm returns the dict-form ending for each class.
871 // For ichidan, the connector (え/い-row) stays in the stem, suffix is る.
872 // For godan, the kanji-stem gets the u-row ending appended directly.
873 func jaDictForm(class uint8) (string, string) {
874 switch class {
875 case VClassIchidan:
876 return "", "\xe3\x82\x8b" // る
877 case VClassGodanKu:
878 return "", "\xe3\x81\x8f" // く
879 case VClassGodanGu:
880 return "", "\xe3\x81\x90" // ぐ
881 case VClassGodanSu:
882 return "", "\xe3\x81\x99" // す
883 case VClassGodanTsu:
884 return "", "\xe3\x81\xa4" // つ
885 case VClassGodanNu:
886 return "", "\xe3\x81\xac" // ぬ
887 case VClassGodanBu:
888 return "", "\xe3\x81\xb6" // ぶ
889 case VClassGodanMu:
890 return "", "\xe3\x82\x80" // む
891 case VClassGodanRu:
892 return "", "\xe3\x82\x8b" // る
893 case VClassGodanU:
894 return "", "\xe3\x81\x86" // う
895 }
896 return "", "\xe3\x82\x8b" // default る
897 }
898
899 // jaPastForm returns (connector, suffix) for past-tense plain form.
900 // Ichidan: stem + た. Godan: contracted forms.
901 func jaPastForm(class uint8) (string, string) {
902 switch class {
903 case VClassIchidan:
904 return "", "\xe3\x81\x9f" // た
905 case VClassGodanKu:
906 return "\xe3\x81\x84", "\xe3\x81\x9f" // いた
907 case VClassGodanGu:
908 return "\xe3\x81\x84", "\xe3\x81\xa0" // いだ
909 case VClassGodanSu:
910 return "\xe3\x81\x97", "\xe3\x81\x9f" // した
911 case VClassGodanTsu, VClassGodanRu, VClassGodanU:
912 return "\xe3\x81\xa3", "\xe3\x81\x9f" // った
913 case VClassGodanNu, VClassGodanBu, VClassGodanMu:
914 return "\xe3\x82\x93", "\xe3\x81\xa0" // んだ
915 }
916 return "", "\xe3\x81\x9f"
917 }
918
919 // jaTeForm returns (connector, te-form-suffix) - basically same as past but te/で.
920 func jaTeForm(class uint8) (string, string) {
921 switch class {
922 case VClassIchidan:
923 return "", "\xe3\x81\xa6" // て
924 case VClassGodanKu:
925 return "\xe3\x81\x84", "\xe3\x81\xa6" // いて
926 case VClassGodanGu:
927 return "\xe3\x81\x84", "\xe3\x81\xa7" // いで
928 case VClassGodanSu:
929 return "\xe3\x81\x97", "\xe3\x81\xa6" // して
930 case VClassGodanTsu, VClassGodanRu, VClassGodanU:
931 return "\xe3\x81\xa3", "\xe3\x81\xa6" // って
932 case VClassGodanNu, VClassGodanBu, VClassGodanMu:
933 return "\xe3\x82\x93", "\xe3\x81\xa7" // んで
934 }
935 return "", "\xe3\x81\xa6"
936 }
937
938 // jaIStem returns the i-row stem connector for polite forms.
939 func jaIStem(class uint8) string {
940 switch class {
941 case VClassIchidan:
942 return "" // ichidan stem connects directly
943 case VClassGodanKu:
944 return "\xe3\x81\x8d" // き
945 case VClassGodanGu:
946 return "\xe3\x81\x8e" // ぎ
947 case VClassGodanSu:
948 return "\xe3\x81\x97" // し
949 case VClassGodanTsu:
950 return "\xe3\x81\xa1" // ち
951 case VClassGodanNu:
952 return "\xe3\x81\xab" // に
953 case VClassGodanBu:
954 return "\xe3\x81\xb3" // び
955 case VClassGodanMu:
956 return "\xe3\x81\xbf" // み
957 case VClassGodanRu:
958 return "\xe3\x82\x8a" // り
959 case VClassGodanU:
960 return "\xe3\x81\x84" // い
961 }
962 return ""
963 }
964
965 // jaNegStem returns the a-row stem connector for negative forms.
966 func jaNegStem(class uint8) string {
967 switch class {
968 case VClassIchidan:
969 return ""
970 case VClassGodanKu:
971 return "\xe3\x81\x8b" // か
972 case VClassGodanGu:
973 return "\xe3\x81\x8c" // が
974 case VClassGodanSu:
975 return "\xe3\x81\x95" // さ
976 case VClassGodanTsu:
977 return "\xe3\x81\x9f" // た - wait, this collides with た suffix.
978 // Actually godan-tsu negative is た+ない = たない. But that conflicts with past suffix た.
979 // Use 立つ → 立たない (the あ-row of つ is た). Need to be careful.
980 case VClassGodanNu:
981 return "\xe3\x81\xaa" // な
982 case VClassGodanBu:
983 return "\xe3\x81\xb0" // ば
984 case VClassGodanMu:
985 return "\xe3\x81\xbe" // ま
986 case VClassGodanRu:
987 return "\xe3\x82\x89" // ら
988 case VClassGodanU:
989 return "\xe3\x82\x8f" // わ
990 }
991 return ""
992 }
993
994 // jaVolitional returns (connector, suffix) for volitional form (let's...).
995 func jaVolitional(class uint8) (string, string) {
996 switch class {
997 case VClassIchidan:
998 return "", "\xe3\x82\x88\xe3\x81\x86" // よう
999 case VClassGodanKu:
1000 return "\xe3\x81\x93", "\xe3\x81\x86" // こう
1001 case VClassGodanGu:
1002 return "\xe3\x81\x94", "\xe3\x81\x86" // ごう
1003 case VClassGodanSu:
1004 return "\xe3\x81\x9d", "\xe3\x81\x86" // そう
1005 case VClassGodanTsu:
1006 return "\xe3\x81\xa8", "\xe3\x81\x86" // とう
1007 case VClassGodanNu:
1008 return "\xe3\x81\xae", "\xe3\x81\x86" // のう
1009 case VClassGodanBu:
1010 return "\xe3\x81\xbc", "\xe3\x81\x86" // ぼう
1011 case VClassGodanMu:
1012 return "\xe3\x82\x82", "\xe3\x81\x86" // もう
1013 case VClassGodanRu:
1014 return "\xe3\x82\x8d", "\xe3\x81\x86" // ろう
1015 case VClassGodanU:
1016 return "\xe3\x81\x8a", "\xe3\x81\x86" // おう
1017 }
1018 return "", "\xe3\x82\x88\xe3\x81\x86"
1019 }
1020
1021 // jaPassive: ichidan stem + られる, godan a-row stem + れる.
1022 func jaPassive(class uint8) (string, string) {
1023 switch class {
1024 case VClassIchidan:
1025 return "", "\xe3\x82\x89\xe3\x82\x8c\xe3\x82\x8b" // られる
1026 }
1027 // Godan: a-row connector + れる
1028 return jaNegStem(class), "\xe3\x82\x8c\xe3\x82\x8b" // a-row + れる
1029 }
1030
1031 func jaPassivePast(class uint8) (string, string) {
1032 switch class {
1033 case VClassIchidan:
1034 return "", "\xe3\x82\x89\xe3\x82\x8c\xe3\x81\x9f" // られた
1035 }
1036 return jaNegStem(class), "\xe3\x82\x8c\xe3\x81\x9f" // a-row + れた
1037 }
1038
1039 // jaCausative: ichidan stem + させる, godan a-row stem + せる.
1040 func jaCausative(class uint8) (string, string) {
1041 switch class {
1042 case VClassIchidan:
1043 return "", "\xe3\x81\x95\xe3\x81\x9b\xe3\x82\x8b" // させる
1044 }
1045 return jaNegStem(class), "\xe3\x81\x9b\xe3\x82\x8b" // a-row + せる
1046 }
1047
1048 func jaCausativePast(class uint8) (string, string) {
1049 switch class {
1050 case VClassIchidan:
1051 return "", "\xe3\x81\x95\xe3\x81\x9b\xe3\x81\x9f" // させた
1052 }
1053 return jaNegStem(class), "\xe3\x81\x9b\xe3\x81\x9f" // a-row + せた
1054 }
1055
1056 // jaCausPass: ichidan stem + させられる, godan a-row + せられる.
1057 func jaCausPass(class uint8) (string, string) {
1058 switch class {
1059 case VClassIchidan:
1060 return "", "\xe3\x81\x95\xe3\x81\x9b\xe3\x82\x89\xe3\x82\x8c\xe3\x82\x8b" // させられる
1061 }
1062 return jaNegStem(class), "\xe3\x81\x9b\xe3\x82\x89\xe3\x82\x8c\xe3\x82\x8b" // a-row + せられる
1063 }
1064
1065 func jaCausPassPast(class uint8) (string, string) {
1066 switch class {
1067 case VClassIchidan:
1068 return "", "\xe3\x81\x95\xe3\x81\x9b\xe3\x82\x89\xe3\x82\x8c\xe3\x81\x9f" // させられた
1069 }
1070 return jaNegStem(class), "\xe3\x81\x9b\xe3\x82\x89\xe3\x82\x8c\xe3\x81\x9f" // a-row + せられた
1071 }
1072