translate.mx raw
1 package transdb
2
3 import (
4 "git.smesh.lol/iskradb/lattice"
5 "git.smesh.lol/transdb/fuzzy"
6 )
7
8 // FormFromInline extracts the surface form stored in Record.Inline.
9 // Byte 23 holds the inline length (0 = overflow, data in pool).
10 func FormFromInline(rec *lattice.Record, pool []byte) string {
11 n := int(rec.Inline[23])
12 if n > 0 && n <= 23 {
13 return string(rec.Inline[:n])
14 }
15 if rec.DataFile == 1 && rec.DataLen > 0 {
16 end := rec.DataOff + rec.DataLen
17 if int(end) <= len(pool) {
18 return string(pool[rec.DataOff:end])
19 }
20 }
21 return ""
22 }
23
24 // SetFormOnRecord stores the surface form in Record.Inline (up to 23 bytes)
25 // or overflows into pool when longer.
26 func SetFormOnRecord(rec *lattice.Record, form string, pool *[]byte) {
27 b := []byte(form)
28 if len(b) <= 23 {
29 copy(rec.Inline[:], b)
30 rec.Inline[23] = byte(len(b))
31 rec.DataFile = 0
32 } else {
33 copy(rec.Inline[:23], b[:23])
34 rec.Inline[23] = 0
35 rec.DataFile = 1
36 rec.DataOff = uint32(len(*pool))
37 rec.DataLen = uint32(len(b))
38 *pool = append(*pool, b...)
39 }
40 }
41
42 // defaultBranchOrder uses actual branch indices (Bnoun=1, Bverb=3, Bmodifier=4).
43 var defaultBranchOrder = [3]uint8{uint8(lattice.Bnoun), uint8(lattice.Bverb), uint8(lattice.Bmodifier)}
44
45 // lookupByKey finds all translation candidates for a pre-computed key,
46 // searching branches in the given order.
47 func lookupByKey(tree *lattice.Tree, pool []byte, key lattice.Key, order [3]uint8) []string {
48 var results []string
49 for _, b := range order {
50 ri := tree.LookupRecIdx(lattice.Branch(b), key)
51 if ri == lattice.NullRec {
52 continue
53 }
54 rec := tree.GetRecord(ri)
55 if rec == nil {
56 continue
57 }
58 if rec.Link[0] != lattice.NullRec {
59 if dst := tree.GetRecord(rec.Link[0]); dst != nil {
60 if form := FormFromInline(dst, pool); form != "" {
61 results = appendUniq(results, form)
62 }
63 }
64 }
65 if rec.Link[1] != lattice.NullRec {
66 if dst := tree.GetRecord(rec.Link[1]); dst != nil {
67 if form := FormFromInline(dst, pool); form != "" {
68 results = appendUniq(results, form)
69 }
70 }
71 }
72 break
73 }
74 return results
75 }
76
77 // jaRecordBranch returns the branch of the coord=0 JA record for tok, or 255 if not found.
78 func jaRecordBranch(tree *lattice.Tree, tok string) uint8 {
79 key := MakeKey(LangJA, 0, tok)
80 for _, b := range ActiveBranches {
81 if tree.LookupRecIdx(b, key) != lattice.NullRec {
82 return uint8(b)
83 }
84 }
85 return 255
86 }
87
88 // LookupWord finds all translation candidates for a single word token (coord=0).
89 func LookupWord(tree *lattice.Tree, pool []byte, word string, srcLang uint8) []string {
90 return lookupByKey(tree, pool, MakeKey(srcLang, 0, word), defaultBranchOrder)
91 }
92
93 // LookupWordCtx finds translations using the 22-bit coordinate.
94 // Tries each coordinate in the relaxation sequence (most specific → least specific).
95 // For JA source, branch order is derived from the cooccurrence axis.
96 func LookupWordCtx(tree *lattice.Tree, pool []byte, word string, srcLang uint8, coord uint64) []string {
97 order := defaultBranchOrder
98 if srcLang == LangJA {
99 order = branchOrderJA(coord)
100 }
101 for _, c := range RelaxCoord(coord) {
102 if results := lookupByKey(tree, pool, MakeKey(srcLang, c, word), order); len(results) > 0 {
103 return results
104 }
105 }
106 return nil
107 }
108
// Syntactic roles assigned to JA tokens. The values are consecutive uint8
// codes starting at zero.
const (
	jaRoleNone uint8 = iota
	jaRoleSubj       // は が
	jaRoleObj        // を
	jaRoleVerb
	jaRoleMisc // everything else
)
117
118 // jaRoleParticle maps particle strings to syntactic roles.
119 // Only subject (は/が) and object (を) get specific roles;
120 // other particles collapse to misc.
121 var jaRoleParticle = map[string]uint8{
122 "は": jaRoleSubj, "が": jaRoleSubj,
123 "を": jaRoleObj,
124 }
125
126 // Translate tokenizes text in srcLang and translates each token to dstLang.
127 // For JA→EN, applies particle-based role assignment and SOV→SVO reordering.
128 // Tokens with no translation are passed through unchanged.
129 func Translate(tree *lattice.Tree, pool []byte, idx *fuzzy.DualIndex,
130 text string, srcLang, dstLang uint8, verbose bool) string {
131
132 var tokens []string
133 switch srcLang {
134 case LangEN:
135 tokens = TokenizeEN(text)
136 case LangJA:
137 tokens = TokenizeJA(text, tree, verbose)
138 default:
139 tokens = TokenizeEN(text)
140 }
141
142 if srcLang == LangJA && dstLang == LangEN {
143 return translateJAToEN(tree, pool, idx, tokens, verbose)
144 }
145 return translateTokens(tree, pool, idx, tokens, srcLang, dstLang, verbose)
146 }
147
148 // translateJAToEN handles JA→EN with two-zone SOV→SVO reordering.
149 //
150 // Zone split: は/が divides the sentence into subject zone and predicate zone.
151 // Within the predicate zone, verb tokens are pulled to the front:
152 // SUBJ_ZONE + VERB(s) + REST_OF_PRED_ZONE
153 //
154 // This preserves modifier attachment (天皇の歴史的責任感 stays together as the
155 // subject) while achieving SVO word order for the core clause.
156 func translateJAToEN(tree *lattice.Tree, pool []byte, idx *fuzzy.DualIndex,
157 tokens []string, verbose bool) string {
158
159 n := len(tokens)
160
161 // isSkipToken: pure-hiragana particles and copulae get no EN output.
162 isSkip := func(tok string) bool {
163 if !isPureHiragana(tok) {
164 return false
165 }
166 jaKey := MakeKey(LangJA, 0, tok)
167 return tree.LookupRecIdx(lattice.Bmodifier, jaKey) != lattice.NullRec || jaFunctionWord[tok]
168 }
169
170 // lookupMorph returns the translation and MorphState for a JA token.
171 // Uses RelaxCoord: tries most-specific coord first, falls back toward coord=0.
172 lookupMorph := func(tok string, coord uint64) (string, uint8) {
173 order := branchOrderJA(coord)
174 for _, c := range RelaxCoord(coord) {
175 key := MakeKey(LangJA, c, tok)
176 for _, b := range order {
177 ri := tree.LookupRecIdx(lattice.Branch(b), key)
178 if ri == lattice.NullRec {
179 continue
180 }
181 rec := tree.GetRecord(ri)
182 if rec == nil {
183 continue
184 }
185 state := GetMorphState(rec)
186 if rec.Link[0] != lattice.NullRec {
187 if dst := tree.GetRecord(rec.Link[0]); dst != nil {
188 if form := FormFromInline(dst, pool); form != "" {
189 return form, state
190 }
191 }
192 }
193 break
194 }
195 }
196 return "", 0
197 }
198
199 // translateTok: translate a single JA token using the 22-bit coordinate.
200 // The coord encodes both cooccurrence context (prev/next word types) and
201 // the morphological state inferred from the token's surface form.
202 translateTok := func(i int, tok string) string {
203 var prevType, nextType uint8
204 if i > 0 {
205 prevType = POSTypeFor(POSForWord(tree, LangJA, tokens[i-1]))
206 }
207 if i+1 < n {
208 nextType = POSTypeFor(POSForWord(tree, LangJA, tokens[i+1]))
209 }
210 morphState := uint64(inferMorphState(tok))
211 coord := PackCoord(0, 0, CoordCooccur(prevType, nextType), morphState, 0, 0, 0)
212
213 if en, state := lookupMorph(tok, coord); en != "" {
214 return applyMorphEN(en, state)
215 }
216
217 // Fuzzy fallback.
218 if idx != nil {
219 var corrected string
220 var wasCorrected bool
221 var candidates []string
222 candidates, corrected, wasCorrected = FuzzyLookupWord(tree, pool, idx, tok, LangJA, 2)
223 if verbose && wasCorrected {
224 println("fuzzy:", tok, "→", corrected)
225 }
226 for _, c := range candidates {
227 return applyMorphEN(c, 0)
228 }
229 _ = corrected
230 }
231
232 // verbStems fallback for forms not in lattice.
233 if stems := verbStems(tok); len(stems) > 0 {
234 for _, stem := range stems {
235 stemCoord := PackCoord(0, 0, CoordCooccur(prevType, nextType), morphState, 0, 0, 0)
236 if en, _ := lookupMorph(stem, stemCoord); en != "" {
237 return applyMorphEN(en, uint8(morphState))
238 }
239 }
240 }
241 return tok
242 }
243
244 // Find the first は/が boundary to split subject zone from predicate zone.
245 // subjEnd is the index of the は/が particle itself.
246 subjEnd := -1
247 for i, tok := range tokens {
248 if tok == "は" || tok == "が" {
249 if isPureHiragana(tok) {
250 subjEnd = i
251 break
252 }
253 }
254 }
255
256 // Translate all tokens in JA order, tagging each as subj/verb/pred.
257 type word struct {
258 en string
259 isV bool
260 }
261 var subjWords, predVerbs, predRest []word
262
263 for i, tok := range tokens {
264 if isSkip(tok) {
265 continue
266 }
267 en := translateTok(i, tok)
268 if en == "" {
269 continue
270 }
271 w := word{en, isJAVerb(tree, tok)}
272 if subjEnd >= 0 && i < subjEnd {
273 subjWords = append(subjWords, w)
274 } else if w.isV {
275 predVerbs = append(predVerbs, w)
276 } else {
277 predRest = append(predRest, w)
278 }
279 }
280
281 // Emit: SUBJ + VERB + REST_OF_PRED (preserves modifier order within each zone).
282 var out []byte
283 first := true
284 emit := func(en string) {
285 if !first {
286 out = append(out, ' ')
287 }
288 out = append(out, []byte(en)...)
289 first = false
290 }
291 for _, w := range subjWords {
292 emit(w.en)
293 }
294 for _, w := range predVerbs {
295 emit(w.en)
296 }
297 for _, w := range predRest {
298 emit(w.en)
299 }
300 return string(out)
301 }
302
303 // translateTokens handles EN→JA and same-language translation (no reordering).
304 // For EN→JA: operator tokens ("did", "not", "apparently" etc.) accumulate
305 // morphstate bits and are consumed without output; the next verb is looked up
306 // at the resulting morphstate in the JA cluster.
307 func translateTokens(tree *lattice.Tree, pool []byte, idx *fuzzy.DualIndex,
308 tokens []string, srcLang, dstLang uint8, verbose bool) string {
309
310 var out []byte
311 pendingMorph := uint8(0) // accumulated operator bits waiting for a verb
312 progressiveAux := uint8(0xFF) // 0xFF = none; otherwise tense bits from is/was/were
313 subjectSemFlags := uint64(0) // semantic flags from subject nouns seen so far
314
315 for i, tok := range tokens {
316 // EN→JA: detect operator tokens (morphstate walk instructions).
317 if srcLang == LangEN && dstLang == LangJA {
318 if bits, ok := enOperators[tok]; ok {
319 pendingMorph |= bits
320 continue // operator consumed, no output
321 }
322 // Progressive auxiliary: "is/am/are/was/were" before a verb+ing.
323 if tenseBits, ok := enProgressiveAux[tok]; ok {
324 progressiveAux = tenseBits
325 continue
326 }
327 // Detect "-ing" suffix on a verb when progressive aux is pending.
328 if progressiveAux != 0xFF && len(tok) > 3 && tok[len(tok)-3:] == "ing" {
329 pendingMorph |= (1 << 3) | progressiveAux // aspect + tense
330 progressiveAux = 0xFF
331 // Strip "ing" to get base verb for lookup.
332 tok = tok[:len(tok)-3]
333 }
334 }
335
336 var candidates []string
337 corrected := tok
338
339 var prevType, nextType uint8
340 if i > 0 {
341 prevType = POSTypeFor(POSForWord(tree, srcLang, tokens[i-1]))
342 }
343 if i+1 < len(tokens) {
344 nextType = POSTypeFor(POSForWord(tree, srcLang, tokens[i+1]))
345 }
346
347 // Accumulate semantic flags from subject nouns for verb disambiguation.
348 // Read flags from the noun's base record DataFile (O(1), no coord scan).
349 if srcLang == LangEN && dstLang == LangJA {
350 curType := POSTypeFor(POSForWord(tree, srcLang, tok))
351 if curType == CooccurNominal { // it's a noun in the EN lattice
352 key := MakeKey(LangEN, 0, tok)
353 for _, b := range ActiveBranches {
354 if ri := tree.LookupRecIdx(b, key); ri != lattice.NullRec {
355 if rec := tree.GetRecord(ri); rec != nil {
356 subjectSemFlags |= GetSemanticFromDataFile(rec)
357 }
358 break
359 }
360 }
361 }
362 }
363
364 coord := PackCoord(subjectSemFlags, 0, CoordCooccur(prevType, nextType), 0, 0, 0, 0)
365
366 if idx != nil {
367 var wasCorrected bool
368 candidates, corrected, wasCorrected = FuzzyLookupWord(tree, pool, idx, tok, srcLang, 2)
369 if verbose && wasCorrected {
370 println("fuzzy: corrected", tok, "→", corrected)
371 }
372 if len(candidates) > 0 && coord != 0 {
373 if ctxCands := LookupWordCtx(tree, pool, corrected, srcLang, coord); len(ctxCands) > 0 {
374 candidates = ctxCands
375 }
376 }
377 } else {
378 candidates = LookupWordCtx(tree, pool, tok, srcLang, coord)
379 }
380
381 var translated string
382
383 // EN→JA: use lookupENToJA to get JA base + EN record's own MorphState.
384 // Combine with pendingMorph (accumulated operator bits) for the target state.
385 // Handles both synthetic ("sang" has MorphState=16) and analytical ("did"+"sing").
386 if srcLang == LangEN && dstLang == LangJA {
387 jaBase, enMorphState := lookupENToJA(tree, pool, corrected, coord)
388 targetState := pendingMorph | enMorphState
389 if jaBase != "" && targetState != 0 {
390 if targetForm := lookupJAAtMorphState(tree, pool, jaBase, targetState); targetForm != "" {
391 translated = targetForm
392 } else {
393 translated = jaBase
394 }
395 pendingMorph = 0
396 } else if jaBase != "" {
397 translated = jaBase
398 pendingMorph = 0
399 }
400 }
401
402 if translated == "" {
403 for _, c := range candidates {
404 translated = c
405 break
406 }
407 }
408 if translated == "" {
409 translated = tok
410 }
411 if len(out) > 0 && dstLang == LangEN {
412 out = append(out, ' ')
413 }
414 out = append(out, []byte(translated)...)
415 }
416 return string(out)
417 }
418
419 // lookupENToJA finds the JA base form and the EN record's MorphState for a
420 // given EN token. Tries the word as-is, then with "to " prefix (JMdict gloss
421 // format). The MorphState on the EN record drives JA cluster navigation:
422 // "sang" has MorphState=16 pointing to 歌う, so we navigate to 歌った.
423 func lookupENToJA(tree *lattice.Tree, pool []byte, word string, coord uint64) (jaBase string, morphState uint8) {
424 order := defaultBranchOrder
425 for _, tryWord := range []string{word, "to " | word} {
426 for _, c := range RelaxCoord(coord) {
427 key := MakeKey(LangEN, c, tryWord)
428 for _, b := range order {
429 ri := tree.LookupRecIdx(lattice.Branch(b), key)
430 if ri == lattice.NullRec {
431 continue
432 }
433 rec := tree.GetRecord(ri)
434 if rec == nil {
435 continue
436 }
437 state := GetMorphState(rec)
438 if rec.Link[0] == lattice.NullRec {
439 break
440 }
441 dst := tree.GetRecord(rec.Link[0])
442 if dst == nil {
443 break
444 }
445 if form := FormFromInline(dst, pool); form != "" {
446 return form, state
447 }
448 break
449 }
450 }
451 }
452 return "", 0
453 }
454
455 // enToJABase is the legacy wrapper used by the operator path.
456 func enToJABase(tree *lattice.Tree, pool []byte, enWord string) string {
457 base, _ := lookupENToJA(tree, pool, enWord, 0)
458 return base
459 }
460
461 // lookupJAAtMorphState finds the surface form of jaBase at the given morphstate.
462 // Uses stored verb class from Bcooccur (O(1)) when available; falls back to
463 // trying each conjugation class in priority order (O(classes)).
464 func lookupJAAtMorphState(tree *lattice.Tree, pool []byte, jaBase string, targetState uint8) string {
465 tryForm := func(targetForm string) bool {
466 if targetForm == "" {
467 return false
468 }
469 key := MakeKey(LangJA, 0, targetForm)
470 for _, b := range ActiveBranches {
471 if tree.LookupRecIdx(lattice.Branch(b), key) != lattice.NullRec {
472 return true
473 }
474 }
475 return false
476 }
477
478 // Fast path: stored verb class from inflect.mx registration.
479 // When the class is known, the computed form is authoritative — return it
480 // even if not pre-stored in the lattice.
481 if class, ok := GetVerbClass(tree, LangJA, jaBase); ok {
482 if f := InflectJA(jaBase, class, targetState); f != "" {
483 return f
484 }
485 }
486
487 // Fallback: try each class in priority order (pre-inflect data or unknown class)
488 classOrder := []string{
489 "v1", "v5k", "v5s", "v5m", "v5b", "v5r", "v5t", "v5u", "v5g", "v5n", "vs", "vk",
490 }
491 for _, class := range classOrder {
492 forms := BuildVerbForms(jaBase, class)
493 if len(forms) == 0 {
494 continue
495 }
496 targetForm, ok := forms[targetState]
497 if !ok || targetForm == "" {
498 continue
499 }
500 if tryForm(targetForm) {
501 return targetForm
502 }
503 }
504 return ""
505 }
506
507 // TranslateWithClusters uses the five-stage cluster pipeline instead of
508 // token-by-token translation. Falls back to Translate if lang descriptors
509 // are not registered (lang-init not yet run).
510 func TranslateWithClusters(tree *lattice.Tree, pool []byte, text string, srcLang, dstLang uint8, verbose bool) string {
511 srcDesc, hasSrc := GetLangDesc(tree, srcLang)
512 dstDesc, hasDst := GetLangDesc(tree, dstLang)
513 if !hasSrc || !hasDst {
514 if verbose {
515 println("cluster: lang descriptors not registered, using token-by-token")
516 }
517 return Translate(tree, pool, nil, text, srcLang, dstLang, verbose)
518 }
519
520 var tokens []string
521 switch srcLang {
522 case LangEN:
523 tokens = TokenizeEN(text)
524 case LangJA:
525 tokens = TokenizeJA(text, tree, verbose)
526 default:
527 tokens = TokenizeEN(text)
528 }
529
530 clusters := ParseClusters(tokens, tree, srcLang)
531 for _, c := range clusters {
532 TranslateCluster(c, tree, pool, srcLang, dstLang)
533 }
534 reordered := ReorderClusters(clusters, srcDesc.Order, dstDesc.Order)
535 return InsertMarkers(reordered, dstDesc, dstLang)
536 }
537
538 // BuildWordIndex extracts all words from the lattice and builds BK-trees
539 // for fuzzy matching. Call once after loading the DB.
540 // Returns a *fuzzy.DualIndex with EN words in A and JA words in B.
541 func BuildWordIndex(tree *lattice.Tree, pool []byte) *fuzzy.DualIndex {
542 var enWords, jaWords []string
543 for recIdx := range tree.RecKey {
544 rec := tree.GetRecord(recIdx)
545 if rec == nil {
546 continue
547 }
548 form := FormFromInline(rec, pool)
549 if form == "" {
550 continue
551 }
552 switch Detect(form) {
553 case LangEN:
554 enWords = append(enWords, form)
555 case LangJA:
556 jaWords = append(jaWords, form)
557 }
558 }
559 return fuzzy.NewDualIndex(fuzzy.Build(enWords), fuzzy.Build(jaWords))
560 }
561
562 // FuzzyLookupWord attempts a translation with fuzzy fallback on exact miss.
563 // Returns (translations, correctedForm, wasCorrected).
564 func FuzzyLookupWord(tree *lattice.Tree, pool []byte, idx *fuzzy.DualIndex,
565 word string, srcLang uint8, maxDist int) ([]string, string, bool) {
566
567 results := LookupWord(tree, pool, word, srcLang)
568 if len(results) > 0 {
569 return results, word, false
570 }
571 if idx == nil {
572 return nil, word, false
573 }
574
575 var matches []fuzzy.Match
576 switch srcLang {
577 case LangEN:
578 matches = idx.SuggestA(word, maxDist, 3)
579 case LangJA:
580 matches = idx.SuggestB(word, maxDist, 3)
581 }
582 if len(matches) == 0 {
583 return nil, word, false
584 }
585
586 best := matches[0].Word
587 results = LookupWord(tree, pool, best, srcLang)
588 if len(results) > 0 {
589 return results, best, true
590 }
591 return nil, word, false
592 }
593
// stripTo drops the leading "to " of a JMdict verb gloss ("to eat" → "eat").
// The prefix is removed only when something follows it, so a bare "to " is
// returned unchanged.
func stripTo(s string) string {
	const marker = "to "
	if len(s) <= len(marker) {
		return s
	}
	if s[:len(marker)] != marker {
		return s
	}
	return s[len(marker):]
}
601
602 // applyMorphEN maps a 5-bit MorphState onto EN tense/aspect/polarity markers.
603 // Formality (bit1) has no EN grammatical effect. Evidentiality (bit0) → "apparently".
604 // Strips JMdict "to " prefix before applying operators.
605 func applyMorphEN(base string, state uint8) string {
606 v := stripTo(base) // "to eat" → "eat"
607 if state == 0 {
608 return v
609 }
610 past := (state>>4)&1 == 1 // bit 4
611 prog := (state>>3)&1 == 1 // bit 3
612 neg := (state>>2)&1 == 1 // bit 2
613 evid := state&1 == 1 // bit 0
614
615 prefix := ""
616 if evid {
617 prefix = "apparently "
618 }
619 switch {
620 case past && prog && neg:
621 return prefix | "wasn't " | v | "ing"
622 case past && prog:
623 return prefix | "was " | v | "ing"
624 case past && neg:
625 return prefix | "didn't " | v
626 case past:
627 return prefix | "did " | v
628 case prog && neg:
629 return prefix | "isn't " | v | "ing"
630 case prog:
631 return prefix | "is " | v | "ing"
632 case neg:
633 return prefix | "don't " | v
634 default:
635 return prefix | v // polite present, no EN marker
636 }
637 }
638
// enOperators maps EN words to the morphstate bits they set.
// These are not content words — they are lattice walk operators.
// bit 4 = tense(past), bit 3 = aspect(progressive), bit 2 = polarity(negative),
// bit 0 = evidentiality(reported).
var enOperators = func() map[string]uint8 {
	const (
		past = 1 << 4
		prog = 1 << 3
		neg  = 1 << 2
		evid = 1 << 0
	)
	return map[string]uint8{
		"did":        past,
		"didn't":     past | neg,
		"not":        neg,
		"don't":      neg,
		"doesn't":    neg,
		"wasn't":     past | prog | neg,
		"weren't":    past | prog | neg,
		"apparently": evid,
		"reportedly": evid,
		"supposedly": evid,
		"allegedly":  evid,
	}
}()
656
// enProgressiveAux maps the auxiliaries "is/are/am/was/were" to their tense
// bits (0 for present, bit 4 for past). Combined with an -ing verb, they set
// the aspect bit.
var enProgressiveAux = func() map[string]uint8 {
	const (
		present = 0
		past    = 1 << 4
	)
	return map[string]uint8{
		"is":   present,
		"are":  present,
		"am":   present,
		"was":  past,
		"were": past,
	}
}()
663
664 // isJAVerb returns true if tok is a verb in the lattice, either as a dictionary
665 // IsJAVerb exports isJAVerb for use by the propagation command.
666 func IsJAVerb(tree *lattice.Tree, tok string) bool { return isJAVerb(tree, tok) }
667
668 // form (Bverb) or as a conjugated form whose stem is a Bverb record.
669 func isJAVerb(tree *lattice.Tree, tok string) bool {
670 if jaRecordBranch(tree, tok) == uint8(lattice.Bverb) {
671 return true
672 }
673 for _, stem := range verbStems(tok) {
674 if tree.LookupRecIdx(lattice.Bverb, MakeKey(LangJA, 0, stem)) != lattice.NullRec {
675 return true
676 }
677 }
678 return false
679 }
680
681 // inferMorphState estimates the MorphState from a conjugated JA token's suffix.
682 // Used as fallback when the form isn't in the lattice (verbStems path).
683 func inferMorphState(tok string) uint8 {
684 hs := func(suf string) bool {
685 return len(tok) >= len(suf) && tok[len(tok)-len(suf):] == suf
686 }
687 // Progressive past
688 if hs("ていなかった") || hs("でいなかった") { return MorphPastProgNeg }
689 if hs("ていました") || hs("でいました") { return MorphPastProgPolite }
690 if hs("ていた") || hs("でいた") { return MorphPastProgPlain }
691 // Progressive present
692 if hs("ていない") || hs("でいない") { return MorphPresProgNeg }
693 if hs("ています") || hs("でいます") { return MorphPresProgPolite }
694 if hs("ている") || hs("でいる") { return MorphPresProgPlain }
695 // Past
696 if hs("ませんでした") { return MorphPastNegPolite }
697 if hs("なかった") { return MorphPastNegPlain }
698 if hs("ました") { return MorphPastAffPolite }
699 if hs("そうだ") {
700 // reported: check if stem is past
701 inner := tok[:len(tok)-len("そうだ")]
702 if len(inner) > 0 {
703 last := inner[len(inner)-3:]
704 if last == "た" || last == "だ" { return MorphPastReported }
705 }
706 return MorphPresReported
707 }
708 if hs("た") || hs("だ") { return MorphPastAffPlain }
709 // Present negative
710 if hs("ません") { return MorphPresNegPolite }
711 if hs("ない") { return MorphPresNegPlain }
712 // Present polite
713 if hs("ます") { return MorphPresAffPolite }
714 return MorphPresAffPlain
715 }
716
717 // verbStems strips common Japanese conjugation suffixes and returns dictionary-
718 // form candidates to try against the lattice. Longer suffixes checked first.
719 // Returns nil if no suffix pattern recognized.
720 func verbStems(tok string) []string {
721 if len(tok) == 0 {
722 return nil
723 }
724 hs := func(suf string) bool {
725 return len(tok) > len(suf) && tok[len(tok)-len(suf):] == suf
726 }
727 st := func(suf string) string {
728 return tok[:len(tok)-len(suf)]
729 }
730 // 9-byte (3-char) patterns
731 if hs("ている") {
732 s := st("ている")
733 return []string{s | "る", s | "く"}
734 }
735 // 6-byte (2-char) patterns — godan sound changes
736 if hs("いた") { return []string{st("いた") | "く"} }
737 if hs("いだ") { return []string{st("いだ") | "ぐ"} }
738 if hs("した") { s := st("した"); return []string{s | "す", s | "する"} }
739 if hs("んだ") { s := st("んだ"); return []string{s | "む", s | "ぬ", s | "ぶ"} }
740 if hs("った") { s := st("った"); return []string{s | "つ", s | "う", s | "る"} }
741 if hs("いて") { return []string{st("いて") | "く"} }
742 if hs("いで") { return []string{st("いで") | "ぐ"} }
743 if hs("して") { s := st("して"); return []string{s | "す", s | "する"} }
744 if hs("んで") { s := st("んで"); return []string{s | "む", s | "ぬ", s | "ぶ"} }
745 if hs("って") { s := st("って"); return []string{s | "つ", s | "う", s | "る"} }
746 if hs("ない") { s := st("ない"); return []string{s | "る", s | "う"} }
747 // 3-byte (1-char) — ichidan plain past only.
748 // bare て is a connective te-form (食べて+いる), NOT a standalone verb form;
749 // including it causes the tokenizer to split 食べていた as 食べて+い+た.
750 if hs("た") { return []string{st("た") | "る"} }
751 return nil
752 }
753
// isPureHiragana returns true if s is non-empty and every codepoint in s is in
// U+3040-U+309F (hiragana). Particles are always pure hiragana;
// kanji-containing words are content words.
//
// The check is done on the raw UTF-8 bytes: every hiragana codepoint encodes
// as exactly three bytes, E3 81 80..BF (U+3040-U+307F) or E3 82 80..9F
// (U+3080-U+309F). Anything else, including malformed or truncated sequences,
// makes the result false.
func isPureHiragana(s string) bool {
	// A pure-hiragana string is a whole number of 3-byte sequences.
	if len(s) == 0 || len(s)%3 != 0 {
		return false
	}
	for i := 0; i < len(s); i += 3 {
		if s[i] != 0xE3 {
			return false
		}
		mid, last := s[i+1], s[i+2]
		// Bound the final byte on both sides so that bytes which are not
		// valid UTF-8 continuation bytes cannot pass as hiragana.
		switch {
		case mid == 0x81 && last >= 0x80 && last <= 0xBF:
			// U+3040-U+307F
		case mid == 0x82 && last >= 0x80 && last <= 0x9F:
			// U+3080-U+309F
		default:
			return false
		}
	}
	return true
}
781
// jaFunctionWord is the set of particles, copulae, and auxiliaries that are
// structural fork labels, not content. It includes entries removed from the
// lattice by the IsFunction() filter at ingest (prt/cop/aux POS codes).
var jaFunctionWord = func() map[string]bool {
	words := []string{
		// copulae and auxiliaries
		"だ", "です", "でした",
		"ない", "ぬ", "ん",
		"ます", "ません", "ました",
		// particles (no longer in lattice — removed by IsFunction filter)
		"は", "が", "を",
		"に", "で", "と",
		"も", "や", "か",
		"の", "から", "まで",
		"より", "など", "ね",
		"よ", "さ", "な",
		"わ", "ぞ", "ぜ",
		"て", "た",
	}
	set := make(map[string]bool, len(words))
	for _, w := range words {
		set[w] = true
	}
	return set
}()
800
801 // VerbLemma returns a single-step approximation of the dictionary form for
802 // a JA verb surface form via verbStems. Used for morph-stats grouping.
803 func VerbLemma(form string) string {
804 stems := verbStems(form)
805 if len(stems) > 0 {
806 return stems[0]
807 }
808 return form
809 }
810
// appendUniq appends v to s unless s already contains it, and returns the
// resulting slice. When v is already present s is returned as-is.
func appendUniq(s []string, v string) []string {
	for i := 0; i < len(s); i++ {
		if s[i] == v {
			return s
		}
	}
	return append(s, v)
}
819