package transdb import ( "git.smesh.lol/iskradb/lattice" "git.smesh.lol/transdb/fuzzy" ) // FormFromInline extracts the surface form stored in Record.Inline. // Byte 23 holds the inline length (0 = overflow, data in pool). func FormFromInline(rec *lattice.Record, pool []byte) string { n := int(rec.Inline[23]) if n > 0 && n <= 23 { return string(rec.Inline[:n]) } if rec.DataFile == 1 && rec.DataLen > 0 { end := rec.DataOff + rec.DataLen if int(end) <= len(pool) { return string(pool[rec.DataOff:end]) } } return "" } // SetFormOnRecord stores the surface form in Record.Inline (up to 23 bytes) // or overflows into pool when longer. func SetFormOnRecord(rec *lattice.Record, form string, pool *[]byte) { b := []byte(form) if len(b) <= 23 { copy(rec.Inline[:], b) rec.Inline[23] = byte(len(b)) rec.DataFile = 0 } else { copy(rec.Inline[:23], b[:23]) rec.Inline[23] = 0 rec.DataFile = 1 rec.DataOff = uint32(len(*pool)) rec.DataLen = uint32(len(b)) *pool = append(*pool, b...) } } // defaultBranchOrder uses actual branch indices (Bnoun=1, Bverb=3, Bmodifier=4). var defaultBranchOrder = [3]uint8{uint8(lattice.Bnoun), uint8(lattice.Bverb), uint8(lattice.Bmodifier)} // lookupByKey finds all translation candidates for a pre-computed key, // searching branches in the given order. func lookupByKey(tree *lattice.Tree, pool []byte, key lattice.Key, order [3]uint8) []string { var results []string for _, b := range order { ri := tree.LookupRecIdx(lattice.Branch(b), key) if ri == lattice.NullRec { continue } rec := tree.GetRecord(ri) if rec == nil { continue } if rec.Link[0] != lattice.NullRec { if dst := tree.GetRecord(rec.Link[0]); dst != nil { if form := FormFromInline(dst, pool); form != "" { results = appendUniq(results, form) } } } if rec.Link[1] != lattice.NullRec { if dst := tree.GetRecord(rec.Link[1]); dst != nil { if form := FormFromInline(dst, pool); form != "" { results = appendUniq(results, form) } } } break } return results } // jaRecordBranch returns the branch of the coord=0 JA record for tok, or 255 if not found. func jaRecordBranch(tree *lattice.Tree, tok string) uint8 { key := MakeKey(LangJA, 0, tok) for _, b := range ActiveBranches { if tree.LookupRecIdx(b, key) != lattice.NullRec { return uint8(b) } } return 255 } // LookupWord finds all translation candidates for a single word token (coord=0). func LookupWord(tree *lattice.Tree, pool []byte, word string, srcLang uint8) []string { return lookupByKey(tree, pool, MakeKey(srcLang, 0, word), defaultBranchOrder) } // LookupWordCtx finds translations using the 22-bit coordinate. // Tries each coordinate in the relaxation sequence (most specific → least specific). // For JA source, branch order is derived from the cooccurrence axis. func LookupWordCtx(tree *lattice.Tree, pool []byte, word string, srcLang uint8, coord uint64) []string { order := defaultBranchOrder if srcLang == LangJA { order = branchOrderJA(coord) } for _, c := range RelaxCoord(coord) { if results := lookupByKey(tree, pool, MakeKey(srcLang, c, word), order); len(results) > 0 { return results } } return nil } // jaRole constants for syntactic role assignment. const ( jaRoleNone = uint8(0) jaRoleSubj = uint8(1) // は が jaRoleObj = uint8(2) // を jaRoleVerb = uint8(3) jaRoleMisc = uint8(4) // everything else ) // jaRoleParticle maps particle strings to syntactic roles. // Only subject (は/が) and object (を) get specific roles; // other particles collapse to misc. var jaRoleParticle = map[string]uint8{ "は": jaRoleSubj, "が": jaRoleSubj, "を": jaRoleObj, } // Translate tokenizes text in srcLang and translates each token to dstLang. // For JA→EN, applies particle-based role assignment and SOV→SVO reordering. // Tokens with no translation are passed through unchanged. func Translate(tree *lattice.Tree, pool []byte, idx *fuzzy.DualIndex, text string, srcLang, dstLang uint8, verbose bool) string { var tokens []string switch srcLang { case LangEN: tokens = TokenizeEN(text) case LangJA: tokens = TokenizeJA(text, tree, verbose) default: tokens = TokenizeEN(text) } if srcLang == LangJA && dstLang == LangEN { return translateJAToEN(tree, pool, idx, tokens, verbose) } return translateTokens(tree, pool, idx, tokens, srcLang, dstLang, verbose) } // translateJAToEN handles JA→EN with two-zone SOV→SVO reordering. // // Zone split: は/が divides the sentence into subject zone and predicate zone. // Within the predicate zone, verb tokens are pulled to the front: // SUBJ_ZONE + VERB(s) + REST_OF_PRED_ZONE // // This preserves modifier attachment (天皇の歴史的責任感 stays together as the // subject) while achieving SVO word order for the core clause. func translateJAToEN(tree *lattice.Tree, pool []byte, idx *fuzzy.DualIndex, tokens []string, verbose bool) string { n := len(tokens) // isSkipToken: pure-hiragana particles and copulae get no EN output. isSkip := func(tok string) bool { if !isPureHiragana(tok) { return false } jaKey := MakeKey(LangJA, 0, tok) return tree.LookupRecIdx(lattice.Bmodifier, jaKey) != lattice.NullRec || jaFunctionWord[tok] } // lookupMorph returns the translation and MorphState for a JA token. // Uses RelaxCoord: tries most-specific coord first, falls back toward coord=0. lookupMorph := func(tok string, coord uint64) (string, uint8) { order := branchOrderJA(coord) for _, c := range RelaxCoord(coord) { key := MakeKey(LangJA, c, tok) for _, b := range order { ri := tree.LookupRecIdx(lattice.Branch(b), key) if ri == lattice.NullRec { continue } rec := tree.GetRecord(ri) if rec == nil { continue } state := GetMorphState(rec) if rec.Link[0] != lattice.NullRec { if dst := tree.GetRecord(rec.Link[0]); dst != nil { if form := FormFromInline(dst, pool); form != "" { return form, state } } } break } } return "", 0 } // translateTok: translate a single JA token using the 22-bit coordinate. // The coord encodes both cooccurrence context (prev/next word types) and // the morphological state inferred from the token's surface form. translateTok := func(i int, tok string) string { var prevType, nextType uint8 if i > 0 { prevType = POSTypeFor(POSForWord(tree, LangJA, tokens[i-1])) } if i+1 < n { nextType = POSTypeFor(POSForWord(tree, LangJA, tokens[i+1])) } morphState := uint64(inferMorphState(tok)) coord := PackCoord(0, 0, CoordCooccur(prevType, nextType), morphState, 0, 0, 0) if en, state := lookupMorph(tok, coord); en != "" { return applyMorphEN(en, state) } // Fuzzy fallback. if idx != nil { var corrected string var wasCorrected bool var candidates []string candidates, corrected, wasCorrected = FuzzyLookupWord(tree, pool, idx, tok, LangJA, 2) if verbose && wasCorrected { println("fuzzy:", tok, "→", corrected) } for _, c := range candidates { return applyMorphEN(c, 0) } _ = corrected } // verbStems fallback for forms not in lattice. if stems := verbStems(tok); len(stems) > 0 { for _, stem := range stems { stemCoord := PackCoord(0, 0, CoordCooccur(prevType, nextType), morphState, 0, 0, 0) if en, _ := lookupMorph(stem, stemCoord); en != "" { return applyMorphEN(en, uint8(morphState)) } } } return tok } // Find the first は/が boundary to split subject zone from predicate zone. // subjEnd is the index of the は/が particle itself. subjEnd := -1 for i, tok := range tokens { if tok == "は" || tok == "が" { if isPureHiragana(tok) { subjEnd = i break } } } // Translate all tokens in JA order, tagging each as subj/verb/pred. type word struct { en string isV bool } var subjWords, predVerbs, predRest []word for i, tok := range tokens { if isSkip(tok) { continue } en := translateTok(i, tok) if en == "" { continue } w := word{en, isJAVerb(tree, tok)} if subjEnd >= 0 && i < subjEnd { subjWords = append(subjWords, w) } else if w.isV { predVerbs = append(predVerbs, w) } else { predRest = append(predRest, w) } } // Emit: SUBJ + VERB + REST_OF_PRED (preserves modifier order within each zone). var out []byte first := true emit := func(en string) { if !first { out = append(out, ' ') } out = append(out, []byte(en)...) first = false } for _, w := range subjWords { emit(w.en) } for _, w := range predVerbs { emit(w.en) } for _, w := range predRest { emit(w.en) } return string(out) } // translateTokens handles EN→JA and same-language translation (no reordering). // For EN→JA: operator tokens ("did", "not", "apparently" etc.) accumulate // morphstate bits and are consumed without output; the next verb is looked up // at the resulting morphstate in the JA cluster. func translateTokens(tree *lattice.Tree, pool []byte, idx *fuzzy.DualIndex, tokens []string, srcLang, dstLang uint8, verbose bool) string { var out []byte pendingMorph := uint8(0) // accumulated operator bits waiting for a verb progressiveAux := uint8(0xFF) // 0xFF = none; otherwise tense bits from is/was/were subjectSemFlags := uint64(0) // semantic flags from subject nouns seen so far for i, tok := range tokens { // EN→JA: detect operator tokens (morphstate walk instructions). if srcLang == LangEN && dstLang == LangJA { if bits, ok := enOperators[tok]; ok { pendingMorph |= bits continue // operator consumed, no output } // Progressive auxiliary: "is/am/are/was/were" before a verb+ing. if tenseBits, ok := enProgressiveAux[tok]; ok { progressiveAux = tenseBits continue } // Detect "-ing" suffix on a verb when progressive aux is pending. if progressiveAux != 0xFF && len(tok) > 3 && tok[len(tok)-3:] == "ing" { pendingMorph |= (1 << 3) | progressiveAux // aspect + tense progressiveAux = 0xFF // Strip "ing" to get base verb for lookup. tok = tok[:len(tok)-3] } } var candidates []string corrected := tok var prevType, nextType uint8 if i > 0 { prevType = POSTypeFor(POSForWord(tree, srcLang, tokens[i-1])) } if i+1 < len(tokens) { nextType = POSTypeFor(POSForWord(tree, srcLang, tokens[i+1])) } // Accumulate semantic flags from subject nouns for verb disambiguation. // Read flags from the noun's base record DataFile (O(1), no coord scan). if srcLang == LangEN && dstLang == LangJA { curType := POSTypeFor(POSForWord(tree, srcLang, tok)) if curType == CooccurNominal { // it's a noun in the EN lattice key := MakeKey(LangEN, 0, tok) for _, b := range ActiveBranches { if ri := tree.LookupRecIdx(b, key); ri != lattice.NullRec { if rec := tree.GetRecord(ri); rec != nil { subjectSemFlags |= GetSemanticFromDataFile(rec) } break } } } } coord := PackCoord(subjectSemFlags, 0, CoordCooccur(prevType, nextType), 0, 0, 0, 0) if idx != nil { var wasCorrected bool candidates, corrected, wasCorrected = FuzzyLookupWord(tree, pool, idx, tok, srcLang, 2) if verbose && wasCorrected { println("fuzzy: corrected", tok, "→", corrected) } if len(candidates) > 0 && coord != 0 { if ctxCands := LookupWordCtx(tree, pool, corrected, srcLang, coord); len(ctxCands) > 0 { candidates = ctxCands } } } else { candidates = LookupWordCtx(tree, pool, tok, srcLang, coord) } var translated string // EN→JA: use lookupENToJA to get JA base + EN record's own MorphState. // Combine with pendingMorph (accumulated operator bits) for the target state. // Handles both synthetic ("sang" has MorphState=16) and analytical ("did"+"sing"). if srcLang == LangEN && dstLang == LangJA { jaBase, enMorphState := lookupENToJA(tree, pool, corrected, coord) targetState := pendingMorph | enMorphState if jaBase != "" && targetState != 0 { if targetForm := lookupJAAtMorphState(tree, pool, jaBase, targetState); targetForm != "" { translated = targetForm } else { translated = jaBase } pendingMorph = 0 } else if jaBase != "" { translated = jaBase pendingMorph = 0 } } if translated == "" { for _, c := range candidates { translated = c break } } if translated == "" { translated = tok } if len(out) > 0 && dstLang == LangEN { out = append(out, ' ') } out = append(out, []byte(translated)...) } return string(out) } // lookupENToJA finds the JA base form and the EN record's MorphState for a // given EN token. Tries the word as-is, then with "to " prefix (JMdict gloss // format). The MorphState on the EN record drives JA cluster navigation: // "sang" has MorphState=16 pointing to 歌う, so we navigate to 歌った. func lookupENToJA(tree *lattice.Tree, pool []byte, word string, coord uint64) (jaBase string, morphState uint8) { order := defaultBranchOrder for _, tryWord := range []string{word, "to " | word} { for _, c := range RelaxCoord(coord) { key := MakeKey(LangEN, c, tryWord) for _, b := range order { ri := tree.LookupRecIdx(lattice.Branch(b), key) if ri == lattice.NullRec { continue } rec := tree.GetRecord(ri) if rec == nil { continue } state := GetMorphState(rec) if rec.Link[0] == lattice.NullRec { break } dst := tree.GetRecord(rec.Link[0]) if dst == nil { break } if form := FormFromInline(dst, pool); form != "" { return form, state } break } } } return "", 0 } // enToJABase is the legacy wrapper used by the operator path. func enToJABase(tree *lattice.Tree, pool []byte, enWord string) string { base, _ := lookupENToJA(tree, pool, enWord, 0) return base } // lookupJAAtMorphState finds the surface form of jaBase at the given morphstate. // Uses stored verb class from Bcooccur (O(1)) when available; falls back to // trying each conjugation class in priority order (O(classes)). func lookupJAAtMorphState(tree *lattice.Tree, pool []byte, jaBase string, targetState uint8) string { tryForm := func(targetForm string) bool { if targetForm == "" { return false } key := MakeKey(LangJA, 0, targetForm) for _, b := range ActiveBranches { if tree.LookupRecIdx(lattice.Branch(b), key) != lattice.NullRec { return true } } return false } // Fast path: stored verb class from inflect.mx registration. // When the class is known, the computed form is authoritative — return it // even if not pre-stored in the lattice. if class, ok := GetVerbClass(tree, LangJA, jaBase); ok { if f := InflectJA(jaBase, class, targetState); f != "" { return f } } // Fallback: try each class in priority order (pre-inflect data or unknown class) classOrder := []string{ "v1", "v5k", "v5s", "v5m", "v5b", "v5r", "v5t", "v5u", "v5g", "v5n", "vs", "vk", } for _, class := range classOrder { forms := BuildVerbForms(jaBase, class) if len(forms) == 0 { continue } targetForm, ok := forms[targetState] if !ok || targetForm == "" { continue } if tryForm(targetForm) { return targetForm } } return "" } // TranslateWithClusters uses the five-stage cluster pipeline instead of // token-by-token translation. Falls back to Translate if lang descriptors // are not registered (lang-init not yet run). func TranslateWithClusters(tree *lattice.Tree, pool []byte, text string, srcLang, dstLang uint8, verbose bool) string { srcDesc, hasSrc := GetLangDesc(tree, srcLang) dstDesc, hasDst := GetLangDesc(tree, dstLang) if !hasSrc || !hasDst { if verbose { println("cluster: lang descriptors not registered, using token-by-token") } return Translate(tree, pool, nil, text, srcLang, dstLang, verbose) } var tokens []string switch srcLang { case LangEN: tokens = TokenizeEN(text) case LangJA: tokens = TokenizeJA(text, tree, verbose) default: tokens = TokenizeEN(text) } clusters := ParseClusters(tokens, tree, srcLang) for _, c := range clusters { TranslateCluster(c, tree, pool, srcLang, dstLang) } reordered := ReorderClusters(clusters, srcDesc.Order, dstDesc.Order) return InsertMarkers(reordered, dstDesc, dstLang) } // BuildWordIndex extracts all words from the lattice and builds BK-trees // for fuzzy matching. Call once after loading the DB. // Returns a *fuzzy.DualIndex with EN words in A and JA words in B. func BuildWordIndex(tree *lattice.Tree, pool []byte) *fuzzy.DualIndex { var enWords, jaWords []string for recIdx := range tree.RecKey { rec := tree.GetRecord(recIdx) if rec == nil { continue } form := FormFromInline(rec, pool) if form == "" { continue } switch Detect(form) { case LangEN: enWords = append(enWords, form) case LangJA: jaWords = append(jaWords, form) } } return fuzzy.NewDualIndex(fuzzy.Build(enWords), fuzzy.Build(jaWords)) } // FuzzyLookupWord attempts a translation with fuzzy fallback on exact miss. // Returns (translations, correctedForm, wasCorrected). func FuzzyLookupWord(tree *lattice.Tree, pool []byte, idx *fuzzy.DualIndex, word string, srcLang uint8, maxDist int) ([]string, string, bool) { results := LookupWord(tree, pool, word, srcLang) if len(results) > 0 { return results, word, false } if idx == nil { return nil, word, false } var matches []fuzzy.Match switch srcLang { case LangEN: matches = idx.SuggestA(word, maxDist, 3) case LangJA: matches = idx.SuggestB(word, maxDist, 3) } if len(matches) == 0 { return nil, word, false } best := matches[0].Word results = LookupWord(tree, pool, best, srcLang) if len(results) > 0 { return results, best, true } return nil, word, false } // stripTo removes a leading "to " from a JMdict verb gloss ("to eat" → "eat"). func stripTo(s string) string { if len(s) > 3 && s[:3] == "to " { return s[3:] } return s } // applyMorphEN maps a 5-bit MorphState onto EN tense/aspect/polarity markers. // Formality (bit1) has no EN grammatical effect. Evidentiality (bit0) → "apparently". // Strips JMdict "to " prefix before applying operators. func applyMorphEN(base string, state uint8) string { v := stripTo(base) // "to eat" → "eat" if state == 0 { return v } past := (state>>4)&1 == 1 // bit 4 prog := (state>>3)&1 == 1 // bit 3 neg := (state>>2)&1 == 1 // bit 2 evid := state&1 == 1 // bit 0 prefix := "" if evid { prefix = "apparently " } switch { case past && prog && neg: return prefix | "wasn't " | v | "ing" case past && prog: return prefix | "was " | v | "ing" case past && neg: return prefix | "didn't " | v case past: return prefix | "did " | v case prog && neg: return prefix | "isn't " | v | "ing" case prog: return prefix | "is " | v | "ing" case neg: return prefix | "don't " | v default: return prefix | v // polite present, no EN marker } } // enOperators maps EN words to the morphstate bits they set. // These are not content words — they are lattice walk operators. // bit 4 = tense(past), bit 3 = aspect(progressive), bit 2 = polarity(negative), // bit 0 = evidentiality(reported). var enOperators = map[string]uint8{ "did": 1 << 4, // past "didn't": (1 << 4) | (1 << 2), // past + negative "not": 1 << 2, // negative "don't": 1 << 2, "doesn't": 1 << 2, "wasn't": (1 << 4) | (1 << 3) | (1 << 2), "weren't": (1 << 4) | (1 << 3) | (1 << 2), "apparently": 1 << 0, // evidential "reportedly": 1 << 0, "supposedly": 1 << 0, "allegedly": 1 << 0, } // enProgressiveAuxiliary maps "is/are/am/was/were" to their tense bits. // Combined with an -ing verb, they set the aspect bit. var enProgressiveAux = map[string]uint8{ "is": 0, "are": 0, "am": 0, "was": 1 << 4, "were": 1 << 4, } // isJAVerb returns true if tok is a verb in the lattice, either as a dictionary // IsJAVerb exports isJAVerb for use by the propagation command. func IsJAVerb(tree *lattice.Tree, tok string) bool { return isJAVerb(tree, tok) } // form (Bverb) or as a conjugated form whose stem is a Bverb record. func isJAVerb(tree *lattice.Tree, tok string) bool { if jaRecordBranch(tree, tok) == uint8(lattice.Bverb) { return true } for _, stem := range verbStems(tok) { if tree.LookupRecIdx(lattice.Bverb, MakeKey(LangJA, 0, stem)) != lattice.NullRec { return true } } return false } // inferMorphState estimates the MorphState from a conjugated JA token's suffix. // Used as fallback when the form isn't in the lattice (verbStems path). func inferMorphState(tok string) uint8 { hs := func(suf string) bool { return len(tok) >= len(suf) && tok[len(tok)-len(suf):] == suf } // Progressive past if hs("ていなかった") || hs("でいなかった") { return MorphPastProgNeg } if hs("ていました") || hs("でいました") { return MorphPastProgPolite } if hs("ていた") || hs("でいた") { return MorphPastProgPlain } // Progressive present if hs("ていない") || hs("でいない") { return MorphPresProgNeg } if hs("ています") || hs("でいます") { return MorphPresProgPolite } if hs("ている") || hs("でいる") { return MorphPresProgPlain } // Past if hs("ませんでした") { return MorphPastNegPolite } if hs("なかった") { return MorphPastNegPlain } if hs("ました") { return MorphPastAffPolite } if hs("そうだ") { // reported: check if stem is past inner := tok[:len(tok)-len("そうだ")] if len(inner) > 0 { last := inner[len(inner)-3:] if last == "た" || last == "だ" { return MorphPastReported } } return MorphPresReported } if hs("た") || hs("だ") { return MorphPastAffPlain } // Present negative if hs("ません") { return MorphPresNegPolite } if hs("ない") { return MorphPresNegPlain } // Present polite if hs("ます") { return MorphPresAffPolite } return MorphPresAffPlain } // verbStems strips common Japanese conjugation suffixes and returns dictionary- // form candidates to try against the lattice. Longer suffixes checked first. // Returns nil if no suffix pattern recognized. func verbStems(tok string) []string { if len(tok) == 0 { return nil } hs := func(suf string) bool { return len(tok) > len(suf) && tok[len(tok)-len(suf):] == suf } st := func(suf string) string { return tok[:len(tok)-len(suf)] } // 9-byte (3-char) patterns if hs("ている") { s := st("ている") return []string{s | "る", s | "く"} } // 6-byte (2-char) patterns — godan sound changes if hs("いた") { return []string{st("いた") | "く"} } if hs("いだ") { return []string{st("いだ") | "ぐ"} } if hs("した") { s := st("した"); return []string{s | "す", s | "する"} } if hs("んだ") { s := st("んだ"); return []string{s | "む", s | "ぬ", s | "ぶ"} } if hs("った") { s := st("った"); return []string{s | "つ", s | "う", s | "る"} } if hs("いて") { return []string{st("いて") | "く"} } if hs("いで") { return []string{st("いで") | "ぐ"} } if hs("して") { s := st("して"); return []string{s | "す", s | "する"} } if hs("んで") { s := st("んで"); return []string{s | "む", s | "ぬ", s | "ぶ"} } if hs("って") { s := st("って"); return []string{s | "つ", s | "う", s | "る"} } if hs("ない") { s := st("ない"); return []string{s | "る", s | "う"} } // 3-byte (1-char) — ichidan plain past only. // bare て is a connective te-form (食べて+いる), NOT a standalone verb form; // including it causes the tokenizer to split 食べていた as 食べて+い+た. if hs("た") { return []string{st("た") | "る"} } return nil } // isPureHiragana returns true if every codepoint in s is in U+3040-U+309F (hiragana). // Particles are always pure hiragana; kanji-containing words are content words. func isPureHiragana(s string) bool { if len(s) == 0 { return false } for i := 0; i < len(s); { if i+2 >= len(s) { return false } // Hiragana block: U+3040-U+309F = E3 81 80 – E3 82 9F if s[i] != 0xE3 { return false } b1 := s[i+1] b2 := s[i+2] if b1 == 0x81 && b2 >= 0x80 { // U+3040-U+307F ✓ } else if b1 == 0x82 && b2 <= 0x9F { // U+3080-U+309F ✓ } else { return false } i += 3 } return true } // jaFunctionWord: particles, copulae, and auxiliaries that are structural // fork labels, not content. Includes entries removed from the lattice by the // IsFunction() filter at ingest (prt/cop/aux POS codes). var jaFunctionWord = map[string]bool{ // copulae and auxiliaries "だ": true, "です": true, "でした": true, "ない": true, "ぬ": true, "ん": true, "ます": true, "ません": true, "ました": true, // particles (no longer in lattice — removed by IsFunction filter) "は": true, "が": true, "を": true, "に": true, "で": true, "と": true, "も": true, "や": true, "か": true, "の": true, "から": true, "まで": true, "より": true, "など": true, "ね": true, "よ": true, "さ": true, "な": true, "わ": true, "ぞ": true, "ぜ": true, "て": true, "た": true, } // VerbLemma returns a single-step approximation of the dictionary form for // a JA verb surface form via verbStems. Used for morph-stats grouping. func VerbLemma(form string) string { stems := verbStems(form) if len(stems) > 0 { return stems[0] } return form } func appendUniq(s []string, v string) []string { for _, x := range s { if x == v { return s } } return append(s, v) }