package iskra // volClassFromSuffix returns the godan class implied by a volitional compound // ending. Returns VClassIchidan for よう (and for unknown patterns). func volClassFromSuffix(suffix []byte) uint8 { switch string(suffix) { case "\xe3\x81\x93\xe3\x81\x86": return VClassGodanKu case "\xe3\x81\x94\xe3\x81\x86": return VClassGodanGu case "\xe3\x81\x9d\xe3\x81\x86": return VClassGodanSu case "\xe3\x81\xa8\xe3\x81\x86": return VClassGodanTsu case "\xe3\x81\xae\xe3\x81\x86": return VClassGodanNu case "\xe3\x81\xbc\xe3\x81\x86": return VClassGodanBu case "\xe3\x82\x82\xe3\x81\x86": return VClassGodanMu case "\xe3\x82\x8d\xe3\x81\x86": return VClassGodanRu case "\xe3\x81\x8a\xe3\x81\x86": return VClassGodanU } return VClassIchidan } // LemmatizeJA reduces a JA verb surface form to its bare stem + morph + class. // The stem is consistent across all inflected forms of a verb (食べる/食べた/食べている → 食). // The class identifies the conjugation pattern needed to reconstruct forms. // // Verb class detection: // ichidan: stem ends in え/い-row hiragana (食べ, 起き). Dict-form: stem + る. // godan-X: stem ends in kanji or contracted/i-stem kana. Class indicates u-row ending. // // Non-verbs (isVerb=false) pass through unchanged. func LemmatizeJA(word string, isVerb bool) LemmaResult { if word == UntranslatedMarker { return LemmaResult{Lemma: word, Morph: 0, Class: 0} } if !isVerb { if stem, morph, ok := stripJAIAdj([]byte(word)); ok { return LemmaResult{Lemma: string(stem), Morph: morph, Class: VClassIAdj} } return LemmaResult{Lemma: word, Morph: 0, Class: 0} } b := []byte(word) if len(b) < 3 { return LemmaResult{Lemma: word, Morph: 0, Class: 0} } stem, morph, stripped := stripJAConjugation(b) class := detectJAClass(b, stem, stripped, morph) // Volitional: class can be inferred from the specific suffix matched // (こう/もう/etc. for godan, よう for ichidan). 
if morph&MetaMoodVol != 0 && stripped { suffixLen := len(b) - len(stem) if suffixLen >= 6 { class = volClassFromSuffix(b[len(b)-suffixLen:]) } } // Single-kana stem after stripping (e.g. bare っ from った/って) is a // tokenizer artifact, not a standalone verb. Pass through as-is. // Multi-kana stems (6+ bytes, 2+ characters) are legitimate ichidan // verb stems (くれ, つけ, かけ, etc.) and must not be rejected. if stripped && len(stem) <= 3 && isBareSuffix(stem) { return LemmaResult{Lemma: word, Morph: 0, Class: 0} } stem = canonicalizeJAStem(stem, stripped, class) if len(stem) == 0 { return LemmaResult{Lemma: word, Morph: morph, Class: class} } return LemmaResult{Lemma: string(stem), Morph: morph, Class: class} } // detectJAClass attempts to determine the verb class from the original surface // form and what suffix was stripped. func detectJAClass(orig, stem []byte, stripped bool, morph uint16) uint8 { if !stripped { // Dict form: look at last kana. 食べる → ichidan if pre-る is え/い-row. return classFromDictForm(orig) } // Suffix was stripped. Use residual to infer class. return classFromInflectedStem(stem, morph) } // classFromDictForm reads a dict-form verb (assumes no suffix stripping done). // Returns the class based on the last 1-2 kana. func classFromDictForm(b []byte) uint8 { if len(b) < 3 { return VClassNone } last := b[len(b)-3:] ls := string(last) switch ls { case "\xe3\x82\x8b": // る // Could be ichidan or godan-ru. Check preceding kana. 
if len(b) >= 6 { prev := string(b[len(b)-6 : len(b)-3]) if isERowOrIRowKana(prev) { return VClassIchidan } } return VClassGodanRu case "\xe3\x81\x8f": // く return VClassGodanKu case "\xe3\x81\x90": // ぐ return VClassGodanGu case "\xe3\x81\x99": // す return VClassGodanSu case "\xe3\x81\xa4": // つ return VClassGodanTsu case "\xe3\x81\xac": // ぬ return VClassGodanNu case "\xe3\x81\xb6": // ぶ return VClassGodanBu case "\xe3\x82\x80": // む return VClassGodanMu case "\xe3\x81\x86": // う return VClassGodanU } return VClassNone } // classFromInflectedStem guesses class from the stem after suffix removal. func classFromInflectedStem(stem []byte, morph uint16) uint8 { if len(stem) < 3 { return VClassNone } last := stem[len(stem)-3:] ls := string(last) // Negative or passive/causative: stem ends in あ-row kana for godan, or // the stem is a bare ichidan stem (え/い-row). // Negative: 走らない (godan-ru: 走ら). Passive: 噛まれた (godan-mu: 噛ま). // Causative: 待たせた (godan-tsu: 待た). if morph&(MetaPolarNeg|MetaPassive|MetaCausative) != 0 { if c := classFromARowKana(ls); c != VClassNone { return c } if isERowKana(ls) || isIRowKana(ls) { return VClassIchidan } return VClassNone } // Polite (ます-family stripped) leaves i-row kana for godan or bare stem for ichidan. if morph&MetaFormalityPol != 0 { if c := classFromIStem(ls); c != VClassIchidan { return c } if isERowKana(ls) || isIRowKana(ls) { return VClassIchidan } return VClassNone } // Te/ta/de/da stripped: contracted forms for godan, bare stem for ichidan. // っ contraction is ambiguous: godan-u (買う→買っ), godan-tsu (持つ→持っ), // godan-ru (知る→知っ). Disambiguate by looking up the kanji root in a // small table; default to godan-ru (most common っ-contracting class). 
if ls == "\xe3\x81\xa3" { // っ return classTsuContract(stem) } if ls == "\xe3\x82\x93" { // ん - godan nu/bu/mu voiced contraction return VClassGodanMu } if ls == "\xe3\x81\x84" { // い - godan ku/gu past return VClassGodanKu } if ls == "\xe3\x81\x97" { // し - godan su past return VClassGodanSu } // Bare stem ending in え/い-row → ichidan if isERowKana(ls) || isIRowKana(ls) { return VClassIchidan } return VClassNone } // classFromARowKana maps the あ-row kana found before ない to the godan class. func classFromARowKana(ls string) uint8 { switch ls { case "\xe3\x81\x8b": // か return VClassGodanKu case "\xe3\x81\x8c": // が return VClassGodanGu case "\xe3\x81\x95": // さ return VClassGodanSu case "\xe3\x81\x9f": // た return VClassGodanTsu case "\xe3\x81\xaa": // な return VClassGodanNu case "\xe3\x81\xb0": // ば return VClassGodanBu case "\xe3\x81\xbe": // ま return VClassGodanMu case "\xe3\x82\x89": // ら return VClassGodanRu case "\xe3\x82\x8f": // わ return VClassGodanU } return VClassNone } // classFromIStem maps an i-row kana to the corresponding godan class. func classFromIStem(ls string) uint8 { switch ls { case "\xe3\x81\x8d": // き return VClassGodanKu case "\xe3\x81\x8e": // ぎ return VClassGodanGu case "\xe3\x81\x97": // し return VClassGodanSu case "\xe3\x81\xa1": // ち return VClassGodanTsu case "\xe3\x81\xab": // に return VClassGodanNu case "\xe3\x81\xb3": // び return VClassGodanBu case "\xe3\x81\xbf": // み return VClassGodanMu case "\xe3\x82\x8a": // り return VClassGodanRu case "\xe3\x81\x84": // い (from godan-う i-stem like 買い) return VClassGodanU } return VClassIchidan } // classTsuContract disambiguates っ te-form contraction among godan-u, godan-tsu, // and godan-ru by kanji root lookup. Default: godan-ru (most frequent). func classTsuContract(stem []byte) uint8 { // Strip trailing っ (3 bytes: E3 81 A3) to get kanji root. 
root := stem if len(root) >= 3 { tail := root[len(root)-3:] if tail[0] == 0xe3 && tail[1] == 0x81 && tail[2] == 0xa3 { root = root[:len(root)-3] } } if len(root) == 0 { return VClassGodanRu } // Extract last rune (the kanji) for table lookup. // CJK kanji are 3-byte UTF-8 sequences (E0-EF range). var kanji string if len(root) >= 3 && root[len(root)-3] >= 0xe0 { kanji = string(root[len(root)-3:]) } else if len(root) >= 2 && root[len(root)-2] >= 0xc0 { kanji = string(root[len(root)-2:]) } else { kanji = string(root[len(root)-1:]) } switch kanji { // Godan-tsu (dict form ends in つ) case "\xe6\x8c\x81": // 持 (motsu) return VClassGodanTsu case "\xe7\xab\x8b": // 立 (tatsu) return VClassGodanTsu case "\xe5\xbe\x85": // 待 (matsu) return VClassGodanTsu case "\xe6\x89\x93": // 打 (utsu) return VClassGodanTsu case "\xe5\x8b\x9d": // 勝 (katsu) return VClassGodanTsu case "\xe8\x82\xb2": // 育 (sodatsu) return VClassGodanTsu case "\xe7\xb5\x8c": // 経 (tatsu - time passes) return VClassGodanTsu case "\xe4\xbf\x9d": // 保 (motsu - to last) return VClassGodanTsu case "\xe7\x99\xba": // 発 (tatsu - depart) return VClassGodanTsu case "\xe5\xbb\xba": // 建 (tatsu - build) return VClassGodanTsu // Godan-u (dict form ends in う) case "\xe8\xb2\xb7": // 買 (kau) return VClassGodanU case "\xe4\xbc\x9a": // 会 (au) return VClassGodanU case "\xe6\xad\x8c": // 歌 (utau) return VClassGodanU case "\xe8\xa8\x80": // 言 (iu) return VClassGodanU case "\xe4\xbd\xbf": // 使 (tsukau) return VClassGodanU case "\xe6\x89\x95": // 払 (harau) return VClassGodanU case "\xe6\xb4\x97": // 洗 (arau) return VClassGodanU case "\xe7\xac\x91": // 笑 (warau) return VClassGodanU case "\xe9\x81\x95": // 違 (chigau) return VClassGodanU case "\xe6\x80\x9d": // 思 (omou) return VClassGodanU case "\xe8\xbf\xbd": // 追 (ou) return VClassGodanU case "\xe8\xaa\x98": // 誘 (sasou) return VClassGodanU case "\xe6\x8b\xbe": // 拾 (hirou) return VClassGodanU case "\xe9\xa3\xbc": // 飼 (kau - keep/raise) return VClassGodanU case "\xe5\x90\xb8": // 
吸 (suu) return VClassGodanU case "\xe5\x90\x88": // 合 (au - match) return VClassGodanU case "\xe6\x95\x91": // 救 (sukuu) return VClassGodanU case "\xe6\x8b\x89": // 拉 (hirau - not standard, but appears) return VClassGodanU } // Default: godan-ru (知る, 取る, 送る, 作る, 乗る, 売る, etc.) return VClassGodanRu } // canonicalizeJAStem normalizes the stem to its canonical form per class. // For ichidan: keep the え/い-row kana (食べ stays 食べ). // For godan: canonical form is the dict form (kanji root + u-row ending). // Dict form 行く: keep as-is. // Inflected 行った: strip to kanji root, re-append く. // This preserves lexical identity: 行く (iku, go) vs 行う (okonau, do) // produce distinct lemmas instead of both collapsing to 行. func canonicalizeJAStem(stem []byte, stripped bool, class uint8) []byte { if class == VClassIchidan { if !stripped { return stripTrailingHiragana(stem) } return stem // 食べ is already canonical for ichidan } // Godan: strip to kanji root, then append the dict-form ending. root := stem if !stripped { root = stripTrailingHiragana(stem) } else { root = stripAllTrailingHiragana(stem) } suffix := godanDictSuffix(class) if suffix == "" { return root } out := []byte{:0:len(root) + 3} out = append(out, root...) out = append(out, []byte(suffix)...) return out } func godanDictSuffix(class uint8) string { switch class { case VClassGodanKu: return "\xe3\x81\x8f" // く case VClassGodanGu: return "\xe3\x81\x90" // ぐ case VClassGodanSu: return "\xe3\x81\x99" // す case VClassGodanTsu: return "\xe3\x81\xa4" // つ case VClassGodanNu: return "\xe3\x81\xac" // ぬ case VClassGodanBu: return "\xe3\x81\xb6" // ぶ case VClassGodanMu: return "\xe3\x82\x80" // む case VClassGodanRu: return "\xe3\x82\x8b" // る case VClassGodanU: return "\xe3\x81\x86" // う } return "" } // stripTrailingHiragana removes exactly one trailing hiragana char. 
func stripTrailingHiragana(b []byte) []byte { if len(b) < 3 { return b } tail := b[len(b)-3:] if tail[0] == 0xe3 && (tail[1] == 0x81 || tail[1] == 0x82) { return b[:len(b)-3] } return b } // stripAllTrailingHiragana removes all trailing hiragana, leaving at least 3 bytes. func stripAllTrailingHiragana(b []byte) []byte { for len(b) > 3 { tail := b[len(b)-3:] if tail[0] == 0xe3 && (tail[1] == 0x81 || tail[1] == 0x82) { b = b[:len(b)-3] } else { break } } return b } // isBareSuffix returns true when the stem after conjugation stripping is all // hiragana/katakana with no kanji root. Such stems come from tokenizer splits // (思った -> 思 + った) and shouldn't be canonicalized as standalone verbs. func isBareSuffix(stem []byte) bool { for i := 0; i < len(stem); { if i+3 <= len(stem) && stem[i] == 0xe3 { b1 := stem[i+1] if b1 >= 0x81 && b1 <= 0x83 { // hiragana U+3040-U+309F, katakana U+30A0-U+30FF i += 3 continue } } return false // non-kana byte found = has kanji root } return len(stem) > 0 } func isERowKana(s string) bool { // え-row hiragana: え (0x81 0x88), け (0x81 0x91), せ (0x81 0x9b), て (0x81 0xa6), // ね (0x81 0xad), へ (0x81 0xb8), め (0x82 0x81), れ (0x82 0x8c), ゑ (0x82 0x91) // plus voiced: げ (0x81 0x92), ぜ (0x81 0x9c), で (0x81 0xa7), べ (0x81 0xb9), ぺ (0x81 0xba) if len(s) != 3 || s[0] != 0xe3 { return false } switch s { case "\xe3\x81\x88", "\xe3\x81\x91", "\xe3\x81\x9b", "\xe3\x81\xa6", "\xe3\x81\xad", "\xe3\x81\xb8", "\xe3\x82\x81", "\xe3\x82\x8c", "\xe3\x81\x92", "\xe3\x81\x9c", "\xe3\x81\xa7", "\xe3\x81\xb9", "\xe3\x81\xba": return true } return false } func isIRowKana(s string) bool { if len(s) != 3 || s[0] != 0xe3 { return false } switch s { case "\xe3\x81\x84", "\xe3\x81\x8d", "\xe3\x81\x97", "\xe3\x81\xa1", "\xe3\x81\xab", "\xe3\x81\xb2", "\xe3\x81\xbf", "\xe3\x82\x8a", "\xe3\x81\x8e", "\xe3\x81\x98", "\xe3\x81\xa2", "\xe3\x81\xb3", "\xe3\x81\xb4": return true } return false } func isERowOrIRowKana(s string) bool { return isERowKana(s) || isIRowKana(s) } // 
stripJACopula identifies sentence-final copula forms on a noun-final token // (彼は学生だ, 学生でした). Returns (atom-with-copula-stripped, true, morph-bits) // or (input, false, 0) if no copula form found. // // Copula forms (longest first to avoid premature short matches): // でした → past + polite // だった → past + plain // です → non-past + polite // だ → non-past + plain // // Caller must verify this is NOT a verb context (verb past-tense た/だ is a // separate suffix on conjugated verbs; this only fires on noun-final tokens). func stripJACopula(word string) (string, bool, uint16) { b := []byte(word) // でした = E3 81 A7 E3 81 97 E3 81 9F (9 bytes) deshita := []byte("\xe3\x81\xa7\xe3\x81\x97\xe3\x81\x9f") if len(b) > len(deshita) && string(b[len(b)-len(deshita):]) == string(deshita) { return string(b[:len(b)-len(deshita)]), true, MetaTensePast | MetaFormalityPol } // だった = E3 81 A0 E3 81 A3 E3 81 9F (9 bytes) datta := []byte("\xe3\x81\xa0\xe3\x81\xa3\xe3\x81\x9f") if len(b) > len(datta) && string(b[len(b)-len(datta):]) == string(datta) { return string(b[:len(b)-len(datta)]), true, MetaTensePast } // です = E3 81 A7 E3 81 99 (6 bytes) desu := []byte("\xe3\x81\xa7\xe3\x81\x99") if len(b) > len(desu) && string(b[len(b)-len(desu):]) == string(desu) { return string(b[:len(b)-len(desu)]), true, MetaFormalityPol } // だ = E3 81 A0 (3 bytes). Bare だ is ambiguous with verb past contractions // (噛んだ, 飛んだ, 死んだ for godan-mu/bu/nu). Only treat as copula if the // preceding character is not a verb-contraction signal kana (ん or っ). da := []byte("\xe3\x81\xa0") if len(b) > len(da) && string(b[len(b)-len(da):]) == string(da) { stem := b[:len(b)-len(da)] if len(stem) >= 3 { tail := stem[len(stem)-3:] // ん is E3 82 93, っ is E3 81 A3 - verb past contractions. if string(tail) == "\xe3\x82\x93" || string(tail) == "\xe3\x81\xa3" { return word, false, 0 } } return string(stem), true, 0 } return word, false, 0 } // stripJAIAdj strips i-adjective inflection suffixes and returns the kanji // stem. 
Returns (stem, morph, true) on success. Requires the stem to contain // at least one kanji character to avoid false positives on pure-kana words. // // Forms (longest first): // くなかった -> stem, neg+past (赤くなかった -> 赤) // かった -> stem, past (赤かった -> 赤) // くない -> stem, neg (赤くない -> 赤) // ければ -> stem, conditional (赤ければ -> 赤) // くて -> stem, te-form (赤くて -> 赤) // く -> stem, adverbial (赤く -> 赤) // さ -> stem, nominal (赤さ -> 赤) // い -> stem, dict (赤い -> 赤) func stripJAIAdj(b []byte) ([]byte, uint16, bool) { var stem []byte var morph uint16 // Only strip compound suffixes that are unambiguously i-adjective. // Bare い/く/さ create false positives on godan-ku verbs (行く,書く) // and other non-adjective words. // くなかった = く(E3 81 8F) + な(E3 81 AA) + か(E3 81 8B) + っ(E3 81 A3) + た(E3 81 9F) = 15 bytes if len(b) > 15 && hasSuffix(b, "\xe3\x81\x8f\xe3\x81\xaa\xe3\x81\x8b\xe3\x81\xa3\xe3\x81\x9f") { stem = b[:len(b)-15] morph = MetaPolarNeg | MetaTensePast } else if len(b) > 12 && hasSuffix(b, "\xe3\x81\x8b\xe3\x81\xa3\xe3\x81\x9f") { // かった = か(E3 81 8B) + っ(E3 81 A3) + た(E3 81 9F) = 9 bytes stem = b[:len(b)-9] morph = MetaTensePast } else if len(b) > 9 && hasSuffix(b, "\xe3\x81\x8f\xe3\x81\xaa\xe3\x81\x84") { // くない = く(E3 81 8F) + な(E3 81 AA) + い(E3 81 84) = 9 bytes stem = b[:len(b)-9] morph = MetaPolarNeg } else if len(b) > 9 && hasSuffix(b, "\xe3\x81\x91\xe3\x82\x8c\xe3\x81\xb0") { // ければ = け(E3 81 91) + れ(E3 82 8C) + ば(E3 81 B0) = 9 bytes stem = b[:len(b)-9] morph = 0 } else if len(b) > 6 && hasSuffix(b, "\xe3\x81\x8f\xe3\x81\xa6") { // くて = く(E3 81 8F) + て(E3 81 A6) = 6 bytes stem = b[:len(b)-6] morph = MetaAspectProg } else { return nil, 0, false } if len(stem) < 3 { return nil, 0, false } if !hasKanji(stem) { return nil, 0, false } return stem, morph, true } func hasKanji(b []byte) bool { for i := 0; i+2 < len(b); { if b[i] >= 0xe4 && b[i] <= 0xe9 { return true } if b[i] < 0x80 { i++ } else if b[i] < 0xe0 { i += 2 } else if b[i] < 0xf0 { i += 3 } else { i += 4 } } return false } // 
endsInIKana returns true if the last UTF-8 character of s is the hiragana い. // Used as a heuristic for i-adjective detection (赤い, 楽しい). // Known false positives: な-adjectives ending in い (きれい), nouns ending in // い (兄=ani; 桜井=name). The positional constraint (modifier immediately // before a noun, no intervening particle) at the call site filters most. func endsInIKana(s string) bool { b := []byte(s) if len(b) < 3 { return false } // い is UTF-8 E3 81 84 tail := b[len(b)-3:] return tail[0] == 0xe3 && tail[1] == 0x81 && tail[2] == 0x84 } // isJADitransitive returns true if the lemma is a JA ditransitive verb // taking a に-marked recipient and a を-marked patient: 彼に本をあげる // (give him a book). When the clause's verb matches, に-marked slots flip // from ORGoal/ORLoc default to ORRecip. Closed set of JA verb lemmas // (post-lemmatization, so just the stem). func isJADitransitive(lemma string) bool { switch lemma { // あげ (give), やる→や, くれる→くれ, 渡 (hand), 送 (send), 教え (teach/tell), // 見せ (show), 売 (sell), 買 (buy), 貸 (lend), 払 (pay), 与え (give/grant). case "\xe3\x81\x82\xe3\x81\x92", "\xe3\x82\x84", "\xe3\x81\x8f\xe3\x82\x8c", "\xe6\xb8\xa1", "\xe9\x80\x81", "\xe6\x95\x99\xe3\x81\x88", "\xe8\xa6\x8b\xe3\x81\x9b", "\xe5\xa3\xb2", "\xe8\xb2\xb7", "\xe8\xb2\xb8", "\xe6\x89\x95", "\xe4\xb8\x8e\xe3\x81\x88": return true } return false } // isJATemporalNoun returns true if s is a temporal noun that functions as // a sentence-initial adverbial (today, yesterday, tomorrow, etc.). These // surface as bare nouns without a particle but semantically modify the // clause's verb. Closed set; treat as MKAdv adverbial during extraction. 
func isJATemporalNoun(s string) bool {
	switch s {
	// Days relative to now.
	case "\xe6\x98\xa8\xe6\x97\xa5", // 昨日 (yesterday)
		"\xe4\xbb\x8a\xe6\x97\xa5", // 今日 (today)
		"\xe6\x98\x8e\xe6\x97\xa5", // 明日 (tomorrow)
		"\xe4\xbb\x8a":             // 今 (now)
		return true

	// Parts of a specific day.
	case "\xe6\x98\xa8\xe5\xa4\x9c", // 昨夜 (last night)
		"\xe4\xbb\x8a\xe6\x9c\x9d", // 今朝 (this morning)
		"\xe4\xbb\x8a\xe6\x99\xa9", // 今晩 (tonight)
		"\xe4\xbb\x8a\xe5\xa4\x9c": // 今夜 (tonight)
		return true

	// Relative weeks, months, years.
	case "\xe6\x9d\xa5\xe9\x80\xb1", // 来週 (next week)
		"\xe5\x85\x88\xe9\x80\xb1", // 先週 (last week)
		"\xe6\x9d\xa5\xe6\x9c\x88", // 来月 (next month)
		"\xe5\x85\x88\xe6\x9c\x88", // 先月 (last month)
		"\xe6\x9d\xa5\xe5\xb9\xb4", // 来年 (next year)
		"\xe5\x8e\xbb\xe5\xb9\xb4": // 去年 (last year)
		return true

	// Recurring periods.
	case "\xe6\xaf\x8e\xe6\x97\xa5", // 毎日 (every day)
		"\xe6\xaf\x8e\xe9\x80\xb1", // 毎週 (every week)
		"\xe6\xaf\x8e\xe6\x9c\x88", // 毎月 (every month)
		"\xe6\xaf\x8e\xe5\xb9\xb4": // 毎年 (every year)
		return true

	// Generic times of day and the interrogative.
	case "\xe6\x9c\x9d", // 朝 (morning)
		"\xe6\x98\xbc",             // 昼 (noon)
		"\xe5\xa4\x9c",             // 夜 (night)
		"\xe5\xa4\x95\xe6\x96\xb9", // 夕方 (evening)
		"\xe3\x81\x84\xe3\x81\xa4": // いつ (when)
		return true

	default:
		return false
	}
}

// isJARelationalNoun reports whether s is one of the relational nouns used in
// the の-relational-noun-に locative compound pattern (箱の中に = "in the
// box"). The set is closed; add new entries here.
func isJARelationalNoun(s string) bool {
	switch s {
	case "\xe4\xb8\xad", // 中 (inside)
		"\xe5\x86\x85", // 内 (within)
		"\xe5\xa4\x96", // 外 (outside)
		"\xe4\xb8\x8a", // 上 (on/above)
		"\xe4\xb8\x8b", // 下 (under)
		"\xe5\x89\x8d", // 前 (in front)
		"\xe5\xbe\x8c", // 後 (behind)
		"\xe6\xa8\xaa", // 横 (beside)
		"\xe9\x9a\xa3", // 隣 (next to)
		"\xe9\x96\x93": // 間 (between)
		return true
	default:
		return false
	}
}

// jaRelationalNounToOblRole maps a relational noun to its semantic role.
// Most map to ORLoc with implicit "inside" semantics for the EN renderer;
// position-specific (on, under, in-front) would need a finer-grained
// representation. For now all locative-compounds collapse to ORLoc.
func jaRelationalNounToOblRole(s string) uint8 { if isJARelationalNoun(s) { return ORLoc } return ORNone } // endsInNaiSuffix returns true if s ends in ない (negative verb suffix). // Used to exclude negative-verb forms from predicative-i-adjective detection, // since ない ends in い (false-matching endsInIKana). な = E3 81 AA, い = E3 81 84. func endsInNaiSuffix(s string) bool { b := []byte(s) if len(b) < 6 { return false } tail := b[len(b)-6:] return tail[0] == 0xe3 && tail[1] == 0x81 && tail[2] == 0xaa && tail[3] == 0xe3 && tail[4] == 0x81 && tail[5] == 0x84 } // endsInTaiSuffix returns true if s ends in たい (desiderative verb suffix). // Used to exclude desiderative verb forms (食べたい / want to eat) from // predicative-i-adjective detection. た = E3 81 9F, い = E3 81 84. func endsInTaiSuffix(s string) bool { b := []byte(s) if len(b) < 6 { return false } tail := b[len(b)-6:] return tail[0] == 0xe3 && tail[1] == 0x81 && tail[2] == 0x9f && tail[3] == 0xe3 && tail[4] == 0x81 && tail[5] == 0x84 } // isJABareKanjiAdj returns true if s is a single-kanji color/size adjective // (赤, 白, 黒, 大, 小, 新, 古, 長, 高, 低). These appear as bare-kanji ATTR // modifiers in rendered JA when the い-suffix was not included. func isJABareKanjiAdj(s string) bool { if len(s) != 3 { return false } b := []byte(s) c1, c2, c3 := b[0], b[1], b[2] switch { case c1 == 0xe8 && c2 == 0xb5 && c3 == 0xa4: // 赤 case c1 == 0xe7 && c2 == 0x99 && c3 == 0xbd: // 白 case c1 == 0xe9 && c2 == 0xbb && c3 == 0x92: // 黒 case c1 == 0xe5 && c2 == 0xa4 && c3 == 0xa7: // 大 case c1 == 0xe5 && c2 == 0xb0 && c3 == 0x8f: // 小 case c1 == 0xe6 && c2 == 0x96 && c3 == 0xb0: // 新 case c1 == 0xe5 && c2 == 0x8f && c3 == 0xa4: // 古 case c1 == 0xe9 && c2 == 0x95 && c3 == 0xb7: // 長 case c1 == 0xe9 && c2 == 0xab && c3 == 0x98: // 高 case c1 == 0xe4 && c2 == 0xbd && c3 == 0x8e: // 低 default: return false } return true } // endsInTaKana returns true if the last UTF-8 character of s is the hiragana た. 
// Used to detect past-tense verb forms preceding a head noun (REL clause): // 食べた猫 (the cat that ate), 見た本 (the book I saw). The morphological // splitter exposes the boundary; this checks that the prior slot is a // past-tense verb. False positives: nouns ending in た are uncommon. func endsInTaKana(s string) bool { b := []byte(s) if len(b) < 3 { return false } // た is UTF-8 E3 81 9F tail := b[len(b)-3:] return tail[0] == 0xe3 && tail[1] == 0x81 && tail[2] == 0x9f } // endsInKuKana returns true if the last UTF-8 character of s is the hiragana く. // Used as a heuristic for く-form adverbial detection: 速く (hayaku=quickly), // 高く (takaku=highly), 楽しく (tanoshiku=enjoyably). The く-form is the // adverbial inflection of an i-adjective and binds to the following verb. func endsInKuKana(s string) bool { b := []byte(s) if len(b) < 3 { return false } // く is UTF-8 E3 81 8F tail := b[len(b)-3:] return tail[0] == 0xe3 && tail[1] == 0x81 && tail[2] == 0x8f } // stripJAConjugation matches the longest known conjugation suffix. // Returns (remaining bytes, morph bits, stripped flag). // // Order matters: longest forms first. Passive (れる/られる) and causative // (せる/させる) must be checked before simpler suffix endings to avoid // premature shorter matches. func stripJAConjugation(b []byte) ([]byte, uint16, bool) { // Passive/causative past combinations. // Guard: if the stem after stripping is bare kana (isBareSuffix), the // match is likely an ichidan verb whose stem ends in れ/せ colliding // with the passive/causative suffix (くれた = くれ+た, not く+れた). // Reject and fall through to simpler suffixes. 
if rest, ok := stripJASuffix(b, jaSeraretaPast); ok && !isBareSuffix(rest) { return rest, MetaCausative | MetaPassive | MetaTensePast, true } if rest, ok := stripJASuffix(b, jaRaretaPast); ok && !isBareSuffix(rest) { return rest, MetaPassive | MetaTensePast, true } if rest, ok := stripJASuffix(b, jaSasetaPast); ok && !isBareSuffix(rest) { return rest, MetaCausative | MetaTensePast, true } // Passive/causative non-past if rest, ok := stripJASuffix(b, jaSerareru); ok && !isBareSuffix(rest) { return rest, MetaCausative | MetaPassive, true } if rest, ok := stripJASuffix(b, jaRareru); ok && !isBareSuffix(rest) { return rest, MetaPassive, true } if rest, ok := stripJASuffix(b, jaSaseru); ok && !isBareSuffix(rest) { return rest, MetaCausative, true } if rest, ok := stripJASuffix(b, jaReru); ok && !isBareSuffix(rest) { return rest, MetaPassive, true } if rest, ok := stripJASuffix(b, jaSeru); ok && !isBareSuffix(rest) { return rest, MetaCausative, true } if rest, ok := stripJASuffix(b, jaRetaPast); ok && !isBareSuffix(rest) { return rest, MetaPassive | MetaTensePast, true } if rest, ok := stripJASuffix(b, jaSetaPast); ok && !isBareSuffix(rest) { return rest, MetaCausative | MetaTensePast, true } if rest, ok := stripJASuffix(b, jaNakatta); ok { return rest, MetaPolarNeg | MetaTensePast, true } // Desiderative (たい family) - before ない/た to avoid partial match. 
if rest, ok := stripJASuffix(b, jaTakunai); ok { return rest, MetaMoodVol | MetaPolarNeg, true } if rest, ok := stripJASuffix(b, jaTakatta); ok { return rest, MetaMoodVol | MetaTensePast, true } if rest, ok := stripJASuffix(b, jaTai); ok { return rest, MetaMoodVol, true } if rest, ok := stripJASuffix(b, jaMashita); ok { return rest, MetaTensePast | MetaFormalityPol, true } if rest, ok := stripJASuffix(b, jaMasen); ok { return rest, MetaPolarNeg | MetaFormalityPol, true } if rest, ok := stripJASuffix(b, jaTeiru); ok { return rest, MetaAspectProg, true } if rest, ok := stripJASuffix(b, jaTeita); ok { return rest, MetaAspectProg | MetaTensePast, true } if rest, ok := stripJASuffix(b, jaDeiru); ok { return rest, MetaAspectProg, true } if rest, ok := stripJASuffix(b, jaDeita); ok { return rest, MetaAspectProg | MetaTensePast, true } if rest, ok := stripJASuffix(b, jaMasu); ok { return rest, MetaFormalityPol, true } if rest, ok := stripJASuffix(b, jaNai); ok { return rest, MetaPolarNeg, true } // Godan volitional compounds (longest first; each is お-row kana + う). // These must match before bare う suffixes or particles. 
if rest, ok := stripJASuffix(b, jaKoU); ok { return rest, MetaMoodVol, true } if rest, ok := stripJASuffix(b, jaGoU); ok { return rest, MetaMoodVol, true } if rest, ok := stripJASuffix(b, jaSoU); ok { return rest, MetaMoodVol, true } if rest, ok := stripJASuffix(b, jaToU); ok { return rest, MetaMoodVol, true } if rest, ok := stripJASuffix(b, jaNoU); ok { return rest, MetaMoodVol, true } if rest, ok := stripJASuffix(b, jaBoU); ok { return rest, MetaMoodVol, true } if rest, ok := stripJASuffix(b, jaMoU); ok { return rest, MetaMoodVol, true } if rest, ok := stripJASuffix(b, jaRoU); ok { return rest, MetaMoodVol, true } if rest, ok := stripJASuffix(b, jaOU); ok { return rest, MetaMoodVol, true } if rest, ok := stripJASuffix(b, jaYou); ok { return rest, MetaMoodVol, true } if rest, ok := stripJASuffix(b, jaTe); ok { return rest, MetaAspectProg, true } if rest, ok := stripJASuffix(b, jaDe); ok { return rest, MetaAspectProg, true } if rest, ok := stripJASuffix(b, jaTa); ok { return rest, MetaTensePast, true } if rest, ok := stripJASuffix(b, jaDa); ok { return rest, MetaTensePast, true } if rest, ok := stripJASuffix(b, jaBa); ok { // ば: provisional conditional. Strip from lemma; the ClauseIf // relation lives at the discourse layer, not on the verb morph. 
return rest, 0, true } return b, 0, false } const ( jaTeiru = "\xe3\x81\xa6\xe3\x81\x84\xe3\x82\x8b" // ている jaTeita = "\xe3\x81\xa6\xe3\x81\x84\xe3\x81\x9f" // ていた jaDeiru = "\xe3\x81\xa7\xe3\x81\x84\xe3\x82\x8b" // でいる jaDeita = "\xe3\x81\xa7\xe3\x81\x84\xe3\x81\x9f" // でいた jaTai = "\xe3\x81\x9f\xe3\x81\x84" // たい jaTakatta = "\xe3\x81\x9f\xe3\x81\x8b\xe3\x81\xa3\xe3\x81\x9f" // たかった jaTakunai = "\xe3\x81\x9f\xe3\x81\x8f\xe3\x81\xaa\xe3\x81\x84" // たくない jaNai = "\xe3\x81\xaa\xe3\x81\x84" // ない jaNakatta = "\xe3\x81\xaa\xe3\x81\x8b\xe3\x81\xa3\xe3\x81\x9f" // なかった jaMasu = "\xe3\x81\xbe\xe3\x81\x99" // ます jaMashita = "\xe3\x81\xbe\xe3\x81\x97\xe3\x81\x9f" // ました jaMasen = "\xe3\x81\xbe\xe3\x81\x9b\xe3\x82\x93" // ません jaYou = "\xe3\x82\x88\xe3\x81\x86" // よう jaTe = "\xe3\x81\xa6" // て jaDe = "\xe3\x81\xa7" // で jaTa = "\xe3\x81\x9f" // た jaDa = "\xe3\x81\xa0" // だ jaBa = "\xe3\x81\xb0" // ば (conditional) // Passive: れる/られる jaRareru = "\xe3\x82\x89\xe3\x82\x8c\xe3\x82\x8b" // られる jaReru = "\xe3\x82\x8c\xe3\x82\x8b" // れる jaRaretaPast = "\xe3\x82\x89\xe3\x82\x8c\xe3\x81\x9f" // られた jaRetaPast = "\xe3\x82\x8c\xe3\x81\x9f" // れた // Causative: せる/させる jaSaseru = "\xe3\x81\x95\xe3\x81\x9b\xe3\x82\x8b" // させる jaSeru = "\xe3\x81\x9b\xe3\x82\x8b" // せる jaSasetaPast = "\xe3\x81\x95\xe3\x81\x9b\xe3\x81\x9f" // させた jaSetaPast = "\xe3\x81\x9b\xe3\x81\x9f" // せた // Causative-passive: させられる jaSerareru = "\xe3\x81\x95\xe3\x81\x9b\xe3\x82\x89\xe3\x82\x8c\xe3\x82\x8b" // させられる jaSeraretaPast = "\xe3\x81\x95\xe3\x81\x9b\xe3\x82\x89\xe3\x82\x8c\xe3\x81\x9f" // させられた // Volitional compounds jaKoU = "\xe3\x81\x93\xe3\x81\x86" // こう (godan-ku) jaGoU = "\xe3\x81\x94\xe3\x81\x86" // ごう (godan-gu) jaSoU = "\xe3\x81\x9d\xe3\x81\x86" // そう (godan-su) jaToU = "\xe3\x81\xa8\xe3\x81\x86" // とう (godan-tsu) jaNoU = "\xe3\x81\xae\xe3\x81\x86" // のう (godan-nu) jaBoU = "\xe3\x81\xbc\xe3\x81\x86" // ぼう (godan-bu) jaMoU = "\xe3\x82\x82\xe3\x81\x86" // もう (godan-mu) jaRoU = "\xe3\x82\x8d\xe3\x81\x86" // 
ろう (godan-ru) jaOU = "\xe3\x81\x8a\xe3\x81\x86" // おう (godan-u) ) func stripJASuffix(word, suffix []byte) ([]byte, bool) { if len(word) <= len(suffix) { return nil, false } tail := word[len(word)-len(suffix):] if string(tail) == string(suffix) { return word[:len(word)-len(suffix)], true } return nil, false }