lemma_ja.mx raw

   1  package iskra
   2  
   3  // volClassFromSuffix returns the godan class implied by a volitional compound
   4  // ending. Returns VClassIchidan for よう (and for unknown patterns).
   5  func volClassFromSuffix(suffix []byte) uint8 {
   6  	switch string(suffix) {
   7  	case "\xe3\x81\x93\xe3\x81\x86":
   8  		return VClassGodanKu
   9  	case "\xe3\x81\x94\xe3\x81\x86":
  10  		return VClassGodanGu
  11  	case "\xe3\x81\x9d\xe3\x81\x86":
  12  		return VClassGodanSu
  13  	case "\xe3\x81\xa8\xe3\x81\x86":
  14  		return VClassGodanTsu
  15  	case "\xe3\x81\xae\xe3\x81\x86":
  16  		return VClassGodanNu
  17  	case "\xe3\x81\xbc\xe3\x81\x86":
  18  		return VClassGodanBu
  19  	case "\xe3\x82\x82\xe3\x81\x86":
  20  		return VClassGodanMu
  21  	case "\xe3\x82\x8d\xe3\x81\x86":
  22  		return VClassGodanRu
  23  	case "\xe3\x81\x8a\xe3\x81\x86":
  24  		return VClassGodanU
  25  	}
  26  	return VClassIchidan
  27  }
  28  
  29  // LemmatizeJA reduces a JA verb surface form to its bare stem + morph + class.
  30  // The stem is consistent across all inflected forms of a verb (食べる/食べた/食べている → 食).
  31  // The class identifies the conjugation pattern needed to reconstruct forms.
  32  //
  33  // Verb class detection:
  34  //   ichidan: stem ends in え/い-row hiragana (食べ, 起き). Dict-form: stem + る.
  35  //   godan-X: stem ends in kanji or contracted/i-stem kana. Class indicates u-row ending.
  36  //
  37  // Non-verbs (isVerb=false) pass through unchanged.
  38  func LemmatizeJA(word string, isVerb bool) LemmaResult {
  39  	if word == UntranslatedMarker {
  40  		return LemmaResult{Lemma: word, Morph: 0, Class: 0}
  41  	}
  42  	if !isVerb {
  43  		if stem, morph, ok := stripJAIAdj([]byte(word)); ok {
  44  			return LemmaResult{Lemma: string(stem), Morph: morph, Class: VClassIAdj}
  45  		}
  46  		return LemmaResult{Lemma: word, Morph: 0, Class: 0}
  47  	}
  48  	b := []byte(word)
  49  	if len(b) < 3 {
  50  		return LemmaResult{Lemma: word, Morph: 0, Class: 0}
  51  	}
  52  
  53  	stem, morph, stripped := stripJAConjugation(b)
  54  	class := detectJAClass(b, stem, stripped, morph)
  55  	// Volitional: class can be inferred from the specific suffix matched
  56  	// (こう/もう/etc. for godan, よう for ichidan).
  57  	if morph&MetaMoodVol != 0 && stripped {
  58  		suffixLen := len(b) - len(stem)
  59  		if suffixLen >= 6 {
  60  			class = volClassFromSuffix(b[len(b)-suffixLen:])
  61  		}
  62  	}
  63  	// Single-kana stem after stripping (e.g. bare っ from った/って) is a
  64  	// tokenizer artifact, not a standalone verb. Pass through as-is.
  65  	// Multi-kana stems (6+ bytes, 2+ characters) are legitimate ichidan
  66  	// verb stems (くれ, つけ, かけ, etc.) and must not be rejected.
  67  	if stripped && len(stem) <= 3 && isBareSuffix(stem) {
  68  		return LemmaResult{Lemma: word, Morph: 0, Class: 0}
  69  	}
  70  	stem = canonicalizeJAStem(stem, stripped, class)
  71  
  72  	if len(stem) == 0 {
  73  		return LemmaResult{Lemma: word, Morph: morph, Class: class}
  74  	}
  75  	return LemmaResult{Lemma: string(stem), Morph: morph, Class: class}
  76  }
  77  
  78  // detectJAClass attempts to determine the verb class from the original surface
  79  // form and what suffix was stripped.
  80  func detectJAClass(orig, stem []byte, stripped bool, morph uint16) uint8 {
  81  	if !stripped {
  82  		// Dict form: look at last kana. 食べる → ichidan if pre-る is え/い-row.
  83  		return classFromDictForm(orig)
  84  	}
  85  	// Suffix was stripped. Use residual to infer class.
  86  	return classFromInflectedStem(stem, morph)
  87  }
  88  
  89  // classFromDictForm reads a dict-form verb (assumes no suffix stripping done).
  90  // Returns the class based on the last 1-2 kana.
  91  func classFromDictForm(b []byte) uint8 {
  92  	if len(b) < 3 {
  93  		return VClassNone
  94  	}
  95  	last := b[len(b)-3:]
  96  	ls := string(last)
  97  	switch ls {
  98  	case "\xe3\x82\x8b": // る
  99  		// Could be ichidan or godan-ru. Check preceding kana.
 100  		if len(b) >= 6 {
 101  			prev := string(b[len(b)-6 : len(b)-3])
 102  			if isERowOrIRowKana(prev) {
 103  				return VClassIchidan
 104  			}
 105  		}
 106  		return VClassGodanRu
 107  	case "\xe3\x81\x8f": // く
 108  		return VClassGodanKu
 109  	case "\xe3\x81\x90": // ぐ
 110  		return VClassGodanGu
 111  	case "\xe3\x81\x99": // す
 112  		return VClassGodanSu
 113  	case "\xe3\x81\xa4": // つ
 114  		return VClassGodanTsu
 115  	case "\xe3\x81\xac": // ぬ
 116  		return VClassGodanNu
 117  	case "\xe3\x81\xb6": // ぶ
 118  		return VClassGodanBu
 119  	case "\xe3\x82\x80": // む
 120  		return VClassGodanMu
 121  	case "\xe3\x81\x86": // う
 122  		return VClassGodanU
 123  	}
 124  	return VClassNone
 125  }
 126  
 127  // classFromInflectedStem guesses class from the stem after suffix removal.
 128  func classFromInflectedStem(stem []byte, morph uint16) uint8 {
 129  	if len(stem) < 3 {
 130  		return VClassNone
 131  	}
 132  	last := stem[len(stem)-3:]
 133  	ls := string(last)
 134  
 135  	// Negative or passive/causative: stem ends in あ-row kana for godan, or
 136  	// the stem is a bare ichidan stem (え/い-row).
 137  	// Negative: 走らない (godan-ru: 走ら). Passive: 噛まれた (godan-mu: 噛ま).
 138  	// Causative: 待たせた (godan-tsu: 待た).
 139  	if morph&(MetaPolarNeg|MetaPassive|MetaCausative) != 0 {
 140  		if c := classFromARowKana(ls); c != VClassNone {
 141  			return c
 142  		}
 143  		if isERowKana(ls) || isIRowKana(ls) {
 144  			return VClassIchidan
 145  		}
 146  		return VClassNone
 147  	}
 148  
 149  	// Polite (ます-family stripped) leaves i-row kana for godan or bare stem for ichidan.
 150  	if morph&MetaFormalityPol != 0 {
 151  		if c := classFromIStem(ls); c != VClassIchidan {
 152  			return c
 153  		}
 154  		if isERowKana(ls) || isIRowKana(ls) {
 155  			return VClassIchidan
 156  		}
 157  		return VClassNone
 158  	}
 159  
 160  	// Te/ta/de/da stripped: contracted forms for godan, bare stem for ichidan.
 161  	// っ contraction is ambiguous: godan-u (買う→買っ), godan-tsu (持つ→持っ),
 162  	// godan-ru (知る→知っ). Disambiguate by looking up the kanji root in a
 163  	// small table; default to godan-ru (most common っ-contracting class).
 164  	if ls == "\xe3\x81\xa3" { // っ
 165  		return classTsuContract(stem)
 166  	}
 167  	if ls == "\xe3\x82\x93" { // ん - godan nu/bu/mu voiced contraction
 168  		return VClassGodanMu
 169  	}
 170  	if ls == "\xe3\x81\x84" { // い - godan ku/gu past
 171  		return VClassGodanKu
 172  	}
 173  	if ls == "\xe3\x81\x97" { // し - godan su past
 174  		return VClassGodanSu
 175  	}
 176  	// Bare stem ending in え/い-row → ichidan
 177  	if isERowKana(ls) || isIRowKana(ls) {
 178  		return VClassIchidan
 179  	}
 180  	return VClassNone
 181  }
 182  
 183  // classFromARowKana maps the あ-row kana found before ない to the godan class.
 184  func classFromARowKana(ls string) uint8 {
 185  	switch ls {
 186  	case "\xe3\x81\x8b": // か
 187  		return VClassGodanKu
 188  	case "\xe3\x81\x8c": // が
 189  		return VClassGodanGu
 190  	case "\xe3\x81\x95": // さ
 191  		return VClassGodanSu
 192  	case "\xe3\x81\x9f": // た
 193  		return VClassGodanTsu
 194  	case "\xe3\x81\xaa": // な
 195  		return VClassGodanNu
 196  	case "\xe3\x81\xb0": // ば
 197  		return VClassGodanBu
 198  	case "\xe3\x81\xbe": // ま
 199  		return VClassGodanMu
 200  	case "\xe3\x82\x89": // ら
 201  		return VClassGodanRu
 202  	case "\xe3\x82\x8f": // わ
 203  		return VClassGodanU
 204  	}
 205  	return VClassNone
 206  }
 207  
 208  // classFromIStem maps an i-row kana to the corresponding godan class.
 209  func classFromIStem(ls string) uint8 {
 210  	switch ls {
 211  	case "\xe3\x81\x8d": // き
 212  		return VClassGodanKu
 213  	case "\xe3\x81\x8e": // ぎ
 214  		return VClassGodanGu
 215  	case "\xe3\x81\x97": // し
 216  		return VClassGodanSu
 217  	case "\xe3\x81\xa1": // ち
 218  		return VClassGodanTsu
 219  	case "\xe3\x81\xab": // に
 220  		return VClassGodanNu
 221  	case "\xe3\x81\xb3": // び
 222  		return VClassGodanBu
 223  	case "\xe3\x81\xbf": // み
 224  		return VClassGodanMu
 225  	case "\xe3\x82\x8a": // り
 226  		return VClassGodanRu
 227  	case "\xe3\x81\x84": // い (from godan-う i-stem like 買い)
 228  		return VClassGodanU
 229  	}
 230  	return VClassIchidan
 231  }
 232  
 233  // classTsuContract disambiguates っ te-form contraction among godan-u, godan-tsu,
 234  // and godan-ru by kanji root lookup. Default: godan-ru (most frequent).
 235  func classTsuContract(stem []byte) uint8 {
 236  	// Strip trailing っ (3 bytes: E3 81 A3) to get kanji root.
 237  	root := stem
 238  	if len(root) >= 3 {
 239  		tail := root[len(root)-3:]
 240  		if tail[0] == 0xe3 && tail[1] == 0x81 && tail[2] == 0xa3 {
 241  			root = root[:len(root)-3]
 242  		}
 243  	}
 244  	if len(root) == 0 {
 245  		return VClassGodanRu
 246  	}
 247  	// Extract last rune (the kanji) for table lookup.
 248  	// CJK kanji are 3-byte UTF-8 sequences (E0-EF range).
 249  	var kanji string
 250  	if len(root) >= 3 && root[len(root)-3] >= 0xe0 {
 251  		kanji = string(root[len(root)-3:])
 252  	} else if len(root) >= 2 && root[len(root)-2] >= 0xc0 {
 253  		kanji = string(root[len(root)-2:])
 254  	} else {
 255  		kanji = string(root[len(root)-1:])
 256  	}
 257  	switch kanji {
 258  	// Godan-tsu (dict form ends in つ)
 259  	case "\xe6\x8c\x81": // 持 (motsu)
 260  		return VClassGodanTsu
 261  	case "\xe7\xab\x8b": // 立 (tatsu)
 262  		return VClassGodanTsu
 263  	case "\xe5\xbe\x85": // 待 (matsu)
 264  		return VClassGodanTsu
 265  	case "\xe6\x89\x93": // 打 (utsu)
 266  		return VClassGodanTsu
 267  	case "\xe5\x8b\x9d": // 勝 (katsu)
 268  		return VClassGodanTsu
 269  	case "\xe8\x82\xb2": // 育 (sodatsu)
 270  		return VClassGodanTsu
 271  	case "\xe7\xb5\x8c": // 経 (tatsu - time passes)
 272  		return VClassGodanTsu
 273  	case "\xe4\xbf\x9d": // 保 (motsu - to last)
 274  		return VClassGodanTsu
 275  	case "\xe7\x99\xba": // 発 (tatsu - depart)
 276  		return VClassGodanTsu
 277  	case "\xe5\xbb\xba": // 建 (tatsu - build)
 278  		return VClassGodanTsu
 279  	// Godan-u (dict form ends in う)
 280  	case "\xe8\xb2\xb7": // 買 (kau)
 281  		return VClassGodanU
 282  	case "\xe4\xbc\x9a": // 会 (au)
 283  		return VClassGodanU
 284  	case "\xe6\xad\x8c": // 歌 (utau)
 285  		return VClassGodanU
 286  	case "\xe8\xa8\x80": // 言 (iu)
 287  		return VClassGodanU
 288  	case "\xe4\xbd\xbf": // 使 (tsukau)
 289  		return VClassGodanU
 290  	case "\xe6\x89\x95": // 払 (harau)
 291  		return VClassGodanU
 292  	case "\xe6\xb4\x97": // 洗 (arau)
 293  		return VClassGodanU
 294  	case "\xe7\xac\x91": // 笑 (warau)
 295  		return VClassGodanU
 296  	case "\xe9\x81\x95": // 違 (chigau)
 297  		return VClassGodanU
 298  	case "\xe6\x80\x9d": // 思 (omou)
 299  		return VClassGodanU
 300  	case "\xe8\xbf\xbd": // 追 (ou)
 301  		return VClassGodanU
 302  	case "\xe8\xaa\x98": // 誘 (sasou)
 303  		return VClassGodanU
 304  	case "\xe6\x8b\xbe": // 拾 (hirou)
 305  		return VClassGodanU
 306  	case "\xe9\xa3\xbc": // 飼 (kau - keep/raise)
 307  		return VClassGodanU
 308  	case "\xe5\x90\xb8": // 吸 (suu)
 309  		return VClassGodanU
 310  	case "\xe5\x90\x88": // 合 (au - match)
 311  		return VClassGodanU
 312  	case "\xe6\x95\x91": // 救 (sukuu)
 313  		return VClassGodanU
 314  	case "\xe6\x8b\x89": // 拉 (hirau - not standard, but appears)
 315  		return VClassGodanU
 316  	}
 317  	// Default: godan-ru (知る, 取る, 送る, 作る, 乗る, 売る, etc.)
 318  	return VClassGodanRu
 319  }
 320  
 321  // canonicalizeJAStem normalizes the stem to its canonical form per class.
 322  // For ichidan: keep the え/い-row kana (食べ stays 食べ).
 323  // For godan: canonical form is the dict form (kanji root + u-row ending).
 324  //   Dict form 行く: keep as-is.
 325  //   Inflected 行った: strip to kanji root, re-append く.
 326  // This preserves lexical identity: 行く (iku, go) vs 行う (okonau, do)
 327  // produce distinct lemmas instead of both collapsing to 行.
 328  func canonicalizeJAStem(stem []byte, stripped bool, class uint8) []byte {
 329  	if class == VClassIchidan {
 330  		if !stripped {
 331  			return stripTrailingHiragana(stem)
 332  		}
 333  		return stem // 食べ is already canonical for ichidan
 334  	}
 335  	// Godan: strip to kanji root, then append the dict-form ending.
 336  	root := stem
 337  	if !stripped {
 338  		root = stripTrailingHiragana(stem)
 339  	} else {
 340  		root = stripAllTrailingHiragana(stem)
 341  	}
 342  	suffix := godanDictSuffix(class)
 343  	if suffix == "" {
 344  		return root
 345  	}
 346  	out := []byte{:0:len(root) + 3}
 347  	out = append(out, root...)
 348  	out = append(out, []byte(suffix)...)
 349  	return out
 350  }
 351  
 352  func godanDictSuffix(class uint8) string {
 353  	switch class {
 354  	case VClassGodanKu:
 355  		return "\xe3\x81\x8f" // く
 356  	case VClassGodanGu:
 357  		return "\xe3\x81\x90" // ぐ
 358  	case VClassGodanSu:
 359  		return "\xe3\x81\x99" // す
 360  	case VClassGodanTsu:
 361  		return "\xe3\x81\xa4" // つ
 362  	case VClassGodanNu:
 363  		return "\xe3\x81\xac" // ぬ
 364  	case VClassGodanBu:
 365  		return "\xe3\x81\xb6" // ぶ
 366  	case VClassGodanMu:
 367  		return "\xe3\x82\x80" // む
 368  	case VClassGodanRu:
 369  		return "\xe3\x82\x8b" // る
 370  	case VClassGodanU:
 371  		return "\xe3\x81\x86" // う
 372  	}
 373  	return ""
 374  }
 375  
 376  // stripTrailingHiragana removes exactly one trailing hiragana char.
 377  func stripTrailingHiragana(b []byte) []byte {
 378  	if len(b) < 3 {
 379  		return b
 380  	}
 381  	tail := b[len(b)-3:]
 382  	if tail[0] == 0xe3 && (tail[1] == 0x81 || tail[1] == 0x82) {
 383  		return b[:len(b)-3]
 384  	}
 385  	return b
 386  }
 387  
 388  // stripAllTrailingHiragana removes all trailing hiragana, leaving at least 3 bytes.
 389  func stripAllTrailingHiragana(b []byte) []byte {
 390  	for len(b) > 3 {
 391  		tail := b[len(b)-3:]
 392  		if tail[0] == 0xe3 && (tail[1] == 0x81 || tail[1] == 0x82) {
 393  			b = b[:len(b)-3]
 394  		} else {
 395  			break
 396  		}
 397  	}
 398  	return b
 399  }
 400  
 401  // isBareSuffix returns true when the stem after conjugation stripping is all
 402  // hiragana/katakana with no kanji root. Such stems come from tokenizer splits
 403  // (思った -> 思 + った) and shouldn't be canonicalized as standalone verbs.
 404  func isBareSuffix(stem []byte) bool {
 405  	for i := 0; i < len(stem); {
 406  		if i+3 <= len(stem) && stem[i] == 0xe3 {
 407  			b1 := stem[i+1]
 408  			if b1 >= 0x81 && b1 <= 0x83 { // hiragana U+3040-U+309F, katakana U+30A0-U+30FF
 409  				i += 3
 410  				continue
 411  			}
 412  		}
 413  		return false // non-kana byte found = has kanji root
 414  	}
 415  	return len(stem) > 0
 416  }
 417  
 418  func isERowKana(s string) bool {
 419  	// え-row hiragana: え (0x81 0x88), け (0x81 0x91), せ (0x81 0x9b), て (0x81 0xa6),
 420  	// ね (0x81 0xad), へ (0x81 0xb8), め (0x82 0x81), れ (0x82 0x8c), ゑ (0x82 0x91)
 421  	// plus voiced: げ (0x81 0x92), ぜ (0x81 0x9c), で (0x81 0xa7), べ (0x81 0xb9), ぺ (0x81 0xba)
 422  	if len(s) != 3 || s[0] != 0xe3 {
 423  		return false
 424  	}
 425  	switch s {
 426  	case "\xe3\x81\x88", "\xe3\x81\x91", "\xe3\x81\x9b", "\xe3\x81\xa6",
 427  		"\xe3\x81\xad", "\xe3\x81\xb8", "\xe3\x82\x81", "\xe3\x82\x8c",
 428  		"\xe3\x81\x92", "\xe3\x81\x9c", "\xe3\x81\xa7", "\xe3\x81\xb9", "\xe3\x81\xba":
 429  		return true
 430  	}
 431  	return false
 432  }
 433  
 434  func isIRowKana(s string) bool {
 435  	if len(s) != 3 || s[0] != 0xe3 {
 436  		return false
 437  	}
 438  	switch s {
 439  	case "\xe3\x81\x84", "\xe3\x81\x8d", "\xe3\x81\x97", "\xe3\x81\xa1",
 440  		"\xe3\x81\xab", "\xe3\x81\xb2", "\xe3\x81\xbf", "\xe3\x82\x8a",
 441  		"\xe3\x81\x8e", "\xe3\x81\x98", "\xe3\x81\xa2", "\xe3\x81\xb3", "\xe3\x81\xb4":
 442  		return true
 443  	}
 444  	return false
 445  }
 446  
 447  func isERowOrIRowKana(s string) bool {
 448  	return isERowKana(s) || isIRowKana(s)
 449  }
 450  
 451  // stripJACopula identifies sentence-final copula forms on a noun-final token
 452  // (彼は学生だ, 学生でした). Returns (atom-with-copula-stripped, true, morph-bits)
 453  // or (input, false, 0) if no copula form found.
 454  //
 455  // Copula forms (longest first to avoid premature short matches):
 456  //   でした → past + polite
 457  //   だった → past + plain
 458  //   です   → non-past + polite
 459  //   だ     → non-past + plain
 460  //
 461  // Caller must verify this is NOT a verb context (verb past-tense た/だ is a
 462  // separate suffix on conjugated verbs; this only fires on noun-final tokens).
 463  func stripJACopula(word string) (string, bool, uint16) {
 464  	b := []byte(word)
 465  	// でした = E3 81 A7 E3 81 97 E3 81 9F (9 bytes)
 466  	deshita := []byte("\xe3\x81\xa7\xe3\x81\x97\xe3\x81\x9f")
 467  	if len(b) > len(deshita) && string(b[len(b)-len(deshita):]) == string(deshita) {
 468  		return string(b[:len(b)-len(deshita)]), true, MetaTensePast | MetaFormalityPol
 469  	}
 470  	// だった = E3 81 A0 E3 81 A3 E3 81 9F (9 bytes)
 471  	datta := []byte("\xe3\x81\xa0\xe3\x81\xa3\xe3\x81\x9f")
 472  	if len(b) > len(datta) && string(b[len(b)-len(datta):]) == string(datta) {
 473  		return string(b[:len(b)-len(datta)]), true, MetaTensePast
 474  	}
 475  	// です = E3 81 A7 E3 81 99 (6 bytes)
 476  	desu := []byte("\xe3\x81\xa7\xe3\x81\x99")
 477  	if len(b) > len(desu) && string(b[len(b)-len(desu):]) == string(desu) {
 478  		return string(b[:len(b)-len(desu)]), true, MetaFormalityPol
 479  	}
 480  	// だ = E3 81 A0 (3 bytes). Bare だ is ambiguous with verb past contractions
 481  	// (噛んだ, 飛んだ, 死んだ for godan-mu/bu/nu). Only treat as copula if the
 482  	// preceding character is not a verb-contraction signal kana (ん or っ).
 483  	da := []byte("\xe3\x81\xa0")
 484  	if len(b) > len(da) && string(b[len(b)-len(da):]) == string(da) {
 485  		stem := b[:len(b)-len(da)]
 486  		if len(stem) >= 3 {
 487  			tail := stem[len(stem)-3:]
 488  			// ん is E3 82 93, っ is E3 81 A3 - verb past contractions.
 489  			if string(tail) == "\xe3\x82\x93" || string(tail) == "\xe3\x81\xa3" {
 490  				return word, false, 0
 491  			}
 492  		}
 493  		return string(stem), true, 0
 494  	}
 495  	return word, false, 0
 496  }
 497  
 498  // stripJAIAdj strips i-adjective inflection suffixes and returns the kanji
 499  // stem. Returns (stem, morph, true) on success. Requires the stem to contain
 500  // at least one kanji character to avoid false positives on pure-kana words.
 501  //
 502  // Forms (longest first):
 503  //   くなかった -> stem, neg+past    (赤くなかった -> 赤)
 504  //   かった    -> stem, past         (赤かった -> 赤)
 505  //   くない    -> stem, neg          (赤くない -> 赤)
 506  //   ければ    -> stem, conditional  (赤ければ -> 赤)
 507  //   くて      -> stem, te-form      (赤くて -> 赤)
 508  //   く        -> stem, adverbial    (赤く -> 赤)
 509  //   さ        -> stem, nominal      (赤さ -> 赤)
 510  //   い        -> stem, dict         (赤い -> 赤)
 511  func stripJAIAdj(b []byte) ([]byte, uint16, bool) {
 512  	var stem []byte
 513  	var morph uint16
 514  
 515  	// Only strip compound suffixes that are unambiguously i-adjective.
 516  	// Bare い/く/さ create false positives on godan-ku verbs (行く,書く)
 517  	// and other non-adjective words.
 518  
 519  	// くなかった = く(E3 81 8F) + な(E3 81 AA) + か(E3 81 8B) + っ(E3 81 A3) + た(E3 81 9F) = 15 bytes
 520  	if len(b) > 15 && hasSuffix(b, "\xe3\x81\x8f\xe3\x81\xaa\xe3\x81\x8b\xe3\x81\xa3\xe3\x81\x9f") {
 521  		stem = b[:len(b)-15]
 522  		morph = MetaPolarNeg | MetaTensePast
 523  	} else if len(b) > 12 && hasSuffix(b, "\xe3\x81\x8b\xe3\x81\xa3\xe3\x81\x9f") {
 524  		// かった = か(E3 81 8B) + っ(E3 81 A3) + た(E3 81 9F) = 9 bytes
 525  		stem = b[:len(b)-9]
 526  		morph = MetaTensePast
 527  	} else if len(b) > 9 && hasSuffix(b, "\xe3\x81\x8f\xe3\x81\xaa\xe3\x81\x84") {
 528  		// くない = く(E3 81 8F) + な(E3 81 AA) + い(E3 81 84) = 9 bytes
 529  		stem = b[:len(b)-9]
 530  		morph = MetaPolarNeg
 531  	} else if len(b) > 9 && hasSuffix(b, "\xe3\x81\x91\xe3\x82\x8c\xe3\x81\xb0") {
 532  		// ければ = け(E3 81 91) + れ(E3 82 8C) + ば(E3 81 B0) = 9 bytes
 533  		stem = b[:len(b)-9]
 534  		morph = 0
 535  	} else if len(b) > 6 && hasSuffix(b, "\xe3\x81\x8f\xe3\x81\xa6") {
 536  		// くて = く(E3 81 8F) + て(E3 81 A6) = 6 bytes
 537  		stem = b[:len(b)-6]
 538  		morph = MetaAspectProg
 539  	} else {
 540  		return nil, 0, false
 541  	}
 542  
 543  	if len(stem) < 3 {
 544  		return nil, 0, false
 545  	}
 546  	if !hasKanji(stem) {
 547  		return nil, 0, false
 548  	}
 549  	return stem, morph, true
 550  }
 551  
 552  func hasKanji(b []byte) bool {
 553  	for i := 0; i+2 < len(b); {
 554  		if b[i] >= 0xe4 && b[i] <= 0xe9 {
 555  			return true
 556  		}
 557  		if b[i] < 0x80 {
 558  			i++
 559  		} else if b[i] < 0xe0 {
 560  			i += 2
 561  		} else if b[i] < 0xf0 {
 562  			i += 3
 563  		} else {
 564  			i += 4
 565  		}
 566  	}
 567  	return false
 568  }
 569  
 570  // endsInIKana returns true if the last UTF-8 character of s is the hiragana い.
 571  // Used as a heuristic for i-adjective detection (赤い, 楽しい).
 572  // Known false positives: な-adjectives ending in い (きれい), nouns ending in
 573  // い (兄=ani; 桜井=name). The positional constraint (modifier immediately
 574  // before a noun, no intervening particle) at the call site filters most.
 575  func endsInIKana(s string) bool {
 576  	b := []byte(s)
 577  	if len(b) < 3 {
 578  		return false
 579  	}
 580  	// い is UTF-8 E3 81 84
 581  	tail := b[len(b)-3:]
 582  	return tail[0] == 0xe3 && tail[1] == 0x81 && tail[2] == 0x84
 583  }
 584  
 585  // isJADitransitive returns true if the lemma is a JA ditransitive verb
 586  // taking a に-marked recipient and a を-marked patient: 彼に本をあげる
 587  // (give him a book). When the clause's verb matches, に-marked slots flip
 588  // from ORGoal/ORLoc default to ORRecip. Closed set of JA verb lemmas
 589  // (post-lemmatization, so just the stem).
 590  func isJADitransitive(lemma string) bool {
 591  	switch lemma {
 592  	// あげ (give), やる→や, くれる→くれ, 渡 (hand), 送 (send), 教え (teach/tell),
 593  	// 見せ (show), 売 (sell), 買 (buy), 貸 (lend), 払 (pay), 与え (give/grant).
 594  	case "\xe3\x81\x82\xe3\x81\x92",
 595  		"\xe3\x82\x84",
 596  		"\xe3\x81\x8f\xe3\x82\x8c",
 597  		"\xe6\xb8\xa1",
 598  		"\xe9\x80\x81",
 599  		"\xe6\x95\x99\xe3\x81\x88",
 600  		"\xe8\xa6\x8b\xe3\x81\x9b",
 601  		"\xe5\xa3\xb2",
 602  		"\xe8\xb2\xb7",
 603  		"\xe8\xb2\xb8",
 604  		"\xe6\x89\x95",
 605  		"\xe4\xb8\x8e\xe3\x81\x88":
 606  		return true
 607  	}
 608  	return false
 609  }
 610  
 611  // isJATemporalNoun returns true if s is a temporal noun that functions as
 612  // a sentence-initial adverbial (today, yesterday, tomorrow, etc.). These
 613  // surface as bare nouns without a particle but semantically modify the
 614  // clause's verb. Closed set; treat as MKAdv adverbial during extraction.
 615  func isJATemporalNoun(s string) bool {
 616  	switch s {
 617  	// 昨日 (yesterday), 今日 (today), 明日 (tomorrow), 今 (now),
 618  	// 昨夜 (last night), 今朝 (this morning), 今晩 (tonight), 今夜 (tonight),
 619  	// 来週 (next week), 先週 (last week), 来月 (next month), 先月 (last month),
 620  	// 来年 (next year), 去年 (last year), 毎日 (every day), 毎週 (every week),
 621  	// 毎月 (every month), 毎年 (every year), 朝 (morning), 昼 (noon),
 622  	// 夜 (night), 夕方 (evening), いつ (when/always).
 623  	case "\xe6\x98\xa8\xe6\x97\xa5",
 624  		"\xe4\xbb\x8a\xe6\x97\xa5",
 625  		"\xe6\x98\x8e\xe6\x97\xa5",
 626  		"\xe4\xbb\x8a",
 627  		"\xe6\x98\xa8\xe5\xa4\x9c",
 628  		"\xe4\xbb\x8a\xe6\x9c\x9d",
 629  		"\xe4\xbb\x8a\xe6\x99\xa9",
 630  		"\xe4\xbb\x8a\xe5\xa4\x9c",
 631  		"\xe6\x9d\xa5\xe9\x80\xb1",
 632  		"\xe5\x85\x88\xe9\x80\xb1",
 633  		"\xe6\x9d\xa5\xe6\x9c\x88",
 634  		"\xe5\x85\x88\xe6\x9c\x88",
 635  		"\xe6\x9d\xa5\xe5\xb9\xb4",
 636  		"\xe5\x8e\xbb\xe5\xb9\xb4",
 637  		"\xe6\xaf\x8e\xe6\x97\xa5",
 638  		"\xe6\xaf\x8e\xe9\x80\xb1",
 639  		"\xe6\xaf\x8e\xe6\x9c\x88",
 640  		"\xe6\xaf\x8e\xe5\xb9\xb4",
 641  		"\xe6\x9c\x9d",
 642  		"\xe6\x98\xbc",
 643  		"\xe5\xa4\x9c",
 644  		"\xe5\xa4\x95\xe6\x96\xb9",
 645  		"\xe3\x81\x84\xe3\x81\xa4":
 646  		return true
 647  	}
 648  	return false
 649  }
 650  
 651  // isJARelationalNoun returns true if s is a relational noun used in the
 652  // の-relational-noun-に locative compound pattern (箱の中に = "in the box").
 653  // Closed set; expansions belong here.
 654  func isJARelationalNoun(s string) bool {
 655  	switch s {
 656  	// 中 (inside), 内 (within), 外 (outside), 上 (on/above), 下 (under),
 657  	// 前 (in front), 後 (behind), 横 (beside), 隣 (next to), 間 (between).
 658  	case "\xe4\xb8\xad",
 659  		"\xe5\x86\x85",
 660  		"\xe5\xa4\x96",
 661  		"\xe4\xb8\x8a",
 662  		"\xe4\xb8\x8b",
 663  		"\xe5\x89\x8d",
 664  		"\xe5\xbe\x8c",
 665  		"\xe6\xa8\xaa",
 666  		"\xe9\x9a\xa3",
 667  		"\xe9\x96\x93":
 668  		return true
 669  	}
 670  	return false
 671  }
 672  
 673  // jaRelationalNounToOblRole maps a relational noun to its semantic role.
 674  // Most map to ORLoc with implicit "inside" semantics for the EN renderer;
 675  // position-specific (on, under, in-front) would need a finer-grained
 676  // representation. For now all locative-compounds collapse to ORLoc.
 677  func jaRelationalNounToOblRole(s string) uint8 {
 678  	if isJARelationalNoun(s) {
 679  		return ORLoc
 680  	}
 681  	return ORNone
 682  }
 683  
 684  // endsInNaiSuffix returns true if s ends in ない (negative verb suffix).
 685  // Used to exclude negative-verb forms from predicative-i-adjective detection,
 686  // since ない ends in い (false-matching endsInIKana). な = E3 81 AA, い = E3 81 84.
 687  func endsInNaiSuffix(s string) bool {
 688  	b := []byte(s)
 689  	if len(b) < 6 {
 690  		return false
 691  	}
 692  	tail := b[len(b)-6:]
 693  	return tail[0] == 0xe3 && tail[1] == 0x81 && tail[2] == 0xaa &&
 694  		tail[3] == 0xe3 && tail[4] == 0x81 && tail[5] == 0x84
 695  }
 696  
 697  // endsInTaiSuffix returns true if s ends in たい (desiderative verb suffix).
 698  // Used to exclude desiderative verb forms (食べたい / want to eat) from
 699  // predicative-i-adjective detection. た = E3 81 9F, い = E3 81 84.
 700  func endsInTaiSuffix(s string) bool {
 701  	b := []byte(s)
 702  	if len(b) < 6 {
 703  		return false
 704  	}
 705  	tail := b[len(b)-6:]
 706  	return tail[0] == 0xe3 && tail[1] == 0x81 && tail[2] == 0x9f &&
 707  		tail[3] == 0xe3 && tail[4] == 0x81 && tail[5] == 0x84
 708  }
 709  
 710  // isJABareKanjiAdj returns true if s is a single-kanji color/size adjective
 711  // (赤, 白, 黒, 大, 小, 新, 古, 長, 高, 低). These appear as bare-kanji ATTR
 712  // modifiers in rendered JA when the い-suffix was not included.
 713  func isJABareKanjiAdj(s string) bool {
 714  	if len(s) != 3 {
 715  		return false
 716  	}
 717  	b := []byte(s)
 718  	c1, c2, c3 := b[0], b[1], b[2]
 719  	switch {
 720  	case c1 == 0xe8 && c2 == 0xb5 && c3 == 0xa4: // 赤
 721  	case c1 == 0xe7 && c2 == 0x99 && c3 == 0xbd: // 白
 722  	case c1 == 0xe9 && c2 == 0xbb && c3 == 0x92: // 黒
 723  	case c1 == 0xe5 && c2 == 0xa4 && c3 == 0xa7: // 大
 724  	case c1 == 0xe5 && c2 == 0xb0 && c3 == 0x8f: // 小
 725  	case c1 == 0xe6 && c2 == 0x96 && c3 == 0xb0: // 新
 726  	case c1 == 0xe5 && c2 == 0x8f && c3 == 0xa4: // 古
 727  	case c1 == 0xe9 && c2 == 0x95 && c3 == 0xb7: // 長
 728  	case c1 == 0xe9 && c2 == 0xab && c3 == 0x98: // 高
 729  	case c1 == 0xe4 && c2 == 0xbd && c3 == 0x8e: // 低
 730  	default:
 731  		return false
 732  	}
 733  	return true
 734  }
 735  
 736  // endsInTaKana returns true if the last UTF-8 character of s is the hiragana た.
 737  // Used to detect past-tense verb forms preceding a head noun (REL clause):
 738  // 食べた猫 (the cat that ate), 見た本 (the book I saw). The morphological
 739  // splitter exposes the boundary; this checks that the prior slot is a
 740  // past-tense verb. False positives: nouns ending in た are uncommon.
 741  func endsInTaKana(s string) bool {
 742  	b := []byte(s)
 743  	if len(b) < 3 {
 744  		return false
 745  	}
 746  	// た is UTF-8 E3 81 9F
 747  	tail := b[len(b)-3:]
 748  	return tail[0] == 0xe3 && tail[1] == 0x81 && tail[2] == 0x9f
 749  }
 750  
 751  // endsInKuKana returns true if the last UTF-8 character of s is the hiragana く.
 752  // Used as a heuristic for く-form adverbial detection: 速く (hayaku=quickly),
 753  // 高く (takaku=highly), 楽しく (tanoshiku=enjoyably). The く-form is the
 754  // adverbial inflection of an i-adjective and binds to the following verb.
 755  func endsInKuKana(s string) bool {
 756  	b := []byte(s)
 757  	if len(b) < 3 {
 758  		return false
 759  	}
 760  	// く is UTF-8 E3 81 8F
 761  	tail := b[len(b)-3:]
 762  	return tail[0] == 0xe3 && tail[1] == 0x81 && tail[2] == 0x8f
 763  }
 764  
 765  // stripJAConjugation matches the longest known conjugation suffix.
 766  // Returns (remaining bytes, morph bits, stripped flag).
 767  //
 768  // Order matters: longest forms first. Passive (れる/られる) and causative
 769  // (せる/させる) must be checked before simpler suffix endings to avoid
 770  // premature shorter matches.
 771  func stripJAConjugation(b []byte) ([]byte, uint16, bool) {
 772  	// Passive/causative past combinations.
 773  	// Guard: if the stem after stripping is bare kana (isBareSuffix), the
 774  	// match is likely an ichidan verb whose stem ends in れ/せ colliding
 775  	// with the passive/causative suffix (くれた = くれ+た, not く+れた).
 776  	// Reject and fall through to simpler suffixes.
 777  	if rest, ok := stripJASuffix(b, jaSeraretaPast); ok && !isBareSuffix(rest) {
 778  		return rest, MetaCausative | MetaPassive | MetaTensePast, true
 779  	}
 780  	if rest, ok := stripJASuffix(b, jaRaretaPast); ok && !isBareSuffix(rest) {
 781  		return rest, MetaPassive | MetaTensePast, true
 782  	}
 783  	if rest, ok := stripJASuffix(b, jaSasetaPast); ok && !isBareSuffix(rest) {
 784  		return rest, MetaCausative | MetaTensePast, true
 785  	}
 786  	// Passive/causative non-past
 787  	if rest, ok := stripJASuffix(b, jaSerareru); ok && !isBareSuffix(rest) {
 788  		return rest, MetaCausative | MetaPassive, true
 789  	}
 790  	if rest, ok := stripJASuffix(b, jaRareru); ok && !isBareSuffix(rest) {
 791  		return rest, MetaPassive, true
 792  	}
 793  	if rest, ok := stripJASuffix(b, jaSaseru); ok && !isBareSuffix(rest) {
 794  		return rest, MetaCausative, true
 795  	}
 796  	if rest, ok := stripJASuffix(b, jaReru); ok && !isBareSuffix(rest) {
 797  		return rest, MetaPassive, true
 798  	}
 799  	if rest, ok := stripJASuffix(b, jaSeru); ok && !isBareSuffix(rest) {
 800  		return rest, MetaCausative, true
 801  	}
 802  	if rest, ok := stripJASuffix(b, jaRetaPast); ok && !isBareSuffix(rest) {
 803  		return rest, MetaPassive | MetaTensePast, true
 804  	}
 805  	if rest, ok := stripJASuffix(b, jaSetaPast); ok && !isBareSuffix(rest) {
 806  		return rest, MetaCausative | MetaTensePast, true
 807  	}
 808  	if rest, ok := stripJASuffix(b, jaNakatta); ok {
 809  		return rest, MetaPolarNeg | MetaTensePast, true
 810  	}
 811  	// Desiderative (たい family) - before ない/た to avoid partial match.
 812  	if rest, ok := stripJASuffix(b, jaTakunai); ok {
 813  		return rest, MetaMoodVol | MetaPolarNeg, true
 814  	}
 815  	if rest, ok := stripJASuffix(b, jaTakatta); ok {
 816  		return rest, MetaMoodVol | MetaTensePast, true
 817  	}
 818  	if rest, ok := stripJASuffix(b, jaTai); ok {
 819  		return rest, MetaMoodVol, true
 820  	}
 821  	if rest, ok := stripJASuffix(b, jaMashita); ok {
 822  		return rest, MetaTensePast | MetaFormalityPol, true
 823  	}
 824  	if rest, ok := stripJASuffix(b, jaMasen); ok {
 825  		return rest, MetaPolarNeg | MetaFormalityPol, true
 826  	}
 827  	if rest, ok := stripJASuffix(b, jaTeiru); ok {
 828  		return rest, MetaAspectProg, true
 829  	}
 830  	if rest, ok := stripJASuffix(b, jaTeita); ok {
 831  		return rest, MetaAspectProg | MetaTensePast, true
 832  	}
 833  	if rest, ok := stripJASuffix(b, jaDeiru); ok {
 834  		return rest, MetaAspectProg, true
 835  	}
 836  	if rest, ok := stripJASuffix(b, jaDeita); ok {
 837  		return rest, MetaAspectProg | MetaTensePast, true
 838  	}
 839  	if rest, ok := stripJASuffix(b, jaMasu); ok {
 840  		return rest, MetaFormalityPol, true
 841  	}
 842  	if rest, ok := stripJASuffix(b, jaNai); ok {
 843  		return rest, MetaPolarNeg, true
 844  	}
 845  	// Godan volitional compounds (longest first; each is お-row kana + う).
 846  	// These must match before bare う suffixes or particles.
 847  	if rest, ok := stripJASuffix(b, jaKoU); ok {
 848  		return rest, MetaMoodVol, true
 849  	}
 850  	if rest, ok := stripJASuffix(b, jaGoU); ok {
 851  		return rest, MetaMoodVol, true
 852  	}
 853  	if rest, ok := stripJASuffix(b, jaSoU); ok {
 854  		return rest, MetaMoodVol, true
 855  	}
 856  	if rest, ok := stripJASuffix(b, jaToU); ok {
 857  		return rest, MetaMoodVol, true
 858  	}
 859  	if rest, ok := stripJASuffix(b, jaNoU); ok {
 860  		return rest, MetaMoodVol, true
 861  	}
 862  	if rest, ok := stripJASuffix(b, jaBoU); ok {
 863  		return rest, MetaMoodVol, true
 864  	}
 865  	if rest, ok := stripJASuffix(b, jaMoU); ok {
 866  		return rest, MetaMoodVol, true
 867  	}
 868  	if rest, ok := stripJASuffix(b, jaRoU); ok {
 869  		return rest, MetaMoodVol, true
 870  	}
 871  	if rest, ok := stripJASuffix(b, jaOU); ok {
 872  		return rest, MetaMoodVol, true
 873  	}
 874  	if rest, ok := stripJASuffix(b, jaYou); ok {
 875  		return rest, MetaMoodVol, true
 876  	}
 877  	if rest, ok := stripJASuffix(b, jaTe); ok {
 878  		return rest, MetaAspectProg, true
 879  	}
 880  	if rest, ok := stripJASuffix(b, jaDe); ok {
 881  		return rest, MetaAspectProg, true
 882  	}
 883  	if rest, ok := stripJASuffix(b, jaTa); ok {
 884  		return rest, MetaTensePast, true
 885  	}
 886  	if rest, ok := stripJASuffix(b, jaDa); ok {
 887  		return rest, MetaTensePast, true
 888  	}
 889  	if rest, ok := stripJASuffix(b, jaBa); ok {
 890  		// ば: provisional conditional. Strip from lemma; the ClauseIf
 891  		// relation lives at the discourse layer, not on the verb morph.
 892  		return rest, 0, true
 893  	}
 894  	return b, 0, false
 895  }
 896  
 897  const (
 898  	jaTeiru   = "\xe3\x81\xa6\xe3\x81\x84\xe3\x82\x8b"             // ている
 899  	jaTeita   = "\xe3\x81\xa6\xe3\x81\x84\xe3\x81\x9f"             // ていた
 900  	jaDeiru   = "\xe3\x81\xa7\xe3\x81\x84\xe3\x82\x8b"             // でいる
 901  	jaDeita   = "\xe3\x81\xa7\xe3\x81\x84\xe3\x81\x9f"             // でいた
 902  	jaTai     = "\xe3\x81\x9f\xe3\x81\x84"                         // たい
 903  	jaTakatta = "\xe3\x81\x9f\xe3\x81\x8b\xe3\x81\xa3\xe3\x81\x9f" // たかった
 904  	jaTakunai = "\xe3\x81\x9f\xe3\x81\x8f\xe3\x81\xaa\xe3\x81\x84" // たくない
 905  	jaNai     = "\xe3\x81\xaa\xe3\x81\x84"                         // ない
 906  	jaNakatta = "\xe3\x81\xaa\xe3\x81\x8b\xe3\x81\xa3\xe3\x81\x9f" // なかった
 907  	jaMasu    = "\xe3\x81\xbe\xe3\x81\x99"                         // ます
 908  	jaMashita = "\xe3\x81\xbe\xe3\x81\x97\xe3\x81\x9f"             // ました
 909  	jaMasen   = "\xe3\x81\xbe\xe3\x81\x9b\xe3\x82\x93"             // ません
 910  	jaYou     = "\xe3\x82\x88\xe3\x81\x86"                         // よう
 911  	jaTe      = "\xe3\x81\xa6"                                     // て
 912  	jaDe      = "\xe3\x81\xa7"                                     // で
 913  	jaTa      = "\xe3\x81\x9f"                                     // た
 914  	jaDa      = "\xe3\x81\xa0"                                     // だ
 915  	jaBa      = "\xe3\x81\xb0"                                     // ば (conditional)
 916  
 917  	// Passive: れる/られる
 918  	jaRareru     = "\xe3\x82\x89\xe3\x82\x8c\xe3\x82\x8b"                         // られる
 919  	jaReru       = "\xe3\x82\x8c\xe3\x82\x8b"                                     // れる
 920  	jaRaretaPast = "\xe3\x82\x89\xe3\x82\x8c\xe3\x81\x9f"                         // られた
 921  	jaRetaPast   = "\xe3\x82\x8c\xe3\x81\x9f"                                     // れた
 922  
 923  	// Causative: せる/させる
 924  	jaSaseru      = "\xe3\x81\x95\xe3\x81\x9b\xe3\x82\x8b"                         // させる
 925  	jaSeru        = "\xe3\x81\x9b\xe3\x82\x8b"                                     // せる
 926  	jaSasetaPast  = "\xe3\x81\x95\xe3\x81\x9b\xe3\x81\x9f"                        // させた
 927  	jaSetaPast    = "\xe3\x81\x9b\xe3\x81\x9f"                                     // せた
 928  
 929  	// Causative-passive: させられる
 930  	jaSerareru     = "\xe3\x81\x95\xe3\x81\x9b\xe3\x82\x89\xe3\x82\x8c\xe3\x82\x8b" // させられる
 931  	jaSeraretaPast = "\xe3\x81\x95\xe3\x81\x9b\xe3\x82\x89\xe3\x82\x8c\xe3\x81\x9f" // させられた
 932  
 933  	// Volitional compounds
 934  	jaKoU = "\xe3\x81\x93\xe3\x81\x86" // こう (godan-ku)
 935  	jaGoU = "\xe3\x81\x94\xe3\x81\x86" // ごう (godan-gu)
 936  	jaSoU = "\xe3\x81\x9d\xe3\x81\x86" // そう (godan-su)
 937  	jaToU = "\xe3\x81\xa8\xe3\x81\x86" // とう (godan-tsu)
 938  	jaNoU = "\xe3\x81\xae\xe3\x81\x86" // のう (godan-nu)
 939  	jaBoU = "\xe3\x81\xbc\xe3\x81\x86" // ぼう (godan-bu)
 940  	jaMoU = "\xe3\x82\x82\xe3\x81\x86" // もう (godan-mu)
 941  	jaRoU = "\xe3\x82\x8d\xe3\x81\x86" // ろう (godan-ru)
 942  	jaOU  = "\xe3\x81\x8a\xe3\x81\x86" // おう (godan-u)
 943  )
 944  
 945  func stripJASuffix(word, suffix []byte) ([]byte, bool) {
 946  	if len(word) <= len(suffix) {
 947  		return nil, false
 948  	}
 949  	tail := word[len(word)-len(suffix):]
 950  	if string(tail) == string(suffix) {
 951  		return word[:len(word)-len(suffix)], true
 952  	}
 953  	return nil, false
 954  }
 955  
 956  
 957