translate.mx raw

   1  package transdb
   2  
   3  import (
   4  	"git.smesh.lol/iskradb/lattice"
   5  	"git.smesh.lol/transdb/fuzzy"
   6  )
   7  
   8  // FormFromInline extracts the surface form stored in Record.Inline.
   9  // Byte 23 holds the inline length (0 = overflow, data in pool).
  10  func FormFromInline(rec *lattice.Record, pool []byte) string {
  11  	n := int(rec.Inline[23])
  12  	if n > 0 && n <= 23 {
  13  		return string(rec.Inline[:n])
  14  	}
  15  	if rec.DataFile == 1 && rec.DataLen > 0 {
  16  		end := rec.DataOff + rec.DataLen
  17  		if int(end) <= len(pool) {
  18  			return string(pool[rec.DataOff:end])
  19  		}
  20  	}
  21  	return ""
  22  }
  23  
  24  // SetFormOnRecord stores the surface form in Record.Inline (up to 23 bytes)
  25  // or overflows into pool when longer.
  26  func SetFormOnRecord(rec *lattice.Record, form string, pool *[]byte) {
  27  	b := []byte(form)
  28  	if len(b) <= 23 {
  29  		copy(rec.Inline[:], b)
  30  		rec.Inline[23] = byte(len(b))
  31  		rec.DataFile = 0
  32  	} else {
  33  		copy(rec.Inline[:23], b[:23])
  34  		rec.Inline[23] = 0
  35  		rec.DataFile = 1
  36  		rec.DataOff = uint32(len(*pool))
  37  		rec.DataLen = uint32(len(b))
  38  		*pool = append(*pool, b...)
  39  	}
  40  }
  41  
// defaultBranchOrder is the branch search order used when no context-specific
// order applies: noun, then verb, then modifier. It stores the actual branch
// indices (Bnoun=1, Bverb=3, Bmodifier=4) as uint8 values.
var defaultBranchOrder = [3]uint8{uint8(lattice.Bnoun), uint8(lattice.Bverb), uint8(lattice.Bmodifier)}
  44  
  45  // lookupByKey finds all translation candidates for a pre-computed key,
  46  // searching branches in the given order.
  47  func lookupByKey(tree *lattice.Tree, pool []byte, key lattice.Key, order [3]uint8) []string {
  48  	var results []string
  49  	for _, b := range order {
  50  		ri := tree.LookupRecIdx(lattice.Branch(b), key)
  51  		if ri == lattice.NullRec {
  52  			continue
  53  		}
  54  		rec := tree.GetRecord(ri)
  55  		if rec == nil {
  56  			continue
  57  		}
  58  		if rec.Link[0] != lattice.NullRec {
  59  			if dst := tree.GetRecord(rec.Link[0]); dst != nil {
  60  				if form := FormFromInline(dst, pool); form != "" {
  61  					results = appendUniq(results, form)
  62  				}
  63  			}
  64  		}
  65  		if rec.Link[1] != lattice.NullRec {
  66  			if dst := tree.GetRecord(rec.Link[1]); dst != nil {
  67  				if form := FormFromInline(dst, pool); form != "" {
  68  					results = appendUniq(results, form)
  69  				}
  70  			}
  71  		}
  72  		break
  73  	}
  74  	return results
  75  }
  76  
  77  // jaRecordBranch returns the branch of the coord=0 JA record for tok, or 255 if not found.
  78  func jaRecordBranch(tree *lattice.Tree, tok string) uint8 {
  79  	key := MakeKey(LangJA, 0, tok)
  80  	for _, b := range ActiveBranches {
  81  		if tree.LookupRecIdx(b, key) != lattice.NullRec {
  82  			return uint8(b)
  83  		}
  84  	}
  85  	return 255
  86  }
  87  
  88  // LookupWord finds all translation candidates for a single word token (coord=0).
  89  func LookupWord(tree *lattice.Tree, pool []byte, word string, srcLang uint8) []string {
  90  	return lookupByKey(tree, pool, MakeKey(srcLang, 0, word), defaultBranchOrder)
  91  }
  92  
  93  // LookupWordCtx finds translations using the 22-bit coordinate.
  94  // Tries each coordinate in the relaxation sequence (most specific → least specific).
  95  // For JA source, branch order is derived from the cooccurrence axis.
  96  func LookupWordCtx(tree *lattice.Tree, pool []byte, word string, srcLang uint8, coord uint64) []string {
  97  	order := defaultBranchOrder
  98  	if srcLang == LangJA {
  99  		order = branchOrderJA(coord)
 100  	}
 101  	for _, c := range RelaxCoord(coord) {
 102  		if results := lookupByKey(tree, pool, MakeKey(srcLang, c, word), order); len(results) > 0 {
 103  			return results
 104  		}
 105  	}
 106  	return nil
 107  }
 108  
// jaRole constants for syntactic role assignment.
// Each role is a small uint8 tag; the trailing comments list the particles
// associated with the role.
const (
	jaRoleNone = uint8(0)
	jaRoleSubj = uint8(1) // は が
	jaRoleObj  = uint8(2) // を
	jaRoleVerb = uint8(3)
	jaRoleMisc = uint8(4) // everything else
)
 117  
// jaRoleParticle maps particle strings to syntactic roles.
// Only subject (は/が) and object (を) get specific roles;
// other particles collapse to misc.
// A particle that is absent from the map reads back as the zero value,
// which equals jaRoleNone.
var jaRoleParticle = map[string]uint8{
	"は": jaRoleSubj, "が": jaRoleSubj,
	"を": jaRoleObj,
}
 125  
 126  // Translate tokenizes text in srcLang and translates each token to dstLang.
 127  // For JA→EN, applies particle-based role assignment and SOV→SVO reordering.
 128  // Tokens with no translation are passed through unchanged.
 129  func Translate(tree *lattice.Tree, pool []byte, idx *fuzzy.DualIndex,
 130  	text string, srcLang, dstLang uint8, verbose bool) string {
 131  
 132  	var tokens []string
 133  	switch srcLang {
 134  	case LangEN:
 135  		tokens = TokenizeEN(text)
 136  	case LangJA:
 137  		tokens = TokenizeJA(text, tree, verbose)
 138  	default:
 139  		tokens = TokenizeEN(text)
 140  	}
 141  
 142  	if srcLang == LangJA && dstLang == LangEN {
 143  		return translateJAToEN(tree, pool, idx, tokens, verbose)
 144  	}
 145  	return translateTokens(tree, pool, idx, tokens, srcLang, dstLang, verbose)
 146  }
 147  
 148  // translateJAToEN handles JA→EN with two-zone SOV→SVO reordering.
 149  //
 150  // Zone split: は/が divides the sentence into subject zone and predicate zone.
 151  // Within the predicate zone, verb tokens are pulled to the front:
 152  //   SUBJ_ZONE + VERB(s) + REST_OF_PRED_ZONE
 153  //
 154  // This preserves modifier attachment (天皇の歴史的責任感 stays together as the
 155  // subject) while achieving SVO word order for the core clause.
 156  func translateJAToEN(tree *lattice.Tree, pool []byte, idx *fuzzy.DualIndex,
 157  	tokens []string, verbose bool) string {
 158  
 159  	n := len(tokens)
 160  
 161  	// isSkipToken: pure-hiragana particles and copulae get no EN output.
 162  	isSkip := func(tok string) bool {
 163  		if !isPureHiragana(tok) {
 164  			return false
 165  		}
 166  		jaKey := MakeKey(LangJA, 0, tok)
 167  		return tree.LookupRecIdx(lattice.Bmodifier, jaKey) != lattice.NullRec || jaFunctionWord[tok]
 168  	}
 169  
 170  	// lookupMorph returns the translation and MorphState for a JA token.
 171  	// Uses RelaxCoord: tries most-specific coord first, falls back toward coord=0.
 172  	lookupMorph := func(tok string, coord uint64) (string, uint8) {
 173  		order := branchOrderJA(coord)
 174  		for _, c := range RelaxCoord(coord) {
 175  			key := MakeKey(LangJA, c, tok)
 176  			for _, b := range order {
 177  				ri := tree.LookupRecIdx(lattice.Branch(b), key)
 178  				if ri == lattice.NullRec {
 179  					continue
 180  				}
 181  				rec := tree.GetRecord(ri)
 182  				if rec == nil {
 183  					continue
 184  				}
 185  				state := GetMorphState(rec)
 186  				if rec.Link[0] != lattice.NullRec {
 187  					if dst := tree.GetRecord(rec.Link[0]); dst != nil {
 188  						if form := FormFromInline(dst, pool); form != "" {
 189  							return form, state
 190  						}
 191  					}
 192  				}
 193  				break
 194  			}
 195  		}
 196  		return "", 0
 197  	}
 198  
 199  	// translateTok: translate a single JA token using the 22-bit coordinate.
 200  	// The coord encodes both cooccurrence context (prev/next word types) and
 201  	// the morphological state inferred from the token's surface form.
 202  	translateTok := func(i int, tok string) string {
 203  		var prevType, nextType uint8
 204  		if i > 0 {
 205  			prevType = POSTypeFor(POSForWord(tree, LangJA, tokens[i-1]))
 206  		}
 207  		if i+1 < n {
 208  			nextType = POSTypeFor(POSForWord(tree, LangJA, tokens[i+1]))
 209  		}
 210  		morphState := uint64(inferMorphState(tok))
 211  		coord := PackCoord(0, 0, CoordCooccur(prevType, nextType), morphState, 0, 0, 0)
 212  
 213  		if en, state := lookupMorph(tok, coord); en != "" {
 214  			return applyMorphEN(en, state)
 215  		}
 216  
 217  		// Fuzzy fallback.
 218  		if idx != nil {
 219  			var corrected string
 220  			var wasCorrected bool
 221  			var candidates []string
 222  			candidates, corrected, wasCorrected = FuzzyLookupWord(tree, pool, idx, tok, LangJA, 2)
 223  			if verbose && wasCorrected {
 224  				println("fuzzy:", tok, "→", corrected)
 225  			}
 226  			for _, c := range candidates {
 227  				return applyMorphEN(c, 0)
 228  			}
 229  			_ = corrected
 230  		}
 231  
 232  		// verbStems fallback for forms not in lattice.
 233  		if stems := verbStems(tok); len(stems) > 0 {
 234  			for _, stem := range stems {
 235  				stemCoord := PackCoord(0, 0, CoordCooccur(prevType, nextType), morphState, 0, 0, 0)
 236  				if en, _ := lookupMorph(stem, stemCoord); en != "" {
 237  					return applyMorphEN(en, uint8(morphState))
 238  				}
 239  			}
 240  		}
 241  		return tok
 242  	}
 243  
 244  	// Find the first は/が boundary to split subject zone from predicate zone.
 245  	// subjEnd is the index of the は/が particle itself.
 246  	subjEnd := -1
 247  	for i, tok := range tokens {
 248  		if tok == "は" || tok == "が" {
 249  			if isPureHiragana(tok) {
 250  				subjEnd = i
 251  				break
 252  			}
 253  		}
 254  	}
 255  
 256  	// Translate all tokens in JA order, tagging each as subj/verb/pred.
 257  	type word struct {
 258  		en   string
 259  		isV  bool
 260  	}
 261  	var subjWords, predVerbs, predRest []word
 262  
 263  	for i, tok := range tokens {
 264  		if isSkip(tok) {
 265  			continue
 266  		}
 267  		en := translateTok(i, tok)
 268  		if en == "" {
 269  			continue
 270  		}
 271  		w := word{en, isJAVerb(tree, tok)}
 272  		if subjEnd >= 0 && i < subjEnd {
 273  			subjWords = append(subjWords, w)
 274  		} else if w.isV {
 275  			predVerbs = append(predVerbs, w)
 276  		} else {
 277  			predRest = append(predRest, w)
 278  		}
 279  	}
 280  
 281  	// Emit: SUBJ + VERB + REST_OF_PRED (preserves modifier order within each zone).
 282  	var out []byte
 283  	first := true
 284  	emit := func(en string) {
 285  		if !first {
 286  			out = append(out, ' ')
 287  		}
 288  		out = append(out, []byte(en)...)
 289  		first = false
 290  	}
 291  	for _, w := range subjWords {
 292  		emit(w.en)
 293  	}
 294  	for _, w := range predVerbs {
 295  		emit(w.en)
 296  	}
 297  	for _, w := range predRest {
 298  		emit(w.en)
 299  	}
 300  	return string(out)
 301  }
 302  
// translateTokens handles EN→JA and same-language translation (no reordering).
// For EN→JA: operator tokens ("did", "not", "apparently" etc.) accumulate
// morphstate bits and are consumed without output; the next verb is looked up
// at the resulting morphstate in the JA cluster.
//
// Three pieces of state are carried across tokens: pendingMorph (operator bits
// not yet applied), progressiveAux (tense bits of a pending is/was/… auxiliary)
// and subjectSemFlags (semantic flags OR-ed together from every nominal token
// seen so far). Tokens with no translation are passed through unchanged.
// Output words are space-separated only when dstLang is LangEN.
func translateTokens(tree *lattice.Tree, pool []byte, idx *fuzzy.DualIndex,
	tokens []string, srcLang, dstLang uint8, verbose bool) string {

	var out []byte
	pendingMorph := uint8(0)       // accumulated operator bits waiting for a verb
	progressiveAux := uint8(0xFF)  // 0xFF = none; otherwise tense bits from is/was/were
	subjectSemFlags := uint64(0)   // semantic flags from subject nouns seen so far

	for i, tok := range tokens {
		// EN→JA: detect operator tokens (morphstate walk instructions).
		if srcLang == LangEN && dstLang == LangJA {
			if bits, ok := enOperators[tok]; ok {
				pendingMorph |= bits
				continue // operator consumed, no output
			}
			// Progressive auxiliary: "is/am/are/was/were" before a verb+ing.
			// The auxiliary is consumed unconditionally and stays pending until
			// some later token ends in "ing".
			if tenseBits, ok := enProgressiveAux[tok]; ok {
				progressiveAux = tenseBits
				continue
			}
			// Detect "-ing" suffix on a verb when progressive aux is pending.
			// NOTE(review): this is a plain suffix test — any token longer than
			// three bytes ending in "ing" matches, and doubled consonants
			// ("running" → "runn") are not undone.
			if progressiveAux != 0xFF && len(tok) > 3 && tok[len(tok)-3:] == "ing" {
				pendingMorph |= (1 << 3) | progressiveAux // aspect + tense
				progressiveAux = 0xFF
				// Strip "ing" to get base verb for lookup.
				tok = tok[:len(tok)-3]
			}
		}

		var candidates []string
		corrected := tok

		// Cooccurrence context comes from the neighbouring original tokens.
		var prevType, nextType uint8
		if i > 0 {
			prevType = POSTypeFor(POSForWord(tree, srcLang, tokens[i-1]))
		}
		if i+1 < len(tokens) {
			nextType = POSTypeFor(POSForWord(tree, srcLang, tokens[i+1]))
		}

		// Accumulate semantic flags from subject nouns for verb disambiguation.
		// Read flags from the noun's base record DataFile (O(1), no coord scan).
		if srcLang == LangEN && dstLang == LangJA {
			curType := POSTypeFor(POSForWord(tree, srcLang, tok))
			if curType == CooccurNominal { // it's a noun in the EN lattice
				key := MakeKey(LangEN, 0, tok)
				for _, b := range ActiveBranches {
					if ri := tree.LookupRecIdx(b, key); ri != lattice.NullRec {
						if rec := tree.GetRecord(ri); rec != nil {
							subjectSemFlags |= GetSemanticFromDataFile(rec)
						}
						break
					}
				}
			}
		}

		coord := PackCoord(subjectSemFlags, 0, CoordCooccur(prevType, nextType), 0, 0, 0, 0)

		if idx != nil {
			// With a fuzzy index: coord=0 lookup (with spelling correction)
			// first, then prefer context-specific candidates when they exist.
			var wasCorrected bool
			candidates, corrected, wasCorrected = FuzzyLookupWord(tree, pool, idx, tok, srcLang, 2)
			if verbose && wasCorrected {
				println("fuzzy: corrected", tok, "→", corrected)
			}
			if len(candidates) > 0 && coord != 0 {
				if ctxCands := LookupWordCtx(tree, pool, corrected, srcLang, coord); len(ctxCands) > 0 {
					candidates = ctxCands
				}
			}
		} else {
			candidates = LookupWordCtx(tree, pool, tok, srcLang, coord)
		}

		var translated string

		// EN→JA: use lookupENToJA to get JA base + EN record's own MorphState.
		// Combine with pendingMorph (accumulated operator bits) for the target state.
		// Handles both synthetic ("sang" has MorphState=16) and analytical ("did"+"sing").
		// pendingMorph is cleared only when a JA base was found; otherwise the
		// bits stay pending for a later token.
		if srcLang == LangEN && dstLang == LangJA {
			jaBase, enMorphState := lookupENToJA(tree, pool, corrected, coord)
			targetState := pendingMorph | enMorphState
			if jaBase != "" && targetState != 0 {
				if targetForm := lookupJAAtMorphState(tree, pool, jaBase, targetState); targetForm != "" {
					translated = targetForm
				} else {
					translated = jaBase
				}
				pendingMorph = 0
			} else if jaBase != "" {
				translated = jaBase
				pendingMorph = 0
			}
		}

		// Fall back to the first generic candidate, then to the token itself.
		if translated == "" {
			for _, c := range candidates {
				translated = c
				break
			}
		}
		if translated == "" {
			translated = tok
		}
		if len(out) > 0 && dstLang == LangEN {
			out = append(out, ' ')
		}
		out = append(out, []byte(translated)...)
	}
	return string(out)
}
 418  
 419  // lookupENToJA finds the JA base form and the EN record's MorphState for a
 420  // given EN token. Tries the word as-is, then with "to " prefix (JMdict gloss
 421  // format). The MorphState on the EN record drives JA cluster navigation:
 422  // "sang" has MorphState=16 pointing to 歌う, so we navigate to 歌った.
 423  func lookupENToJA(tree *lattice.Tree, pool []byte, word string, coord uint64) (jaBase string, morphState uint8) {
 424  	order := defaultBranchOrder
 425  	for _, tryWord := range []string{word, "to " | word} {
 426  		for _, c := range RelaxCoord(coord) {
 427  			key := MakeKey(LangEN, c, tryWord)
 428  			for _, b := range order {
 429  				ri := tree.LookupRecIdx(lattice.Branch(b), key)
 430  				if ri == lattice.NullRec {
 431  					continue
 432  				}
 433  				rec := tree.GetRecord(ri)
 434  				if rec == nil {
 435  					continue
 436  				}
 437  				state := GetMorphState(rec)
 438  				if rec.Link[0] == lattice.NullRec {
 439  					break
 440  				}
 441  				dst := tree.GetRecord(rec.Link[0])
 442  				if dst == nil {
 443  					break
 444  				}
 445  				if form := FormFromInline(dst, pool); form != "" {
 446  					return form, state
 447  				}
 448  				break
 449  			}
 450  		}
 451  	}
 452  	return "", 0
 453  }
 454  
 455  // enToJABase is the legacy wrapper used by the operator path.
 456  func enToJABase(tree *lattice.Tree, pool []byte, enWord string) string {
 457  	base, _ := lookupENToJA(tree, pool, enWord, 0)
 458  	return base
 459  }
 460  
 461  // lookupJAAtMorphState finds the surface form of jaBase at the given morphstate.
 462  // Uses stored verb class from Bcooccur (O(1)) when available; falls back to
 463  // trying each conjugation class in priority order (O(classes)).
 464  func lookupJAAtMorphState(tree *lattice.Tree, pool []byte, jaBase string, targetState uint8) string {
 465  	tryForm := func(targetForm string) bool {
 466  		if targetForm == "" {
 467  			return false
 468  		}
 469  		key := MakeKey(LangJA, 0, targetForm)
 470  		for _, b := range ActiveBranches {
 471  			if tree.LookupRecIdx(lattice.Branch(b), key) != lattice.NullRec {
 472  				return true
 473  			}
 474  		}
 475  		return false
 476  	}
 477  
 478  	// Fast path: stored verb class from inflect.mx registration.
 479  	// When the class is known, the computed form is authoritative — return it
 480  	// even if not pre-stored in the lattice.
 481  	if class, ok := GetVerbClass(tree, LangJA, jaBase); ok {
 482  		if f := InflectJA(jaBase, class, targetState); f != "" {
 483  			return f
 484  		}
 485  	}
 486  
 487  	// Fallback: try each class in priority order (pre-inflect data or unknown class)
 488  	classOrder := []string{
 489  		"v1", "v5k", "v5s", "v5m", "v5b", "v5r", "v5t", "v5u", "v5g", "v5n", "vs", "vk",
 490  	}
 491  	for _, class := range classOrder {
 492  		forms := BuildVerbForms(jaBase, class)
 493  		if len(forms) == 0 {
 494  			continue
 495  		}
 496  		targetForm, ok := forms[targetState]
 497  		if !ok || targetForm == "" {
 498  			continue
 499  		}
 500  		if tryForm(targetForm) {
 501  			return targetForm
 502  		}
 503  	}
 504  	return ""
 505  }
 506  
 507  // TranslateWithClusters uses the five-stage cluster pipeline instead of
 508  // token-by-token translation. Falls back to Translate if lang descriptors
 509  // are not registered (lang-init not yet run).
 510  func TranslateWithClusters(tree *lattice.Tree, pool []byte, text string, srcLang, dstLang uint8, verbose bool) string {
 511  	srcDesc, hasSrc := GetLangDesc(tree, srcLang)
 512  	dstDesc, hasDst := GetLangDesc(tree, dstLang)
 513  	if !hasSrc || !hasDst {
 514  		if verbose {
 515  			println("cluster: lang descriptors not registered, using token-by-token")
 516  		}
 517  		return Translate(tree, pool, nil, text, srcLang, dstLang, verbose)
 518  	}
 519  
 520  	var tokens []string
 521  	switch srcLang {
 522  	case LangEN:
 523  		tokens = TokenizeEN(text)
 524  	case LangJA:
 525  		tokens = TokenizeJA(text, tree, verbose)
 526  	default:
 527  		tokens = TokenizeEN(text)
 528  	}
 529  
 530  	clusters := ParseClusters(tokens, tree, srcLang)
 531  	for _, c := range clusters {
 532  		TranslateCluster(c, tree, pool, srcLang, dstLang)
 533  	}
 534  	reordered := ReorderClusters(clusters, srcDesc.Order, dstDesc.Order)
 535  	return InsertMarkers(reordered, dstDesc, dstLang)
 536  }
 537  
 538  // BuildWordIndex extracts all words from the lattice and builds BK-trees
 539  // for fuzzy matching. Call once after loading the DB.
 540  // Returns a *fuzzy.DualIndex with EN words in A and JA words in B.
 541  func BuildWordIndex(tree *lattice.Tree, pool []byte) *fuzzy.DualIndex {
 542  	var enWords, jaWords []string
 543  	for recIdx := range tree.RecKey {
 544  		rec := tree.GetRecord(recIdx)
 545  		if rec == nil {
 546  			continue
 547  		}
 548  		form := FormFromInline(rec, pool)
 549  		if form == "" {
 550  			continue
 551  		}
 552  		switch Detect(form) {
 553  		case LangEN:
 554  			enWords = append(enWords, form)
 555  		case LangJA:
 556  			jaWords = append(jaWords, form)
 557  		}
 558  	}
 559  	return fuzzy.NewDualIndex(fuzzy.Build(enWords), fuzzy.Build(jaWords))
 560  }
 561  
 562  // FuzzyLookupWord attempts a translation with fuzzy fallback on exact miss.
 563  // Returns (translations, correctedForm, wasCorrected).
 564  func FuzzyLookupWord(tree *lattice.Tree, pool []byte, idx *fuzzy.DualIndex,
 565  	word string, srcLang uint8, maxDist int) ([]string, string, bool) {
 566  
 567  	results := LookupWord(tree, pool, word, srcLang)
 568  	if len(results) > 0 {
 569  		return results, word, false
 570  	}
 571  	if idx == nil {
 572  		return nil, word, false
 573  	}
 574  
 575  	var matches []fuzzy.Match
 576  	switch srcLang {
 577  	case LangEN:
 578  		matches = idx.SuggestA(word, maxDist, 3)
 579  	case LangJA:
 580  		matches = idx.SuggestB(word, maxDist, 3)
 581  	}
 582  	if len(matches) == 0 {
 583  		return nil, word, false
 584  	}
 585  
 586  	best := matches[0].Word
 587  	results = LookupWord(tree, pool, best, srcLang)
 588  	if len(results) > 0 {
 589  		return results, best, true
 590  	}
 591  	return nil, word, false
 592  }
 593  
 594  // stripTo removes a leading "to " from a JMdict verb gloss ("to eat" → "eat").
 595  func stripTo(s string) string {
 596  	if len(s) > 3 && s[:3] == "to " {
 597  		return s[3:]
 598  	}
 599  	return s
 600  }
 601  
 602  // applyMorphEN maps a 5-bit MorphState onto EN tense/aspect/polarity markers.
 603  // Formality (bit1) has no EN grammatical effect. Evidentiality (bit0) → "apparently".
 604  // Strips JMdict "to " prefix before applying operators.
 605  func applyMorphEN(base string, state uint8) string {
 606  	v := stripTo(base) // "to eat" → "eat"
 607  	if state == 0 {
 608  		return v
 609  	}
 610  	past   := (state>>4)&1 == 1 // bit 4
 611  	prog   := (state>>3)&1 == 1 // bit 3
 612  	neg    := (state>>2)&1 == 1 // bit 2
 613  	evid   := state&1 == 1      // bit 0
 614  
 615  	prefix := ""
 616  	if evid {
 617  		prefix = "apparently "
 618  	}
 619  	switch {
 620  	case past && prog && neg:
 621  		return prefix | "wasn't " | v | "ing"
 622  	case past && prog:
 623  		return prefix | "was " | v | "ing"
 624  	case past && neg:
 625  		return prefix | "didn't " | v
 626  	case past:
 627  		return prefix | "did " | v
 628  	case prog && neg:
 629  		return prefix | "isn't " | v | "ing"
 630  	case prog:
 631  		return prefix | "is " | v | "ing"
 632  	case neg:
 633  		return prefix | "don't " | v
 634  	default:
 635  		return prefix | v // polite present, no EN marker
 636  	}
 637  }
 638  
// enOperators maps EN words to the morphstate bits they set.
// These are not content words — they are lattice walk operators.
// bit 4 = tense(past), bit 3 = aspect(progressive), bit 2 = polarity(negative),
// bit 0 = evidentiality(reported).
// Contracted forms set every bit they imply at once (e.g. "wasn't" sets past,
// progressive and negative together).
var enOperators = map[string]uint8{
	"did":         1 << 4, // past
	"didn't":      (1 << 4) | (1 << 2), // past + negative
	"not":         1 << 2, // negative
	"don't":       1 << 2,
	"doesn't":     1 << 2,
	"wasn't":      (1 << 4) | (1 << 3) | (1 << 2),
	"weren't":     (1 << 4) | (1 << 3) | (1 << 2),
	"apparently":  1 << 0, // evidential
	"reportedly":  1 << 0,
	"supposedly":  1 << 0,
	"allegedly":   1 << 0,
}
 656  
// enProgressiveAux maps "is/are/am/was/were" to their tense bits
// (0 for present, bit 4 for past).
// Combined with an -ing verb, they set the aspect bit.
var enProgressiveAux = map[string]uint8{
	"is": 0, "are": 0, "am": 0,
	"was": 1 << 4, "were": 1 << 4,
}
 663  
 664  // isJAVerb returns true if tok is a verb in the lattice, either as a dictionary
 665  // IsJAVerb exports isJAVerb for use by the propagation command.
 666  func IsJAVerb(tree *lattice.Tree, tok string) bool { return isJAVerb(tree, tok) }
 667  
 668  // form (Bverb) or as a conjugated form whose stem is a Bverb record.
 669  func isJAVerb(tree *lattice.Tree, tok string) bool {
 670  	if jaRecordBranch(tree, tok) == uint8(lattice.Bverb) {
 671  		return true
 672  	}
 673  	for _, stem := range verbStems(tok) {
 674  		if tree.LookupRecIdx(lattice.Bverb, MakeKey(LangJA, 0, stem)) != lattice.NullRec {
 675  			return true
 676  		}
 677  	}
 678  	return false
 679  }
 680  
 681  // inferMorphState estimates the MorphState from a conjugated JA token's suffix.
 682  // Used as fallback when the form isn't in the lattice (verbStems path).
 683  func inferMorphState(tok string) uint8 {
 684  	hs := func(suf string) bool {
 685  		return len(tok) >= len(suf) && tok[len(tok)-len(suf):] == suf
 686  	}
 687  	// Progressive past
 688  	if hs("ていなかった") || hs("でいなかった") { return MorphPastProgNeg }
 689  	if hs("ていました") || hs("でいました") { return MorphPastProgPolite }
 690  	if hs("ていた") || hs("でいた") { return MorphPastProgPlain }
 691  	// Progressive present
 692  	if hs("ていない") || hs("でいない") { return MorphPresProgNeg }
 693  	if hs("ています") || hs("でいます") { return MorphPresProgPolite }
 694  	if hs("ている") || hs("でいる") { return MorphPresProgPlain }
 695  	// Past
 696  	if hs("ませんでした") { return MorphPastNegPolite }
 697  	if hs("なかった") { return MorphPastNegPlain }
 698  	if hs("ました") { return MorphPastAffPolite }
 699  	if hs("そうだ") {
 700  		// reported: check if stem is past
 701  		inner := tok[:len(tok)-len("そうだ")]
 702  		if len(inner) > 0 {
 703  			last := inner[len(inner)-3:]
 704  			if last == "た" || last == "だ" { return MorphPastReported }
 705  		}
 706  		return MorphPresReported
 707  	}
 708  	if hs("た") || hs("だ") { return MorphPastAffPlain }
 709  	// Present negative
 710  	if hs("ません") { return MorphPresNegPolite }
 711  	if hs("ない") { return MorphPresNegPlain }
 712  	// Present polite
 713  	if hs("ます") { return MorphPresAffPolite }
 714  	return MorphPresAffPlain
 715  }
 716  
 717  // verbStems strips common Japanese conjugation suffixes and returns dictionary-
 718  // form candidates to try against the lattice. Longer suffixes checked first.
 719  // Returns nil if no suffix pattern recognized.
 720  func verbStems(tok string) []string {
 721  	if len(tok) == 0 {
 722  		return nil
 723  	}
 724  	hs := func(suf string) bool {
 725  		return len(tok) > len(suf) && tok[len(tok)-len(suf):] == suf
 726  	}
 727  	st := func(suf string) string {
 728  		return tok[:len(tok)-len(suf)]
 729  	}
 730  	// 9-byte (3-char) patterns
 731  	if hs("ている") {
 732  		s := st("ている")
 733  		return []string{s | "る", s | "く"}
 734  	}
 735  	// 6-byte (2-char) patterns — godan sound changes
 736  	if hs("いた") { return []string{st("いた") | "く"} }
 737  	if hs("いだ") { return []string{st("いだ") | "ぐ"} }
 738  	if hs("した") { s := st("した"); return []string{s | "す", s | "する"} }
 739  	if hs("んだ") { s := st("んだ"); return []string{s | "む", s | "ぬ", s | "ぶ"} }
 740  	if hs("った") { s := st("った"); return []string{s | "つ", s | "う", s | "る"} }
 741  	if hs("いて") { return []string{st("いて") | "く"} }
 742  	if hs("いで") { return []string{st("いで") | "ぐ"} }
 743  	if hs("して") { s := st("して"); return []string{s | "す", s | "する"} }
 744  	if hs("んで") { s := st("んで"); return []string{s | "む", s | "ぬ", s | "ぶ"} }
 745  	if hs("って") { s := st("って"); return []string{s | "つ", s | "う", s | "る"} }
 746  	if hs("ない") { s := st("ない"); return []string{s | "る", s | "う"} }
 747  	// 3-byte (1-char) — ichidan plain past only.
 748  	// bare て is a connective te-form (食べて+いる), NOT a standalone verb form;
 749  	// including it causes the tokenizer to split 食べていた as 食べて+い+た.
 750  	if hs("た") { return []string{st("た") | "る"} }
 751  	return nil
 752  }
 753  
 754  // isPureHiragana returns true if every codepoint in s is in U+3040-U+309F (hiragana).
 755  // Particles are always pure hiragana; kanji-containing words are content words.
 756  func isPureHiragana(s string) bool {
 757  	if len(s) == 0 {
 758  		return false
 759  	}
 760  	for i := 0; i < len(s); {
 761  		if i+2 >= len(s) {
 762  			return false
 763  		}
 764  		// Hiragana block: U+3040-U+309F = E3 81 80 – E3 82 9F
 765  		if s[i] != 0xE3 {
 766  			return false
 767  		}
 768  		b1 := s[i+1]
 769  		b2 := s[i+2]
 770  		if b1 == 0x81 && b2 >= 0x80 {
 771  			// U+3040-U+307F ✓
 772  		} else if b1 == 0x82 && b2 <= 0x9F {
 773  			// U+3080-U+309F ✓
 774  		} else {
 775  			return false
 776  		}
 777  		i += 3
 778  	}
 779  	return true
 780  }
 781  
// jaFunctionWord: particles, copulae, and auxiliaries that are structural
// fork labels, not content. Includes entries removed from the lattice by the
// IsFunction() filter at ingest (prt/cop/aux POS codes).
// Used as a set: a lookup yields true for listed words and false otherwise.
var jaFunctionWord = map[string]bool{
	// copulae and auxiliaries
	"だ": true, "です": true, "でした": true,
	"ない": true, "ぬ": true, "ん": true,
	"ます": true, "ません": true, "ました": true,
	// particles (no longer in lattice — removed by IsFunction filter)
	"は": true, "が": true, "を": true,
	"に": true, "で": true, "と": true,
	"も": true, "や": true, "か": true,
	"の": true, "から": true, "まで": true,
	"より": true, "など": true, "ね": true,
	"よ": true, "さ": true, "な": true,
	"わ": true, "ぞ": true, "ぜ": true,
	"て": true, "た": true,
}
 800  
 801  // VerbLemma returns a single-step approximation of the dictionary form for
 802  // a JA verb surface form via verbStems. Used for morph-stats grouping.
 803  func VerbLemma(form string) string {
 804  	stems := verbStems(form)
 805  	if len(stems) > 0 {
 806  		return stems[0]
 807  	}
 808  	return form
 809  }
 810  
 811  func appendUniq(s []string, v string) []string {
 812  	for _, x := range s {
 813  		if x == v {
 814  			return s
 815  		}
 816  	}
 817  	return append(s, v)
 818  }
 819