package transdb

import (
	"git.smesh.lol/iskradb/lattice"
	"git.smesh.lol/transdb/fuzzy"
)

// FormFromInline returns the surface form stored on rec.
//
// Byte 23 of Record.Inline holds the inline length. A value in 1..23 means the
// form is the first n bytes of Inline. Any other value is treated as overflow:
// the form is then read from pool[DataOff : DataOff+DataLen], but only when
// DataFile == 1 and DataLen > 0.
//
// Returns "" when no form is stored or when the overflow range does not fit
// inside pool.
func FormFromInline(rec *lattice.Record, pool []byte) string {
	if n := int(rec.Inline[23]); n > 0 && n <= 23 {
		return string(rec.Inline[:n])
	}
	if rec.DataFile != 1 || rec.DataLen == 0 {
		return ""
	}
	// Do the bounds arithmetic in 64 bits. In 32 bits DataOff+DataLen can wrap
	// on a corrupt record, slip past the length check with end < start, and
	// then panic in the slice expression below.
	start := uint64(rec.DataOff)
	end := start + uint64(rec.DataLen)
	if end > uint64(len(pool)) {
		return ""
	}
	return string(pool[start:end])
}

// SetFormOnRecord writes form onto rec.
//
// Forms of at most 23 bytes are copied into Record.Inline with their length in
// byte 23 and DataFile set to 0; DataOff and DataLen are left untouched.
// Longer forms keep their first 23 bytes in Inline as a prefix, set byte 23 to
// 0 (the overflow marker), set DataFile to 1, and append the complete form to
// *pool, recording its offset and length in DataOff / DataLen.
func SetFormOnRecord(rec *lattice.Record, form string, pool *[]byte) {
	raw := []byte(form)
	size := len(raw)

	if size > 23 {
		// Overflow: inline prefix plus the full bytes at the end of the pool.
		copy(rec.Inline[:23], raw[:23])
		rec.Inline[23] = 0
		rec.DataFile = 1
		rec.DataOff = uint32(len(*pool))
		rec.DataLen = uint32(size)
		*pool = append(*pool, raw...)
		return
	}

	// Fits inline: payload in the leading bytes, length in the last byte.
	copy(rec.Inline[:], raw)
	rec.Inline[23] = byte(size)
	rec.DataFile = 0
}

// defaultBranchOrder is the branch search order used when no context-specific
// order applies: noun, then verb, then modifier. Entries are the actual
// lattice branch indices (documented as Bnoun=1, Bverb=3, Bmodifier=4; the
// values themselves are defined in the lattice package).
var defaultBranchOrder = [3]uint8{
	uint8(lattice.Bnoun),
	uint8(lattice.Bverb),
	uint8(lattice.Bmodifier),
}

// lookupByKey collects the translation candidates reachable from key.
//
// Branches are probed in the given order. The first branch that yields a
// non-nil record is the only one consulted: the forms of its Link[0] and
// Link[1] targets (when set, resolvable and non-empty) are gathered without
// duplicates, and the search stops there even if no form was found.
func lookupByKey(tree *lattice.Tree, pool []byte, key lattice.Key, order [3]uint8) []string {
	var found []string
	for _, branch := range order {
		recIdx := tree.LookupRecIdx(lattice.Branch(branch), key)
		if recIdx == lattice.NullRec {
			continue
		}
		src := tree.GetRecord(recIdx)
		if src == nil {
			continue
		}
		// Only the first two link slots carry translation targets here.
		for slot := 0; slot < 2; slot++ {
			target := src.Link[slot]
			if target == lattice.NullRec {
				continue
			}
			dst := tree.GetRecord(target)
			if dst == nil {
				continue
			}
			if form := FormFromInline(dst, pool); form != "" {
				found = appendUniq(found, form)
			}
		}
		// First matching branch wins; later branches are not searched.
		break
	}
	return found
}

// jaRecordBranch reports which branch holds the coord=0 JA record for tok.
// ActiveBranches is scanned in order and the first branch with a record wins;
// 255 is returned when no branch has one.
func jaRecordBranch(tree *lattice.Tree, tok string) uint8 {
	const notFound = 255

	baseKey := MakeKey(LangJA, 0, tok)
	for _, branch := range ActiveBranches {
		if tree.LookupRecIdx(branch, baseKey) == lattice.NullRec {
			continue
		}
		return uint8(branch)
	}
	return notFound
}

// LookupWord returns the translation candidates for a single word token,
// looked up at coord=0 using the default branch order.
func LookupWord(tree *lattice.Tree, pool []byte, word string, srcLang uint8) []string {
	baseKey := MakeKey(srcLang, 0, word)
	return lookupByKey(tree, pool, baseKey, defaultBranchOrder)
}

// LookupWordCtx returns translations for word using the packed coordinate.
//
// The coordinates produced by RelaxCoord(coord) are tried in sequence (most
// specific first) and the first one that yields any candidate is returned.
// For a JA source the branch order comes from branchOrderJA(coord); every
// other language uses defaultBranchOrder. Returns nil when nothing matches.
func LookupWordCtx(tree *lattice.Tree, pool []byte, word string, srcLang uint8, coord uint64) []string {
	order := defaultBranchOrder
	if srcLang == LangJA {
		order = branchOrderJA(coord)
	}

	relaxed := RelaxCoord(coord)
	for i := range relaxed {
		key := MakeKey(srcLang, relaxed[i], word)
		hits := lookupByKey(tree, pool, key, order)
		if len(hits) > 0 {
			return hits
		}
	}
	return nil
}

// Syntactic roles assigned to JA tokens. All values are uint8.
const (
	jaRoleNone uint8 = 0 // no role assigned
	jaRoleSubj uint8 = 1 // marked by は or が
	jaRoleObj  uint8 = 2 // marked by を
	jaRoleVerb uint8 = 3
	jaRoleMisc uint8 = 4 // everything else
)

// jaRoleParticle maps a particle to the syntactic role it marks. Only the
// subject markers (は, が) and the object marker (を) are listed; any other
// particle is absent from the map and is expected to collapse to misc.
var jaRoleParticle = map[string]uint8{
	// subject markers
	"は": jaRoleSubj,
	"が": jaRoleSubj,
	// object marker
	"を": jaRoleObj,
}

// Translate tokenizes text according to srcLang and translates it to dstLang.
//
// JA input goes through TokenizeJA; EN and any unrecognized source language go
// through TokenizeEN. The JA→EN pair is handled by translateJAToEN
// (particle-based zone split and SOV→SVO reordering); every other pair is
// translated token by token via translateTokens. Tokens with no translation
// are passed through unchanged.
func Translate(tree *lattice.Tree, pool []byte, idx *fuzzy.DualIndex,
	text string, srcLang, dstLang uint8, verbose bool) string {

	var tokens []string
	if srcLang == LangJA {
		tokens = TokenizeJA(text, tree, verbose)
	} else {
		// LangEN and every other language share the EN tokenizer.
		tokens = TokenizeEN(text)
	}

	if srcLang != LangJA || dstLang != LangEN {
		return translateTokens(tree, pool, idx, tokens, srcLang, dstLang, verbose)
	}
	return translateJAToEN(tree, pool, idx, tokens, verbose)
}

// translateJAToEN translates tokenized Japanese into English with a two-zone
// SOV→SVO reordering.
//
// The first は or が token splits the sentence: tokens before it form the
// subject zone, tokens from it onward form the predicate zone. Output is
//
//	SUBJ_ZONE + VERB(s) + REST_OF_PRED_ZONE
//
// with the original order kept inside each group, so a modifier chain such as
// 天皇の歴史的責任感 stays together as the subject. Without a は/が boundary
// the whole sentence is treated as predicate zone.
//
// Each token is resolved in this order: coordinate lookup in the lattice,
// fuzzy lookup (only when idx is non-nil), dictionary-form candidates from
// verbStems, and finally the untranslated token itself. Pure-hiragana tokens
// that have a coord=0 Bmodifier record or appear in jaFunctionWord produce no
// output at all.
func translateJAToEN(tree *lattice.Tree, pool []byte, idx *fuzzy.DualIndex,
	tokens []string, verbose bool) string {

	total := len(tokens)

	// skippable: particles and copulae that get no EN output.
	skippable := func(tok string) bool {
		if !isPureHiragana(tok) {
			return false
		}
		if tree.LookupRecIdx(lattice.Bmodifier, MakeKey(LangJA, 0, tok)) != lattice.NullRec {
			return true
		}
		return jaFunctionWord[tok]
	}

	// resolve returns the EN form linked from the JA record for tok together
	// with that record's MorphState. Coordinates are relaxed from most to
	// least specific; for each coordinate only the first branch holding a
	// record is consulted.
	resolve := func(tok string, coord uint64) (string, uint8) {
		order := branchOrderJA(coord)
		for _, c := range RelaxCoord(coord) {
			key := MakeKey(LangJA, c, tok)
			for _, b := range order {
				ri := tree.LookupRecIdx(lattice.Branch(b), key)
				if ri == lattice.NullRec {
					continue
				}
				rec := tree.GetRecord(ri)
				if rec == nil {
					continue
				}
				state := GetMorphState(rec)
				if rec.Link[0] == lattice.NullRec {
					break
				}
				dst := tree.GetRecord(rec.Link[0])
				if dst == nil {
					break
				}
				if form := FormFromInline(dst, pool); form != "" {
					return form, state
				}
				break
			}
		}
		return "", 0
	}

	// translateAt translates the token at position i. The coordinate packs
	// the cooccurrence context (POS types of the neighbouring tokens) with
	// the morphological state inferred from the token's surface form.
	translateAt := func(i int, tok string) string {
		var before, after uint8
		if i > 0 {
			before = POSTypeFor(POSForWord(tree, LangJA, tokens[i-1]))
		}
		if i+1 < total {
			after = POSTypeFor(POSForWord(tree, LangJA, tokens[i+1]))
		}
		morph := uint64(inferMorphState(tok))
		coord := PackCoord(0, 0, CoordCooccur(before, after), morph, 0, 0, 0)

		if en, state := resolve(tok, coord); en != "" {
			return applyMorphEN(en, state)
		}

		// Fuzzy fallback: take the first candidate, with no morph markers.
		if idx != nil {
			cands, fixed, changed := FuzzyLookupWord(tree, pool, idx, tok, LangJA, 2)
			if verbose && changed {
				println("fuzzy:", tok, "→", fixed)
			}
			if len(cands) > 0 {
				return applyMorphEN(cands[0], 0)
			}
		}

		// Dictionary-form fallback for conjugated forms missing from the
		// lattice. The stem is looked up at the same coordinate, and the
		// state inferred from the surface form drives the EN markers.
		for _, stem := range verbStems(tok) {
			if en, _ := resolve(stem, coord); en != "" {
				return applyMorphEN(en, uint8(morph))
			}
		}
		return tok
	}

	// Locate the first は/が; its index is the subject/predicate boundary.
	subjEnd := -1
	for i := 0; i < total; i++ {
		if tokens[i] == "は" || tokens[i] == "が" {
			subjEnd = i
			break
		}
	}

	// Translate in JA order, sorting each result into its output group.
	var subject, verbs, rest []string
	for i, tok := range tokens {
		if skippable(tok) {
			continue
		}
		en := translateAt(i, tok)
		if en == "" {
			continue
		}
		isVerb := isJAVerb(tree, tok)
		switch {
		case subjEnd >= 0 && i < subjEnd:
			subject = append(subject, en)
		case isVerb:
			verbs = append(verbs, en)
		default:
			rest = append(rest, en)
		}
	}

	// Emit SUBJ + VERB + REST, separated by single spaces.
	ordered := make([]string, 0, len(subject)+len(verbs)+len(rest))
	ordered = append(ordered, subject...)
	ordered = append(ordered, verbs...)
	ordered = append(ordered, rest...)

	var out []byte
	for k, en := range ordered {
		if k > 0 {
			out = append(out, ' ')
		}
		out = append(out, []byte(en)...)
	}
	return string(out)
}

// translateTokens handles EN→JA and same-language translation (no reordering).
// For EN→JA: operator tokens ("did", "not", "apparently" etc.) accumulate
// morphstate bits and are consumed without output; the next verb is looked up
// at the resulting morphstate in the JA cluster.
//
// Each remaining token is translated independently using a coordinate built
// from the POS types of its neighbours (plus, for EN→JA, the semantic flags of
// the nouns seen so far). A token with no translation is emitted unchanged.
// Output tokens are separated by a single space only when dstLang is LangEN;
// for any other target they are concatenated directly.
//
// idx may be nil, in which case no fuzzy correction is attempted.
func translateTokens(tree *lattice.Tree, pool []byte, idx *fuzzy.DualIndex,
	tokens []string, srcLang, dstLang uint8, verbose bool) string {

	var out []byte
	pendingMorph := uint8(0)       // accumulated operator bits waiting for a verb
	progressiveAux := uint8(0xFF)  // 0xFF = none; otherwise tense bits from is/was/were
	subjectSemFlags := uint64(0)   // semantic flags from subject nouns seen so far

	for i, tok := range tokens {
		// EN→JA: detect operator tokens (morphstate walk instructions).
		if srcLang == LangEN && dstLang == LangJA {
			if bits, ok := enOperators[tok]; ok {
				pendingMorph |= bits
				continue // operator consumed, no output
			}
			// Progressive auxiliary: "is/am/are/was/were" before a verb+ing.
			// NOTE(review): the auxiliary is consumed without output even when
			// no "-ing" token follows, and progressiveAux is only cleared by
			// the next token ending in "ing" — confirm this is intended for
			// copular sentences.
			if tenseBits, ok := enProgressiveAux[tok]; ok {
				progressiveAux = tenseBits
				continue
			}
			// Detect "-ing" suffix on a verb when progressive aux is pending.
			if progressiveAux != 0xFF && len(tok) > 3 && tok[len(tok)-3:] == "ing" {
				pendingMorph |= (1 << 3) | progressiveAux // aspect + tense
				progressiveAux = 0xFF
				// Strip "ing" to get base verb for lookup.
				// This drops exactly three bytes; doubled consonants and a
				// dropped final "e" are not restored.
				tok = tok[:len(tok)-3]
			}
		}

		var candidates []string
		// corrected is the form used for the lattice lookups below; it only
		// differs from tok when the fuzzy lookup substitutes a near match.
		corrected := tok

		// Neighbour POS types come from the original token slice, so they are
		// not affected by the "-ing" stripping applied to tok above.
		var prevType, nextType uint8
		if i > 0 {
			prevType = POSTypeFor(POSForWord(tree, srcLang, tokens[i-1]))
		}
		if i+1 < len(tokens) {
			nextType = POSTypeFor(POSForWord(tree, srcLang, tokens[i+1]))
		}

		// Accumulate semantic flags from subject nouns for verb disambiguation.
		// Read flags from the noun's base record DataFile (O(1), no coord scan).
		if srcLang == LangEN && dstLang == LangJA {
			curType := POSTypeFor(POSForWord(tree, srcLang, tok))
			if curType == CooccurNominal { // it's a noun in the EN lattice
				key := MakeKey(LangEN, 0, tok)
				for _, b := range ActiveBranches {
					if ri := tree.LookupRecIdx(b, key); ri != lattice.NullRec {
						if rec := tree.GetRecord(ri); rec != nil {
							subjectSemFlags |= GetSemanticFromDataFile(rec)
						}
						// Only the first branch holding a record is consulted.
						break
					}
				}
			}
		}

		coord := PackCoord(subjectSemFlags, 0, CoordCooccur(prevType, nextType), 0, 0, 0, 0)

		if idx != nil {
			// With an index: coord=0 lookup with fuzzy correction first, then
			// prefer the context-specific candidates when any exist.
			var wasCorrected bool
			candidates, corrected, wasCorrected = FuzzyLookupWord(tree, pool, idx, tok, srcLang, 2)
			if verbose && wasCorrected {
				println("fuzzy: corrected", tok, "→", corrected)
			}
			if len(candidates) > 0 && coord != 0 {
				if ctxCands := LookupWordCtx(tree, pool, corrected, srcLang, coord); len(ctxCands) > 0 {
					candidates = ctxCands
				}
			}
		} else {
			candidates = LookupWordCtx(tree, pool, tok, srcLang, coord)
		}

		var translated string

		// EN→JA: use lookupENToJA to get JA base + EN record's own MorphState.
		// Combine with pendingMorph (accumulated operator bits) for the target state.
		// Handles both synthetic ("sang" has MorphState=16) and analytical ("did"+"sing").
		if srcLang == LangEN && dstLang == LangJA {
			jaBase, enMorphState := lookupENToJA(tree, pool, corrected, coord)
			targetState := pendingMorph | enMorphState
			if jaBase != "" && targetState != 0 {
				if targetForm := lookupJAAtMorphState(tree, pool, jaBase, targetState); targetForm != "" {
					translated = targetForm
				} else {
					// No inflected form available: fall back to the base form.
					translated = jaBase
				}
				pendingMorph = 0
			} else if jaBase != "" {
				translated = jaBase
				pendingMorph = 0
			}
			// When jaBase is empty, pendingMorph is kept and carries over to
			// the next token.
		}

		// Fallbacks: first generic candidate, then the token itself.
		if translated == "" {
			for _, c := range candidates {
				translated = c
				break
			}
		}
		if translated == "" {
			translated = tok
		}
		if len(out) > 0 && dstLang == LangEN {
			out = append(out, ' ')
		}
		out = append(out, []byte(translated)...)
	}
	return string(out)
}

// lookupENToJA resolves an EN token to its JA base form and returns it with
// the MorphState stored on the EN record.
//
// The word is tried as given and then with a "to " prefix (JMdict gloss
// format). For each spelling the coordinates from RelaxCoord(coord) are tried
// in sequence, and for each coordinate only the first default-order branch
// holding a record is consulted; its Link[0] target supplies the JA form.
// The MorphState drives JA cluster navigation: "sang" carries MorphState=16
// and links to 歌う, so the caller navigates to 歌った.
//
// Returns ("", 0) when no linked form is found.
func lookupENToJA(tree *lattice.Tree, pool []byte, word string, coord uint64) (jaBase string, morphState uint8) {
	branches := defaultBranchOrder
	spellings := []string{word, "to " | word}

	for _, spelling := range spellings {
		for _, c := range RelaxCoord(coord) {
			key := MakeKey(LangEN, c, spelling)
			for _, b := range branches {
				ri := tree.LookupRecIdx(lattice.Branch(b), key)
				if ri == lattice.NullRec {
					continue
				}
				rec := tree.GetRecord(ri)
				if rec == nil {
					continue
				}
				state := GetMorphState(rec)
				if rec.Link[0] != lattice.NullRec {
					if dst := tree.GetRecord(rec.Link[0]); dst != nil {
						if form := FormFromInline(dst, pool); form != "" {
							return form, state
						}
					}
				}
				// A record was found on this branch; do not try the others.
				break
			}
		}
	}
	return "", 0
}

// enToJABase is the legacy wrapper used by the operator path: it performs the
// coord=0 lookupENToJA and discards the MorphState.
func enToJABase(tree *lattice.Tree, pool []byte, enWord string) string {
	jaBase, _ := lookupENToJA(tree, pool, enWord, 0)
	return jaBase
}

// lookupJAAtMorphState returns the surface form of jaBase at targetState.
//
// Fast path: when GetVerbClass knows the verb's class, the form computed by
// InflectJA is treated as authoritative and returned even if it is not stored
// in the lattice. Otherwise each conjugation class is tried in a fixed
// priority order, and a generated form is accepted only if a coord=0 JA record
// for it exists on some active branch. Returns "" when nothing qualifies.
func lookupJAAtMorphState(tree *lattice.Tree, pool []byte, jaBase string, targetState uint8) string {
	// inLattice reports whether form has a coord=0 JA record on any active branch.
	inLattice := func(form string) bool {
		if form == "" {
			return false
		}
		key := MakeKey(LangJA, 0, form)
		for _, b := range ActiveBranches {
			if tree.LookupRecIdx(lattice.Branch(b), key) != lattice.NullRec {
				return true
			}
		}
		return false
	}

	// Known class: trust the computed inflection.
	if class, known := GetVerbClass(tree, LangJA, jaBase); known {
		if inflected := InflectJA(jaBase, class, targetState); inflected != "" {
			return inflected
		}
	}

	// Unknown class (or empty inflection): probe every class in priority order.
	for _, class := range []string{
		"v1", "v5k", "v5s", "v5m", "v5b", "v5r", "v5t", "v5u", "v5g", "v5n", "vs", "vk",
	} {
		forms := BuildVerbForms(jaBase, class)
		if len(forms) == 0 {
			continue
		}
		if candidate, ok := forms[targetState]; ok && candidate != "" && inLattice(candidate) {
			return candidate
		}
	}
	return ""
}

// TranslateWithClusters translates text through the cluster pipeline
// (tokenize, ParseClusters, TranslateCluster, ReorderClusters, InsertMarkers)
// instead of token by token.
//
// When the language descriptor of either side is not registered (lang-init
// not yet run) it falls back to Translate with no fuzzy index.
func TranslateWithClusters(tree *lattice.Tree, pool []byte, text string, srcLang, dstLang uint8, verbose bool) string {
	srcDesc, srcOK := GetLangDesc(tree, srcLang)
	dstDesc, dstOK := GetLangDesc(tree, dstLang)
	if !(srcOK && dstOK) {
		if verbose {
			println("cluster: lang descriptors not registered, using token-by-token")
		}
		return Translate(tree, pool, nil, text, srcLang, dstLang, verbose)
	}

	var tokens []string
	if srcLang == LangJA {
		tokens = TokenizeJA(text, tree, verbose)
	} else {
		// LangEN and every other language share the EN tokenizer.
		tokens = TokenizeEN(text)
	}

	clusters := ParseClusters(tokens, tree, srcLang)
	for _, cluster := range clusters {
		TranslateCluster(cluster, tree, pool, srcLang, dstLang)
	}
	return InsertMarkers(ReorderClusters(clusters, srcDesc.Order, dstDesc.Order), dstDesc, dstLang)
}

// BuildWordIndex gathers the surface form of every record in the lattice and
// builds the fuzzy-matching index from them. Call once after loading the DB.
//
// Forms are routed by Detect: EN words go to side A of the returned
// *fuzzy.DualIndex, JA words to side B. Records with no form, and forms
// detected as any other language, are ignored.
func BuildWordIndex(tree *lattice.Tree, pool []byte) *fuzzy.DualIndex {
	var (
		en []string
		ja []string
	)
	for recIdx := range tree.RecKey {
		rec := tree.GetRecord(recIdx)
		if rec == nil {
			continue
		}
		form := FormFromInline(rec, pool)
		if form == "" {
			continue
		}
		lang := Detect(form)
		if lang == LangEN {
			en = append(en, form)
		} else if lang == LangJA {
			ja = append(ja, form)
		}
	}
	enIndex := fuzzy.Build(en)
	jaIndex := fuzzy.Build(ja)
	return fuzzy.NewDualIndex(enIndex, jaIndex)
}

// FuzzyLookupWord looks word up exactly and, on a miss, retries with the best
// fuzzy suggestion.
//
// Returns (translations, correctedForm, wasCorrected). On an exact hit the
// corrected form is word itself and wasCorrected is false. On a miss up to
// three suggestions within maxDist are requested from idx (side A for EN,
// side B for JA, none for other languages), but only the first one is looked
// up. When that fails too, or idx is nil, the result is (nil, word, false).
func FuzzyLookupWord(tree *lattice.Tree, pool []byte, idx *fuzzy.DualIndex,
	word string, srcLang uint8, maxDist int) ([]string, string, bool) {

	if exact := LookupWord(tree, pool, word, srcLang); len(exact) > 0 {
		return exact, word, false
	}
	if idx == nil {
		return nil, word, false
	}

	var matches []fuzzy.Match
	if srcLang == LangEN {
		matches = idx.SuggestA(word, maxDist, 3)
	} else if srcLang == LangJA {
		matches = idx.SuggestB(word, maxDist, 3)
	}
	if len(matches) == 0 {
		return nil, word, false
	}

	best := matches[0].Word
	if hits := LookupWord(tree, pool, best, srcLang); len(hits) > 0 {
		return hits, best, true
	}
	return nil, word, false
}

// stripTo drops a leading "to " from a JMdict verb gloss ("to eat" → "eat").
// A string that is exactly "to ", or that lacks the prefix, is returned as is.
func stripTo(s string) string {
	const prefix = "to "
	if len(s) <= len(prefix) || s[:len(prefix)] != prefix {
		return s
	}
	return s[len(prefix):]
}

// applyMorphEN renders base with the EN tense/aspect/polarity markers implied
// by a 5-bit MorphState.
//
// The JMdict "to " prefix is stripped first. Bit 4 = past, bit 3 =
// progressive, bit 2 = negative, bit 0 = evidential (adds a leading
// "apparently "). Formality (bit 1) has no EN grammatical effect. A zero state
// returns the bare verb.
func applyMorphEN(base string, state uint8) string {
	verb := stripTo(base) // "to eat" → "eat"
	if state == 0 {
		return verb
	}

	isPast := state&(1<<4) != 0
	isProg := state&(1<<3) != 0
	isNeg := state&(1<<2) != 0

	lead := ""
	if state&1 != 0 {
		lead = "apparently "
	}

	if isProg {
		// Progressive: a form of "be" plus the -ing participle.
		aux := "is "
		switch {
		case isPast && isNeg:
			aux = "wasn't "
		case isPast:
			aux = "was "
		case isNeg:
			aux = "isn't "
		}
		return lead | aux | verb | "ing"
	}

	// Simple aspect: do-support carries tense and polarity.
	switch {
	case isPast && isNeg:
		return lead | "didn't " | verb
	case isPast:
		return lead | "did " | verb
	case isNeg:
		return lead | "don't " | verb
	}
	return lead | verb // polite present, no EN marker
}

// enOperators maps EN function words to the morphstate bits they set. These
// words are not content words: they act as lattice walk operators and are
// consumed without output.
//
// Bit layout: bit 4 = tense (past), bit 3 = aspect (progressive),
// bit 2 = polarity (negative), bit 0 = evidentiality (reported).
var enOperators = map[string]uint8{
	// past
	"did": 1 << 4,

	// negative
	"not":     1 << 2,
	"don't":   1 << 2,
	"doesn't": 1 << 2,

	// past + negative
	"didn't": 1<<4 | 1<<2,

	// past + progressive + negative
	"wasn't":  1<<4 | 1<<3 | 1<<2,
	"weren't": 1<<4 | 1<<3 | 1<<2,

	// evidential
	"apparently": 1 << 0,
	"reportedly": 1 << 0,
	"supposedly": 1 << 0,
	"allegedly":  1 << 0,
}

// enProgressiveAux maps the auxiliaries "is/are/am/was/were" to their tense
// bits (0 for present, bit 4 for past). Combined with a following -ing verb
// they additionally set the aspect bit.
var enProgressiveAux = map[string]uint8{
	// present
	"is":  0,
	"are": 0,
	"am":  0,
	// past
	"was":  1 << 4,
	"were": 1 << 4,
}

// IsJAVerb exports isJAVerb for use by the propagation command.
func IsJAVerb(tree *lattice.Tree, tok string) bool {
	return isJAVerb(tree, tok)
}

// isJAVerb reports whether tok is a verb in the lattice: either its coord=0
// record resolves to the Bverb branch (dictionary form), or it is a conjugated
// form with at least one verbStems candidate that has a Bverb record.
func isJAVerb(tree *lattice.Tree, tok string) bool {
	if jaRecordBranch(tree, tok) == uint8(lattice.Bverb) {
		return true
	}
	stems := verbStems(tok)
	for i := range stems {
		stemKey := MakeKey(LangJA, 0, stems[i])
		if tree.LookupRecIdx(lattice.Bverb, stemKey) != lattice.NullRec {
			return true
		}
	}
	return false
}

// inferMorphState estimates the MorphState of a conjugated JA token from its
// suffix. Used as fallback when the form isn't in the lattice (verbStems path).
//
// Suffixes are tested as raw UTF-8 byte suffixes, longest and most specific
// first, because shorter patterns are suffixes of longer ones ("た" ends
// "ました", "だ" ends "そうだ"). A token matching no pattern is reported as
// plain present affirmative.
func inferMorphState(tok string) uint8 {
	// endsWith is a bounds-safe byte-suffix test.
	endsWith := func(s, suf string) bool {
		return len(s) >= len(suf) && s[len(s)-len(suf):] == suf
	}
	hs := func(suf string) bool { return endsWith(tok, suf) }

	switch {
	// Progressive past
	case hs("ていなかった") || hs("でいなかった"):
		return MorphPastProgNeg
	case hs("ていました") || hs("でいました"):
		return MorphPastProgPolite
	case hs("ていた") || hs("でいた"):
		return MorphPastProgPlain

	// Progressive present
	case hs("ていない") || hs("でいない"):
		return MorphPresProgNeg
	case hs("ています") || hs("でいます"):
		return MorphPresProgPolite
	case hs("ている") || hs("でいる"):
		return MorphPresProgPlain

	// Past
	case hs("ませんでした"):
		return MorphPastNegPolite
	case hs("なかった"):
		return MorphPastNegPlain
	case hs("ました"):
		return MorphPastAffPolite

	// Reported: past when what precedes "そうだ" itself ends in た/だ.
	case hs("そうだ"):
		inner := tok[:len(tok)-len("そうだ")]
		// Use the bounds-safe test: the remainder may be shorter than one
		// kana (1–2 bytes, e.g. an ASCII prefix), and slicing its last three
		// bytes directly would panic.
		if endsWith(inner, "た") || endsWith(inner, "だ") {
			return MorphPastReported
		}
		return MorphPresReported

	case hs("た") || hs("だ"):
		return MorphPastAffPlain

	// Present negative
	case hs("ません"):
		return MorphPresNegPolite
	case hs("ない"):
		return MorphPresNegPlain

	// Present polite
	case hs("ます"):
		return MorphPresAffPolite
	}
	return MorphPresAffPlain
}

// verbStems strips common Japanese conjugation suffixes and returns dictionary-
// form candidates to try against the lattice. Longer suffixes checked first.
// Returns nil if no suffix pattern recognized.
func verbStems(tok string) []string {
	if len(tok) == 0 {
		return nil
	}
	hs := func(suf string) bool {
		return len(tok) > len(suf) && tok[len(tok)-len(suf):] == suf
	}
	st := func(suf string) string {
		return tok[:len(tok)-len(suf)]
	}
	// 9-byte (3-char) patterns
	if hs("ている") {
		s := st("ている")
		return []string{s | "る", s | "く"}
	}
	// 6-byte (2-char) patterns — godan sound changes
	if hs("いた") { return []string{st("いた") | "く"} }
	if hs("いだ") { return []string{st("いだ") | "ぐ"} }
	if hs("した") { s := st("した"); return []string{s | "す", s | "する"} }
	if hs("んだ") { s := st("んだ"); return []string{s | "む", s | "ぬ", s | "ぶ"} }
	if hs("った") { s := st("った"); return []string{s | "つ", s | "う", s | "る"} }
	if hs("いて") { return []string{st("いて") | "く"} }
	if hs("いで") { return []string{st("いで") | "ぐ"} }
	if hs("して") { s := st("して"); return []string{s | "す", s | "する"} }
	if hs("んで") { s := st("んで"); return []string{s | "む", s | "ぬ", s | "ぶ"} }
	if hs("って") { s := st("って"); return []string{s | "つ", s | "う", s | "る"} }
	if hs("ない") { s := st("ない"); return []string{s | "る", s | "う"} }
	// 3-byte (1-char) — ichidan plain past only.
	// bare て is a connective te-form (食べて+いる), NOT a standalone verb form;
	// including it causes the tokenizer to split 食べていた as 食べて+い+た.
	if hs("た") { return []string{st("た") | "る"} }
	return nil
}

// isPureHiragana reports whether s is non-empty and every codepoint in it lies
// in the Hiragana block U+3040–U+309F.
// Particles are always pure hiragana; kanji-containing words are content words.
//
// The check works on raw UTF-8 bytes: each hiragana codepoint is exactly three
// bytes, E3 81 80–BF (U+3040–U+307F) or E3 82 80–9F (U+3080–U+309F). Both
// ends of the third-byte range are verified, so malformed sequences whose
// third byte is not a continuation byte are rejected rather than mistaken for
// hiragana.
func isPureHiragana(s string) bool {
	// Every hiragana codepoint is three bytes, so any other length cannot match.
	if len(s) == 0 || len(s)%3 != 0 {
		return false
	}
	for i := 0; i < len(s); i += 3 {
		lead, mid, tail := s[i], s[i+1], s[i+2]
		if lead != 0xE3 {
			return false
		}
		switch mid {
		case 0x81: // U+3040–U+307F
			if tail < 0x80 || tail > 0xBF {
				return false
			}
		case 0x82: // U+3080–U+309F
			if tail < 0x80 || tail > 0x9F {
				return false
			}
		default:
			return false
		}
	}
	return true
}

// jaFunctionWord is the set of particles, copulae and auxiliaries that act as
// structural fork labels rather than content. It includes entries removed from
// the lattice by the IsFunction() filter at ingest (prt/cop/aux POS codes).
var jaFunctionWord = map[string]bool{
	// copulae
	"だ":   true,
	"です":  true,
	"でした": true,

	// negative auxiliaries
	"ない": true,
	"ぬ":  true,
	"ん":  true,

	// polite auxiliaries
	"ます":  true,
	"ません": true,
	"ました": true,

	// particles (no longer in lattice — removed by IsFunction filter)
	"は":  true,
	"が":  true,
	"を":  true,
	"に":  true,
	"で":  true,
	"と":  true,
	"も":  true,
	"や":  true,
	"か":  true,
	"の":  true,
	"から": true,
	"まで": true,
	"より": true,
	"など": true,
	"ね":  true,
	"よ":  true,
	"さ":  true,
	"な":  true,
	"わ":  true,
	"ぞ":  true,
	"ぜ":  true,
	"て":  true,
	"た":  true,
}

// VerbLemma returns a single-step approximation of the dictionary form of a
// JA verb surface form: the first candidate produced by verbStems, or form
// itself when no suffix pattern is recognized. Used for morph-stats grouping.
func VerbLemma(form string) string {
	if candidates := verbStems(form); len(candidates) != 0 {
		return candidates[0]
	}
	return form
}

// appendUniq appends v to s unless s already contains it, and returns the
// resulting slice. Existing order is preserved.
func appendUniq(s []string, v string) []string {
	for i := 0; i < len(s); i++ {
		if s[i] == v {
			return s
		}
	}
	return append(s, v)
}