package transdb

import (
	"bytes"

	"git.smesh.lol/iskradb/lattice"
)

// TokenizeEN splits English text into lowercase tokens made of ASCII letters
// and digits. Every other byte (whitespace, punctuation, and any non-ASCII
// byte) acts as a separator.
//
// A possessive 's that directly follows a token is dropped, so "John's"
// yields "john". A bare trailing apostrophe ("dogs'") is removed by the split
// itself. Other contractions are split as usual: "don't" yields "don", "t".
//
// Returns nil when text contains no tokens.
func TokenizeEN(text string) []string {
	isAlnum := func(c byte) bool {
		return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')
	}

	var tokens []string
	b := []byte(text)
	start := -1 // byte offset of the token in progress, or -1 between tokens

	// Run one step past the end so a token touching the end of text is flushed.
	for i := 0; i <= len(b); i++ {
		if i < len(b) && isAlnum(b[i]) {
			if start < 0 {
				start = i
			}
			continue
		}
		if start < 0 {
			continue
		}
		tokens = append(tokens, string(bytes.ToLower(b[start:i])))
		start = -1

		// The apostrophe is a separator, so it never appears inside a token;
		// the possessive has to be recognised here by looking ahead. Skip an
		// 's only when it is a complete suffix (followed by a separator or the
		// end of text), so that "it'ss" still yields "it", "ss".
		if i+1 < len(b) && b[i] == '\'' && (b[i+1] == 's' || b[i+1] == 'S') &&
			(i+2 == len(b) || !isAlnum(b[i+2])) {
			i++ // step onto the 's'; the loop increment then moves past it
		}
	}
	return tokens
}

// TokenizeJA segments Japanese text by repeatedly taking the match that
// maxMatchJA returns for the remaining text (forward maximum-match against
// the translation lattice).
//
// The branch of each emitted token is fed back into the next call as a POS
// hint (1=noun, 2=verb, 3=modifier, 0=unknown) so that maxMatchJA can order
// its branch search by context. Pieces that maxMatchJA reports with an empty
// form are consumed without emitting a token and leave the hint untouched.
//
// When verbose is true every emitted token is printed with println.
func TokenizeJA(text string, tree *lattice.Tree, verbose bool) []string {
	var segments []string
	var lastPOS uint8
	for len(text) != 0 {
		form, n, br := maxMatchJA(text, tree, lastPOS)
		text = text[n:]
		if form == "" {
			// Silently skipped input: nothing to emit, keep the current hint.
			continue
		}
		segments = append(segments, form)

		// Branches 0..2 map to hints 1..3; anything else means "unknown".
		lastPOS = 0
		if br < 3 {
			lastPOS = br + 1
		}

		if verbose {
			println("segment:", form)
		}
	}
	return segments
}

// maxMatchJA tries to match the longest prefix of text against the JA lattice.
//
// prevPOS carries the POS of the preceding content token (1=noun, 2=verb,
// 3=modifier, 0=unknown). It is packed into a coordinate that selects the
// branch search order through branchOrderJA.
//
// Returns (matched_form, bytes_consumed, branch):
//   - hit at coord 0: the prefix, its byte length, and the branch that hit;
//   - hit via a morph coordinate or a de-stemmed verb: the prefix, its byte
//     length, and branch 255;
//   - skipped character (any ASCII byte, U+3000-U+303F, or a 0xEF 0xBC lead,
//     i.e. U+FF00-U+FF3F): empty form, one codepoint consumed, branch 255;
//   - no match: the first codepoint as an opaque token, branch 255;
//   - empty text: ("", 0, 255).
func maxMatchJA(text string, tree *lattice.Tree, prevPOS uint8) (string, int, uint8) {
	// Nothing to match; also keeps the text[0] accesses below in range.
	if len(text) == 0 {
		return "", 0, 255
	}

	_, r0size := decodeRune(text, 0)

	// Fast path: characters that are never looked up pass through silently.
	// CJK symbols and punctuation U+3000-U+303F (。、「」〜 etc.) encode as
	// 0xE3 0x80 0x80..0xBF.
	// U+30FC (ー, 0xE3 0x83 0xBC) is not skipped here: it goes through the
	// lattice lookup below like any other character.
	if len(text) >= 3 && text[0] == 0xE3 && text[1] == 0x80 && text[2] <= 0xBF {
		return "", r0size, 255
	}
	// Every ASCII byte is skipped, letters and digits included.
	if text[0] < 0x80 {
		return "", r0size, 255
	}
	// Full-width forms with a 0xEF 0xBC lead: U+FF00-U+FF3F only. The rest of
	// the block (U+FF40-U+FFEF, which includes half-width katakana) is not
	// skipped and reaches the lattice lookup.
	if len(text) >= 3 && text[0] == 0xEF && text[1] == 0xBC {
		return "", r0size, 255
	}

	// Build codepoint boundary offsets: offsets[k] is the byte offset reached
	// after k codepoints, and the last entry is len(text).
	offsets := []int{:0:32}
	i := 0
	for i < len(text) {
		offsets = append(offsets, i)
		_, size := decodeRune(text, i)
		i += size
	}
	offsets = append(offsets, len(text))

	// Branch order depends on the preceding token's POS, which is packed into
	// the co-occurrence slot of the coordinate handed to branchOrderJA.
	coord := PackCoord(0, 0, CoordCooccur(prevPOS, 0), 0, 0, 0, 0)
	order := branchOrderJA(coord)

	// Try from longest to shortest prefix. At each length, check:
	// 1. coord=0 (base form, most common)
	// 2. morph coord (conjugated form — must check at the SAME length before
	//    shortening, or else "開け"(noun) beats "開けた"(past of 開ける))
	// 3. verbStem de-stem (for forms not yet in lattice)
	//
	// NOTE(review): the loop stops at end == 2, so a single-codepoint prefix is
	// never looked up and always falls through to the opaque-token return with
	// branch 255. Confirm this is intended.
	for end := len(offsets) - 1; end >= 2; end-- {
		prefix := text[:offsets[end]]

		// coord=0 lookup, in context-dependent branch order.
		key0 := MakeKey(LangJA, 0, prefix)
		for _, b := range order {
			if tree.LookupRecIdx(lattice.Branch(b), key0) != lattice.NullRec {
				return prefix, offsets[end], b
			}
		}

		// Morph coord lookup — only for ≥3 codepoints (conjugated forms).
		if end >= 3 {
			ms := inferMorphState(prefix)
			if ms != 0 {
				coordM := PackCoord(0, 0, 0, uint64(ms), 0, 0, 0)
				keyM := MakeKey(LangJA, coordM, prefix)
				for _, b := range ActiveBranches {
					if tree.LookupRecIdx(b, keyM) != lattice.NullRec {
						// The hit branch is not reported for morph matches.
						return prefix, offsets[end], 255
					}
				}
			}
			// verbStem de-stem for forms not yet at morph coord. The whole
			// prefix is returned, not the stem that was found.
			for _, stem := range verbStems(prefix) {
				keyS := MakeKey(LangJA, 0, stem)
				for _, b := range ActiveBranches {
					if tree.LookupRecIdx(b, keyS) != lattice.NullRec {
						return prefix, offsets[end], 255
					}
				}
			}
		}
	}

	// No match - consume first codepoint as an opaque token.
	return text[:r0size], r0size, 255
}