package transdb

import (
	"bytes"

	"git.smesh.lol/iskradb/lattice"
)

// TokenizeEN splits English text into lowercase tokens.
//
// A token is a maximal run of ASCII letters and digits; every other byte
// (whitespace, punctuation, and all non-ASCII bytes) acts as a separator.
// A possessive "'s" that directly follows a token and is not itself followed
// by another letter or digit is dropped, so "John's book" yields
// ["john", "book"]. Both the ASCII apostrophe (') and the typographic
// apostrophe U+2019 are recognised. Other apostrophe uses are plain
// separators: "don't" yields ["don", "t"] and "dogs'" yields ["dogs"].
//
// The result is nil when text contains no tokens.
func TokenizeEN(text string) []string {
	isAlnum := func(c byte) bool {
		return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')
	}

	var tokens []string
	b := []byte(text)
	start := -1 // byte offset where the current token began, or -1 if none is open
	for i := 0; i <= len(b); i++ {
		// One extra iteration past the end sees a zero byte, which is not
		// alphanumeric and therefore flushes a token that runs to the end.
		var c byte
		if i < len(b) {
			c = b[i]
		}
		if isAlnum(c) {
			if start < 0 {
				start = i
			}
			continue
		}
		if start < 0 {
			continue
		}

		tokens = append(tokens, string(bytes.ToLower(b[start:i])))
		start = -1

		// Possessive stripping has to happen here, on the separator that
		// closed the token: an apostrophe is never part of a token, so
		// inspecting the token's own bytes for "'s" can never succeed.
		apos := 0 // encoded length of the apostrophe at b[i], 0 if none
		switch {
		case c == '\'':
			apos = 1
		case c == 0xE2 && i+2 < len(b) && b[i+1] == 0x80 && b[i+2] == 0x99: // U+2019
			apos = 3
		}
		if apos == 0 {
			continue
		}
		j := i + apos
		if j < len(b) && (b[j] == 's' || b[j] == 'S') && (j+1 == len(b) || !isAlnum(b[j+1])) {
			// Park i on the 's'; the loop increment then steps past it so it
			// is never emitted as a token of its own.
			i = j
		}
	}
	return tokens
}

// TokenizeJA segments Japanese text using forward maximum-match against the
// translation lattice. Branch search order adapts to syntactic context so
// particles (Bmodifier) are preferred over noun readings in particle positions.
//
// The text is consumed left to right by repeated calls to maxMatchJA. Segments
// that come back with an empty form (punctuation and other pass-through
// characters) are skipped without being emitted and without changing the
// part-of-speech context. When verbose is true, every emitted token is also
// printed with the println builtin.
func TokenizeJA(text string, tree *lattice.Tree, verbose bool) []string {
	var tokens []string
	var prevPOS uint8 // POS context handed to the next match; 0 = unknown
	for len(text) > 0 {
		tok, consumed, branch := maxMatchJA(text, tree, prevPOS)
		if tok != "" {
			tokens = append(tokens, tok)
			if branch < 3 {
				prevPOS = branch + 1 // 1=noun, 2=verb, 3=modifier
			} else {
				// Any other branch value, including the 255 "no branch"
				// marker, resets the context to unknown.
				prevPOS = 0
			}
			if verbose {
				println("segment:", tok)
			}
		}
		// NOTE(review): this relies on maxMatchJA always consuming at least
		// one byte; a zero advance would loop forever. Confirm decodeRune
		// never reports a size of 0.
		text = text[consumed:]
	}
	return tokens
}

// maxMatchJA tries to match the longest prefix of text against the JA lattice.
// prevPOS carries the POS of the preceding content token (1=noun, 2=verb,
// 3=modifier, 0=unknown) for context-guided branch ordering.
//
// Returns (matched_form, bytes_consumed, branch):
//   - pass-through characters return an empty form, the size of the first
//     codepoint, and branch 255;
//   - a base-form (coord=0) hit returns the prefix and the branch it was
//     found on;
//   - a hit via the morph coord or via a de-stemmed verb returns the prefix
//     with branch 255;
//   - if nothing matches, the first codepoint is returned as an opaque token
//     with branch 255.
//
// text must be non-empty: text[0] is read unconditionally.
func maxMatchJA(text string, tree *lattice.Tree, prevPOS uint8) (string, int, uint8) {
	_, r0size := decodeRune(text, 0)

	// Fast path: characters that are never looked up pass through silently.

	// CJK symbols and punctuation U+3000-U+303F (。、「」〜 etc.) encode as
	// E3 80 80..BF.
	if len(text) >= 3 && text[0] == 0xE3 && text[1] == 0x80 && text[2] <= 0xBF {
		return "", r0size, 255
	}
	// ー (U+30FC, E3 83 BC) is deliberately NOT skipped here: it is handled by
	// reading aliases inside compounds, so it must stay eligible for the
	// longest-match search below.

	// Every ASCII byte (< 0x80) is skipped, not only punctuation and spaces:
	// ASCII letters and digits pass through silently as well.
	if text[0] < 0x80 {
		return "", r0size, 255
	}

	// Full-width forms encoded as EF BC xx, i.e. U+FF00-U+FF3F. Despite being
	// described as punctuation, this range also contains the full-width
	// digits and upper-case Latin letters. U+FF40 and above (EF BD.., EF BE..)
	// are not skipped.
	if len(text) >= 3 && text[0] == 0xEF && text[1] == 0xBC {
		return "", r0size, 255
	}

	// Build codepoint boundary offsets: offsets[k] is the byte offset at which
	// codepoint k starts, followed by a final len(text) entry, so
	// text[:offsets[k]] is the prefix made of the first k codepoints.
	// NOTE(review): `[]int{:0:32}` is not standard Go syntax; presumably a
	// dialect literal for an empty slice with capacity 32. It is kept
	// byte-for-byte; confirm against the project's toolchain.
	offsets := []int{:0:32}
	i := 0
	for i < len(text) {
		offsets = append(offsets, i)
		_, size := decodeRune(text, i)
		i += size
	}
	offsets = append(offsets, len(text))

	// Branch order: noun-preceding context → try Bmodifier first.
	// prevPOS from POSForWord returns 1/2/3 matching CooccurNominal/Verbal/Function
	coord := PackCoord(0, 0, CoordCooccur(prevPOS, 0), 0, 0, 0, 0)
	order := branchOrderJA(coord)

	// Try from longest to shortest prefix. At each length, check:
	//  1. coord=0 (base form, most common)
	//  2. morph coord (conjugated form — must check at the SAME length before
	//     shortening, or else "開け"(noun) beats "開けた"(past of 開ける))
	//  3. verbStem de-stem (for forms not yet in lattice)
	//
	// NOTE(review): the loop stops at end == 2, so single-codepoint prefixes
	// are never looked up in the lattice; they fall through to the opaque
	// fallback below with branch 255. Confirm this is intentional.
	for end := len(offsets) - 1; end >= 2; end-- {
		prefix := text[:offsets[end]]

		// coord=0 lookup, in context-dependent branch order.
		key0 := MakeKey(LangJA, 0, prefix)
		for _, b := range order {
			if tree.LookupRecIdx(lattice.Branch(b), key0) != lattice.NullRec {
				return prefix, offsets[end], b
			}
		}

		// Morph coord lookup — only for ≥3 codepoints (conjugated forms).
		if end >= 3 {
			ms := inferMorphState(prefix)
			if ms != 0 {
				coordM := PackCoord(0, 0, 0, uint64(ms), 0, 0, 0)
				keyM := MakeKey(LangJA, coordM, prefix)
				for _, b := range ActiveBranches {
					if tree.LookupRecIdx(b, keyM) != lattice.NullRec {
						return prefix, offsets[end], 255
					}
				}
			}

			// verbStem de-stem for forms not yet at morph coord.
			for _, stem := range verbStems(prefix) {
				keyS := MakeKey(LangJA, 0, stem)
				for _, b := range ActiveBranches {
					if tree.LookupRecIdx(b, keyS) != lattice.NullRec {
						return prefix, offsets[end], 255
					}
				}
			}
		}
	}

	// No match - consume first codepoint as an opaque token.
	return text[:r0size], r0size, 255
}