tokenize.mx raw
1 package transdb
2
3 import (
4 "bytes"
5
6 "git.smesh.lol/iskradb/lattice"
7 )
8
// TokenizeEN splits English text into lowercase tokens.
//
// A token is a maximal run of ASCII letters and digits. Every other byte
// (whitespace, punctuation, any non-ASCII byte) acts as a separator and is
// dropped. A possessive 's that directly follows a token and is itself
// followed by a separator or the end of input ("dog's", "JOHN'S") is
// dropped rather than emitted as a stray "s" token. A plural possessive
// ("dogs'") needs no special handling: the apostrophe is already a separator.
//
// Only the ASCII apostrophe (U+0027) is recognised. Returns nil when text
// contains no tokens.
func TokenizeEN(text string) []string {
	var tokens []string
	b := []byte(text)
	start := -1 // index of the first byte of the token in progress, or -1
	// Iterate one position past the end so a token that runs to the end of
	// the input is flushed by the same code path as any other token.
	for i := 0; i <= len(b); i++ {
		if i < len(b) && isASCIIAlnum(b[i]) {
			if start < 0 {
				start = i
			}
			continue
		}
		if start < 0 {
			continue
		}
		tokens = append(tokens, string(bytes.ToLower(b[start:i])))
		start = -1
		// The apostrophe can never be part of a token, so the possessive has
		// to be detected here, at the separator that ended the token.
		if i+1 < len(b) && b[i] == '\'' && (b[i+1] == 's' || b[i+1] == 'S') &&
			(i+2 == len(b) || !isASCIIAlnum(b[i+2])) {
			i++ // step onto the 's'; the loop increment then moves past it
		}
	}
	return tokens
}

// isASCIIAlnum reports whether c is an ASCII letter or digit.
func isASCIIAlnum(c byte) bool {
	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')
}
39
40 // TokenizeJA segments Japanese text using forward maximum-match against the
41 // translation lattice. Branch search order adapts to syntactic context so
42 // particles (Bmodifier) are preferred over noun readings in particle positions.
43 func TokenizeJA(text string, tree *lattice.Tree, verbose bool) []string {
44 var tokens []string
45 var prevPOS uint8
46 for len(text) > 0 {
47 tok, consumed, branch := maxMatchJA(text, tree, prevPOS)
48 if tok != "" {
49 tokens = append(tokens, tok)
50 if branch < 3 {
51 prevPOS = branch + 1 // 1=noun, 2=verb, 3=modifier
52 } else {
53 prevPOS = 0
54 }
55 }
56 text = text[consumed:]
57 if verbose && tok != "" {
58 println("segment:", tok)
59 }
60 }
61 return tokens
62 }
63
64 // maxMatchJA tries to match the longest prefix of text against the JA lattice.
65 // prevPOS carries the POS of the preceding content token (1=noun, 2=verb,
66 // 3=modifier, 0=unknown) for context-guided branch ordering.
67 // Returns (matched_form, bytes_consumed, branch). Branch=255 means no match.
68 // Punctuation and CJK symbols pass through silently (empty form, 1 codepoint consumed).
69 func maxMatchJA(text string, tree *lattice.Tree, prevPOS uint8) (string, int, uint8) {
70 _, r0size := decodeRune(text, 0)
71
72 // Fast path: punctuation and non-word characters pass through silently.
73 // CJK punctuation: U+3000-U+303F (。、・「」〜 etc.)
74 // Full-width: U+FF00-U+FFEF
75 // ASCII: < 0x80
76 if len(text) >= 3 && text[0] == 0xE3 {
77 b1 := text[1]
78 if b1 == 0x80 && text[2] <= 0xBF { // U+3000-U+303F
79 return "", r0size, 255
80 }
81 if b1 == 0x83 && text[2] == 0xBC { // ー U+30FC alone (handled by reading aliases in compounds)
82 // only skip if no longer match found below
83 }
84 }
85 if text[0] < 0x80 { // ASCII punctuation/spaces
86 return "", r0size, 255
87 }
88 // Full-width punctuation U+FF00-U+FFEF
89 if len(text) >= 3 && text[0] == 0xEF && text[1] == 0xBC {
90 return "", r0size, 255
91 }
92
93 // Build codepoint boundary offsets.
94 offsets := []int{:0:32}
95 i := 0
96 for i < len(text) {
97 offsets = append(offsets, i)
98 _, size := decodeRune(text, i)
99 i += size
100 }
101 offsets = append(offsets, len(text))
102
103 // Branch order: noun-preceding context → try Bmodifier first.
104 // prevPOS from POSForWord returns 1/2/3 matching CooccurNominal/Verbal/Function
105 coord := PackCoord(0, 0, CoordCooccur(prevPOS, 0), 0, 0, 0, 0)
106 order := branchOrderJA(coord)
107
108 // Try from longest to shortest prefix. At each length, check:
109 // 1. coord=0 (base form, most common)
110 // 2. morph coord (conjugated form — must check at the SAME length before
111 // shortening, or else "開け"(noun) beats "開けた"(past of 開ける))
112 // 3. verbStem de-stem (for forms not yet in lattice)
113 for end := len(offsets) - 1; end >= 2; end-- {
114 prefix := text[:offsets[end]]
115
116 // coord=0 lookup.
117 key0 := MakeKey(LangJA, 0, prefix)
118 for _, b := range order {
119 if tree.LookupRecIdx(lattice.Branch(b), key0) != lattice.NullRec {
120 return prefix, offsets[end], b
121 }
122 }
123
124 // Morph coord lookup — only for ≥3 codepoints (conjugated forms).
125 if end >= 3 {
126 ms := inferMorphState(prefix)
127 if ms != 0 {
128 coordM := PackCoord(0, 0, 0, uint64(ms), 0, 0, 0)
129 keyM := MakeKey(LangJA, coordM, prefix)
130 for _, b := range ActiveBranches {
131 if tree.LookupRecIdx(b, keyM) != lattice.NullRec {
132 return prefix, offsets[end], 255
133 }
134 }
135 }
136 // verbStem de-stem for forms not yet at morph coord.
137 for _, stem := range verbStems(prefix) {
138 keyS := MakeKey(LangJA, 0, stem)
139 for _, b := range ActiveBranches {
140 if tree.LookupRecIdx(b, keyS) != lattice.NullRec {
141 return prefix, offsets[end], 255
142 }
143 }
144 }
145 }
146 }
147
148 // No match - consume first codepoint as an opaque token.
149 return text[:r0size], r0size, 255
150 }
151