dict.mx raw
1 package iskra
2
// Token classification codes stored in DictEntry.Class and Token.Class.
const (
	TokUnknown    uint8 = iota // unclassified
	TokNoun                    // word not followed by ( - identifiers, keywords, types
	TokVerb                    // word followed by ( - function/method calls
	TokModifier                // non-alphanumeric: operators, punctuation, whitespace, metadata
	TokStructural              // unused - kept for mesh backward compat
	TokLiteral                 // numbers, string literals
)
11
// DictEntry describes one interned token: where its bytes live in the shared
// pool, its classification, and how many times Add has seen it.
type DictEntry struct {
	Offset uint32 // byte offset of the token's text within Dict.Pool
	Len    uint16 // length of the token's text in bytes
	Class  uint8  // one of the Tok* classification constants
	_pad   uint8  // explicit padding; keeps Count 4-byte aligned
	Count  uint32 // occurrence count, incremented on each Add of this token
}
19
// Dict interns token byte strings. Token text is stored back-to-back in Pool
// and described by Entries; Index maps a token's exact text to its position
// in Entries for O(1) lookup on Add.
type Dict struct {
	Pool    []byte
	Entries []DictEntry
	Index   map[string]uint32
}
25
26 func NewDict() *Dict {
27 return &Dict{
28 Pool: []byte{:0:4096},
29 Entries: []DictEntry{:0:1024},
30 Index: map[string]uint32{},
31 }
32 }
33
34 func (d *Dict) Add(token []byte, class uint8) uint32 {
35 s := string(token)
36 if idx, ok := d.Index[s]; ok {
37 d.Entries[idx].Count++
38 return idx
39 }
40 idx := uint32(len(d.Entries))
41 off := uint32(len(d.Pool))
42 d.Pool = append(d.Pool, token...)
43 d.Entries = append(d.Entries, DictEntry{
44 Offset: off,
45 Len: uint16(len(token)),
46 Class: class,
47 Count: 1,
48 })
49 d.Index[s] = idx
50 return idx
51 }
52
53 func (d *Dict) Get(idx uint32) []byte {
54 e := &d.Entries[idx]
55 return d.Pool[e.Offset : e.Offset+uint32(e.Len)]
56 }
57
58 func (d *Dict) EntryCount() int { return len(d.Entries) }
59 func (d *Dict) PoolSize() int { return len(d.Pool) }
60
61 func (d *Dict) SortByFrequency(tokenPool []uint32) []uint32 {
62 n := len(d.Entries)
63 if n == 0 {
64 return tokenPool
65 }
66
67 order := []int32{:n}
68 for i := range order {
69 order[i] = int32(i)
70 }
71 sortByCount(order, d.Entries)
72
73 oldToNew := []uint32{:n}
74 newEntries := []DictEntry{:n}
75 for newIdx, oldIdx := range order {
76 oldToNew[oldIdx] = uint32(newIdx)
77 newEntries[newIdx] = d.Entries[oldIdx]
78 }
79 d.Entries = newEntries
80
81 newIndex := map[string]uint32{}
82 for s, oldIdx := range d.Index {
83 newIndex[s] = oldToNew[oldIdx]
84 }
85 d.Index = newIndex
86
87 for i := range tokenPool {
88 tokenPool[i] = oldToNew[tokenPool[i]]
89 }
90 return tokenPool
91 }
92
93 func sortByCount(order []int32, entries []DictEntry) {
94 if len(order) <= 1 {
95 return
96 }
97 pivot := entries[order[len(order)/2]].Count
98 i, j := 0, len(order)-1
99 for i <= j {
100 for entries[order[i]].Count > pivot {
101 i++
102 }
103 for entries[order[j]].Count < pivot {
104 j--
105 }
106 if i <= j {
107 order[i], order[j] = order[j], order[i]
108 i++
109 j--
110 }
111 }
112 if j > 0 {
113 sortByCount(order[:j+1], entries)
114 }
115 if i < len(order)-1 {
116 sortByCount(order[i:], entries)
117 }
118 }
119
120 func (d *Dict) Decode(seq []uint32) []byte {
121 n := 0
122 for _, idx := range seq {
123 n += int(d.Entries[idx].Len)
124 }
125 out := []byte{:0:n}
126 for _, idx := range seq {
127 e := &d.Entries[idx]
128 out = append(out, d.Pool[e.Offset:e.Offset+uint32(e.Len)]...)
129 }
130 return out
131 }
132
133 func Tokenize(data []byte) []Token {
134 tokens := []Token{:0:len(data) / 4}
135 i := 0
136 for i < len(data) {
137 start := i
138 ch := data[i]
139 class := TokModifier
140
141 switch {
142 case ch == '"':
143 i = scanQuotedStr(data, i)
144 class = TokLiteral
145 case ch == '`':
146 i = scanRawStr(data, i)
147 class = TokLiteral
148 case ch == '@' && i+1 < len(data) && data[i+1] == '"':
149 i = scanQuotedStr(data, i+1)
150 class = classifyByParen(data, i)
151 case ch == '@' && i+1 < len(data) && isWordByte(data[i+1]):
152 i++
153 i = scanWordTok(data, i)
154 class = classifyByParen(data, i)
155 case ch == '%' && i+1 < len(data) && (isWordByte(data[i+1]) || isDigitByte(data[i+1])):
156 i++
157 i = scanWordTok(data, i)
158 class = TokNoun
159 case ch == '!' && i+1 < len(data) && (isWordByte(data[i+1]) || isDigitByte(data[i+1])):
160 i++
161 i = scanWordTok(data, i)
162 class = TokModifier
163 case ch == '#' && i+1 < len(data) && isDigitByte(data[i+1]):
164 i++
165 for i < len(data) && isDigitByte(data[i]) {
166 i++
167 }
168 class = TokModifier
169 case ch == ';':
170 for i < len(data) && data[i] != '\n' {
171 i++
172 }
173 class = TokModifier
174 case ch == '/' && i+1 < len(data) && data[i+1] == '/':
175 for i < len(data) && data[i] != '\n' {
176 i++
177 }
178 class = TokModifier
179 case ch == '#' && i+1 < len(data) && isWordByte(data[i+1]):
180 i++
181 i = scanWordTok(data, i)
182 class = TokModifier
183 case isWordStartByte(ch):
184 i = scanWordTok(data, i)
185 class = classifyByParen(data, i)
186 case isDigitByte(ch):
187 i = scanNumberTok(data, i)
188 class = TokLiteral
189 case ch == '-' && i+1 < len(data) && isDigitByte(data[i+1]):
190 i++
191 i = scanNumberTok(data, i)
192 class = TokLiteral
193 default:
194 if ch == '\n' {
195 i++
196 } else if ch == ' ' || ch == '\t' {
197 for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
198 i++
199 }
200 } else {
201 i++
202 }
203 }
204 tokens = append(tokens, Token{
205 Text: data[start:i],
206 Class: class,
207 })
208 }
209 return tokens
210 }
211
212 func classifyByParen(data []byte, pos int) uint8 {
213 if pos < len(data) && data[pos] == '(' {
214 return TokVerb
215 }
216 return TokNoun
217 }
218
// Token is one lexed span of input: its raw bytes plus a Tok* class.
type Token struct {
	Text  []byte // subslice of the input passed to Tokenize; not a copy
	Class uint8  // one of the Tok* classification constants
}
223
// scanQuotedStr scans a quoted string starting at i (data[i] is the opening
// quote, of any kind) and returns the index just past the closing quote.
// Backslash escapes the next byte. An unterminated string consumes the rest
// of the input and returns len(data).
func scanQuotedStr(data []byte, i int) int {
	quote := data[i]
	pos := i + 1
	for pos < len(data) {
		switch {
		case data[pos] == '\\' && pos+1 < len(data):
			pos += 2 // skip the escaped byte
		case data[pos] == quote:
			return pos + 1
		default:
			pos++
		}
	}
	return pos
}
240
// scanRawStr scans a backtick-delimited raw string starting at i (data[i] is
// the opening backtick) and returns the index just past the closing backtick.
// No escapes are recognized; an unterminated string returns len(data).
func scanRawStr(data []byte, i int) int {
	for j := i + 1; j < len(data); j++ {
		if data[j] == '`' {
			return j + 1
		}
	}
	return len(data)
}
252
253 func scanWordTok(data []byte, i int) int {
254 for i < len(data) && isWordContByte(data[i]) {
255 i++
256 }
257 return i
258 }
259
260 func scanNumberTok(data []byte, i int) int {
261 if i+1 < len(data) && data[i] == '0' && (data[i+1] == 'x' || data[i+1] == 'X') {
262 i += 2
263 for i < len(data) && isHexByte(data[i]) {
264 i++
265 }
266 return i
267 }
268 for i < len(data) && (isDigitByte(data[i]) || data[i] == '.') {
269 i++
270 }
271 return i
272 }
273
// isWordStartByte reports whether c may begin a word: an ASCII letter or '_'.
func isWordStartByte(c byte) bool {
	switch {
	case c == '_':
		return true
	case c >= 'a' && c <= 'z':
		return true
	case c >= 'A' && c <= 'Z':
		return true
	}
	return false
}
277
278 func isWordByte(c byte) bool {
279 return isWordStartByte(c) || isDigitByte(c)
280 }
281
282 func isWordContByte(c byte) bool {
283 return isWordByte(c) || c == '.'
284 }
285
// isDigitByte reports whether c is an ASCII decimal digit.
// c-'0' wraps around for c < '0' (byte arithmetic is unsigned), so a single
// unsigned comparison covers both bounds.
func isDigitByte(c byte) bool {
	return c-'0' < 10
}
289
290 func isHexByte(c byte) bool {
291 return isDigitByte(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
292 }
293
294