// Package iskra provides a byte-level tokenizer (Tokenize) and a token
// dictionary (Dict) that stores each distinct token once in a shared byte pool
// and counts how often it was added.
//
// NOTE(review): the slice constructors written as `[]T{:len}` and
// `[]T{:len:cap}` are not standard Go. They are read throughout this file as
// "slice of T with the given length (and capacity)" because the results are
// indexed up to len-1 or appended to - confirm against the toolchain in use.
package iskra

// Token classes. Tokenize in this file only ever produces TokNoun, TokVerb,
// TokModifier and TokLiteral.
const (
	TokUnknown    uint8 = 0
	TokNoun       uint8 = 1 // word not followed by '(' - identifiers, keywords, types
	TokVerb       uint8 = 2 // word followed by '(' - function/method calls
	TokModifier   uint8 = 3 // non-alphanumeric: operators, punctuation, whitespace, metadata
	TokStructural uint8 = 4 // unused - kept for mesh backward compat
	TokLiteral    uint8 = 5 // numbers, string literals
)

// DictEntry describes one distinct token held by a Dict.
type DictEntry struct {
	Offset uint32 // index of the token's first byte in Dict.Pool
	Len    uint16 // token length in bytes, stored by Add as uint16(len(token))
	Class  uint8  // class passed to Add the first time the token was seen
	_pad   uint8  // never read or written in this file; NOTE(review): presumably explicit padding for a fixed layout - confirm
	Count  uint32 // number of Add calls made with this token
}

// Dict interns token byte strings and assigns each distinct token a uint32
// index.
type Dict struct {
	Pool    []byte            // bytes of every distinct token, concatenated in first-seen order
	Entries []DictEntry       // one entry per distinct token; a token's index is its position here
	Index   map[string]uint32 // token text -> position in Entries
}

// NewDict returns an empty Dict with an empty Index map and Pool/Entries
// created with length 0 and capacities 4096 and 1024 respectively.
func NewDict() *Dict {
	return &Dict{
		Pool:    []byte{:0:4096},
		Entries: []DictEntry{:0:1024},
		Index:   map[string]uint32{},
	}
}

// Add records one occurrence of token and returns its index in d.Entries.
//
// If the token is already present, its Count is incremented and the existing
// index is returned; class is ignored in that case, so an entry keeps the class
// from its first Add. Otherwise the token's bytes are appended to d.Pool and a
// new entry with Count 1 is appended to d.Entries.
//
// NOTE(review): the length is stored as uint16(len(token)). Under Go conversion
// rules a token longer than 65535 bytes would be recorded with a wrapped
// length, and Get/Decode would then return only part of it. Nothing in this
// file bounds the token length - confirm callers never pass such tokens.
func (d *Dict) Add(token []byte, class uint8) uint32 {
	s := string(token)
	if idx, ok := d.Index[s]; ok {
		d.Entries[idx].Count++
		return idx
	}
	// New token: its index is the next free slot and its bytes start at the
	// current end of the pool.
	idx := uint32(len(d.Entries))
	off := uint32(len(d.Pool))
	d.Pool = append(d.Pool, token...)
	d.Entries = append(d.Entries, DictEntry{
		Offset: off,
		Len:    uint16(len(token)),
		Class:  class,
		Count:  1,
	})
	d.Index[s] = idx
	return idx
}

// Get returns the bytes of the token at index idx. The result is a sub-slice
// of d.Pool, not a copy. idx is not range-checked here.
func (d *Dict) Get(idx uint32) []byte {
	e := &d.Entries[idx]
	return d.Pool[e.Offset : e.Offset+uint32(e.Len)]
}

// EntryCount returns the number of distinct tokens in the dictionary.
func (d *Dict) EntryCount() int { return len(d.Entries) }

// PoolSize returns the number of bytes currently stored in d.Pool.
func (d *Dict) PoolSize() int { return len(d.Pool) }

// SortByFrequency renumbers the dictionary so that d.Entries is ordered by
// descending Count, and rewrites every index in tokenPool to the new
// numbering.
//
// d.Entries and d.Index are replaced with renumbered copies; d.Pool is left
// untouched, so entry offsets stay valid. tokenPool is modified in place and
// also returned. When the dictionary is empty, tokenPool is returned unchanged.
// Every value in tokenPool must be a valid index into the old d.Entries; it is
// not range-checked. The relative order of entries with equal Count is
// whatever sortByCount produces.
func (d *Dict) SortByFrequency(tokenPool []uint32) []uint32 {
	n := len(d.Entries)
	if n == 0 {
		return tokenPool
	}
	// order starts as the identity permutation of old indices and is then
	// sorted so that order[newIdx] == oldIdx.
	order := []int32{:n}
	for i := range order {
		order[i] = int32(i)
	}
	sortByCount(order, d.Entries)
	// Build the inverse permutation (old index -> new index) and the
	// reordered entry table in one pass.
	oldToNew := []uint32{:n}
	newEntries := []DictEntry{:n}
	for newIdx, oldIdx := range order {
		oldToNew[oldIdx] = uint32(newIdx)
		newEntries[newIdx] = d.Entries[oldIdx]
	}
	d.Entries = newEntries
	// Rebuild the text -> index map with the new numbering.
	newIndex := map[string]uint32{}
	for s, oldIdx := range d.Index {
		newIndex[s] = oldToNew[oldIdx]
	}
	d.Index = newIndex
	// Remap the caller's token sequence in place.
	for i := range tokenPool {
		tokenPool[i] = oldToNew[tokenPool[i]]
	}
	return tokenPool
}

// sortByCount sorts order in place so that entries[order[k]].Count is
// non-increasing in k. It is a recursive quicksort that partitions around the
// Count of the middle element; entries itself is only read.
func sortByCount(order []int32, entries []DictEntry) {
	if len(order) <= 1 {
		return
	}
	pivot := entries[order[len(order)/2]].Count
	i, j := 0, len(order)-1
	for i <= j {
		// Skip elements already on the correct side: larger counts belong on
		// the left, smaller counts on the right.
		for entries[order[i]].Count > pivot {
			i++
		}
		for entries[order[j]].Count < pivot {
			j--
		}
		if i <= j {
			order[i], order[j] = order[j], order[i]
			i++
			j--
		}
	}
	// Recurse into each side that still holds more than one element.
	if j > 0 {
		sortByCount(order[:j+1], entries)
	}
	if i < len(order)-1 {
		sortByCount(order[i:], entries)
	}
}

// Decode concatenates the bytes of the tokens named by seq, in order, and
// returns them as a new slice. Indices in seq are not range-checked.
func (d *Dict) Decode(seq []uint32) []byte {
	// First pass: total output length, so the buffer is sized once.
	n := 0
	for _, idx := range seq {
		n += int(d.Entries[idx].Len)
	}
	out := []byte{:0:n}
	for _, idx := range seq {
		e := &d.Entries[idx]
		out = append(out, d.Pool[e.Offset:e.Offset+uint32(e.Len)]...)
	}
	return out
}

// Tokenize splits data into consecutive tokens and classifies each one.
//
// Each token's Text is data[start:i] for adjacent, non-overlapping ranges, so
// the Texts concatenated in order reproduce data exactly, and each Text is a
// sub-slice of data rather than a copy. Every branch consumes at least one
// byte, so the loop terminates. The cases are tried in the order written; the
// first match wins.
//
// NOTE(review): the sigils handled here ('@', '%', '!', '#', ';' comments)
// resemble LLVM IR text, but nothing in this file states the input language -
// confirm.
func Tokenize(data []byte) []Token {
	tokens := []Token{:0:len(data) / 4}
	i := 0
	for i < len(data) {
		start := i
		ch := data[i]
		// Anything not recognized below stays a modifier.
		class := TokModifier
		switch {
		case ch == '"':
			// Double-quoted string, backslash escapes honored.
			i = scanQuotedStr(data, i)
			class = TokLiteral
		case ch == '`':
			// Backtick-delimited raw string, no escapes.
			i = scanRawStr(data, i)
			class = TokLiteral
		case ch == '@' && i+1 < len(data) && data[i+1] == '"':
			// '@' immediately followed by a quoted string: one token covering
			// both, classified by whether '(' follows.
			i = scanQuotedStr(data, i+1)
			class = classifyByParen(data, i)
		case ch == '@' && i+1 < len(data) && isWordByte(data[i+1]):
			// '@' followed by a word: one token, classified by whether '('
			// follows.
			i++
			i = scanWordTok(data, i)
			class = classifyByParen(data, i)
		case ch == '%' && i+1 < len(data) && (isWordByte(data[i+1]) || isDigitByte(data[i+1])):
			// '%' followed by a word or digits is always a noun.
			i++
			i = scanWordTok(data, i)
			class = TokNoun
		case ch == '!' && i+1 < len(data) && (isWordByte(data[i+1]) || isDigitByte(data[i+1])):
			// '!' followed by a word or digits.
			i++
			i = scanWordTok(data, i)
			class = TokModifier
		case ch == '#' && i+1 < len(data) && isDigitByte(data[i+1]):
			// '#' followed by a run of decimal digits only.
			i++
			for i < len(data) && isDigitByte(data[i]) {
				i++
			}
			class = TokModifier
		case ch == ';':
			// ';' comment: runs up to, but not including, the newline.
			for i < len(data) && data[i] != '\n' {
				i++
			}
			class = TokModifier
		case ch == '/' && i+1 < len(data) && data[i+1] == '/':
			// "//" comment: runs up to, but not including, the newline.
			for i < len(data) && data[i] != '\n' {
				i++
			}
			class = TokModifier
		case ch == '#' && i+1 < len(data) && isWordByte(data[i+1]):
			// '#' followed by a word; the '#'+digit form was matched by the
			// earlier case.
			i++
			i = scanWordTok(data, i)
			class = TokModifier
		case isWordStartByte(ch):
			// Plain word (may contain '.'): verb if '(' follows directly,
			// otherwise noun.
			i = scanWordTok(data, i)
			class = classifyByParen(data, i)
		case isDigitByte(ch):
			i = scanNumberTok(data, i)
			class = TokLiteral
		case ch == '-' && i+1 < len(data) && isDigitByte(data[i+1]):
			// '-' directly before a digit is folded into the number token.
			i++
			i = scanNumberTok(data, i)
			class = TokLiteral
		default:
			if ch == '\n' {
				// Each newline is its own token.
				i++
			} else if ch == ' ' || ch == '\t' {
				// A run of spaces and tabs collapses into one token.
				for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
					i++
				}
			} else {
				// Any other byte becomes a single-byte token.
				i++
			}
		}
		tokens = append(tokens, Token{
			Text:  data[start:i],
			Class: class,
		})
	}
	return tokens
}

// classifyByParen returns TokVerb when the byte at pos is '(' and TokNoun
// otherwise, including when pos is at or past the end of data. No whitespace
// is skipped before the check.
func classifyByParen(data []byte, pos int) uint8 {
	if pos < len(data) && data[pos] == '(' {
		return TokVerb
	}
	return TokNoun
}

// Token is one lexical unit produced by Tokenize.
type Token struct {
	Text  []byte // the token's bytes; Tokenize sets this to a sub-slice of its input
	Class uint8  // one of the Tok* class constants
}

// scanQuotedStr scans a quoted string whose opening quote is data[i] and
// returns the index just past the matching closing quote, or len(data) if the
// string is unterminated. A backslash that is not the final byte skips the
// byte after it, so an escaped quote does not end the string.
func scanQuotedStr(data []byte, i int) int {
	q := data[i]
	i++
	for i < len(data) {
		if data[i] == '\\' && i+1 < len(data) {
			i += 2
			continue
		}
		if data[i] == q {
			i++
			return i
		}
		i++
	}
	return i
}

// scanRawStr scans a backtick-delimited string starting at data[i] (the byte
// at i is skipped without being inspected) and returns the index just past the
// next backtick, or len(data) if there is none. No escapes are recognized.
func scanRawStr(data []byte, i int) int {
	i++
	for i < len(data) {
		if data[i] == '`' {
			i++
			return i
		}
		i++
	}
	return i
}

// scanWordTok returns the index of the first byte at or after i that is not a
// word-continuation byte (letter, digit, '_' or '.'), or len(data).
func scanWordTok(data []byte, i int) int {
	for i < len(data) && isWordContByte(data[i]) {
		i++
	}
	return i
}

// scanNumberTok scans a numeric literal starting at i and returns the index
// just past it. A "0x"/"0X" prefix is followed by any run of hex digits;
// otherwise the literal is any run of decimal digits and '.' characters (the
// number of dots is not limited, and exponents or suffixes are not consumed).
func scanNumberTok(data []byte, i int) int {
	if i+1 < len(data) && data[i] == '0' && (data[i+1] == 'x' || data[i+1] == 'X') {
		i += 2
		for i < len(data) && isHexByte(data[i]) {
			i++
		}
		return i
	}
	for i < len(data) && (isDigitByte(data[i]) || data[i] == '.') {
		i++
	}
	return i
}

// isWordStartByte reports whether c is an ASCII letter or '_'.
func isWordStartByte(c byte) bool {
	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_'
}

// isWordByte reports whether c is an ASCII letter, '_' or an ASCII digit.
func isWordByte(c byte) bool { return isWordStartByte(c) || isDigitByte(c) }

// isWordContByte reports whether c may continue a word: any word byte or '.'.
func isWordContByte(c byte) bool { return isWordByte(c) || c == '.' }

// isDigitByte reports whether c is an ASCII decimal digit.
func isDigitByte(c byte) bool { return c >= '0' && c <= '9' }

// isHexByte reports whether c is an ASCII hexadecimal digit (0-9, a-f, A-F).
func isHexByte(c byte) bool {
	return isDigitByte(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
}