dict.mx raw

   1  package iskra
   2  
// Token classes assigned by Tokenize to each scanned span of input.
const (
	TokUnknown    uint8 = 0
	TokNoun       uint8 = 1 // word not followed by ( - identifiers, keywords, types
	TokVerb       uint8 = 2 // word followed by ( - function/method calls
	TokModifier   uint8 = 3 // non-alphanumeric: operators, punctuation, whitespace, metadata
	TokStructural uint8 = 4 // unused - kept for mesh backward compat
	TokLiteral    uint8 = 5 // numbers, string literals
)
  11  
// DictEntry describes one interned token: a view into Dict.Pool plus its
// class and occurrence count.
type DictEntry struct {
	Offset uint32 // byte offset of the token's text within Dict.Pool
	Len    uint16 // length in bytes of the token's text
	Class  uint8  // one of the Tok* constants
	_pad   uint8  // explicit padding byte; NOTE(review): presumably fixes the struct layout for serialization — confirm before reordering fields
	Count  uint32 // number of times Add has seen this token
}
  19  
// Dict interns token byte strings. Token texts are packed back to back in
// Pool, described by Entries, and looked up by string through Index. A
// token's id is its position in Entries.
type Dict struct {
	Pool    []byte            // concatenated texts of all interned tokens
	Entries []DictEntry       // one entry per distinct token; index == token id
	Index   map[string]uint32 // token text -> index into Entries
}
  25  
// NewDict returns an empty dictionary ready for Add calls.
// NOTE(review): the {:len:cap} slice literal is not standard Go — it appears
// to be a dialect form preallocating length 0 with the given capacity
// (4 KiB pool, 1024 entries); confirm against the dialect spec.
func NewDict() *Dict {
	return &Dict{
		Pool:    []byte{:0:4096},
		Entries: []DictEntry{:0:1024},
		Index:   map[string]uint32{},
	}
}
  33  
  34  func (d *Dict) Add(token []byte, class uint8) uint32 {
  35  	s := string(token)
  36  	if idx, ok := d.Index[s]; ok {
  37  		d.Entries[idx].Count++
  38  		return idx
  39  	}
  40  	idx := uint32(len(d.Entries))
  41  	off := uint32(len(d.Pool))
  42  	d.Pool = append(d.Pool, token...)
  43  	d.Entries = append(d.Entries, DictEntry{
  44  		Offset: off,
  45  		Len:    uint16(len(token)),
  46  		Class:  class,
  47  		Count:  1,
  48  	})
  49  	d.Index[s] = idx
  50  	return idx
  51  }
  52  
  53  func (d *Dict) Get(idx uint32) []byte {
  54  	e := &d.Entries[idx]
  55  	return d.Pool[e.Offset : e.Offset+uint32(e.Len)]
  56  }
  57  
// EntryCount reports the number of distinct interned tokens.
func (d *Dict) EntryCount() int { return len(d.Entries) }

// PoolSize reports the total bytes of token text stored in Pool.
func (d *Dict) PoolSize() int   { return len(d.Pool) }
  60  
// SortByFrequency reorders Entries so the most frequent tokens get the
// smallest ids (descending Count), rebuilds Index to match, and rewrites
// every id in tokenPool from the old numbering to the new. Returns
// tokenPool (mutated in place).
func (d *Dict) SortByFrequency(tokenPool []uint32) []uint32 {
	n := len(d.Entries)
	if n == 0 {
		return tokenPool
	}

	// Sort a permutation of entry indices rather than the entries
	// themselves, so the old->new id mapping can be derived afterwards.
	order := []int32{:n}
	for i := range order {
		order[i] = int32(i)
	}
	sortByCount(order, d.Entries)

	// After sorting, order[newIdx] == oldIdx; invert it and materialize
	// the reordered entry slice.
	oldToNew := []uint32{:n}
	newEntries := []DictEntry{:n}
	for newIdx, oldIdx := range order {
		oldToNew[oldIdx] = uint32(newIdx)
		newEntries[newIdx] = d.Entries[oldIdx]
	}
	d.Entries = newEntries

	// Index values are entry ids, so they must be remapped too.
	newIndex := map[string]uint32{}
	for s, oldIdx := range d.Index {
		newIndex[s] = oldToNew[oldIdx]
	}
	d.Index = newIndex

	// Patch the previously emitted token stream to the new ids.
	for i := range tokenPool {
		tokenPool[i] = oldToNew[tokenPool[i]]
	}
	return tokenPool
}
  92  
// sortByCount quicksorts order in place so that entries[order[k]].Count is
// non-increasing (highest counts first). Hoare-style partition around the
// middle element's Count, then recursion on both halves. The sort is not
// stable: ties keep no particular relative order.
func sortByCount(order []int32, entries []DictEntry) {
	if len(order) <= 1 {
		return
	}
	pivot := entries[order[len(order)/2]].Count
	i, j := 0, len(order)-1
	for i <= j {
		// Descending: counts above the pivot belong on the left,
		// counts below it on the right.
		for entries[order[i]].Count > pivot {
			i++
		}
		for entries[order[j]].Count < pivot {
			j--
		}
		if i <= j {
			order[i], order[j] = order[j], order[i]
			i++
			j--
		}
	}
	// Recurse only into non-trivial partitions.
	if j > 0 {
		sortByCount(order[:j+1], entries)
	}
	if i < len(order)-1 {
		sortByCount(order[i:], entries)
	}
}
 119  
// Decode concatenates the texts of the given entry ids into one byte slice.
// The output length is pre-computed so the result is built with a single
// allocation.
func (d *Dict) Decode(seq []uint32) []byte {
	n := 0
	for _, idx := range seq {
		n += int(d.Entries[idx].Len)
	}
	out := []byte{:0:n}
	for _, idx := range seq {
		e := &d.Entries[idx]
		out = append(out, d.Pool[e.Offset:e.Offset+uint32(e.Len)]...)
	}
	return out
}
 132  
 133  func Tokenize(data []byte) []Token {
 134  	tokens := []Token{:0:len(data) / 4}
 135  	i := 0
 136  	for i < len(data) {
 137  		start := i
 138  		ch := data[i]
 139  		class := TokModifier
 140  
 141  		switch {
 142  		case ch == '"':
 143  			i = scanQuotedStr(data, i)
 144  			class = TokLiteral
 145  		case ch == '`':
 146  			i = scanRawStr(data, i)
 147  			class = TokLiteral
 148  		case ch == '@' && i+1 < len(data) && data[i+1] == '"':
 149  			i = scanQuotedStr(data, i+1)
 150  			class = classifyByParen(data, i)
 151  		case ch == '@' && i+1 < len(data) && isWordByte(data[i+1]):
 152  			i++
 153  			i = scanWordTok(data, i)
 154  			class = classifyByParen(data, i)
 155  		case ch == '%' && i+1 < len(data) && (isWordByte(data[i+1]) || isDigitByte(data[i+1])):
 156  			i++
 157  			i = scanWordTok(data, i)
 158  			class = TokNoun
 159  		case ch == '!' && i+1 < len(data) && (isWordByte(data[i+1]) || isDigitByte(data[i+1])):
 160  			i++
 161  			i = scanWordTok(data, i)
 162  			class = TokModifier
 163  		case ch == '#' && i+1 < len(data) && isDigitByte(data[i+1]):
 164  			i++
 165  			for i < len(data) && isDigitByte(data[i]) {
 166  				i++
 167  			}
 168  			class = TokModifier
 169  		case ch == ';':
 170  			for i < len(data) && data[i] != '\n' {
 171  				i++
 172  			}
 173  			class = TokModifier
 174  		case ch == '/' && i+1 < len(data) && data[i+1] == '/':
 175  			for i < len(data) && data[i] != '\n' {
 176  				i++
 177  			}
 178  			class = TokModifier
 179  		case ch == '#' && i+1 < len(data) && isWordByte(data[i+1]):
 180  			i++
 181  			i = scanWordTok(data, i)
 182  			class = TokModifier
 183  		case isWordStartByte(ch):
 184  			i = scanWordTok(data, i)
 185  			class = classifyByParen(data, i)
 186  		case isDigitByte(ch):
 187  			i = scanNumberTok(data, i)
 188  			class = TokLiteral
 189  		case ch == '-' && i+1 < len(data) && isDigitByte(data[i+1]):
 190  			i++
 191  			i = scanNumberTok(data, i)
 192  			class = TokLiteral
 193  		default:
 194  			if ch == '\n' {
 195  				i++
 196  			} else if ch == ' ' || ch == '\t' {
 197  				for i < len(data) && (data[i] == ' ' || data[i] == '\t') {
 198  					i++
 199  				}
 200  			} else {
 201  				i++
 202  			}
 203  		}
 204  		tokens = append(tokens, Token{
 205  			Text:  data[start:i],
 206  			Class: class,
 207  		})
 208  	}
 209  	return tokens
 210  }
 211  
 212  func classifyByParen(data []byte, pos int) uint8 {
 213  	if pos < len(data) && data[pos] == '(' {
 214  		return TokVerb
 215  	}
 216  	return TokNoun
 217  }
 218  
// Token is one scanned span of input: its raw bytes (aliasing the slice
// passed to Tokenize) and its class.
type Token struct {
	Text  []byte // raw token bytes; a subslice of the tokenized input
	Class uint8  // one of the Tok* constants
}
 223  
 224  func scanQuotedStr(data []byte, i int) int {
 225  	q := data[i]
 226  	i++
 227  	for i < len(data) {
 228  		if data[i] == '\\' && i+1 < len(data) {
 229  			i += 2
 230  			continue
 231  		}
 232  		if data[i] == q {
 233  			i++
 234  			return i
 235  		}
 236  		i++
 237  	}
 238  	return i
 239  }
 240  
 241  func scanRawStr(data []byte, i int) int {
 242  	i++
 243  	for i < len(data) {
 244  		if data[i] == '`' {
 245  			i++
 246  			return i
 247  		}
 248  		i++
 249  	}
 250  	return i
 251  }
 252  
 253  func scanWordTok(data []byte, i int) int {
 254  	for i < len(data) && isWordContByte(data[i]) {
 255  		i++
 256  	}
 257  	return i
 258  }
 259  
 260  func scanNumberTok(data []byte, i int) int {
 261  	if i+1 < len(data) && data[i] == '0' && (data[i+1] == 'x' || data[i+1] == 'X') {
 262  		i += 2
 263  		for i < len(data) && isHexByte(data[i]) {
 264  			i++
 265  		}
 266  		return i
 267  	}
 268  	for i < len(data) && (isDigitByte(data[i]) || data[i] == '.') {
 269  		i++
 270  	}
 271  	return i
 272  }
 273  
 274  func isWordStartByte(c byte) bool {
 275  	return (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_'
 276  }
 277  
 278  func isWordByte(c byte) bool {
 279  	return isWordStartByte(c) || isDigitByte(c)
 280  }
 281  
 282  func isWordContByte(c byte) bool {
 283  	return isWordByte(c) || c == '.'
 284  }
 285  
 286  func isDigitByte(c byte) bool {
 287  	return c >= '0' && c <= '9'
 288  }
 289  
 290  func isHexByte(c byte) bool {
 291  	return isDigitByte(c) || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F')
 292  }
 293  
 294