package iskra

// Token classes stored in Token.Class and DictEntry.Class.
//
// The numeric values are spelled out explicitly rather than generated, and
// TokStructural is retained for compatibility even though nothing in this
// file produces it, so existing values should not be renumbered.
const (
	TokUnknown    uint8 = 0 // zero value; Tokenize never assigns it
	TokNoun       uint8 = 1 // word not followed by ( - identifiers, keywords, types
	TokVerb       uint8 = 2 // word followed by ( - function/method calls
	TokModifier   uint8 = 3 // non-alphanumeric: operators, punctuation, whitespace, metadata
	TokStructural uint8 = 4 // unused - kept for mesh backward compat
	TokLiteral    uint8 = 5 // numbers, string literals
)

// DictEntry describes one distinct token stored in a Dict. The token's bytes
// are not held here; they live in Dict.Pool and are located via Offset and Len.
type DictEntry struct {
	Offset uint32 // byte offset of the token's first byte within Dict.Pool
	Len    uint16 // token length in bytes; Add stores uint16(len(token))
	Class  uint8  // class passed to Add when the token was first inserted
	_pad   uint8  // never read or written in this file; presumably explicit padding
	Count  uint32 // number of Add calls that have seen this token (starts at 1)
}

// Dict is an interning dictionary of tokens. Each distinct token is stored
// once in Pool and described by one element of Entries; Index maps the token
// text back to its position in Entries.
type Dict struct {
	Pool    []byte            // concatenated bytes of every distinct token, in insertion order
	Entries []DictEntry       // one entry per distinct token; a token's index is its position here
	Index   map[string]uint32 // token text -> index into Entries
}

// NewDict returns an empty dictionary ready for Add. Pool and Entries start
// empty with room reserved for 4096 bytes and 1024 entries respectively, and
// Index starts as an empty, non-nil map.
func NewDict() *Dict {
	d := new(Dict)
	d.Pool = []byte{:0:4096}
	d.Entries = []DictEntry{:0:1024}
	d.Index = map[string]uint32{}
	return d
}

// Add interns token and returns its index in d.Entries.
//
// If the token text is already present, its Count is incremented and the
// existing index is returned; class is ignored in that case, so an entry keeps
// the class it was first inserted with. Otherwise the bytes are appended to
// d.Pool, a new entry with Count 1 is appended to d.Entries, and the text is
// registered in d.Index.
//
// The token bytes are copied, so the caller may reuse token afterwards.
//
// NOTE(review): DictEntry.Len is a uint16, so a token longer than 65535 bytes
// is recorded with its length truncated modulo 65536, and Get/Decode will then
// return only that many leading bytes of it. Offsets are likewise uint32.
// Fixing this needs a wider field or a different interface; confirm whether
// callers can ever pass such tokens.
func (d *Dict) Add(token []byte, class uint8) uint32 {
	// Converting inside the index expression lets the Go compiler look the
	// key up without copying token, so the common "already seen" path does
	// not allocate.
	if idx, ok := d.Index[string(token)]; ok {
		d.Entries[idx].Count++
		return idx
	}

	idx := uint32(len(d.Entries))
	off := uint32(len(d.Pool))
	d.Pool = append(d.Pool, token...)
	d.Entries = append(d.Entries, DictEntry{
		Offset: off,
		Len:    uint16(len(token)),
		Class:  class,
		Count:  1,
	})
	// Only a genuinely new token pays for the string copy used as map key.
	d.Index[string(token)] = idx
	return idx
}

// Get returns the bytes of the token stored at index idx.
//
// The result is a sub-slice of d.Pool, not a copy, so it must not be modified
// by the caller. An idx outside d.Entries panics.
func (d *Dict) Get(idx uint32) []byte {
	entry := d.Entries[idx]
	begin := entry.Offset
	end := begin + uint32(entry.Len)
	return d.Pool[begin:end]
}

// EntryCount reports how many distinct tokens the dictionary holds.
func (d *Dict) EntryCount() int {
	return len(d.Entries)
}
// PoolSize reports the total number of token bytes stored in the pool.
func (d *Dict) PoolSize() int {
	return len(d.Pool)
}

// SortByFrequency renumbers the dictionary so that entries with the highest
// Count come first, and rewrites tokenPool to use the new numbering.
//
// d.Entries and d.Index are replaced with reordered copies; d.Pool is left
// untouched because entries keep their original Offset. tokenPool is updated
// in place and the same slice is returned. Every value in tokenPool must be a
// valid index into the dictionary as it was before the call, otherwise the
// remapping panics. An empty dictionary returns tokenPool unchanged.
func (d *Dict) SortByFrequency(tokenPool []uint32) []uint32 {
	n := len(d.Entries)
	if n == 0 {
		return tokenPool
	}

	// perm[rank] is the old index of the entry that ends up at position rank.
	perm := []int32{:n}
	for k := range perm {
		perm[k] = int32(k)
	}
	sortByCount(perm, d.Entries)

	// remap is the inverse of perm: old index -> new index.
	remap := []uint32{:n}
	sorted := []DictEntry{:n}
	for rank := range perm {
		old := perm[rank]
		remap[old] = uint32(rank)
		sorted[rank] = d.Entries[old]
	}
	d.Entries = sorted

	renumbered := map[string]uint32{}
	for text, old := range d.Index {
		renumbered[text] = remap[old]
	}
	d.Index = renumbered

	for k, old := range tokenPool {
		tokenPool[k] = remap[old]
	}
	return tokenPool
}

// sortByCount sorts order in place so that entries[order[k]].Count is
// non-increasing in k. order holds indices into entries; entries itself is
// only read.
//
// It is a recursive quicksort that partitions around the Count of the middle
// element. No ordering is promised between entries with equal Count.
func sortByCount(order []int32, entries []DictEntry) {
	size := len(order)
	if size < 2 {
		return
	}

	pivot := entries[order[size/2]].Count
	lo, hi := 0, size-1
	for lo <= hi {
		// Skip elements already on the correct side of the pivot.
		for entries[order[lo]].Count > pivot {
			lo++
		}
		for entries[order[hi]].Count < pivot {
			hi--
		}
		if lo > hi {
			break
		}
		order[lo], order[hi] = order[hi], order[lo]
		lo++
		hi--
	}

	// Recurse only into partitions that still hold more than one element.
	if hi > 0 {
		sortByCount(order[:hi+1], entries)
	}
	if lo < size-1 {
		sortByCount(order[lo:], entries)
	}
}

// Decode concatenates the tokens named by seq, in order, into a newly
// allocated byte slice.
//
// A first pass adds up the entry lengths so the output can be sized once; a
// second pass copies the bytes out of d.Pool. Any index in seq that is not a
// valid position in d.Entries panics.
func (d *Dict) Decode(seq []uint32) []byte {
	total := 0
	for k := range seq {
		total += int(d.Entries[seq[k]].Len)
	}

	buf := []byte{:0:total}
	for k := range seq {
		ent := d.Entries[seq[k]]
		end := ent.Offset + uint32(ent.Len)
		buf = append(buf, d.Pool[ent.Offset:end]...)
	}
	return buf
}

// Tokenize splits data into consecutive tokens that together cover every
// input byte exactly once, in order. Each Token.Text is a sub-slice of data,
// not a copy.
//
// Classification, checked in this priority order:
//   - "..." and `...` strings are TokLiteral.
//   - @"..." and @word are TokVerb when immediately followed by '(',
//     otherwise TokNoun.
//   - %word is TokNoun.
//   - !word, #digits and #word are TokModifier.
//   - A ';' or "//" comment runs up to, but not including, the next newline
//     and is TokModifier.
//   - A plain word is TokVerb when immediately followed by '(', otherwise
//     TokNoun. Words may contain '.' after their first byte.
//   - A number, optionally preceded by '-', is TokLiteral.
//   - A run of spaces/tabs, a single newline, or any other single byte is
//     TokModifier.
func Tokenize(data []byte) []Token {
	tokens := []Token{:0:len(data) / 4}
	pos := 0
	for pos < len(data) {
		begin := pos
		c := data[pos]

		// One byte of lookahead; next is only meaningful when hasNext is true.
		hasNext := pos+1 < len(data)
		var next byte
		if hasNext {
			next = data[pos+1]
		}

		// Every branch that does not pick a class below yields a modifier.
		class := TokModifier

		switch {
		case c == '"':
			pos = scanQuotedStr(data, pos)
			class = TokLiteral
		case c == '`':
			pos = scanRawStr(data, pos)
			class = TokLiteral
		case c == '@' && hasNext && next == '"':
			pos = scanQuotedStr(data, pos+1)
			class = classifyByParen(data, pos)
		case c == '@' && hasNext && isWordByte(next):
			pos = scanWordTok(data, pos+1)
			class = classifyByParen(data, pos)
		case c == '%' && hasNext && isWordByte(next):
			pos = scanWordTok(data, pos+1)
			class = TokNoun
		case c == '!' && hasNext && isWordByte(next):
			pos = scanWordTok(data, pos+1)
		case c == '#' && hasNext && isDigitByte(next):
			// Must precede the '#' + word case: digits are word bytes too.
			pos++
			for pos < len(data) && isDigitByte(data[pos]) {
				pos++
			}
		case c == ';', c == '/' && hasNext && next == '/':
			// Line comment: stop in front of the newline so it becomes its
			// own token.
			for pos < len(data) && data[pos] != '\n' {
				pos++
			}
		case c == '#' && hasNext && isWordByte(next):
			pos = scanWordTok(data, pos+1)
		case isWordStartByte(c):
			pos = scanWordTok(data, pos)
			class = classifyByParen(data, pos)
		case isDigitByte(c):
			pos = scanNumberTok(data, pos)
			class = TokLiteral
		case c == '-' && hasNext && isDigitByte(next):
			pos = scanNumberTok(data, pos+1)
			class = TokLiteral
		case c == ' ' || c == '\t':
			// Collapse a run of horizontal whitespace into one token.
			for pos < len(data) && (data[pos] == ' ' || data[pos] == '\t') {
				pos++
			}
		default:
			// Newline or any other unrecognized byte: one token per byte.
			pos++
		}

		tokens = append(tokens, Token{
			Text:  data[begin:pos],
			Class: class,
		})
	}
	return tokens
}

func classifyByParen(data []byte, pos int) uint8 {
	if pos < len(data) && data[pos] == '(' {
		return TokVerb
	}
	return TokNoun
}

// Token is one lexical unit produced by Tokenize.
type Token struct {
	Text  []byte // the token's bytes; Tokenize sets this to a sub-slice of its input
	Class uint8  // one of the Tok* class constants
}

// scanQuotedStr scans a quoted string whose opening quote is data[i] and
// returns the index just past the matching closing quote.
//
// A backslash followed by another byte skips both bytes, so an escaped quote
// does not end the string. If no closing quote is found the scan runs to the
// end of data and len(data) is returned.
func scanQuotedStr(data []byte, i int) int {
	quote := data[i]
	for pos := i + 1; pos < len(data); pos++ {
		switch c := data[pos]; {
		case c == '\\' && pos+1 < len(data):
			// Step over the escaped byte; the loop increment then moves
			// past it.
			pos++
		case c == quote:
			return pos + 1
		}
	}
	return len(data)
}

// scanRawStr scans a backquoted string whose opening backquote is at index i
// and returns the index just past the closing backquote. No escape sequences
// are recognized. If the string is unterminated the scan stops at the end of
// data.
func scanRawStr(data []byte, i int) int {
	pos := i + 1
	for ; pos < len(data); pos++ {
		if data[pos] == '`' {
			return pos + 1
		}
	}
	return pos
}

// scanWordTok returns the index of the first byte at or after i that is not a
// word-continuation byte (see isWordContByte), or len(data) if the word runs
// to the end of the input.
func scanWordTok(data []byte, i int) int {
	end := i
	for end < len(data) {
		if !isWordContByte(data[end]) {
			break
		}
		end++
	}
	return end
}

// scanNumberTok scans a numeric literal starting at index i and returns the
// index just past it.
//
// A "0x" or "0X" prefix selects hexadecimal: the prefix and all following hex
// digits are consumed. Otherwise any run of decimal digits and '.' bytes is
// consumed; the number of dots is not checked and no sign or exponent is
// recognized here.
func scanNumberTok(data []byte, i int) int {
	pos := i

	hasHexPrefix := pos+1 < len(data) &&
		data[pos] == '0' &&
		(data[pos+1] == 'x' || data[pos+1] == 'X')
	if hasHexPrefix {
		pos += 2
		for pos < len(data) && isHexByte(data[pos]) {
			pos++
		}
		return pos
	}

	for ; pos < len(data); pos++ {
		if c := data[pos]; c != '.' && !isDigitByte(c) {
			break
		}
	}
	return pos
}

// isWordStartByte reports whether c can begin a word: an ASCII letter or an
// underscore.
func isWordStartByte(c byte) bool {
	switch {
	case 'a' <= c && c <= 'z', 'A' <= c && c <= 'Z', c == '_':
		return true
	default:
		return false
	}
}

// isWordByte reports whether c is an ASCII letter, an underscore, or a
// decimal digit.
func isWordByte(c byte) bool {
	if isWordStartByte(c) {
		return true
	}
	return isDigitByte(c)
}

// isWordContByte reports whether c may appear inside a word after its first
// byte: any word byte, plus '.' so that dotted names scan as a single word.
func isWordContByte(c byte) bool {
	if isWordByte(c) {
		return true
	}
	return c == '.'
}

// isDigitByte reports whether c is an ASCII decimal digit.
func isDigitByte(c byte) bool {
	if c < '0' {
		return false
	}
	return c <= '9'
}

// isHexByte reports whether c is an ASCII hexadecimal digit in either case.
func isHexByte(c byte) bool {
	switch {
	case isDigitByte(c):
		return true
	case 'a' <= c && c <= 'f', 'A' <= c && c <= 'F':
		return true
	default:
		return false
	}
}