package iskra

import (
	"bufio"
	"bytes"
	"encoding/binary"
	"io"
	"os"
	"sort"

	"git.smesh.lol/iskradb/lattice"
)

// AtomIdxEntry is one row of the sidecar index. The lattice key it points
// at is reconstructible from these fields via AtomLinkKey, so we only
// store the components, not the hashed key itself.
//
// Sort order is: SrcLang, then SrcAtom, then ContextA, then DstLang,
// then DstAtom. This makes "all candidates for (srcLang, srcAtom)" and
// "all candidates for (srcLang, srcAtom, contextA)" both prefix scans.
type AtomIdxEntry struct {
	SrcLang    uint8
	DstLang    uint8
	RoleA      uint8
	RoleB      uint8
	Gen        uint8  // GenLegacy or GenContexted
	RArchaic   uint8  // source-corpus register coord: archaism axis
	RDiscourse uint8  // source-corpus register coord: discourse-shape axis
	Weight     uint32 // observation count
	SrcAtom    string
	ContextA   string
	DstAtom    string
	ContextB   string
}

// AtomIdx is the in-memory sorted index loaded from the sidecar file.
// Built by BuildAtomIdx from a Tree's Bpragmatic records; serialized to
// disk for reuse without rescanning the lattice.
type AtomIdx struct {
	Entries []AtomIdxEntry
}

// BuildAtomIdx scans every Bpragmatic record in the lattice, decodes
// each as an AtomIdxEntry (recognizing both GenLegacy '=' form and
// GenContexted '|' form), and returns a sorted index. The index is
// disposable: regenerate from the primary store at any time.
func BuildAtomIdx(t *Tree) *AtomIdx {
	idx := &AtomIdx{}
	for ri := range t.RecMeta {
		rec := t.DB().GetRecord(uint32(ri))
		if rec == nil || rec.Branch != uint8(lattice.Bpragmatic) {
			continue
		}
		m := t.MetaAt(uint32(ri))
		if m == nil {
			continue
		}
		form := FormFromRecord(rec, t.Pool())
		w := m.Count
		if w == 0 {
			continue
		}
		// StageTag low nibble = srcLang; high nibble = generation.
		srcLang := m.StageTag & 0x0f
		gen := (m.StageTag >> 4) & 0x0f

		entry := AtomIdxEntry{
			SrcLang:    srcLang,
			Gen:        gen,
			Weight:     w,
			RArchaic:   m.Extra[2],
			RDiscourse: m.Extra[3],
		}
		if gen == GenContexted {
			// Form: "atomA|contextA|atomB|contextB"
			parts := splitPipe4(form)
			if parts == nil {
				continue
			}
			entry.SrcAtom = parts[0]
			entry.ContextA = parts[1]
			entry.DstAtom = parts[2]
			entry.ContextB = parts[3]
			entry.RoleA = m.Extra[4]
			entry.RoleB = m.Extra[0]
			entry.DstLang = m.Extra[1]
		} else if gen == GenDictionary {
			// Form: "atomA=atomB" with role/lang in Extra.
			eq := -1
			for j := 0; j < len(form); j++ {
				if form[j] == '=' {
					eq = j
					break
				}
			}
			if eq < 0 {
				continue
			}
			entry.SrcAtom = form[:eq]
			entry.DstAtom = form[eq+1:]
			entry.RoleA = m.Extra[4]
			entry.RoleB = m.Extra[0]
			entry.DstLang = m.Extra[1]
		} else {
			// Form: "atomA=atomB" (legacy bilateral, no context, no roles)
			eq := -1
			for j := 0; j < len(form); j++ {
				if form[j] == '=' {
					eq = j
					break
				}
			}
			if eq < 0 {
				continue
			}
			entry.SrcAtom = form[:eq]
			entry.DstAtom = form[eq+1:]
			// Legacy bilateral: srcLang indicated by StageTag; dstLang
			// implicit (the other side of the bilateral). For the JA-EN
			// bilateral table that's the only existing pair, srcLang is
			// in {1,2} and dstLang is the complement.
			if srcLang == 1 {
				entry.DstLang = 2
			} else if srcLang == 2 {
				entry.DstLang = 1
			}
		}
		// Skip pathological entries: anything whose component strings
		// wouldn't fit in uint16 length fields. These are typically
		// lemmatizer artifacts from old ingest (whole sentences treated
		// as atoms) - rare and not useful for lookup.
		if len(entry.SrcAtom) > 0xffff || len(entry.ContextA) > 0xffff ||
			len(entry.DstAtom) > 0xffff || len(entry.ContextB) > 0xffff {
			continue
		}
		// Skip empty src/dst atoms - lemmatizer-collapsed records that
		// won't be looked up by any meaningful query.
		if entry.SrcAtom == "" || entry.DstAtom == "" {
			continue
		}
		idx.Entries = append(idx.Entries, entry)
	}
	// Mirror every entry so lookups work in both directions.
	// Records are ingested langA->langB only; without mirroring,
	// the reverse direction finds zero candidates.
	base := len(idx.Entries)
	for i := 0; i < base; i++ {
		e := &idx.Entries[i]
		idx.Entries = append(idx.Entries, AtomIdxEntry{
			SrcLang:    e.DstLang,
			DstLang:    e.SrcLang,
			RoleA:      e.RoleB,
			RoleB:      e.RoleA,
			Gen:        e.Gen,
			RArchaic:   e.RArchaic,
			RDiscourse: e.RDiscourse,
			Weight:     e.Weight,
			SrcAtom:    e.DstAtom,
			ContextA:   e.ContextB,
			DstAtom:    e.SrcAtom,
			ContextB:   e.ContextA,
		})
	}
	idx.sortEntries()
	return idx
}

// atomIdxSort implements sort.Interface for AtomIdxEntry slices, ordering
// by (SrcLang, SrcAtom, ContextA, DstLang, DstAtom). Used by sortEntries
// to produce the prefix-scannable order.
type atomIdxSort []AtomIdxEntry

func (s atomIdxSort) Len() int32 { return len(s) }
func (s atomIdxSort) Swap(i, j int32) {
	s[i], s[j] = s[j], s[i]
}
func (s atomIdxSort) Less(i, j int32) bool {
	a := &s[i]
	b := &s[j]
	if a.SrcLang != b.SrcLang {
		return a.SrcLang < b.SrcLang
	}
	if a.SrcAtom != b.SrcAtom {
		return a.SrcAtom < b.SrcAtom
	}
	if a.ContextA != b.ContextA {
		return a.ContextA < b.ContextA
	}
	if a.DstLang != b.DstLang {
		return a.DstLang < b.DstLang
	}
	return a.DstAtom < b.DstAtom
}

func (idx *AtomIdx) sortEntries() {
	sort.Sort(atomIdxSort(idx.Entries))
}

// FindBySrc returns all entries with the given srcLang and srcAtom.
// Prefix scan on the sorted slice - O(log N) binary search + O(K) range
// where K is the number of matches.
func (idx *AtomIdx) FindBySrc(srcLang uint8, srcAtom string) []AtomIdxEntry {
	// Binary-search for first entry >= (srcLang, srcAtom).
	lo := sort.Search(len(idx.Entries), func(i int32) bool {
		e := &idx.Entries[i]
		if e.SrcLang != srcLang {
			return e.SrcLang > srcLang
		}
		return e.SrcAtom >= srcAtom
	})
	var out []AtomIdxEntry
	for k := lo; k < len(idx.Entries); k++ {
		e := &idx.Entries[k]
		if e.SrcLang != srcLang || e.SrcAtom != srcAtom {
			break
		}
		out = append(out, *e)
	}
	return out
}

// FindBySrcContext returns entries that exactly match the full (srcLang,
// srcAtom, contextA) triple. Used by the lookup function's most-specific
// tier before relaxing context.
func (idx *AtomIdx) FindBySrcContext(srcLang uint8, srcAtom, contextA string) []AtomIdxEntry {
	all := idx.FindBySrc(srcLang, srcAtom)
	var out []AtomIdxEntry
	for _, e := range all {
		if e.ContextA == contextA {
			out = append(out, e)
		}
	}
	return out
}

// Save writes the index to a binary file. Format is a length-prefixed
// stream of entries; can be read back by Load.
func (idx *AtomIdx) Save(path string) error {
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	defer f.Close()
	w := bufio.NewWriter(f)
	// Magic + version header. v2 added per-record register coordinate.
	if _, err := w.Write([]byte("ATIDX\x02")); err != nil {
		return err
	}
	// Entry count.
	var hdr [4]byte
	binary.LittleEndian().PutUint32(hdr[:], uint32(len(idx.Entries)))
	if _, err := w.Write(hdr[:]); err != nil {
		return err
	}
	for _, e := range idx.Entries {
		if err := writeEntry(w, &e); err != nil {
			return err
		}
	}
	return w.Flush()
}

// Load reads a sidecar index from disk.
func LoadAtomIdx(path string) (*AtomIdx, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()
	r := bufio.NewReader(f)
	magic := [6]byte{}
	if _, err := io.ReadFull(r, magic[:]); err != nil {
		return nil, err
	}
	if string(magic[:]) != "ATIDX\x02" {
		return nil, ErrAtomIdxFormat
	}
	var hdr [4]byte
	if _, err := io.ReadFull(r, hdr[:]); err != nil {
		return nil, err
	}
	n := binary.LittleEndian().Uint32(hdr[:])
	idx := &AtomIdx{Entries: []AtomIdxEntry{:0:int32(n)}}
	for i := uint32(0); i < n; i++ {
		e, err := readEntry(r)
		if err != nil {
			return nil, err
		}
		idx.Entries = append(idx.Entries, e)
	}
	return idx, nil
}

const ErrAtomIdxFormat = errorString("atomidx: bad magic / version")

type errorString string

func (e errorString) Error() string { return string(e) }

func writeEntry(w io.Writer, e *AtomIdxEntry) error {
	var hdr [11]byte
	hdr[0] = e.SrcLang
	hdr[1] = e.DstLang
	hdr[2] = e.RoleA
	hdr[3] = e.RoleB
	hdr[4] = e.Gen
	hdr[5] = e.RArchaic
	hdr[6] = e.RDiscourse
	binary.LittleEndian().PutUint32(hdr[7:11], e.Weight)
	if _, err := w.Write(hdr[:]); err != nil {
		return err
	}
	var lens [8]byte
	binary.LittleEndian().PutUint16(lens[0:2], uint16(len(e.SrcAtom)))
	binary.LittleEndian().PutUint16(lens[2:4], uint16(len(e.ContextA)))
	binary.LittleEndian().PutUint16(lens[4:6], uint16(len(e.DstAtom)))
	binary.LittleEndian().PutUint16(lens[6:8], uint16(len(e.ContextB)))
	if _, err := w.Write(lens[:]); err != nil {
		return err
	}
	if _, err := w.Write([]byte(e.SrcAtom)); err != nil {
		return err
	}
	if _, err := w.Write([]byte(e.ContextA)); err != nil {
		return err
	}
	if _, err := w.Write([]byte(e.DstAtom)); err != nil {
		return err
	}
	if _, err := w.Write([]byte(e.ContextB)); err != nil {
		return err
	}
	return nil
}

func readEntry(r io.Reader) (AtomIdxEntry, error) {
	var hdr [11]byte
	if _, err := io.ReadFull(r, hdr[:]); err != nil {
		return AtomIdxEntry{}, err
	}
	var lens [8]byte
	if _, err := io.ReadFull(r, lens[:]); err != nil {
		return AtomIdxEntry{}, err
	}
	srcLen := int32(binary.LittleEndian().Uint16(lens[0:2]))
	ctxALen := int32(binary.LittleEndian().Uint16(lens[2:4]))
	dstLen := int32(binary.LittleEndian().Uint16(lens[4:6]))
	ctxBLen := int32(binary.LittleEndian().Uint16(lens[6:8]))
	buf := []byte{:srcLen + ctxALen + dstLen + ctxBLen:srcLen + ctxALen + dstLen + ctxBLen}
	if _, err := io.ReadFull(r, buf); err != nil {
		return AtomIdxEntry{}, err
	}
	off := 0
	srcAtom := string(buf[off : off+srcLen])
	off += srcLen
	ctxA := string(buf[off : off+ctxALen])
	off += ctxALen
	dstAtom := string(buf[off : off+dstLen])
	off += dstLen
	ctxB := string(buf[off : off+ctxBLen])
	_ = off
	return AtomIdxEntry{
		SrcLang:    hdr[0],
		DstLang:    hdr[1],
		RoleA:      hdr[2],
		RoleB:      hdr[3],
		Gen:        hdr[4],
		RArchaic:   hdr[5],
		RDiscourse: hdr[6],
		Weight:     binary.LittleEndian().Uint32(hdr[7:11]),
		SrcAtom:    srcAtom,
		ContextA:   ctxA,
		DstAtom:    dstAtom,
		ContextB:   ctxB,
	}, nil
}

// splitPipe4 splits a "a|b|c|d" form into [a, b, c, d]. Returns nil if
// the form has the wrong number of separators (defensive against legacy
// records that slipped through).
func splitPipe4(s string) []string {
	parts := bytes.SplitN([]byte(s), []byte{'|'}, 4)
	if len(parts) != 4 {
		return nil
	}
	return []string{string(parts[0]), string(parts[1]), string(parts[2]), string(parts[3])}
}