package ingest

import (
	"bufio"
	"fmt"
	"io"
	"os"
	"slices"

	"git.smesh.lol/iskradb/lattice"
	"git.smesh.lol/transdb"
)

// CooccurConfig holds the tunable thresholds for co-occurrence counting
// and PMI filtering.
type CooccurConfig struct {
	// MinCooc is the minimum number of times a pair must co-occur
	// (default 3).
	MinCooc uint32
	// PMIMin is the minimum PMI score a pair must reach
	// (default 2.0 ≈ 4× expected).
	PMIMin float64
	// MaxPairs caps the number of new pairs inserted (0 = no limit).
	MaxPairs int
	// MaxPairsPerSentence skips sentences whose EN×JA token product
	// exceeds this value (default 40).
	MaxPairsPerSentence int
	// JAWordlist is the path to a pre-built JA wordlist file; when empty
	// the JA form set is built from the lattice instead.
	JAWordlist string
}

// DefaultCooccurConfig returns the baseline configuration: at least 3
// co-occurrences, PMI of at least 2.0, no cap on inserted pairs, at most
// 40 EN×JA combinations per sentence, and no pre-built JA wordlist.
func DefaultCooccurConfig() CooccurConfig {
	return CooccurConfig{
		MinCooc:             3,
		PMIMin:              2.0,
		MaxPairs:            0,
		MaxPairsPerSentence: 40,
	}
}

// ExtendFromSentences reads parallel EN/JA sentence files, computes PMI
// co-occurrence scores, and inserts high-confidence pairs into db that
// are not already covered by existing links.
//
// The work runs in five phases:
//  1. Build the set of valid JA forms, from cfg.JAWordlist when set and
//     otherwise by scanning the lattice.
//  2. Stream the sentence pairs and count co-occurrences.
//  3. Score every pair seen at least cfg.MinCooc times and keep those whose
//     PMI is at least cfg.PMIMin, sorted by descending PMI.
//  4. Insert candidates in that order until cfg.MaxPairs insertions have
//     succeeded (0 = no limit).
//  5. Add co-occurrence counts to the DataLen field of inline JA records.
//
// It returns the number of pairs inserted in phase 4. An error is returned
// when the wordlist cannot be loaded or the sentence files cannot be read;
// in that case nothing has been inserted. When verbose is true, progress is
// written to os.Stderr.
func ExtendFromSentences(db *DB, enPath, jaPath string, cfg CooccurConfig, verbose bool) (int, error) {
	// Phase 1: build set of valid JA forms.
	// Use pre-built wordlist file if provided (faster); otherwise scan lattice.
	var validJA map[string]uint32
	if cfg.JAWordlist != "" {
		wl, err := transdb.LoadWordlist(cfg.JAWordlist)
		if err != nil {
			return 0, fmt.Errorf("load wordlist: %w", err)
		}
		validJA = wl
		if verbose {
			fmt.Fprintf(os.Stderr, "extend: %d valid JA forms from wordlist\n", len(validJA))
		}
	} else {
		validJA = buildValidJASet(db)
		if verbose {
			fmt.Fprintf(os.Stderr, "extend: %d valid JA forms from lattice\n", len(validJA))
		}
	}

	// Phase 2: stream sentence pairs, count co-occurrences.
	cooc, enFreq, jaFreq, total, err := countCooc(db.Tree, enPath, jaPath, validJA, cfg.MaxPairsPerSentence, verbose)
	if err != nil {
		return 0, err
	}
	if verbose {
		fmt.Fprintf(os.Stderr, "extend: %d sentence pairs, %d unique co-occurrence pairs\n",
			total, len(cooc))
	}

	// Phase 3: score with PMI, collect high-scoring candidates.
	type Candidate struct {
		EN    string
		ENCtx uint64 // 22-bit coord (cooccurrence axis only for now)
		JA    string
		PMI   float64
	}
	var candidates []Candidate
	for pair, cnt := range cooc {
		if cnt < cfg.MinCooc {
			continue
		}
		en, enCtx, ja := splitPairCtx(pair)
		pmi := pmiScore(cnt, enFreq[en], jaFreq[ja], uint32(total))
		if pmi >= cfg.PMIMin {
			candidates = append(candidates, Candidate{EN: en, ENCtx: enCtx, JA: ja, PMI: pmi})
		}
	}
	// Sort descending by PMI. Candidates were collected in map-iteration
	// order, so the relative order of equal-PMI entries is arbitrary.
	slices.SortFunc(candidates, func(a, b Candidate) int {
		if a.PMI > b.PMI {
			return -1
		}
		if a.PMI < b.PMI {
			return 1
		}
		return 0
	})
	if verbose {
		fmt.Fprintf(os.Stderr, "extend: %d candidates above PMI %.1f\n", len(candidates), cfg.PMIMin)
	}

	// Phase 4: insert new pairs into lattice, best PMI first.
	inserted := 0
	for _, c := range candidates {
		if cfg.MaxPairs > 0 && inserted >= cfg.MaxPairs {
			break
		}
		if insertCooccurPair(db, c.EN, c.ENCtx, c.JA) {
			inserted++
		}
	}

	// Phase 5: accumulate corpus evidence counts in JA.DataLen.
	// For each (EN, JA) pair seen at least cfg.MinCooc times — the PMI
	// threshold is NOT re-applied here — find the JA record and add the
	// pair's count to its DataLen. JA.DataLen = total co-occurrence evidence
	// across all EN partners.
	// Stored on the JA record so every candidate in a rerank comparison carries
	// its own evidence — challengers are not disadvantaged vs the current Link[0].
	// Only inline JA records (DataFile==0) are counted.
	// Accumulates across corpus re-runs.
	const maxDataLen = 0xFFFFFFFF
	confirmed := 0
	for pair, cnt := range cooc {
		if cnt < cfg.MinCooc {
			continue
		}
		_, _, ja := splitPairCtx(pair)
		jaKey := transdb.MakeKey(transdb.LangJA, 0, ja)
		for _, b := range transdb.ActiveBranches {
			jaRI := db.Tree.LookupRecIdx(lattice.Branch(b), jaKey)
			if jaRI == lattice.NullRec {
				continue
			}
			jaRec := db.Tree.GetRecord(jaRI)
			if jaRec == nil || jaRec.DataFile != 0 {
				break // overflow — DataLen is byte length, don't touch
			}
			// Saturating add. Checking only "DataLen < max" before a plain
			// += lets the sum wrap around to a small value whenever cnt is
			// larger than the remaining headroom.
			if cnt > maxDataLen-jaRec.DataLen {
				jaRec.DataLen = maxDataLen
			} else {
				jaRec.DataLen += cnt
			}
			confirmed++
			break
		}
	}
	if verbose && confirmed > 0 {
		fmt.Fprintf(os.Stderr, "extend: %d JA records gained corpus evidence counts\n", confirmed)
	}

	return inserted, nil
}

// buildValidJASet walks every record index in db.Tree.RecKey and returns a
// map from JA surface form to the index of a record carrying that form.
// A record contributes only when its inline form is non-empty and
// transdb.Detect classifies that form as transdb.LangJA. When several
// records share a form, the one visited last wins.
func buildValidJASet(db *DB) map[string]uint32 {
	forms := map[string]uint32{}
	for idx := range db.Tree.RecKey {
		rec := db.Tree.GetRecord(idx)
		if rec == nil {
			continue
		}
		form := transdb.FormFromInline(rec, db.StringPool)
		if form == "" {
			continue
		}
		if transdb.Detect(form) != transdb.LangJA {
			continue
		}
		forms[form] = idx
	}
	return forms
}

// countCooc streams two parallel files line-by-line and counts
// co-occurrences between EN tokens (with POS trigram context) and JA substrings.
//
// Line i of enPath is paired with line i of jaPath; reading stops as soon as
// either file has no further line.
//
// Results:
//   - cooc: number of sentence pairs in which each key occurred, keyed by
//     joinPairCtx(ctx, enToken, jaToken). Entries with a count below 2 are
//     evicted every pruneInterval line pairs to bound memory.
//   - enFreq, jaFreq: number of accepted sentence pairs containing each token.
//   - total: number of line pairs read, including pairs that were then
//     skipped as degenerate or oversized.
//   - err: non-nil when either file cannot be opened or scanned; the maps are
//     nil in that case.
//
// A sentence pair is skipped when either side yields no tokens or when
// len(enTokens)*len(jaTokens) exceeds maxPairsPerSentence.
func countCooc(tree *lattice.Tree, enPath, jaPath string, validJA map[string]uint32, maxPairsPerSentence int, verbose bool) (
	cooc map[string]uint32, enFreq map[string]uint32, jaFreq map[string]uint32, total int, err error) {

	enF, err := os.Open(enPath)
	if err != nil {
		return nil, nil, nil, 0, fmt.Errorf("open %s: %w", enPath, err)
	}
	defer enF.Close()

	jaF, err := os.Open(jaPath)
	if err != nil {
		return nil, nil, nil, 0, fmt.Errorf("open %s: %w", jaPath, err)
	}
	defer jaF.Close()

	cooc = map[string]uint32{}
	enFreq = map[string]uint32{}
	jaFreq = map[string]uint32{}

	enSc := bufio.NewScanner(enF)
	jaSc := bufio.NewScanner(jaF)

	// Prune every pruneInterval sentences: evict pairs seen < 2 times.
	// More frequent = lower memory, slightly less recall on rare-but-valid pairs.
	const pruneInterval = 10000

	logInterval := 100000
	for enSc.Scan() && jaSc.Scan() {
		enLine := enSc.Text()
		jaLine := jaSc.Text()
		total++

		if verbose && total%logInterval == 0 {
			fmt.Fprintf(os.Stderr, "extend: processed %d sentence pairs... (cooc map: %d entries)\n",
				total, len(cooc))
		}

		if total%pruneInterval == 0 {
			before := len(cooc)
			for k, v := range cooc {
				if v < 2 {
					delete(cooc, k)
				}
			}
			if verbose {
				fmt.Fprintf(os.Stderr, "extend: pruned cooc map %d→%d entries\n", before, len(cooc))
			}
		}

		enToks := tokenizeENSentence(enLine)
		jaToks := extractJATokens(jaLine, validJA)

		// Skip degenerate pairs.
		if len(enToks) == 0 || len(jaToks) == 0 {
			continue
		}
		// Skip sentences whose cartesian product is too large.
		if len(enToks)*len(jaToks) > maxPairsPerSentence {
			continue
		}

		// Compute POS context for each EN token position (overlapping trigram window).
		enPOS := []uint8{:len(enToks):len(enToks)}
		for i, t := range enToks {
			enPOS[i] = transdb.POSForWord(tree, transdb.LangEN, t)
		}

		// Dedup JA tokens within sentence.
		jaSeen := map[string]bool{}
		for _, t := range jaToks {
			jaSeen[t] = true
		}

		// EN frequency: count unique words (not per-position to avoid inflation).
		enCounted := map[string]bool{}
		for _, t := range enToks {
			if !enCounted[t] {
				enFreq[t]++
				enCounted[t] = true
			}
		}
		for j := range jaSeen {
			jaFreq[j]++
		}

		// Count co-occurrences per EN position (with overlapping trigram ctx).
		enPosSeen := map[string]bool{} // dedup (enWord+ctx, jaWord) within sentence
		for i, e := range enToks {
			// Neighbours outside the sentence keep the zero POS value.
			var prev, next uint8
			if i > 0 {
				prev = enPOS[i-1]
			}
			if i+1 < len(enPOS) {
				next = enPOS[i+1]
			}
			// Cooccurrence axis: (prev_type, next_type) packed into coord.
			// cur position (enPOS[i]) is implicit in the word's grammatical axis.
			cooccur := transdb.CoordCooccur(prev, next)
			ctx := transdb.PackCoord(0, 0, cooccur, 0, 0, 0, 0)
			for j := range jaSeen {
				k := joinPairCtx(ctx, e, j)
				if !enPosSeen[k] {
					enPosSeen[k] = true
					cooc[k]++
				}
			}
		}
	}
	// Scan returns false both at end of input and on failure, so each
	// scanner must be asked why it stopped. Without the JA check, a read
	// error in the JA file would look like a short corpus.
	if err = enSc.Err(); err != nil {
		return nil, nil, nil, total, fmt.Errorf("scan %s: %w", enPath, err)
	}
	if err = jaSc.Err(); err != nil {
		return nil, nil, nil, total, fmt.Errorf("scan %s: %w", jaPath, err)
	}
	_ = io.EOF // keeps the file-level "io" import referenced
	return cooc, enFreq, jaFreq, total, nil
}

// enStopWords is the set of high-frequency English function words dropped
// during tokenization: they co-occur with everything, so they add noise and
// memory pressure without contributing any translation signal.
var enStopWords = map[string]bool{
	// Articles.
	"the": true, "a": true, "an": true,
	// Forms of "to be".
	"is": true, "are": true, "was": true, "were": true,
	"be": true, "been": true, "being": true,
	// Auxiliaries.
	"have": true, "has": true, "had": true,
	"do": true, "does": true, "did": true,
	// Modals.
	"will": true, "would": true, "could": true, "should": true,
	"may": true, "might": true, "shall": true, "can": true,
	// Prepositions and particles.
	"of": true, "in": true, "to": true, "for": true, "on": true,
	"at": true, "by": true, "with": true, "from": true, "as": true,
	"about": true, "into": true, "up": true, "out": true,
	// Conjunctions and negation.
	"and": true, "or": true, "but": true, "not": true, "no": true,
	"if": true, "so": true, "than": true,
	// Pronouns, determiners and quantifiers.
	"it": true, "its": true, "this": true, "that": true, "these": true,
	"those": true, "he": true, "she": true, "we": true, "they": true,
	"you": true, "me": true, "him": true, "her": true, "us": true,
	"them": true, "my": true, "your": true, "his": true, "our": true,
	"their": true, "all": true, "some": true, "any": true, "more": true,
	// Question words.
	"what": true, "which": true, "who": true, "when": true,
	"where": true, "how": true, "why": true,
	// Common adverbs.
	"just": true, "also": true, "then": true, "now": true,
	"here": true, "there": true,
}

// tokenizeENSentence breaks an English subtitle line into lowercase tokens.
// A token is a maximal run of the bytes a-z, A-Z and 0-9; every other byte
// acts as a separator. Runs shorter than 3 bytes and entries of enStopWords
// are discarded. The result is nil when no token survives.
func tokenizeENSentence(line string) []string {
	const minTokenLen = 3

	var out []string
	var word []byte
	// Walk one index past the end: the zero byte used there is a separator,
	// so the final word is flushed by the same branch as all the others.
	for i := 0; i <= len(line); i++ {
		var ch byte
		if i < len(line) {
			ch = line[i]
		}
		switch {
		case ch >= 'A' && ch <= 'Z':
			word = append(word, ch+('a'-'A')) // fold to lowercase
		case (ch >= 'a' && ch <= 'z') || (ch >= '0' && ch <= '9'):
			word = append(word, ch)
		default:
			if len(word) >= minTokenLen {
				// Copy out of the scratch buffer, which is reused below.
				tok := string(append([]byte(nil), word...))
				if !enStopWords[tok] {
					out = append(out, tok)
				}
			}
			word = word[:0]
		}
	}
	return out
}

// extractJATokens returns every distinct substring of line that spans
// between 2 and 20 codepoints and is present as a key in validJA. Tokens
// are listed in order of first discovery: by start position, then by
// increasing length.
//
// Uses byte-offset iteration because Moxie's range yields bytes not runes
// and []rune(string) does not decode UTF-8.  CJK/kana are 3 bytes each;
// utf8CharLen supplies the byte length of each codepoint from its first byte.
func extractJATokens(line string, validJA map[string]uint32) []string {
	// Single-codepoint JA forms are skipped (particles: は, が, を, に, の…).
	const minCodepoints = 2
	const maxCodepoints = 20

	// bounds[k] is the byte offset at which codepoint k starts; the final
	// entry is len(line) so that bounds[k+1] always closes codepoint k.
	bounds := []int{:0:len(line)/3 + 1}
	for pos := 0; pos < len(line); pos += utf8CharLen(line[pos]) {
		bounds = append(bounds, pos)
	}
	bounds = append(bounds, len(line))

	count := len(bounds) - 1 // number of codepoints
	emitted := map[string]bool{}
	var found []string
	for from := 0; from < count; from++ {
		for to := from + minCodepoints; to <= count && to-from <= maxCodepoints; to++ {
			cand := line[bounds[from]:bounds[to]]
			if emitted[cand] {
				continue
			}
			if _, known := validJA[cand]; !known {
				continue
			}
			// Copy: cand is a slice of line which aliases the scanner buffer.
			found = append(found, string(append([]byte(nil), []byte(cand)...)))
			emitted[cand] = true
		}
	}
	return found
}

// utf8CharLen returns the number of bytes occupied by the UTF-8 sequence
// whose first byte is b: 1 for ASCII, and 2, 3 or 4 for the multi-byte lead
// bytes.
//
// Bytes that cannot start a sequence — continuation bytes (0x80–0xBF) and
// 0xF8–0xFF — yield 1, so a caller stepping through malformed input advances
// a single byte and resynchronises on the next real lead byte instead of
// skipping over it.
func utf8CharLen(b byte) int {
	switch {
	case b < 0x80: // 0xxxxxxx: ASCII
		return 1
	case b < 0xC0: // 10xxxxxx: stray continuation byte
		return 1
	case b < 0xE0: // 110xxxxx
		return 2
	case b < 0xF0: // 1110xxxx
		return 3
	case b < 0xF8: // 11110xxx
		return 4
	default: // 11111xxx: never valid in UTF-8
		return 1
	}
}

// pmiScore computes pointwise mutual information (in bits / log2).
// pmi = log2(P(x,y) / (P(x)*P(y))) = log2(cnt*N / freqX / freqY)
//
// cnt is the joint count of the pair, freqX and freqY are the marginal
// counts of its two members, and N is the number of observations. When any
// of freqX, freqY or N is zero the score is undefined and 0 is returned.
// Otherwise the result is whatever the package-local log2 helper returns for
// the ratio, including its value for a zero ratio when cnt is 0.
func pmiScore(cnt, freqX, freqY, N uint32) float64 {
	if freqX == 0 || freqY == 0 || N == 0 {
		return 0
	}
	// Multiply in float64 so that products of two uint32 values cannot wrap.
	// Both frequencies are non-zero here, so the denominator is at least 1
	// and needs no further zero check.
	num := float64(cnt) * float64(N)
	den := float64(freqX) * float64(freqY)
	return log2(num / den)
}

// log2 returns a piecewise-linear approximation of the base-2 logarithm of x.
//
// x is halved or doubled until it lies in [1, 2), which yields the integer
// part; the fractional part is then approximated by x-1. The result is exact
// when x is a power of two and otherwise slightly underestimates the true
// logarithm (by at most about 0.086).
//
// Finite sentinels are returned instead of non-finite values:
//   - x <= 0 or NaN: -999
//   - +Inf: 999
func log2(x float64) float64 {
	const maxFinite = 1.7976931348623157e308 // largest finite float64

	// !(x > 0) is true for zero, negatives and NaN alike.
	if !(x > 0) {
		return -999
	}
	// +Inf never drops below 2 when halved, so the loop below would not
	// terminate for it.
	if x > maxFinite {
		return 999
	}

	whole := 0.0
	for x >= 2.0 {
		x /= 2.0
		whole += 1.0
	}
	for x < 1.0 {
		x *= 2.0
		whole -= 1.0
	}
	// x is now in [1, 2). Linear approximation: log2(x) ≈ x - 1.
	return whole + (x - 1.0)
}

// joinPairCtx encodes (coord uint64, enWord, jaWord) as a cooc map key.
//
// Layout: the 8 bytes of ctx in little-endian order, then en, then a single
// 0x00 separator, then ja. splitPairCtx splits at the first 0x00 that follows
// the 8-byte header, so en must not contain a NUL byte. Tokens produced by
// tokenizeENSentence satisfy this: they consist only of the bytes a-z and 0-9.
func joinPairCtx(ctx uint64, en, ja string) string {
	return string([]byte{
		byte(ctx), byte(ctx >> 8), byte(ctx >> 16), byte(ctx >> 24),
		byte(ctx >> 32), byte(ctx >> 40), byte(ctx >> 48), byte(ctx >> 56),
	}) | en | "\x00" | ja
}

// splitPairCtx is the inverse of joinPairCtx. The first 8 bytes of pair are
// read as a little-endian uint64 context; the remainder is split at its
// first 0x00 byte into the EN word (before) and the JA word (after).
//
// A key shorter than 8 bytes yields ("", 0, ""). A key with no separator
// after the header yields the whole remainder as en and an empty ja.
func splitPairCtx(pair string) (en string, ctx uint64, ja string) {
	const hdrLen = 8
	if len(pair) < hdrLen {
		return "", 0, ""
	}

	// Fold from the most significant byte down so that byte 0 ends up in
	// the lowest 8 bits.
	for i := hdrLen - 1; i >= 0; i-- {
		ctx = ctx<<8 | uint64(pair[i])
	}

	body := pair[hdrLen:]
	sep := 0
	for sep < len(body) && body[sep] != 0 {
		sep++
	}
	if sep == len(body) {
		return body, ctx, ""
	}
	return body[:sep], ctx, body[sep+1:]
}

// insertCooccurPair adds a corpus-derived EN→JA translation link.
//
// enCtx is the packed context coordinate that becomes part of the EN key.
// The JA word must already have a record in one of the active branches, and
// no EN record may yet exist under (enCtx, enWord); otherwise nothing is
// changed and false is returned. On success a new EN record is inserted
// whose Link[0] is the JA record.
//
// For enCtx == 0 (baseline) the link is also made symmetric: if the JA
// record has no Link[0] yet, it is pointed back at the new EN record. For
// enCtx != 0 (context entries) the JA record's links are left untouched.
//
// Returns true if something was inserted.
func insertCooccurPair(db *DB, enWord string, enCtx uint64, jaWord string) bool {
	enKey := transdb.MakeKey(transdb.LangEN, enCtx, enWord)
	jaKey := transdb.MakeKey(transdb.LangJA, 0, jaWord)

	// Find the JA target; without one there is nothing to link to.
	targetRI := lattice.NullRec
	for _, br := range transdb.ActiveBranches {
		targetRI = db.Tree.LookupRecIdx(lattice.Branch(br), jaKey)
		if targetRI != lattice.NullRec {
			break
		}
	}
	if targetRI == lattice.NullRec {
		return false
	}

	// Refuse to overwrite an EN record already stored under this key.
	for _, br := range transdb.ActiveBranches {
		if db.Tree.LookupRecIdx(lattice.Branch(br), enKey) != lattice.NullRec {
			return false
		}
	}

	target := db.Tree.GetRecord(targetRI)
	if target == nil {
		return false
	}

	// Build the EN record in the branch derived from the JA record's branch.
	dest := lattice.Branch(transdb.POSFromBranch(target.Branch))
	var fresh lattice.Record
	transdb.SetFormOnRecord(&fresh, enWord, &db.StringPool)
	fresh.Branch = uint8(dest)
	fresh.Link[0] = targetRI
	db.Tree.InsertRec(dest, enKey, fresh)

	// Context entries never modify the JA side.
	if enCtx != 0 {
		return true
	}

	// Baseline entries: wire JA→EN when JA has no primary EN link yet.
	target = db.Tree.GetRecord(targetRI) // re-fetch after potential realloc
	if target == nil || target.Link[0] != lattice.NullRec {
		return true
	}
	for _, br := range transdb.ActiveBranches {
		if ri := db.Tree.LookupRecIdx(lattice.Branch(br), enKey); ri != lattice.NullRec {
			target.Link[0] = ri
			break
		}
	}
	return true
}