package ingest import ( "bufio" "fmt" "io" "os" "slices" "git.smesh.lol/iskradb/lattice" "git.smesh.lol/transdb" ) // CooccurConfig controls co-occurrence counting and PMI filtering. type CooccurConfig struct { MinCooc uint32 // minimum times a pair must co-occur (default 3) PMIMin float64 // minimum PMI score (default 2.0 ≈ 4× expected) MaxPairs int // maximum new pairs to insert (0 = no limit) MaxPairsPerSentence int // skip sentences whose EN×JA product exceeds this (default 40) JAWordlist string // path to pre-built JA wordlist file (empty = build from lattice) } func DefaultCooccurConfig() CooccurConfig { return CooccurConfig{MinCooc: 3, PMIMin: 2.0, MaxPairs: 0, MaxPairsPerSentence: 40} } // ExtendFromSentences reads parallel EN/JA sentence files, computes PMI // co-occurrence scores, and inserts high-confidence pairs into db that // are not already covered by existing links. func ExtendFromSentences(db *DB, enPath, jaPath string, cfg CooccurConfig, verbose bool) (int, error) { // Phase 1: build set of valid JA forms. // Use pre-built wordlist file if provided (faster); otherwise scan lattice. var validJA map[string]uint32 if cfg.JAWordlist != "" { wl, err := transdb.LoadWordlist(cfg.JAWordlist) if err != nil { return 0, fmt.Errorf("load wordlist: %w", err) } validJA = wl if verbose { fmt.Fprintf(os.Stderr, "extend: %d valid JA forms from wordlist\n", len(validJA)) } } else { validJA = buildValidJASet(db) if verbose { fmt.Fprintf(os.Stderr, "extend: %d valid JA forms from lattice\n", len(validJA)) } } // Phase 2: stream sentence pairs, count co-occurrences. cooc, enFreq, jaFreq, total, err := countCooc(db.Tree, enPath, jaPath, validJA, cfg.MaxPairsPerSentence, verbose) if err != nil { return 0, err } if verbose { fmt.Fprintf(os.Stderr, "extend: %d sentence pairs, %d unique co-occurrence pairs\n", total, len(cooc)) } // Phase 3: score with PMI, collect high-scoring candidates. 
type Candidate struct { EN string ENCtx uint64 // 22-bit coord (cooccurrence axis only for now) JA string PMI float64 } var candidates []Candidate for pair, cnt := range cooc { if cnt < cfg.MinCooc { continue } en, enCtx, ja := splitPairCtx(pair) pmi := pmiScore(cnt, enFreq[en], jaFreq[ja], uint32(total)) if pmi >= cfg.PMIMin { candidates = append(candidates, Candidate{en, enCtx, ja, pmi}) } } // Sort descending by PMI. slices.SortFunc(candidates, func(a, b Candidate) int { if a.PMI > b.PMI { return -1 } if a.PMI < b.PMI { return 1 } return 0 }) if verbose { fmt.Fprintf(os.Stderr, "extend: %d candidates above PMI %.1f\n", len(candidates), cfg.PMIMin) } // Phase 4: insert new pairs into lattice. inserted := 0 for _, c := range candidates { if cfg.MaxPairs > 0 && inserted >= cfg.MaxPairs { break } if insertCooccurPair(db, c.EN, c.ENCtx, c.JA) { inserted++ } } // Phase 5: accumulate corpus evidence counts in JA.DataLen. // For each high-PMI (EN, JA) pair, find the JA record and increment its // DataLen. JA.DataLen = total co-occurrence evidence across all EN partners. // Stored on the JA record so every candidate in a rerank comparison carries // its own evidence — challengers are not disadvantaged vs the current Link[0]. // Only inline JA records (DataFile==0, form ≤23 bytes) are counted. // Accumulates across corpus re-runs. 
confirmed := 0 for pair, cnt := range cooc { if cnt < cfg.MinCooc { continue } _, _, ja := splitPairCtx(pair) jaKey := transdb.MakeKey(transdb.LangJA, 0, ja) for _, b := range transdb.ActiveBranches { jaRI := db.Tree.LookupRecIdx(lattice.Branch(b), jaKey) if jaRI == lattice.NullRec { continue } jaRec := db.Tree.GetRecord(jaRI) if jaRec == nil || jaRec.DataFile != 0 { break // overflow — DataLen is byte length, don't touch } if jaRec.DataLen < 0xFFFFFFFF { jaRec.DataLen += cnt } confirmed++ break } } if verbose && confirmed > 0 { fmt.Fprintf(os.Stderr, "extend: %d JA records gained corpus evidence counts\n", confirmed) } return inserted, nil } // buildValidJASet collects all JA surface forms from the existing lattice // into a map[form]recIdx for fast substring matching. // Language is detected from the form content (JA = hiragana/katakana/CJK). func buildValidJASet(db *DB) map[string]uint32 { valid := map[string]uint32{} for recIdx := range db.Tree.RecKey { rec := db.Tree.GetRecord(recIdx) if rec == nil { continue } form := transdb.FormFromInline(rec, db.StringPool) if form != "" && transdb.Detect(form) == transdb.LangJA { valid[form] = recIdx } } return valid } // countCooc streams two parallel files line-by-line and counts // co-occurrences between EN tokens (with POS trigram context) and JA substrings. 
func countCooc(tree *lattice.Tree, enPath, jaPath string, validJA map[string]uint32, maxPairsPerSentence int, verbose bool) ( cooc map[string]uint32, enFreq map[string]uint32, jaFreq map[string]uint32, total int, err error) { enF, err := os.Open(enPath) if err != nil { return nil, nil, nil, 0, fmt.Errorf("open %s: %w", enPath, err) } defer enF.Close() jaF, err := os.Open(jaPath) if err != nil { return nil, nil, nil, 0, fmt.Errorf("open %s: %w", jaPath, err) } defer jaF.Close() cooc = map[string]uint32{} enFreq = map[string]uint32{} jaFreq = map[string]uint32{} enSc := bufio.NewScanner(enF) jaSc := bufio.NewScanner(jaF) // Prune every pruneInterval sentences: evict pairs seen < 2 times. // More frequent = lower memory, slightly less recall on rare-but-valid pairs. const pruneInterval = 10000 logInterval := 100000 for enSc.Scan() && jaSc.Scan() { enLine := enSc.Text() jaLine := jaSc.Text() total++ if verbose && total%logInterval == 0 { fmt.Fprintf(os.Stderr, "extend: processed %d sentence pairs... (cooc map: %d entries)\n", total, len(cooc)) } if total%pruneInterval == 0 { before := len(cooc) for k, v := range cooc { if v < 2 { delete(cooc, k) } } if verbose { fmt.Fprintf(os.Stderr, "extend: pruned cooc map %d→%d entries\n", before, len(cooc)) } } enToks := tokenizeENSentence(enLine) jaToks := extractJATokens(jaLine, validJA) // Skip degenerate pairs. if len(enToks) == 0 || len(jaToks) == 0 { continue } // Skip sentences whose cartesian product is too large. if len(enToks)*len(jaToks) > maxPairsPerSentence { continue } // Compute POS context for each EN token position (overlapping trigram window). enPOS := []uint8{:len(enToks):len(enToks)} for i, t := range enToks { enPOS[i] = transdb.POSForWord(tree, transdb.LangEN, t) } // Dedup JA tokens within sentence. jaSeen := map[string]bool{} for _, t := range jaToks { jaSeen[t] = true } // EN frequency: count unique words (not per-position to avoid inflation). 
enCounted := map[string]bool{} for _, t := range enToks { if !enCounted[t] { enFreq[t]++ enCounted[t] = true } } for j := range jaSeen { jaFreq[j]++ } // Count co-occurrences per EN position (with overlapping trigram ctx). enPosSeen := map[string]bool{} // dedup (enWord+ctx, jaWord) within sentence for i, e := range enToks { var prev, next uint8 if i > 0 { prev = enPOS[i-1] } if i+1 < len(enPOS) { next = enPOS[i+1] } // Cooccurrence axis: (prev_type, next_type) packed into coord. // cur position (enPOS[i]) is implicit in the word's grammatical axis. cooccur := transdb.CoordCooccur(prev, next) ctx := transdb.PackCoord(0, 0, cooccur, 0, 0, 0, 0) for j := range jaSeen { k := joinPairCtx(ctx, e, j) if !enPosSeen[k] { enPosSeen[k] = true cooc[k]++ } } } } if err = enSc.Err(); err != nil { return nil, nil, nil, total, fmt.Errorf("scan %s: %w", enPath, err) } _ = io.EOF // suppress unused import warning return cooc, enFreq, jaFreq, total, nil } // enStopWords are high-frequency English function words that co-occur with // everything and produce no translation signal — only noise and memory pressure. 
// Note: tokenizeENSentence discards tokens shorter than 3 bytes before it
// consults this table, so the one- and two-letter entries are never hit by it.
var enStopWords = map[string]bool{
	"the": true, "a": true, "an": true, "is": true, "are": true, "was": true,
	"were": true, "be": true, "been": true, "being": true, "have": true,
	"has": true, "had": true, "do": true, "does": true, "did": true,
	"will": true, "would": true, "could": true, "should": true, "may": true,
	"might": true, "shall": true, "can": true, "of": true, "in": true,
	"to": true, "for": true, "on": true, "at": true, "by": true, "with": true,
	"from": true, "as": true, "and": true, "or": true, "but": true, "not": true,
	"no": true, "it": true, "its": true, "this": true, "that": true,
	"these": true, "those": true, "he": true, "she": true, "we": true,
	"they": true, "you": true, "me": true, "him": true, "her": true, "us": true,
	"them": true, "my": true, "your": true, "his": true, "our": true,
	"their": true, "what": true, "which": true, "who": true, "all": true,
	"if": true, "so": true, "up": true, "out": true, "just": true, "also": true,
	"than": true, "when": true, "where": true, "how": true, "why": true,
	"about": true, "into": true, "then": true, "now": true, "here": true,
	"there": true, "some": true, "any": true, "more": true,
}

// tokenizeENSentence splits an English subtitle line into lowercase tokens,
// skipping stop words and tokens shorter than 3 chars.
//
// A token is a maximal run of ASCII letters and digits; every other byte
// (including any non-ASCII byte) acts as a separator. Returns nil when the
// line yields no tokens.
func tokenizeENSentence(line string) []string {
	var tokens []string
	var cur []byte
	for i := 0; i < len(line); i++ {
		c := line[i]
		switch {
		case c >= 'A' && c <= 'Z':
			cur = append(cur, c+('a'-'A')) // fold to lowercase
		case (c >= 'a' && c <= 'z') || (c >= '0' && c <= '9'):
			cur = append(cur, c)
		default:
			tokens = appendENToken(tokens, cur)
			cur = cur[:0]
		}
	}
	// Flush the token that runs up to the end of the line, if any.
	return appendENToken(tokens, cur)
}

// appendENToken appends the token accumulated in cur to tokens unless it is
// shorter than 3 bytes or a stop word. The bytes are copied because the
// caller reuses cur's backing array for the next token.
func appendENToken(tokens []string, cur []byte) []string {
	if len(cur) < 3 {
		return tokens
	}
	tok := string(append([]byte(nil), cur...))
	if enStopWords[tok] {
		return tokens
	}
	return append(tokens, tok)
}

// extractJATokens finds all substrings of line (2 to 20 codepoints long)
// that exist as valid JA forms in the lattice.
// // Uses byte-offset iteration because Moxie's range yields bytes not runes // and []rune(string) does not decode UTF-8. CJK/kana are 3 bytes each; // utf8Start computes the correct byte-length per codepoint from the first byte. func extractJATokens(line string, validJA map[string]uint32) []string { // Build byte offsets for each codepoint boundary. offsets := []int{:0:len(line)/3 + 1} i := 0 for i < len(line) { offsets = append(offsets, i) i += utf8CharLen(line[i]) } offsets = append(offsets, len(line)) seen := map[string]bool{} var tokens []string maxCodepoints := 20 minCodepoints := 2 // skip single-char JA (particles: は, が, を, に, の…) n := len(offsets) - 1 // number of codepoints for start := 0; start < n; start++ { for l := minCodepoints; l <= maxCodepoints && start+l <= n; l++ { sub := line[offsets[start]:offsets[start+l]] if _, ok := validJA[sub]; ok && !seen[sub] { // Copy: sub is a slice of line which aliases the scanner buffer. tokens = append(tokens, string(append([]byte(nil), []byte(sub)...))) seen[sub] = true } } } return tokens } // utf8CharLen returns the byte length of the UTF-8 codepoint starting at b. func utf8CharLen(b byte) int { switch { case b < 0x80: return 1 case b < 0xE0: return 2 case b < 0xF0: return 3 default: return 4 } } // pmiScore computes pointwise mutual information (in bits / log2). // pmi = log2(P(x,y) / (P(x)*P(y))) = log2(cnt*N / freqX / freqY) func pmiScore(cnt, freqX, freqY, N uint32) float64 { if freqX == 0 || freqY == 0 || N == 0 { return 0 } // Use log2 approximation via integer arithmetic converted to float. num := float64(cnt) * float64(N) den := float64(freqX) * float64(freqY) if den == 0 { return 0 } return log2(num / den) } // log2 computes natural-log-based log2 using ln(x)/ln(2). func log2(x float64) float64 { if x <= 0 { return -999 } // Integer-based approximation: count leading bits. // For the PMI use case (x often in 1-1000 range), this is accurate enough. 
// Use the series ln(x) ≈ 2*arctanh((x-1)/(x+1)) for x near 1. // Better: implement as bit manipulation + correction. // For simplicity, compute using a precomputed table of powers of 2. result := 0.0 for x >= 2.0 { x /= 2.0 result += 1.0 } for x < 1.0 { x *= 2.0 result -= 1.0 } // x is now in [1, 2). Use linear approximation: log2(x) ≈ x - 1. result += x - 1.0 return result } // joinPairCtx encodes (coord uint64, enWord, jaWord) as a cooc map key. // coord stored as 8 LE bytes. EN tokens are ASCII (≥0x61) so no ambiguity. func joinPairCtx(ctx uint64, en, ja string) string { return string([]byte{ byte(ctx), byte(ctx >> 8), byte(ctx >> 16), byte(ctx >> 24), byte(ctx >> 32), byte(ctx >> 40), byte(ctx >> 48), byte(ctx >> 56), }) | en | "\x00" | ja } // splitPairCtx decodes a key produced by joinPairCtx. func splitPairCtx(pair string) (en string, ctx uint64, ja string) { if len(pair) < 8 { return "", 0, "" } ctx = uint64(pair[0]) | uint64(pair[1])<<8 | uint64(pair[2])<<16 | uint64(pair[3])<<24 | uint64(pair[4])<<32 | uint64(pair[5])<<40 | uint64(pair[6])<<48 | uint64(pair[7])<<56 rest := pair[8:] for i := 0; i < len(rest); i++ { if rest[i] == 0 { return rest[:i], ctx, rest[i+1:] } } return rest, ctx, "" } // insertCooccurPair inserts a corpus-derived EN-JA translation link. // enCtx is the packed 3-position POS window. For ctx=0 (baseline) the // logic is symmetric: new EN record points to JA and vice versa. // For ctx≠0 (context entries), only the EN record is created pointing // to the existing JA record — JA links are not modified. // Returns true if something was inserted. func insertCooccurPair(db *DB, enWord string, enCtx uint64, jaWord string) bool { enKey := transdb.MakeKey(transdb.LangEN, enCtx, enWord) jaKey := transdb.MakeKey(transdb.LangJA, 0, jaWord) // JA must exist in lattice. 
jaRI := lattice.NullRec for _, b := range transdb.ActiveBranches { if ri := db.Tree.LookupRecIdx(lattice.Branch(b), jaKey); ri != lattice.NullRec { jaRI = ri break } } if jaRI == lattice.NullRec { return false } // EN record at this context key must not already exist. for _, b := range transdb.ActiveBranches { if db.Tree.LookupRecIdx(lattice.Branch(b), enKey) != lattice.NullRec { return false } } // Create EN record pointing to JA. jaRec := db.Tree.GetRecord(jaRI) if jaRec == nil { return false } branch := lattice.Branch(transdb.POSFromBranch(jaRec.Branch)) var enRec lattice.Record transdb.SetFormOnRecord(&enRec, enWord, &db.StringPool) enRec.Branch = uint8(branch) enRec.Link[0] = jaRI db.Tree.InsertRec(branch, enKey, enRec) // For ctx=0 new words: also wire JA→EN if JA has no primary EN link yet. if enCtx == 0 { jaRec = db.Tree.GetRecord(jaRI) // re-fetch after potential realloc if jaRec != nil && jaRec.Link[0] == lattice.NullRec { newEnRI := lattice.NullRec for _, b := range transdb.ActiveBranches { if ri := db.Tree.LookupRecIdx(lattice.Branch(b), enKey); ri != lattice.NullRec { newEnRI = ri break } } if newEnRI != lattice.NullRec { jaRec.Link[0] = newEnRI } } } return true }