package ingest

import (
	"bufio"
	"fmt"
	"io"
	"os"
	"sort"
	"strconv"
	"strings"

	"git.smesh.lol/iskradb/lattice"
	"git.smesh.lol/transdb"
)

// PriorityScore maps a Japanese surface form to a frequency score.
// Lower score = more common = preferred as primary (Link[0]) translation.
// Unprioritized forms default to 1000.
type PriorityScore map[string]uint32

// scoreUnranked is the score of a form that carries no priority marker.
const scoreUnranked uint32 = 1000

// LoadJMdictPriorities scans JMdict XML for <ke_pri> / <re_pri> tags and
// builds a form → priority_score map. This uses JMdict's editorial frequency
// data (sourced from the Ichimango word list and newspaper corpora) without
// requiring a separate download.
//
// Scoring (lower = more common):
//
//	ichi1=10  ichi2=20  news1=30  news2=40
//	spec1=50  spec2=60  gai1=70   gai2=80
//	nf01-nf48 = 90+N (newspaper frequency bands)
//	no markers = 1000
//
// Each <k_ele> / <r_ele> group is scored with the best (lowest) of its
// priority tags. Groups without any recognised tag are not recorded, so
// callers should treat a missing key as scoreUnranked. When the same form
// occurs in several groups or entries, the lowest score is kept.
//
// The only error returned is the one reported by openInput.
func LoadJMdictPriorities(path string) (PriorityScore, error) {
	r, cleanup, err := openInput(path)
	if err != nil {
		return nil, err
	}
	defer cleanup()

	scores := PriorityScore{}
	sc := NewXMLScanner(r)
	var ev XMLEvent

	var (
		inEntry, inKEle, inREle bool
		inKeb, inReb            bool
		inKePri, inRePri        bool
		curForms                []string
		curPri                  uint32
	)

	// flush records the forms of the group collected so far, but only when
	// the group carried at least one recognised priority tag.
	flush := func() {
		if curPri >= scoreUnranked {
			return
		}
		for _, f := range curForms {
			if existing, ok := scores[f]; !ok || curPri < existing {
				scores[f] = curPri
			}
		}
	}
	// reset discards the collected forms and priority of the current group.
	reset := func() {
		curForms = curForms[:0]
		curPri = scoreUnranked
	}

	for sc.Next(&ev) {
		switch ev.Kind {
		case XMLStart:
			switch ev.Name {
			case "entry":
				inEntry = true
			case "k_ele", "r_ele":
				// A new element group starts: commit the previous group
				// first, then begin collecting afresh.
				if inEntry {
					flush()
					reset()
					if ev.Name == "k_ele" {
						inKEle = true
					} else {
						inREle = true
					}
				}
			case "keb":
				inKeb = true
			case "reb":
				inReb = true
			case "ke_pri":
				inKePri = true
			case "re_pri":
				inRePri = true
			}
		case XMLEnd:
			switch ev.Name {
			case "entry":
				// Commit the last group of the entry and clear all state so
				// nothing leaks into the next entry.
				flush()
				inEntry = false
				inKEle = false
				inREle = false
				reset()
			case "k_ele":
				inKEle = false
			case "r_ele":
				inREle = false
			case "keb":
				inKeb = false
			case "reb":
				inReb = false
			case "ke_pri":
				inKePri = false
			case "re_pri":
				inRePri = false
			}
		case XMLText:
			switch {
			case inKeb && inKEle:
				curForms = append(curForms, ev.Text)
			case inReb && inREle:
				curForms = append(curForms, ev.Text)
			case (inKePri && inKEle) || (inRePri && inREle):
				// Keep the best (lowest) score among the group's tags.
				if s := parsePriority(ev.Text); s < curPri {
					curPri = s
				}
			}
		}
	}
	return scores, nil
}

// parsePriority converts a JMdict priority tag to a numeric score.
// Tags that are not recognised yield scoreUnranked.
func parsePriority(tag string) uint32 {
	switch tag {
	case "ichi1":
		return 10
	case "ichi2":
		return 20
	case "news1":
		return 30
	case "news2":
		return 40
	case "spec1":
		return 50
	case "spec2":
		return 60
	case "gai1":
		return 70
	case "gai2":
		return 80
	}
	// nf01-nf48: newspaper frequency bands ("nf" followed by two digits).
	if len(tag) == 4 && tag[:2] == "nf" {
		if n, err := strconv.ParseUint(tag[2:], 10, 32); err == nil {
			return uint32(90 + n)
		}
	}
	return scoreUnranked
}

// LoadFreqFromTSV loads an external word frequency file in TSV format:
//
//	word<TAB>frequency_count
//
// one pair per line, where frequency_count is a non-negative decimal
// integer. A higher count means a more common word. Counts are converted to
// ranks: the most frequent word gets score 1, the next 2, and so on; words
// with equal counts keep their file order.
//
// Lines without a tab, with an empty word, or with an unparsable count are
// skipped. Both LF and CRLF line endings are accepted, and a final line
// without a trailing newline is processed like any other. If a word occurs
// more than once, its best (lowest) rank is kept.
//
// An error is returned only when the file cannot be opened or read.
func LoadFreqFromTSV(path string) (PriorityScore, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	// First pass: collect all (word, count) pairs.
	type wc struct {
		word  string
		count uint64
	}
	var entries []wc

	rd := bufio.NewReader(f)
	for {
		line, readErr := rd.ReadString('\n')
		if readErr != nil && readErr != io.EOF {
			return nil, readErr
		}

		// ReadString may return data together with io.EOF (a last line
		// without a newline), so the line is processed before the EOF check.
		line = strings.TrimSuffix(line, "\n")
		line = strings.TrimSuffix(line, "\r")
		if tab := strings.IndexByte(line, '\t'); tab > 0 {
			if count, perr := strconv.ParseUint(line[tab+1:], 10, 64); perr == nil {
				entries = append(entries, wc{line[:tab], count})
			}
		}

		if readErr == io.EOF {
			break
		}
	}

	// Sort descending by count. The sort is stable so that words with equal
	// counts are ranked in file order.
	sort.SliceStable(entries, func(i, j int) bool {
		return entries[i].count > entries[j].count
	})

	scores := PriorityScore{}
	for rank, e := range entries {
		if _, seen := scores[e.word]; seen {
			// Entries are in descending order, so the first occurrence
			// already holds the word's best rank.
			continue
		}
		scores[e.word] = uint32(rank + 1) // rank 1 = most frequent
	}
	return scores, nil
}

// RerankLinks corrects the EN→JA primary translation direction using frequency.
//
// The problem: JMdict is processed in sequence order. The first JA entry with
// EN gloss "government" claims Link[0]; later (more common) entries like 政府
// can only get a back-link (政府.Link[0] = EN record) but the EN record's
// Link[0] still points to the archaic word.
//
// Fix: build an inverted index of all JA records that claim a given EN record,
// then for each EN record with at least two claimants pick the JA word with
// the best (lowest) rank as Link[0]. When the primary changes, the previous
// Link[0] is moved to Link[1], overwriting whatever Link[1] held.
//
// The current Link[0] is kept unless a claimant ranks strictly better. Among
// equally ranked challengers the lowest record index wins, so the result does
// not depend on iteration order.
//
// It prints a one-line summary to stderr and returns the number of EN records
// whose Link[0] was changed.
func RerankLinks(db *DB, scores PriorityScore) int {
	// Phase 1: build inverted index EN_recIdx → []JA_recIdxs.
	// A JA record "claims" an EN record when JA.Link[0] == enRI.
	inv := map[uint32][]uint32{}
	for recIdx := range db.Tree.RecKey {
		rec := db.Tree.GetRecord(recIdx)
		if rec == nil || rec.Link[0] == lattice.NullRec {
			continue
		}
		form := transdb.FormFromInline(rec, db.StringPool)
		if transdb.Detect(form) != transdb.LangJA {
			continue
		}
		enRI := rec.Link[0]
		inv[enRI] = append(inv[enRI], recIdx)
	}

	// rankOf computes the combined rank of a JA record; lower = preferred.
	//
	// The rank is freq*1000 + weird, where freq is the form's priority score
	// (scoreUnranked when the form is not in scores) and weird is the
	// record's branch weirdness, acting as tiebreaker. A missing record gets
	// the worst rank.
	//
	// Corpus evidence: for records with DataFile == 0 (presumably inline
	// records, where DataLen is reused as a co-occurrence count; confirm
	// against the record layout) a non-zero DataLen divides freq by
	// bitlen(count)+1, i.e. floor(log2(count))+2. A single observation
	// already halves the score, and each doubling of the count adds one to
	// the divisor. The result is clamped to at least 1.
	rankOf := func(jaRI uint32) uint64 {
		r := db.Tree.GetRecord(jaRI)
		if r == nil {
			return uint64(scoreUnranked)*1000 + 999
		}
		f := transdb.FormFromInline(r, db.StringPool)
		freq := uint64(scoreUnranked)
		if s, ok := scores[f]; ok {
			freq = uint64(s)
		}
		weird := uint64(transdb.BranchWeirdness(r.Branch))
		if r.DataFile == 0 {
			if count := uint64(r.DataLen); count > 0 {
				// doublings is the bit length of count.
				doublings := uint64(0)
				for c := count; c > 0; c >>= 1 {
					doublings++
				}
				freq /= doublings + 1
				if freq == 0 {
					freq = 1
				}
			}
		}
		return freq*1000 + weird
	}

	// Phase 2: for each EN record, find the best JA candidate and update Link[0].
	swaps := 0
	enCount := 0
	for enRI, jaRIs := range inv {
		if len(jaRIs) < 2 {
			continue // only one JA claimant, nothing to rerank
		}
		enRec := db.Tree.GetRecord(enRI)
		if enRec == nil {
			continue
		}
		enForm := transdb.FormFromInline(enRec, db.StringPool)
		if transdb.Detect(enForm) != transdb.LangEN {
			continue
		}
		enCount++

		current := enRec.Link[0]
		bestRI := current
		bestRank := rankOf(current)
		for _, jaRI := range jaRIs {
			r := rankOf(jaRI)
			switch {
			case r < bestRank:
				bestRank = r
				bestRI = jaRI
			case r == bestRank && bestRI != current && jaRI < bestRI:
				// Equal-ranked challengers: prefer the lowest record index
				// so the outcome is reproducible across runs.
				bestRI = jaRI
			}
		}

		if bestRI != current {
			// Promote bestRI to Link[0], demote old Link[0] to Link[1].
			enRec.Link[1] = current
			enRec.Link[0] = bestRI
			swaps++
		}
	}

	fmt.Fprintf(os.Stderr, "rerank: examined %d EN records with multiple JA claimants, %d swaps\n",
		enCount, swaps)
	return swaps
}