rerank.mx raw

   1  package ingest
   2  
   3  import (
   4  	"fmt"
   5  	"io"
   6  	"os"
   7  	"strconv"
   8  
   9  	"git.smesh.lol/iskradb/lattice"
  10  	"git.smesh.lol/transdb"
  11  )
  12  
  13  // PriorityScore maps a Japanese surface form to a frequency score.
  14  // Lower score = more common = preferred as primary (Link[0]) translation.
  15  // Unprioritized forms default to 1000.
  16  type PriorityScore map[string]uint32
  17  
  18  const scoreUnranked uint32 = 1000
  19  
  20  // LoadJMdictPriorities scans JMdict XML for <ke_pri>/<re_pri> tags and builds
  21  // a form → priority_score map. This uses JMdict's editorial frequency data
  22  // (sourced from Ichimoji/newspaper corpora) without requiring a separate download.
  23  //
  24  // Scoring (lower = more common):
  25  //   ichi1=10  ichi2=20  news1=30  news2=40
  26  //   spec1=50  spec2=60  gai1=70   gai2=80
  27  //   nf01-nf48 = 90+N  (newspaper frequency bands)
  28  //   no markers = 1000
  29  func LoadJMdictPriorities(path string) (PriorityScore, error) {
  30  	r, cleanup, err := openInput(path)
  31  	if err != nil {
  32  		return nil, err
  33  	}
  34  	defer cleanup()
  35  
  36  	scores := PriorityScore{}
  37  	sc := NewXMLScanner(r)
  38  	var ev XMLEvent
  39  
  40  	var (
  41  		inEntry, inKEle, inREle bool
  42  		inKeb, inReb            bool
  43  		inKePri, inRePri        bool
  44  		curForms                []string
  45  		curPri                  uint32
  46  	)
  47  
  48  	flush := func() {
  49  		if curPri < scoreUnranked {
  50  			for _, f := range curForms {
  51  				if existing, ok := scores[f]; !ok || curPri < existing {
  52  					scores[f] = curPri
  53  				}
  54  			}
  55  		}
  56  	}
  57  
  58  	for sc.Next(&ev) {
  59  		switch ev.Kind {
  60  		case XMLStart:
  61  			switch ev.Name {
  62  			case "entry":
  63  				inEntry = true
  64  			case "k_ele":
  65  				if inEntry {
  66  					flush()
  67  					curForms = curForms[:0]
  68  					curPri = scoreUnranked
  69  					inKEle = true
  70  				}
  71  			case "r_ele":
  72  				if inEntry {
  73  					flush()
  74  					curForms = curForms[:0]
  75  					curPri = scoreUnranked
  76  					inREle = true
  77  				}
  78  			case "keb":
  79  				inKeb = true
  80  			case "reb":
  81  				inReb = true
  82  			case "ke_pri":
  83  				inKePri = true
  84  			case "re_pri":
  85  				inRePri = true
  86  			}
  87  
  88  		case XMLEnd:
  89  			switch ev.Name {
  90  			case "entry":
  91  				flush()
  92  				inEntry = false
  93  				inKEle = false
  94  				inREle = false
  95  				curForms = curForms[:0]
  96  				curPri = scoreUnranked
  97  			case "k_ele":
  98  				inKEle = false
  99  			case "r_ele":
 100  				inREle = false
 101  			case "keb":
 102  				inKeb = false
 103  			case "reb":
 104  				inReb = false
 105  			case "ke_pri":
 106  				inKePri = false
 107  			case "re_pri":
 108  				inRePri = false
 109  			}
 110  
 111  		case XMLText:
 112  			switch {
 113  			case inKeb && inKEle:
 114  				curForms = append(curForms, ev.Text)
 115  			case inReb && inREle:
 116  				curForms = append(curForms, ev.Text)
 117  			case (inKePri && inKEle) || (inRePri && inREle):
 118  				s := parsePriority(ev.Text)
 119  				if s < curPri {
 120  					curPri = s
 121  				}
 122  			}
 123  		}
 124  	}
 125  	return scores, nil
 126  }
 127  
 128  // parsePriority converts a JMdict priority tag to a numeric score.
 129  func parsePriority(tag string) uint32 {
 130  	switch tag {
 131  	case "ichi1":
 132  		return 10
 133  	case "ichi2":
 134  		return 20
 135  	case "news1":
 136  		return 30
 137  	case "news2":
 138  		return 40
 139  	case "spec1":
 140  		return 50
 141  	case "spec2":
 142  		return 60
 143  	case "gai1":
 144  		return 70
 145  	case "gai2":
 146  		return 80
 147  	}
 148  	// nf01-nf48: newspaper frequency bands
 149  	if len(tag) == 4 && tag[:2] == "nf" {
 150  		if n, err := strconv.ParseUint(tag[2:], 10, 32); err == nil {
 151  			return uint32(90 + n)
 152  		}
 153  	}
 154  	return scoreUnranked
 155  }
 156  
 157  // LoadFreqFromTSV loads an external word frequency file in TSV format:
 158  //   word<TAB>frequency_count  (one per line, frequency is a positive integer)
 159  // Higher count = more common. Scores are converted to ranks (lower = more common).
 160  func LoadFreqFromTSV(path string) (PriorityScore, error) {
 161  	f, err := os.Open(path)
 162  	if err != nil {
 163  		return nil, err
 164  	}
 165  	defer f.Close()
 166  
 167  	// First pass: collect all (word, count) pairs.
 168  	type wc struct {
 169  		word  string
 170  		count uint64
 171  	}
 172  	var entries []wc
 173  
 174  	buf := []byte{:0:256}
 175  	for {
 176  		buf = buf[:0]
 177  		b := [1]byte{}
 178  		for {
 179  			n, err := f.Read(b[:])
 180  			if n > 0 {
 181  				if b[0] == '\n' {
 182  					break
 183  				}
 184  				buf = append(buf, b[0])
 185  			}
 186  			if err == io.EOF {
 187  				goto done
 188  			}
 189  			if err != nil {
 190  				return nil, err
 191  			}
 192  		}
 193  		line := string(buf)
 194  		tab := -1
 195  		for i := 0; i < len(line); i++ {
 196  			if line[i] == '\t' {
 197  				tab = i
 198  				break
 199  			}
 200  		}
 201  		if tab < 0 {
 202  			continue
 203  		}
 204  		word := line[:tab]
 205  		countStr := line[tab+1:]
 206  		count, err := strconv.ParseUint(countStr, 10, 64)
 207  		if err != nil || word == "" {
 208  			continue
 209  		}
 210  		entries = append(entries, wc{word, count})
 211  	}
 212  done:
 213  	// Sort descending by count, assign rank as score.
 214  	// Simple insertion sort is fine for small-to-medium lists.
 215  	for i := 1; i < len(entries); i++ {
 216  		e := entries[i]
 217  		j := i - 1
 218  		for j >= 0 && entries[j].count < e.count {
 219  			entries[j+1] = entries[j]
 220  			j--
 221  		}
 222  		entries[j+1] = e
 223  	}
 224  	scores := PriorityScore{}
 225  	for rank, e := range entries {
 226  		scores[e.word] = uint32(rank + 1) // rank 1 = most frequent
 227  	}
 228  	return scores, nil
 229  }
 230  
 231  // RerankLinks corrects the EN→JA primary translation direction using frequency.
 232  //
 233  // The problem: JMdict is processed in sequence order. The first JA entry with
 234  // EN gloss "government" claims Link[0]; later (more common) entries like 政府
 235  // can only get a back-link (政府.Link[0] = EN record) but the EN record's
 236  // Link[0] still points to the archaic word.
 237  //
 238  // Fix: build an inverted index of all JA records that claim a given EN record,
 239  // then for each EN record pick the JA word with the best (lowest) priority
 240  // score as Link[0]. O(n) with the index pre-built.
 241  func RerankLinks(db *DB, scores PriorityScore) int {
 242  	// Phase 1: build inverted index EN_recIdx → []JA_recIdxs.
 243  	// A JA record "claims" an EN record when JA.Link[0] == enRI.
 244  	inv := map[uint32][]uint32{}
 245  	for recIdx := range db.Tree.RecKey {
 246  		rec := db.Tree.GetRecord(recIdx)
 247  		if rec == nil || rec.Link[0] == lattice.NullRec {
 248  			continue
 249  		}
 250  		form := transdb.FormFromInline(rec, db.StringPool)
 251  		if transdb.Detect(form) != transdb.LangJA {
 252  			continue
 253  		}
 254  		enRI := rec.Link[0]
 255  		inv[enRI] = append(inv[enRI], recIdx)
 256  	}
 257  
 258  	// Phase 2: for each EN record, find the best JA candidate and update Link[0].
 259  	swaps := 0
 260  	enCount := 0
 261  	for enRI, jaRIs := range inv {
 262  		if len(jaRIs) < 2 {
 263  			continue // only one JA claimant, nothing to rerank
 264  		}
 265  		enRec := db.Tree.GetRecord(enRI)
 266  		if enRec == nil {
 267  			continue
 268  		}
 269  		enForm := transdb.FormFromInline(enRec, db.StringPool)
 270  		if transdb.Detect(enForm) != transdb.LangEN {
 271  			continue
 272  		}
 273  		enCount++
 274  
 275  		// Find the JA record with the best priority score.
 276  		// Rank: corpus posterior (primary), register weirdness (tiebreaker).
 277  		// Posterior = JMdict_freq / log2(corpus_count + 1).
 278  		// High corpus evidence halves the effective frequency score per
 279  		// doubling of count, so a frequently observed pair beats a rare
 280  		// JMdict-priority word with no corpus confirmation.
 281  		// Lower combined rank = more preferred.
 282  		//
 283  		// EN.DataLen stores corpus co-occurrence count for Link[0] (inline
 284  		// records only; DataFile==0 means DataLen is unused otherwise).
 285  		rankOf := func(jaRI uint32) uint64 {
 286  			r := db.Tree.GetRecord(jaRI)
 287  			if r == nil {
 288  				return uint64(scoreUnranked)*1000 + 999
 289  			}
 290  			f := transdb.FormFromInline(r, db.StringPool)
 291  			freq := uint64(scoreUnranked)
 292  			if s, ok := scores[f]; ok {
 293  				freq = uint64(s)
 294  			}
 295  			weird := uint64(transdb.BranchWeirdness(r.Branch))
 296  
 297  			// Corpus posterior: JA.DataLen holds co-occurrence evidence count
 298  			// for this JA word across all EN partners (inline records only).
 299  			// Divide effective freq by log2(count+1)+1: each doubling of corpus
 300  			// evidence halves the effective score → corpus-confirmed words rank
 301  			// better regardless of JMdict frequency tier.
 302  			if r.DataFile == 0 {
 303  				count := uint64(r.DataLen)
 304  				if count > 0 {
 305  					doublings := uint64(0)
 306  					for c := count; c > 0; c >>= 1 {
 307  						doublings++
 308  					}
 309  					freq = freq / (doublings + 1)
 310  					if freq == 0 {
 311  						freq = 1
 312  					}
 313  				}
 314  			}
 315  			return freq*1000 + weird
 316  		}
 317  
 318  		bestRI := enRec.Link[0]
 319  		bestRank := rankOf(bestRI)
 320  
 321  		for _, jaRI := range jaRIs {
 322  			if r := rankOf(jaRI); r < bestRank {
 323  				bestRank = r
 324  				bestRI = jaRI
 325  			}
 326  		}
 327  
 328  		if bestRI != enRec.Link[0] {
 329  			// Promote bestRI to Link[0], demote old Link[0] to Link[1].
 330  			enRec.Link[1] = enRec.Link[0]
 331  			enRec.Link[0] = bestRI
 332  			swaps++
 333  		}
 334  	}
 335  
 336  	fmt.Fprintf(os.Stderr, "rerank: examined %d EN records with multiple JA claimants, %d swaps\n",
 337  		enCount, swaps)
 338  	return swaps
 339  }
 340