rerank.mx raw
1 package ingest
2
3 import (
4 "fmt"
5 "io"
6 "os"
7 "strconv"
8
9 "git.smesh.lol/iskradb/lattice"
10 "git.smesh.lol/transdb"
11 )
12
13 // PriorityScore maps a Japanese surface form to a frequency score.
14 // Lower score = more common = preferred as primary (Link[0]) translation.
15 // Unprioritized forms default to 1000.
16 type PriorityScore map[string]uint32
17
18 const scoreUnranked uint32 = 1000
19
20 // LoadJMdictPriorities scans JMdict XML for <ke_pri>/<re_pri> tags and builds
21 // a form → priority_score map. This uses JMdict's editorial frequency data
22 // (sourced from Ichimoji/newspaper corpora) without requiring a separate download.
23 //
24 // Scoring (lower = more common):
25 // ichi1=10 ichi2=20 news1=30 news2=40
26 // spec1=50 spec2=60 gai1=70 gai2=80
27 // nf01-nf48 = 90+N (newspaper frequency bands)
28 // no markers = 1000
29 func LoadJMdictPriorities(path string) (PriorityScore, error) {
30 r, cleanup, err := openInput(path)
31 if err != nil {
32 return nil, err
33 }
34 defer cleanup()
35
36 scores := PriorityScore{}
37 sc := NewXMLScanner(r)
38 var ev XMLEvent
39
40 var (
41 inEntry, inKEle, inREle bool
42 inKeb, inReb bool
43 inKePri, inRePri bool
44 curForms []string
45 curPri uint32
46 )
47
48 flush := func() {
49 if curPri < scoreUnranked {
50 for _, f := range curForms {
51 if existing, ok := scores[f]; !ok || curPri < existing {
52 scores[f] = curPri
53 }
54 }
55 }
56 }
57
58 for sc.Next(&ev) {
59 switch ev.Kind {
60 case XMLStart:
61 switch ev.Name {
62 case "entry":
63 inEntry = true
64 case "k_ele":
65 if inEntry {
66 flush()
67 curForms = curForms[:0]
68 curPri = scoreUnranked
69 inKEle = true
70 }
71 case "r_ele":
72 if inEntry {
73 flush()
74 curForms = curForms[:0]
75 curPri = scoreUnranked
76 inREle = true
77 }
78 case "keb":
79 inKeb = true
80 case "reb":
81 inReb = true
82 case "ke_pri":
83 inKePri = true
84 case "re_pri":
85 inRePri = true
86 }
87
88 case XMLEnd:
89 switch ev.Name {
90 case "entry":
91 flush()
92 inEntry = false
93 inKEle = false
94 inREle = false
95 curForms = curForms[:0]
96 curPri = scoreUnranked
97 case "k_ele":
98 inKEle = false
99 case "r_ele":
100 inREle = false
101 case "keb":
102 inKeb = false
103 case "reb":
104 inReb = false
105 case "ke_pri":
106 inKePri = false
107 case "re_pri":
108 inRePri = false
109 }
110
111 case XMLText:
112 switch {
113 case inKeb && inKEle:
114 curForms = append(curForms, ev.Text)
115 case inReb && inREle:
116 curForms = append(curForms, ev.Text)
117 case (inKePri && inKEle) || (inRePri && inREle):
118 s := parsePriority(ev.Text)
119 if s < curPri {
120 curPri = s
121 }
122 }
123 }
124 }
125 return scores, nil
126 }
127
128 // parsePriority converts a JMdict priority tag to a numeric score.
129 func parsePriority(tag string) uint32 {
130 switch tag {
131 case "ichi1":
132 return 10
133 case "ichi2":
134 return 20
135 case "news1":
136 return 30
137 case "news2":
138 return 40
139 case "spec1":
140 return 50
141 case "spec2":
142 return 60
143 case "gai1":
144 return 70
145 case "gai2":
146 return 80
147 }
148 // nf01-nf48: newspaper frequency bands
149 if len(tag) == 4 && tag[:2] == "nf" {
150 if n, err := strconv.ParseUint(tag[2:], 10, 32); err == nil {
151 return uint32(90 + n)
152 }
153 }
154 return scoreUnranked
155 }
156
157 // LoadFreqFromTSV loads an external word frequency file in TSV format:
158 // word<TAB>frequency_count (one per line, frequency is a positive integer)
159 // Higher count = more common. Scores are converted to ranks (lower = more common).
160 func LoadFreqFromTSV(path string) (PriorityScore, error) {
161 f, err := os.Open(path)
162 if err != nil {
163 return nil, err
164 }
165 defer f.Close()
166
167 // First pass: collect all (word, count) pairs.
168 type wc struct {
169 word string
170 count uint64
171 }
172 var entries []wc
173
174 buf := []byte{:0:256}
175 for {
176 buf = buf[:0]
177 b := [1]byte{}
178 for {
179 n, err := f.Read(b[:])
180 if n > 0 {
181 if b[0] == '\n' {
182 break
183 }
184 buf = append(buf, b[0])
185 }
186 if err == io.EOF {
187 goto done
188 }
189 if err != nil {
190 return nil, err
191 }
192 }
193 line := string(buf)
194 tab := -1
195 for i := 0; i < len(line); i++ {
196 if line[i] == '\t' {
197 tab = i
198 break
199 }
200 }
201 if tab < 0 {
202 continue
203 }
204 word := line[:tab]
205 countStr := line[tab+1:]
206 count, err := strconv.ParseUint(countStr, 10, 64)
207 if err != nil || word == "" {
208 continue
209 }
210 entries = append(entries, wc{word, count})
211 }
212 done:
213 // Sort descending by count, assign rank as score.
214 // Simple insertion sort is fine for small-to-medium lists.
215 for i := 1; i < len(entries); i++ {
216 e := entries[i]
217 j := i - 1
218 for j >= 0 && entries[j].count < e.count {
219 entries[j+1] = entries[j]
220 j--
221 }
222 entries[j+1] = e
223 }
224 scores := PriorityScore{}
225 for rank, e := range entries {
226 scores[e.word] = uint32(rank + 1) // rank 1 = most frequent
227 }
228 return scores, nil
229 }
230
231 // RerankLinks corrects the EN→JA primary translation direction using frequency.
232 //
233 // The problem: JMdict is processed in sequence order. The first JA entry with
234 // EN gloss "government" claims Link[0]; later (more common) entries like 政府
235 // can only get a back-link (政府.Link[0] = EN record) but the EN record's
236 // Link[0] still points to the archaic word.
237 //
238 // Fix: build an inverted index of all JA records that claim a given EN record,
239 // then for each EN record pick the JA word with the best (lowest) priority
240 // score as Link[0]. O(n) with the index pre-built.
241 func RerankLinks(db *DB, scores PriorityScore) int {
242 // Phase 1: build inverted index EN_recIdx → []JA_recIdxs.
243 // A JA record "claims" an EN record when JA.Link[0] == enRI.
244 inv := map[uint32][]uint32{}
245 for recIdx := range db.Tree.RecKey {
246 rec := db.Tree.GetRecord(recIdx)
247 if rec == nil || rec.Link[0] == lattice.NullRec {
248 continue
249 }
250 form := transdb.FormFromInline(rec, db.StringPool)
251 if transdb.Detect(form) != transdb.LangJA {
252 continue
253 }
254 enRI := rec.Link[0]
255 inv[enRI] = append(inv[enRI], recIdx)
256 }
257
258 // Phase 2: for each EN record, find the best JA candidate and update Link[0].
259 swaps := 0
260 enCount := 0
261 for enRI, jaRIs := range inv {
262 if len(jaRIs) < 2 {
263 continue // only one JA claimant, nothing to rerank
264 }
265 enRec := db.Tree.GetRecord(enRI)
266 if enRec == nil {
267 continue
268 }
269 enForm := transdb.FormFromInline(enRec, db.StringPool)
270 if transdb.Detect(enForm) != transdb.LangEN {
271 continue
272 }
273 enCount++
274
275 // Find the JA record with the best priority score.
276 // Rank: corpus posterior (primary), register weirdness (tiebreaker).
277 // Posterior = JMdict_freq / log2(corpus_count + 1).
278 // High corpus evidence halves the effective frequency score per
279 // doubling of count, so a frequently observed pair beats a rare
280 // JMdict-priority word with no corpus confirmation.
281 // Lower combined rank = more preferred.
282 //
283 // EN.DataLen stores corpus co-occurrence count for Link[0] (inline
284 // records only; DataFile==0 means DataLen is unused otherwise).
285 rankOf := func(jaRI uint32) uint64 {
286 r := db.Tree.GetRecord(jaRI)
287 if r == nil {
288 return uint64(scoreUnranked)*1000 + 999
289 }
290 f := transdb.FormFromInline(r, db.StringPool)
291 freq := uint64(scoreUnranked)
292 if s, ok := scores[f]; ok {
293 freq = uint64(s)
294 }
295 weird := uint64(transdb.BranchWeirdness(r.Branch))
296
297 // Corpus posterior: JA.DataLen holds co-occurrence evidence count
298 // for this JA word across all EN partners (inline records only).
299 // Divide effective freq by log2(count+1)+1: each doubling of corpus
300 // evidence halves the effective score → corpus-confirmed words rank
301 // better regardless of JMdict frequency tier.
302 if r.DataFile == 0 {
303 count := uint64(r.DataLen)
304 if count > 0 {
305 doublings := uint64(0)
306 for c := count; c > 0; c >>= 1 {
307 doublings++
308 }
309 freq = freq / (doublings + 1)
310 if freq == 0 {
311 freq = 1
312 }
313 }
314 }
315 return freq*1000 + weird
316 }
317
318 bestRI := enRec.Link[0]
319 bestRank := rankOf(bestRI)
320
321 for _, jaRI := range jaRIs {
322 if r := rankOf(jaRI); r < bestRank {
323 bestRank = r
324 bestRI = jaRI
325 }
326 }
327
328 if bestRI != enRec.Link[0] {
329 // Promote bestRI to Link[0], demote old Link[0] to Link[1].
330 enRec.Link[1] = enRec.Link[0]
331 enRec.Link[0] = bestRI
332 swaps++
333 }
334 }
335
336 fmt.Fprintf(os.Stderr, "rerank: examined %d EN records with multiple JA claimants, %d swaps\n",
337 enCount, swaps)
338 return swaps
339 }
340