ingest_pattern.mx raw
1 package iskra
2
3 import (
4 "math"
5
6 "git.smesh.lol/iskradb/lattice"
7 )
8
// Default Gaussian σ for register-coordinate filtering in LookupAtomLink.
// σ_archaic is tighter than σ_discourse: archaism is a stronger semantic
// register mismatch than mere sentence-length difference. Tunable per call.
const (
	DefaultSigmaArchaic   = 64.0
	DefaultSigmaDiscourse = 128.0
	// diversityNearThreshold: the Gaussian factor below which a corpus
	// coord doesn't count toward the per-DstAtom diversity bonus.
	// Coords with Gaussian < 0.05 (i.e. ~20× muted) are too far to be
	// considered "supporting evidence" for a translation. Worked example
	// with the default σ values above, using exp(-(da²/σa² + dd²/σd²)):
	// bible (255,199) from query (0,0) gives ~1.1e-8; KFTT (5,171) from
	// (0,0) gives ~0.17 - KFTT counts as supporting evidence, bible does not.
	diversityNearThreshold = 0.05
)
23
24 // IngestPattern stores atoms and patterns from an extraction result.
25 // Returns the pattern recIdx for cross-domain linking.
26 func IngestPattern(t *Tree, domain uint8, ext ExtractResult) uint32 {
27 if len(ext.Pattern) == 0 {
28 return lattice.NullRec
29 }
30
31 // 1. Upsert the pattern record (Bgrammatical branch).
32 patKey := PatternKey(domain, ext.Pattern)
33 patRI := t.LookupRecIdx(lattice.Bgrammatical, patKey)
34 if patRI != lattice.NullRec {
35 t.metaInc(patRI)
36 } else {
37 var rec lattice.Record
38 t.setFormOnRec(&rec, string(ext.Pattern))
39 rec.Branch = uint8(lattice.Bgrammatical)
40 patRI = t.db.InsertRec(lattice.Bgrammatical, patKey, rec)
41 t.metaSet(patRI, MetaEntry{Count: 1, StageTag: domain})
42 }
43
44 // 2. Upsert each content slot as an atom (Bsemantic branch).
45 // Use lemma (from Set) as the atom key when available; fall back to surface form.
46 for i, word := range ext.Slots {
47 if word == "" {
48 continue
49 }
50 atomForm := word
51 if i < len(ext.Set) && ext.Set[i].Atom != "" {
52 atomForm = ext.Set[i].Atom
53 }
54 atomKey := AtomKey(domain, atomForm)
55 atomRI := t.LookupRecIdx(lattice.Bsemantic, atomKey)
56 if atomRI != lattice.NullRec {
57 t.metaInc(atomRI)
58 if i < len(ext.Roles) {
59 m := t.metaGet(atomRI)
60 if m != nil {
61 var h RoleHist
62 h.Decode(m.Extra)
63 h[ext.Roles[i]]++
64 h.Encode(&m.Extra)
65 if t.BulkMetaStore != nil {
66 t.BulkMetaStore.dirty[atomRI] = true
67 }
68 }
69 }
70 } else {
71 var rec lattice.Record
72 t.setFormOnRec(&rec, atomForm)
73 rec.Branch = uint8(lattice.Bsemantic)
74 atomRI = t.db.InsertRec(lattice.Bsemantic, atomKey, rec)
75 m := MetaEntry{Count: 1, StageTag: domain}
76 if i < len(ext.Roles) {
77 var h RoleHist
78 h[ext.Roles[i]] = 1
79 h.Encode(&m.Extra)
80 }
81 t.metaSet(atomRI, m)
82 }
83 }
84
85 return patRI
86 }
87
88 // IngestCrossDomain records a structural alignment between two patterns.
89 // Called when a JA sentence pattern corresponds to an EN sentence pattern.
90 func IngestCrossDomain(t *Tree, srcDomain, dstDomain uint8, srcPat, dstPat []byte) {
91 if len(srcPat) == 0 || len(dstPat) == 0 {
92 return
93 }
94 key := CrossPatternKey(srcDomain, dstDomain, srcPat, dstPat)
95 ri := t.LookupRecIdx(lattice.Bcooccur, key)
96 if ri != lattice.NullRec {
97 t.metaInc(ri)
98 return
99 }
100 form := string(srcPat) | "=" | string(dstPat)
101 var rec lattice.Record
102 t.setFormOnRec(&rec, form)
103 rec.Branch = uint8(lattice.Bcooccur)
104 ri = t.db.InsertRec(lattice.Bcooccur, key, rec)
105 t.metaSet(ri, MetaEntry{Count: 1, StageTag: srcDomain})
106 }
107
108 // IngestDeepPattern stores a canonical deep pattern and increments its count.
109 // Deep patterns are language-independent role sequences shared across domains.
110 func IngestDeepPattern(t *Tree, deepPat []byte) {
111 if len(deepPat) == 0 {
112 return
113 }
114 key := DeepPatternKey(deepPat)
115 ri := t.LookupRecIdx(lattice.Bgrammatical, key)
116 if ri != lattice.NullRec {
117 t.metaInc(ri)
118 return
119 }
120 var rec lattice.Record
121 t.setFormOnRec(&rec, string(deepPat))
122 rec.Branch = uint8(lattice.Bgrammatical)
123 ri = t.db.InsertRec(lattice.Bgrammatical, key, rec)
124 t.metaSet(ri, MetaEntry{Count: 1, StageTag: 0}) // domain 0 = cross-domain
125 }
126
// Atom-link generation marker. Stored in MetaEntry.StageTag's high bit-zone
// to distinguish:
//   GenLegacy (0)     - records written by the bilateral IngestAtomLink before
//                       the context-aware schema landed; role/context fields
//                       are empty/unknown. Used as a translation fallback.
//   GenContexted (1)  - records written by IngestContextedAtomLink with
//                       role and governing-context populated. The preferred
//                       lookup path.
//   GenDictionary (2) - dictionary-derived records. LookupAtomLink uses them
//                       to confirm corpus candidates and, only when no corpus
//                       tier yields a candidate, as a last-resort tier.
//                       NOTE(review): presumably written by
//                       IngestDictAtomLink - confirm against its body.
const (
	GenLegacy     uint8 = 0
	GenContexted  uint8 = 1
	GenDictionary uint8 = 2
)
140
// pronounPerson classifies a pronoun atom so ingestion can refuse
// cross-person pronoun links. Only lang 1 (EN table) and lang 2 (JA table)
// are recognised; matching is exact and case-sensitive.
//
// Return value:
//   1, 2, 3 - grammatical person of a personal pronoun
//   4       - inanimate class (EN "it"/"itself", JA demonstratives), kept
//             separate so it never links to human pronouns
//   0       - atom is not a known pronoun, or lang has no pronoun table
func pronounPerson(lang uint8, atom string) int32 {
	switch lang {
	case 1: // EN
		switch atom {
		case "it", "itself":
			return 4 // inanimate - only links to JA demonstratives, not human pronouns
		case "he", "him", "his", "himself",
			"she", "her", "herself",
			"they", "them", "their", "themselves":
			return 3
		case "you", "your", "yourself", "yourselves":
			return 2
		case "i", "me", "my", "myself", "we", "us", "our", "ourselves":
			return 1
		}
	case 2: // JA
		switch atom {
		case "\xe3\x81\x9d\xe3\x82\x8c", // それ
			"\xe3\x81\x93\xe3\x82\x8c", // これ
			"\xe3\x81\x82\xe3\x82\x8c": // あれ
			return 4 // inanimate demonstratives
		case "\xe5\xbd\xbc", // 彼
			"\xe5\xbd\xbc\xe5\xa5\xb3", // 彼女
			"\xe5\xbd\xbc\xe3\x82\x89": // 彼ら
			return 3
		case "\xe3\x81\x82\xe3\x81\xaa\xe3\x81\x9f", // あなた
			"\xe5\x90\x9b",             // 君
			"\xe3\x81\x8a\xe5\x89\x8d", // お前
			"\xe3\x81\x82\xe3\x82\x93\xe3\x81\x9f": // あんた
			return 2
		case "\xe7\xa7\x81", // 私
			"\xe5\x83\x95",                         // 僕
			"\xe4\xbf\xba",                         // 俺
			"\xe3\x82\x8f\xe3\x81\x97",             // わし
			"\xe8\x87\xaa\xe5\x88\x86",             // 自分
			"\xe7\xa7\x81\xe3\x81\x9f\xe3\x81\xa1", // 私たち
			"\xe6\x88\x91\xe3\x80\x85": // 我々
			return 1
		}
	}
	return 0
}
185
// isSingleKana reports whether s is exactly three bytes long with a first
// byte of 0xE3 and a second byte of 0x81, 0x82 or 0x83. For valid UTF-8
// that is a single code point in U+3040-U+30FF, i.e. one character from the
// Hiragana or Katakana block. The third byte is not inspected.
func isSingleKana(s string) bool {
	if len(s) != 3 || s[0] != 0xe3 {
		return false
	}
	switch s[1] {
	case 0x81, 0x82, 0x83:
		return true
	}
	return false
}
189
// isJunkJAAtom filters JA atoms that are lemmatizer artifacts.
// っ + a single hiragana (e.g. っう, っく, っる) are malformed godan stems
// produced when the lemmatizer over-strips a verb.
//
// The atom must be exactly six bytes: っ (E3 81 A3) followed by one
// three-byte character. Hiragana spans two UTF-8 lead-byte pairs:
//   E3 81 xx - U+3040-U+307F (ぁ … み); any third byte is accepted
//   E3 82 xx - U+3080-U+3096 (む … ゖ); the third byte is range-checked
//              because E3 82 A0 and above is katakana
// Earlier versions only tested the first pair, so stems ending in む, る
// and the other second-half kana slipped through.
func isJunkJAAtom(s string) bool {
	const smallTsu = "\xe3\x81\xa3" // っ
	if len(s) != 6 || s[:3] != smallTsu || s[3] != 0xe3 {
		return false
	}
	switch s[4] {
	case 0x81:
		return true
	case 0x82:
		return s[5] >= 0x80 && s[5] <= 0x96
	}
	return false
}
200
201 // AtomLinkKey constructs the lattice key for a context-aware atom-link
202 // record. Composite of (langA, langB, "X", roleA, gen, rArch, rDisc,
203 // atomA \0 contextA \0 atomB) hashed via SipHash.
204 //
205 // Register coordinate (rArch, rDisc) is in the key so the same atom pair
206 // from different-register corpora produces distinct records. This keeps
207 // scripture-derived associations from polluting modern-conversational
208 // lookups even when the atoms collide.
209 //
210 // Note: this is a point-lookup key. Prefix-scan queries are served by a
211 // sidecar index, not by key structure.
212 func AtomLinkKey(langA, langB, roleA, gen, rArch, rDisc uint8, atomA, contextA, atomB string) lattice.Key {
213 n := 7 + len(atomA) + 1 + len(contextA) + 1 + len(atomB)
214 buf := []byte{:n:n}
215 buf[0] = langA
216 buf[1] = langB
217 buf[2] = 'X'
218 buf[3] = roleA
219 buf[4] = gen
220 buf[5] = rArch
221 buf[6] = rDisc
222 off := 7
223 copy(buf[off:], []byte(atomA))
224 off += len(atomA)
225 buf[off] = 0x00
226 off++
227 copy(buf[off:], []byte(contextA))
228 off += len(contextA)
229 buf[off] = 0x00
230 off++
231 copy(buf[off:], []byte(atomB))
232 return lattice.HashKey(buf)
233 }
234
235 // IngestContextedAtomLink records a word-level cross-language link with
236 // role and governing-context tagging. ContextA is an atom from the same
237 // language as atomA (the immediate head when Head>=0, the clause's
238 // predicate atom when Head==-1, or empty string for the predicate itself).
239 // Same for contextB.
240 //
241 // Generation marker distinguishes legacy lossy-migrated records from
242 // proper context-aware records; the lookup function prefers GenContexted
243 // matches and falls back to GenLegacy.
244 func IngestContextedAtomLink(t *Tree,
245 langA, langB uint8,
246 atomA, contextA string, roleA int32,
247 atomB, contextB string, roleB int32,
248 rArch, rDisc uint8,
249 ) {
250 // Lemmatize per language at ingest time so inflected forms collapse.
251 if langA == 1 {
252 atomA = LemmatizeEN(atomA).Lemma
253 } else if langA == 2 {
254 atomA = LemmatizeJA(atomA, roleA == HistVerb).Lemma
255 }
256 if langB == 1 {
257 atomB = LemmatizeEN(atomB).Lemma
258 } else if langB == 2 {
259 atomB = LemmatizeJA(atomB, roleB == HistVerb).Lemma
260 }
261 if atomA == "" || atomB == "" {
262 return
263 }
264 if isSingleKana(atomA) || isSingleKana(atomB) {
265 return
266 }
267 if isJunkJAAtom(atomA) || isJunkJAAtom(atomB) {
268 return
269 }
270 // Person-concordance filter: don't link 1st-person pronouns to
271 // 2nd/3rd-person pronouns across languages. JA restructures
272 // predication (EN "I love you" -> JA "君が好きだ") so role-based
273 // alignment creates false cross-person pronoun links.
274 pA := pronounPerson(langA, atomA)
275 pB := pronounPerson(langB, atomB)
276 if pA > 0 && pB > 0 && pA != pB {
277 return
278 }
279 // Pronouns only link to pronouns. Prevents structural misalignment
280 // where EN "you" (SUBJECT) links to JA 物 (SUBJECT) because JA
281 // restructured the predication.
282 if pA > 0 && pB == 0 {
283 return
284 }
285 if pB > 0 && pA == 0 {
286 return
287 }
288
289 key := AtomLinkKey(langA, langB, uint8(roleA), GenContexted, rArch, rDisc,
290 atomA, contextA, atomB)
291 ri := t.LookupRecIdx(lattice.Bpragmatic, key)
292 if ri != lattice.NullRec {
293 t.metaInc(ri)
294 return
295 }
296 form := atomA | "|" | contextA | "|" | atomB | "|" | contextB
297 var rec lattice.Record
298 t.setFormOnRec(&rec, form)
299 rec.Branch = uint8(lattice.Bpragmatic)
300 ri = t.db.InsertRec(lattice.Bpragmatic, key, rec)
301 stageTag := langA | (GenContexted << 4)
302 t.metaSet(ri, MetaEntry{Count: 1, StageTag: stageTag})
303 // Extra layout for GenContexted records:
304 // Extra[0]: roleB
305 // Extra[1]: langB
306 // Extra[2]: R_archaic (corpus register coordinate)
307 // Extra[3]: R_discourse
308 // Extra[4]: roleA
309 m := t.metaGet(ri)
310 if m != nil {
311 m.Extra[0] = uint8(roleB)
312 m.Extra[1] = langB
313 m.Extra[2] = rArch
314 m.Extra[3] = rDisc
315 m.Extra[4] = uint8(roleA)
316 if t.BulkMetaStore != nil {
317 t.BulkMetaStore.dirty[ri] = true
318 }
319 }
320 }
321
// AtomLinkResult is the return type of LookupAtomLink. Carries the
// destination atom and provenance information for diagnostic visibility.
// The zero value means "no match".
type AtomLinkResult struct {
	DstAtom    string // destination atom of the picked index entry
	DstRole    int32  // RoleB of the picked entry
	DstContext string // ContextB of the picked entry
	Weight     uint32 // Weight of the picked entry (not the aggregated score)
	Generation uint8  // Gen of the picked entry: GenLegacy (0), GenContexted (1) or GenDictionary (2)
	Tier       uint8  // 1-based tier that produced the pick: 1-3 contexted, 4 legacy, 5 dictionary; 0 = no match
}
332
// Design notes for LookupAtomLink (the function itself is defined further
// down in this file). LookupAtomLink finds the best destination atom for
// (srcLang, srcAtom) in dstLang via the sidecar index. Each candidate
// record is scored by:
//
//   score = log(1 + weight) × diversity_near × exp(-(da²/σa² + dd²/σd²))
//
// Three components:
//
//   1. log(1 + weight) - logarithmic in observation count. Compresses the
//      differentiation between high-frequency records so a Tatoeba-
//      memorized wrong mapping with weight=50 (score ~3.93) doesn't
//      outvote a less-frequent correct one with weight=5 (score ~1.79)
//      by orders of magnitude. Bayesian intuition: the 50 observations
//      from one corpus are correlated, not independent; their information
//      content scales sub-linearly.
//
//   2. diversity_near - count of distinct corpus register-coordinates
//      among candidates with the same DstAtom, FILTERED to coords whose
//      Gaussian factor is at or above diversityNearThreshold. A DstAtom
//      backed by 3 corpora near the query has diversity_near=3; one
//      backed by Bible-only (far from a modern query) has near=0,
//      defaulting to 1. This prevents far-register records from padding
//      the diversity of an irrelevant DstAtom.
//
//   3. Gaussian distance - per-record register-axis filter. Records far
//      from the query coord get muted. Documented in the register coord
//      design.
//
// Net effect: corpus diversity outweighs raw count when the diversity
// is in-register. A Tatoeba-only correct mapping at weight=20 scores
// log(21)*1 = 3.04. A multi-corpus correct mapping at weight=5 each
// across 3 near corpora scores log(6)*3 = 5.38. The diverse one wins.
// A Tatoeba+Bible "diverse" but in-register-singular wrong mapping
// scores log(1+weight)*1 because Bible's coord is filtered out.
//
// Tier order (the first non-empty tier wins; within it, highest score wins):
//   Tier 1: GenContexted match with exact (srcContext, srcRole)
//   Tier 2: GenContexted match with same srcRole, any context
//   Tier 3: GenContexted match with any role, any context
//   Tier 4: GenLegacy bilateral fallback
//   Tier 5: GenDictionary entries, only when tiers 1-4 are all empty
//
// If sigmaArch or sigmaDisc is <= 0 the Gaussian factor is omitted.

// IngestStats tracks diagnostic counters for the trilateral scoring pipeline.
// LookupAtomLink increments them; they never influence the result.
type IngestStats struct {
	TriFired           int32 // lookups that reached triangulation with at least one viable candidate
	TriConfirmed       int32 // lookups where a checked candidate was confirmed via an intermediate language
	TriSwapped         int32 // lookups where only the second viable candidate was confirmed (and boosted)
	TriRescued         int32 // lookups whose single viable candidate was confirmed
	CtxSimFired        int32 // ranked candidates whose ContextB matched the translated source context
	CtxSimBoosted      int32 // not written by any code visible in this section
	DictConfirmFired   int32 // ranked candidates confirmed by a POS-compatible dictionary entry
	DictAuthorityFired int32 // lookups whose corpus-tier viable set held a dictionary-confirmed candidate
}
385
386 // dictPOSMatch returns true when a dictionary entry's POS-derived role
387 // (dictRole) is compatible with the query atom's contextual role (queryRole).
388 // POS-role mapping from dict-ingest: verb->3, adj/adv->4, noun/name->1, else->7.
389 func dictPOSMatch(dictRole, queryRole int32) bool {
390 switch dictRole {
391 case HistVerb:
392 return queryRole == HistVerb
393 case HistModifier:
394 return queryRole == HistModifier || queryRole == HistScope
395 case HistSubject:
396 return queryRole == HistSubject || queryRole == HistObject ||
397 queryRole == HistTopic || queryRole == HistComplement ||
398 queryRole == HistScope
399 case HistComplement:
400 return true
401 }
402 return true
403 }
404
405 func LookupAtomLink(idx *AtomIdx, srcLang, dstLang uint8,
406 srcAtom, srcContext string, srcRole int32,
407 qArch, qDisc uint8, sigmaArch, sigmaDisc float64,
408 stats *IngestStats,
409 ) AtomLinkResult {
410 if idx == nil {
411 return AtomLinkResult{}
412 }
413 candidates := idx.FindBySrc(srcLang, srcAtom)
414 if len(candidates) == 0 {
415 return AtomLinkResult{}
416 }
417
418 gauss := func(rArch, rDisc uint8) float64 {
419 if sigmaArch <= 0 || sigmaDisc <= 0 {
420 return 1.0
421 }
422 da := float64(int32(rArch) - int32(qArch))
423 dd := float64(int32(rDisc) - int32(qDisc))
424 exponent := (da*da)/(sigmaArch*sigmaArch) + (dd*dd)/(sigmaDisc*sigmaDisc)
425 return math.Exp(-exponent)
426 }
427
428 // First pass: per-DstAtom diversity_near. Count distinct corpus
429 // coords whose Gaussian factor is above diversityNearThreshold.
430 // Coords beyond the threshold are too register-distant to count as
431 // supporting evidence for a translation.
432 type coordSet map[uint16]bool
433 diversity := map[string]coordSet{}
434 for i := range candidates {
435 e := &candidates[i]
436 if e.DstLang != dstLang {
437 continue
438 }
439 if gauss(e.RArchaic, e.RDiscourse) < diversityNearThreshold {
440 continue
441 }
442 coord := uint16(e.RArchaic)<<8 | uint16(e.RDiscourse)
443 s := diversity[e.DstAtom]
444 if s == nil {
445 s = coordSet{}
446 diversity[e.DstAtom] = s
447 }
448 s[coord] = true
449 }
450
451 baseScore := func(e *AtomIdxEntry) float64 {
452 w := math.Log1p(float64(e.Weight))
453 div := float64(len(diversity[e.DstAtom]))
454 if div < 1 {
455 div = 1
456 }
457 return w * div * gauss(e.RArchaic, e.RDiscourse)
458 }
459
460 // Context-similarity: translate srcContext to dstLang once.
461 // When a tier-2 candidate's ContextB matches, the candidate was
462 // observed with the same governing head (translated) as the query -
463 // strong polysemy disambiguation signal.
464 ctxTranslation := ""
465 if srcContext != "" {
466 ctxCands := idx.FindBySrc(srcLang, srcContext)
467 var bestCtxW float64
468 for j := range ctxCands {
469 cc := &ctxCands[j]
470 if cc.DstLang != dstLang {
471 continue
472 }
473 w := float64(cc.Weight) * gauss(cc.RArchaic, cc.RDiscourse)
474 if w > bestCtxW {
475 bestCtxW = w
476 ctxTranslation = cc.DstAtom
477 }
478 }
479 }
480
481 // Bilateral consistency: only check the top candidate per tier to
482 // avoid O(K*M) backward lookups on polysemous atoms.
483 biCheck := func(dstAtom string) float64 {
484 backCands := idx.FindBySrc(dstLang, dstAtom)
485 var best, src float64
486 for j := range backCands {
487 bc := &backCands[j]
488 if bc.DstLang != srcLang {
489 continue
490 }
491 w := float64(bc.Weight)
492 if w > best {
493 best = w
494 }
495 if bc.DstAtom == srcAtom && w > src {
496 src = w
497 }
498 }
499 if best <= 0 {
500 return 1.0
501 }
502 return (src + 1) / (best + 1)
503 }
504
505 // Per-DstAtom aggregation: sum baseScore across all entries for the
506 // same destination atom within each tier. Polysemous atoms observed
507 // in many contexts accumulate evidence.
508 type atomAgg struct {
509 bestEntry *AtomIdxEntry
510 aggScore float64
511 tier int32
512 }
513 tierAtoms := map[string]*atomAgg{}
514
515 for i := range candidates {
516 e := &candidates[i]
517 if e.DstLang != dstLang {
518 continue
519 }
520 s := baseScore(e)
521 ti := -1
522 if e.Gen == GenContexted {
523 if e.ContextA == srcContext && int32(e.RoleA) == srcRole {
524 ti = 0
525 } else if int32(e.RoleA) == srcRole {
526 ti = 1
527 } else {
528 ti = 2
529 }
530 } else if e.Gen == GenLegacy {
531 ti = 3
532 } else if e.Gen == GenDictionary {
533 ti = 4
534 }
535 if ti < 0 {
536 continue
537 }
538 key := string([]byte{byte(ti), ':'}) | e.DstAtom
539 a := tierAtoms[key]
540 if a == nil {
541 a = &atomAgg{tier: ti}
542 tierAtoms[key] = a
543 }
544 a.aggScore += s
545 if a.bestEntry == nil || s > baseScore(a.bestEntry) {
546 a.bestEntry = e
547 }
548 }
549
550 // Collect top-N per tier for bilateral scoring.
551 const topN = 8
552 type ranked struct {
553 entry *AtomIdxEntry
554 aggScore float64
555 dstAtom string
556 }
557 var top [5][topN]ranked
558 for _, a := range tierAtoms {
559 ti := a.tier
560 s := a.aggScore
561 slot := -1
562 for k := 0; k < topN; k++ {
563 if top[ti][k].entry == nil {
564 slot = k
565 break
566 }
567 if s > top[ti][k].aggScore {
568 slot = k
569 break
570 }
571 }
572 if slot < 0 {
573 continue
574 }
575 for k := topN - 1; k > slot; k-- {
576 top[ti][k] = top[ti][k-1]
577 }
578 top[ti][slot] = ranked{entry: a.bestEntry, aggScore: s, dstAtom: a.bestEntry.DstAtom}
579 }
580
581 // triConfirm checks whether srcAtom->dstAtom is confirmed by a
582 // 2-hop path through an intermediate language.
583 intermediateLangs := [2]uint8{0x03, 0x04} // KO, ZH
584 triConfirm := func(dstAtom string) int32 {
585 confirms := 0
586 for _, mid := range intermediateLangs {
587 if mid == srcLang || mid == dstLang {
588 continue
589 }
590 srcMid := topAtomVia(idx, srcLang, mid, srcAtom)
591 if srcMid == "" {
592 continue
593 }
594 dstMid := topAtomVia(idx, dstLang, mid, dstAtom)
595 if dstMid == "" {
596 continue
597 }
598 if srcMid == dstMid {
599 confirms++
600 }
601 }
602 return confirms
603 }
604
605 ctxMatch := func(e *AtomIdxEntry) bool {
606 return ctxTranslation != "" && e.ContextB == ctxTranslation
607 }
608
609 dictConfirm := func(dstAtom string) bool {
610 for i := range candidates {
611 e := &candidates[i]
612 if e.Gen != GenDictionary || e.DstLang != dstLang {
613 continue
614 }
615 if !dictPOSMatch(int32(e.RoleA), srcRole) {
616 continue
617 }
618 if e.DstAtom == dstAtom {
619 return true
620 }
621 // Fuzzy: corpus "search" matches dict "search for",
622 // or corpus "carry out" matches dict "carry".
623 if dstLang == 1 {
624 da := e.DstAtom
625 if len(dstAtom) < len(da) && da[:len(dstAtom)] == dstAtom && da[len(dstAtom)] == ' ' {
626 return true
627 }
628 if len(da) < len(dstAtom) && dstAtom[:len(da)] == da && dstAtom[len(da)] == ' ' {
629 return true
630 }
631 }
632 }
633 return false
634 }
635
636 // Combined scoring: bilateral ratio modulates aggregate score.
637 // ctx-sim, triangulation, and dictionary confirmation are bonuses.
638 type scored struct {
639 entry *AtomIdxEntry
640 combined float64
641 tier int32
642 dictOK bool
643 }
644 var viable []scored
645
646 for ti := 0; ti < 4; ti++ {
647 topAgg := 0.0
648 if top[ti][0].entry != nil {
649 topAgg = top[ti][0].aggScore
650 }
651 for k := 0; k < topN; k++ {
652 r := &top[ti][k]
653 if r.entry == nil {
654 continue
655 }
656 if topAgg > 0 && r.aggScore < topAgg*0.1 {
657 continue
658 }
659 bi := biCheck(r.dstAtom)
660 combined := r.aggScore * (bi + 0.05)
661 if ctxMatch(r.entry) {
662 combined *= 1.5
663 stats.CtxSimFired++
664 }
665 dc := dictConfirm(r.dstAtom)
666 if dc {
667 combined *= 1.6
668 stats.DictConfirmFired++
669 }
670 viable = append(viable, scored{entry: r.entry, combined: combined, tier: ti, dictOK: dc})
671 }
672 if len(viable) > 0 {
673 break
674 }
675 }
676 // Dictionary authority counter: track when dict-confirmed candidates
677 // exist in the viable set (for diagnostics).
678 for _, v := range viable {
679 if v.dictOK {
680 stats.DictAuthorityFired++
681 break
682 }
683 }
684 // Tier-4 (dictionary) only as last resort when corpus tiers empty.
685 // Dict entries are pre-validated translations, so use a biRatio floor
686 // to prevent polysemous back-indexes from over-penalizing common words.
687 if len(viable) == 0 {
688 for k := 0; k < topN; k++ {
689 r := &top[4][k]
690 if r.entry == nil {
691 continue
692 }
693 bi := biCheck(r.dstAtom)
694 if bi < 0.25 {
695 bi = 0.25
696 }
697 combined := r.aggScore * (bi + 0.05)
698 dc := dictConfirm(r.dstAtom)
699 if dc {
700 combined *= 1.6
701 stats.DictConfirmFired++
702 }
703 viable = append(viable, scored{entry: r.entry, combined: combined, tier: 4, dictOK: dc})
704 }
705 }
706
707 // Triangulation bonus on top-2 viable candidates.
708 if len(viable) >= 2 {
709 stats.TriFired++
710 tc0 := triConfirm(viable[0].entry.DstAtom)
711 tc1 := triConfirm(viable[1].entry.DstAtom)
712 if tc0 > 0 || tc1 > 0 {
713 stats.TriConfirmed++
714 }
715 if tc1 > 0 && tc0 == 0 {
716 viable[1].combined *= 1.3
717 stats.TriSwapped++
718 } else if tc0 > 0 && tc1 == 0 {
719 viable[0].combined *= 1.3
720 }
721 } else if len(viable) == 1 {
722 stats.TriFired++
723 tc := triConfirm(viable[0].entry.DstAtom)
724 if tc > 0 {
725 stats.TriConfirmed++
726 stats.TriRescued++
727 }
728 }
729
730 // Pick highest combined score.
731 var pick *AtomIdxEntry
732 tier := uint8(0)
733 bestCombined := 0.0
734 for _, v := range viable {
735 if v.combined > bestCombined {
736 bestCombined = v.combined
737 pick = v.entry
738 tier = uint8(v.tier + 1)
739 }
740 }
741 if pick == nil {
742 return AtomLinkResult{}
743 }
744 return AtomLinkResult{
745 DstAtom: pick.DstAtom,
746 DstRole: int32(pick.RoleB),
747 DstContext: pick.ContextB,
748 Weight: pick.Weight,
749 Generation: pick.Gen,
750 Tier: tier,
751 }
752 }
753
// DiagCandidate holds scoring details for one candidate in the ranked list
// returned by LookupAtomLinkDiag.
type DiagCandidate struct {
	DstAtom  string  // destination atom of the candidate
	AggScore float64 // base scores summed over all entries for this atom within its tier
	BiRatio  float64 // bilateral back-translation ratio used for Combined
	Combined float64 // AggScore × (BiRatio + 0.05) × bonuses, captured before the triangulation bonus
	CtxSim   bool    // true when the context-similarity bonus (×1.5) was applied
	Tier     int32   // zero-based tier index (0-4), i.e. AtomLinkResult.Tier minus one
}
763
764 // LookupAtomLinkDiag is LookupAtomLink with full candidate diagnostics.
765 func LookupAtomLinkDiag(idx *AtomIdx, srcLang, dstLang uint8,
766 srcAtom, srcContext string, srcRole int32,
767 qArch, qDisc uint8, sigmaArch, sigmaDisc float64,
768 ) (AtomLinkResult, []DiagCandidate) {
769 if idx == nil {
770 return AtomLinkResult{}, nil
771 }
772 candidates := idx.FindBySrc(srcLang, srcAtom)
773 if len(candidates) == 0 {
774 return AtomLinkResult{}, nil
775 }
776
777 gauss := func(rArch, rDisc uint8) float64 {
778 if sigmaArch <= 0 || sigmaDisc <= 0 {
779 return 1.0
780 }
781 da := float64(int32(rArch) - int32(qArch))
782 dd := float64(int32(rDisc) - int32(qDisc))
783 exponent := (da*da)/(sigmaArch*sigmaArch) + (dd*dd)/(sigmaDisc*sigmaDisc)
784 return math.Exp(-exponent)
785 }
786
787 type coordSet map[uint16]bool
788 diversity := map[string]coordSet{}
789 for i := range candidates {
790 e := &candidates[i]
791 if e.DstLang != dstLang {
792 continue
793 }
794 if gauss(e.RArchaic, e.RDiscourse) < diversityNearThreshold {
795 continue
796 }
797 coord := uint16(e.RArchaic)<<8 | uint16(e.RDiscourse)
798 s := diversity[e.DstAtom]
799 if s == nil {
800 s = coordSet{}
801 diversity[e.DstAtom] = s
802 }
803 s[coord] = true
804 }
805
806 baseScore := func(e *AtomIdxEntry) float64 {
807 w := math.Log1p(float64(e.Weight))
808 div := float64(len(diversity[e.DstAtom]))
809 if div < 1 {
810 div = 1
811 }
812 return w * div * gauss(e.RArchaic, e.RDiscourse)
813 }
814
815 ctxTranslation := ""
816 if srcContext != "" {
817 ctxCands := idx.FindBySrc(srcLang, srcContext)
818 var bestCtxW float64
819 for j := range ctxCands {
820 cc := &ctxCands[j]
821 if cc.DstLang != dstLang {
822 continue
823 }
824 w := float64(cc.Weight) * gauss(cc.RArchaic, cc.RDiscourse)
825 if w > bestCtxW {
826 bestCtxW = w
827 ctxTranslation = cc.DstAtom
828 }
829 }
830 }
831
832 biCheck := func(dstAtom string) float64 {
833 backCands := idx.FindBySrc(dstLang, dstAtom)
834 var best, src float64
835 for j := range backCands {
836 bc := &backCands[j]
837 if bc.DstLang != srcLang {
838 continue
839 }
840 w := float64(bc.Weight)
841 if w > best {
842 best = w
843 }
844 if bc.DstAtom == srcAtom && w > src {
845 src = w
846 }
847 }
848 if best <= 0 {
849 return 1.0
850 }
851 return (src + 1) / (best + 1)
852 }
853
854 type atomAgg struct {
855 bestEntry *AtomIdxEntry
856 aggScore float64
857 tier int32
858 }
859 tierAtoms := map[string]*atomAgg{}
860 for i := range candidates {
861 e := &candidates[i]
862 if e.DstLang != dstLang {
863 continue
864 }
865 s := baseScore(e)
866 ti := -1
867 if e.Gen == GenContexted {
868 if e.ContextA == srcContext && int32(e.RoleA) == srcRole {
869 ti = 0
870 } else if int32(e.RoleA) == srcRole {
871 ti = 1
872 } else {
873 ti = 2
874 }
875 } else if e.Gen == GenLegacy {
876 ti = 3
877 } else if e.Gen == GenDictionary {
878 ti = 4
879 }
880 if ti < 0 {
881 continue
882 }
883 key := string([]byte{byte(ti), ':'}) | e.DstAtom
884 a := tierAtoms[key]
885 if a == nil {
886 a = &atomAgg{tier: ti}
887 tierAtoms[key] = a
888 }
889 a.aggScore += s
890 if a.bestEntry == nil || s > baseScore(a.bestEntry) {
891 a.bestEntry = e
892 }
893 }
894
895 type ranked struct {
896 entry *AtomIdxEntry
897 aggScore float64
898 dstAtom string
899 }
900 var top [5][4]ranked
901 for _, a := range tierAtoms {
902 ti := a.tier
903 s := a.aggScore
904 slot := -1
905 for k := 0; k < 4; k++ {
906 if top[ti][k].entry == nil {
907 slot = k
908 break
909 }
910 if s > top[ti][k].aggScore {
911 slot = k
912 break
913 }
914 }
915 if slot < 0 {
916 continue
917 }
918 for k := 3; k > slot; k-- {
919 top[ti][k] = top[ti][k-1]
920 }
921 top[ti][slot] = ranked{entry: a.bestEntry, aggScore: s, dstAtom: a.bestEntry.DstAtom}
922 }
923
924 ctxMatch := func(e *AtomIdxEntry) bool {
925 return ctxTranslation != "" && e.ContextB == ctxTranslation
926 }
927
928 diagDictConfirm := func(dstAtom string) bool {
929 for i := range candidates {
930 e := &candidates[i]
931 if e.Gen != GenDictionary || e.DstLang != dstLang {
932 continue
933 }
934 if !dictPOSMatch(int32(e.RoleA), srcRole) {
935 continue
936 }
937 if e.DstAtom == dstAtom {
938 return true
939 }
940 if dstLang == 1 {
941 da := e.DstAtom
942 if len(dstAtom) < len(da) && da[:len(dstAtom)] == dstAtom && da[len(dstAtom)] == ' ' {
943 return true
944 }
945 if len(da) < len(dstAtom) && dstAtom[:len(da)] == da && dstAtom[len(da)] == ' ' {
946 return true
947 }
948 }
949 }
950 return false
951 }
952
953 type scoredD struct {
954 entry *AtomIdxEntry
955 combined float64
956 aggScore float64
957 biRatio float64
958 ctxSim bool
959 tier int32
960 }
961 var viable []scoredD
962
963 for ti := 0; ti < 4; ti++ {
964 for k := 0; k < 4; k++ {
965 r := &top[ti][k]
966 if r.entry == nil {
967 continue
968 }
969 bi := biCheck(r.dstAtom)
970 combined := r.aggScore * (bi + 0.05)
971 cm := ctxMatch(r.entry)
972 if cm {
973 combined *= 1.5
974 }
975 if diagDictConfirm(r.dstAtom) {
976 combined *= 1.6
977 }
978 viable = append(viable, scoredD{
979 entry: r.entry, combined: combined,
980 aggScore: r.aggScore, biRatio: bi, ctxSim: cm, tier: ti,
981 })
982 }
983 if len(viable) > 0 {
984 break
985 }
986 }
987 if len(viable) == 0 {
988 for k := 0; k < 4; k++ {
989 r := &top[4][k]
990 if r.entry == nil {
991 continue
992 }
993 bi := biCheck(r.dstAtom)
994 combined := r.aggScore * (bi + 0.05)
995 viable = append(viable, scoredD{
996 entry: r.entry, combined: combined,
997 aggScore: r.aggScore, biRatio: bi, ctxSim: false, tier: 4,
998 })
999 }
1000 }
1001
1002 var diag []DiagCandidate
1003 for _, v := range viable {
1004 diag = append(diag, DiagCandidate{
1005 DstAtom: v.entry.DstAtom,
1006 AggScore: v.aggScore,
1007 BiRatio: v.biRatio,
1008 Combined: v.combined,
1009 CtxSim: v.ctxSim,
1010 Tier: v.tier,
1011 })
1012 }
1013
1014 if len(viable) >= 2 {
1015 tc0 := triConfirmStatic(idx, srcLang, dstLang, srcAtom, viable[0].entry.DstAtom)
1016 tc1 := triConfirmStatic(idx, srcLang, dstLang, srcAtom, viable[1].entry.DstAtom)
1017 if tc1 > 0 && tc0 == 0 {
1018 viable[1].combined *= 1.3
1019 } else if tc0 > 0 && tc1 == 0 {
1020 viable[0].combined *= 1.3
1021 }
1022 }
1023
1024 var pick *AtomIdxEntry
1025 tier := uint8(0)
1026 bestCombined := 0.0
1027 for _, v := range viable {
1028 if v.combined > bestCombined {
1029 bestCombined = v.combined
1030 pick = v.entry
1031 tier = uint8(v.tier + 1)
1032 }
1033 }
1034
1035 if pick == nil {
1036 return AtomLinkResult{}, diag
1037 }
1038 return AtomLinkResult{
1039 DstAtom: pick.DstAtom,
1040 DstRole: int32(pick.RoleB),
1041 DstContext: pick.ContextB,
1042 Weight: pick.Weight,
1043 Generation: pick.Gen,
1044 Tier: tier,
1045 }, diag
1046 }
1047
1048 // triConfirmStatic is a non-counter-incrementing version for diagnostics.
1049 func triConfirmStatic(idx *AtomIdx, srcLang, dstLang uint8, srcAtom, dstAtom string) int32 {
1050 intermediateLangs := [2]uint8{0x03, 0x04}
1051 confirms := 0
1052 for _, mid := range intermediateLangs {
1053 if mid == srcLang || mid == dstLang {
1054 continue
1055 }
1056 srcMid := topAtomVia(idx, srcLang, mid, srcAtom)
1057 if srcMid == "" {
1058 continue
1059 }
1060 dstMid := topAtomVia(idx, dstLang, mid, dstAtom)
1061 if dstMid == "" {
1062 continue
1063 }
1064 if srcMid == dstMid {
1065 confirms++
1066 }
1067 }
1068 return confirms
1069 }
1070
1071 // topAtomVia returns the highest-weight DstAtom for srcLang->dstLang
1072 // without full scoring. Used by triangulation to get a quick "what does
1073 // this atom translate to via language M?" answer.
1074 func topAtomVia(idx *AtomIdx, srcLang, dstLang uint8, srcAtom string) string {
1075 cands := idx.FindBySrc(srcLang, srcAtom)
1076 var bestAtom string
1077 var bestScore float64
1078 for i := range cands {
1079 e := &cands[i]
1080 if e.DstLang != dstLang {
1081 continue
1082 }
1083 da := float64(e.RArchaic)
1084 dd := float64(e.RDiscourse)
1085 g := math.Exp(-(da*da)/(DefaultSigmaArchaic*DefaultSigmaArchaic) - (dd*dd)/(DefaultSigmaDiscourse*DefaultSigmaDiscourse))
1086 s := float64(e.Weight) * g
1087 if s > bestScore {
1088 bestScore = s
1089 bestAtom = e.DstAtom
1090 }
1091 }
1092 return bestAtom
1093 }
1094
1095 // IngestAtomLink records a word-level cross-domain correspondence.
1096 // Words are lemmatized before storage so inflected forms collapse to region centers.
1097 // srcRole/dstRole hint whether the word is a verb (needed for JA lemmatization).
1098 //
1099 // This is the legacy bilateral function (GenLegacy generation). It remains
1100 // in place for backward compatibility and as the lookup fallback for atoms
1101 // that have no GenContexted records yet. New ingest paths should call
1102 // IngestContextedAtomLink instead.
1103 func IngestAtomLink(t *Tree, srcDomain, dstDomain uint8, srcWord, dstWord string, srcRole, dstRole int32, rArch, rDisc uint8) {
1104 srcAtom := srcWord
1105 dstAtom := dstWord
1106 if srcDomain == 1 {
1107 srcAtom = LemmatizeEN(srcWord).Lemma
1108 } else if srcDomain == 2 {
1109 srcAtom = LemmatizeJA(srcWord, srcRole == HistVerb).Lemma
1110 }
1111 if dstDomain == 1 {
1112 dstAtom = LemmatizeEN(dstWord).Lemma
1113 } else if dstDomain == 2 {
1114 dstAtom = LemmatizeJA(dstWord, dstRole == HistVerb).Lemma
1115 }
1116 if isSingleKana(srcAtom) || isSingleKana(dstAtom) {
1117 return
1118 }
1119 if isJunkJAAtom(srcAtom) || isJunkJAAtom(dstAtom) {
1120 return
1121 }
1122
1123 buf := []byte{:5 + len(srcAtom) + len(dstAtom):5 + len(srcAtom) + len(dstAtom)}
1124 buf[0] = srcDomain
1125 buf[1] = dstDomain
1126 buf[2] = 'L'
1127 buf[3] = rArch
1128 buf[4] = rDisc
1129 copy(buf[5:], []byte(srcAtom))
1130 copy(buf[5+len(srcAtom):], []byte(dstAtom))
1131 key := lattice.HashKey(buf)
1132
1133 ri := t.LookupRecIdx(lattice.Bpragmatic, key)
1134 if ri != lattice.NullRec {
1135 t.metaInc(ri)
1136 return
1137 }
1138 form := srcAtom | "=" | dstAtom
1139 var rec lattice.Record
1140 t.setFormOnRec(&rec, form)
1141 rec.Branch = uint8(lattice.Bpragmatic)
1142 ri = t.db.InsertRec(lattice.Bpragmatic, key, rec)
1143 t.metaSet(ri, MetaEntry{Count: 1, StageTag: srcDomain})
1144 // GenLegacy records also carry the corpus register coord so the sidecar
1145 // reader can apply distance weighting to legacy candidates too.
1146 m := t.metaGet(ri)
1147 if m != nil {
1148 m.Extra[2] = rArch
1149 m.Extra[3] = rDisc
1150 if t.BulkMetaStore != nil {
1151 t.BulkMetaStore.dirty[ri] = true
1152 }
1153 }
1154 }
1155
1156 func IngestDictAtomLink(t *Tree, srcDomain, dstDomain uint8, srcWord, dstWord string, srcRole, dstRole int32) {
1157 srcAtom := srcWord
1158 dstAtom := dstWord
1159 if srcDomain == 1 {
1160 srcAtom = LemmatizeEN(srcWord).Lemma
1161 } else if srcDomain == 2 {
1162 srcAtom = LemmatizeJA(srcWord, srcRole == HistVerb).Lemma
1163 }
1164 if dstDomain == 1 {
1165 dstAtom = LemmatizeEN(dstWord).Lemma
1166 } else if dstDomain == 2 {
1167 dstAtom = LemmatizeJA(dstWord, dstRole == HistVerb).Lemma
1168 }
1169 ingestDictAtomLinkInner(t, srcDomain, dstDomain, srcAtom, dstAtom, srcRole, dstRole)
1170 }
1171
1172 func IngestDictAtomLinkRaw(t *Tree, srcDomain, dstDomain uint8, srcWord, dstWord string, srcRole, dstRole int32) {
1173 ingestDictAtomLinkInner(t, srcDomain, dstDomain, srcWord, dstWord, srcRole, dstRole)
1174 }
1175
1176 func ingestDictAtomLinkInner(t *Tree, srcDomain, dstDomain uint8, srcAtom, dstAtom string, srcRole, dstRole int32) {
1177 if srcAtom == "" || dstAtom == "" {
1178 return
1179 }
1180 if isSingleKana(srcAtom) || isSingleKana(dstAtom) {
1181 return
1182 }
1183 if isJunkJAAtom(srcAtom) || isJunkJAAtom(dstAtom) {
1184 return
1185 }
1186
1187 buf := []byte{:5 + len(srcAtom) + len(dstAtom):5 + len(srcAtom) + len(dstAtom)}
1188 buf[0] = srcDomain
1189 buf[1] = dstDomain
1190 buf[2] = 'D'
1191 buf[3] = uint8(srcRole)
1192 buf[4] = uint8(dstRole)
1193 copy(buf[5:], []byte(srcAtom))
1194 copy(buf[5+len(srcAtom):], []byte(dstAtom))
1195 key := lattice.HashKey(buf)
1196
1197 ri := t.LookupRecIdx(lattice.Bpragmatic, key)
1198 if ri != lattice.NullRec {
1199 t.metaInc(ri)
1200 return
1201 }
1202 form := srcAtom | "=" | dstAtom
1203 var rec lattice.Record
1204 t.setFormOnRec(&rec, form)
1205 rec.Branch = uint8(lattice.Bpragmatic)
1206 ri = t.db.InsertRec(lattice.Bpragmatic, key, rec)
1207 stageTag := srcDomain | (GenDictionary << 4)
1208 t.metaSet(ri, MetaEntry{Count: 1, StageTag: stageTag})
1209 m := t.metaGet(ri)
1210 if m != nil {
1211 m.Extra[0] = uint8(dstRole)
1212 m.Extra[1] = dstDomain
1213 m.Extra[4] = uint8(srcRole)
1214 if t.BulkMetaStore != nil {
1215 t.BulkMetaStore.dirty[ri] = true
1216 }
1217 }
1218 }
1219