package iskra import ( "math" "git.smesh.lol/iskradb/lattice" ) // Default Gaussian σ for register-coordinate filtering in LookupAtomLink. // σ_archaic is tighter than σ_discourse: archaism is a stronger semantic // register mismatch than mere sentence-length difference. Tunable per call. const ( DefaultSigmaArchaic = 64.0 DefaultSigmaDiscourse = 128.0 // diversityNearThreshold: the Gaussian factor below which a corpus // coord doesn't count toward the per-DstAtom diversity bonus. // Coords with Gaussian < 0.05 (i.e. ~20× muted) are too far to be // considered "supporting evidence" for a translation. Empirically: // bible (255,199) from query (0,0) gives ~1.4e-8; KFTT (5,171) from // (0,0) gives ~0.17 - KFTT counts as supporting evidence, bible does not. diversityNearThreshold = 0.05 ) // IngestPattern stores atoms and patterns from an extraction result. // Returns the pattern recIdx for cross-domain linking. func IngestPattern(t *Tree, domain uint8, ext ExtractResult) uint32 { if len(ext.Pattern) == 0 { return lattice.NullRec } // 1. Upsert the pattern record (Bgrammatical branch). patKey := PatternKey(domain, ext.Pattern) patRI := t.LookupRecIdx(lattice.Bgrammatical, patKey) if patRI != lattice.NullRec { t.metaInc(patRI) } else { var rec lattice.Record t.setFormOnRec(&rec, string(ext.Pattern)) rec.Branch = uint8(lattice.Bgrammatical) patRI = t.db.InsertRec(lattice.Bgrammatical, patKey, rec) t.metaSet(patRI, MetaEntry{Count: 1, StageTag: domain}) } // 2. Upsert each content slot as an atom (Bsemantic branch). // Use lemma (from Set) as the atom key when available; fall back to surface form. 
for i, word := range ext.Slots { if word == "" { continue } atomForm := word if i < len(ext.Set) && ext.Set[i].Atom != "" { atomForm = ext.Set[i].Atom } atomKey := AtomKey(domain, atomForm) atomRI := t.LookupRecIdx(lattice.Bsemantic, atomKey) if atomRI != lattice.NullRec { t.metaInc(atomRI) if i < len(ext.Roles) { m := t.metaGet(atomRI) if m != nil { var h RoleHist h.Decode(m.Extra) h[ext.Roles[i]]++ h.Encode(&m.Extra) if t.BulkMetaStore != nil { t.BulkMetaStore.dirty[atomRI] = true } } } } else { var rec lattice.Record t.setFormOnRec(&rec, atomForm) rec.Branch = uint8(lattice.Bsemantic) atomRI = t.db.InsertRec(lattice.Bsemantic, atomKey, rec) m := MetaEntry{Count: 1, StageTag: domain} if i < len(ext.Roles) { var h RoleHist h[ext.Roles[i]] = 1 h.Encode(&m.Extra) } t.metaSet(atomRI, m) } } return patRI } // IngestCrossDomain records a structural alignment between two patterns. // Called when a JA sentence pattern corresponds to an EN sentence pattern. func IngestCrossDomain(t *Tree, srcDomain, dstDomain uint8, srcPat, dstPat []byte) { if len(srcPat) == 0 || len(dstPat) == 0 { return } key := CrossPatternKey(srcDomain, dstDomain, srcPat, dstPat) ri := t.LookupRecIdx(lattice.Bcooccur, key) if ri != lattice.NullRec { t.metaInc(ri) return } form := string(srcPat) | "=" | string(dstPat) var rec lattice.Record t.setFormOnRec(&rec, form) rec.Branch = uint8(lattice.Bcooccur) ri = t.db.InsertRec(lattice.Bcooccur, key, rec) t.metaSet(ri, MetaEntry{Count: 1, StageTag: srcDomain}) } // IngestDeepPattern stores a canonical deep pattern and increments its count. // Deep patterns are language-independent role sequences shared across domains. 
func IngestDeepPattern(t *Tree, deepPat []byte) { if len(deepPat) == 0 { return } key := DeepPatternKey(deepPat) ri := t.LookupRecIdx(lattice.Bgrammatical, key) if ri != lattice.NullRec { t.metaInc(ri) return } var rec lattice.Record t.setFormOnRec(&rec, string(deepPat)) rec.Branch = uint8(lattice.Bgrammatical) ri = t.db.InsertRec(lattice.Bgrammatical, key, rec) t.metaSet(ri, MetaEntry{Count: 1, StageTag: 0}) // domain 0 = cross-domain } // Atom-link generation marker. Stored in MetaEntry.StageTag's high bit-zone // to distinguish: // GenLegacy (0) - records written by the bilateral IngestAtomLink before // the context-aware schema landed; role/context fields // are empty/unknown. Used as a translation fallback. // GenContexted (1) - records written by IngestContextedAtomLink with // role and governing-context populated. The preferred // lookup path. const ( GenLegacy uint8 = 0 GenContexted uint8 = 1 GenDictionary uint8 = 2 ) // pronounPerson returns the grammatical person (1, 2, 3) of a pronoun // atom, or 0 if the atom is not a known pronoun. Used to prevent // cross-person pronoun links during ingestion. 
func pronounPerson(lang uint8, atom string) int32 {
	// lang 1 is matched against English pronouns, lang 2 against Japanese
	// ones; any other language (or any unlisted atom) yields 0.
	if lang == 1 {
		switch atom {
		case "i", "me", "my", "myself", "we", "us", "our", "ourselves":
			return 1
		case "you", "your", "yourself", "yourselves":
			return 2
		case "he", "him", "his", "himself", "she", "her", "herself", "they", "them", "their", "themselves":
			return 3
		case "it", "itself":
			return 4 // inanimate - only links to JA demonstratives, not human pronouns
		}
	} else if lang == 2 {
		switch atom {
		case "\xe7\xa7\x81", // 私
			"\xe5\x83\x95",             // 僕
			"\xe4\xbf\xba",             // 俺
			"\xe3\x82\x8f\xe3\x81\x97", // わし
			"\xe8\x87\xaa\xe5\x88\x86", // 自分
			"\xe7\xa7\x81\xe3\x81\x9f\xe3\x81\xa1", // 私たち
			"\xe6\x88\x91\xe3\x80\x85": // 我々
			return 1
		case "\xe3\x81\x82\xe3\x81\xaa\xe3\x81\x9f", // あなた
			"\xe5\x90\x9b",             // 君
			"\xe3\x81\x8a\xe5\x89\x8d", // お前
			"\xe3\x81\x82\xe3\x82\x93\xe3\x81\x9f": // あんた
			return 2
		case "\xe5\xbd\xbc", // 彼
			"\xe5\xbd\xbc\xe5\xa5\xb3", // 彼女
			"\xe5\xbd\xbc\xe3\x82\x89": // 彼ら
			return 3
		case "\xe3\x81\x9d\xe3\x82\x8c", // それ
			"\xe3\x81\x93\xe3\x82\x8c", // これ
			"\xe3\x81\x82\xe3\x82\x8c": // あれ
			return 4 // inanimate demonstratives
		}
	}
	return 0
}

// isSingleKana reports whether s is exactly one 3-byte UTF-8 sequence whose
// lead bytes are E3 81, E3 82 or E3 83, i.e. a single code point in
// U+3040..U+30FF (the Hiragana and Katakana blocks).
func isSingleKana(s string) bool {
	return len(s) == 3 && s[0] == 0xe3 && (s[1] == 0x81 || s[1] == 0x82 || s[1] == 0x83)
}

// isJunkJAAtom filters JA atoms that are lemmatizer artifacts.
// っ+single-hiragana (e.g. っう, っく, っる) are malformed godan stems
// produced when the lemmatizer over-strips a verb.
//
// The second character may sit in either half of the Hiragana block:
// E3 81 xx (U+3040..U+307F) or E3 82 80..9F (U+3080..U+309F). The latter
// half holds む, る and friends, which a check on the E3 81 lead bytes alone
// would let through.
func isJunkJAAtom(s string) bool {
	if len(s) != 6 {
		return false
	}
	// First code point must be っ (U+3063 = E3 81 A3).
	if s[0] != 0xe3 || s[1] != 0x81 || s[2] != 0xa3 {
		return false
	}
	if s[3] != 0xe3 {
		return false
	}
	switch s[4] {
	case 0x81:
		// Every E3 81 xx sequence lies inside the Hiragana block.
		return true
	case 0x82:
		// E3 82 A0 and above is Katakana; only 80..9F is Hiragana.
		return s[5] >= 0x80 && s[5] <= 0x9f
	}
	return false
}

// AtomLinkKey constructs the lattice key for a context-aware atom-link
// record. Composite of (langA, langB, "X", roleA, gen, rArch, rDisc,
// atomA \0 contextA \0 atomB) hashed via SipHash.
//
// Register coordinate (rArch, rDisc) is in the key so the same atom pair
// from different-register corpora produces distinct records.
This keeps // scripture-derived associations from polluting modern-conversational // lookups even when the atoms collide. // // Note: this is a point-lookup key. Prefix-scan queries are served by a // sidecar index, not by key structure. func AtomLinkKey(langA, langB, roleA, gen, rArch, rDisc uint8, atomA, contextA, atomB string) lattice.Key { n := 7 + len(atomA) + 1 + len(contextA) + 1 + len(atomB) buf := []byte{:n:n} buf[0] = langA buf[1] = langB buf[2] = 'X' buf[3] = roleA buf[4] = gen buf[5] = rArch buf[6] = rDisc off := 7 copy(buf[off:], []byte(atomA)) off += len(atomA) buf[off] = 0x00 off++ copy(buf[off:], []byte(contextA)) off += len(contextA) buf[off] = 0x00 off++ copy(buf[off:], []byte(atomB)) return lattice.HashKey(buf) } // IngestContextedAtomLink records a word-level cross-language link with // role and governing-context tagging. ContextA is an atom from the same // language as atomA (the immediate head when Head>=0, the clause's // predicate atom when Head==-1, or empty string for the predicate itself). // Same for contextB. // // Generation marker distinguishes legacy lossy-migrated records from // proper context-aware records; the lookup function prefers GenContexted // matches and falls back to GenLegacy. func IngestContextedAtomLink(t *Tree, langA, langB uint8, atomA, contextA string, roleA int32, atomB, contextB string, roleB int32, rArch, rDisc uint8, ) { // Lemmatize per language at ingest time so inflected forms collapse. if langA == 1 { atomA = LemmatizeEN(atomA).Lemma } else if langA == 2 { atomA = LemmatizeJA(atomA, roleA == HistVerb).Lemma } if langB == 1 { atomB = LemmatizeEN(atomB).Lemma } else if langB == 2 { atomB = LemmatizeJA(atomB, roleB == HistVerb).Lemma } if atomA == "" || atomB == "" { return } if isSingleKana(atomA) || isSingleKana(atomB) { return } if isJunkJAAtom(atomA) || isJunkJAAtom(atomB) { return } // Person-concordance filter: don't link 1st-person pronouns to // 2nd/3rd-person pronouns across languages. 
JA restructures // predication (EN "I love you" -> JA "君が好きだ") so role-based // alignment creates false cross-person pronoun links. pA := pronounPerson(langA, atomA) pB := pronounPerson(langB, atomB) if pA > 0 && pB > 0 && pA != pB { return } // Pronouns only link to pronouns. Prevents structural misalignment // where EN "you" (SUBJECT) links to JA 物 (SUBJECT) because JA // restructured the predication. if pA > 0 && pB == 0 { return } if pB > 0 && pA == 0 { return } key := AtomLinkKey(langA, langB, uint8(roleA), GenContexted, rArch, rDisc, atomA, contextA, atomB) ri := t.LookupRecIdx(lattice.Bpragmatic, key) if ri != lattice.NullRec { t.metaInc(ri) return } form := atomA | "|" | contextA | "|" | atomB | "|" | contextB var rec lattice.Record t.setFormOnRec(&rec, form) rec.Branch = uint8(lattice.Bpragmatic) ri = t.db.InsertRec(lattice.Bpragmatic, key, rec) stageTag := langA | (GenContexted << 4) t.metaSet(ri, MetaEntry{Count: 1, StageTag: stageTag}) // Extra layout for GenContexted records: // Extra[0]: roleB // Extra[1]: langB // Extra[2]: R_archaic (corpus register coordinate) // Extra[3]: R_discourse // Extra[4]: roleA m := t.metaGet(ri) if m != nil { m.Extra[0] = uint8(roleB) m.Extra[1] = langB m.Extra[2] = rArch m.Extra[3] = rDisc m.Extra[4] = uint8(roleA) if t.BulkMetaStore != nil { t.BulkMetaStore.dirty[ri] = true } } } // AtomLinkResult is the return type of LookupAtomLink. Carries the // destination atom and provenance information for diagnostic visibility. type AtomLinkResult struct { DstAtom string DstRole int32 DstContext string Weight uint32 Generation uint8 // 0 = legacy fallback, 1 = context-aware preferred match Tier uint8 // 1-4 relaxation tier that produced the pick; 0 = no match } // LookupAtomLink finds the best destination atom for (srcLang, srcAtom) // in dstLang via the sidecar index. Each candidate is scored by: // // score = log(1 + weight) × diversity_near × exp(-distance²/σ²) // // Three components: // // 1. 
log(1 + weight) - logarithmic in observation count. Compresses the // differentiation between high-frequency records so a Tatoeba- // memorized wrong mapping with weight=50 (score ~3.93) doesn't // outvote a less-frequent correct one with weight=5 (score ~1.79) // by orders of magnitude. Bayesian intuition: the 50 observations // from one corpus are correlated, not independent; their information // content scales sub-linearly. // // 2. diversity_near - count of distinct corpus register-coordinates // among candidates with the same DstAtom, FILTERED to coords whose // Gaussian factor is above the diversityNearThreshold. A DstAtom // backed by 3 corpora near the query has diversity_near=3; one // backed by Bible-only (far from a modern query) has near=0, // defaulting to 1. This prevents far-register records from padding // the diversity of an irrelevant DstAtom. // // 3. Gaussian distance - per-record register-axis filter. Records far // from the query coord get muted. Documented in the register coord // design. // // Net effect: corpus diversity outweighs raw count when the diversity // is in-register. A Tatoeba-only correct mapping at weight=20 scores // log(21)*1 = 3.04. A multi-corpus correct mapping at weight=5 each // across 3 near corpora scores log(6)*3 = 5.38. The diverse one wins. // A Tatoeba+Bible "diverse" but in-register-singular wrong mapping // scores log(weight)*1 because Bible's coord is filtered out. // // Tier order (within tier, highest score wins): // Tier 1: GenContexted match with exact (srcContext, srcRole) // Tier 2: GenContexted match with same srcRole, any context // Tier 3: GenContexted match with any role, any context // Tier 4: GenLegacy bilateral fallback // // If sigmaArch/sigmaDisc are 0 the Gaussian factor is omitted. // IngestStats tracks diagnostic counters for the trilateral scoring pipeline. 
type IngestStats struct { TriFired int32 TriConfirmed int32 TriSwapped int32 TriRescued int32 CtxSimFired int32 CtxSimBoosted int32 DictConfirmFired int32 DictAuthorityFired int32 } // dictPOSMatch returns true when a dictionary entry's POS-derived role // (dictRole) is compatible with the query atom's contextual role (queryRole). // POS-role mapping from dict-ingest: verb->3, adj/adv->4, noun/name->1, else->7. func dictPOSMatch(dictRole, queryRole int32) bool { switch dictRole { case HistVerb: return queryRole == HistVerb case HistModifier: return queryRole == HistModifier || queryRole == HistScope case HistSubject: return queryRole == HistSubject || queryRole == HistObject || queryRole == HistTopic || queryRole == HistComplement || queryRole == HistScope case HistComplement: return true } return true } func LookupAtomLink(idx *AtomIdx, srcLang, dstLang uint8, srcAtom, srcContext string, srcRole int32, qArch, qDisc uint8, sigmaArch, sigmaDisc float64, stats *IngestStats, ) AtomLinkResult { if idx == nil { return AtomLinkResult{} } candidates := idx.FindBySrc(srcLang, srcAtom) if len(candidates) == 0 { return AtomLinkResult{} } gauss := func(rArch, rDisc uint8) float64 { if sigmaArch <= 0 || sigmaDisc <= 0 { return 1.0 } da := float64(int32(rArch) - int32(qArch)) dd := float64(int32(rDisc) - int32(qDisc)) exponent := (da*da)/(sigmaArch*sigmaArch) + (dd*dd)/(sigmaDisc*sigmaDisc) return math.Exp(-exponent) } // First pass: per-DstAtom diversity_near. Count distinct corpus // coords whose Gaussian factor is above diversityNearThreshold. // Coords beyond the threshold are too register-distant to count as // supporting evidence for a translation. 
type coordSet map[uint16]bool diversity := map[string]coordSet{} for i := range candidates { e := &candidates[i] if e.DstLang != dstLang { continue } if gauss(e.RArchaic, e.RDiscourse) < diversityNearThreshold { continue } coord := uint16(e.RArchaic)<<8 | uint16(e.RDiscourse) s := diversity[e.DstAtom] if s == nil { s = coordSet{} diversity[e.DstAtom] = s } s[coord] = true } baseScore := func(e *AtomIdxEntry) float64 { w := math.Log1p(float64(e.Weight)) div := float64(len(diversity[e.DstAtom])) if div < 1 { div = 1 } return w * div * gauss(e.RArchaic, e.RDiscourse) } // Context-similarity: translate srcContext to dstLang once. // When a tier-2 candidate's ContextB matches, the candidate was // observed with the same governing head (translated) as the query - // strong polysemy disambiguation signal. ctxTranslation := "" if srcContext != "" { ctxCands := idx.FindBySrc(srcLang, srcContext) var bestCtxW float64 for j := range ctxCands { cc := &ctxCands[j] if cc.DstLang != dstLang { continue } w := float64(cc.Weight) * gauss(cc.RArchaic, cc.RDiscourse) if w > bestCtxW { bestCtxW = w ctxTranslation = cc.DstAtom } } } // Bilateral consistency: only check the top candidate per tier to // avoid O(K*M) backward lookups on polysemous atoms. biCheck := func(dstAtom string) float64 { backCands := idx.FindBySrc(dstLang, dstAtom) var best, src float64 for j := range backCands { bc := &backCands[j] if bc.DstLang != srcLang { continue } w := float64(bc.Weight) if w > best { best = w } if bc.DstAtom == srcAtom && w > src { src = w } } if best <= 0 { return 1.0 } return (src + 1) / (best + 1) } // Per-DstAtom aggregation: sum baseScore across all entries for the // same destination atom within each tier. Polysemous atoms observed // in many contexts accumulate evidence. 
type atomAgg struct { bestEntry *AtomIdxEntry aggScore float64 tier int32 } tierAtoms := map[string]*atomAgg{} for i := range candidates { e := &candidates[i] if e.DstLang != dstLang { continue } s := baseScore(e) ti := -1 if e.Gen == GenContexted { if e.ContextA == srcContext && int32(e.RoleA) == srcRole { ti = 0 } else if int32(e.RoleA) == srcRole { ti = 1 } else { ti = 2 } } else if e.Gen == GenLegacy { ti = 3 } else if e.Gen == GenDictionary { ti = 4 } if ti < 0 { continue } key := string([]byte{byte(ti), ':'}) | e.DstAtom a := tierAtoms[key] if a == nil { a = &atomAgg{tier: ti} tierAtoms[key] = a } a.aggScore += s if a.bestEntry == nil || s > baseScore(a.bestEntry) { a.bestEntry = e } } // Collect top-N per tier for bilateral scoring. const topN = 8 type ranked struct { entry *AtomIdxEntry aggScore float64 dstAtom string } var top [5][topN]ranked for _, a := range tierAtoms { ti := a.tier s := a.aggScore slot := -1 for k := 0; k < topN; k++ { if top[ti][k].entry == nil { slot = k break } if s > top[ti][k].aggScore { slot = k break } } if slot < 0 { continue } for k := topN - 1; k > slot; k-- { top[ti][k] = top[ti][k-1] } top[ti][slot] = ranked{entry: a.bestEntry, aggScore: s, dstAtom: a.bestEntry.DstAtom} } // triConfirm checks whether srcAtom->dstAtom is confirmed by a // 2-hop path through an intermediate language. 
intermediateLangs := [2]uint8{0x03, 0x04} // KO, ZH triConfirm := func(dstAtom string) int32 { confirms := 0 for _, mid := range intermediateLangs { if mid == srcLang || mid == dstLang { continue } srcMid := topAtomVia(idx, srcLang, mid, srcAtom) if srcMid == "" { continue } dstMid := topAtomVia(idx, dstLang, mid, dstAtom) if dstMid == "" { continue } if srcMid == dstMid { confirms++ } } return confirms } ctxMatch := func(e *AtomIdxEntry) bool { return ctxTranslation != "" && e.ContextB == ctxTranslation } dictConfirm := func(dstAtom string) bool { for i := range candidates { e := &candidates[i] if e.Gen != GenDictionary || e.DstLang != dstLang { continue } if !dictPOSMatch(int32(e.RoleA), srcRole) { continue } if e.DstAtom == dstAtom { return true } // Fuzzy: corpus "search" matches dict "search for", // or corpus "carry out" matches dict "carry". if dstLang == 1 { da := e.DstAtom if len(dstAtom) < len(da) && da[:len(dstAtom)] == dstAtom && da[len(dstAtom)] == ' ' { return true } if len(da) < len(dstAtom) && dstAtom[:len(da)] == da && dstAtom[len(da)] == ' ' { return true } } } return false } // Combined scoring: bilateral ratio modulates aggregate score. // ctx-sim, triangulation, and dictionary confirmation are bonuses. 
type scored struct { entry *AtomIdxEntry combined float64 tier int32 dictOK bool } var viable []scored for ti := 0; ti < 4; ti++ { topAgg := 0.0 if top[ti][0].entry != nil { topAgg = top[ti][0].aggScore } for k := 0; k < topN; k++ { r := &top[ti][k] if r.entry == nil { continue } if topAgg > 0 && r.aggScore < topAgg*0.1 { continue } bi := biCheck(r.dstAtom) combined := r.aggScore * (bi + 0.05) if ctxMatch(r.entry) { combined *= 1.5 stats.CtxSimFired++ } dc := dictConfirm(r.dstAtom) if dc { combined *= 1.6 stats.DictConfirmFired++ } viable = append(viable, scored{entry: r.entry, combined: combined, tier: ti, dictOK: dc}) } if len(viable) > 0 { break } } // Dictionary authority counter: track when dict-confirmed candidates // exist in the viable set (for diagnostics). for _, v := range viable { if v.dictOK { stats.DictAuthorityFired++ break } } // Tier-4 (dictionary) only as last resort when corpus tiers empty. // Dict entries are pre-validated translations, so use a biRatio floor // to prevent polysemous back-indexes from over-penalizing common words. if len(viable) == 0 { for k := 0; k < topN; k++ { r := &top[4][k] if r.entry == nil { continue } bi := biCheck(r.dstAtom) if bi < 0.25 { bi = 0.25 } combined := r.aggScore * (bi + 0.05) dc := dictConfirm(r.dstAtom) if dc { combined *= 1.6 stats.DictConfirmFired++ } viable = append(viable, scored{entry: r.entry, combined: combined, tier: 4, dictOK: dc}) } } // Triangulation bonus on top-2 viable candidates. if len(viable) >= 2 { stats.TriFired++ tc0 := triConfirm(viable[0].entry.DstAtom) tc1 := triConfirm(viable[1].entry.DstAtom) if tc0 > 0 || tc1 > 0 { stats.TriConfirmed++ } if tc1 > 0 && tc0 == 0 { viable[1].combined *= 1.3 stats.TriSwapped++ } else if tc0 > 0 && tc1 == 0 { viable[0].combined *= 1.3 } } else if len(viable) == 1 { stats.TriFired++ tc := triConfirm(viable[0].entry.DstAtom) if tc > 0 { stats.TriConfirmed++ stats.TriRescued++ } } // Pick highest combined score. 
var pick *AtomIdxEntry tier := uint8(0) bestCombined := 0.0 for _, v := range viable { if v.combined > bestCombined { bestCombined = v.combined pick = v.entry tier = uint8(v.tier + 1) } } if pick == nil { return AtomLinkResult{} } return AtomLinkResult{ DstAtom: pick.DstAtom, DstRole: int32(pick.RoleB), DstContext: pick.ContextB, Weight: pick.Weight, Generation: pick.Gen, Tier: tier, } } // DiagCandidate holds scoring details for one candidate in the ranked list. type DiagCandidate struct { DstAtom string AggScore float64 BiRatio float64 Combined float64 CtxSim bool Tier int32 } // LookupAtomLinkDiag is LookupAtomLink with full candidate diagnostics. func LookupAtomLinkDiag(idx *AtomIdx, srcLang, dstLang uint8, srcAtom, srcContext string, srcRole int32, qArch, qDisc uint8, sigmaArch, sigmaDisc float64, ) (AtomLinkResult, []DiagCandidate) { if idx == nil { return AtomLinkResult{}, nil } candidates := idx.FindBySrc(srcLang, srcAtom) if len(candidates) == 0 { return AtomLinkResult{}, nil } gauss := func(rArch, rDisc uint8) float64 { if sigmaArch <= 0 || sigmaDisc <= 0 { return 1.0 } da := float64(int32(rArch) - int32(qArch)) dd := float64(int32(rDisc) - int32(qDisc)) exponent := (da*da)/(sigmaArch*sigmaArch) + (dd*dd)/(sigmaDisc*sigmaDisc) return math.Exp(-exponent) } type coordSet map[uint16]bool diversity := map[string]coordSet{} for i := range candidates { e := &candidates[i] if e.DstLang != dstLang { continue } if gauss(e.RArchaic, e.RDiscourse) < diversityNearThreshold { continue } coord := uint16(e.RArchaic)<<8 | uint16(e.RDiscourse) s := diversity[e.DstAtom] if s == nil { s = coordSet{} diversity[e.DstAtom] = s } s[coord] = true } baseScore := func(e *AtomIdxEntry) float64 { w := math.Log1p(float64(e.Weight)) div := float64(len(diversity[e.DstAtom])) if div < 1 { div = 1 } return w * div * gauss(e.RArchaic, e.RDiscourse) } ctxTranslation := "" if srcContext != "" { ctxCands := idx.FindBySrc(srcLang, srcContext) var bestCtxW float64 for j := range ctxCands { cc 
:= &ctxCands[j] if cc.DstLang != dstLang { continue } w := float64(cc.Weight) * gauss(cc.RArchaic, cc.RDiscourse) if w > bestCtxW { bestCtxW = w ctxTranslation = cc.DstAtom } } } biCheck := func(dstAtom string) float64 { backCands := idx.FindBySrc(dstLang, dstAtom) var best, src float64 for j := range backCands { bc := &backCands[j] if bc.DstLang != srcLang { continue } w := float64(bc.Weight) if w > best { best = w } if bc.DstAtom == srcAtom && w > src { src = w } } if best <= 0 { return 1.0 } return (src + 1) / (best + 1) } type atomAgg struct { bestEntry *AtomIdxEntry aggScore float64 tier int32 } tierAtoms := map[string]*atomAgg{} for i := range candidates { e := &candidates[i] if e.DstLang != dstLang { continue } s := baseScore(e) ti := -1 if e.Gen == GenContexted { if e.ContextA == srcContext && int32(e.RoleA) == srcRole { ti = 0 } else if int32(e.RoleA) == srcRole { ti = 1 } else { ti = 2 } } else if e.Gen == GenLegacy { ti = 3 } else if e.Gen == GenDictionary { ti = 4 } if ti < 0 { continue } key := string([]byte{byte(ti), ':'}) | e.DstAtom a := tierAtoms[key] if a == nil { a = &atomAgg{tier: ti} tierAtoms[key] = a } a.aggScore += s if a.bestEntry == nil || s > baseScore(a.bestEntry) { a.bestEntry = e } } type ranked struct { entry *AtomIdxEntry aggScore float64 dstAtom string } var top [5][4]ranked for _, a := range tierAtoms { ti := a.tier s := a.aggScore slot := -1 for k := 0; k < 4; k++ { if top[ti][k].entry == nil { slot = k break } if s > top[ti][k].aggScore { slot = k break } } if slot < 0 { continue } for k := 3; k > slot; k-- { top[ti][k] = top[ti][k-1] } top[ti][slot] = ranked{entry: a.bestEntry, aggScore: s, dstAtom: a.bestEntry.DstAtom} } ctxMatch := func(e *AtomIdxEntry) bool { return ctxTranslation != "" && e.ContextB == ctxTranslation } diagDictConfirm := func(dstAtom string) bool { for i := range candidates { e := &candidates[i] if e.Gen != GenDictionary || e.DstLang != dstLang { continue } if !dictPOSMatch(int32(e.RoleA), srcRole) { 
continue } if e.DstAtom == dstAtom { return true } if dstLang == 1 { da := e.DstAtom if len(dstAtom) < len(da) && da[:len(dstAtom)] == dstAtom && da[len(dstAtom)] == ' ' { return true } if len(da) < len(dstAtom) && dstAtom[:len(da)] == da && dstAtom[len(da)] == ' ' { return true } } } return false } type scoredD struct { entry *AtomIdxEntry combined float64 aggScore float64 biRatio float64 ctxSim bool tier int32 } var viable []scoredD for ti := 0; ti < 4; ti++ { for k := 0; k < 4; k++ { r := &top[ti][k] if r.entry == nil { continue } bi := biCheck(r.dstAtom) combined := r.aggScore * (bi + 0.05) cm := ctxMatch(r.entry) if cm { combined *= 1.5 } if diagDictConfirm(r.dstAtom) { combined *= 1.6 } viable = append(viable, scoredD{ entry: r.entry, combined: combined, aggScore: r.aggScore, biRatio: bi, ctxSim: cm, tier: ti, }) } if len(viable) > 0 { break } } if len(viable) == 0 { for k := 0; k < 4; k++ { r := &top[4][k] if r.entry == nil { continue } bi := biCheck(r.dstAtom) combined := r.aggScore * (bi + 0.05) viable = append(viable, scoredD{ entry: r.entry, combined: combined, aggScore: r.aggScore, biRatio: bi, ctxSim: false, tier: 4, }) } } var diag []DiagCandidate for _, v := range viable { diag = append(diag, DiagCandidate{ DstAtom: v.entry.DstAtom, AggScore: v.aggScore, BiRatio: v.biRatio, Combined: v.combined, CtxSim: v.ctxSim, Tier: v.tier, }) } if len(viable) >= 2 { tc0 := triConfirmStatic(idx, srcLang, dstLang, srcAtom, viable[0].entry.DstAtom) tc1 := triConfirmStatic(idx, srcLang, dstLang, srcAtom, viable[1].entry.DstAtom) if tc1 > 0 && tc0 == 0 { viable[1].combined *= 1.3 } else if tc0 > 0 && tc1 == 0 { viable[0].combined *= 1.3 } } var pick *AtomIdxEntry tier := uint8(0) bestCombined := 0.0 for _, v := range viable { if v.combined > bestCombined { bestCombined = v.combined pick = v.entry tier = uint8(v.tier + 1) } } if pick == nil { return AtomLinkResult{}, diag } return AtomLinkResult{ DstAtom: pick.DstAtom, DstRole: int32(pick.RoleB), DstContext: 
pick.ContextB, Weight: pick.Weight, Generation: pick.Gen, Tier: tier, }, diag } // triConfirmStatic is a non-counter-incrementing version for diagnostics. func triConfirmStatic(idx *AtomIdx, srcLang, dstLang uint8, srcAtom, dstAtom string) int32 { intermediateLangs := [2]uint8{0x03, 0x04} confirms := 0 for _, mid := range intermediateLangs { if mid == srcLang || mid == dstLang { continue } srcMid := topAtomVia(idx, srcLang, mid, srcAtom) if srcMid == "" { continue } dstMid := topAtomVia(idx, dstLang, mid, dstAtom) if dstMid == "" { continue } if srcMid == dstMid { confirms++ } } return confirms } // topAtomVia returns the highest-weight DstAtom for srcLang->dstLang // without full scoring. Used by triangulation to get a quick "what does // this atom translate to via language M?" answer. func topAtomVia(idx *AtomIdx, srcLang, dstLang uint8, srcAtom string) string { cands := idx.FindBySrc(srcLang, srcAtom) var bestAtom string var bestScore float64 for i := range cands { e := &cands[i] if e.DstLang != dstLang { continue } da := float64(e.RArchaic) dd := float64(e.RDiscourse) g := math.Exp(-(da*da)/(DefaultSigmaArchaic*DefaultSigmaArchaic) - (dd*dd)/(DefaultSigmaDiscourse*DefaultSigmaDiscourse)) s := float64(e.Weight) * g if s > bestScore { bestScore = s bestAtom = e.DstAtom } } return bestAtom } // IngestAtomLink records a word-level cross-domain correspondence. // Words are lemmatized before storage so inflected forms collapse to region centers. // srcRole/dstRole hint whether the word is a verb (needed for JA lemmatization). // // This is the legacy bilateral function (GenLegacy generation). It remains // in place for backward compatibility and as the lookup fallback for atoms // that have no GenContexted records yet. New ingest paths should call // IngestContextedAtomLink instead. 
func IngestAtomLink(t *Tree, srcDomain, dstDomain uint8, srcWord, dstWord string, srcRole, dstRole int32, rArch, rDisc uint8) { srcAtom := srcWord dstAtom := dstWord if srcDomain == 1 { srcAtom = LemmatizeEN(srcWord).Lemma } else if srcDomain == 2 { srcAtom = LemmatizeJA(srcWord, srcRole == HistVerb).Lemma } if dstDomain == 1 { dstAtom = LemmatizeEN(dstWord).Lemma } else if dstDomain == 2 { dstAtom = LemmatizeJA(dstWord, dstRole == HistVerb).Lemma } if isSingleKana(srcAtom) || isSingleKana(dstAtom) { return } if isJunkJAAtom(srcAtom) || isJunkJAAtom(dstAtom) { return } buf := []byte{:5 + len(srcAtom) + len(dstAtom):5 + len(srcAtom) + len(dstAtom)} buf[0] = srcDomain buf[1] = dstDomain buf[2] = 'L' buf[3] = rArch buf[4] = rDisc copy(buf[5:], []byte(srcAtom)) copy(buf[5+len(srcAtom):], []byte(dstAtom)) key := lattice.HashKey(buf) ri := t.LookupRecIdx(lattice.Bpragmatic, key) if ri != lattice.NullRec { t.metaInc(ri) return } form := srcAtom | "=" | dstAtom var rec lattice.Record t.setFormOnRec(&rec, form) rec.Branch = uint8(lattice.Bpragmatic) ri = t.db.InsertRec(lattice.Bpragmatic, key, rec) t.metaSet(ri, MetaEntry{Count: 1, StageTag: srcDomain}) // GenLegacy records also carry the corpus register coord so the sidecar // reader can apply distance weighting to legacy candidates too. 
m := t.metaGet(ri) if m != nil { m.Extra[2] = rArch m.Extra[3] = rDisc if t.BulkMetaStore != nil { t.BulkMetaStore.dirty[ri] = true } } } func IngestDictAtomLink(t *Tree, srcDomain, dstDomain uint8, srcWord, dstWord string, srcRole, dstRole int32) { srcAtom := srcWord dstAtom := dstWord if srcDomain == 1 { srcAtom = LemmatizeEN(srcWord).Lemma } else if srcDomain == 2 { srcAtom = LemmatizeJA(srcWord, srcRole == HistVerb).Lemma } if dstDomain == 1 { dstAtom = LemmatizeEN(dstWord).Lemma } else if dstDomain == 2 { dstAtom = LemmatizeJA(dstWord, dstRole == HistVerb).Lemma } ingestDictAtomLinkInner(t, srcDomain, dstDomain, srcAtom, dstAtom, srcRole, dstRole) } func IngestDictAtomLinkRaw(t *Tree, srcDomain, dstDomain uint8, srcWord, dstWord string, srcRole, dstRole int32) { ingestDictAtomLinkInner(t, srcDomain, dstDomain, srcWord, dstWord, srcRole, dstRole) } func ingestDictAtomLinkInner(t *Tree, srcDomain, dstDomain uint8, srcAtom, dstAtom string, srcRole, dstRole int32) { if srcAtom == "" || dstAtom == "" { return } if isSingleKana(srcAtom) || isSingleKana(dstAtom) { return } if isJunkJAAtom(srcAtom) || isJunkJAAtom(dstAtom) { return } buf := []byte{:5 + len(srcAtom) + len(dstAtom):5 + len(srcAtom) + len(dstAtom)} buf[0] = srcDomain buf[1] = dstDomain buf[2] = 'D' buf[3] = uint8(srcRole) buf[4] = uint8(dstRole) copy(buf[5:], []byte(srcAtom)) copy(buf[5+len(srcAtom):], []byte(dstAtom)) key := lattice.HashKey(buf) ri := t.LookupRecIdx(lattice.Bpragmatic, key) if ri != lattice.NullRec { t.metaInc(ri) return } form := srcAtom | "=" | dstAtom var rec lattice.Record t.setFormOnRec(&rec, form) rec.Branch = uint8(lattice.Bpragmatic) ri = t.db.InsertRec(lattice.Bpragmatic, key, rec) stageTag := srcDomain | (GenDictionary << 4) t.metaSet(ri, MetaEntry{Count: 1, StageTag: stageTag}) m := t.metaGet(ri) if m != nil { m.Extra[0] = uint8(dstRole) m.Extra[1] = dstDomain m.Extra[4] = uint8(srcRole) if t.BulkMetaStore != nil { t.BulkMetaStore.dirty[ri] = true } } }