package iskra import "git.smesh.lol/iskradb/lattice" // Word order constants encoded in LangDesc.Order. const ( OrderSVO uint8 = 0 OrderSOV uint8 = 1 OrderVSO uint8 = 2 OrderVOS uint8 = 3 OrderOVS uint8 = 4 OrderOSV uint8 = 5 ) // Marker system constants encoded in LangDesc.Markers. const ( MarkerPrepositional uint8 = 0 MarkerPostpositional uint8 = 1 MarkerCase uint8 = 2 ) // LangDesc encodes the structural properties of a language for the cluster pipeline. // Stored in Bcooccur at MakeKey(domain, 0, ""). type LangDesc struct { Order uint8 // word order: OrderSVO, OrderSOV, etc. HeadFinal bool // false=head-initial (EN), true=head-final (JA) Particle bool // false=position-bounded parser, true=particle-bounded PreNomRC bool // false=post-nominal RC (EN), true=pre-nominal RC (JA) ZeroCopula bool // false=overt copula (EN), true=zero copula (JA) Markers uint8 // MarkerPrepositional / Postpositional / Case } // Particle role codes stored in Record.Branch of Bcooccur particle-role records. const ( RoleNone uint8 = 0 RoleNPSubjTopic uint8 = 1 RoleNPSubjGram uint8 = 2 RoleNPObjDirect uint8 = 3 RolePPSource uint8 = 4 RolePPLimit uint8 = 5 RolePPComitative uint8 = 6 RolePPLocative uint8 = 7 RolePPTemporal uint8 = 8 RoleNPDative uint8 = 9 RolePPLocStatic uint8 = 10 RolePPInstrumental uint8 = 11 ) // CoordVerbClass is the sentinel coord used for verb class records in Bcooccur. // Distinct from particle role records (coord=0) and lang descriptors (coord=0, empty word). const CoordVerbClass uint64 = 1 // KeyNormalizer is a per-domain hook that maps a raw word to its canonical // lookup form before hashing. nil = identity. 
// // Examples: // LangEN: lowercase ("Cat" → "cat") // LangJA: identity (surface form already canonical) // DomainMoxieSRC: qualified name resolution ("Method" → "pkg.Type.Method") // DomainMoxieIR: LLVM mangled name canonicalization type KeyNormalizer func(word string) string var keyNormalizers = map[uint8]KeyNormalizer{} // RegisterKeyNormalizer stores a per-domain key normalization function. func RegisterKeyNormalizer(domain uint8, fn KeyNormalizer) { keyNormalizers[domain] = fn } // NormalizeKey applies the registered normalizer for domain, or returns word unchanged. func NormalizeKey(domain uint8, word string) string { if fn, ok := keyNormalizers[domain]; ok && fn != nil { return fn(word) } return word } func packDescBranch(d LangDesc) uint8 { v := d.Order & 0x7 if d.HeadFinal { v |= 1 << 3 } if d.Particle { v |= 1 << 4 } if d.PreNomRC { v |= 1 << 5 } if d.ZeroCopula { v |= 1 << 6 } return v } func unpackDesc(b uint8, markerSys uint8) LangDesc { return LangDesc{ Order: b & 0x7, HeadFinal: (b>>3)&1 == 1, Particle: (b>>4)&1 == 1, PreNomRC: (b>>5)&1 == 1, ZeroCopula: (b>>6)&1 == 1, Markers: markerSys & 0x3, } } // RegisterLangDesc writes a LangDesc into Bcooccur at MakeKey(domain, 0, ""). func RegisterLangDesc(tree *lattice.Tree, pool *[]byte, domain uint8, desc LangDesc) { key := MakeKey(domain, 0, "") ri := tree.LookupRecIdx(lattice.Bcooccur, key) if ri != lattice.NullRec { if r := tree.GetRecord(ri); r != nil { r.Branch = packDescBranch(desc) r.DataFile = uint32(desc.Markers&0x3) << 6 } return } var rec lattice.Record rec.Branch = packDescBranch(desc) rec.DataFile = uint32(desc.Markers&0x3) << 6 tree.InsertRec(lattice.Bcooccur, key, rec) } // GetLangDesc reads a LangDesc from Bcooccur. Returns false if not registered. 
func GetLangDesc(tree *lattice.Tree, domain uint8) (LangDesc, bool) { key := MakeKey(domain, 0, "") ri := tree.LookupRecIdx(lattice.Bcooccur, key) if ri == lattice.NullRec { return LangDesc{}, false } rec := tree.GetRecord(ri) if rec == nil { return LangDesc{}, false } markerSys := uint8((rec.DataFile >> 6) & 0x3) return unpackDesc(rec.Branch, markerSys), true } // RegisterParticleRole stores a particle→role mapping in Bcooccur. // semCoord=0 for unambiguous particles; semantic-coord for ambiguous disambiguation. func RegisterParticleRole(tree *lattice.Tree, pool *[]byte, domain uint8, semCoord uint64, particle string, role uint8) { key := MakeKey(domain, semCoord, particle) ri := tree.LookupRecIdx(lattice.Bcooccur, key) if ri != lattice.NullRec { if r := tree.GetRecord(ri); r != nil { r.Branch = role } return } var rec lattice.Record rec.Branch = role tree.InsertRec(lattice.Bcooccur, key, rec) } // LookupParticleRole resolves a particle to its syntactic role using NP semantic flags. // Tries semantic-coord disambiguation first via RelaxCoord, then coord=0 default. func LookupParticleRole(tree *lattice.Tree, domain uint8, particle string, npFlags uint64) uint8 { if npFlags != 0 { semCoord := PackCoord(npFlags, 0, 0, 0, 0, 0, 0) for _, c := range RelaxCoord(semCoord) { if c == 0 { break } ri := tree.LookupRecIdx(lattice.Bcooccur, MakeKey(domain, c, particle)) if ri != lattice.NullRec { if rec := tree.GetRecord(ri); rec != nil && rec.Branch != 0 { return rec.Branch } } } } ri := tree.LookupRecIdx(lattice.Bcooccur, MakeKey(domain, 0, particle)) if ri != lattice.NullRec { if rec := tree.GetRecord(ri); rec != nil { return rec.Branch } } return RoleNone } // MarkerFunc maps a particle role to its surface marker string for a given domain. type MarkerFunc func(role uint8) string var markerFuncs = map[uint8]MarkerFunc{} // RegisterMarkerFunc registers the marker lookup function for a domain. 
func RegisterMarkerFunc(domain uint8, fn MarkerFunc) {
	// A later registration for the same domain replaces the earlier one.
	markerFuncs[domain] = fn
}

// LookupTargetMarker returns the surface marker (preposition/postposition) for a
// role in the target domain, calling the registered MarkerFunc.
// It returns "" when the domain has no MarkerFunc or a nil one was registered.
func LookupTargetMarker(domain uint8, role uint8) string {
	// A missing entry yields a nil func, so one nil check covers both cases.
	if fn := markerFuncs[domain]; fn != nil {
		return fn(role)
	}
	return ""
}