langdesc.mx raw
1 package iskra
2
3 import "git.smesh.lol/iskradb/lattice"
4
// Word-order codes stored in LangDesc.Order. Each name spells the relative
// order of subject (S), verb (V) and object (O). packDescBranch keeps only
// the low three bits of the code, so every value here must stay within 0..7.
const (
	OrderSVO = uint8(0)
	OrderSOV = uint8(1)
	OrderVSO = uint8(2)
	OrderVOS = uint8(3)
	OrderOVS = uint8(4)
	OrderOSV = uint8(5)
)
14
// Marker-system codes stored in LangDesc.Markers. Only the low two bits of
// the code survive a round trip through RegisterLangDesc / GetLangDesc, so
// every value here must stay within 0..3.
const (
	MarkerPrepositional  = uint8(0)
	MarkerPostpositional = uint8(1)
	MarkerCase           = uint8(2)
)
21
// LangDesc describes the structural properties of one language for the
// cluster pipeline. It is stored in Bcooccur at MakeKey(domain, 0, "").
type LangDesc struct {
	// Order is the word-order code: OrderSVO, OrderSOV, etc.
	Order uint8
	// HeadFinal is false for head-initial languages (EN) and true for
	// head-final ones (JA).
	HeadFinal bool
	// Particle is false for a position-bounded parser and true for a
	// particle-bounded one.
	Particle bool
	// PreNomRC is false for post-nominal relative clauses (EN) and true for
	// pre-nominal ones (JA).
	PreNomRC bool
	// ZeroCopula is false for an overt copula (EN) and true for a zero
	// copula (JA).
	ZeroCopula bool
	// Markers is the marker-system code: MarkerPrepositional,
	// MarkerPostpositional or MarkerCase.
	Markers uint8
}
32
// Particle role codes. A code is stored in Record.Branch of a Bcooccur
// particle-role record and is what LookupParticleRole returns. RoleNone is
// the "no role" value returned when no record matches.
const (
	RoleNone           = uint8(0)
	RoleNPSubjTopic    = uint8(1)
	RoleNPSubjGram     = uint8(2)
	RoleNPObjDirect    = uint8(3)
	RolePPSource       = uint8(4)
	RolePPLimit        = uint8(5)
	RolePPComitative   = uint8(6)
	RolePPLocative     = uint8(7)
	RolePPTemporal     = uint8(8)
	RoleNPDative       = uint8(9)
	RolePPLocStatic    = uint8(10)
	RolePPInstrumental = uint8(11)
)
48
// CoordVerbClass is the sentinel coord used for verb class records in
// Bcooccur. It is distinct from coord 0, which is used by the language
// descriptor (coord=0, empty word) and by default particle-role records.
const CoordVerbClass = uint64(1)
52
// KeyNormalizer is a per-domain hook that rewrites a raw word into the
// canonical form used for lookup, before the word is hashed. A nil
// KeyNormalizer means identity (the word is used unchanged).
//
// Examples:
//
//	LangEN:         lowercase ("Cat" → "cat")
//	LangJA:         identity (surface form already canonical)
//	DomainMoxieSRC: qualified name resolution ("Method" → "pkg.Type.Method")
//	DomainMoxieIR:  LLVM mangled name canonicalization
type KeyNormalizer func(word string) string
62
63 var keyNormalizers = map[uint8]KeyNormalizer{}
64
65 // RegisterKeyNormalizer stores a per-domain key normalization function.
66 func RegisterKeyNormalizer(domain uint8, fn KeyNormalizer) {
67 keyNormalizers[domain] = fn
68 }
69
70 // NormalizeKey applies the registered normalizer for domain, or returns word unchanged.
71 func NormalizeKey(domain uint8, word string) string {
72 if fn, ok := keyNormalizers[domain]; ok && fn != nil {
73 return fn(word)
74 }
75 return word
76 }
77
78 func packDescBranch(d LangDesc) uint8 {
79 v := d.Order & 0x7
80 if d.HeadFinal { v |= 1 << 3 }
81 if d.Particle { v |= 1 << 4 }
82 if d.PreNomRC { v |= 1 << 5 }
83 if d.ZeroCopula { v |= 1 << 6 }
84 return v
85 }
86
87 func unpackDesc(b uint8, markerSys uint8) LangDesc {
88 return LangDesc{
89 Order: b & 0x7,
90 HeadFinal: (b>>3)&1 == 1,
91 Particle: (b>>4)&1 == 1,
92 PreNomRC: (b>>5)&1 == 1,
93 ZeroCopula: (b>>6)&1 == 1,
94 Markers: markerSys & 0x3,
95 }
96 }
97
98 // RegisterLangDesc writes a LangDesc into Bcooccur at MakeKey(domain, 0, "").
99 func RegisterLangDesc(tree *lattice.Tree, pool *[]byte, domain uint8, desc LangDesc) {
100 key := MakeKey(domain, 0, "")
101 ri := tree.LookupRecIdx(lattice.Bcooccur, key)
102 if ri != lattice.NullRec {
103 if r := tree.GetRecord(ri); r != nil {
104 r.Branch = packDescBranch(desc)
105 r.DataFile = uint32(desc.Markers&0x3) << 6
106 }
107 return
108 }
109 var rec lattice.Record
110 rec.Branch = packDescBranch(desc)
111 rec.DataFile = uint32(desc.Markers&0x3) << 6
112 tree.InsertRec(lattice.Bcooccur, key, rec)
113 }
114
115 // GetLangDesc reads a LangDesc from Bcooccur. Returns false if not registered.
116 func GetLangDesc(tree *lattice.Tree, domain uint8) (LangDesc, bool) {
117 key := MakeKey(domain, 0, "")
118 ri := tree.LookupRecIdx(lattice.Bcooccur, key)
119 if ri == lattice.NullRec {
120 return LangDesc{}, false
121 }
122 rec := tree.GetRecord(ri)
123 if rec == nil {
124 return LangDesc{}, false
125 }
126 markerSys := uint8((rec.DataFile >> 6) & 0x3)
127 return unpackDesc(rec.Branch, markerSys), true
128 }
129
130 // RegisterParticleRole stores a particle→role mapping in Bcooccur.
131 // semCoord=0 for unambiguous particles; semantic-coord for ambiguous disambiguation.
132 func RegisterParticleRole(tree *lattice.Tree, pool *[]byte, domain uint8, semCoord uint64, particle string, role uint8) {
133 key := MakeKey(domain, semCoord, particle)
134 ri := tree.LookupRecIdx(lattice.Bcooccur, key)
135 if ri != lattice.NullRec {
136 if r := tree.GetRecord(ri); r != nil {
137 r.Branch = role
138 }
139 return
140 }
141 var rec lattice.Record
142 rec.Branch = role
143 tree.InsertRec(lattice.Bcooccur, key, rec)
144 }
145
146 // LookupParticleRole resolves a particle to its syntactic role using NP semantic flags.
147 // Tries semantic-coord disambiguation first via RelaxCoord, then coord=0 default.
148 func LookupParticleRole(tree *lattice.Tree, domain uint8, particle string, npFlags uint64) uint8 {
149 if npFlags != 0 {
150 semCoord := PackCoord(npFlags, 0, 0, 0, 0, 0, 0)
151 for _, c := range RelaxCoord(semCoord) {
152 if c == 0 {
153 break
154 }
155 ri := tree.LookupRecIdx(lattice.Bcooccur, MakeKey(domain, c, particle))
156 if ri != lattice.NullRec {
157 if rec := tree.GetRecord(ri); rec != nil && rec.Branch != 0 {
158 return rec.Branch
159 }
160 }
161 }
162 }
163 ri := tree.LookupRecIdx(lattice.Bcooccur, MakeKey(domain, 0, particle))
164 if ri != lattice.NullRec {
165 if rec := tree.GetRecord(ri); rec != nil {
166 return rec.Branch
167 }
168 }
169 return RoleNone
170 }
171
// MarkerFunc returns the surface marker string that expresses the given
// particle role code in one domain. One MarkerFunc is registered per domain
// via RegisterMarkerFunc.
type MarkerFunc func(role uint8) string
174
175 var markerFuncs = map[uint8]MarkerFunc{}
176
177 // RegisterMarkerFunc registers the marker lookup function for a domain.
178 func RegisterMarkerFunc(domain uint8, fn MarkerFunc) {
179 markerFuncs[domain] = fn
180 }
181
182 // LookupTargetMarker returns the surface marker (preposition/postposition) for a
183 // role in the target domain, calling the registered MarkerFunc.
184 func LookupTargetMarker(domain uint8, role uint8) string {
185 if fn, ok := markerFuncs[domain]; ok && fn != nil {
186 return fn(role)
187 }
188 return ""
189 }
190