pattern.mx raw
1 package iskra
2
3 import "git.smesh.lol/iskradb/lattice"
4
5 // Pattern encoding: a sentence skeleton is a sequence of elements.
6 // Each element is either a content SLOT (role placeholder) or a structural MARKER.
7 //
8 // Byte encoding:
9 // 0x80 | role_id = content slot (role in bits 0-6)
10 // 0x00 - 0x7F = marker ID (structural word: particle, preposition, keyword)
11 //
12 // This gives 128 marker IDs and 128 role types. Patterns are typically 3-8 bytes.
13
// Slot bytes - what a content slot expects. Each value is the slot flag
// (bit 7, 0x80) OR-ed with a role ID held in bits 0-6.
const (
	SlotNoun     uint8 = 0x80 | 0 // noun/nominal content
	SlotVerb     uint8 = 0x80 | 1 // verb/action
	SlotModifier uint8 = 0x80 | 2 // adjective/adverb
	SlotScope    uint8 = 0x80 | 3 // scope body (block, clause)
	SlotLiteral  uint8 = 0x80 | 4 // literal value
	SlotExpr     uint8 = 0x80 | 5 // expression (code) or clause (natural)
)
23
// IsSlot reports whether pattern byte b encodes a content slot, i.e. has its
// high bit set. Bytes below 0x80 are marker IDs.
func IsSlot(b uint8) bool {
	return b >= 0x80
}
// SlotRole clears the slot flag (bit 7) of b and returns the remaining
// seven-bit role ID. It does not check that b is a slot: a marker byte is
// returned unchanged.
func SlotRole(b uint8) uint8 {
	return b &^ 0x80
}
26
// Marker IDs - structural words that define the skeleton.
//
// ID ranges used by this block, listed here in ascending order:
// JA particles 1-16, EN structural words 17-42, synthetic morph-bit markers
// 43-46, EN "than" 47, code keywords 49-65, EN possessive determiners 70-76.
// The word glosses on the EN markers follow the entries in enWordToMarker.
const (
	// JA particles (1-16).
	MkWa   uint8 = 1  // は (topic)
	MkGa   uint8 = 2  // が (subject)
	MkWo   uint8 = 3  // を (object)
	MkNi   uint8 = 4  // に (dative/locative)
	MkDe   uint8 = 5  // で (instrumental/locative)
	MkHe   uint8 = 6  // へ (direction)
	MkMo   uint8 = 7  // も (inclusive)
	MkNo   uint8 = 8  // の (genitive)
	MkTo   uint8 = 9  // と (comitative/quotative)
	MkKara uint8 = 10 // から (source)
	MkMade uint8 = 11 // まで (limit)
	MkYori uint8 = 12 // より (comparison)
	MkKedo uint8 = 13 // けど (concessive)
	MkKa   uint8 = 14 // か (question)
	MkYo   uint8 = 15 // よ (assertion)
	MkNe   uint8 = 16 // ね (confirmation)

	// EN structural words (17-42).
	MkThe   uint8 = 17 // the
	MkA     uint8 = 18 // a, an
	MkIn    uint8 = 19 // in
	MkOn    uint8 = 20 // on
	MkAt    uint8 = 21 // at
	MkWith  uint8 = 22 // with
	MkBy    uint8 = 23 // by
	MkFor   uint8 = 24 // for
	MkTo_EN uint8 = 25 // to
	MkOf    uint8 = 26 // of
	MkFrom  uint8 = 27 // from
	MkAs    uint8 = 28 // as
	MkThat  uint8 = 29 // that, which, who
	MkIs    uint8 = 30 // is, am, be
	MkAre   uint8 = 31 // are
	MkWas   uint8 = 32 // was, were
	MkDo    uint8 = 33 // do, plus the other auxiliaries and modals
	MkI     uint8 = 34 // I, me
	MkYou   uint8 = 35 // you
	MkPron3 uint8 = 36 // he, him, she ("her" is MkHerP)
	MkIt_EN uint8 = 37 // it, this
	MkWe_EN uint8 = 38 // we, us
	MkThey_ uint8 = 39 // they, them, these, those
	MkNot   uint8 = 40 // not, n't
	MkAnd   uint8 = 41 // and, or
	MkBut   uint8 = 42 // but, however, although

	// Morph-bit markers (43-46): synthetic particles that carry morph info
	// not naturally expressible in JA surface forms. Used by the renderer
	// to make round-trips lossless.
	MkDef    uint8 = 43 // ★ definiteness (the cat → 猫★)
	MkPlural uint8 = 44 // ☆ plural (books → 本☆)
	MkCopula uint8 = 45 // 〇 copula (is a student → 学生〇)
	Mk3Sg    uint8 = 46 // ◯ 3rd person singular (he eats → 食◯)

	// EN comparative, numbered after the morph-bit markers.
	MkThan uint8 = 47 // than (comparative standard)

	// Code structural keywords (49-65).
	MkIf     uint8 = 49
	MkElse   uint8 = 50
	MkFor_C  uint8 = 51
	MkReturn uint8 = 52
	MkLBrace uint8 = 53
	MkRBrace uint8 = 54
	MkLParen uint8 = 55
	MkRParen uint8 = 56
	MkAssign uint8 = 57
	MkDot    uint8 = 58
	MkComma  uint8 = 59
	MkColon  uint8 = 60
	MkArrow  uint8 = 61
	MkCase   uint8 = 62
	MkSelect uint8 = 63
	MkSpawn  uint8 = 64
	MkChan   uint8 = 65

	// EN possessive determiners (70-76), distinct from subject pronouns.
	// These never become standalone subject slots; they become POSS
	// modifiers of the next noun.
	MkMy     uint8 = 70 // my
	MkYour   uint8 = 71 // your
	MkHis    uint8 = 72 // his
	MkHerP   uint8 = 73 // her (possessive; her/she-objective conflated in EN)
	MkIts    uint8 = 74 // its
	MkOurP   uint8 = 75 // our
	MkTheirP uint8 = 76 // their
)
115
// markerToJA returns the table mapping a marker ID (the array index) to its
// JA surface string. Indices 1-16 hold the natural particles and 43-46 hold
// the synthetic morph markers used for lossless round-trips. Every other
// index (0, the EN markers 17-42, and 47-49) is the empty string.
//
// The table has 50 entries, so marker IDs of 50 and above (code keywords,
// possessives) are outside it and must not be used as an index.
func markerToJA() [50]string {
	return [50]string{
		// Natural particles.
		1:  "\xe3\x81\xaf",             // は
		2:  "\xe3\x81\x8c",             // が
		3:  "\xe3\x82\x92",             // を
		4:  "\xe3\x81\xab",             // に
		5:  "\xe3\x81\xa7",             // で
		6:  "\xe3\x81\xb8",             // へ
		7:  "\xe3\x82\x82",             // も
		8:  "\xe3\x81\xae",             // の
		9:  "\xe3\x81\xa8",             // と
		10: "\xe3\x81\x8b\xe3\x82\x89", // から
		11: "\xe3\x81\xbe\xe3\x81\xa7", // まで
		12: "\xe3\x82\x88\xe3\x82\x8a", // より
		13: "\xe3\x81\x91\xe3\x81\xa9", // けど
		14: "\xe3\x81\x8b",             // か
		15: "\xe3\x82\x88",             // よ
		16: "\xe3\x81\xad",             // ね

		// Synthetic morph markers.
		43: "\xe2\x98\x85", // ★ MkDef
		44: "\xe2\x98\x86", // ☆ MkPlural
		45: "\xe3\x80\x87", // 〇 MkCopula
		46: "\xe2\x97\xaf", // ◯ Mk3Sg
	}
}
147
148 // jaParticleToMarker builds the JA particle string to marker ID map.
149 func jaParticleToMarker() map[string]uint8 {
150 m := map[string]uint8{}
151 tbl := markerToJA()
152 for i := uint8(1); i <= 16; i++ {
153 m[tbl[i]] = i
154 }
155 // Synthetic morph markers (recoverable round-trip).
156 for _, mk := range []uint8{MkDef, MkPlural, MkCopula, Mk3Sg} {
157 m[tbl[mk]] = mk
158 }
159 return m
160 }
161
162 // enWordToMarker maps EN structural words to marker IDs.
163 func enWordToMarker() map[string]uint8 {
164 return map[string]uint8{
165 "the": MkThe, "a": MkA, "an": MkA,
166 "in": MkIn, "on": MkOn, "at": MkAt,
167 "with": MkWith, "by": MkBy, "for": MkFor,
168 "to": MkTo_EN, "of": MkOf, "from": MkFrom,
169 "as": MkAs, "that": MkThat, "which": MkThat, "who": MkThat,
170 "than": MkThan,
171 "is": MkIs, "are": MkAre, "was": MkWas, "were": MkWas,
172 "am": MkIs, "be": MkIs, "been": MkIs, "being": MkIs,
173 "do": MkDo, "does": MkDo, "did": MkDo,
174 "will": MkDo, "would": MkDo, "shall": MkDo, "should": MkDo,
175 "can": MkDo, "could": MkDo, "may": MkDo, "might": MkDo, "must": MkDo,
176 "have": MkDo, "has": MkDo, "had": MkDo,
177 "i": MkI, "me": MkI, "mine": MkI, "myself": MkI,
178 "my": MkMy,
179 "you": MkYou, "yours": MkYou, "yourself": MkYou,
180 "your": MkYour,
181 "he": MkPron3, "him": MkPron3, "himself": MkPron3,
182 "his": MkHis,
183 "she": MkPron3, "hers": MkPron3, "herself": MkPron3,
184 "her": MkHerP,
185 "it": MkIt_EN, "itself": MkIt_EN,
186 "its": MkIts,
187 "we": MkWe_EN, "us": MkWe_EN, "ours": MkWe_EN, "ourselves": MkWe_EN,
188 "our": MkOurP,
189 "they": MkThey_, "them": MkThey_, "theirs": MkThey_, "themselves": MkThey_,
190 "their": MkTheirP,
191 "this": MkIt_EN, "these": MkThey_, "those": MkThey_,
192 "not": MkNot, "n't": MkNot,
193 "and": MkAnd, "or": MkAnd,
194 "but": MkBut, "however": MkBut, "although": MkBut,
195 "about": MkOf, "into": MkIn, "onto": MkOn,
196 "through": MkIn, "over": MkOn, "under": MkAt,
197 "after": MkFrom, "before": MkTo_EN,
198 "between": MkAt, "among": MkAt,
199 "during": MkAt, "until": MkTo_EN, "since": MkFrom,
200 "without": MkWith, "within": MkIn,
201 "around": MkAt, "behind": MkAt, "beside": MkAt,
202 "toward": MkTo_EN, "towards": MkTo_EN,
203 "across": MkIn, "along": MkIn,
204 "against": MkWith,
205 "upon": MkOn,
206 }
207 }
208
209 // PatternKey hashes a pattern byte sequence into a lattice key.
210 func PatternKey(domain uint8, pat []byte) lattice.Key {
211 buf := []byte{:2 + len(pat):2 + len(pat)}
212 buf[0] = domain
213 buf[1] = 'P' // domain separator: 'P' for pattern
214 copy(buf[2:], pat)
215 return lattice.HashKey(buf)
216 }
217
218 // AtomKey hashes a word into a lattice key for atom storage.
219 func AtomKey(domain uint8, word string) lattice.Key {
220 buf := []byte{:2 + len(word):2 + len(word)}
221 buf[0] = domain
222 buf[1] = 'W' // domain separator: 'W' for word/atom
223 copy(buf[2:], []byte(word))
224 return lattice.HashKey(buf)
225 }
226
227 // DeepPatternKey hashes a canonical role sequence into a lattice key.
228 func DeepPatternKey(deepPat []byte) lattice.Key {
229 buf := []byte{:1 + len(deepPat):1 + len(deepPat)}
230 buf[0] = 'D'
231 copy(buf[1:], deepPat)
232 return lattice.HashKey(buf)
233 }
234
235 // CrossPatternKey hashes a cross-domain pattern link.
236 func CrossPatternKey(srcDomain, dstDomain uint8, srcPat, dstPat []byte) lattice.Key {
237 buf := []byte{:3 + len(srcPat) + len(dstPat):3 + len(srcPat) + len(dstPat)}
238 buf[0] = srcDomain
239 buf[1] = dstDomain
240 buf[2] = 'X' // cross-link marker
241 copy(buf[3:], srcPat)
242 copy(buf[3+len(srcPat):], dstPat)
243 return lattice.HashKey(buf)
244 }
245
// RoleHist is the per-atom role frequency distribution: eight uint16
// counters, one per Hist* index declared below.
// Stored in MetaEntry.Extra[0:16].
type RoleHist [8]uint16

// Indices into RoleHist. The order fixes each counter's position in the
// encoded form, so existing entries must not be reordered.
const (
	HistTopic      = iota // 0
	HistSubject           // 1
	HistObject            // 2
	HistVerb              // 3
	HistModifier          // 4
	HistScope             // 5
	HistOperator          // 6
	HistComplement        // 7
)
260
261 // RoleEquiv returns true if two roles should be considered equivalent
262 // for cross-domain atom alignment. Topic is treated as Subject (first pass;
263 // contrastive-wa refinement comes later via valence check).
264 func RoleEquiv(a, b int32) bool {
265 if a == b {
266 return true
267 }
268 na := normalizeRole(a)
269 nb := normalizeRole(b)
270 return na == nb
271 }
272
273 func normalizeRole(r int32) int32 {
274 if r == HistTopic {
275 return HistSubject
276 }
277 return r
278 }
279
280 func (h *RoleHist) Inc(role uint8) {
281 idx := slotToHistIdx(role)
282 if idx < 8 && h[idx] < 0xFFFF {
283 h[idx]++
284 }
285 }
286
287 func (h *RoleHist) Encode(extra *[16]byte) {
288 for i := 0; i < 8; i++ {
289 extra[i*2] = byte(h[i])
290 extra[i*2+1] = byte(h[i] >> 8)
291 }
292 }
293
294 func (h *RoleHist) Decode(extra [16]byte) {
295 for i := 0; i < 8; i++ {
296 h[i] = uint16(extra[i*2]) | uint16(extra[i*2+1])<<8
297 }
298 }
299
300 func (h *RoleHist) DominantRole() uint8 {
301 max := uint16(0)
302 idx := 0
303 for i, v := range h {
304 if v > max {
305 max = v
306 idx = i
307 }
308 }
309 return uint8(idx)
310 }
311
312 func slotToHistIdx(slot uint8) int32 {
313 switch slot {
314 case SlotNoun:
315 return HistObject // default; caller refines to Topic/Subject/Object
316 case SlotVerb:
317 return HistVerb
318 case SlotModifier:
319 return HistModifier
320 case SlotScope:
321 return HistScope
322 case SlotLiteral:
323 return HistComplement
324 case SlotExpr:
325 return HistComplement
326 }
327 return HistComplement
328 }
329
330 // MarkerToOblRole maps a marker ID to its oblique/thematic role.
331 // Returns ORNone if the marker doesn't carry an oblique role
332 // (e.g. determiners, copulas, sentence-final particles).
333 func MarkerToOblRole(mk uint8) uint8 {
334 switch mk {
335 // JA particles
336 case MkNi:
337 return ORGoal // default: motion verb context. Could be ORLoc/ORRecip; verb sem disambiguates.
338 case MkHe:
339 return ORGoal // unambiguously directional
340 case MkDe:
341 return ORLoc // default; can be ORInstr in context
342 case MkKara:
343 return ORSource
344 case MkMade:
345 return ORLimit
346 case MkYori:
347 return ORCompare
348 case MkTo:
349 return ORComit
350 case MkNo:
351 return ORPart
352 // EN prepositions
353 case MkTo_EN:
354 return ORGoal
355 case MkIn, MkOn, MkAt:
356 return ORLoc
357 case MkFrom:
358 return ORSource
359 case MkWith:
360 return ORInstr
361 case MkBy:
362 return ORAgent
363 case MkFor:
364 return ORBenef
365 case MkOf:
366 return ORPart
367 case MkAs:
368 return ORNone // role-like but not a thematic role
369 case MkThan:
370 return ORCompare
371 }
372 return ORNone
373 }
374
375 // MarkerToRole maps a marker ID to the role it assigns to the FOLLOWING slot
376 // (EN prepositions) or PRECEDING slot (JA particles).
377 func MarkerToRole(mk uint8) int32 {
378 switch mk {
379 case MkWa, MkMo:
380 return HistTopic
381 case MkGa:
382 return HistSubject
383 case MkWo:
384 return HistObject
385 case MkNi, MkHe:
386 return HistScope
387 case MkDe:
388 return HistModifier
389 case MkNo:
390 return HistOperator
391 case MkTo:
392 return HistObject
393 case MkKara, MkFrom:
394 return HistScope // SOURCE role conflated with locative
395 case MkMade:
396 return HistScope
397 case MkYori, MkThan:
398 return HistModifier // comparative standard
399 case MkIn, MkOn, MkAt:
400 return HistScope // LOCATION role
401 case MkWith:
402 return HistModifier // INSTRUMENT/COMITATIVE
403 case MkBy:
404 return HistModifier // AGENT in passive, MEANS otherwise
405 case MkFor:
406 return HistModifier // BENEFICIARY/PURPOSE
407 case MkOf:
408 return HistOperator // POSSESSOR/PART
409 case MkTo_EN:
410 return HistScope // GOAL/RECIPIENT - oblique role, not direct object
411 case MkI, MkYou, MkPron3, MkIt_EN, MkWe_EN, MkThey_:
412 return HistSubject
413 case MkIs, MkAre, MkWas, MkDo:
414 return HistVerb
415 case MkNot:
416 return HistModifier
417 case MkAnd, MkBut:
418 return HistSubject
419 }
420 return HistComplement
421 }
422