package iskra import "git.smesh.lol/iskradb/lattice" // Pattern encoding: a sentence skeleton is a sequence of elements. // Each element is either a content SLOT (role placeholder) or a structural MARKER. // // Byte encoding: // 0x80 | role_id = content slot (role in bits 0-6) // 0x00 - 0x7F = marker ID (structural word: particle, preposition, keyword) // // This gives 128 marker IDs and 128 role types. Patterns are typically 3-8 bytes. // Slot roles - what a content slot expects. const ( SlotNoun uint8 = 0x80 // noun/nominal content SlotVerb uint8 = 0x81 // verb/action SlotModifier uint8 = 0x82 // adjective/adverb SlotScope uint8 = 0x83 // scope body (block, clause) SlotLiteral uint8 = 0x84 // literal value SlotExpr uint8 = 0x85 // expression (code) or clause (natural) ) func IsSlot(b uint8) bool { return b&0x80 != 0 } func SlotRole(b uint8) uint8 { return b & 0x7F } // Marker IDs - structural words that define the skeleton. // JA particles (1-16), EN prepositions/determiners (17-48), code keywords (49-80). 
const (
	// JA particles (1-16).
	MkWa   uint8 = 1  // は (topic)
	MkGa   uint8 = 2  // が (subject)
	MkWo   uint8 = 3  // を (object)
	MkNi   uint8 = 4  // に (dative/locative)
	MkDe   uint8 = 5  // で (instrumental/locative)
	MkHe   uint8 = 6  // へ (direction)
	MkMo   uint8 = 7  // も (inclusive)
	MkNo   uint8 = 8  // の (genitive)
	MkTo   uint8 = 9  // と (comitative/quotative)
	MkKara uint8 = 10 // から (source)
	MkMade uint8 = 11 // まで (limit)
	MkYori uint8 = 12 // より (comparison)
	MkKedo uint8 = 13 // けど (concessive)
	MkKa   uint8 = 14 // か (question)
	MkYo   uint8 = 15 // よ (assertion)
	MkNe   uint8 = 16 // ね (confirmation)

	// EN structural words (17-42).
	MkThe   uint8 = 17
	MkA     uint8 = 18
	MkIn    uint8 = 19
	MkOn    uint8 = 20
	MkAt    uint8 = 21
	MkWith  uint8 = 22
	MkBy    uint8 = 23
	MkFor   uint8 = 24
	MkTo_EN uint8 = 25
	MkOf    uint8 = 26
	MkFrom  uint8 = 27
	MkAs    uint8 = 28
	MkThat  uint8 = 29
	MkIs    uint8 = 30
	MkAre   uint8 = 31
	MkWas   uint8 = 32
	MkDo    uint8 = 33
	MkI     uint8 = 34
	MkYou   uint8 = 35
	MkPron3 uint8 = 36 // he/she/him/her
	MkIt_EN uint8 = 37
	MkWe_EN uint8 = 38
	MkThey_ uint8 = 39
	MkNot   uint8 = 40
	MkAnd   uint8 = 41
	MkBut   uint8 = 42

	// Morph-bit markers (43-46): synthetic particles that carry morph info
	// not naturally expressible in JA surface forms. Used by the renderer
	// to make round-trips lossless.
	MkDef    uint8 = 43 // ★ definiteness (the cat → 猫★)
	MkPlural uint8 = 44 // ☆ plural (books → 本☆)
	MkCopula uint8 = 45 // 〇 copula (is a student → 学生〇)
	Mk3Sg    uint8 = 46 // ◯ 3rd person singular (he eats → 食◯)

	// EN structural word that sits after the morph-bit markers.
	MkThan uint8 = 47 // than (comparative standard)

	// Code structural keywords (49-65). ID 48 is not assigned here.
	MkIf     uint8 = 49
	MkElse   uint8 = 50
	MkFor_C  uint8 = 51
	MkReturn uint8 = 52
	MkLBrace uint8 = 53
	MkRBrace uint8 = 54
	MkLParen uint8 = 55
	MkRParen uint8 = 56
	MkAssign uint8 = 57
	MkDot    uint8 = 58
	MkComma  uint8 = 59
	MkColon  uint8 = 60
	MkArrow  uint8 = 61
	MkCase   uint8 = 62
	MkSelect uint8 = 63
	MkSpawn  uint8 = 64
	MkChan   uint8 = 65

	// EN possessive determiners (70-76), distinct from subject pronouns.
	// These never become standalone subject slots; they become POSS modifiers
	// of the next noun.
	MkMy     uint8 = 70 // my
	MkYour   uint8 = 71 // your
	MkHis    uint8 = 72 // his
	MkHerP   uint8 = 73 // her (possessive; her/she-objective conflated in EN)
	MkIts    uint8 = 74 // its
	MkOurP   uint8 = 75 // our
	MkTheirP uint8 = 76 // their
)

// markerToJA returns the table that maps a marker ID to its JA surface string.
//
// Only two ranges are populated: the JA particles (MkWa..MkNe, IDs 1-16) and
// the synthetic morph markers (MkDef..Mk3Sg, IDs 43-46) that exist for
// lossless round-trips. Every other index, including the EN structural words
// at 17-42, holds the empty string.
//
// The table has 50 entries, so it covers IDs 0-49 only; marker IDs of 50 and
// above (code keywords, possessive determiners) must not be used as an index.
func markerToJA() [50]string {
	// Keyed by marker constant so an entry cannot drift away from its ID.
	return [50]string{
		MkWa:   "\xe3\x81\xaf",             // は
		MkGa:   "\xe3\x81\x8c",             // が
		MkWo:   "\xe3\x82\x92",             // を
		MkNi:   "\xe3\x81\xab",             // に
		MkDe:   "\xe3\x81\xa7",             // で
		MkHe:   "\xe3\x81\xb8",             // へ
		MkMo:   "\xe3\x82\x82",             // も
		MkNo:   "\xe3\x81\xae",             // の
		MkTo:   "\xe3\x81\xa8",             // と
		MkKara: "\xe3\x81\x8b\xe3\x82\x89", // から
		MkMade: "\xe3\x81\xbe\xe3\x81\xa7", // まで
		MkYori: "\xe3\x82\x88\xe3\x82\x8a", // より
		MkKedo: "\xe3\x81\x91\xe3\x81\xa9", // けど
		MkKa:   "\xe3\x81\x8b",             // か
		MkYo:   "\xe3\x82\x88",             // よ
		MkNe:   "\xe3\x81\xad",             // ね

		MkDef:    "\xe2\x98\x85", // ★
		MkPlural: "\xe2\x98\x86", // ☆
		MkCopula: "\xe3\x80\x87", // 〇
		Mk3Sg:    "\xe2\x97\xaf", // ◯
	}
}

// jaParticleToMarker builds the reverse of markerToJA: JA surface string to
// marker ID. It contains the 16 JA particles and the 4 synthetic morph
// markers; the empty placeholder entries of the table are not inserted.
func jaParticleToMarker() map[string]uint8 {
	tbl := markerToJA()
	m := map[string]uint8{}
	// Real JA particles occupy the contiguous ID range MkWa..MkNe.
	for mk := MkWa; mk <= MkNe; mk++ {
		m[tbl[mk]] = mk
	}
	// Synthetic morph markers (recoverable round-trip).
	for _, mk := range []uint8{MkDef, MkPlural, MkCopula, Mk3Sg} {
		m[tbl[mk]] = mk
	}
	return m
}

// enWordToMarker maps EN structural words to marker IDs.
func enWordToMarker() map[string]uint8 {
	// Returns a freshly built map on every call. Keys are written in lower
	// case; several words deliberately fold onto one shared marker
	// (e.g. "an" -> MkA, "or" -> MkAnd, "which"/"who" -> MkThat).
	return map[string]uint8{
		// Determiners and core prepositions.
		"the": MkThe, "a": MkA, "an": MkA,
		"in": MkIn, "on": MkOn, "at": MkAt,
		"with": MkWith, "by": MkBy, "for": MkFor,
		"to": MkTo_EN, "of": MkOf, "from": MkFrom,
		"as": MkAs, "that": MkThat, "which": MkThat, "who": MkThat,
		"than": MkThan,
		// Forms of "be".
		"is": MkIs, "are": MkAre, "was": MkWas, "were": MkWas,
		"am": MkIs, "be": MkIs, "been": MkIs, "being": MkIs,
		// Auxiliaries and modals, all folded onto MkDo.
		"do": MkDo, "does": MkDo, "did": MkDo,
		"will": MkDo, "would": MkDo, "shall": MkDo, "should": MkDo,
		"can": MkDo, "could": MkDo, "may": MkDo, "might": MkDo,
		"must": MkDo, "have": MkDo, "has": MkDo, "had": MkDo,
		// Pronouns; the possessive determiner of each person has its own marker.
		"i": MkI, "me": MkI, "mine": MkI, "myself": MkI,
		"my": MkMy,
		"you": MkYou, "yours": MkYou, "yourself": MkYou,
		"your": MkYour,
		"he": MkPron3, "him": MkPron3, "himself": MkPron3,
		"his": MkHis,
		"she": MkPron3, "hers": MkPron3, "herself": MkPron3,
		"her": MkHerP,
		"it": MkIt_EN, "itself": MkIt_EN,
		"its": MkIts,
		"we": MkWe_EN, "us": MkWe_EN, "ours": MkWe_EN, "ourselves": MkWe_EN,
		"our": MkOurP,
		"they": MkThey_, "them": MkThey_, "theirs": MkThey_, "themselves": MkThey_,
		"their": MkTheirP,
		// Demonstratives reuse the pronoun markers.
		"this": MkIt_EN, "these": MkThey_, "those": MkThey_,
		// Negation and connectives.
		"not": MkNot, "n't": MkNot,
		"and": MkAnd, "or": MkAnd, "but": MkBut,
		"however": MkBut, "although": MkBut,
		// Further prepositions, each mapped onto one of the core preposition markers.
		"about": MkOf, "into": MkIn, "onto": MkOn,
		"through": MkIn, "over": MkOn, "under": MkAt,
		"after": MkFrom, "before": MkTo_EN,
		"between": MkAt, "among": MkAt, "during": MkAt,
		"until": MkTo_EN, "since": MkFrom,
		"without": MkWith, "within": MkIn,
		"around": MkAt, "behind": MkAt, "beside": MkAt,
		"toward": MkTo_EN, "towards": MkTo_EN,
		"across": MkIn, "along": MkIn,
		"against": MkWith, "upon": MkOn,
	}
}

// PatternKey hashes a pattern byte sequence into a lattice key.
//
// The hashed buffer is laid out as [domain, 'P', pat...] and passed to
// lattice.HashKey.
func PatternKey(domain uint8, pat []byte) lattice.Key {
	// NOTE(review): `[]byte{:n:n}` is not standard Go syntax; presumably it
	// allocates a slice of length and capacity n in this toolchain - confirm.
	buf := []byte{:2 + len(pat):2 + len(pat)}
	buf[0] = domain
	buf[1] = 'P' // domain separator: 'P' for pattern
	copy(buf[2:], pat)
	return lattice.HashKey(buf)
}

// AtomKey hashes a word into a lattice key for atom storage.
func AtomKey(domain uint8, word string) lattice.Key {
	// Buffer layout: [domain, 'W', word bytes...], hashed with lattice.HashKey.
	// NOTE(review): `[]byte{:n:n}` is not standard Go syntax; presumably it
	// allocates a slice of length and capacity n in this toolchain - confirm.
	buf := []byte{:2 + len(word):2 + len(word)}
	buf[0] = domain
	buf[1] = 'W' // domain separator: 'W' for word/atom
	copy(buf[2:], []byte(word))
	return lattice.HashKey(buf)
}

// DeepPatternKey hashes a canonical role sequence into a lattice key.
//
// Unlike PatternKey and AtomKey it takes no domain: the hashed buffer is
// ['D', deepPat...].
func DeepPatternKey(deepPat []byte) lattice.Key {
	buf := []byte{:1 + len(deepPat):1 + len(deepPat)}
	buf[0] = 'D' // tag byte for deep patterns
	copy(buf[1:], deepPat)
	return lattice.HashKey(buf)
}

// CrossPatternKey hashes a cross-domain pattern link.
//
// The hashed buffer is [srcDomain, dstDomain, 'X', srcPat..., dstPat...].
//
// NOTE(review): srcPat and dstPat are concatenated with no length prefix or
// separator, so two different (srcPat, dstPat) splits of the same byte string
// yield the same buffer and therefore the same key - confirm whether callers
// can ever produce such pairs.
func CrossPatternKey(srcDomain, dstDomain uint8, srcPat, dstPat []byte) lattice.Key {
	buf := []byte{:3 + len(srcPat) + len(dstPat):3 + len(srcPat) + len(dstPat)}
	buf[0] = srcDomain
	buf[1] = dstDomain
	buf[2] = 'X' // cross-link marker
	copy(buf[3:], srcPat)
	copy(buf[3+len(srcPat):], dstPat)
	return lattice.HashKey(buf)
}

// RoleHist is the per-atom role frequency distribution.
// Stored in MetaEntry.Extra[0:16] as 8 uint16 counters.
type RoleHist [8]uint16

// Indices of the eight RoleHist counters.
const (
	HistTopic      = 0
	HistSubject    = 1
	HistObject     = 2
	HistVerb       = 3
	HistModifier   = 4
	HistScope      = 5
	HistOperator   = 6
	HistComplement = 7
)

// RoleEquiv returns true if two roles should be considered equivalent
// for cross-domain atom alignment. Topic is treated as Subject (first pass;
// contrastive-wa refinement comes later via valence check).
func RoleEquiv(a, b int32) bool { if a == b { return true } na := normalizeRole(a) nb := normalizeRole(b) return na == nb } func normalizeRole(r int32) int32 { if r == HistTopic { return HistSubject } return r } func (h *RoleHist) Inc(role uint8) { idx := slotToHistIdx(role) if idx < 8 && h[idx] < 0xFFFF { h[idx]++ } } func (h *RoleHist) Encode(extra *[16]byte) { for i := 0; i < 8; i++ { extra[i*2] = byte(h[i]) extra[i*2+1] = byte(h[i] >> 8) } } func (h *RoleHist) Decode(extra [16]byte) { for i := 0; i < 8; i++ { h[i] = uint16(extra[i*2]) | uint16(extra[i*2+1])<<8 } } func (h *RoleHist) DominantRole() uint8 { max := uint16(0) idx := 0 for i, v := range h { if v > max { max = v idx = i } } return uint8(idx) } func slotToHistIdx(slot uint8) int32 { switch slot { case SlotNoun: return HistObject // default; caller refines to Topic/Subject/Object case SlotVerb: return HistVerb case SlotModifier: return HistModifier case SlotScope: return HistScope case SlotLiteral: return HistComplement case SlotExpr: return HistComplement } return HistComplement } // MarkerToOblRole maps a marker ID to its oblique/thematic role. // Returns ORNone if the marker doesn't carry an oblique role // (e.g. determiners, copulas, sentence-final particles). func MarkerToOblRole(mk uint8) uint8 { switch mk { // JA particles case MkNi: return ORGoal // default: motion verb context. Could be ORLoc/ORRecip; verb sem disambiguates. 
case MkHe: return ORGoal // unambiguously directional case MkDe: return ORLoc // default; can be ORInstr in context case MkKara: return ORSource case MkMade: return ORLimit case MkYori: return ORCompare case MkTo: return ORComit case MkNo: return ORPart // EN prepositions case MkTo_EN: return ORGoal case MkIn, MkOn, MkAt: return ORLoc case MkFrom: return ORSource case MkWith: return ORInstr case MkBy: return ORAgent case MkFor: return ORBenef case MkOf: return ORPart case MkAs: return ORNone // role-like but not a thematic role case MkThan: return ORCompare } return ORNone } // MarkerToRole maps a marker ID to the role it assigns to the FOLLOWING slot // (EN prepositions) or PRECEDING slot (JA particles). func MarkerToRole(mk uint8) int32 { switch mk { case MkWa, MkMo: return HistTopic case MkGa: return HistSubject case MkWo: return HistObject case MkNi, MkHe: return HistScope case MkDe: return HistModifier case MkNo: return HistOperator case MkTo: return HistObject case MkKara, MkFrom: return HistScope // SOURCE role conflated with locative case MkMade: return HistScope case MkYori, MkThan: return HistModifier // comparative standard case MkIn, MkOn, MkAt: return HistScope // LOCATION role case MkWith: return HistModifier // INSTRUMENT/COMITATIVE case MkBy: return HistModifier // AGENT in passive, MEANS otherwise case MkFor: return HistModifier // BENEFICIARY/PURPOSE case MkOf: return HistOperator // POSSESSOR/PART case MkTo_EN: return HistScope // GOAL/RECIPIENT - oblique role, not direct object case MkI, MkYou, MkPron3, MkIt_EN, MkWe_EN, MkThey_: return HistSubject case MkIs, MkAre, MkWas, MkDo: return HistVerb case MkNot: return HistModifier case MkAnd, MkBut: return HistSubject } return HistComplement }