pattern.mx raw

   1  package iskra
   2  
   3  import "git.smesh.lol/iskradb/lattice"
   4  
   5  // Pattern encoding: a sentence skeleton is a sequence of elements.
   6  // Each element is either a content SLOT (role placeholder) or a structural MARKER.
   7  //
   8  // Byte encoding:
   9  //   0x80 | role_id  = content slot (role in bits 0-6)
  10  //   0x00 - 0x7F     = marker ID (structural word: particle, preposition, keyword)
  11  //
  12  // This gives 128 marker IDs and 128 role types. Patterns are typically 3-8 bytes.
  13  
// Slot roles - what a content slot expects.
//
// Every value already has the high bit (0x80) set, so each constant is the
// complete pattern byte for that slot: IsSlot reports true for all of them and
// SlotRole recovers the low-bit role id (0-5).
const (
	SlotNoun     uint8 = 0x80 // noun/nominal content
	SlotVerb     uint8 = 0x81 // verb/action
	SlotModifier uint8 = 0x82 // adjective/adverb
	SlotScope    uint8 = 0x83 // scope body (block, clause)
	SlotLiteral  uint8 = 0x84 // literal value
	SlotExpr     uint8 = 0x85 // expression (code) or clause (natural)
)
  23  
  24  func IsSlot(b uint8) bool { return b&0x80 != 0 }
  25  func SlotRole(b uint8) uint8 { return b & 0x7F }
  26  
// Marker IDs - structural words that define the skeleton.
//
// ID ranges as assigned in this block:
//   1-16   JA particles
//   17-42  EN structural words (MkThan is numbered 47, out of sequence)
//   43-46  synthetic morph-bit markers
//   49-65  code structural keywords
//   70-76  EN possessive determiners
// IDs 0, 48, 66-69 and 77-127 are not assigned here.
const (
	// JA particles
	MkWa    uint8 = 1  // は (topic)
	MkGa    uint8 = 2  // が (subject)
	MkWo    uint8 = 3  // を (object)
	MkNi    uint8 = 4  // に (dative/locative)
	MkDe    uint8 = 5  // で (instrumental/locative)
	MkHe    uint8 = 6  // へ (direction)
	MkMo    uint8 = 7  // も (inclusive)
	MkNo    uint8 = 8  // の (genitive)
	MkTo    uint8 = 9  // と (comitative/quotative)
	MkKara  uint8 = 10 // から (source)
	MkMade  uint8 = 11 // まで (limit)
	MkYori  uint8 = 12 // より (comparison)
	MkKedo  uint8 = 13 // けど (concessive)
	MkKa    uint8 = 14 // か (question)
	MkYo    uint8 = 15 // よ (assertion)
	MkNe    uint8 = 16 // ね (confirmation)

	// EN structural words
	MkThe   uint8 = 17
	MkA     uint8 = 18
	MkIn    uint8 = 19
	MkOn    uint8 = 20
	MkAt    uint8 = 21
	MkWith  uint8 = 22
	MkBy    uint8 = 23
	MkFor   uint8 = 24
	MkTo_EN uint8 = 25
	MkOf    uint8 = 26
	MkFrom  uint8 = 27
	MkAs    uint8 = 28
	MkThat  uint8 = 29
	MkIs    uint8 = 30
	MkAre   uint8 = 31
	MkWas   uint8 = 32
	MkDo    uint8 = 33
	MkI     uint8 = 34
	MkYou   uint8 = 35
	MkPron3 uint8 = 36 // he/she/him/her
	MkIt_EN uint8 = 37
	MkWe_EN uint8 = 38
	MkThey_ uint8 = 39
	MkNot   uint8 = 40
	MkAnd   uint8 = 41
	MkBut   uint8 = 42
	MkThan  uint8 = 47 // than (comparative standard); numbered after the morph-bit markers below

	// Morph-bit markers: synthetic particles that carry morph info
	// not naturally expressible in JA surface forms. Used by the renderer
	// to make round-trips lossless.
	MkDef    uint8 = 43 // ★ definiteness (the cat → 猫★)
	MkPlural uint8 = 44 // ☆ plural (books → 本☆)
	MkCopula uint8 = 45 // 〇 copula (is a student → 学生〇)
	Mk3Sg    uint8 = 46 // ◯ 3rd person singular (he eats → 食◯)

	// EN possessive determiners (distinct from subject pronouns).
	// These never become standalone subject slots; they become POSS modifiers
	// of the next noun.
	MkMy    uint8 = 70 // my
	MkYour  uint8 = 71 // your
	MkHis   uint8 = 72 // his
	MkHerP  uint8 = 73 // her (possessive; her/she-objective conflated in EN)
	MkIts   uint8 = 74 // its
	MkOurP  uint8 = 75 // our
	MkTheirP uint8 = 76 // their

	// Code structural keywords
	MkIf     uint8 = 49
	MkElse   uint8 = 50
	MkFor_C  uint8 = 51
	MkReturn uint8 = 52
	MkLBrace uint8 = 53
	MkRBrace uint8 = 54
	MkLParen uint8 = 55
	MkRParen uint8 = 56
	MkAssign uint8 = 57
	MkDot    uint8 = 58
	MkComma  uint8 = 59
	MkColon  uint8 = 60
	MkArrow  uint8 = 61
	MkCase   uint8 = 62
	MkSelect uint8 = 63
	MkSpawn  uint8 = 64
	MkChan   uint8 = 65
)
 115  
// markerToJA returns a table, indexed by marker ID, of JA surface strings.
//
// Entries 1-16 are the natural JA particles and entries 43-46 are the
// synthetic morph markers used for lossless round-trip. Every other entry
// (0, 17-42, 47-49) is the empty string.
//
// The table only covers IDs 0-49. Marker IDs of 50 and above (most code
// keywords and the EN possessive determiners) are outside the array, so
// indexing with them is out of range.
// NOTE(review): confirm that callers bounds-check before indexing.
func markerToJA() [50]string {
	return [50]string{
	"", // 0: unused
	"\xe3\x81\xaf",             // 1: は
	"\xe3\x81\x8c",             // 2: が
	"\xe3\x82\x92",             // 3: を
	"\xe3\x81\xab",             // 4: に
	"\xe3\x81\xa7",             // 5: で
	"\xe3\x81\xb8",             // 6: へ
	"\xe3\x82\x82",             // 7: も
	"\xe3\x81\xae",             // 8: の
	"\xe3\x81\xa8",             // 9: と
	"\xe3\x81\x8b\xe3\x82\x89", // 10: から
	"\xe3\x81\xbe\xe3\x81\xa7", // 11: まで
	"\xe3\x82\x88\xe3\x82\x8a", // 12: より
	"\xe3\x81\x91\xe3\x81\xa9", // 13: けど
	"\xe3\x81\x8b",             // 14: か
	"\xe3\x82\x88",             // 15: よ
	"\xe3\x81\xad",             // 16: ね
	"", "", "", "", "", "", "", "", "", "", // 17-26 (EN markers, no JA equivalent)
	"", "", "", "", "", "", "", "", "", "", // 27-36
	"", "", "", "", "", "", // 37-42
	"\xe2\x98\x85", // 43: ★ MkDef
	"\xe2\x98\x86", // 44: ☆ MkPlural
	"\xe3\x80\x87", // 45: 〇 MkCopula
	"\xe2\x97\xaf", // 46: ◯ Mk3Sg
	"", "", "",     // 47-49 (47 is MkThan, 49 is MkIf; no JA string)
	}
}
 147  
 148  // jaParticleToMarker builds the JA particle string to marker ID map.
 149  func jaParticleToMarker() map[string]uint8 {
 150  	m := map[string]uint8{}
 151  	tbl := markerToJA()
 152  	for i := uint8(1); i <= 16; i++ {
 153  		m[tbl[i]] = i
 154  	}
 155  	// Synthetic morph markers (recoverable round-trip).
 156  	for _, mk := range []uint8{MkDef, MkPlural, MkCopula, Mk3Sg} {
 157  		m[tbl[mk]] = mk
 158  	}
 159  	return m
 160  }
 161  
 162  // enWordToMarker maps EN structural words to marker IDs.
 163  func enWordToMarker() map[string]uint8 {
 164  	return map[string]uint8{
 165  		"the": MkThe, "a": MkA, "an": MkA,
 166  		"in": MkIn, "on": MkOn, "at": MkAt,
 167  		"with": MkWith, "by": MkBy, "for": MkFor,
 168  		"to": MkTo_EN, "of": MkOf, "from": MkFrom,
 169  		"as": MkAs, "that": MkThat, "which": MkThat, "who": MkThat,
 170  		"than": MkThan,
 171  		"is": MkIs, "are": MkAre, "was": MkWas, "were": MkWas,
 172  		"am": MkIs, "be": MkIs, "been": MkIs, "being": MkIs,
 173  		"do": MkDo, "does": MkDo, "did": MkDo,
 174  		"will": MkDo, "would": MkDo, "shall": MkDo, "should": MkDo,
 175  		"can": MkDo, "could": MkDo, "may": MkDo, "might": MkDo, "must": MkDo,
 176  		"have": MkDo, "has": MkDo, "had": MkDo,
 177  		"i": MkI, "me": MkI, "mine": MkI, "myself": MkI,
 178  		"my": MkMy,
 179  		"you": MkYou, "yours": MkYou, "yourself": MkYou,
 180  		"your": MkYour,
 181  		"he": MkPron3, "him": MkPron3, "himself": MkPron3,
 182  		"his": MkHis,
 183  		"she": MkPron3, "hers": MkPron3, "herself": MkPron3,
 184  		"her": MkHerP,
 185  		"it": MkIt_EN, "itself": MkIt_EN,
 186  		"its": MkIts,
 187  		"we": MkWe_EN, "us": MkWe_EN, "ours": MkWe_EN, "ourselves": MkWe_EN,
 188  		"our": MkOurP,
 189  		"they": MkThey_, "them": MkThey_, "theirs": MkThey_, "themselves": MkThey_,
 190  		"their": MkTheirP,
 191  		"this": MkIt_EN, "these": MkThey_, "those": MkThey_,
 192  		"not": MkNot, "n't": MkNot,
 193  		"and": MkAnd, "or": MkAnd,
 194  		"but": MkBut, "however": MkBut, "although": MkBut,
 195  		"about": MkOf, "into": MkIn, "onto": MkOn,
 196  		"through": MkIn, "over": MkOn, "under": MkAt,
 197  		"after": MkFrom, "before": MkTo_EN,
 198  		"between": MkAt, "among": MkAt,
 199  		"during": MkAt, "until": MkTo_EN, "since": MkFrom,
 200  		"without": MkWith, "within": MkIn,
 201  		"around": MkAt, "behind": MkAt, "beside": MkAt,
 202  		"toward": MkTo_EN, "towards": MkTo_EN,
 203  		"across": MkIn, "along": MkIn,
 204  		"against": MkWith,
 205  		"upon": MkOn,
 206  	}
 207  }
 208  
// PatternKey hashes a pattern byte sequence into a lattice key.
//
// The hashed buffer is laid out as [domain, 'P', pat...]. The fixed 'P' in
// byte 1 makes the hash input differ from AtomKey's, which writes 'W' in the
// same position.
func PatternKey(domain uint8, pat []byte) lattice.Key {
	// NOTE(review): dialect slice literal; presumably yields a []byte with
	// length and capacity 2+len(pat) - confirm against the language reference.
	buf := []byte{:2 + len(pat):2 + len(pat)}
	buf[0] = domain
	buf[1] = 'P' // domain separator: 'P' for pattern
	copy(buf[2:], pat)
	return lattice.HashKey(buf)
}
 217  
// AtomKey hashes a word into a lattice key for atom storage.
//
// The hashed buffer is laid out as [domain, 'W', word bytes...]. The fixed 'W'
// in byte 1 makes the hash input differ from PatternKey's, which writes 'P' in
// the same position.
func AtomKey(domain uint8, word string) lattice.Key {
	// NOTE(review): dialect slice literal; presumably yields a []byte with
	// length and capacity 2+len(word) - confirm against the language reference.
	buf := []byte{:2 + len(word):2 + len(word)}
	buf[0] = domain
	buf[1] = 'W' // domain separator: 'W' for word/atom
	copy(buf[2:], []byte(word))
	return lattice.HashKey(buf)
}
 226  
// DeepPatternKey hashes a canonical role sequence into a lattice key.
//
// The hashed buffer is laid out as ['D', deepPat...]. Unlike PatternKey and
// AtomKey there is no domain byte: the key depends only on deepPat.
func DeepPatternKey(deepPat []byte) lattice.Key {
	// NOTE(review): dialect slice literal; presumably yields a []byte with
	// length and capacity 1+len(deepPat) - confirm against the language reference.
	buf := []byte{:1 + len(deepPat):1 + len(deepPat)}
	buf[0] = 'D'
	copy(buf[1:], deepPat)
	return lattice.HashKey(buf)
}
 234  
// CrossPatternKey hashes a cross-domain pattern link.
//
// The hashed buffer is laid out as [srcDomain, dstDomain, 'X', srcPat..., dstPat...].
//
// NOTE(review): srcPat and dstPat are concatenated with no length prefix or
// delimiter, so two links whose patterns concatenate to the same bytes (e.g.
// src={a,b} dst={c} versus src={a} dst={b,c}) produce the same hash input.
// Changing the layout would change every existing key, so it is only flagged
// here; confirm whether the ambiguity matters for callers.
func CrossPatternKey(srcDomain, dstDomain uint8, srcPat, dstPat []byte) lattice.Key {
	// NOTE(review): dialect slice literal; presumably yields a []byte with
	// length and capacity 3+len(srcPat)+len(dstPat) - confirm.
	buf := []byte{:3 + len(srcPat) + len(dstPat):3 + len(srcPat) + len(dstPat)}
	buf[0] = srcDomain
	buf[1] = dstDomain
	buf[2] = 'X' // cross-link marker
	copy(buf[3:], srcPat)
	copy(buf[3+len(srcPat):], dstPat) // dstPat starts right after srcPat
	return lattice.HashKey(buf)
}
 245  
// RoleHist is the per-atom role frequency distribution.
// Stored in MetaEntry.Extra[0:16] as 8 uint16 counters.
//
// Counters are indexed by the Hist* constants. Encode and Decode serialize
// each counter as two bytes, low byte first, and Inc saturates a counter at
// 0xFFFF instead of wrapping.
type RoleHist [8]uint16
 249  
// Role indices. These are the values returned by slotToHistIdx and
// MarkerToRole; slotToHistIdx results are used by RoleHist.Inc to pick the
// counter to bump.
const (
	HistTopic      = 0 // folded into HistSubject by normalizeRole
	HistSubject    = 1
	HistObject     = 2
	HistVerb       = 3
	HistModifier   = 4
	HistScope      = 5
	HistOperator   = 6
	HistComplement = 7 // also the fallback for unrecognized slots/markers
)
 260  
 261  // RoleEquiv returns true if two roles should be considered equivalent
 262  // for cross-domain atom alignment. Topic is treated as Subject (first pass;
 263  // contrastive-wa refinement comes later via valence check).
 264  func RoleEquiv(a, b int32) bool {
 265  	if a == b {
 266  		return true
 267  	}
 268  	na := normalizeRole(a)
 269  	nb := normalizeRole(b)
 270  	return na == nb
 271  }
 272  
 273  func normalizeRole(r int32) int32 {
 274  	if r == HistTopic {
 275  		return HistSubject
 276  	}
 277  	return r
 278  }
 279  
 280  func (h *RoleHist) Inc(role uint8) {
 281  	idx := slotToHistIdx(role)
 282  	if idx < 8 && h[idx] < 0xFFFF {
 283  		h[idx]++
 284  	}
 285  }
 286  
 287  func (h *RoleHist) Encode(extra *[16]byte) {
 288  	for i := 0; i < 8; i++ {
 289  		extra[i*2] = byte(h[i])
 290  		extra[i*2+1] = byte(h[i] >> 8)
 291  	}
 292  }
 293  
 294  func (h *RoleHist) Decode(extra [16]byte) {
 295  	for i := 0; i < 8; i++ {
 296  		h[i] = uint16(extra[i*2]) | uint16(extra[i*2+1])<<8
 297  	}
 298  }
 299  
 300  func (h *RoleHist) DominantRole() uint8 {
 301  	max := uint16(0)
 302  	idx := 0
 303  	for i, v := range h {
 304  		if v > max {
 305  			max = v
 306  			idx = i
 307  		}
 308  	}
 309  	return uint8(idx)
 310  }
 311  
 312  func slotToHistIdx(slot uint8) int32 {
 313  	switch slot {
 314  	case SlotNoun:
 315  		return HistObject // default; caller refines to Topic/Subject/Object
 316  	case SlotVerb:
 317  		return HistVerb
 318  	case SlotModifier:
 319  		return HistModifier
 320  	case SlotScope:
 321  		return HistScope
 322  	case SlotLiteral:
 323  		return HistComplement
 324  	case SlotExpr:
 325  		return HistComplement
 326  	}
 327  	return HistComplement
 328  }
 329  
 330  // MarkerToOblRole maps a marker ID to its oblique/thematic role.
 331  // Returns ORNone if the marker doesn't carry an oblique role
 332  // (e.g. determiners, copulas, sentence-final particles).
 333  func MarkerToOblRole(mk uint8) uint8 {
 334  	switch mk {
 335  	// JA particles
 336  	case MkNi:
 337  		return ORGoal // default: motion verb context. Could be ORLoc/ORRecip; verb sem disambiguates.
 338  	case MkHe:
 339  		return ORGoal // unambiguously directional
 340  	case MkDe:
 341  		return ORLoc // default; can be ORInstr in context
 342  	case MkKara:
 343  		return ORSource
 344  	case MkMade:
 345  		return ORLimit
 346  	case MkYori:
 347  		return ORCompare
 348  	case MkTo:
 349  		return ORComit
 350  	case MkNo:
 351  		return ORPart
 352  	// EN prepositions
 353  	case MkTo_EN:
 354  		return ORGoal
 355  	case MkIn, MkOn, MkAt:
 356  		return ORLoc
 357  	case MkFrom:
 358  		return ORSource
 359  	case MkWith:
 360  		return ORInstr
 361  	case MkBy:
 362  		return ORAgent
 363  	case MkFor:
 364  		return ORBenef
 365  	case MkOf:
 366  		return ORPart
 367  	case MkAs:
 368  		return ORNone // role-like but not a thematic role
 369  	case MkThan:
 370  		return ORCompare
 371  	}
 372  	return ORNone
 373  }
 374  
 375  // MarkerToRole maps a marker ID to the role it assigns to the FOLLOWING slot
 376  // (EN prepositions) or PRECEDING slot (JA particles).
 377  func MarkerToRole(mk uint8) int32 {
 378  	switch mk {
 379  	case MkWa, MkMo:
 380  		return HistTopic
 381  	case MkGa:
 382  		return HistSubject
 383  	case MkWo:
 384  		return HistObject
 385  	case MkNi, MkHe:
 386  		return HistScope
 387  	case MkDe:
 388  		return HistModifier
 389  	case MkNo:
 390  		return HistOperator
 391  	case MkTo:
 392  		return HistObject
 393  	case MkKara, MkFrom:
 394  		return HistScope // SOURCE role conflated with locative
 395  	case MkMade:
 396  		return HistScope
 397  	case MkYori, MkThan:
 398  		return HistModifier // comparative standard
 399  	case MkIn, MkOn, MkAt:
 400  		return HistScope // LOCATION role
 401  	case MkWith:
 402  		return HistModifier // INSTRUMENT/COMITATIVE
 403  	case MkBy:
 404  		return HistModifier // AGENT in passive, MEANS otherwise
 405  	case MkFor:
 406  		return HistModifier // BENEFICIARY/PURPOSE
 407  	case MkOf:
 408  		return HistOperator // POSSESSOR/PART
 409  	case MkTo_EN:
 410  		return HistScope // GOAL/RECIPIENT - oblique role, not direct object
 411  	case MkI, MkYou, MkPron3, MkIt_EN, MkWe_EN, MkThey_:
 412  		return HistSubject
 413  	case MkIs, MkAre, MkWas, MkDo:
 414  		return HistVerb
 415  	case MkNot:
 416  		return HistModifier
 417  	case MkAnd, MkBut:
 418  		return HistSubject
 419  	}
 420  	return HistComplement
 421  }
 422