coord.mx raw

   1  package iskra
   2  
   3  import (
   4  	"crypto/siphash"
   5  
   6  	"git.smesh.lol/iskradb/lattice"
   7  )
   8  
// 64-bit coordinate layout:
//   bits 63-48  semantic   (16 bits): 8 subject|object category pairs, 2 bits each
//   bits 47-32  (reserved)
//   bits 31-29  grammatical (3 bits): syntactic role
//   bits 28-25  cooccur     (4 bits): prev_type(2) + next_type(2)
//   bits 24-20  morphstate  (5 bits): tense/aspect/polarity/formality/evidential
//   bits 19-18  pragmatic   (2 bits): domain context
//   bits 17-16  valency     (2 bits): argument count
//   bits 15-2   (reserved — available for case/number in Slavic declension)
//   bits  1-0   register    (2 bits): social register
//
// coord=0 is the base key (dictionary form, context-free lookups).
//
// Each Coord*Shift constant is the position of the least significant bit of
// its field, i.e. the amount a field value is shifted left when packed and
// the amount a coordinate is shifted right before masking when unpacked.
const (
	CoordSemanticShift    = 48 // semantic bitfield, bits 63-48
	CoordGrammaticalShift = 29 // grammatical role, bits 31-29
	CoordCooccurShift     = 25 // co-occurrence (prev/next type), bits 28-25
	CoordMorphShift       = 20 // morphological state, bits 24-20
	CoordPragmaticShift   = 18 // pragmatic domain context, bits 19-18
	CoordValencyShift     = 16 // valency, bits 17-16
	CoordRegisterShift    = 0  // social register, bits 1-0
)
  30  
  31  // Semantic bitfield — 2 bits per ontological category (subject | object flag).
  32  const (
  33  	SemanticHumanSubj uint64 = 1 << 0
  34  	SemanticHumanObj  uint64 = 1 << 1
  35  	SemanticAnimSubj  uint64 = 1 << 2
  36  	SemanticAnimObj   uint64 = 1 << 3
  37  	SemanticAbstSubj  uint64 = 1 << 4
  38  	SemanticAbstObj   uint64 = 1 << 5
  39  	SemanticPlaceSubj uint64 = 1 << 6
  40  	SemanticPlaceObj  uint64 = 1 << 7
  41  	SemanticArtiSubj  uint64 = 1 << 8
  42  	SemanticArtiObj   uint64 = 1 << 9
  43  	SemanticNatSubj   uint64 = 1 << 10
  44  	SemanticNatObj    uint64 = 1 << 11
  45  	SemanticEventSubj uint64 = 1 << 12
  46  	SemanticEventObj  uint64 = 1 << 13
  47  	SemanticCollSubj  uint64 = 1 << 14
  48  	SemanticCollObj   uint64 = 1 << 15
  49  )
  50  
// Co-occurrence word-type values (prev/next slot in CoordCooccur).
//
// Each value fits in 2 bits, matching the `&3` mask CoordCooccur applies to
// both the previous-word and next-word slots.
// NOTE(review): the per-value meanings below are inferred from the names —
// confirm against the tagger that produces them.
const (
	CooccurNone     uint8 = 0 // presumably: no neighbouring word / unclassified
	CooccurNominal  uint8 = 1 // presumably: neighbour is a nominal
	CooccurVerbal   uint8 = 2 // presumably: neighbour is a verbal
	CooccurFunction uint8 = 3 // presumably: neighbour is a function word
)
  58  
  59  // PackCoord assembles a 64-bit coordinate from individual axis values.
  60  func PackCoord(semantic, grammatical, cooccur, morph, pragmatic, valency, register uint64) uint64 {
  61  	return ((semantic & 0xFFFF) << CoordSemanticShift) |
  62  		((grammatical & 0x7) << CoordGrammaticalShift) |
  63  		((cooccur & 0xF) << CoordCooccurShift) |
  64  		((morph & 0x1F) << CoordMorphShift) |
  65  		((pragmatic & 0x3) << CoordPragmaticShift) |
  66  		((valency & 0x3) << CoordValencyShift) |
  67  		(register & 0x3)
  68  }
  69  
  70  // CoordSemantic extracts the 16-bit semantic bitfield.
  71  func CoordSemantic(coord uint64) uint64 {
  72  	return (coord >> CoordSemanticShift) & 0xFFFF
  73  }
  74  
  75  // CoordMorph extracts the 5-bit morphological state.
  76  func CoordMorph(coord uint64) uint8 {
  77  	return uint8((coord >> CoordMorphShift) & 0x1F)
  78  }
  79  
  80  // CoordCooccur packs (prevType, nextType) into the 4-bit cooccurrence field.
  81  func CoordCooccur(prevType, nextType uint8) uint64 {
  82  	return uint64(prevType&3) | (uint64(nextType&3) << 2)
  83  }
  84  
  85  // CoordPrevType extracts the prev word-type from the cooccurrence field.
  86  func CoordPrevType(coord uint64) uint8 {
  87  	return uint8((coord >> CoordCooccurShift) & 3)
  88  }
  89  
  90  // CoordNextType extracts the next word-type from the cooccurrence field.
  91  func CoordNextType(coord uint64) uint8 {
  92  	return uint8((coord >> (CoordCooccurShift + 2)) & 3)
  93  }
  94  
  95  // RelaxCoord returns coords to try in fallback order (specific → general).
  96  // Strips axes in priority order: pragmatic, register, valency, semantic bits
  97  // MSB→LSB, grammatical, cooccurrence, morphstate.
  98  func RelaxCoord(coord uint64) []uint64 {
  99  	if coord == 0 {
 100  		return []uint64{0}
 101  	}
 102  	seq := []uint64{coord}
 103  	add := func(c uint64) {
 104  		if c != seq[len(seq)-1] {
 105  			seq = append(seq, c)
 106  		}
 107  	}
 108  	c := coord
 109  	c = c &^ (uint64(0x3) << CoordPragmaticShift)
 110  	add(c)
 111  	c = c &^ uint64(0x3)
 112  	add(c)
 113  	c = c &^ (uint64(0x3) << CoordValencyShift)
 114  	add(c)
 115  	sem := (c >> CoordSemanticShift) & 0xFFFF
 116  	for bit := uint64(15); bit < 16; bit-- {
 117  		if (sem>>bit)&1 == 1 {
 118  			sem &^= 1 << bit
 119  			c = (c &^ (uint64(0xFFFF) << CoordSemanticShift)) | (sem << CoordSemanticShift)
 120  			add(c)
 121  		}
 122  		if bit == 0 {
 123  			break
 124  		}
 125  	}
 126  	c = c &^ (uint64(0x7) << CoordGrammaticalShift)
 127  	add(c)
 128  	c = c &^ (uint64(0xF) << CoordCooccurShift)
 129  	add(c)
 130  	c = c &^ (uint64(0x1F) << CoordMorphShift)
 131  	add(c)
 132  	return seq
 133  }
 134  
 135  // MakeKey returns the 128-bit SipHash key for (domain, coord, word).
 136  // domain: 0x01=EN, 0x02=JA, 0x10-0x14=Moxie stages, etc.
 137  // Hash input: [domain(1), coord_LE(8), word(N)]
 138  func MakeKey(domain uint8, coord uint64, word string) lattice.Key {
 139  	buf := []byte{:9 + len(word):9 + len(word)}
 140  	buf[0] = domain
 141  	buf[1] = byte(coord)
 142  	buf[2] = byte(coord >> 8)
 143  	buf[3] = byte(coord >> 16)
 144  	buf[4] = byte(coord >> 24)
 145  	buf[5] = byte(coord >> 32)
 146  	buf[6] = byte(coord >> 40)
 147  	buf[7] = byte(coord >> 48)
 148  	buf[8] = byte(coord >> 56)
 149  	copy(buf[9:], []byte(word))
 150  	return lattice.Key(siphash.Sum128(siphash.DefaultKey, buf))
 151  }
 152