langdesc.mx raw

   1  package iskra
   2  
   3  import "git.smesh.lol/iskradb/lattice"
   4  
   5  // Word order constants encoded in LangDesc.Order.
   6  const (
   7  	OrderSVO uint8 = 0
   8  	OrderSOV uint8 = 1
   9  	OrderVSO uint8 = 2
  10  	OrderVOS uint8 = 3
  11  	OrderOVS uint8 = 4
  12  	OrderOSV uint8 = 5
  13  )
  14  
  15  // Marker system constants encoded in LangDesc.Markers.
  16  const (
  17  	MarkerPrepositional  uint8 = 0
  18  	MarkerPostpositional uint8 = 1
  19  	MarkerCase           uint8 = 2
  20  )
  21  
  22  // LangDesc encodes the structural properties of a language for the cluster pipeline.
  23  // Stored in Bcooccur at MakeKey(domain, 0, "").
  24  type LangDesc struct {
  25  	Order      uint8 // word order: OrderSVO, OrderSOV, etc.
  26  	HeadFinal  bool  // false=head-initial (EN), true=head-final (JA)
  27  	Particle   bool  // false=position-bounded parser, true=particle-bounded
  28  	PreNomRC   bool  // false=post-nominal RC (EN), true=pre-nominal RC (JA)
  29  	ZeroCopula bool  // false=overt copula (EN), true=zero copula (JA)
  30  	Markers    uint8 // MarkerPrepositional / Postpositional / Case
  31  }
  32  
  33  // Particle role codes stored in Record.Branch of Bcooccur particle-role records.
  34  const (
  35  	RoleNone           uint8 = 0
  36  	RoleNPSubjTopic    uint8 = 1
  37  	RoleNPSubjGram     uint8 = 2
  38  	RoleNPObjDirect    uint8 = 3
  39  	RolePPSource       uint8 = 4
  40  	RolePPLimit        uint8 = 5
  41  	RolePPComitative   uint8 = 6
  42  	RolePPLocative     uint8 = 7
  43  	RolePPTemporal     uint8 = 8
  44  	RoleNPDative       uint8 = 9
  45  	RolePPLocStatic    uint8 = 10
  46  	RolePPInstrumental uint8 = 11
  47  )
  48  
  49  // CoordVerbClass is the sentinel coord used for verb class records in Bcooccur.
  50  // Distinct from particle role records (coord=0) and lang descriptors (coord=0, empty word).
  51  const CoordVerbClass uint64 = 1
  52  
  53  // KeyNormalizer is a per-domain hook that maps a raw word to its canonical
  54  // lookup form before hashing. nil = identity.
  55  //
  56  // Examples:
  57  //   LangEN: lowercase ("Cat" → "cat")
  58  //   LangJA: identity (surface form already canonical)
  59  //   DomainMoxieSRC: qualified name resolution ("Method" → "pkg.Type.Method")
  60  //   DomainMoxieIR: LLVM mangled name canonicalization
  61  type KeyNormalizer func(word string) string
  62  
  63  var keyNormalizers = map[uint8]KeyNormalizer{}
  64  
  65  // RegisterKeyNormalizer stores a per-domain key normalization function.
  66  func RegisterKeyNormalizer(domain uint8, fn KeyNormalizer) {
  67  	keyNormalizers[domain] = fn
  68  }
  69  
  70  // NormalizeKey applies the registered normalizer for domain, or returns word unchanged.
  71  func NormalizeKey(domain uint8, word string) string {
  72  	if fn, ok := keyNormalizers[domain]; ok && fn != nil {
  73  		return fn(word)
  74  	}
  75  	return word
  76  }
  77  
  78  func packDescBranch(d LangDesc) uint8 {
  79  	v := d.Order & 0x7
  80  	if d.HeadFinal  { v |= 1 << 3 }
  81  	if d.Particle   { v |= 1 << 4 }
  82  	if d.PreNomRC   { v |= 1 << 5 }
  83  	if d.ZeroCopula { v |= 1 << 6 }
  84  	return v
  85  }
  86  
  87  func unpackDesc(b uint8, markerSys uint8) LangDesc {
  88  	return LangDesc{
  89  		Order:      b & 0x7,
  90  		HeadFinal:  (b>>3)&1 == 1,
  91  		Particle:   (b>>4)&1 == 1,
  92  		PreNomRC:   (b>>5)&1 == 1,
  93  		ZeroCopula: (b>>6)&1 == 1,
  94  		Markers:    markerSys & 0x3,
  95  	}
  96  }
  97  
  98  // RegisterLangDesc writes a LangDesc into Bcooccur at MakeKey(domain, 0, "").
  99  func RegisterLangDesc(tree *lattice.Tree, pool *[]byte, domain uint8, desc LangDesc) {
 100  	key := MakeKey(domain, 0, "")
 101  	ri := tree.LookupRecIdx(lattice.Bcooccur, key)
 102  	if ri != lattice.NullRec {
 103  		if r := tree.GetRecord(ri); r != nil {
 104  			r.Branch = packDescBranch(desc)
 105  			r.DataFile = uint32(desc.Markers&0x3) << 6
 106  		}
 107  		return
 108  	}
 109  	var rec lattice.Record
 110  	rec.Branch = packDescBranch(desc)
 111  	rec.DataFile = uint32(desc.Markers&0x3) << 6
 112  	tree.InsertRec(lattice.Bcooccur, key, rec)
 113  }
 114  
 115  // GetLangDesc reads a LangDesc from Bcooccur. Returns false if not registered.
 116  func GetLangDesc(tree *lattice.Tree, domain uint8) (LangDesc, bool) {
 117  	key := MakeKey(domain, 0, "")
 118  	ri := tree.LookupRecIdx(lattice.Bcooccur, key)
 119  	if ri == lattice.NullRec {
 120  		return LangDesc{}, false
 121  	}
 122  	rec := tree.GetRecord(ri)
 123  	if rec == nil {
 124  		return LangDesc{}, false
 125  	}
 126  	markerSys := uint8((rec.DataFile >> 6) & 0x3)
 127  	return unpackDesc(rec.Branch, markerSys), true
 128  }
 129  
 130  // RegisterParticleRole stores a particle→role mapping in Bcooccur.
 131  // semCoord=0 for unambiguous particles; semantic-coord for ambiguous disambiguation.
 132  func RegisterParticleRole(tree *lattice.Tree, pool *[]byte, domain uint8, semCoord uint64, particle string, role uint8) {
 133  	key := MakeKey(domain, semCoord, particle)
 134  	ri := tree.LookupRecIdx(lattice.Bcooccur, key)
 135  	if ri != lattice.NullRec {
 136  		if r := tree.GetRecord(ri); r != nil {
 137  			r.Branch = role
 138  		}
 139  		return
 140  	}
 141  	var rec lattice.Record
 142  	rec.Branch = role
 143  	tree.InsertRec(lattice.Bcooccur, key, rec)
 144  }
 145  
 146  // LookupParticleRole resolves a particle to its syntactic role using NP semantic flags.
 147  // Tries semantic-coord disambiguation first via RelaxCoord, then coord=0 default.
 148  func LookupParticleRole(tree *lattice.Tree, domain uint8, particle string, npFlags uint64) uint8 {
 149  	if npFlags != 0 {
 150  		semCoord := PackCoord(npFlags, 0, 0, 0, 0, 0, 0)
 151  		for _, c := range RelaxCoord(semCoord) {
 152  			if c == 0 {
 153  				break
 154  			}
 155  			ri := tree.LookupRecIdx(lattice.Bcooccur, MakeKey(domain, c, particle))
 156  			if ri != lattice.NullRec {
 157  				if rec := tree.GetRecord(ri); rec != nil && rec.Branch != 0 {
 158  					return rec.Branch
 159  				}
 160  			}
 161  		}
 162  	}
 163  	ri := tree.LookupRecIdx(lattice.Bcooccur, MakeKey(domain, 0, particle))
 164  	if ri != lattice.NullRec {
 165  		if rec := tree.GetRecord(ri); rec != nil {
 166  			return rec.Branch
 167  		}
 168  	}
 169  	return RoleNone
 170  }
 171  
 172  // MarkerFunc maps a particle role to its surface marker string for a given domain.
 173  type MarkerFunc func(role uint8) string
 174  
 175  var markerFuncs = map[uint8]MarkerFunc{}
 176  
 177  // RegisterMarkerFunc registers the marker lookup function for a domain.
 178  func RegisterMarkerFunc(domain uint8, fn MarkerFunc) {
 179  	markerFuncs[domain] = fn
 180  }
 181  
 182  // LookupTargetMarker returns the surface marker (preposition/postposition) for a
 183  // role in the target domain, calling the registered MarkerFunc.
 184  func LookupTargetMarker(domain uint8, role uint8) string {
 185  	if fn, ok := markerFuncs[domain]; ok && fn != nil {
 186  		return fn(role)
 187  	}
 188  	return ""
 189  }
 190