cluster.mx raw

   1  package transdb
   2  
   3  import (
   4  	"git.mleku.dev/iskra"
   5  	"git.smesh.lol/iskradb/lattice"
   6  )
   7  
   8  // Re-export cluster types from iskra.
   9  type ClusterType = iskra.ClusterType
  10  type Cluster = iskra.Cluster
  11  
// Cluster kind constants, re-exported from iskra under the same names so that
// callers of this package can compare against Cluster.Kind without importing
// iskra directly.
const (
	ClusterNPSubj = iskra.ClusterNPSubj
	ClusterNPObj  = iskra.ClusterNPObj
	ClusterVP     = iskra.ClusterVP
	ClusterPP     = iskra.ClusterPP
	ClusterMod    = iskra.ClusterMod
)
  19  
  20  // jaParticleSet: JA boundary-marking particles for ParseClusters.
  21  var jaParticleSet = map[string]bool{
  22  	"は": true, "が": true, "を": true,
  23  	"に": true, "で": true, "と": true,
  24  	"から": true, "まで": true, "へ": true,
  25  	"より": true, "も": true,
  26  }
  27  
  28  // jaDefaultRole: fallback roles when Bcooccur is not yet populated.
  29  var jaDefaultRole = map[string]uint8{
  30  	"は":   RoleNPSubjTopic,
  31  	"が":   RoleNPSubjGram,
  32  	"を":   RoleNPObjDirect,
  33  	"から": RolePPSource,
  34  	"まで": RolePPLimit,
  35  	"と":   RolePPComitative,
  36  	"に":   RoleNPDative,
  37  	"で":   RolePPInstrumental,
  38  	"へ":   RolePPLocative,
  39  	"より": RolePPSource,
  40  	"も":   RoleNPSubjTopic,
  41  }
  42  
  43  // jaClusterHooks constructs the ClusterHooks for JA parsing.
  44  func jaClusterHooks(tree *lattice.Tree) iskra.ClusterHooks {
  45  	return iskra.ClusterHooks{
  46  		IsParticle: func(tok string) bool { return jaParticleSet[tok] },
  47  		LookupRole: func(tok string, npFlags uint64) uint8 {
  48  			return LookupParticleRole(tree, LangJA, tok, npFlags)
  49  		},
  50  		HasVerb: func(tok string) bool { return isJAVerb(tree, tok) },
  51  		AccumFlags: func(tokens []string) uint64 {
  52  			var flags uint64
  53  			for _, tok := range tokens {
  54  				for _, b := range ActiveBranches {
  55  					if ri := tree.LookupRecIdx(b, MakeKey(LangJA, 0, tok)); ri != lattice.NullRec {
  56  						if rec := tree.GetRecord(ri); rec != nil {
  57  							flags |= GetSemanticFromDataFile(rec)
  58  						}
  59  						break
  60  					}
  61  				}
  62  			}
  63  			return flags
  64  		},
  65  		FilterContent: func(tokens []string) []string {
  66  			var out []string
  67  			for _, tok := range tokens {
  68  				if !jaFunctionWord[tok] {
  69  					out = append(out, tok)
  70  				}
  71  			}
  72  			return out
  73  		},
  74  		DefaultRole: func(tok string) uint8 { return jaDefaultRole[tok] },
  75  	}
  76  }
  77  
  78  // ParseClusters segments a pre-tokenized JA sequence into phrase clusters.
  79  func ParseClusters(tokens []string, tree *lattice.Tree, lang uint8) []*Cluster {
  80  	hooks := jaClusterHooks(tree)
  81  	return iskra.ParseClusters(tokens, hooks)
  82  }
  83  
  84  // clusterHeadLookup finds the target form and morph state for a source token.
  85  func clusterHeadLookup(tree *lattice.Tree, pool []byte, tok string, srcLang uint8, coord uint64) (form string, state uint8) {
  86  	order := defaultBranchOrder
  87  	if srcLang == LangJA {
  88  		order = branchOrderJA(coord)
  89  	}
  90  	for _, c := range RelaxCoord(coord) {
  91  		key := MakeKey(srcLang, c, tok)
  92  		for _, b := range order {
  93  			ri := tree.LookupRecIdx(lattice.Branch(b), key)
  94  			if ri == lattice.NullRec {
  95  				continue
  96  			}
  97  			rec := tree.GetRecord(ri)
  98  			if rec == nil {
  99  				continue
 100  			}
 101  			if rec.Link[0] == lattice.NullRec {
 102  				break
 103  			}
 104  			dst := tree.GetRecord(rec.Link[0])
 105  			if dst == nil {
 106  				break
 107  			}
 108  			if f := FormFromInline(dst, pool); f != "" {
 109  				return f, GetMorphState(rec)
 110  			}
 111  			break
 112  		}
 113  	}
 114  	if srcLang == LangJA {
 115  		ms := inferMorphState(tok)
 116  		for _, stem := range verbStems(tok) {
 117  			sCoord := PackCoord(0, 0, 0, uint64(ms), 0, 0, 0)
 118  			if f, _ := clusterHeadLookup(tree, pool, stem, srcLang, sCoord); f != "" {
 119  				return f, ms
 120  			}
 121  		}
 122  	}
 123  	return "", 0
 124  }
 125  
 126  // TranslateCluster translates a cluster's tokens and fills c.Trans.
 127  func TranslateCluster(c *Cluster, tree *lattice.Tree, pool []byte, srcLang, dstLang uint8) {
 128  	for _, nc := range c.Nested {
 129  		TranslateCluster(nc, tree, pool, srcLang, dstLang)
 130  	}
 131  
 132  	hooks := jaClusterHooks(tree)
 133  	content := hooks.FilterContent(c.Tokens)
 134  
 135  	var morphState uint8
 136  	if c.Kind == ClusterVP && !c.Copular && len(content) > 0 {
 137  		head := content[len(content)-1]
 138  		if srcLang == LangJA {
 139  			morphState = inferMorphState(head)
 140  		}
 141  	}
 142  	coord := PackCoord(c.Flags, 0, 0, uint64(morphState), 0, 0, 0)
 143  
 144  	headIdx := len(content) - 1
 145  	if srcLang == LangEN || headIdx < 0 {
 146  		headIdx = 0
 147  	}
 148  
 149  	headTrans := ""
 150  	headState := uint8(0)
 151  	if len(content) > 0 {
 152  		headTrans, headState = clusterHeadLookup(tree, pool, content[headIdx], srcLang, coord)
 153  		if headTrans == "" {
 154  			headTrans = content[headIdx]
 155  		} else if dstLang == LangEN {
 156  			headTrans = applyMorphEN(headTrans, headState)
 157  		}
 158  	}
 159  
 160  	modCoord := PackCoord(c.Flags, 0, 0, 0, 0, 0, 0)
 161  	var modTrans []string
 162  	for i, tok := range content {
 163  		if i == headIdx {
 164  			continue
 165  		}
 166  		t, _ := clusterHeadLookup(tree, pool, tok, srcLang, modCoord)
 167  		if t == "" {
 168  			t = tok
 169  		} else if dstLang == LangEN {
 170  			t = stripTo(t)
 171  		}
 172  		modTrans = append(modTrans, t)
 173  	}
 174  
 175  	rcTrans := ""
 176  	if len(c.Nested) > 0 {
 177  		var parts []string
 178  		for _, nc := range c.Nested {
 179  			if nc.Trans != "" {
 180  				parts = append(parts, nc.Trans)
 181  			}
 182  		}
 183  		rcTrans = joinWords(parts, dstLang)
 184  	}
 185  
 186  	var parts []string
 187  	if dstLang == LangEN {
 188  		if rcTrans != "" {
 189  			parts = append(parts, rcTrans)
 190  		}
 191  		parts = append(parts, modTrans...)
 192  		if headTrans != "" {
 193  			parts = append(parts, headTrans)
 194  		}
 195  	} else {
 196  		parts = append(parts, modTrans...)
 197  		if rcTrans != "" {
 198  			parts = append(parts, rcTrans)
 199  		}
 200  		if headTrans != "" {
 201  			parts = append(parts, headTrans)
 202  		}
 203  	}
 204  
 205  	c.Trans = joinWords(parts, dstLang)
 206  }
 207  
 208  // joinWords joins parts with spaces (EN) or no separator (JA).
 209  func joinWords(parts []string, lang uint8) string {
 210  	var out []byte
 211  	first := true
 212  	for _, p := range parts {
 213  		if p == "" {
 214  			continue
 215  		}
 216  		if lang == LangEN && !first {
 217  			out = append(out, ' ')
 218  		}
 219  		out = append(out, []byte(p)...)
 220  		first = false
 221  	}
 222  	return string(out)
 223  }
 224  
 225  // ReorderClusters rearranges clusters from src to dst word order.
 226  func ReorderClusters(clusters []*Cluster, srcOrder, dstOrder uint8) []*Cluster {
 227  	return iskra.ReorderClusters(clusters, srcOrder, dstOrder)
 228  }
 229  
 230  // InsertMarkers assembles clusters into a target-language string.
 231  func InsertMarkers(clusters []*Cluster, dstDesc LangDesc, dstLang uint8) string {
 232  	return iskra.InsertMarkers(clusters, dstDesc, dstLang)
 233  }
 234