cluster.mx raw

   1  package iskra
   2  
   3  // ClusterType classifies a phrase cluster by syntactic role.
   4  type ClusterType uint8
   5  
   6  const (
   7  	ClusterNPSubj ClusterType = 0
   8  	ClusterNPObj  ClusterType = 1
   9  	ClusterVP     ClusterType = 2
  10  	ClusterPP     ClusterType = 3
  11  	ClusterMod    ClusterType = 4
  12  )
  13  
  14  // Cluster is a phrase unit produced by ParseClusters.
  15  type Cluster struct {
  16  	Kind    ClusterType
  17  	Tokens  []string   // content tokens after boundary particles consumed
  18  	Flags   uint64     // accumulated semantic flags from tokens
  19  	Role    uint8      // particle role code (RoleNP*, RolePP*)
  20  	Nested  []*Cluster // relative clause VP, if present
  21  	Trans   string     // target-language string, filled by TranslateCluster
  22  	Copular bool       // VP cluster with no finite verb = copular sentence
  23  }
  24  
  25  // ParticleDetector returns true if tok is a phrase-boundary particle in this domain.
  26  type ParticleDetector func(tok string) bool
  27  
  28  // RoleLookup maps (particle, npFlags) to a role code using the domain's disambiguation rules.
  29  type RoleLookup func(particle string, npFlags uint64) uint8
  30  
  31  // VerbDetector returns true if tok is a finite verb in this domain.
  32  type VerbDetector func(tok string) bool
  33  
  34  // FlagAccumulator extracts and ORs semantic flags from a token slice.
  35  type FlagAccumulator func(tokens []string) uint64
  36  
  37  // ContentFilter returns only content words from a token slice (strips function words).
  38  type ContentFilter func(tokens []string) []string
  39  
  40  // ClusterHooks groups the language-specific callbacks for ParseClusters.
  41  type ClusterHooks struct {
  42  	IsParticle  ParticleDetector
  43  	LookupRole  RoleLookup
  44  	HasVerb     VerbDetector
  45  	AccumFlags  FlagAccumulator
  46  	FilterContent ContentFilter
  47  	DefaultRole func(particle string) uint8 // fallback when LookupRole returns RoleNone
  48  }
  49  
  50  // KindFromRole maps a particle role code to a ClusterType.
  51  func KindFromRole(role uint8) ClusterType {
  52  	switch role {
  53  	case RoleNPSubjTopic, RoleNPSubjGram:
  54  		return ClusterNPSubj
  55  	case RoleNPObjDirect:
  56  		return ClusterNPObj
  57  	case RolePPLocative, RolePPTemporal, RoleNPDative,
  58  		RolePPSource, RolePPLimit, RolePPComitative,
  59  		RolePPLocStatic, RolePPInstrumental:
  60  		return ClusterPP
  61  	}
  62  	return ClusterMod
  63  }
  64  
  65  // splitAtFirstVerb splits tokens at the first verb token: vpTokens ends at the verb
  66  // (inclusive), headTokens is everything after.
  67  func splitAtFirstVerb(tokens []string, isVerb VerbDetector) (vpTokens, headTokens []string) {
  68  	for i, tok := range tokens {
  69  		if isVerb(tok) {
  70  			return tokens[:i+1], tokens[i+1:]
  71  		}
  72  	}
  73  	return tokens, nil
  74  }
  75  
  76  // hasAnyVerb returns true if any token satisfies isVerb.
  77  func hasAnyVerb(tokens []string, isVerb VerbDetector) bool {
  78  	for _, tok := range tokens {
  79  		if isVerb(tok) {
  80  			return true
  81  		}
  82  	}
  83  	return false
  84  }
  85  
  86  // ParseClusters segments a pre-tokenized sequence into phrase clusters.
  87  // Particle-bounded mode (JA, KO): each particle closes the preceding accumulation.
  88  // hooks provides all language-specific detection and classification functions.
  89  // Must be called on already-tokenized input — compound forms must be pre-collapsed.
  90  func ParseClusters(tokens []string, hooks ClusterHooks) []*Cluster {
  91  	var clusters []*Cluster
  92  	var acc []string
  93  
  94  	for _, tok := range tokens {
  95  		if !hooks.IsParticle(tok) {
  96  			acc = append(acc, tok)
  97  			continue
  98  		}
  99  		if len(acc) == 0 {
 100  			continue
 101  		}
 102  		var flags uint64
 103  		if hooks.AccumFlags != nil {
 104  			flags = hooks.AccumFlags(acc)
 105  		}
 106  		role := hooks.LookupRole(tok, flags)
 107  		if role == RoleNone && hooks.DefaultRole != nil {
 108  			role = hooks.DefaultRole(tok)
 109  		}
 110  		c := &Cluster{
 111  			Kind:  KindFromRole(role),
 112  			Flags: flags,
 113  			Role:  role,
 114  		}
 115  		if hasAnyVerb(acc, hooks.HasVerb) {
 116  			vp, head := splitAtFirstVerb(acc, hooks.HasVerb)
 117  			c.Tokens = head
 118  			c.Nested = ParseClusters(vp, hooks)
 119  		} else {
 120  			c.Tokens = acc
 121  		}
 122  		clusters = append(clusters, c)
 123  		acc = nil
 124  	}
 125  
 126  	if len(acc) > 0 {
 127  		var flags uint64
 128  		if hooks.AccumFlags != nil {
 129  			flags = hooks.AccumFlags(acc)
 130  		}
 131  		clusters = append(clusters, &Cluster{
 132  			Kind:    ClusterVP,
 133  			Tokens:  acc,
 134  			Flags:   flags,
 135  			Copular: !hasAnyVerb(acc, hooks.HasVerb),
 136  		})
 137  	}
 138  	return clusters
 139  }
 140  
 141  // ReorderClusters rearranges clusters from source word order to destination word order.
 142  // PP and Mod clusters are appended after the SVO/SOV core in source order.
 143  func ReorderClusters(clusters []*Cluster, srcOrder, dstOrder uint8) []*Cluster {
 144  	var subj, obj, vp *Cluster
 145  	var pps, mods []*Cluster
 146  
 147  	for _, c := range clusters {
 148  		switch c.Kind {
 149  		case ClusterNPSubj:
 150  			subj = c
 151  		case ClusterNPObj:
 152  			obj = c
 153  		case ClusterVP:
 154  			vp = c
 155  		case ClusterPP:
 156  			pps = append(pps, c)
 157  		default:
 158  			mods = append(mods, c)
 159  		}
 160  	}
 161  
 162  	var result []*Cluster
 163  	switch dstOrder {
 164  	case OrderSVO:
 165  		if subj != nil { result = append(result, subj) }
 166  		if vp != nil   { result = append(result, vp) }
 167  		if obj != nil  { result = append(result, obj) }
 168  	case OrderSOV:
 169  		if subj != nil { result = append(result, subj) }
 170  		if obj != nil  { result = append(result, obj) }
 171  		if vp != nil   { result = append(result, vp) }
 172  	case OrderVSO:
 173  		if vp != nil   { result = append(result, vp) }
 174  		if subj != nil { result = append(result, subj) }
 175  		if obj != nil  { result = append(result, obj) }
 176  	default:
 177  		if subj != nil { result = append(result, subj) }
 178  		if vp != nil   { result = append(result, vp) }
 179  		if obj != nil  { result = append(result, obj) }
 180  	}
 181  	result = append(result, pps...)
 182  	result = append(result, mods...)
 183  	return result
 184  }
 185  
 186  // InsertMarkers assembles translated clusters into a target-language string.
 187  // Inserts prepositions (prepositional target), postpositions (postpositional target),
 188  // and copula ("is") for overt-copula languages.
 189  func InsertMarkers(clusters []*Cluster, dstDesc LangDesc, dstDomain uint8) string {
 190  	copular := false
 191  	for _, c := range clusters {
 192  		if c.Kind == ClusterVP && c.Copular {
 193  			copular = true
 194  			break
 195  		}
 196  	}
 197  
 198  	var out []byte
 199  	addSpace := false
 200  
 201  	emitEN := func(s string) {
 202  		if s == "" {
 203  			return
 204  		}
 205  		if addSpace {
 206  			out = append(out, ' ')
 207  		}
 208  		out = append(out, []byte(s)...)
 209  		addSpace = true
 210  	}
 211  
 212  	for _, c := range clusters {
 213  		if c.Trans == "" {
 214  			continue
 215  		}
 216  		if dstDesc.Markers == MarkerPostpositional {
 217  			out = append(out, []byte(c.Trans)...)
 218  			marker := LookupTargetMarker(dstDomain, c.Role)
 219  			if marker != "" {
 220  				out = append(out, []byte(marker)...)
 221  			}
 222  		} else {
 223  			if c.Kind == ClusterPP {
 224  				marker := LookupTargetMarker(dstDomain, c.Role)
 225  				if marker != "" {
 226  					emitEN(marker)
 227  				}
 228  			} else if c.Kind == ClusterVP && copular && !dstDesc.ZeroCopula {
 229  				emitEN("is")
 230  			}
 231  			emitEN(c.Trans)
 232  		}
 233  	}
 234  	return string(out)
 235  }
 236