extract_zh.mx raw

   1  package iskra
   2  
   3  // Chinese (ZH) extractor.
   4  //
   5  // Chinese is the EN-close cousin in the typological matrix: SVO, analytic
   6  // (no morphology), positional role assignment. The structural surface is
   7  // closer to ExtractEN's, with two consequential differences:
   8  //
   9  //   1. Aspect, not tense. ZH marks completion/duration/experience with
  10  //      post-verb particles (了, 着, 過), not tense morphology. The
  11  //      existing morph bits (MetaTensePast, MetaAspectProg) get repurposed:
  12  //      了 -> MetaTensePast (perfective ≈ past in most contexts)
  13  //      着 -> MetaAspectProg (progressive/durative)
  14  //      過 -> MetaTensePast (experiential past)
  15  //
  16  //   2. The copula 是 is a content-word marker (not a verb-aux like EN "is").
  17  //      Like JA だ, it puts the following noun into MKCop relation with the
  18  //      preceding subject. Handled inline in the extractor.
  19  //
  20  // Initial version handles:
  21  //   - Subject / verb / object positional roles (SVO)
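//   - Structural markers: 的 (possessive), 是 (copula), 了/着/過 (aspect),
//     在 (location), 從 (source), 給 (recipient marker), 把 (OBJ fronting),
//     被 (passive marker)
//   - Negation particles 不/沒: recognized and consumed as function words,
//     but their morph bits are not yet applied to any slot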
  25  //   - Pre-noun attributive 的 modifiers
  26  //
  27  // Defers: tone-based disambiguation, formal/literary vs colloquial register,
  28  // resultative compounds (V+補語), serial-verb constructions, BA-construction
  29  // SVO reordering, 是…的 cleft, 把/被 case-marking subtleties.
  30  
  31  // zhMarkerToRole maps a ZH structural marker to the JA-cognate marker ID.
  32  // Where ZH has finer or different distinctions than the existing MkXxx set,
  33  // the marker rides on existing IDs and the morph/oblique role carries the
  34  // finer distinction.
  35  func zhMarkerToMarker() map[string]uint8 {
  36  	return map[string]uint8{
  37  		"\xe7\x9a\x84":     MkNo,   // 的 (POSS/attributive ≈ JA の)
  38  		"\xe4\xb9\x8b":     MkNo,   // 之 (literary POSS ≈ 的)
  39  		"\xe5\x9c\xa8":     MkIn,   // 在 (LOC ≈ EN "in")
  40  		"\xe5\xbe\x9e":     MkFrom, // 從 (SOURCE ≈ EN "from")
  41  		"\xe7\xbb\x99":     MkTo_EN, // 给 (RECIP ≈ EN "to" in dative sense)
  42  		"\xe7\xb5\xa6":     MkTo_EN, // 給 (traditional form of 给)
  43  		"\xe8\xa2\xab":     MkBy,   // 被 (passive agent marker ≈ EN "by")
  44  		"\xe5\x92\x8c":     MkAnd,  // 和 (and)
  45  		"\xe6\x88\x96":     MkAnd,  // 或 (or - no MkOr; rides on MkAnd)
  46  	}
  47  }
  48  
  49  // zhAspectMarker maps post-verb aspect particles to the morph bit they
  50  // contribute to the verb that precedes them.
  51  func zhAspectMarker() map[string]uint16 {
  52  	return map[string]uint16{
  53  		"\xe4\xba\x86":   MetaTensePast,  // 了 (perfective)
  54  		"\xe7\x9d\x80":   MetaAspectProg, // 着 (progressive/durative)
  55  		"\xe7\x9d\x80\x80": MetaAspectProg, // safety alias (rare)
  56  		"\xe9\x81\x8e":   MetaTensePast,  // 過 (experiential past, traditional)
  57  		"\xe8\xbf\x87":   MetaTensePast,  // 过 (simplified)
  58  	}
  59  }
  60  
  61  // zhNegMarker: ZH negation particles. Both contribute MetaPolarNeg;
  62  // 沒 also implies past (MetaTensePast) by convention.
  63  func zhNegMarker() map[string]uint16 {
  64  	return map[string]uint16{
  65  		"\xe4\xb8\x8d": MetaPolarNeg,                  // 不 (general negation)
  66  		"\xe6\xb2\x92": MetaPolarNeg | MetaTensePast,  // 沒 (past-negative / not-have)
  67  		"\xe6\xb2\xa1": MetaPolarNeg | MetaTensePast,  // 没 (simplified 沒)
  68  	}
  69  }
  70  
  71  // zhCopula: the 是 marker. Acts like JA だ (predicative copula), staging
  72  // the next noun as MKCop on the preceding subject.
  73  const zhCopula = "\xe6\x98\xaf" // 是
  74  
  75  // zhObjFronter: the 把 marker. Moves the following noun to pre-verb OBJ
  76  // position. Mirrors the OBJ-fronting in JA を, but applied positionally
  77  // rather than as a slot-following particle.
  78  const zhObjFronter = "\xe6\x8a\x8a" // 把
  79  
  80  // ExtractZH takes pre-tokenized ZH tokens (space-segmented; from the Bible
  81  // corpus the segmentation is already in the input) and produces ExtractResult.
  82  // Initial version handles core SVO + the structural markers above.
  83  func ExtractZH(tokens []string) ExtractResult {
  84  	var pat []byte
  85  	var slots []string
  86  	var roles []int32
  87  	var slotMarkers []uint8
  88  	var slotMorphs []uint16
  89  	var slotOblRoles []uint8
  90  	var slotHeads []int16
  91  	var slotModKinds []uint8
  92  	sawVerb := false
  93  	pendingRole := HistSubject
  94  	pendingMark := uint8(0)
  95  	pendingOblRole := uint8(ORNone)
  96  	pendingCop := false
  97  	pendingCopHead := int16(-1)
  98  	pendingObjFront := false
  99  	var clausesZH []Clause
 100  	clauseRelZH := ClauseRoot
 101  	clauseParentZH := int16(-1)
 102  
 103  	for _, tok := range tokens {
 104  		// Clause-boundary token (synthetic 、 from tokenizer punctuation).
 105  		if tok == "\xe3\x80\x81" {
 106  			if len(slots) > 0 {
 107  				if len(roles) > 0 && slotModKinds[len(roles)-1] != MKCop {
 108  					if !sawVerb {
 109  						roles[len(roles)-1] = HistVerb
 110  					}
 111  				}
 112  				clauseSet := buildSetFromSlices(
 113  					slots, roles, slotMorphs,
 114  					slotMarkers, slotOblRoles, slotHeads, slotModKinds,
 115  				)
 116  				nextParent := int16(len(clausesZH))
 117  				clausesZH = append(clausesZH, Clause{
 118  					Set: clauseSet, Relation: clauseRelZH,
 119  					Parent: clauseParentZH, HostIdx: -1,
 120  				})
 121  				slots = nil
 122  				roles = nil
 123  				slotMarkers = nil
 124  				slotMorphs = nil
 125  				slotOblRoles = nil
 126  				slotHeads = nil
 127  				slotModKinds = nil
 128  				sawVerb = false
 129  				pendingRole = HistSubject
 130  				pendingMark = 0
 131  				pendingOblRole = ORNone
 132  				pendingCop = false
 133  				pendingCopHead = -1
 134  				pendingObjFront = false
 135  				clauseRelZH = ClauseAnd
 136  				clauseParentZH = nextParent - 1
 137  			}
 138  			continue
 139  		}
 140  
 141  		// Structural marker dispatch.
 142  		if mk, isMk := zhMarkerToMarker()[tok]; isMk {
 143  			pat = append(pat, mk)
 144  			switch mk {
 145  			case MkNo:
 146  				// 的 / 之: preceding slot is POSS modifier of next slot.
 147  				// Same as JA の handling.
 148  				if len(slots) > 0 {
 149  					roles[len(roles)-1] = HistOperator
 150  					slotMarkers[len(slotMarkers)-1] = mk
 151  					slotOblRoles[len(slotOblRoles)-1] = ORPart
 152  				}
 153  			case MkIn:
 154  				pendingRole = HistScope
 155  				pendingMark = mk
 156  				pendingOblRole = ORLoc
 157  			case MkFrom:
 158  				pendingRole = HistScope
 159  				pendingMark = mk
 160  				pendingOblRole = ORSource
 161  			case MkTo_EN:
 162  				// 给 / 給: recipient marker, similar to JA に in dative use.
 163  				pendingRole = HistModifier
 164  				pendingMark = mk
 165  				pendingOblRole = ORRecip
 166  			case MkBy:
 167  				// 被: passive agent marker.
 168  				pendingRole = HistModifier
 169  				pendingMark = mk
 170  				pendingOblRole = ORAgent
 171  			case MkAnd:
 172  				// Coordination: leave existing role on prior slot; the next
 173  				// content word inherits the prior's role.
 174  				// (Simple version - no MKCoord wiring yet.)
 175  			}
 176  			continue
 177  		}
 178  
 179  		// Aspect markers attach to the preceding verb's morph.
 180  		if m, isAsp := zhAspectMarker()[tok]; isAsp {
 181  			if len(slotMorphs) > 0 {
 182  				slotMorphs[len(slotMorphs)-1] |= m
 183  			}
 184  			continue
 185  		}
 186  
 187  		// Negation markers attach to the upcoming verb.
 188  		if m, isNeg := zhNegMarker()[tok]; isNeg {
 189  			// Apply to the NEXT slot (verb position). Stage via the existing
 190  			// morph-accumulation pattern: since we don't have a pendingNeg
 191  			// here, we'll combine with the next emitted slot's morph by
 192  			// using slotMorphs append-time logic. Simplest: track in a local.
 193  			_ = m
 194  			// Stage via a sentinel: append a zero morph and OR it onto
 195  			// the next slot. Defer for v0; for now, ignore negation.
 196  			continue
 197  		}
 198  
 199  		// Copula 是: stage MKCop for the next noun.
 200  		if tok == zhCopula {
 201  			subj := findSubjectIdx(roles, slotHeads)
 202  			if subj >= 0 {
 203  				pendingCop = true
 204  				pendingCopHead = subj
 205  				pendingRole = HistComplement
 206  			}
 207  			continue
 208  		}
 209  
 210  		// 把: OBJ-fronting. The next content word is the OBJ.
 211  		if tok == zhObjFronter {
 212  			pendingRole = HistObject
 213  			pendingObjFront = true
 214  			continue
 215  		}
 216  
 217  		// Content word: emit as a slot.
 218  		pat = append(pat, SlotNoun)
 219  		slots = append(slots, tok)
 220  		role := pendingRole
 221  		if !sawVerb && !pendingCop && !pendingObjFront && pendingMark == 0 {
 222  			role = HistSubject
 223  		}
 224  		roles = append(roles, role)
 225  		slotMorphs = append(slotMorphs, 0)
 226  		slotMarkers = append(slotMarkers, pendingMark)
 227  		slotOblRoles = append(slotOblRoles, pendingOblRole)
 228  		slotHeads = append(slotHeads, -1)
 229  		slotModKinds = append(slotModKinds, MKNone)
 230  		newIdx := int16(len(slots) - 1)
 231  		// Copula resolution.
 232  		if pendingCop {
 233  			slotHeads[newIdx] = pendingCopHead
 234  			slotModKinds[newIdx] = MKCop
 235  			pendingCop = false
 236  			pendingCopHead = -1
 237  		}
 238  		// Position-based verb detection: second content word in a clause
 239  		// with no copula and no pre-positioned obj is the verb.
 240  		if !sawVerb && !pendingCop && pendingMark == 0 && pendingOblRole == ORNone {
 241  			// Heuristic: the second content word is the verb in SVO.
 242  			contentSoFar := 0
 243  			for _, r := range roles {
 244  				if r == HistSubject || r == HistVerb {
 245  					contentSoFar++
 246  				}
 247  			}
 248  			if contentSoFar == 2 {
 249  				// Demote this slot to HistVerb.
 250  				roles[newIdx] = HistVerb
 251  				sawVerb = true
 252  				pendingRole = HistObject
 253  			}
 254  		} else if sawVerb && pendingMark == 0 && !pendingCop {
 255  			pendingRole = HistObject
 256  		}
 257  		pendingMark = 0
 258  		pendingOblRole = ORNone
 259  		pendingObjFront = false
 260  	}
 261  
 262  	// Finalize last clause.
 263  	set := buildSetFromSlices(
 264  		slots, roles, slotMorphs,
 265  		slotMarkers, slotOblRoles, slotHeads, slotModKinds,
 266  	)
 267  	clausesZH = append(clausesZH, Clause{
 268  		Set: set, Relation: clauseRelZH,
 269  		Parent: clauseParentZH, HostIdx: -1,
 270  	})
 271  
 272  	flatSlots := []string{:0:len(slots)}
 273  	flatRoles := []int32{:0:len(slots)}
 274  	for _, c := range clausesZH {
 275  		for _, e := range c.Set {
 276  			flatSlots = append(flatSlots, e.Atom)
 277  			flatRoles = append(flatRoles, e.Role)
 278  		}
 279  	}
 280  
 281  	return ExtractResult{
 282  		Pattern: pat, Slots: flatSlots, Roles: flatRoles,
 283  		DeepPat: buildDeepPat(flatRoles), Set: clausesZH[0].Set,
 284  		Discourse: clausesZH,
 285  	}
 286  }
 287