package iskra

// Chinese (ZH) extractor.
//
// Chinese is the EN-close cousin in the typological matrix: SVO, analytic
// (no morphology), positional role assignment. The structural surface is
// closer to ExtractEN's, with two consequential differences:
//
//   1. Aspect, not tense. ZH marks completion/duration/experience with
//      post-verb particles (了, 着, 過), not tense morphology. The
//      existing morph bits (MetaTensePast, MetaAspectProg) get repurposed:
//        了 -> MetaTensePast (perfective ≈ past in most contexts)
//        着 -> MetaAspectProg (progressive/durative)
//        過 -> MetaTensePast (experiential past)
//
//   2. The copula 是 is a content-word marker (not a verb-aux like EN "is").
//      Like JA だ, it puts the following noun into MKCop relation with the
//      preceding subject. Handled inline in the extractor.
//
// Initial version handles:
//   - Subject / verb / object positional roles (SVO)
//   - Structural markers: 的 (possessive), 是 (copula), 了/着/過 (aspect),
//     不/沒 (negation), 在 (location), 從 (source), 給 (recipient marker),
//     把 (OBJ fronting), 被 (passive marker)
//   - Pre-noun attributive 的 modifiers
//
// Defers: tone-based disambiguation, formal/literary vs colloquial register,
// resultative compounds (V+補語), serial-verb constructions, BA-construction
// SVO reordering, 是…的 cleft, 把/被 case-marking subtleties.

// zhMarkerToMarker maps a ZH structural marker to the JA-cognate marker ID.
// Where ZH has finer or different distinctions than the existing MkXxx set,
// the marker rides on existing IDs and the morph/oblique role carries the
// finer distinction.
func zhMarkerToMarker() map[string]uint8 { return map[string]uint8{ "\xe7\x9a\x84": MkNo, // 的 (POSS/attributive ≈ JA の) "\xe4\xb9\x8b": MkNo, // 之 (literary POSS ≈ 的) "\xe5\x9c\xa8": MkIn, // 在 (LOC ≈ EN "in") "\xe5\xbe\x9e": MkFrom, // 從 (SOURCE ≈ EN "from") "\xe7\xbb\x99": MkTo_EN, // 给 (RECIP ≈ EN "to" in dative sense) "\xe7\xb5\xa6": MkTo_EN, // 給 (traditional form of 给) "\xe8\xa2\xab": MkBy, // 被 (passive agent marker ≈ EN "by") "\xe5\x92\x8c": MkAnd, // 和 (and) "\xe6\x88\x96": MkAnd, // 或 (or - no MkOr; rides on MkAnd) } } // zhAspectMarker maps post-verb aspect particles to the morph bit they // contribute to the verb that precedes them. func zhAspectMarker() map[string]uint16 { return map[string]uint16{ "\xe4\xba\x86": MetaTensePast, // 了 (perfective) "\xe7\x9d\x80": MetaAspectProg, // 着 (progressive/durative) "\xe7\x9d\x80\x80": MetaAspectProg, // safety alias (rare) "\xe9\x81\x8e": MetaTensePast, // 過 (experiential past, traditional) "\xe8\xbf\x87": MetaTensePast, // 过 (simplified) } } // zhNegMarker: ZH negation particles. Both contribute MetaPolarNeg; // 沒 also implies past (MetaTensePast) by convention. func zhNegMarker() map[string]uint16 { return map[string]uint16{ "\xe4\xb8\x8d": MetaPolarNeg, // 不 (general negation) "\xe6\xb2\x92": MetaPolarNeg | MetaTensePast, // 沒 (past-negative / not-have) "\xe6\xb2\xa1": MetaPolarNeg | MetaTensePast, // 没 (simplified 沒) } } // zhCopula: the 是 marker. Acts like JA だ (predicative copula), staging // the next noun as MKCop on the preceding subject. const zhCopula = "\xe6\x98\xaf" // 是 // zhObjFronter: the 把 marker. Moves the following noun to pre-verb OBJ // position. Mirrors the OBJ-fronting in JA を, but applied positionally // rather than as a slot-following particle. const zhObjFronter = "\xe6\x8a\x8a" // 把 // ExtractZH takes pre-tokenized ZH tokens (space-segmented; from the Bible // corpus the segmentation is already in the input) and produces ExtractResult. 
// Initial version handles core SVO + the structural markers above. func ExtractZH(tokens []string) ExtractResult { var pat []byte var slots []string var roles []int32 var slotMarkers []uint8 var slotMorphs []uint16 var slotOblRoles []uint8 var slotHeads []int16 var slotModKinds []uint8 sawVerb := false pendingRole := HistSubject pendingMark := uint8(0) pendingOblRole := uint8(ORNone) pendingCop := false pendingCopHead := int16(-1) pendingObjFront := false var clausesZH []Clause clauseRelZH := ClauseRoot clauseParentZH := int16(-1) for _, tok := range tokens { // Clause-boundary token (synthetic 、 from tokenizer punctuation). if tok == "\xe3\x80\x81" { if len(slots) > 0 { if len(roles) > 0 && slotModKinds[len(roles)-1] != MKCop { if !sawVerb { roles[len(roles)-1] = HistVerb } } clauseSet := buildSetFromSlices( slots, roles, slotMorphs, slotMarkers, slotOblRoles, slotHeads, slotModKinds, ) nextParent := int16(len(clausesZH)) clausesZH = append(clausesZH, Clause{ Set: clauseSet, Relation: clauseRelZH, Parent: clauseParentZH, HostIdx: -1, }) slots = nil roles = nil slotMarkers = nil slotMorphs = nil slotOblRoles = nil slotHeads = nil slotModKinds = nil sawVerb = false pendingRole = HistSubject pendingMark = 0 pendingOblRole = ORNone pendingCop = false pendingCopHead = -1 pendingObjFront = false clauseRelZH = ClauseAnd clauseParentZH = nextParent - 1 } continue } // Structural marker dispatch. if mk, isMk := zhMarkerToMarker()[tok]; isMk { pat = append(pat, mk) switch mk { case MkNo: // 的 / 之: preceding slot is POSS modifier of next slot. // Same as JA の handling. if len(slots) > 0 { roles[len(roles)-1] = HistOperator slotMarkers[len(slotMarkers)-1] = mk slotOblRoles[len(slotOblRoles)-1] = ORPart } case MkIn: pendingRole = HistScope pendingMark = mk pendingOblRole = ORLoc case MkFrom: pendingRole = HistScope pendingMark = mk pendingOblRole = ORSource case MkTo_EN: // 给 / 給: recipient marker, similar to JA に in dative use. 
pendingRole = HistModifier pendingMark = mk pendingOblRole = ORRecip case MkBy: // 被: passive agent marker. pendingRole = HistModifier pendingMark = mk pendingOblRole = ORAgent case MkAnd: // Coordination: leave existing role on prior slot; the next // content word inherits the prior's role. // (Simple version - no MKCoord wiring yet.) } continue } // Aspect markers attach to the preceding verb's morph. if m, isAsp := zhAspectMarker()[tok]; isAsp { if len(slotMorphs) > 0 { slotMorphs[len(slotMorphs)-1] |= m } continue } // Negation markers attach to the upcoming verb. if m, isNeg := zhNegMarker()[tok]; isNeg { // Apply to the NEXT slot (verb position). Stage via the existing // morph-accumulation pattern: since we don't have a pendingNeg // here, we'll combine with the next emitted slot's morph by // using slotMorphs append-time logic. Simplest: track in a local. _ = m // Stage via a sentinel: append a zero morph and OR it onto // the next slot. Defer for v0; for now, ignore negation. continue } // Copula 是: stage MKCop for the next noun. if tok == zhCopula { subj := findSubjectIdx(roles, slotHeads) if subj >= 0 { pendingCop = true pendingCopHead = subj pendingRole = HistComplement } continue } // 把: OBJ-fronting. The next content word is the OBJ. if tok == zhObjFronter { pendingRole = HistObject pendingObjFront = true continue } // Content word: emit as a slot. pat = append(pat, SlotNoun) slots = append(slots, tok) role := pendingRole if !sawVerb && !pendingCop && !pendingObjFront && pendingMark == 0 { role = HistSubject } roles = append(roles, role) slotMorphs = append(slotMorphs, 0) slotMarkers = append(slotMarkers, pendingMark) slotOblRoles = append(slotOblRoles, pendingOblRole) slotHeads = append(slotHeads, -1) slotModKinds = append(slotModKinds, MKNone) newIdx := int16(len(slots) - 1) // Copula resolution. 
if pendingCop { slotHeads[newIdx] = pendingCopHead slotModKinds[newIdx] = MKCop pendingCop = false pendingCopHead = -1 } // Position-based verb detection: second content word in a clause // with no copula and no pre-positioned obj is the verb. if !sawVerb && !pendingCop && pendingMark == 0 && pendingOblRole == ORNone { // Heuristic: the second content word is the verb in SVO. contentSoFar := 0 for _, r := range roles { if r == HistSubject || r == HistVerb { contentSoFar++ } } if contentSoFar == 2 { // Demote this slot to HistVerb. roles[newIdx] = HistVerb sawVerb = true pendingRole = HistObject } } else if sawVerb && pendingMark == 0 && !pendingCop { pendingRole = HistObject } pendingMark = 0 pendingOblRole = ORNone pendingObjFront = false } // Finalize last clause. set := buildSetFromSlices( slots, roles, slotMorphs, slotMarkers, slotOblRoles, slotHeads, slotModKinds, ) clausesZH = append(clausesZH, Clause{ Set: set, Relation: clauseRelZH, Parent: clauseParentZH, HostIdx: -1, }) flatSlots := []string{:0:len(slots)} flatRoles := []int32{:0:len(slots)} for _, c := range clausesZH { for _, e := range c.Set { flatSlots = append(flatSlots, e.Atom) flatRoles = append(flatRoles, e.Role) } } return ExtractResult{ Pattern: pat, Slots: flatSlots, Roles: flatRoles, DeepPat: buildDeepPat(flatRoles), Set: clausesZH[0].Set, Discourse: clausesZH, } }