extract_zh.mx raw
1 package iskra
2
3 // Chinese (ZH) extractor.
4 //
5 // Chinese is the EN-close cousin in the typological matrix: SVO, analytic
6 // (no morphology), positional role assignment. The structural surface is
7 // closer to ExtractEN's, with two consequential differences:
8 //
9 // 1. Aspect, not tense. ZH marks completion/duration/experience with
10 // post-verb particles (了, 着, 過), not tense morphology. The
11 // existing morph bits (MetaTensePast, MetaAspectProg) get repurposed:
12 // 了 -> MetaTensePast (perfective ≈ past in most contexts)
13 // 着 -> MetaAspectProg (progressive/durative)
14 // 過 -> MetaTensePast (experiential past)
15 //
16 // 2. The copula 是 is a content-word marker (not a verb-aux like EN "is").
17 // Like JA だ, it puts the following noun into MKCop relation with the
18 // preceding subject. Handled inline in the extractor.
19 //
20 // Initial version handles:
21 // - Subject / verb / object positional roles (SVO)
22 // - Structural markers: 的 (possessive), 是 (copula), 了/着/過 (aspect),
23 // 不/沒 (negation), 在 (location), 從 (source), 給 (recipient marker),
24 // 把 (OBJ fronting), 被 (passive marker)
25 // - Pre-noun attributive 的 modifiers
26 //
27 // Defers: tone-based disambiguation, formal/literary vs colloquial register,
28 // resultative compounds (V+補語), serial-verb constructions, BA-construction
29 // SVO reordering, 是…的 cleft, 把/被 case-marking subtleties.
30
31 // zhMarkerToRole maps a ZH structural marker to the JA-cognate marker ID.
32 // Where ZH has finer or different distinctions than the existing MkXxx set,
33 // the marker rides on existing IDs and the morph/oblique role carries the
34 // finer distinction.
35 func zhMarkerToMarker() map[string]uint8 {
36 return map[string]uint8{
37 "\xe7\x9a\x84": MkNo, // 的 (POSS/attributive ≈ JA の)
38 "\xe4\xb9\x8b": MkNo, // 之 (literary POSS ≈ 的)
39 "\xe5\x9c\xa8": MkIn, // 在 (LOC ≈ EN "in")
40 "\xe5\xbe\x9e": MkFrom, // 從 (SOURCE ≈ EN "from")
41 "\xe7\xbb\x99": MkTo_EN, // 给 (RECIP ≈ EN "to" in dative sense)
42 "\xe7\xb5\xa6": MkTo_EN, // 給 (traditional form of 给)
43 "\xe8\xa2\xab": MkBy, // 被 (passive agent marker ≈ EN "by")
44 "\xe5\x92\x8c": MkAnd, // 和 (and)
45 "\xe6\x88\x96": MkAnd, // 或 (or - no MkOr; rides on MkAnd)
46 }
47 }
48
49 // zhAspectMarker maps post-verb aspect particles to the morph bit they
50 // contribute to the verb that precedes them.
51 func zhAspectMarker() map[string]uint16 {
52 return map[string]uint16{
53 "\xe4\xba\x86": MetaTensePast, // 了 (perfective)
54 "\xe7\x9d\x80": MetaAspectProg, // 着 (progressive/durative)
55 "\xe7\x9d\x80\x80": MetaAspectProg, // safety alias (rare)
56 "\xe9\x81\x8e": MetaTensePast, // 過 (experiential past, traditional)
57 "\xe8\xbf\x87": MetaTensePast, // 过 (simplified)
58 }
59 }
60
61 // zhNegMarker: ZH negation particles. Both contribute MetaPolarNeg;
62 // 沒 also implies past (MetaTensePast) by convention.
63 func zhNegMarker() map[string]uint16 {
64 return map[string]uint16{
65 "\xe4\xb8\x8d": MetaPolarNeg, // 不 (general negation)
66 "\xe6\xb2\x92": MetaPolarNeg | MetaTensePast, // 沒 (past-negative / not-have)
67 "\xe6\xb2\xa1": MetaPolarNeg | MetaTensePast, // 没 (simplified 沒)
68 }
69 }
70
// Single-token structural markers that ExtractZH recognises by exact string
// comparison rather than through a lookup table.
const (
	// zhCopula is the 是 marker. It acts like JA だ (predicative copula):
	// the extractor stages the next content word as an MKCop dependent of
	// the preceding subject.
	zhCopula = "\xe6\x98\xaf" // 是

	// zhObjFronter is the 把 marker. The content word that follows it is
	// given the OBJ role even though it sits before the verb. This mirrors
	// the OBJ marking of JA を, but is applied positionally rather than as
	// a slot-following particle.
	zhObjFronter = "\xe6\x8a\x8a" // 把
)
79
80 // ExtractZH takes pre-tokenized ZH tokens (space-segmented; from the Bible
81 // corpus the segmentation is already in the input) and produces ExtractResult.
82 // Initial version handles core SVO + the structural markers above.
83 func ExtractZH(tokens []string) ExtractResult {
84 var pat []byte
85 var slots []string
86 var roles []int32
87 var slotMarkers []uint8
88 var slotMorphs []uint16
89 var slotOblRoles []uint8
90 var slotHeads []int16
91 var slotModKinds []uint8
92 sawVerb := false
93 pendingRole := HistSubject
94 pendingMark := uint8(0)
95 pendingOblRole := uint8(ORNone)
96 pendingCop := false
97 pendingCopHead := int16(-1)
98 pendingObjFront := false
99 var clausesZH []Clause
100 clauseRelZH := ClauseRoot
101 clauseParentZH := int16(-1)
102
103 for _, tok := range tokens {
104 // Clause-boundary token (synthetic 、 from tokenizer punctuation).
105 if tok == "\xe3\x80\x81" {
106 if len(slots) > 0 {
107 if len(roles) > 0 && slotModKinds[len(roles)-1] != MKCop {
108 if !sawVerb {
109 roles[len(roles)-1] = HistVerb
110 }
111 }
112 clauseSet := buildSetFromSlices(
113 slots, roles, slotMorphs,
114 slotMarkers, slotOblRoles, slotHeads, slotModKinds,
115 )
116 nextParent := int16(len(clausesZH))
117 clausesZH = append(clausesZH, Clause{
118 Set: clauseSet, Relation: clauseRelZH,
119 Parent: clauseParentZH, HostIdx: -1,
120 })
121 slots = nil
122 roles = nil
123 slotMarkers = nil
124 slotMorphs = nil
125 slotOblRoles = nil
126 slotHeads = nil
127 slotModKinds = nil
128 sawVerb = false
129 pendingRole = HistSubject
130 pendingMark = 0
131 pendingOblRole = ORNone
132 pendingCop = false
133 pendingCopHead = -1
134 pendingObjFront = false
135 clauseRelZH = ClauseAnd
136 clauseParentZH = nextParent - 1
137 }
138 continue
139 }
140
141 // Structural marker dispatch.
142 if mk, isMk := zhMarkerToMarker()[tok]; isMk {
143 pat = append(pat, mk)
144 switch mk {
145 case MkNo:
146 // 的 / 之: preceding slot is POSS modifier of next slot.
147 // Same as JA の handling.
148 if len(slots) > 0 {
149 roles[len(roles)-1] = HistOperator
150 slotMarkers[len(slotMarkers)-1] = mk
151 slotOblRoles[len(slotOblRoles)-1] = ORPart
152 }
153 case MkIn:
154 pendingRole = HistScope
155 pendingMark = mk
156 pendingOblRole = ORLoc
157 case MkFrom:
158 pendingRole = HistScope
159 pendingMark = mk
160 pendingOblRole = ORSource
161 case MkTo_EN:
162 // 给 / 給: recipient marker, similar to JA に in dative use.
163 pendingRole = HistModifier
164 pendingMark = mk
165 pendingOblRole = ORRecip
166 case MkBy:
167 // 被: passive agent marker.
168 pendingRole = HistModifier
169 pendingMark = mk
170 pendingOblRole = ORAgent
171 case MkAnd:
172 // Coordination: leave existing role on prior slot; the next
173 // content word inherits the prior's role.
174 // (Simple version - no MKCoord wiring yet.)
175 }
176 continue
177 }
178
179 // Aspect markers attach to the preceding verb's morph.
180 if m, isAsp := zhAspectMarker()[tok]; isAsp {
181 if len(slotMorphs) > 0 {
182 slotMorphs[len(slotMorphs)-1] |= m
183 }
184 continue
185 }
186
187 // Negation markers attach to the upcoming verb.
188 if m, isNeg := zhNegMarker()[tok]; isNeg {
189 // Apply to the NEXT slot (verb position). Stage via the existing
190 // morph-accumulation pattern: since we don't have a pendingNeg
191 // here, we'll combine with the next emitted slot's morph by
192 // using slotMorphs append-time logic. Simplest: track in a local.
193 _ = m
194 // Stage via a sentinel: append a zero morph and OR it onto
195 // the next slot. Defer for v0; for now, ignore negation.
196 continue
197 }
198
199 // Copula 是: stage MKCop for the next noun.
200 if tok == zhCopula {
201 subj := findSubjectIdx(roles, slotHeads)
202 if subj >= 0 {
203 pendingCop = true
204 pendingCopHead = subj
205 pendingRole = HistComplement
206 }
207 continue
208 }
209
210 // 把: OBJ-fronting. The next content word is the OBJ.
211 if tok == zhObjFronter {
212 pendingRole = HistObject
213 pendingObjFront = true
214 continue
215 }
216
217 // Content word: emit as a slot.
218 pat = append(pat, SlotNoun)
219 slots = append(slots, tok)
220 role := pendingRole
221 if !sawVerb && !pendingCop && !pendingObjFront && pendingMark == 0 {
222 role = HistSubject
223 }
224 roles = append(roles, role)
225 slotMorphs = append(slotMorphs, 0)
226 slotMarkers = append(slotMarkers, pendingMark)
227 slotOblRoles = append(slotOblRoles, pendingOblRole)
228 slotHeads = append(slotHeads, -1)
229 slotModKinds = append(slotModKinds, MKNone)
230 newIdx := int16(len(slots) - 1)
231 // Copula resolution.
232 if pendingCop {
233 slotHeads[newIdx] = pendingCopHead
234 slotModKinds[newIdx] = MKCop
235 pendingCop = false
236 pendingCopHead = -1
237 }
238 // Position-based verb detection: second content word in a clause
239 // with no copula and no pre-positioned obj is the verb.
240 if !sawVerb && !pendingCop && pendingMark == 0 && pendingOblRole == ORNone {
241 // Heuristic: the second content word is the verb in SVO.
242 contentSoFar := 0
243 for _, r := range roles {
244 if r == HistSubject || r == HistVerb {
245 contentSoFar++
246 }
247 }
248 if contentSoFar == 2 {
249 // Demote this slot to HistVerb.
250 roles[newIdx] = HistVerb
251 sawVerb = true
252 pendingRole = HistObject
253 }
254 } else if sawVerb && pendingMark == 0 && !pendingCop {
255 pendingRole = HistObject
256 }
257 pendingMark = 0
258 pendingOblRole = ORNone
259 pendingObjFront = false
260 }
261
262 // Finalize last clause.
263 set := buildSetFromSlices(
264 slots, roles, slotMorphs,
265 slotMarkers, slotOblRoles, slotHeads, slotModKinds,
266 )
267 clausesZH = append(clausesZH, Clause{
268 Set: set, Relation: clauseRelZH,
269 Parent: clauseParentZH, HostIdx: -1,
270 })
271
272 flatSlots := []string{:0:len(slots)}
273 flatRoles := []int32{:0:len(slots)}
274 for _, c := range clausesZH {
275 for _, e := range c.Set {
276 flatSlots = append(flatSlots, e.Atom)
277 flatRoles = append(flatRoles, e.Role)
278 }
279 }
280
281 return ExtractResult{
282 Pattern: pat, Slots: flatSlots, Roles: flatRoles,
283 DeepPat: buildDeepPat(flatRoles), Set: clausesZH[0].Set,
284 Discourse: clausesZH,
285 }
286 }
287