extract.mx raw
1 package iskra
2
// UntranslatedMarker is the placeholder emitted by cross-language
// translation when no atom-link mapping exists for the source atom.
// Renderers emit it verbatim and skip inflection. The marker is plain
// ASCII (no language-specific bytes) and is visible in output for diagnosis.
const UntranslatedMarker = "[missing]"
8
// SetEntry represents one element of the sentence-as-set abstraction.
//
// Lossless canonical form: Role + Atom + Morph + Class + OblRole + Head + ModKind.
// Mark is not part of that list; it records the original particle or
// preposition for within-language preservation only.
//
// Three-layer role schema:
//
//   - Role: core grammatical role (Subject/Object/Verb/Topic/etc) from
//     the histogram. RRG macroroles.
//   - OblRole: thematic/oblique role (Goal/Loc/Instr/etc) for adjunct slots.
//     Language-independent. One of the OR* constants.
//   - Head/ModKind: structural relation to another entry in the Set. The entry
//     is a top-level argument when Head=-1; otherwise it modifies the entry
//     at index Head with relation ModKind (one of the MK* constants).
//
// Head is an index into the current Set. Valid only when the slot order is
// the canonical extraction order for the language (modifier-before-head for
// POSS and ATTR in both JA and EN). If translation ever reorders entries, or
// if relative-clause modifiers introduce post-head ordering, switch to stable
// IDs.
type SetEntry struct {
	Role    int32  // macrorole (Subject/Object/Verb/Topic/...)
	Atom    string // region center (lemma/stem)
	Morph   uint16 // tense|aspect|polarity|formality|number|def|mood|3sg|passive|causative|...
	Class   uint8  // verb class (for verbs): 1=ichidan, 2-10=godan variants
	Mark    uint8  // original particle/preposition (within-language preservation)
	OblRole uint8  // thematic role: Goal/Loc/Instr/etc. ORNone for core args.
	Head    int16  // index of head entry; -1 if top-level argument
	ModKind uint8  // modification kind: one of the MK* constants; MKNone when unset
}
38
// Modification kinds (for Head/ModKind nesting).
//
// Layered semantics:
//   - POSS/ATTR: structural modifier of another argument (Head points at the
//     modified entry). The modifier doesn't have a clause-level role on its own.
//   - COP: copular predicate of the subject. The complement asserts an identity
//     or attribution about the entry at Head. No verb slot exists in the clause.
//     The complement's Morph carries tense/aspect/polite (だ vs です vs だった).
//   - ADJ: predicative adjective; like COP, Head points at the subject.
//   - ADV: adverbial modifier; Head points at the verb.
//   - COORD: coordination peer; Head points at the first conjunct.
//   - REL: relative-clause predicate; the modifier verb's Head is the host noun.
//   - APP: reserved, not yet implemented.
//
// The numeric values are spelled out explicitly rather than derived, so
// inserting a new kind never renumbers the existing ones.
const (
	MKNone  uint8 = 0
	MKPoss  uint8 = 1 // possessive (私の魚 / my fish)
	MKAttr  uint8 = 2 // attributive (赤い車 / red car) - adjective modifies noun
	MKCop   uint8 = 3 // copular predicate (学生だ / is a student); Head = subject
	MKAdv   uint8 = 4 // adverbial (速く走る / runs fast); Head = verb
	MKCoord uint8 = 5 // coordination peer (猫と犬 / cats and dogs); Head = first conjunct
	MKRel   uint8 = 6 // relative clause; modifier verb's Head=host noun
	MKApp   uint8 = 7 // RESERVED: apposition; not yet implemented
	MKAdj   uint8 = 8 // predicative adjective (面白い / is interesting); Head=subject
)
59
// Oblique role values. Language-independent thematic relations.
// Use these for adjunct slots; ORNone means the slot's role is purely macro
// (Subject/Object/Verb).
//
// The trailing comment on each constant lists the EN prepositions and JA
// particles that typically express the relation.
const (
	ORNone    uint8 = 0
	ORGoal    uint8 = 1  // to, へ, に-motion
	ORLoc     uint8 = 2  // in/on/at, で-location, に-stative
	ORSource  uint8 = 3  // from, から
	ORLimit   uint8 = 4  // until, まで
	ORInstr   uint8 = 5  // with-instrument, で-instrumental
	ORComit   uint8 = 6  // with-companion, と
	ORBenef   uint8 = 7  // for, のために
	ORAgent   uint8 = 8  // by, によって (passive agent)
	ORRecip   uint8 = 9  // to-recipient, に-dative
	ORPart    uint8 = 10 // of, の-partitive/genitive
	ORCompare uint8 = 11 // than, より (standard of comparison)
)
77
// ExtractResult is the output of pattern extraction from a token sequence.
//
// Set vs Discourse: Set holds the root clause's role-set; Discourse holds all
// clauses (root + subordinate/coord). For single-clause sentences, Discourse
// has exactly one element whose Set == ExtractResult.Set. Multi-clause
// sentences (clause coordination, conditional, relative clause) populate
// Discourse with multiple Clause entries.
//
// Slots and Roles are parallel slices. ExtractJA fills them by walking every
// clause in Discourse in order and appending each entry's Atom and Role, so
// they span all clauses, not just the root clause.
type ExtractResult struct {
	Pattern   []byte     // encoded pattern (markers + slots)
	Slots     []string   // content word per slot; ExtractJA stores each entry's Atom here
	Roles     []int32    // hist index for each slot (assigned by following marker)
	DeepPat   []uint8    // canonical role sequence (sorted, normalized)
	Set       []SetEntry // root-clause role-set (== Discourse[0].Set when populated)
	Discourse []Clause   // all clauses; len 1 for single-clause inputs
}
93
// Clause is one complete predication within a Discourse.
//
// Single-clause sentences produce one Clause with Relation=ClauseRoot,
// Parent=-1, HostIdx=-1. Multi-clause sentences add more Clauses with
// Relation/Parent/HostIdx specifying how each subordinate or peer clause
// relates to its anchor.
//
// Parent and HostIdx are positional indices, so they are only meaningful
// relative to the Discourse slice (and the parent's Set) they were built for.
type Clause struct {
	Set      []SetEntry     // role-set of this clause (modifier nesting etc. live inside)
	Relation ClauseRelation // how this clause relates to its Parent
	Parent   int16          // index of parent clause in the enclosing Discourse slice; -1 for root
	HostIdx  int16          // for REL: index of modified entry in parent's Set; -1 otherwise
}
106
// ClauseRelation enumerates inter-clause relations in a Discourse.
//
// Asymmetric relations (IF, BECAUSE, REL) point from the subordinate clause
// to its parent. Peer relations (AND, OR, BUT) point from the second clause
// to the first; commutativity is implicit at the semantic level.
//
// The zero value is ClauseRoot, so a zero-initialized Clause reads as a root
// clause as far as Relation is concerned.
type ClauseRelation uint8

const (
	ClauseRoot    ClauseRelation = 0 // root clause; no parent
	ClauseAnd     ClauseRelation = 1 // X and Y - peer
	ClauseOr      ClauseRelation = 2 // X or Y - peer
	ClauseBut     ClauseRelation = 3 // X but Y - peer with contrast
	ClauseIf      ClauseRelation = 4 // if X (then parent) - condition
	ClauseBecause ClauseRelation = 5 // because X (then parent) - cause
	ClauseRel     ClauseRelation = 6 // relative clause modifying parent.Set[HostIdx]
)
123
124 // ExtractJA takes JA tokens (already split on particles) and produces
125 // the structural pattern + content slots.
126 func ExtractJA(tokens []string) ExtractResult {
127 var pat []byte
128 var slots []string
129 var roles []int32
130 var slotMarkers []uint8
131 var slotMorphs []uint16
132 var slotOblRoles []uint8
133 var slotHeads []int16
134 var slotModKinds []uint8
135 pendingRole := HistVerb
136 pendingHead := int16(-1)
137 pendingModKind := uint8(MKNone)
138 pendingCoordHeadJA := int16(-1)
139 // Multi-clause accumulator for JA. Comma 、 signals clause boundary.
140 var clausesJA []Clause
141 clauseRelJA := ClauseRoot
142 clauseParentJA := int16(-1)
143 // Skip-tokens index for the の-relational-noun-に locative compound:
144 // when the pattern is detected at の, we consume the next two tokens
145 // (the relational noun + に) and apply ORLoc to the preceding base noun.
146 skipUntilJA := -1
147
148 for i, tok := range tokens {
149 if i <= skipUntilJA {
150 continue
151 }
152 // もし at clause start signals a conditional clause (ClauseIf).
153 // Consume it and mark the current clause's relation.
154 if len(slots) == 0 && tok == "\xe3\x82\x82\xe3\x81\x97" {
155 clauseRelJA = ClauseIf
156 continue
157 }
158 // JA comma 、 (E3 80 81) signals a clause boundary. Finalize the
159 // current clause, reset per-slot state, mark next clause as ClauseAnd.
160 if tok == "\xe3\x80\x81" {
161 if len(slots) > 0 {
162 lastIdx := len(slots) - 1
163 // Apply same predicate-shape detection as end-of-input does:
164 // copula strip (学生だ → student MKCop) then predicate-i-adj
165 // (面白い → MKAdj) then fall back to last-slot=HistVerb.
166 appliedPred := false
167 if slotModKinds[lastIdx] != MKCop {
168 if stripped, ok, copMorph := stripJACopula(slots[lastIdx]); ok && len(stripped) > 0 {
169 slots[lastIdx] = stripped
170 slotMorphs[lastIdx] |= copMorph
171 slotModKinds[lastIdx] = MKCop
172 slotHeads[lastIdx] = findSubjectIdx(roles, slotHeads)
173 roles[lastIdx] = HistComplement
174 appliedPred = true
175 } else if endsInIKana(slots[lastIdx]) && len(slots[lastIdx]) > 3 &&
176 !endsInNaiSuffix(slots[lastIdx]) && !endsInTaiSuffix(slots[lastIdx]) &&
177 slotHeads[lastIdx] < 0 && slotModKinds[lastIdx] == MKNone {
178 slotModKinds[lastIdx] = MKAdj
179 slotHeads[lastIdx] = findSubjectIdx(roles, slotHeads)
180 roles[lastIdx] = HistComplement
181 appliedPred = true
182 }
183 }
184 if !appliedPred && len(roles) > 0 && slotModKinds[lastIdx] != MKCop {
185 roles[lastIdx] = HistVerb
186 }
187 clauseSet := buildSetFromSlices(
188 slots, roles, slotMorphs,
189 slotMarkers, slotOblRoles, slotHeads, slotModKinds,
190 )
191 nextParent := int16(len(clausesJA))
192 clausesJA = append(clausesJA, Clause{
193 Set: clauseSet, Relation: clauseRelJA,
194 Parent: clauseParentJA, HostIdx: -1,
195 })
196 slots = nil
197 roles = nil
198 slotMarkers = nil
199 slotMorphs = nil
200 slotOblRoles = nil
201 slotHeads = nil
202 slotModKinds = nil
203 pendingRole = HistVerb
204 pendingHead = -1
205 pendingModKind = MKNone
206 pendingCoordHeadJA = -1
207 clauseRelJA = ClauseAnd
208 clauseParentJA = nextParent - 1
209 }
210 continue
211 }
212 mk, isMk := jaParticleToMarker()[tok]
213 if isMk {
214 pat = append(pat, mk)
215 // Synthetic morph markers attach to PRECEDING slot's morph.
216 switch mk {
217 case MkDef:
218 if len(slotMorphs) > 0 {
219 slotMorphs[len(slotMorphs)-1] |= MetaDefDef
220 }
221 continue
222 case MkPlural:
223 if len(slotMorphs) > 0 {
224 slotMorphs[len(slotMorphs)-1] |= MetaNumPlural
225 }
226 continue
227 case Mk3Sg:
228 if len(slotMorphs) > 0 {
229 slotMorphs[len(slotMorphs)-1] |= Meta3Sg
230 }
231 continue
232 case MkCopula:
233 // Copula marker - reserved.
234 continue
235 case MkNo:
236 // Locative-compound disambiguation: の followed by a relational
237 // noun (中/上/下/前/後/横/隣/間/内/外) and then に collapses to
238 // an ORLoc oblique on the base noun. 箱の中に = "in the box";
239 // the 中 (inside) is implicit in ORLoc, and the prior 箱 takes
240 // the locative role. Skip the next two tokens (relNoun + に).
241 if i+2 < len(tokens) && len(slots) > 0 &&
242 isJARelationalNoun(tokens[i+1]) &&
243 tokens[i+2] == "\xe3\x81\xab" {
244 lastIdx := len(slots) - 1
245 if lastIdx < len(slotOblRoles) {
246 slotOblRoles[lastIdx] = jaRelationalNounToOblRole(tokens[i+1])
247 }
248 if lastIdx < len(roles) {
249 roles[lastIdx] = HistScope
250 }
251 if lastIdx < len(slotMarkers) {
252 slotMarkers[lastIdx] = MkNi
253 }
254 skipUntilJA = i + 2
255 continue
256 }
257 // の: preceding slot is a POSS modifier of the next slot.
258 if len(slots) > 0 {
259 pendingHead = int16(len(slots) - 1)
260 pendingModKind = MKPoss
261 }
262 if len(roles) > 0 {
263 roles[len(roles)-1] = MarkerToRole(mk)
264 if len(slotMarkers) == len(slots) {
265 slotMarkers[len(slotMarkers)-1] = mk
266 }
267 }
268 continue
269 case MkDe:
270 // で is ambiguous: locative/instrumental particle (家で本を読む
271 // = "read book at home") or te-form of copula で joining two
272 // copular clauses (学生で彼は先生だ = "[I'm a student] and
273 // [he is a teacher]"). Disambiguator: if で is followed by
274 // [noun][は/が], it's te-copula clause-coord.
275 if i+2 < len(tokens) && len(slots) > 0 {
276 next2 := tokens[i+2]
277 if next2 == "\xe3\x81\xaf" || next2 == "\xe3\x81\x8c" {
278 // Te-form copula: mark the preceding noun as MKCop
279 // predicate of the current clause's subject, then
280 // finalize the clause and start a new one.
281 lastIdx := len(slots) - 1
282 slotModKinds[lastIdx] = MKCop
283 slotHeads[lastIdx] = findSubjectIdx(roles, slotHeads)
284 roles[lastIdx] = HistComplement
285 clauseSet := buildSetFromSlices(
286 slots, roles, slotMorphs,
287 slotMarkers, slotOblRoles, slotHeads, slotModKinds,
288 )
289 nextParent := int16(len(clausesJA))
290 clausesJA = append(clausesJA, Clause{
291 Set: clauseSet, Relation: clauseRelJA,
292 Parent: clauseParentJA, HostIdx: -1,
293 })
294 slots = nil
295 roles = nil
296 slotMarkers = nil
297 slotMorphs = nil
298 slotOblRoles = nil
299 slotHeads = nil
300 slotModKinds = nil
301 pendingRole = HistVerb
302 pendingHead = -1
303 pendingModKind = MKNone
304 pendingCoordHeadJA = -1
305 clauseRelJA = ClauseAnd
306 clauseParentJA = nextParent - 1
307 continue
308 }
309 }
310 // Fall through to default for instrumental/locative で.
311 case MkTo:
312 // と is ambiguous: comitative (友達と) or coordination
313 // (猫と犬). Coord heuristic: preceding slot has no Mark yet.
314 // For chained coord (X と Y と Z), all peers point at the
315 // FIRST conjunct, not the previous one - walk up the chain.
316 if len(slots) > 0 && len(slotMarkers) == len(slots) &&
317 (slotMarkers[len(slotMarkers)-1] == 0 ||
318 slotMarkers[len(slotMarkers)-1] == MkTo) {
319 prev := int16(len(slots) - 1)
320 if slotModKinds[prev] == MKCoord {
321 prev = slotHeads[prev]
322 }
323 pendingCoordHeadJA = prev
324 if slotMarkers[len(slotMarkers)-1] == 0 {
325 slotMarkers[len(slotMarkers)-1] = mk
326 }
327 continue
328 }
329 // Fall through to default marker handling for comitative.
330 }
331 if len(roles) > 0 {
332 newRole := MarkerToRole(mk)
333 roles[len(roles)-1] = newRole
334 // Propagate the role backward through any coord chain so
335 // the head conjunct gets the same role as the particle-marked
336 // conjunct (猫と犬が = both subjects, marked via が on 犬).
337 j := len(roles) - 1
338 for j > 0 && slotHeads[j] >= 0 && slotModKinds[j] == MKCoord {
339 j = int32(slotHeads[j])
340 roles[j] = newRole
341 }
342 if len(slotMarkers) == len(slots) {
343 // Keep the existing と Mark on the coord head; only update
344 // non-coord-marker slots' Mark.
345 if slotMarkers[len(slotMarkers)-1] != MkTo {
346 slotMarkers[len(slotMarkers)-1] = mk
347 }
348 }
349 if len(slotOblRoles) == len(slots) {
350 if or := MarkerToOblRole(mk); or != ORNone {
351 slotOblRoles[len(slotOblRoles)-1] = or
352 }
353 }
354 }
355 if i < len(tokens)-1 {
356 pendingRole = HistVerb
357 }
358 } else {
359 pat = append(pat, SlotNoun)
360 slots = append(slots, tok)
361 roles = append(roles, pendingRole)
362 slotMarkers = append(slotMarkers, 0)
363 slotMorphs = append(slotMorphs, 0)
364 slotOblRoles = append(slotOblRoles, ORNone)
365 slotHeads = append(slotHeads, -1)
366 slotModKinds = append(slotModKinds, MKNone)
367 newIdx := int16(len(slots) - 1)
368 // Temporal-noun adverbial: 昨日/今日/明日/etc. - sentence-initial
369 // adjuncts that surface as bare nouns but semantically modify the
370 // clause's verb. Mark as MKAdv with head=-1; resolved at the
371 // final-pass below (binds to the verb slot once it's identified).
372 if isJATemporalNoun(tok) {
373 roles[newIdx] = HistModifier
374 slotModKinds[newIdx] = MKAdv
375 slotHeads[newIdx] = -1
376 }
377 // ば-ending token signals conditional clause: mark this clause
378 // as ClauseIf. The lemmatizer strips ば from the verb separately.
379 if len(tok) >= 3 {
380 tb := []byte(tok)
381 if tb[len(tb)-3] == 0xe3 && tb[len(tb)-2] == 0x81 && tb[len(tb)-1] == 0xb0 {
382 clauseRelJA = ClauseIf
383 }
384 }
385 // Coordination resolution: と connected this slot to the previous.
386 if pendingCoordHeadJA >= 0 && pendingCoordHeadJA < newIdx {
387 slotHeads[newIdx] = pendingCoordHeadJA
388 slotModKinds[newIdx] = MKCoord
389 roles[newIdx] = roles[pendingCoordHeadJA]
390 pendingCoordHeadJA = -1
391 }
392 // Resolve pending POSS modifier from a preceding の.
393 if pendingHead >= 0 && pendingHead < newIdx {
394 slotHeads[pendingHead] = newIdx
395 slotModKinds[pendingHead] = pendingModKind
396 pendingHead = -1
397 pendingModKind = MKNone
398 }
399 // ATTR detection: i-adjective immediately preceding this noun
400 // (no particle between them - we'd have continued out via the
401 // marker branch otherwise). Heuristic: previous slot's atom ends
402 // in い with no intervening particle, and previous slot didn't
403 // already get a modifier role from a particle. Known false
404 // positives: な-adjectives ending in い (きれい), nouns ending
405 // in い (兄). Logged limitation; not silently corrupted because
406 // the comparison metric will catch any resulting drift.
407 if newIdx >= 1 {
408 prev := newIdx - 1
409 if slotHeads[prev] < 0 && slotMarkers[prev] == 0 {
410 prevAtom := slots[prev]
411 if endsInIKana(prevAtom) &&
412 !endsInNaiSuffix(prevAtom) && !endsInTaiSuffix(prevAtom) {
413 slotHeads[prev] = newIdx
414 slotModKinds[prev] = MKAttr
415 } else if endsInKuKana(prevAtom) {
416 slotHeads[prev] = newIdx
417 slotModKinds[prev] = MKAdv
418 } else if isJABareKanjiAdj(prevAtom) {
419 slotHeads[prev] = newIdx
420 slotModKinds[prev] = MKAttr
421 } else if endsInTaKana(prevAtom) {
422 // た-form REL: 食べた猫 (the cat that ate). A past-tense
423 // verb immediately preceding a noun (no particle) is a
424 // relative-clause predicate modifying the noun.
425 slotHeads[prev] = newIdx
426 slotModKinds[prev] = MKRel
427 roles[prev] = HistModifier
428 }
429 }
430 }
431 pendingRole = HistVerb
432 }
433 }
434
435 // Copula detection: if the last slot is a noun ending in だ/です/だった/でした,
436 // it's a copular predicate, not a verb. Strip the copula suffix, mark the
437 // slot with MKCop, point Head at the subject, and DO NOT apply the
438 // last-slot-verb-override.
439 copulaApplied := false
440 if len(slots) > 0 {
441 lastIdx := len(slots) - 1
442 if stripped, ok, copMorph := stripJACopula(slots[lastIdx]); ok {
443 // Verify the stripped result isn't a verb-like stem.
444 // (A verb past form like 食べた must keep た as past suffix, not
445 // be treated as copula. The disambiguator: if stripping leaves
446 // only hiragana that looks like a verb stem, skip copula.)
447 // For now, accept any non-empty stripped result on the last slot.
448 if len(stripped) > 0 {
449 slots[lastIdx] = stripped
450 slotMorphs[lastIdx] |= copMorph
451 slotModKinds[lastIdx] = MKCop
452 slotHeads[lastIdx] = findSubjectIdx(roles, slotHeads)
453 roles[lastIdx] = HistComplement
454 copulaApplied = true
455 }
456 }
457 }
458
459 // Locative-existence copula: if the final verb lemmatizes to いる/ある
460 // and the clause has an ORLoc-marked slot, the いる is the existence
461 // verb (be located). Promote the locative slot to MKCop (parallel to EN
462 // "is in X" representation: [X SCOPE ORLoc MKCop h=subj]) and drop the
463 // existence verb - its semantics is absorbed by the copular link.
464 if !copulaApplied && len(slots) > 1 {
465 lastIdx := len(slots) - 1
466 lastAtom := slots[lastIdx]
467 // いる stem after lemmatization is い (ichidan); ある stem is あ.
468 lem := LemmatizeJA(lastAtom, true)
469 if (lem.Lemma == "\xe3\x81\x84" || lem.Lemma == "\xe3\x81\x82") &&
470 lem.Class == VClassIchidan {
471 for i := 0; i < lastIdx; i++ {
472 if i < len(slotOblRoles) && slotOblRoles[i] == ORLoc {
473 slotModKinds[i] = MKCop
474 slotHeads[i] = findSubjectIdx(roles, slotHeads)
475 // Inherit both the verb's lemma morph AND any synthetic
476 // morph markers attached to the verb slot (Meta3Sg via ◯).
477 slotMorphs[i] |= lem.Morph | slotMorphs[lastIdx]
478 // Drop the existence-verb slot.
479 slots = slots[:lastIdx]
480 roles = roles[:lastIdx]
481 slotMarkers = slotMarkers[:lastIdx]
482 slotMorphs = slotMorphs[:lastIdx]
483 slotOblRoles = slotOblRoles[:lastIdx]
484 slotHeads = slotHeads[:lastIdx]
485 slotModKinds = slotModKinds[:lastIdx]
486 copulaApplied = true
487 break
488 }
489 }
490 }
491 }
492 // Predicative i-adjective detection: when the last slot ends in い and the
493 // copula stripper didn't fire, treat it as an adjectival predicate
494 // (面白い / "is interesting"). Set ModKind=MKAdj, Head=subject, role=
495 // Complement. The default last-slot=HistVerb override below is skipped.
496 // False positives: な-adjectives ending in い (きれい), nouns ending in い
497 // (兄). Accepted limitation, same profile as ATTR detection.
498 predAdjApplied := false
499 if !copulaApplied && len(slots) > 0 {
500 lastIdx := len(slots) - 1
501 atom := slots[lastIdx]
502 if slotHeads[lastIdx] < 0 && slotModKinds[lastIdx] == MKNone &&
503 endsInIKana(atom) && len(atom) > 3 &&
504 !endsInNaiSuffix(atom) && !endsInTaiSuffix(atom) {
505 slotModKinds[lastIdx] = MKAdj
506 slotHeads[lastIdx] = findSubjectIdx(roles, slotHeads)
507 roles[lastIdx] = HistComplement
508 // Comparative: if any earlier slot is ORCompare-marked (より),
509 // the predicate adjective carries MetaCompare.
510 for i := 0; i < lastIdx; i++ {
511 if i < len(slotOblRoles) && slotOblRoles[i] == ORCompare {
512 slotMorphs[lastIdx] |= MetaCompare
513 break
514 }
515 }
516 predAdjApplied = true
517 }
518 }
519 if !copulaApplied && !predAdjApplied && len(roles) > 0 {
520 roles[len(roles)-1] = HistVerb
521 }
522
523 // Passive/causative agent reinterpretation: に defaults to ORLoc (locative)
524 // or ORGoal (motion goal) depending on verb semantics. When the final verb
525 // carries MetaPassive, a に-marked slot is the agent ("by X") - flip its
526 // OblRole to ORAgent and role to HistModifier so EN renders it as "by X".
527 // MetaCausative reinterprets に-marked slot as the causer-agent similarly.
528 // Pre-compute the verb morph by lemmatizing here; buildSetFromSlices below
529 // will re-do the same lemmatization, but the cost is one extra strip on the
530 // last slot - cheap compared to scanning every slot for に at render time.
531 if !copulaApplied && !predAdjApplied && len(slots) > 0 {
532 lastIdx := len(slots) - 1
533 verbLem := LemmatizeJA(slots[lastIdx], true)
534 verbMorph := verbLem.Morph
535 if lastIdx < len(slotMorphs) {
536 verbMorph |= slotMorphs[lastIdx]
537 }
538 if verbMorph&(MetaPassive|MetaCausative) != 0 {
539 for i := 0; i < len(slots); i++ {
540 if i < len(slotMarkers) && slotMarkers[i] == MkNi {
541 if i < len(slotOblRoles) {
542 slotOblRoles[i] = ORAgent
543 }
544 if i < len(roles) {
545 roles[i] = HistModifier
546 }
547 }
548 }
549 } else if isJADitransitive(verbLem.Lemma) {
550 // Ditransitive: に-marked slot is the recipient (彼に本をあげる).
551 // Flip from default ORGoal/ORLoc to ORRecip; role HistScope to
552 // HistModifier for cross-language parity with EN ditransitive
553 // extraction.
554 for i := 0; i < len(slots); i++ {
555 if i < len(slotMarkers) && slotMarkers[i] == MkNi {
556 if i < len(slotOblRoles) {
557 slotOblRoles[i] = ORRecip
558 }
559 if i < len(roles) {
560 roles[i] = HistModifier
561 }
562 }
563 }
564 }
565 }
566
567 // Temporal-adverbial binding: any slot marked MKAdv with head=-1 from
568 // the temporal-noun detection above gets bound to the clause's verb
569 // (last slot if !copulaApplied && !predAdjApplied, otherwise no binding
570 // since copular/adjectival clauses have no verb to modify).
571 if !copulaApplied && !predAdjApplied && len(slots) > 0 {
572 verbIdx := int16(len(slots) - 1)
573 for i := 0; i < int32(verbIdx); i++ {
574 if i < len(slotModKinds) && slotModKinds[i] == MKAdv &&
575 i < len(slotHeads) && slotHeads[i] < 0 {
576 slotHeads[i] = verbIdx
577 }
578 }
579 }
580
581 // Modifier role propagation: ATTR modifiers copy their head noun's role
582 // (EN gives "red" the same role as "car"); ADV modifiers get HistModifier.
583 for i := 0; i < len(slots); i++ {
584 if i < len(slotModKinds) && i < len(slotHeads) && slotHeads[i] >= 0 {
585 h := int32(slotHeads[i])
586 switch slotModKinds[i] {
587 case MKAttr:
588 if h < len(roles) {
589 roles[i] = roles[h]
590 }
591 case MKAdv:
592 roles[i] = HistModifier
593 }
594 }
595 }
596
597 var set []SetEntry
598 for i, word := range slots {
599 role := HistComplement
600 if i < len(roles) {
601 role = roles[i]
602 }
603 // Modifier entries (Head>=0) skip verb lemmatization regardless of role-
604 // override at sentence-final position. A POSS/ATTR modifier on the
605 // last slot is still a modifier, not the clause's verb. Exception:
606 // MKRel modifiers ARE verbs (relative-clause predicates) and must be
607 // lemmatized to recover tense morph + verb class.
608 isVerb := role == HistVerb
609 head := int16(-1)
610 modKind := uint8(MKNone)
611 if i < len(slotHeads) {
612 head = slotHeads[i]
613 }
614 if i < len(slotModKinds) {
615 modKind = slotModKinds[i]
616 }
617 if modKind != MKNone && modKind != MKRel {
618 isVerb = false
619 }
620 if modKind == MKRel {
621 isVerb = true
622 }
623 lem := LemmatizeJA(word, isVerb)
624 mark := uint8(0)
625 extraMorph := uint16(0)
626 obl := uint8(ORNone)
627 if i < len(slotMarkers) {
628 mark = slotMarkers[i]
629 }
630 if i < len(slotMorphs) {
631 extraMorph = slotMorphs[i]
632 }
633 if i < len(slotOblRoles) {
634 obl = slotOblRoles[i]
635 }
636 set = append(set, SetEntry{
637 Role: role, Atom: lem.Lemma, Morph: lem.Morph | extraMorph,
638 Class: lem.Class, Mark: mark, OblRole: obl,
639 Head: head, ModKind: modKind,
640 })
641 }
642
643 // Finalize the last (or only) clause.
644 clausesJA = append(clausesJA, Clause{
645 Set: set, Relation: clauseRelJA,
646 Parent: clauseParentJA, HostIdx: -1,
647 })
648
649 // Flatten Slots/Roles across all clauses (see ExtractEN for rationale).
650 flatSlotsJA := []string{:0:len(slots)}
651 flatRolesJA := []int32{:0:len(slots)}
652 for _, c := range clausesJA {
653 for _, e := range c.Set {
654 flatSlotsJA = append(flatSlotsJA, e.Atom)
655 flatRolesJA = append(flatRolesJA, e.Role)
656 }
657 }
658
659 return ExtractResult{
660 Pattern: pat, Slots: flatSlotsJA, Roles: flatRolesJA,
661 DeepPat: buildDeepPat(flatRolesJA), Set: clausesJA[0].Set,
662 Discourse: clausesJA,
663 }
664 }
665
666 // findSubjectIdx returns the index of the entry that should serve as the
667 // subject of a copular predicate: the first entry whose role is Topic or
668 // Subject and that is not itself a modifier (Head=-1).
669 // Returns -1 if no candidate found.
670 func findSubjectIdx(roles []int32, heads []int16) int16 {
671 for i, r := range roles {
672 if i >= len(heads) {
673 break
674 }
675 if heads[i] >= 0 {
676 continue // modifier, skip
677 }
678 if r == HistTopic || r == HistSubject {
679 return int16(i)
680 }
681 }
682 return -1
683 }
684
685 // buildSetFromSlices converts parallel per-slot slices into a []SetEntry.
686 // Used by ExtractEN both at clause boundaries and end-of-input.
687 func buildSetFromSlices(
688 slots []string, roles []int32, slotMorphs []uint16,
689 slotMarks, slotOblRoles []uint8, slotHeads []int16, slotModKinds []uint8,
690 ) []SetEntry {
691 // Role propagation before set construction: ATTR copies head's role,
692 // ADV gets HistModifier. Mutates roles[] in place (caller's slice).
693 for i := 0; i < len(slots); i++ {
694 if i < len(slotModKinds) && i < len(slotHeads) && slotHeads[i] >= 0 {
695 h := int32(slotHeads[i])
696 switch slotModKinds[i] {
697 case MKAttr:
698 if h < len(roles) {
699 roles[i] = roles[h]
700 }
701 case MKAdv:
702 roles[i] = HistModifier
703 }
704 }
705 }
706 var set []SetEntry
707 for i, lemma := range slots {
708 role := HistComplement
709 if i < len(roles) {
710 role = roles[i]
711 }
712 m := uint16(0)
713 if i < len(slotMorphs) {
714 m = slotMorphs[i]
715 }
716 mark := uint8(0)
717 if i < len(slotMarks) {
718 mark = slotMarks[i]
719 }
720 obl := uint8(ORNone)
721 if i < len(slotOblRoles) {
722 obl = slotOblRoles[i]
723 }
724 head := int16(-1)
725 modKind := uint8(MKNone)
726 if i < len(slotHeads) {
727 head = slotHeads[i]
728 }
729 if i < len(slotModKinds) {
730 modKind = slotModKinds[i]
731 }
732 set = append(set, SetEntry{
733 Role: role, Atom: lemma, Morph: m, Class: 0, Mark: mark, OblRole: obl,
734 Head: head, ModKind: modKind,
735 })
736 }
737 return set
738 }
739
740 // hasPredication returns true when the current per-slot state contains a
741 // completed predication (a real verb slot or a copular complement).
742 // Used to disambiguate "and" between NP coordination and clause coordination.
743 func hasPredication(roles []int32, slotModKinds []uint8) bool {
744 for i, r := range roles {
745 if r == HistVerb {
746 return true
747 }
748 if i < len(slotModKinds) && slotModKinds[i] == MKCop {
749 return true
750 }
751 }
752 return false
753 }
754
755 // ExtractEN takes EN tokens and produces pattern + slots.
756 // Handles determiners (the/a) as morph hints, prepositions as role markers,
757 // verb auxiliaries as fillers of the verb slot, and pronouns as content nouns.
758 func ExtractEN(tokens []string) ExtractResult {
759 var pat []byte
760 var slots []string
761 var roles []int32
762 var slotMorphs []uint16
763 var slotMarks []uint8
764 var slotOblRoles []uint8
765 var slotHeads []int16
766 var slotModKinds []uint8
767
768 sawVerb := false
769 contentCount := 0
770 pendingRole := HistSubject
771 pendingDef := false
772 pendingNeg := false
773 pendingMark := uint8(0)
774 pendingOblRole := uint8(ORNone)
775 pendingHead := int16(-1)
776 pendingModKind := uint8(MKNone)
777 pendingCop := false
778 pendingCopHead := int16(-1)
779 pendingCopMorph := uint16(0)
780 // Track whether the staged copula aux is a form of "be" (vs do/have/will).
781 // Only be-aux + past-participle yields passive voice; do/will + bare-V
782 // is do-support or modal, not passive.
783 pendingCopAuxIsBe := false
784 // Volitional state: "let X V" / "let's V" means the V slot is volitional.
785 pendingVol := false
786 // Coordination state: when "and"/"or" is seen between content words, the
787 // next content word becomes a MKCoord peer of the most recent content.
788 pendingCoordHead := int16(-1)
789 // REL state: when "that/which/who" follows a noun, the next emitted verb
790 // becomes a MKRel modifier of that noun (intransitive REL only - flat-Set
791 // representation; transitive REL with its own subject needs sub-clauses).
792 pendingRelHead := int16(-1)
793 // Ditransitive state: when a verb in the enDitransitive set is emitted,
794 // the FIRST bare-NP (no preceding preposition) is the candidate recipient.
795 // It only commits to ORRecip when a SECOND object follows; otherwise the
796 // single object is a plain patient (read a book, write a letter, etc.).
797 pendingRecipCand := false
798 pendingRecipIdx := int16(-1)
799 // Causative state: when "make"/"let" appears as the first verb and a
800 // bare-V follows after an NP (made me wait, let him go), the auxiliary
801 // is not emitted. Instead MetaCausative is staged for the embedded verb
802 // emission. The intervening NP is the causee, emitted as plain object.
803 pendingCausative := false
804 pendingCausativeMorph := uint16(0)
805 // resetPending: single reset point for ALL pending-state variables above.
806 // Every clause-boundary site (comma/semicolon split, "and"/"or"/"but"
807 // clause-coord, subordinator boundary, second-verb-no-comma boundary)
808 // calls this. Adding a new pending var requires exactly one new line
809 // here - never twelve scattered reset blocks to keep in sync.
810 resetPending := func() {
811 pendingRole = HistSubject
812 pendingDef = false
813 pendingNeg = false
814 pendingMark = 0
815 pendingOblRole = ORNone
816 pendingHead = -1
817 pendingModKind = MKNone
818 pendingCop = false
819 pendingCopHead = -1
820 pendingCopMorph = 0
821 pendingCopAuxIsBe = false
822 pendingVol = false
823 pendingCoordHead = -1
824 pendingRelHead = -1
825 pendingRecipCand = false
826 pendingRecipIdx = -1
827 pendingCausative = false
828 pendingCausativeMorph = 0
829 }
830 _ = resetPending
831 // Multi-clause discourse accumulator. clauseRel/clauseParent track how
832 // the CURRENT (being-built) clause relates to its parent in clauses[].
833 var clauses []Clause
834 clauseRel := ClauseRoot
835 clauseParent := int16(-1)
836
837 for tokIdx, tok := range tokens {
838 // Clause-boundary token from tokenizeEN punctuation classification
839 // (synthetic 、 emitted for ,/;/:/./?/!/—/…/♫). Finalize the current
840 // clause as a ClauseAnd peer, reset per-slot state, continue.
841 if tok == "\xe3\x80\x81" {
842 if len(slots) > 0 {
843 clauseSet := buildSetFromSlices(
844 slots, roles, slotMorphs,
845 slotMarks, slotOblRoles, slotHeads, slotModKinds,
846 )
847 clauseRelLocal := clauseRel
848 if clauseRelLocal == ClauseRoot && len(clauses) > 0 {
849 clauseRelLocal = ClauseAnd
850 }
851 nextParent := int16(len(clauses))
852 clauses = append(clauses, Clause{
853 Set: clauseSet, Relation: clauseRelLocal,
854 Parent: clauseParent, HostIdx: -1,
855 })
856 slots = nil
857 roles = nil
858 slotMorphs = nil
859 slotMarks = nil
860 slotOblRoles = nil
861 slotHeads = nil
862 slotModKinds = nil
863 sawVerb = false
864 contentCount = 0
865 resetPending()
866 clauseRel = ClauseAnd
867 clauseParent = nextParent - 1
868 }
869 continue
870 }
871 low := toLowerEN(tok)
872 mk, isMk := enWordToMarker()[low]
873 if isMk {
874 pat = append(pat, mk)
875 switch enMarkerClass(mk) {
876 case enMarkDeterminer:
877 // Clause boundary detection: in a subordinate clause that
878 // has a completed verb, a new determiner starts the main
879 // clause. "if it rains the cat runs" - "the" begins main.
880 if sawVerb && (clauseRel == ClauseIf || clauseRel == ClauseBecause) {
881 clauseSet := buildSetFromSlices(
882 slots, roles, slotMorphs,
883 slotMarks, slotOblRoles, slotHeads, slotModKinds,
884 )
885 nextParent := int16(len(clauses))
886 clauses = append(clauses, Clause{
887 Set: clauseSet, Relation: clauseRel,
888 Parent: nextParent, HostIdx: -1,
889 })
890 slots = nil
891 roles = nil
892 slotMorphs = nil
893 slotMarks = nil
894 slotOblRoles = nil
895 slotHeads = nil
896 slotModKinds = nil
897 sawVerb = false
898 contentCount = 0
899 resetPending()
900 clauseRel = ClauseRoot
901 clauseParent = -1
902 }
903 if mk == MkThe {
904 pendingDef = true
905 } // "a" leaves pendingDef=false (indefinite)
906 case enMarkNegation:
907 pendingNeg = true
908 case enMarkPronoun:
909 // Pronouns are content nouns; emit as slot.
910 contentCount++
911 role := pendingRole
912 if !sawVerb {
913 role = HistSubject
914 } else {
915 role = pendingRole
916 }
917 pat[len(pat)-1] = SlotNoun
918 pronAtom := tok
919 plem := LemmatizeEN(toLowerEN(tok))
920 if plem.Lemma != "" {
921 pronAtom = plem.Lemma
922 }
923 slots = append(slots, pronAtom)
924 roles = append(roles, role)
925 m := uint16(0)
926 if pendingDef {
927 m |= MetaDefDef
928 }
929 slotMorphs = append(slotMorphs, m)
930 slotMarks = append(slotMarks, pendingMark)
931 slotOblRoles = append(slotOblRoles, ORNone)
932 slotHeads = append(slotHeads, -1)
933 slotModKinds = append(slotModKinds, MKNone)
934 newPronIdx := int16(len(slots) - 1)
935 // Ditransitive recipient (two-noun pattern, mirrors the noun-
936 // emit branch). Pronouns are common recipients (give him X).
937 if pendingRecipCand {
938 if pendingRecipIdx < 0 {
939 if pendingMark == 0 && pendingOblRole == ORNone {
940 pendingRecipIdx = newPronIdx
941 } else {
942 pendingRecipCand = false
943 }
944 } else {
945 slotOblRoles[pendingRecipIdx] = ORRecip
946 roles[pendingRecipIdx] = HistModifier
947 pendingRecipCand = false
948 pendingRecipIdx = -1
949 }
950 }
951 pendingMark = 0
952 pendingDef = false
953 case enMarkPossDet:
954 // Possessive determiner: emit as POSS modifier of the next noun.
955 // The next slot emission resolves the Head pointer.
956 pat[len(pat)-1] = SlotNoun
957 slots = append(slots, possDetSurface(mk))
958 roles = append(roles, HistOperator) // flattened role for the modifier
959 slotMorphs = append(slotMorphs, 0)
960 slotMarks = append(slotMarks, mk)
961 slotOblRoles = append(slotOblRoles, ORNone)
962 slotHeads = append(slotHeads, -1)
963 slotModKinds = append(slotModKinds, MKNone)
964 pendingHead = int16(len(slots) - 1)
965 pendingModKind = MKPoss
966 case enMarkVerbAux:
967 // Verb-auxiliaries (be/is/are/was/were/am/do/etc) are not
968 // content verbs. For copular constructions ("he is a student"),
969 // the next noun becomes the copular predicate, attached via
970 // MKCop to the most recent subject. No verb slot is emitted.
971 //
972 // If we've already seen a real content verb (e.g. "is eating"),
973 // the aux just carries tense/aspect onto that verb.
974 if sawVerb && len(slotMorphs) > 0 {
975 lem := LemmatizeEN(low)
976 slotMorphs[len(slotMorphs)-1] |= lem.Morph
977 } else {
978 // No real verb. Stage copula info for the next noun.
979 subj := findSubjectIdx(roles, slotHeads)
980 if subj >= 0 {
981 pendingCop = true
982 pendingCopHead = subj
983 lem := LemmatizeEN(low)
984 pendingCopMorph = lem.Morph
985 pendingCopAuxIsBe = lem.Lemma == "be"
986 if pendingNeg {
987 pendingCopMorph |= MetaPolarNeg
988 pendingNeg = false
989 }
990 }
991 pendingRole = HistComplement
992 }
993 case enMarkPreposition:
994 pendingRole = MarkerToRole(mk)
995 pendingMark = mk
996 pendingOblRole = MarkerToOblRole(mk)
997 case enMarkConjunction:
998 // "and"/"or" is either NP-coord (between nouns at same position)
999 // or CLAUSE-coord (between two complete predications).
1000 // Disambiguator: if a predication has been completed
1001 // (sawVerb || any MKCop), this is clause-coord.
1002 if hasPredication(roles, slotModKinds) {
1003 // CLAUSE-COORD: finalize the current clause, reset
1004 // per-slot state, mark next clause as ClauseAnd peer.
1005 clauseSet := buildSetFromSlices(
1006 slots, roles, slotMorphs,
1007 slotMarks, slotOblRoles, slotHeads, slotModKinds,
1008 )
1009 clauseRelLocal := clauseRel
1010 clauseParentLocal := clauseParent
1011 if clauseRelLocal == ClauseRoot && len(clauses) > 0 {
1012 // Shouldn't happen but guard
1013 clauseRelLocal = ClauseAnd
1014 }
1015 nextParent := int16(len(clauses))
1016 clauses = append(clauses, Clause{
1017 Set: clauseSet, Relation: clauseRelLocal,
1018 Parent: clauseParentLocal, HostIdx: -1,
1019 })
1020 // Reset per-slot state.
1021 slots = nil
1022 roles = nil
1023 slotMorphs = nil
1024 slotMarks = nil
1025 slotOblRoles = nil
1026 slotHeads = nil
1027 slotModKinds = nil
1028 sawVerb = false
1029 contentCount = 0
1030 resetPending()
1031 // Next clause inherits AND relation, parent points at the
1032 // previous root clause (or its index).
1033 if mk == MkBut {
1034 clauseRel = ClauseBut
1035 } else {
1036 clauseRel = ClauseAnd
1037 }
1038 clauseParent = nextParent - 1
1039 continue
1040 }
1041 // NP-COORD: existing behavior.
1042 if len(slots) > 0 {
1043 prev := int16(len(slots) - 1)
1044 if int32(prev) < len(slotModKinds) && slotModKinds[prev] == MKCoord {
1045 prev = slotHeads[prev]
1046 }
1047 pendingCoordHead = prev
1048 } else {
1049 pendingRole = HistSubject
1050 sawVerb = false
1051 contentCount = 0
1052 }
1053 case enMarkRelative:
1054 // "that/which/who" after a noun: capture the noun's index so
1055 // the next verb is emitted as a MKRel modifier of that noun.
1056 // Intransitive REL only - if a subject follows ("that I saw"),
1057 // the verb attaches to that subject and pendingRelHead does
1058 // not fire. Transitive REL needs the sub-clause path; not yet.
1059 if len(slots) > 0 && roles[len(roles)-1] == HistSubject {
1060 pendingRelHead = int16(len(slots) - 1)
1061 }
1062 }
1063 continue
1064 }
1065 if isENPunct(tok) {
1066 continue
1067 }
1068
1069 // "let" / "let's" as a volitional auxiliary: marks the subsequent verb
1070 // as MetaMoodVol. "let" only fires at clause start (else it's the verb
1071 // meaning "allow/permit"). "let's" is unambiguously volitional and
1072 // fires anywhere; the renderer may emit "let's" after a subject like
1073 // "we let's go" so we can't gate on contentCount==0.
1074 if !sawVerb && low == "let's" {
1075 pendingVol = true
1076 continue
1077 }
1078 if !sawVerb && contentCount == 0 && low == "let" {
1079 pendingVol = true
1080 continue
1081 }
1082
1083 // Subordinating conjunctions at clause start: "if" / "because" mark
1084 // the upcoming clause as subordinate (ClauseIf / ClauseBecause).
1085 // Don't emit a slot for the conjunction; consume into clauseRel.
1086 if !sawVerb && contentCount == 0 && len(slots) == 0 {
1087 switch low {
1088 case "if":
1089 clauseRel = ClauseIf
1090 continue
1091 case "because":
1092 clauseRel = ClauseBecause
1093 continue
1094 }
1095 }
1096
1097 // Adverb detection: -ly suffix or hardcoded list. Adverbs modify
1098 // the verb in the clause. Emit as a slot with ModKind=MKAdv. Head
1099 // resolves at verb-emission time (forward) or via the last-emitted
1100 // verb slot (backward).
1101 //
1102 // When pendingCop is set, predicate-adjective check takes precedence
1103 // (fast is both an adverb and a predicate-adj; "is fast" wants the
1104 // adj reading, "runs fast" wants the adverb reading).
1105 isAdv := looksLikeAdverb(low)
1106 if pendingCop && looksLikePredicateAdj(low) {
1107 isAdv = false
1108 }
1109 if isAdv {
1110 contentCount++
1111 pat = append(pat, SlotNoun)
1112 slots = append(slots, low)
1113 roles = append(roles, HistModifier)
1114 slotMorphs = append(slotMorphs, 0)
1115 slotMarks = append(slotMarks, 0)
1116 slotOblRoles = append(slotOblRoles, ORNone)
1117 advIdx := int16(len(slots) - 1)
1118 // Find the verb to bind to. Backward: most-recent verb slot.
1119 // Forward: stage pendingAdvHead, resolved at next verb emission.
1120 boundHead := int16(-1)
1121 for i := int32(advIdx) - 1; i >= 0; i-- {
1122 if roles[i] == HistVerb {
1123 boundHead = int16(i)
1124 break
1125 }
1126 }
1127 if boundHead >= 0 {
1128 slotHeads = append(slotHeads, boundHead)
1129 slotModKinds = append(slotModKinds, MKAdv)
1130 } else {
1131 // No prior verb; wait for the next verb emission.
1132 slotHeads = append(slotHeads, -1)
1133 slotModKinds = append(slotModKinds, MKAdv)
1134 }
1135 continue
1136 }
1137
1138 // Content word.
1139 contentCount++
1140 isVerb := looksLikeVerb(low)
1141 // Second-verb-no-comma clause boundary: if we've already seen a
1142 // verb in this clause and the current word is also a verb (with no
1143 // preposition or conjunction between), the current clause ends and
1144 // a new clause begins. Common after subordinators: "if it rains
1145 // stay home" - "rains" verb-of-condition, "stay" verb-of-main.
1146 // Suppress when pendingCausative is staged - the upcoming verb is
1147 // the embedded action of an analytic causative, not a new clause.
1148 if sawVerb && isVerb && pendingMark == 0 && pendingOblRole == ORNone &&
1149 !pendingCausative {
1150 clauseSet := buildSetFromSlices(
1151 slots, roles, slotMorphs,
1152 slotMarks, slotOblRoles, slotHeads, slotModKinds,
1153 )
1154 nextParent := int16(len(clauses))
1155 // Condition/cause clauses come BEFORE the main clause in surface
1156 // order; Parent points forward to the next root clause (its index
1157 // will be len(clauses), the index this new clause is about to take).
1158 parent := clauseParent
1159 if clauseRel == ClauseIf || clauseRel == ClauseBecause {
1160 parent = nextParent
1161 }
1162 clauses = append(clauses, Clause{
1163 Set: clauseSet, Relation: clauseRel,
1164 Parent: parent, HostIdx: -1,
1165 })
1166 slots = nil
1167 roles = nil
1168 slotMorphs = nil
1169 slotMarks = nil
1170 slotOblRoles = nil
1171 slotHeads = nil
1172 slotModKinds = nil
1173 sawVerb = false
1174 contentCount = 1 // we're about to emit this verb
1175 resetPending()
1176 // Main clause after subordinator: ClauseRoot
1177 clauseRel = ClauseRoot
1178 clauseParent = -1
1179 }
1180 // ATTR detection: pattern is [current content][next content][verb or end].
1181 // If current and next are both content non-verbs, and the position
1182 // after next is a verb (or end-of-clause), then current modifies next.
1183 // Examples: "red car runs" - red is ATTR of car (runs is the verb).
1184 isAttr := false
1185 if !isVerb && tokIdx+1 < len(tokens) {
1186 next := toLowerEN(tokens[tokIdx+1])
1187 if _, nextIsMk := enWordToMarker()[next]; !nextIsMk && !isENPunct(tokens[tokIdx+1]) && !looksLikeVerb(next) {
1188 // Two-token check: position after next must be verb or end.
1189 if tokIdx+2 >= len(tokens) {
1190 isAttr = true
1191 } else {
1192 afterNext := toLowerEN(tokens[tokIdx+2])
1193 if _, anIsMk := enWordToMarker()[afterNext]; anIsMk {
1194 // Marker (preposition/etc) after the noun phrase - NP ends here.
1195 isAttr = true
1196 } else if looksLikeVerb(afterNext) {
1197 isAttr = true
1198 }
1199 }
1200 }
1201 }
1202 // Sister check: if previous slot was ATTR-tagged at extract time, or a
1203 // pendingHead is awaiting this slot as its head, or a copular predicate
1204 // is pending, the current word is the predicate / noun, not a verb.
1205 prevWasAttrOrPending := pendingHead >= 0 || pendingCop || pendingCoordHead >= 0
1206 if len(slotModKinds) > 0 && slotModKinds[len(slotModKinds)-1] == MKAttr {
1207 prevWasAttrOrPending = true
1208 }
1209 if !isVerb && !sawVerb && contentCount == 2 && pendingRole == HistObject {
1210 // pendingRole was set to Object by a preposition; not a verb position
1211 } else if !isVerb && !sawVerb && contentCount == 2 && !isAttr && !prevWasAttrOrPending {
1212 isVerb = true
1213 }
1214 // Predicative adjective: when a copula is staged ("is/are/was/were")
1215 // and the current word looks like a predicate adjective (deverbal -ing,
1216 // -ful, -ous, -ic, -able, -ible) or its lemma does (bigger→big), emit
1217 // as HistComplement with MKAdj pointing at the subject. The copula's
1218 // morph (tense, 3sg) merges with the lemma's morph (e.g. MetaCompare
1219 // for comparatives) into the adjective's morph; no separate verb slot.
1220 if pendingCop {
1221 // Surface-form check first (interesting, big, etc.) - atom stays
1222 // as the surface. Only when surface miss AND lemma-form matches
1223 // (comparatives: bigger→big) do we use the lemma+lem.Morph.
1224 useLemma := false
1225 var lem LemmaResult
1226 if !looksLikePredicateAdj(low) {
1227 lem = LemmatizeEN(low)
1228 if looksLikePredicateAdj(lem.Lemma) {
1229 useLemma = true
1230 }
1231 }
1232 if looksLikePredicateAdj(low) || useLemma {
1233 pat = append(pat, SlotNoun)
1234 atom := low
1235 m := pendingCopMorph
1236 if useLemma {
1237 atom = lem.Lemma
1238 m |= lem.Morph
1239 }
1240 slots = append(slots, atom)
1241 roles = append(roles, HistComplement)
1242 if pendingNeg {
1243 m |= MetaPolarNeg
1244 pendingNeg = false
1245 }
1246 slotMorphs = append(slotMorphs, m)
1247 slotMarks = append(slotMarks, 0)
1248 slotOblRoles = append(slotOblRoles, ORNone)
1249 slotHeads = append(slotHeads, pendingCopHead)
1250 slotModKinds = append(slotModKinds, MKAdj)
1251 pendingCop = false
1252 pendingCopHead = -1
1253 pendingCopMorph = 0
1254 pendingCopAuxIsBe = false
1255 sawVerb = true
1256 continue
1257 }
1258 }
1259 // REL-intransitive: if a relative pronoun staged pendingRelHead and
1260 // the current word is a verb, emit it as a MKRel modifier of the
1261 // host noun. Do NOT set sawVerb so the next real verb still becomes
1262 // the clause's main predicate.
1263 if pendingRelHead >= 0 && isVerb {
1264 pat = append(pat, SlotVerb)
1265 lem := LemmatizeEN(low)
1266 slots = append(slots, lem.Lemma)
1267 roles = append(roles, HistModifier)
1268 m := lem.Morph
1269 if m&MetaNumPlural != 0 {
1270 m = (m &^ MetaNumPlural) | Meta3Sg
1271 }
1272 if pendingNeg {
1273 m |= MetaPolarNeg
1274 pendingNeg = false
1275 }
1276 slotMorphs = append(slotMorphs, m)
1277 slotMarks = append(slotMarks, 0)
1278 slotOblRoles = append(slotOblRoles, ORNone)
1279 slotHeads = append(slotHeads, pendingRelHead)
1280 slotModKinds = append(slotModKinds, MKRel)
1281 pendingRelHead = -1
1282 continue
1283 }
1284 // Causative: emit the embedded verb with MetaCausative. The aux
1285 // (make/let) was suppressed at its position; its tense morph rides
1286 // on the embedded verb.
1287 if pendingCausative && isVerb {
1288 pat = append(pat, SlotVerb)
1289 lem := LemmatizeEN(low)
1290 slots = append(slots, lem.Lemma)
1291 roles = append(roles, HistVerb)
1292 m := lem.Morph | MetaCausative | pendingCausativeMorph
1293 if m&MetaNumPlural != 0 {
1294 m = (m &^ MetaNumPlural) | Meta3Sg
1295 }
1296 if pendingNeg {
1297 m |= MetaPolarNeg
1298 pendingNeg = false
1299 }
1300 slotMorphs = append(slotMorphs, m)
1301 slotMarks = append(slotMarks, 0)
1302 slotOblRoles = append(slotOblRoles, ORNone)
1303 slotHeads = append(slotHeads, -1)
1304 slotModKinds = append(slotModKinds, MKNone)
1305 pendingCausative = false
1306 pendingCausativeMorph = 0
1307 continue
1308 }
1309 if !sawVerb && isVerb {
1310 lem := LemmatizeEN(low)
1311 // Causative aux detection: "make"/"let" + (NP) + bare-V. Suppress
1312 // the aux emission, stage MetaCausative for the next verb.
1313 if (lem.Lemma == "make" || lem.Lemma == "let") &&
1314 causativeBareVFollows(tokens, tokIdx) {
1315 pendingCausative = true
1316 pendingCausativeMorph = lem.Morph
1317 sawVerb = true
1318 pendingRole = HistObject
1319 continue
1320 }
1321 pat = append(pat, SlotVerb)
1322 slots = append(slots, lem.Lemma)
1323 roles = append(roles, HistVerb)
1324 m := lem.Morph
1325 if m&MetaNumPlural != 0 {
1326 m = (m &^ MetaNumPlural) | Meta3Sg
1327 }
1328 if pendingNeg {
1329 m |= MetaPolarNeg
1330 pendingNeg = false
1331 }
1332 if pendingVol {
1333 m |= MetaMoodVol
1334 pendingVol = false
1335 }
1336 // If a verb-aux had staged copula info but the next content turns
1337 // out to be a real verb, the aux is an auxiliary helper, not a
1338 // copula. The grammar of the verb form determines which:
1339 // "is/was + V-ing" -> progressive aspect on V
1340 // "is/was + V-ed/en" (past participle, not progressive) -> passive
1341 // LemmatizeEN sets MetaAspectProg for -ing forms, MetaTensePast
1342 // for -ed/irregulars. Aspect bit distinguishes prog from passive.
1343 if pendingCop {
1344 m |= pendingCopMorph
1345 // Passive voice requires a form of "be" as auxiliary plus
1346 // past-participle form on the embedded verb. "did + V" is
1347 // do-support (do-support + bare V = emphatic/negative/
1348 // question), not passive. Only be-aux + past tense (with
1349 // no progressive) yields passive.
1350 if pendingCopAuxIsBe &&
1351 m&MetaAspectProg == 0 && m&MetaTensePast != 0 {
1352 m |= MetaPassive
1353 // The aux's tense ("is"=non-past, "was"=past) overrides
1354 // the participle's "past" reading - "was bitten" is past
1355 // passive, "is bitten" is non-past passive.
1356 if pendingCopMorph&MetaTensePast == 0 {
1357 m &^= MetaTensePast
1358 }
1359 }
1360 pendingCop = false
1361 pendingCopHead = -1
1362 pendingCopMorph = 0
1363 pendingCopAuxIsBe = false
1364 }
1365 slotMorphs = append(slotMorphs, m)
1366 slotMarks = append(slotMarks, 0)
1367 slotOblRoles = append(slotOblRoles, ORNone)
1368 slotHeads = append(slotHeads, -1)
1369 slotModKinds = append(slotModKinds, MKNone)
1370 verbIdx := int16(len(slots) - 1)
1371 // Resolve any pre-verb adverbs that were waiting for a verb head.
1372 for i := 0; i < int32(verbIdx); i++ {
1373 if slotModKinds[i] == MKAdv && slotHeads[i] < 0 {
1374 slotHeads[i] = verbIdx
1375 }
1376 }
1377 // Ditransitive: if this verb takes a bare-NP recipient before the
1378 // patient ("give X Y"), flag the next-noun-with-no-preposition as
1379 // a recipient candidate. Commits to ORRecip only if a second
1380 // object follows ("give him a book"); single-object uses keep
1381 // the noun as plain patient ("read a book").
1382 if isEnDitransitive(lem.Lemma) {
1383 pendingRecipCand = true
1384 pendingRecipIdx = -1
1385 }
1386 sawVerb = true
1387 pendingRole = HistObject
1388 } else {
1389 pat = append(pat, SlotNoun)
1390 lem := LemmatizeEN(low)
1391 slots = append(slots, lem.Lemma)
1392 role := pendingRole
1393 if !sawVerb && !pendingCop {
1394 role = HistSubject
1395 }
1396 roles = append(roles, role)
1397 m := lem.Morph
1398 if pendingDef {
1399 m |= MetaDefDef
1400 pendingDef = false
1401 }
1402 slotMorphs = append(slotMorphs, m)
1403 slotMarks = append(slotMarks, pendingMark)
1404 slotOblRoles = append(slotOblRoles, pendingOblRole)
1405 slotHeads = append(slotHeads, -1)
1406 slotModKinds = append(slotModKinds, MKNone)
1407 newIdx := int16(len(slots) - 1)
1408 // Ditransitive recipient (two-noun pattern): the first bare-NP
1409 // after a ditransitive verb is staged as a candidate; the second
1410 // noun's arrival promotes the first to ORRecip + HistModifier.
1411 // A preposition on the first noun cancels (existing prep path
1412 // handles "give book to him"). Single-object uses leave the
1413 // candidate uncommitted so it stays a plain object.
1414 if pendingRecipCand {
1415 if pendingRecipIdx < 0 {
1416 // This is the first noun. Stage as candidate unless a
1417 // preposition fired.
1418 if pendingMark == 0 && pendingOblRole == ORNone {
1419 pendingRecipIdx = newIdx
1420 } else {
1421 pendingRecipCand = false
1422 }
1423 } else {
1424 // Second noun arrives - promote the candidate.
1425 slotOblRoles[pendingRecipIdx] = ORRecip
1426 roles[pendingRecipIdx] = HistModifier
1427 pendingRecipCand = false
1428 pendingRecipIdx = -1
1429 }
1430 }
1431 // Coordination resolution: if "and"/"or" set pendingCoordHead,
1432 // this noun is a peer conjunct of that slot. Inherit its role.
1433 if pendingCoordHead >= 0 && pendingCoordHead < newIdx {
1434 slotHeads[newIdx] = pendingCoordHead
1435 slotModKinds[newIdx] = MKCoord
1436 roles[newIdx] = roles[pendingCoordHead]
1437 pendingCoordHead = -1
1438 }
1439 // Copula resolution: if a verb-aux staged copula state, this noun
1440 // is the predicate. Bind Head=subject, ModKind=MKCop, merge morph.
1441 if pendingCop {
1442 slotHeads[newIdx] = pendingCopHead
1443 slotModKinds[newIdx] = MKCop
1444 slotMorphs[newIdx] |= pendingCopMorph
1445 pendingCop = false
1446 pendingCopHead = -1
1447 pendingCopMorph = 0
1448 pendingCopAuxIsBe = false
1449 }
1450 // Resolve a pending POSS/ATTR modifier from a preceding determiner
1451 // or ATTR-detected adjective.
1452 if pendingHead >= 0 && pendingHead < newIdx {
1453 slotHeads[pendingHead] = newIdx
1454 slotModKinds[pendingHead] = pendingModKind
1455 // Transfer pending def/morph from modifier to head (the
1456 // determiner applied to the noun phrase, whose head is this).
1457 if pendingModKind == MKAttr {
1458 // Move MetaDefDef and other phrase-level morph from
1459 // modifier to head if present.
1460 if slotMorphs[pendingHead]&MetaDefDef != 0 {
1461 slotMorphs[newIdx] |= MetaDefDef
1462 slotMorphs[pendingHead] &^= MetaDefDef
1463 }
1464 // The head takes the clause role; modifier loses it.
1465 if roles[pendingHead] == HistSubject {
1466 roles[newIdx] = HistSubject
1467 }
1468 }
1469 pendingHead = -1
1470 pendingModKind = MKNone
1471 }
1472 // ATTR detection (forward-looking): if the lookahead identified
1473 // the current slot as an ATTR modifier of the next noun, set up
1474 // pendingHead so the next slot emission resolves it.
1475 if isAttr {
1476 pendingHead = newIdx
1477 pendingModKind = MKAttr
1478 }
1479 pendingMark = 0
1480 pendingOblRole = ORNone
1481 if sawVerb && pendingRole == HistObject {
1482 pendingRole = HistComplement
1483 }
1484 }
1485 }
1486
1487 // Finalize the last clause.
1488 finalSet := buildSetFromSlices(
1489 slots, roles, slotMorphs,
1490 slotMarks, slotOblRoles, slotHeads, slotModKinds,
1491 )
1492 clauses = append(clauses, Clause{
1493 Set: finalSet, Relation: clauseRel,
1494 Parent: clauseParent, HostIdx: -1,
1495 })
1496
1497 // Flatten Slots/Roles across all clauses so the atom-link layer sees
1498 // every word. With punct-aware tokenization, clauses are finalized
1499 // mid-input and the per-clause `slots` array gets reset; only the final
1500 // clause would otherwise be visible in ExtractResult.Slots.
1501 flatSlots := []string{:0:len(slots)}
1502 flatRoles := []int32{:0:len(slots)}
1503 for _, c := range clauses {
1504 for _, e := range c.Set {
1505 flatSlots = append(flatSlots, e.Atom)
1506 flatRoles = append(flatRoles, e.Role)
1507 }
1508 }
1509
1510 return ExtractResult{
1511 Pattern: pat, Slots: flatSlots, Roles: flatRoles,
1512 DeepPat: buildDeepPat(flatRoles), Set: clauses[0].Set,
1513 Discourse: clauses,
1514 }
1515 }
1516
1517 // ExtractCode takes code tokens and produces pattern + slots.
1518 // Structural keywords become markers. Identifiers/literals become slots.
1519 func ExtractCode(tokens []string) ExtractResult {
1520 var pat []byte
1521 var slots []string
1522 var roles []int32
1523
1524 for _, tok := range tokens {
1525 mk := codeTokenToMarker(tok)
1526 if mk != 0 {
1527 pat = append(pat, mk)
1528 } else {
1529 pat = append(pat, SlotNoun)
1530 slots = append(slots, tok)
1531 roles = append(roles, HistComplement)
1532 }
1533 }
1534 return ExtractResult{Pattern: pat, Slots: slots, Roles: roles, DeepPat: buildDeepPat(roles)}
1535 }
1536
1537 func codeTokenToMarker(tok string) uint8 {
1538 switch tok {
1539 case "if":
1540 return MkIf
1541 case "else":
1542 return MkElse
1543 case "for", "range":
1544 return MkFor_C
1545 case "return":
1546 return MkReturn
1547 case "{":
1548 return MkLBrace
1549 case "}":
1550 return MkRBrace
1551 case "(":
1552 return MkLParen
1553 case ")":
1554 return MkRParen
1555 case "=", ":=":
1556 return MkAssign
1557 case ".":
1558 return MkDot
1559 case ",":
1560 return MkComma
1561 case ":":
1562 return MkColon
1563 case "->", "<-":
1564 return MkArrow
1565 case "case":
1566 return MkCase
1567 case "select":
1568 return MkSelect
1569 case "spawn":
1570 return MkSpawn
1571 case "chan":
1572 return MkChan
1573 }
1574 return 0
1575 }
1576
// EN marker functional classes.
//
// These are the values returned by enMarkerClass; callers switch on them to
// decide how a marker token affects extraction state. The constants are
// untyped, so they can be compared directly against enMarkerClass's int32
// result. The word lists in the trailing comments are illustrative: the
// word-to-marker mapping itself is not defined here, so treat them as
// examples rather than as the exact membership of each class.
const (
	enMarkUnknown     = 0 // fallback: marker belongs to none of the classes below
	enMarkDeterminer  = 1 // the, a, an
	enMarkPossDet     = 2 // my, your, his, her, its, our, their (POSS modifiers)
	enMarkVerbAux     = 3 // is, are, was, do, have...
	enMarkPreposition = 4 // in, on, at, with, by...
	enMarkNegation    = 5 // not, n't
	enMarkConjunction = 6 // and, but, or
	enMarkRelative    = 7 // that, which, who
	enMarkPronoun     = 8 // i, you, he, she, it, we, they (subject pronouns)
)
1589
1590 func enMarkerClass(mk uint8) int32 {
1591 switch mk {
1592 case MkThe, MkA:
1593 return enMarkDeterminer
1594 case MkMy, MkYour, MkHis, MkHerP, MkIts, MkOurP, MkTheirP:
1595 return enMarkPossDet
1596 case MkIs, MkAre, MkWas, MkDo:
1597 return enMarkVerbAux
1598 case MkIn, MkOn, MkAt, MkWith, MkBy, MkFor, MkTo_EN, MkOf, MkFrom, MkAs, MkThan:
1599 return enMarkPreposition
1600 case MkNot:
1601 return enMarkNegation
1602 case MkAnd, MkBut:
1603 return enMarkConjunction
1604 case MkThat:
1605 return enMarkRelative
1606 case MkI, MkYou, MkPron3, MkIt_EN, MkWe_EN, MkThey_:
1607 return enMarkPronoun
1608 }
1609 return enMarkUnknown
1610 }
1611
1612 // looksLikeAdverb returns true if w is likely an adverb: -ly suffix or in
1613 // a hardcoded list of common irregular adverbs that aren't morphologically
1614 // derivable.
1615 func looksLikeAdverb(w string) bool {
1616 if len(w) > 3 && hasSuffix(w, "ly") {
1617 return true
1618 }
1619 switch w {
1620 case "fast", "well", "hard", "here", "there", "now", "then",
1621 "today", "yesterday", "tomorrow", "always", "often", "never",
1622 "sometimes", "usually", "rarely", "still", "already", "yet",
1623 "soon", "later", "early", "late", "ever", "again":
1624 return true
1625 }
1626 return false
1627 }
1628
1629 // causativeBareVFollows returns true if tokens after tokIdx contain a bare
1630 // infinitive verb (no intervening "to") within the next NP-shaped lookahead
1631 // window. Used to detect "make/let + NP + bare-V" causative pattern.
1632 // NP shape: at most 3 tokens (det + adj + noun, or possessive + noun, or
1633 // single pronoun) before the bare verb.
1634 func causativeBareVFollows(tokens []string, tokIdx int32) bool {
1635 for k := 1; k <= 4 && tokIdx+k < len(tokens); k++ {
1636 t := toLowerEN(tokens[tokIdx+k])
1637 if t == "to" {
1638 return false // "to V" infinitival, not bare-V causative
1639 }
1640 if mk, isMk := enWordToMarker()[t]; isMk {
1641 switch enMarkerClass(mk) {
1642 case enMarkPreposition, enMarkConjunction:
1643 return false
1644 }
1645 continue // determiner, pronoun, possessive - part of the NP
1646 }
1647 if looksLikeVerb(t) {
1648 return true
1649 }
1650 }
1651 return false
1652 }
1653
// isEnDitransitive returns true for verbs that take a bare-NP recipient
// before the patient object: "give X Y" = "give Y to X". When such a verb
// is emitted and the next noun has no preceding preposition, that noun is
// the recipient (HistModifier + ORRecip), not a second direct object.
//
// The argument must already be a lower-case lemma; matching is exact.
// The set is closed: verbs that always take a prepositional dative
// (e.g. "explain X to Y") are deliberately excluded.
func isEnDitransitive(lemma string) bool {
	// Each lemma appears exactly once: a repeated constant in a case list
	// is at best dead weight and is a compile error under the Go compiler.
	switch lemma {
	case "give", "send", "tell", "show", "offer", "hand", "pass",
		"teach", "write", "read", "sell", "buy", "bring",
		"mail", "lend", "owe", "pay", "throw", "ask":
		return true
	}
	return false
}
1669
1670 // looksLikePredicateAdj returns true for words that, when following a copula
1671 // (is/are/was/were), are predicative adjectives rather than verbs or nouns.
1672 // Used to disambiguate "is interesting" (predicate-adj) from "is V-ing"
1673 // (progressive verb) or "is X" (copular noun).
1674 //
1675 // Detection: three layers
1676 // 1. Common short adjective whitelist (big, small, fast, hungry, ...)
1677 // 2. Deverbal -ing predicate adjectives (interesting, exciting, ...)
1678 // 3. Adjective-shape suffixes (-ful, -ous, -ic, -able, -ible)
1679 //
1680 // Conservative; false negatives fall through to MKCop (noun-complement) which
1681 // preserves semantics but loses the JA-side MKAdj-parity for round-trip.
1682 func looksLikePredicateAdj(w string) bool {
1683 if len(w) < 2 {
1684 return false
1685 }
1686 // Common short adjective whitelist - parity with JA i-adj predicates.
1687 switch w {
1688 case "big", "small", "tall", "short", "long", "wide", "narrow",
1689 "thick", "thin", "deep", "shallow", "high", "low",
1690 "hot", "cold", "warm", "cool", "wet", "dry",
1691 "fast", "slow", "quick", "old", "new", "young",
1692 "good", "bad", "nice", "fine", "great", "poor",
1693 "happy", "sad", "angry", "tired", "hungry", "thirsty",
1694 "sleepy", "busy", "lazy", "easy", "hard", "soft",
1695 "loud", "quiet", "clean", "dirty", "empty", "full",
1696 "rich", "weak", "strong", "smart", "kind", "mean",
1697 "red", "blue", "green", "yellow", "white", "black",
1698 "pink", "brown", "gray", "grey", "purple", "orange",
1699 "heavy", "light", "free", "cheap", "expensive",
1700 "safe", "sick", "well", "ill", "ready", "right", "wrong",
1701 "true", "false", "real", "fake", "open", "closed",
1702 "bright", "dark", "sweet", "sour", "salty", "bitter",
1703 "round", "square", "flat", "sharp", "dull",
1704 "strange", "weird", "normal", "common", "rare",
1705 "important", "famous", "popular", "different", "similar",
1706 "alive", "dead", "alone", "together":
1707 return true
1708 }
1709 if len(w) < 4 {
1710 return false
1711 }
1712 // Deverbal -ing predicate adjectives.
1713 switch w {
1714 case "interesting", "exciting", "boring", "tiring", "amazing",
1715 "frightening", "surprising", "confusing", "disappointing",
1716 "satisfying", "encouraging", "pleasing", "fascinating",
1717 "depressing", "embarrassing", "shocking", "thrilling",
1718 "charming", "annoying", "relaxing", "stunning",
1719 "missing", "willing", "outstanding", "promising":
1720 return true
1721 }
1722 // Adjective-shape suffixes.
1723 if hasSuffix(w, "ful") || hasSuffix(w, "ous") || hasSuffix(w, "ic") ||
1724 hasSuffix(w, "able") || hasSuffix(w, "ible") {
1725 return true
1726 }
1727 return false
1728 }
1729
1730 // possDetSurface returns the surface form of an EN possessive determiner
1731 // for round-trip rendering.
1732 func possDetSurface(mk uint8) string {
1733 switch mk {
1734 case MkMy:
1735 return "my"
1736 case MkYour:
1737 return "your"
1738 case MkHis:
1739 return "his"
1740 case MkHerP:
1741 return "her"
1742 case MkIts:
1743 return "its"
1744 case MkOurP:
1745 return "our"
1746 case MkTheirP:
1747 return "their"
1748 }
1749 return ""
1750 }
1751
// toLowerEN returns s with the ASCII letters 'A'-'Z' folded to lower case.
// All other bytes, including the bytes of multi-byte UTF-8 sequences, are
// copied through unchanged.
func toLowerEN(s string) string {
	out := []byte(s)
	for i := 0; i < len(out); i++ {
		if ch := out[i]; 'A' <= ch && ch <= 'Z' {
			out[i] = ch + ('a' - 'A') // fixed distance between the two ASCII cases
		}
	}
	return string(out)
}
1761
// isENPunct reports whether s is exactly one byte long and that byte is one
// of the ASCII punctuation marks . , ! ? ; : " '
// Longer strings, including multi-byte punctuation, are never matched.
func isENPunct(s string) bool {
	if len(s) != 1 {
		return false
	}
	switch s[0] {
	case '.', ',', '!', '?', ';', ':', '"', '\'':
		return true
	}
	return false
}
1769
1770 // looksLikeVerb is a heuristic for EN verb detection.
1771 // Uses common verb endings and a high-frequency set.
1772 // Min-length guards on suffix detection prevent short-word false positives
1773 // (red/fed/led/bed all end in -ed; sing/king/ring all end in -ing).
1774 func looksLikeVerb(w string) bool {
1775 if len(w) < 2 {
1776 return false
1777 }
1778 if (hasSuffix(w, "ing") && len(w) > 4) ||
1779 (hasSuffix(w, "ed") && len(w) > 3) ||
1780 hasSuffix(w, "ize") || hasSuffix(w, "ise") || hasSuffix(w, "ate") {
1781 return true
1782 }
1783 if hasSuffix(w, "fy") || (hasSuffix(w, "en") && len(w) > 3) {
1784 return true
1785 }
1786 switch w {
1787 case
1788 "go", "went", "gone", "goes",
1789 "eat", "eats", "drink", "drinks", "read", "reads",
1790 "write", "writes", "walk", "walks", "talk", "talks",
1791 "sleep", "sleeps", "wake", "wakes", "sit", "sits",
1792 "stand", "stands", "lie", "lies", "live", "lives",
1793 "die", "dies", "chew", "chews", "fly", "flies",
1794 "swim", "swims", "jump", "jumps", "throw", "throws",
1795 "catch", "catches", "kick", "kicks", "hit", "hits",
1796 "push", "pushes", "pull", "pulls", "grab", "grabs",
1797 "bite", "bites", "chase", "chases",
1798 "get", "got", "gotten", "gets",
1799 "make", "made", "makes",
1800 "take", "took", "taken", "takes",
1801 "come", "came", "comes",
1802 "see", "saw", "seen", "sees",
1803 "know", "knew", "known", "knows",
1804 "give", "gave", "given", "gives",
1805 "say", "said", "says",
1806 "tell", "told", "tells",
1807 "think", "thought", "thinks",
1808 "find", "found", "finds",
1809 "leave", "left", "leaves",
1810 "call", "calls",
1811 "ask", "asks",
1812 "seem", "seems",
1813 "feel", "felt", "feels",
1814 "become", "became", "becomes",
1815 "keep", "kept", "keeps",
1816 "begin", "began", "begun", "begins",
1817 "show", "shows",
1818 "hear", "heard", "hears",
1819 "play", "plays",
1820 "move", "moves",
1821 "live", "lives",
1822 "believe", "believes",
1823 "hold", "held", "holds",
1824 "bring", "brought", "brings",
1825 "happen", "happens",
1826 "write", "wrote", "writes",
1827 "provide", "provides",
1828 "sit", "sat", "sits",
1829 "stand", "stood", "stands",
1830 "lose", "lost", "loses",
1831 "pay", "paid", "pays",
1832 "meet", "met", "meets",
1833 "include", "includes",
1834 "continue", "continues",
1835 "learn", "learns",
1836 "change", "changes",
1837 "lead", "led", "leads",
1838 "understand", "understood",
1839 "watch", "watches",
1840 "follow", "follows",
1841 "stop", "stops",
1842 "create", "creates",
1843 "speak", "spoke", "speaks",
1844 "read", "reads",
1845 "allow", "allows",
1846 "add", "adds",
1847 "spend", "spent", "spends",
1848 "grow", "grew", "grows",
1849 "open", "opens",
1850 "walk", "walks",
1851 "win", "won", "wins",
1852 "teach", "taught",
1853 "offer", "offers",
1854 "remember", "remembers",
1855 "love", "loves",
1856 "consider", "considers",
1857 "appear", "appears",
1858 "buy", "bought", "buys",
1859 "wait", "waits",
1860 "serve", "serves",
1861 "die", "died", "dies",
1862 "send", "sent", "sends",
1863 "expect", "expects",
1864 "build", "built", "builds",
1865 "stay", "stays",
1866 "fall", "fell", "falls",
1867 "cut", "cuts",
1868 "reach", "reaches",
1869 "kill", "kills",
1870 "remain", "remains",
1871 "suggest", "suggests",
1872 "raise", "raises",
1873 "pass", "passes",
1874 "sell", "sold", "sells",
1875 "require", "requires",
1876 "report", "reports",
1877 "decide", "decides",
1878 "pull", "pulls",
1879 "develop", "develops",
1880 "use", "uses", "put", "puts", "set", "sets", "run", "runs",
1881 "let", "lets", "try", "tries", "need", "needs", "want", "wants",
1882 "start", "starts",
1883 "help", "helps",
1884 "turn", "turns",
1885 "work", "works",
1886 "like", "likes",
1887 "look", "looks",
1888 "mean", "means", "meant",
1889 "can", "could", "will", "would", "shall", "should", "may", "might", "must":
1890 return true
1891 }
1892 return false
1893 }
1894
// hasSuffix reports whether s ends with suffix, compared byte for byte.
// An empty suffix matches every string; a suffix longer than s never matches.
func hasSuffix(s, suffix string) bool {
	start := len(s) - len(suffix) // negative when suffix is longer than s
	return start >= 0 && s[start:] == suffix
}
1901
1902 // buildDeepPat creates a canonical (sorted, normalized) role sequence from roles.
1903 func buildDeepPat(roles []int32) []uint8 {
1904 if len(roles) == 0 {
1905 return nil
1906 }
1907 dp := []uint8{:len(roles):len(roles)}
1908 for i, r := range roles {
1909 nr := r
1910 if nr == HistTopic {
1911 nr = HistSubject
1912 }
1913 dp[i] = uint8(nr)
1914 }
1915 // Insertion sort (patterns are short, 3-8 elements).
1916 for i := 1; i < len(dp); i++ {
1917 key := dp[i]
1918 j := i - 1
1919 for j >= 0 && dp[j] > key {
1920 dp[j+1] = dp[j]
1921 j--
1922 }
1923 dp[j+1] = key
1924 }
1925 return dp
1926 }
1927