extract.mx raw

   1  package iskra
   2  
// UntranslatedMarker is the placeholder emitted by cross-language
// translation when no atom-link mapping exists for the source atom.
// Renderers emit it verbatim and skip inflection. The marker is pure ASCII
// (no language-specific bytes), so it survives any output encoding and
// stays visible in rendered text for diagnosis.
const UntranslatedMarker = "[missing]"
   8  
// SetEntry represents one element of the sentence-as-set abstraction.
//
// Lossless canonical form: Role + Atom + Morph + Class + OblRole + Head + ModKind.
// Mark rides along to preserve the original particle/preposition within a
// single language.
//
// Three-layer role schema:
//
//	Role         - core grammatical role (Subject/Object/Verb/Topic/etc) from
//	               the histogram. RRG macroroles.
//	OblRole      - thematic/oblique role (Goal/Loc/Instr/etc) for adjunct slots.
//	               Language-independent.
//	Head/ModKind - structural relation to another entry in the Set. Entry is
//	               a top-level argument when Head=-1 and ModKind=MKNone;
//	               otherwise it relates to the entry at index Head with
//	               relation ModKind (POSS, ATTR, etc).
//
// A predicate entry (MKCop/MKAdj) can also carry Head=-1: extraction points
// Head at the clause subject and findSubjectIdx returns -1 when there is none.
//
// Head is an index into the current Set. Valid only when the slot order is
// the canonical extraction order for the language (modifier-before-head for
// POSS and ATTR in both JA and EN). If translation ever reorders entries, or
// if relative-clause modifiers introduce post-head ordering, switch to stable
// IDs.
type SetEntry struct {
	Role    int32  // macrorole (Subject/Object/Verb/Topic/...)
	Atom    string // region center (lemma/stem)
	Morph   uint16 // tense|aspect|polarity|formality|number|def|mood|3sg|passive|causative|...
	Class   uint8  // verb class (for verbs): 1=ichidan, 2-10=godan variants
	Mark    uint8  // original particle/preposition (within-language preservation)
	OblRole uint8  // thematic role: one of the OR* constants; ORNone for core args
	Head    int16  // index of the related entry in the same Set; -1 if none
	ModKind uint8  // modification kind: one of the MK* constants; MKNone for top-level entries
}
  38  
// Modification kinds (for Head/ModKind nesting).
//
// Layered semantics:
//   - POSS/ATTR: structural modifier of another argument (Head points at the
//     modified entry). The modifier doesn't have a clause-level role on its own.
//   - COP: copular predicate of the subject. The complement asserts an identity
//     or attribution about the entry at Head. No verb slot exists in the clause.
//     The complement's Morph carries tense/aspect/polite (だ vs です vs だった).
//   - ADJ: predicative adjective; like COP, Head points at the subject.
//   - ADV: adverbial modifier; Head points at the verb.
//   - COORD: coordination peer; Head points at the first conjunct.
//   - REL: relative-clause predicate; Head points at the host noun.
//   - APP: reserved, not yet implemented.
const (
	MKNone  uint8 = 0 // not a modifier: top-level entry
	MKPoss  uint8 = 1 // possessive (私の魚 / my fish)
	MKAttr  uint8 = 2 // attributive (赤い車 / red car) - adjective modifies noun
	MKCop   uint8 = 3 // copular predicate (学生だ / is a student); Head = subject
	MKAdv   uint8 = 4 // adverbial (速く走る / runs fast); Head = verb
	MKCoord uint8 = 5 // coordination peer (猫と犬 / cats and dogs); Head = first conjunct
	MKRel   uint8 = 6 // relative clause; modifier verb's Head=host noun
	MKApp   uint8 = 7 // RESERVED: apposition; not yet implemented
	MKAdj   uint8 = 8 // predicative adjective (面白い / is interesting); Head=subject
)
  59  
// Oblique role values. Language-independent thematic relations, stored in
// SetEntry.OblRole.
// Use these for adjunct slots; ORNone means the slot's role is purely macro
// (Subject/Object/Verb).
const (
	ORNone   uint8 = 0 // no oblique role: core argument
	ORGoal   uint8 = 1  // to, へ, に-motion
	ORLoc    uint8 = 2  // in/on/at, で-location, に-stative
	ORSource uint8 = 3  // from, から
	ORLimit  uint8 = 4  // until, まで
	ORInstr  uint8 = 5  // with-instrument, で-instrumental
	ORComit  uint8 = 6  // with-companion, と
	ORBenef  uint8 = 7  // for, のために
	ORAgent  uint8 = 8  // by, によって (passive agent); also に under a passive/causative verb
	ORRecip  uint8 = 9  // to-recipient, に-dative
	ORPart   uint8 = 10 // of, の-partitive/genitive
	ORCompare uint8 = 11 // than, より (standard of comparison)
)
  77  
// ExtractResult is the output of pattern extraction from a token sequence.
//
// Set vs Discourse: Set holds the root clause's role-set; Discourse holds all
// clauses (root + subordinate/coord). For single-clause sentences, Discourse
// has exactly one element whose Set == ExtractResult.Set. Multi-clause
// sentences (clause coordination, conditional, relative clause) populate
// Discourse with multiple Clause entries.
//
// Slots and Roles are parallel slices. ExtractJA fills them by walking every
// clause in Discourse order and taking each entry's Atom and Role, so they
// span all clauses rather than only the root clause.
type ExtractResult struct {
	Pattern   []byte     // encoded pattern (markers + slots)
	Slots     []string   // content word filling each slot (ExtractJA stores each entry's Atom)
	Roles     []int32    // hist index for each slot (assigned by following marker)
	DeepPat   []uint8    // canonical role sequence (sorted, normalized)
	Set       []SetEntry // root-clause role-set (== Discourse[0].Set when populated)
	Discourse []Clause   // all clauses; len 1 for single-clause inputs
}
  93  
// Clause is one complete predication within a Discourse.
//
// Single-clause sentences produce one Clause with Relation=ClauseRoot,
// Parent=-1, HostIdx=-1. Multi-clause sentences add more Clauses with
// Relation/Parent/HostIdx specifying how each subordinate or peer clause
// relates to its anchor.
type Clause struct {
	Set      []SetEntry      // role-set of this clause (modifier nesting etc. live inside)
	Relation ClauseRelation  // how this clause relates to its Parent
	Parent   int16           // index of parent clause in ExtractResult.Discourse; -1 for root
	HostIdx  int16           // for REL: index of modified entry in parent's Set; -1 otherwise
}
 106  
// ClauseRelation enumerates inter-clause relations in a Discourse.
//
// Asymmetric relations (IF, BECAUSE, REL) point from the subordinate clause
// to its parent. Peer relations (AND, OR, BUT) point from the second clause
// to the first; commutativity is implicit at the semantic level.
//
// The zero value is ClauseRoot, so a zero-valued Clause reads as a root.
type ClauseRelation uint8

// Inter-clause relation values stored in Clause.Relation.
const (
	ClauseRoot    ClauseRelation = 0 // root clause; no parent
	ClauseAnd     ClauseRelation = 1 // X and Y - peer
	ClauseOr      ClauseRelation = 2 // X or Y - peer
	ClauseBut     ClauseRelation = 3 // X but Y - peer with contrast
	ClauseIf      ClauseRelation = 4 // if X (then parent) - condition
	ClauseBecause ClauseRelation = 5 // because X (then parent) - cause
	ClauseRel     ClauseRelation = 6 // relative clause modifying parent.Set[HostIdx]
)
 123  
 124  // ExtractJA takes JA tokens (already split on particles) and produces
 125  // the structural pattern + content slots.
 126  func ExtractJA(tokens []string) ExtractResult {
 127  	var pat []byte
 128  	var slots []string
 129  	var roles []int32
 130  	var slotMarkers []uint8
 131  	var slotMorphs []uint16
 132  	var slotOblRoles []uint8
 133  	var slotHeads []int16
 134  	var slotModKinds []uint8
 135  	pendingRole := HistVerb
 136  	pendingHead := int16(-1)
 137  	pendingModKind := uint8(MKNone)
 138  	pendingCoordHeadJA := int16(-1)
 139  	// Multi-clause accumulator for JA. Comma 、 signals clause boundary.
 140  	var clausesJA []Clause
 141  	clauseRelJA := ClauseRoot
 142  	clauseParentJA := int16(-1)
 143  	// Skip-tokens index for the の-relational-noun-に locative compound:
 144  	// when the pattern is detected at の, we consume the next two tokens
 145  	// (the relational noun + に) and apply ORLoc to the preceding base noun.
 146  	skipUntilJA := -1
 147  
 148  	for i, tok := range tokens {
 149  		if i <= skipUntilJA {
 150  			continue
 151  		}
 152  		// もし at clause start signals a conditional clause (ClauseIf).
 153  		// Consume it and mark the current clause's relation.
 154  		if len(slots) == 0 && tok == "\xe3\x82\x82\xe3\x81\x97" {
 155  			clauseRelJA = ClauseIf
 156  			continue
 157  		}
 158  		// JA comma 、 (E3 80 81) signals a clause boundary. Finalize the
 159  		// current clause, reset per-slot state, mark next clause as ClauseAnd.
 160  		if tok == "\xe3\x80\x81" {
 161  			if len(slots) > 0 {
 162  				lastIdx := len(slots) - 1
 163  				// Apply same predicate-shape detection as end-of-input does:
 164  				// copula strip (学生だ → student MKCop) then predicate-i-adj
 165  				// (面白い → MKAdj) then fall back to last-slot=HistVerb.
 166  				appliedPred := false
 167  				if slotModKinds[lastIdx] != MKCop {
 168  					if stripped, ok, copMorph := stripJACopula(slots[lastIdx]); ok && len(stripped) > 0 {
 169  						slots[lastIdx] = stripped
 170  						slotMorphs[lastIdx] |= copMorph
 171  						slotModKinds[lastIdx] = MKCop
 172  						slotHeads[lastIdx] = findSubjectIdx(roles, slotHeads)
 173  						roles[lastIdx] = HistComplement
 174  						appliedPred = true
 175  					} else if endsInIKana(slots[lastIdx]) && len(slots[lastIdx]) > 3 &&
 176  						!endsInNaiSuffix(slots[lastIdx]) && !endsInTaiSuffix(slots[lastIdx]) &&
 177  						slotHeads[lastIdx] < 0 && slotModKinds[lastIdx] == MKNone {
 178  						slotModKinds[lastIdx] = MKAdj
 179  						slotHeads[lastIdx] = findSubjectIdx(roles, slotHeads)
 180  						roles[lastIdx] = HistComplement
 181  						appliedPred = true
 182  					}
 183  				}
 184  				if !appliedPred && len(roles) > 0 && slotModKinds[lastIdx] != MKCop {
 185  					roles[lastIdx] = HistVerb
 186  				}
 187  				clauseSet := buildSetFromSlices(
 188  					slots, roles, slotMorphs,
 189  					slotMarkers, slotOblRoles, slotHeads, slotModKinds,
 190  				)
 191  				nextParent := int16(len(clausesJA))
 192  				clausesJA = append(clausesJA, Clause{
 193  					Set: clauseSet, Relation: clauseRelJA,
 194  					Parent: clauseParentJA, HostIdx: -1,
 195  				})
 196  				slots = nil
 197  				roles = nil
 198  				slotMarkers = nil
 199  				slotMorphs = nil
 200  				slotOblRoles = nil
 201  				slotHeads = nil
 202  				slotModKinds = nil
 203  				pendingRole = HistVerb
 204  				pendingHead = -1
 205  				pendingModKind = MKNone
 206  				pendingCoordHeadJA = -1
 207  				clauseRelJA = ClauseAnd
 208  				clauseParentJA = nextParent - 1
 209  			}
 210  			continue
 211  		}
 212  		mk, isMk := jaParticleToMarker()[tok]
 213  		if isMk {
 214  			pat = append(pat, mk)
 215  			// Synthetic morph markers attach to PRECEDING slot's morph.
 216  			switch mk {
 217  			case MkDef:
 218  				if len(slotMorphs) > 0 {
 219  					slotMorphs[len(slotMorphs)-1] |= MetaDefDef
 220  				}
 221  				continue
 222  			case MkPlural:
 223  				if len(slotMorphs) > 0 {
 224  					slotMorphs[len(slotMorphs)-1] |= MetaNumPlural
 225  				}
 226  				continue
 227  			case Mk3Sg:
 228  				if len(slotMorphs) > 0 {
 229  					slotMorphs[len(slotMorphs)-1] |= Meta3Sg
 230  				}
 231  				continue
 232  			case MkCopula:
 233  				// Copula marker - reserved.
 234  				continue
 235  			case MkNo:
 236  				// Locative-compound disambiguation: の followed by a relational
 237  				// noun (中/上/下/前/後/横/隣/間/内/外) and then に collapses to
 238  				// an ORLoc oblique on the base noun. 箱の中に = "in the box";
 239  				// the 中 (inside) is implicit in ORLoc, and the prior 箱 takes
 240  				// the locative role. Skip the next two tokens (relNoun + に).
 241  				if i+2 < len(tokens) && len(slots) > 0 &&
 242  					isJARelationalNoun(tokens[i+1]) &&
 243  					tokens[i+2] == "\xe3\x81\xab" {
 244  					lastIdx := len(slots) - 1
 245  					if lastIdx < len(slotOblRoles) {
 246  						slotOblRoles[lastIdx] = jaRelationalNounToOblRole(tokens[i+1])
 247  					}
 248  					if lastIdx < len(roles) {
 249  						roles[lastIdx] = HistScope
 250  					}
 251  					if lastIdx < len(slotMarkers) {
 252  						slotMarkers[lastIdx] = MkNi
 253  					}
 254  					skipUntilJA = i + 2
 255  					continue
 256  				}
 257  				// の: preceding slot is a POSS modifier of the next slot.
 258  				if len(slots) > 0 {
 259  					pendingHead = int16(len(slots) - 1)
 260  					pendingModKind = MKPoss
 261  				}
 262  				if len(roles) > 0 {
 263  					roles[len(roles)-1] = MarkerToRole(mk)
 264  					if len(slotMarkers) == len(slots) {
 265  						slotMarkers[len(slotMarkers)-1] = mk
 266  					}
 267  				}
 268  				continue
 269  			case MkDe:
 270  				// で is ambiguous: locative/instrumental particle (家で本を読む
 271  				// = "read book at home") or te-form of copula で joining two
 272  				// copular clauses (学生で彼は先生だ = "[I'm a student] and
 273  				// [he is a teacher]"). Disambiguator: if で is followed by
 274  				// [noun][は/が], it's te-copula clause-coord.
 275  				if i+2 < len(tokens) && len(slots) > 0 {
 276  					next2 := tokens[i+2]
 277  					if next2 == "\xe3\x81\xaf" || next2 == "\xe3\x81\x8c" {
 278  						// Te-form copula: mark the preceding noun as MKCop
 279  						// predicate of the current clause's subject, then
 280  						// finalize the clause and start a new one.
 281  						lastIdx := len(slots) - 1
 282  						slotModKinds[lastIdx] = MKCop
 283  						slotHeads[lastIdx] = findSubjectIdx(roles, slotHeads)
 284  						roles[lastIdx] = HistComplement
 285  						clauseSet := buildSetFromSlices(
 286  							slots, roles, slotMorphs,
 287  							slotMarkers, slotOblRoles, slotHeads, slotModKinds,
 288  						)
 289  						nextParent := int16(len(clausesJA))
 290  						clausesJA = append(clausesJA, Clause{
 291  							Set: clauseSet, Relation: clauseRelJA,
 292  							Parent: clauseParentJA, HostIdx: -1,
 293  						})
 294  						slots = nil
 295  						roles = nil
 296  						slotMarkers = nil
 297  						slotMorphs = nil
 298  						slotOblRoles = nil
 299  						slotHeads = nil
 300  						slotModKinds = nil
 301  						pendingRole = HistVerb
 302  						pendingHead = -1
 303  						pendingModKind = MKNone
 304  						pendingCoordHeadJA = -1
 305  						clauseRelJA = ClauseAnd
 306  						clauseParentJA = nextParent - 1
 307  						continue
 308  					}
 309  				}
 310  				// Fall through to default for instrumental/locative で.
 311  			case MkTo:
 312  				// と is ambiguous: comitative (友達と) or coordination
 313  				// (猫と犬). Coord heuristic: preceding slot has no Mark yet.
 314  				// For chained coord (X と Y と Z), all peers point at the
 315  				// FIRST conjunct, not the previous one - walk up the chain.
 316  				if len(slots) > 0 && len(slotMarkers) == len(slots) &&
 317  					(slotMarkers[len(slotMarkers)-1] == 0 ||
 318  						slotMarkers[len(slotMarkers)-1] == MkTo) {
 319  					prev := int16(len(slots) - 1)
 320  					if slotModKinds[prev] == MKCoord {
 321  						prev = slotHeads[prev]
 322  					}
 323  					pendingCoordHeadJA = prev
 324  					if slotMarkers[len(slotMarkers)-1] == 0 {
 325  						slotMarkers[len(slotMarkers)-1] = mk
 326  					}
 327  					continue
 328  				}
 329  				// Fall through to default marker handling for comitative.
 330  			}
 331  			if len(roles) > 0 {
 332  				newRole := MarkerToRole(mk)
 333  				roles[len(roles)-1] = newRole
 334  				// Propagate the role backward through any coord chain so
 335  				// the head conjunct gets the same role as the particle-marked
 336  				// conjunct (猫と犬が = both subjects, marked via が on 犬).
 337  				j := len(roles) - 1
 338  				for j > 0 && slotHeads[j] >= 0 && slotModKinds[j] == MKCoord {
 339  					j = int32(slotHeads[j])
 340  					roles[j] = newRole
 341  				}
 342  				if len(slotMarkers) == len(slots) {
 343  					// Keep the existing と Mark on the coord head; only update
 344  					// non-coord-marker slots' Mark.
 345  					if slotMarkers[len(slotMarkers)-1] != MkTo {
 346  						slotMarkers[len(slotMarkers)-1] = mk
 347  					}
 348  				}
 349  				if len(slotOblRoles) == len(slots) {
 350  					if or := MarkerToOblRole(mk); or != ORNone {
 351  						slotOblRoles[len(slotOblRoles)-1] = or
 352  					}
 353  				}
 354  			}
 355  			if i < len(tokens)-1 {
 356  				pendingRole = HistVerb
 357  			}
 358  		} else {
 359  			pat = append(pat, SlotNoun)
 360  			slots = append(slots, tok)
 361  			roles = append(roles, pendingRole)
 362  			slotMarkers = append(slotMarkers, 0)
 363  			slotMorphs = append(slotMorphs, 0)
 364  			slotOblRoles = append(slotOblRoles, ORNone)
 365  			slotHeads = append(slotHeads, -1)
 366  			slotModKinds = append(slotModKinds, MKNone)
 367  			newIdx := int16(len(slots) - 1)
 368  			// Temporal-noun adverbial: 昨日/今日/明日/etc. - sentence-initial
 369  			// adjuncts that surface as bare nouns but semantically modify the
 370  			// clause's verb. Mark as MKAdv with head=-1; resolved at the
 371  			// final-pass below (binds to the verb slot once it's identified).
 372  			if isJATemporalNoun(tok) {
 373  				roles[newIdx] = HistModifier
 374  				slotModKinds[newIdx] = MKAdv
 375  				slotHeads[newIdx] = -1
 376  			}
 377  			// ば-ending token signals conditional clause: mark this clause
 378  			// as ClauseIf. The lemmatizer strips ば from the verb separately.
 379  			if len(tok) >= 3 {
 380  				tb := []byte(tok)
 381  				if tb[len(tb)-3] == 0xe3 && tb[len(tb)-2] == 0x81 && tb[len(tb)-1] == 0xb0 {
 382  					clauseRelJA = ClauseIf
 383  				}
 384  			}
 385  			// Coordination resolution: と connected this slot to the previous.
 386  			if pendingCoordHeadJA >= 0 && pendingCoordHeadJA < newIdx {
 387  				slotHeads[newIdx] = pendingCoordHeadJA
 388  				slotModKinds[newIdx] = MKCoord
 389  				roles[newIdx] = roles[pendingCoordHeadJA]
 390  				pendingCoordHeadJA = -1
 391  			}
 392  			// Resolve pending POSS modifier from a preceding の.
 393  			if pendingHead >= 0 && pendingHead < newIdx {
 394  				slotHeads[pendingHead] = newIdx
 395  				slotModKinds[pendingHead] = pendingModKind
 396  				pendingHead = -1
 397  				pendingModKind = MKNone
 398  			}
 399  			// ATTR detection: i-adjective immediately preceding this noun
 400  			// (no particle between them - we'd have continued out via the
 401  			// marker branch otherwise). Heuristic: previous slot's atom ends
 402  			// in い with no intervening particle, and previous slot didn't
 403  			// already get a modifier role from a particle. Known false
 404  			// positives: な-adjectives ending in い (きれい), nouns ending
 405  			// in い (兄). Logged limitation; not silently corrupted because
 406  			// the comparison metric will catch any resulting drift.
 407  			if newIdx >= 1 {
 408  				prev := newIdx - 1
 409  				if slotHeads[prev] < 0 && slotMarkers[prev] == 0 {
 410  					prevAtom := slots[prev]
 411  					if endsInIKana(prevAtom) &&
 412  						!endsInNaiSuffix(prevAtom) && !endsInTaiSuffix(prevAtom) {
 413  						slotHeads[prev] = newIdx
 414  						slotModKinds[prev] = MKAttr
 415  					} else if endsInKuKana(prevAtom) {
 416  						slotHeads[prev] = newIdx
 417  						slotModKinds[prev] = MKAdv
 418  					} else if isJABareKanjiAdj(prevAtom) {
 419  						slotHeads[prev] = newIdx
 420  						slotModKinds[prev] = MKAttr
 421  					} else if endsInTaKana(prevAtom) {
 422  						// た-form REL: 食べた猫 (the cat that ate). A past-tense
 423  						// verb immediately preceding a noun (no particle) is a
 424  						// relative-clause predicate modifying the noun.
 425  						slotHeads[prev] = newIdx
 426  						slotModKinds[prev] = MKRel
 427  						roles[prev] = HistModifier
 428  					}
 429  				}
 430  			}
 431  			pendingRole = HistVerb
 432  		}
 433  	}
 434  
 435  	// Copula detection: if the last slot is a noun ending in だ/です/だった/でした,
 436  	// it's a copular predicate, not a verb. Strip the copula suffix, mark the
 437  	// slot with MKCop, point Head at the subject, and DO NOT apply the
 438  	// last-slot-verb-override.
 439  	copulaApplied := false
 440  	if len(slots) > 0 {
 441  		lastIdx := len(slots) - 1
 442  		if stripped, ok, copMorph := stripJACopula(slots[lastIdx]); ok {
 443  			// Verify the stripped result isn't a verb-like stem.
 444  			// (A verb past form like 食べた must keep た as past suffix, not
 445  			// be treated as copula. The disambiguator: if stripping leaves
 446  			// only hiragana that looks like a verb stem, skip copula.)
 447  			// For now, accept any non-empty stripped result on the last slot.
 448  			if len(stripped) > 0 {
 449  				slots[lastIdx] = stripped
 450  				slotMorphs[lastIdx] |= copMorph
 451  				slotModKinds[lastIdx] = MKCop
 452  				slotHeads[lastIdx] = findSubjectIdx(roles, slotHeads)
 453  				roles[lastIdx] = HistComplement
 454  				copulaApplied = true
 455  			}
 456  		}
 457  	}
 458  
 459  	// Locative-existence copula: if the final verb lemmatizes to いる/ある
 460  	// and the clause has an ORLoc-marked slot, the いる is the existence
 461  	// verb (be located). Promote the locative slot to MKCop (parallel to EN
 462  	// "is in X" representation: [X SCOPE ORLoc MKCop h=subj]) and drop the
 463  	// existence verb - its semantics is absorbed by the copular link.
 464  	if !copulaApplied && len(slots) > 1 {
 465  		lastIdx := len(slots) - 1
 466  		lastAtom := slots[lastIdx]
 467  		// いる stem after lemmatization is い (ichidan); ある stem is あ.
 468  		lem := LemmatizeJA(lastAtom, true)
 469  		if (lem.Lemma == "\xe3\x81\x84" || lem.Lemma == "\xe3\x81\x82") &&
 470  			lem.Class == VClassIchidan {
 471  			for i := 0; i < lastIdx; i++ {
 472  				if i < len(slotOblRoles) && slotOblRoles[i] == ORLoc {
 473  					slotModKinds[i] = MKCop
 474  					slotHeads[i] = findSubjectIdx(roles, slotHeads)
 475  					// Inherit both the verb's lemma morph AND any synthetic
 476  					// morph markers attached to the verb slot (Meta3Sg via ◯).
 477  					slotMorphs[i] |= lem.Morph | slotMorphs[lastIdx]
 478  					// Drop the existence-verb slot.
 479  					slots = slots[:lastIdx]
 480  					roles = roles[:lastIdx]
 481  					slotMarkers = slotMarkers[:lastIdx]
 482  					slotMorphs = slotMorphs[:lastIdx]
 483  					slotOblRoles = slotOblRoles[:lastIdx]
 484  					slotHeads = slotHeads[:lastIdx]
 485  					slotModKinds = slotModKinds[:lastIdx]
 486  					copulaApplied = true
 487  					break
 488  				}
 489  			}
 490  		}
 491  	}
 492  	// Predicative i-adjective detection: when the last slot ends in い and the
 493  	// copula stripper didn't fire, treat it as an adjectival predicate
 494  	// (面白い / "is interesting"). Set ModKind=MKAdj, Head=subject, role=
 495  	// Complement. The default last-slot=HistVerb override below is skipped.
 496  	// False positives: な-adjectives ending in い (きれい), nouns ending in い
 497  	// (兄). Accepted limitation, same profile as ATTR detection.
 498  	predAdjApplied := false
 499  	if !copulaApplied && len(slots) > 0 {
 500  		lastIdx := len(slots) - 1
 501  		atom := slots[lastIdx]
 502  		if slotHeads[lastIdx] < 0 && slotModKinds[lastIdx] == MKNone &&
 503  			endsInIKana(atom) && len(atom) > 3 &&
 504  			!endsInNaiSuffix(atom) && !endsInTaiSuffix(atom) {
 505  			slotModKinds[lastIdx] = MKAdj
 506  			slotHeads[lastIdx] = findSubjectIdx(roles, slotHeads)
 507  			roles[lastIdx] = HistComplement
 508  			// Comparative: if any earlier slot is ORCompare-marked (より),
 509  			// the predicate adjective carries MetaCompare.
 510  			for i := 0; i < lastIdx; i++ {
 511  				if i < len(slotOblRoles) && slotOblRoles[i] == ORCompare {
 512  					slotMorphs[lastIdx] |= MetaCompare
 513  					break
 514  				}
 515  			}
 516  			predAdjApplied = true
 517  		}
 518  	}
 519  	if !copulaApplied && !predAdjApplied && len(roles) > 0 {
 520  		roles[len(roles)-1] = HistVerb
 521  	}
 522  
 523  	// Passive/causative agent reinterpretation: に defaults to ORLoc (locative)
 524  	// or ORGoal (motion goal) depending on verb semantics. When the final verb
 525  	// carries MetaPassive, a に-marked slot is the agent ("by X") - flip its
 526  	// OblRole to ORAgent and role to HistModifier so EN renders it as "by X".
 527  	// MetaCausative reinterprets に-marked slot as the causer-agent similarly.
 528  	// Pre-compute the verb morph by lemmatizing here; buildSetFromSlices below
 529  	// will re-do the same lemmatization, but the cost is one extra strip on the
 530  	// last slot - cheap compared to scanning every slot for に at render time.
 531  	if !copulaApplied && !predAdjApplied && len(slots) > 0 {
 532  		lastIdx := len(slots) - 1
 533  		verbLem := LemmatizeJA(slots[lastIdx], true)
 534  		verbMorph := verbLem.Morph
 535  		if lastIdx < len(slotMorphs) {
 536  			verbMorph |= slotMorphs[lastIdx]
 537  		}
 538  		if verbMorph&(MetaPassive|MetaCausative) != 0 {
 539  			for i := 0; i < len(slots); i++ {
 540  				if i < len(slotMarkers) && slotMarkers[i] == MkNi {
 541  					if i < len(slotOblRoles) {
 542  						slotOblRoles[i] = ORAgent
 543  					}
 544  					if i < len(roles) {
 545  						roles[i] = HistModifier
 546  					}
 547  				}
 548  			}
 549  		} else if isJADitransitive(verbLem.Lemma) {
 550  			// Ditransitive: に-marked slot is the recipient (彼に本をあげる).
 551  			// Flip from default ORGoal/ORLoc to ORRecip; role HistScope to
 552  			// HistModifier for cross-language parity with EN ditransitive
 553  			// extraction.
 554  			for i := 0; i < len(slots); i++ {
 555  				if i < len(slotMarkers) && slotMarkers[i] == MkNi {
 556  					if i < len(slotOblRoles) {
 557  						slotOblRoles[i] = ORRecip
 558  					}
 559  					if i < len(roles) {
 560  						roles[i] = HistModifier
 561  					}
 562  				}
 563  			}
 564  		}
 565  	}
 566  
 567  	// Temporal-adverbial binding: any slot marked MKAdv with head=-1 from
 568  	// the temporal-noun detection above gets bound to the clause's verb
 569  	// (last slot if !copulaApplied && !predAdjApplied, otherwise no binding
 570  	// since copular/adjectival clauses have no verb to modify).
 571  	if !copulaApplied && !predAdjApplied && len(slots) > 0 {
 572  		verbIdx := int16(len(slots) - 1)
 573  		for i := 0; i < int32(verbIdx); i++ {
 574  			if i < len(slotModKinds) && slotModKinds[i] == MKAdv &&
 575  				i < len(slotHeads) && slotHeads[i] < 0 {
 576  				slotHeads[i] = verbIdx
 577  			}
 578  		}
 579  	}
 580  
 581  	// Modifier role propagation: ATTR modifiers copy their head noun's role
 582  	// (EN gives "red" the same role as "car"); ADV modifiers get HistModifier.
 583  	for i := 0; i < len(slots); i++ {
 584  		if i < len(slotModKinds) && i < len(slotHeads) && slotHeads[i] >= 0 {
 585  			h := int32(slotHeads[i])
 586  			switch slotModKinds[i] {
 587  			case MKAttr:
 588  				if h < len(roles) {
 589  					roles[i] = roles[h]
 590  				}
 591  			case MKAdv:
 592  				roles[i] = HistModifier
 593  			}
 594  		}
 595  	}
 596  
 597  	var set []SetEntry
 598  	for i, word := range slots {
 599  		role := HistComplement
 600  		if i < len(roles) {
 601  			role = roles[i]
 602  		}
 603  		// Modifier entries (Head>=0) skip verb lemmatization regardless of role-
 604  		// override at sentence-final position. A POSS/ATTR modifier on the
 605  		// last slot is still a modifier, not the clause's verb. Exception:
 606  		// MKRel modifiers ARE verbs (relative-clause predicates) and must be
 607  		// lemmatized to recover tense morph + verb class.
 608  		isVerb := role == HistVerb
 609  		head := int16(-1)
 610  		modKind := uint8(MKNone)
 611  		if i < len(slotHeads) {
 612  			head = slotHeads[i]
 613  		}
 614  		if i < len(slotModKinds) {
 615  			modKind = slotModKinds[i]
 616  		}
 617  		if modKind != MKNone && modKind != MKRel {
 618  			isVerb = false
 619  		}
 620  		if modKind == MKRel {
 621  			isVerb = true
 622  		}
 623  		lem := LemmatizeJA(word, isVerb)
 624  		mark := uint8(0)
 625  		extraMorph := uint16(0)
 626  		obl := uint8(ORNone)
 627  		if i < len(slotMarkers) {
 628  			mark = slotMarkers[i]
 629  		}
 630  		if i < len(slotMorphs) {
 631  			extraMorph = slotMorphs[i]
 632  		}
 633  		if i < len(slotOblRoles) {
 634  			obl = slotOblRoles[i]
 635  		}
 636  		set = append(set, SetEntry{
 637  			Role: role, Atom: lem.Lemma, Morph: lem.Morph | extraMorph,
 638  			Class: lem.Class, Mark: mark, OblRole: obl,
 639  			Head: head, ModKind: modKind,
 640  		})
 641  	}
 642  
 643  	// Finalize the last (or only) clause.
 644  	clausesJA = append(clausesJA, Clause{
 645  		Set: set, Relation: clauseRelJA,
 646  		Parent: clauseParentJA, HostIdx: -1,
 647  	})
 648  
 649  	// Flatten Slots/Roles across all clauses (see ExtractEN for rationale).
 650  	flatSlotsJA := []string{:0:len(slots)}
 651  	flatRolesJA := []int32{:0:len(slots)}
 652  	for _, c := range clausesJA {
 653  		for _, e := range c.Set {
 654  			flatSlotsJA = append(flatSlotsJA, e.Atom)
 655  			flatRolesJA = append(flatRolesJA, e.Role)
 656  		}
 657  	}
 658  
 659  	return ExtractResult{
 660  		Pattern: pat, Slots: flatSlotsJA, Roles: flatRolesJA,
 661  		DeepPat: buildDeepPat(flatRolesJA), Set: clausesJA[0].Set,
 662  		Discourse: clausesJA,
 663  	}
 664  }
 665  
 666  // findSubjectIdx returns the index of the entry that should serve as the
 667  // subject of a copular predicate: the first entry whose role is Topic or
 668  // Subject and that is not itself a modifier (Head=-1).
 669  // Returns -1 if no candidate found.
 670  func findSubjectIdx(roles []int32, heads []int16) int16 {
 671  	for i, r := range roles {
 672  		if i >= len(heads) {
 673  			break
 674  		}
 675  		if heads[i] >= 0 {
 676  			continue // modifier, skip
 677  		}
 678  		if r == HistTopic || r == HistSubject {
 679  			return int16(i)
 680  		}
 681  	}
 682  	return -1
 683  }
 684  
 685  // buildSetFromSlices converts parallel per-slot slices into a []SetEntry.
 686  // Used by ExtractEN both at clause boundaries and end-of-input.
 687  func buildSetFromSlices(
 688  	slots []string, roles []int32, slotMorphs []uint16,
 689  	slotMarks, slotOblRoles []uint8, slotHeads []int16, slotModKinds []uint8,
 690  ) []SetEntry {
 691  	// Role propagation before set construction: ATTR copies head's role,
 692  	// ADV gets HistModifier. Mutates roles[] in place (caller's slice).
 693  	for i := 0; i < len(slots); i++ {
 694  		if i < len(slotModKinds) && i < len(slotHeads) && slotHeads[i] >= 0 {
 695  			h := int32(slotHeads[i])
 696  			switch slotModKinds[i] {
 697  			case MKAttr:
 698  				if h < len(roles) {
 699  					roles[i] = roles[h]
 700  				}
 701  			case MKAdv:
 702  				roles[i] = HistModifier
 703  			}
 704  		}
 705  	}
 706  	var set []SetEntry
 707  	for i, lemma := range slots {
 708  		role := HistComplement
 709  		if i < len(roles) {
 710  			role = roles[i]
 711  		}
 712  		m := uint16(0)
 713  		if i < len(slotMorphs) {
 714  			m = slotMorphs[i]
 715  		}
 716  		mark := uint8(0)
 717  		if i < len(slotMarks) {
 718  			mark = slotMarks[i]
 719  		}
 720  		obl := uint8(ORNone)
 721  		if i < len(slotOblRoles) {
 722  			obl = slotOblRoles[i]
 723  		}
 724  		head := int16(-1)
 725  		modKind := uint8(MKNone)
 726  		if i < len(slotHeads) {
 727  			head = slotHeads[i]
 728  		}
 729  		if i < len(slotModKinds) {
 730  			modKind = slotModKinds[i]
 731  		}
 732  		set = append(set, SetEntry{
 733  			Role: role, Atom: lemma, Morph: m, Class: 0, Mark: mark, OblRole: obl,
 734  			Head: head, ModKind: modKind,
 735  		})
 736  	}
 737  	return set
 738  }
 739  
 740  // hasPredication returns true when the current per-slot state contains a
 741  // completed predication (a real verb slot or a copular complement).
 742  // Used to disambiguate "and" between NP coordination and clause coordination.
 743  func hasPredication(roles []int32, slotModKinds []uint8) bool {
 744  	for i, r := range roles {
 745  		if r == HistVerb {
 746  			return true
 747  		}
 748  		if i < len(slotModKinds) && slotModKinds[i] == MKCop {
 749  			return true
 750  		}
 751  	}
 752  	return false
 753  }
 754  
// ExtractEN takes EN tokens and produces pattern + slots.
// Handles determiners (the/a) as morph hints, prepositions as role markers,
// verb auxiliaries as fillers of the verb slot, and pronouns as content nouns.
//
// Tokens are scanned once, left to right. The per-slot parallel slices
// (slots, roles, slotMorphs, slotMarks, slotOblRoles, slotHeads,
// slotModKinds) describe the clause currently being built; at every clause
// boundary they are turned into a Clause via buildSetFromSlices and cleared.
//
// In the returned ExtractResult:
//   - Pattern accumulates over the whole input (it is not reset at clause
//     boundaries).
//   - Slots and Roles are flattened over all clauses, in clause order.
//   - DeepPat is built from the flattened roles.
//   - Set is the Set of the first clause.
//   - Discourse holds every clause, the last one being whatever was still
//     under construction when the tokens ran out (possibly empty).
func ExtractEN(tokens []string) ExtractResult {
	var pat []byte
	var slots []string
	var roles []int32
	var slotMorphs []uint16
	var slotMarks []uint8
	var slotOblRoles []uint8
	var slotHeads []int16
	var slotModKinds []uint8

	sawVerb := false
	contentCount := 0
	pendingRole := HistSubject
	pendingDef := false
	pendingNeg := false
	pendingMark := uint8(0)
	pendingOblRole := uint8(ORNone)
	pendingHead := int16(-1)
	pendingModKind := uint8(MKNone)
	pendingCop := false
	pendingCopHead := int16(-1)
	pendingCopMorph := uint16(0)
	// Track whether the staged copula aux is a form of "be" (vs do/have/will).
	// Only be-aux + past-participle yields passive voice; do/will + bare-V
	// is do-support or modal, not passive.
	pendingCopAuxIsBe := false
	// Volitional state: "let X V" / "let's V" means the V slot is volitional.
	pendingVol := false
	// Coordination state: when "and"/"or" is seen between content words, the
	// next content word becomes a MKCoord peer of the most recent content.
	pendingCoordHead := int16(-1)
	// REL state: when "that/which/who" follows a noun, the next emitted verb
	// becomes a MKRel modifier of that noun (intransitive REL only - flat-Set
	// representation; transitive REL with its own subject needs sub-clauses).
	pendingRelHead := int16(-1)
	// Ditransitive state: when a verb in the enDitransitive set is emitted,
	// the FIRST bare-NP (no preceding preposition) is the candidate recipient.
	// It only commits to ORRecip when a SECOND object follows; otherwise the
	// single object is a plain patient (read a book, write a letter, etc.).
	pendingRecipCand := false
	pendingRecipIdx := int16(-1)
	// Causative state: when "make"/"let" appears as the first verb and a
	// bare-V follows after an NP (made me wait, let him go), the auxiliary
	// is not emitted. Instead MetaCausative is staged for the embedded verb
	// emission. The intervening NP is the causee, emitted as plain object.
	pendingCausative := false
	pendingCausativeMorph := uint16(0)
	// resetPending: single reset point for ALL pending-state variables above.
	// Every clause-boundary site (comma/semicolon split, "and"/"or"/"but"
	// clause-coord, subordinator boundary, second-verb-no-comma boundary)
	// calls this. Adding a new pending var requires exactly one new line
	// here - never twelve scattered reset blocks to keep in sync.
	resetPending := func() {
		pendingRole = HistSubject
		pendingDef = false
		pendingNeg = false
		pendingMark = 0
		pendingOblRole = ORNone
		pendingHead = -1
		pendingModKind = MKNone
		pendingCop = false
		pendingCopHead = -1
		pendingCopMorph = 0
		pendingCopAuxIsBe = false
		pendingVol = false
		pendingCoordHead = -1
		pendingRelHead = -1
		pendingRecipCand = false
		pendingRecipIdx = -1
		pendingCausative = false
		pendingCausativeMorph = 0
	}
	// Blank assignment keeps resetPending referenced unconditionally; the
	// closure is also called at each clause boundary below.
	_ = resetPending
	// Multi-clause discourse accumulator. clauseRel/clauseParent track how
	// the CURRENT (being-built) clause relates to its parent in clauses[].
	var clauses []Clause
	clauseRel := ClauseRoot
	clauseParent := int16(-1)

	for tokIdx, tok := range tokens {
		// Clause-boundary token from tokenizeEN punctuation classification
		// (synthetic 、 emitted for ,/;/:/./?/!/—/…/♫). Finalize the current
		// clause as a ClauseAnd peer, reset per-slot state, continue.
		// The literal below is the UTF-8 encoding of U+3001 (、). A boundary
		// token with no slots collected yet is simply skipped.
		if tok == "\xe3\x80\x81" {
			if len(slots) > 0 {
				clauseSet := buildSetFromSlices(
					slots, roles, slotMorphs,
					slotMarks, slotOblRoles, slotHeads, slotModKinds,
				)
				clauseRelLocal := clauseRel
				if clauseRelLocal == ClauseRoot && len(clauses) > 0 {
					clauseRelLocal = ClauseAnd
				}
				// Index the clause appended just below will occupy.
				nextParent := int16(len(clauses))
				clauses = append(clauses, Clause{
					Set: clauseSet, Relation: clauseRelLocal,
					Parent: clauseParent, HostIdx: -1,
				})
				slots = nil
				roles = nil
				slotMorphs = nil
				slotMarks = nil
				slotOblRoles = nil
				slotHeads = nil
				slotModKinds = nil
				sawVerb = false
				contentCount = 0
				resetPending()
				clauseRel = ClauseAnd
				clauseParent = nextParent - 1
			}
			continue
		}
		low := toLowerEN(tok)
		mk, isMk := enWordToMarker()[low]
		if isMk {
			// Function word: record its marker byte in the pattern, then
			// dispatch on its functional class. Branches that emit a slot
			// overwrite this byte with SlotNoun.
			pat = append(pat, mk)
			switch enMarkerClass(mk) {
			case enMarkDeterminer:
				// Clause boundary detection: in a subordinate clause that
				// has a completed verb, a new determiner starts the main
				// clause. "if it rains the cat runs" - "the" begins main.
				if sawVerb && (clauseRel == ClauseIf || clauseRel == ClauseBecause) {
					clauseSet := buildSetFromSlices(
						slots, roles, slotMorphs,
						slotMarks, slotOblRoles, slotHeads, slotModKinds,
					)
					nextParent := int16(len(clauses))
					clauses = append(clauses, Clause{
						Set: clauseSet, Relation: clauseRel,
						Parent: nextParent, HostIdx: -1,
					})
					slots = nil
					roles = nil
					slotMorphs = nil
					slotMarks = nil
					slotOblRoles = nil
					slotHeads = nil
					slotModKinds = nil
					sawVerb = false
					contentCount = 0
					resetPending()
					clauseRel = ClauseRoot
					clauseParent = -1
				}
				if mk == MkThe {
					pendingDef = true
				} // "a" leaves pendingDef=false (indefinite)
			case enMarkNegation:
				pendingNeg = true
			case enMarkPronoun:
				// Pronouns are content nouns; emit as slot.
				contentCount++
				// Before any verb the pronoun is the subject; after a verb
				// it takes whatever role is currently staged.
				role := pendingRole
				if !sawVerb {
					role = HistSubject
				} else {
					role = pendingRole
				}
				pat[len(pat)-1] = SlotNoun
				// Prefer the lemma when the lemmatizer returns one;
				// otherwise keep the token as written.
				pronAtom := tok
				plem := LemmatizeEN(toLowerEN(tok))
				if plem.Lemma != "" {
					pronAtom = plem.Lemma
				}
				slots = append(slots, pronAtom)
				roles = append(roles, role)
				m := uint16(0)
				if pendingDef {
					m |= MetaDefDef
				}
				slotMorphs = append(slotMorphs, m)
				slotMarks = append(slotMarks, pendingMark)
				// NOTE(review): unlike the noun-emit branch, the slot's OblRole
				// is recorded as ORNone rather than pendingOblRole, and
				// pendingOblRole is not cleared below - confirm this is the
				// intended handling of preposition + pronoun ("to him").
				slotOblRoles = append(slotOblRoles, ORNone)
				slotHeads = append(slotHeads, -1)
				slotModKinds = append(slotModKinds, MKNone)
				newPronIdx := int16(len(slots) - 1)
				// Ditransitive recipient (two-noun pattern, mirrors the noun-
				// emit branch). Pronouns are common recipients (give him X).
				if pendingRecipCand {
					if pendingRecipIdx < 0 {
						if pendingMark == 0 && pendingOblRole == ORNone {
							pendingRecipIdx = newPronIdx
						} else {
							pendingRecipCand = false
						}
					} else {
						slotOblRoles[pendingRecipIdx] = ORRecip
						roles[pendingRecipIdx] = HistModifier
						pendingRecipCand = false
						pendingRecipIdx = -1
					}
				}
				pendingMark = 0
				pendingDef = false
			case enMarkPossDet:
				// Possessive determiner: emit as POSS modifier of the next noun.
				// The next slot emission resolves the Head pointer.
				pat[len(pat)-1] = SlotNoun
				slots = append(slots, possDetSurface(mk))
				roles = append(roles, HistOperator) // flattened role for the modifier
				slotMorphs = append(slotMorphs, 0)
				slotMarks = append(slotMarks, mk)
				slotOblRoles = append(slotOblRoles, ORNone)
				slotHeads = append(slotHeads, -1)
				slotModKinds = append(slotModKinds, MKNone)
				pendingHead = int16(len(slots) - 1)
				pendingModKind = MKPoss
			case enMarkVerbAux:
				// Verb-auxiliaries (be/is/are/was/were/am/do/etc) are not
				// content verbs. For copular constructions ("he is a student"),
				// the next noun becomes the copular predicate, attached via
				// MKCop to the most recent subject. No verb slot is emitted.
				//
				// If a verb has already been seen in this clause, the aux's
				// morph is merged onto the most recently emitted slot instead.
				if sawVerb && len(slotMorphs) > 0 {
					lem := LemmatizeEN(low)
					slotMorphs[len(slotMorphs)-1] |= lem.Morph
				} else {
					// No real verb. Stage copula info for the next noun.
					subj := findSubjectIdx(roles, slotHeads)
					if subj >= 0 {
						pendingCop = true
						pendingCopHead = subj
						lem := LemmatizeEN(low)
						pendingCopMorph = lem.Morph
						pendingCopAuxIsBe = lem.Lemma == "be"
						if pendingNeg {
							pendingCopMorph |= MetaPolarNeg
							pendingNeg = false
						}
					}
					pendingRole = HistComplement
				}
			case enMarkPreposition:
				// Stage role, surface mark and thematic role for the next slot.
				pendingRole = MarkerToRole(mk)
				pendingMark = mk
				pendingOblRole = MarkerToOblRole(mk)
			case enMarkConjunction:
				// "and"/"or" is either NP-coord (between nouns at same position)
				// or CLAUSE-coord (between two complete predications).
				// Disambiguator: hasPredication - if the slots collected so
				// far contain a HistVerb role or an MKCop slot, this is
				// clause-coord.
				if hasPredication(roles, slotModKinds) {
					// CLAUSE-COORD: finalize the current clause, reset
					// per-slot state, mark next clause as ClauseAnd peer.
					clauseSet := buildSetFromSlices(
						slots, roles, slotMorphs,
						slotMarks, slotOblRoles, slotHeads, slotModKinds,
					)
					clauseRelLocal := clauseRel
					clauseParentLocal := clauseParent
					if clauseRelLocal == ClauseRoot && len(clauses) > 0 {
						// Shouldn't happen but guard
						clauseRelLocal = ClauseAnd
					}
					nextParent := int16(len(clauses))
					clauses = append(clauses, Clause{
						Set: clauseSet, Relation: clauseRelLocal,
						Parent: clauseParentLocal, HostIdx: -1,
					})
					// Reset per-slot state.
					slots = nil
					roles = nil
					slotMorphs = nil
					slotMarks = nil
					slotOblRoles = nil
					slotHeads = nil
					slotModKinds = nil
					sawVerb = false
					contentCount = 0
					resetPending()
					// The next clause is a ClauseBut peer after "but" and a
					// ClauseAnd peer otherwise; its parent is set to the index
					// just before the clause appended above.
					if mk == MkBut {
						clauseRel = ClauseBut
					} else {
						clauseRel = ClauseAnd
					}
					clauseParent = nextParent - 1
					continue
				}
				// NP-COORD: existing behavior.
				if len(slots) > 0 {
					// Chain onto the first conjunct: if the previous slot is
					// itself an MKCoord peer, use its head instead.
					prev := int16(len(slots) - 1)
					if int32(prev) < len(slotModKinds) && slotModKinds[prev] == MKCoord {
						prev = slotHeads[prev]
					}
					pendingCoordHead = prev
				} else {
					pendingRole = HistSubject
					sawVerb = false
					contentCount = 0
				}
			case enMarkRelative:
				// "that/which/who" after a noun: capture the noun's index so
				// the next verb is emitted as a MKRel modifier of that noun.
				// Intransitive REL only - if a subject follows ("that I saw"),
				// the verb attaches to that subject and pendingRelHead does
				// not fire. Transitive REL needs the sub-clause path; not yet.
				// Only fires when the most recent slot has role HistSubject.
				if len(slots) > 0 && roles[len(roles)-1] == HistSubject {
					pendingRelHead = int16(len(slots) - 1)
				}
			}
			continue
		}
		if isENPunct(tok) {
			continue
		}

		// "let" / "let's" as a volitional auxiliary: marks the subsequent verb
		// as MetaMoodVol. "let" only fires at clause start (else it's the verb
		// meaning "allow/permit"). "let's" is unambiguously volitional and
		// fires anywhere before the clause's verb; the renderer may emit
		// "let's" after a subject like "we let's go" so we can't gate on
		// contentCount==0.
		if !sawVerb && low == "let's" {
			pendingVol = true
			continue
		}
		if !sawVerb && contentCount == 0 && low == "let" {
			pendingVol = true
			continue
		}

		// Subordinating conjunctions at clause start: "if" / "because" mark
		// the upcoming clause as subordinate (ClauseIf / ClauseBecause).
		// Don't emit a slot for the conjunction; consume into clauseRel.
		if !sawVerb && contentCount == 0 && len(slots) == 0 {
			switch low {
			case "if":
				clauseRel = ClauseIf
				continue
			case "because":
				clauseRel = ClauseBecause
				continue
			}
		}

		// Adverb detection: -ly suffix or hardcoded list. Adverbs modify
		// the verb in the clause. Emit as a slot with ModKind=MKAdv. Head
		// resolves at verb-emission time (forward) or via the last-emitted
		// verb slot (backward).
		//
		// When pendingCop is set, predicate-adjective check takes precedence
		// (fast is both an adverb and a predicate-adj; "is fast" wants the
		// adj reading, "runs fast" wants the adverb reading).
		isAdv := looksLikeAdverb(low)
		if pendingCop && looksLikePredicateAdj(low) {
			isAdv = false
		}
		if isAdv {
			contentCount++
			pat = append(pat, SlotNoun)
			slots = append(slots, low)
			roles = append(roles, HistModifier)
			slotMorphs = append(slotMorphs, 0)
			slotMarks = append(slotMarks, 0)
			slotOblRoles = append(slotOblRoles, ORNone)
			advIdx := int16(len(slots) - 1)
			// Find the verb to bind to. Backward: most-recent verb slot.
			// Forward: leave Head=-1; the verb-emission branch below fills
			// in every MKAdv slot that still has a negative head.
			boundHead := int16(-1)
			for i := int32(advIdx) - 1; i >= 0; i-- {
				if roles[i] == HistVerb {
					boundHead = int16(i)
					break
				}
			}
			if boundHead >= 0 {
				slotHeads = append(slotHeads, boundHead)
				slotModKinds = append(slotModKinds, MKAdv)
			} else {
				// No prior verb; wait for the next verb emission.
				slotHeads = append(slotHeads, -1)
				slotModKinds = append(slotModKinds, MKAdv)
			}
			continue
		}

		// Content word.
		contentCount++
		isVerb := looksLikeVerb(low)
		// Second-verb-no-comma clause boundary: if we've already seen a
		// verb in this clause and the current word is also a verb (with no
		// preposition or conjunction between), the current clause ends and
		// a new clause begins. Common after subordinators: "if it rains
		// stay home" - "rains" verb-of-condition, "stay" verb-of-main.
		// Suppress when pendingCausative is staged - the upcoming verb is
		// the embedded action of an analytic causative, not a new clause.
		if sawVerb && isVerb && pendingMark == 0 && pendingOblRole == ORNone &&
			!pendingCausative {
			clauseSet := buildSetFromSlices(
				slots, roles, slotMorphs,
				slotMarks, slotOblRoles, slotHeads, slotModKinds,
			)
			nextParent := int16(len(clauses))
			// Condition/cause clauses come BEFORE the main clause in surface
			// order; Parent points forward to the next root clause (its index
			// will be len(clauses), the index this new clause is about to take).
			parent := clauseParent
			if clauseRel == ClauseIf || clauseRel == ClauseBecause {
				parent = nextParent
			}
			clauses = append(clauses, Clause{
				Set: clauseSet, Relation: clauseRel,
				Parent: parent, HostIdx: -1,
			})
			slots = nil
			roles = nil
			slotMorphs = nil
			slotMarks = nil
			slotOblRoles = nil
			slotHeads = nil
			slotModKinds = nil
			sawVerb = false
			contentCount = 1 // we're about to emit this verb
			resetPending()
			// Main clause after subordinator: ClauseRoot
			clauseRel = ClauseRoot
			clauseParent = -1
		}
		// ATTR detection: pattern is [current content][next content][verb or end].
		// If current and next are both content non-verbs, and the position
		// after next is a verb, a marker, or the end of the tokens, then
		// current modifies next.
		// Examples: "red car runs" - red is ATTR of car (runs is the verb).
		isAttr := false
		if !isVerb && tokIdx+1 < len(tokens) {
			next := toLowerEN(tokens[tokIdx+1])
			if _, nextIsMk := enWordToMarker()[next]; !nextIsMk && !isENPunct(tokens[tokIdx+1]) && !looksLikeVerb(next) {
				// Two-token check: position after next must be verb or end.
				if tokIdx+2 >= len(tokens) {
					isAttr = true
				} else {
					afterNext := toLowerEN(tokens[tokIdx+2])
					if _, anIsMk := enWordToMarker()[afterNext]; anIsMk {
						// Marker (preposition/etc) after the noun phrase - NP ends here.
						isAttr = true
					} else if looksLikeVerb(afterNext) {
						isAttr = true
					}
				}
			}
		}
		// Sister check: if previous slot was ATTR-tagged at extract time, or a
		// pendingHead is awaiting this slot as its head, or a copular predicate
		// or coordination peer is pending, the current word is the predicate /
		// noun, not a verb.
		prevWasAttrOrPending := pendingHead >= 0 || pendingCop || pendingCoordHead >= 0
		if len(slotModKinds) > 0 && slotModKinds[len(slotModKinds)-1] == MKAttr {
			prevWasAttrOrPending = true
		}
		// Positional fallback: the second content word of a clause that has
		// no verb yet is promoted to verb even if it does not look like one,
		// unless one of the exclusions below applies.
		if !isVerb && !sawVerb && contentCount == 2 && pendingRole == HistObject {
			// pendingRole was set to Object by a preposition; not a verb position
		} else if !isVerb && !sawVerb && contentCount == 2 && !isAttr && !prevWasAttrOrPending {
			isVerb = true
		}
		// Predicative adjective: when a copula is staged ("is/are/was/were")
		// and the current word looks like a predicate adjective (deverbal -ing,
		// -ful, -ous, -ic, -able, -ible) or its lemma does (bigger→big), emit
		// as HistComplement with MKAdj pointing at the subject. The copula's
		// morph (tense, 3sg) merges with the lemma's morph (e.g. MetaCompare
		// for comparatives) into the adjective's morph; no separate verb slot.
		if pendingCop {
			// Surface-form check first (interesting, big, etc.) - atom stays
			// as the surface. Only when surface miss AND lemma-form matches
			// (comparatives: bigger→big) do we use the lemma+lem.Morph.
			useLemma := false
			var lem LemmaResult
			if !looksLikePredicateAdj(low) {
				lem = LemmatizeEN(low)
				if looksLikePredicateAdj(lem.Lemma) {
					useLemma = true
				}
			}
			if looksLikePredicateAdj(low) || useLemma {
				pat = append(pat, SlotNoun)
				atom := low
				m := pendingCopMorph
				if useLemma {
					atom = lem.Lemma
					m |= lem.Morph
				}
				slots = append(slots, atom)
				roles = append(roles, HistComplement)
				if pendingNeg {
					m |= MetaPolarNeg
					pendingNeg = false
				}
				slotMorphs = append(slotMorphs, m)
				slotMarks = append(slotMarks, 0)
				slotOblRoles = append(slotOblRoles, ORNone)
				slotHeads = append(slotHeads, pendingCopHead)
				slotModKinds = append(slotModKinds, MKAdj)
				pendingCop = false
				pendingCopHead = -1
				pendingCopMorph = 0
				pendingCopAuxIsBe = false
				// The adjective completes the predication for this clause.
				sawVerb = true
				continue
			}
		}
		// REL-intransitive: if a relative pronoun staged pendingRelHead and
		// the current word is a verb, emit it as a MKRel modifier of the
		// host noun. Do NOT set sawVerb so the next real verb still becomes
		// the clause's main predicate.
		if pendingRelHead >= 0 && isVerb {
			pat = append(pat, SlotVerb)
			lem := LemmatizeEN(low)
			slots = append(slots, lem.Lemma)
			roles = append(roles, HistModifier)
			m := lem.Morph
			// On a verb, a plural-number bit from the lemmatizer is
			// replaced by the 3sg bit.
			if m&MetaNumPlural != 0 {
				m = (m &^ MetaNumPlural) | Meta3Sg
			}
			if pendingNeg {
				m |= MetaPolarNeg
				pendingNeg = false
			}
			slotMorphs = append(slotMorphs, m)
			slotMarks = append(slotMarks, 0)
			slotOblRoles = append(slotOblRoles, ORNone)
			slotHeads = append(slotHeads, pendingRelHead)
			slotModKinds = append(slotModKinds, MKRel)
			pendingRelHead = -1
			continue
		}
		// Causative: emit the embedded verb with MetaCausative. The aux
		// (make/let) was suppressed at its position; its tense morph rides
		// on the embedded verb.
		if pendingCausative && isVerb {
			pat = append(pat, SlotVerb)
			lem := LemmatizeEN(low)
			slots = append(slots, lem.Lemma)
			roles = append(roles, HistVerb)
			m := lem.Morph | MetaCausative | pendingCausativeMorph
			if m&MetaNumPlural != 0 {
				m = (m &^ MetaNumPlural) | Meta3Sg
			}
			if pendingNeg {
				m |= MetaPolarNeg
				pendingNeg = false
			}
			slotMorphs = append(slotMorphs, m)
			slotMarks = append(slotMarks, 0)
			slotOblRoles = append(slotOblRoles, ORNone)
			slotHeads = append(slotHeads, -1)
			slotModKinds = append(slotModKinds, MKNone)
			pendingCausative = false
			pendingCausativeMorph = 0
			continue
		}
		if !sawVerb && isVerb {
			lem := LemmatizeEN(low)
			// Causative aux detection: "make"/"let" + (NP) + bare-V. Suppress
			// the aux emission, stage MetaCausative for the next verb.
			if (lem.Lemma == "make" || lem.Lemma == "let") &&
				causativeBareVFollows(tokens, tokIdx) {
				pendingCausative = true
				pendingCausativeMorph = lem.Morph
				sawVerb = true
				pendingRole = HistObject
				continue
			}
			pat = append(pat, SlotVerb)
			slots = append(slots, lem.Lemma)
			roles = append(roles, HistVerb)
			m := lem.Morph
			if m&MetaNumPlural != 0 {
				m = (m &^ MetaNumPlural) | Meta3Sg
			}
			if pendingNeg {
				m |= MetaPolarNeg
				pendingNeg = false
			}
			if pendingVol {
				m |= MetaMoodVol
				pendingVol = false
			}
			// If a verb-aux had staged copula info but the next content turns
			// out to be a real verb, the aux is an auxiliary helper, not a
			// copula. The grammar of the verb form determines which:
			//   "is/was + V-ing"  -> progressive aspect on V
			//   "is/was + V-ed/en" (past participle, not progressive) -> passive
			// LemmatizeEN sets MetaAspectProg for -ing forms, MetaTensePast
			// for -ed/irregulars. Aspect bit distinguishes prog from passive.
			if pendingCop {
				m |= pendingCopMorph
				// Passive voice requires a form of "be" as auxiliary plus
				// past-participle form on the embedded verb. "did + V" is
				// do-support (do-support + bare V = emphatic/negative/
				// question), not passive. Only be-aux + past tense (with
				// no progressive) yields passive.
				if pendingCopAuxIsBe &&
					m&MetaAspectProg == 0 && m&MetaTensePast != 0 {
					m |= MetaPassive
					// The aux's tense ("is"=non-past, "was"=past) overrides
					// the participle's "past" reading - "was bitten" is past
					// passive, "is bitten" is non-past passive.
					if pendingCopMorph&MetaTensePast == 0 {
						m &^= MetaTensePast
					}
				}
				pendingCop = false
				pendingCopHead = -1
				pendingCopMorph = 0
				pendingCopAuxIsBe = false
			}
			slotMorphs = append(slotMorphs, m)
			slotMarks = append(slotMarks, 0)
			slotOblRoles = append(slotOblRoles, ORNone)
			slotHeads = append(slotHeads, -1)
			slotModKinds = append(slotModKinds, MKNone)
			verbIdx := int16(len(slots) - 1)
			// Resolve any pre-verb adverbs that were waiting for a verb head.
			for i := 0; i < int32(verbIdx); i++ {
				if slotModKinds[i] == MKAdv && slotHeads[i] < 0 {
					slotHeads[i] = verbIdx
				}
			}
			// Ditransitive: if this verb takes a bare-NP recipient before the
			// patient ("give X Y"), flag the next-noun-with-no-preposition as
			// a recipient candidate. Commits to ORRecip only if a second
			// object follows ("give him a book"); single-object uses keep
			// the noun as plain patient ("read a book").
			if isEnDitransitive(lem.Lemma) {
				pendingRecipCand = true
				pendingRecipIdx = -1
			}
			sawVerb = true
			pendingRole = HistObject
		} else {
			// Everything else is emitted as a noun slot.
			pat = append(pat, SlotNoun)
			lem := LemmatizeEN(low)
			slots = append(slots, lem.Lemma)
			// Before any verb (and with no copula staged) the noun is the
			// subject; otherwise it takes the staged role.
			role := pendingRole
			if !sawVerb && !pendingCop {
				role = HistSubject
			}
			roles = append(roles, role)
			m := lem.Morph
			if pendingDef {
				m |= MetaDefDef
				pendingDef = false
			}
			slotMorphs = append(slotMorphs, m)
			slotMarks = append(slotMarks, pendingMark)
			slotOblRoles = append(slotOblRoles, pendingOblRole)
			slotHeads = append(slotHeads, -1)
			slotModKinds = append(slotModKinds, MKNone)
			newIdx := int16(len(slots) - 1)
			// Ditransitive recipient (two-noun pattern): the first bare-NP
			// after a ditransitive verb is staged as a candidate; the second
			// noun's arrival promotes the first to ORRecip + HistModifier.
			// A preposition on the first noun cancels (existing prep path
			// handles "give book to him"). Single-object uses leave the
			// candidate uncommitted so it stays a plain object.
			if pendingRecipCand {
				if pendingRecipIdx < 0 {
					// This is the first noun. Stage as candidate unless a
					// preposition fired.
					if pendingMark == 0 && pendingOblRole == ORNone {
						pendingRecipIdx = newIdx
					} else {
						pendingRecipCand = false
					}
				} else {
					// Second noun arrives - promote the candidate.
					slotOblRoles[pendingRecipIdx] = ORRecip
					roles[pendingRecipIdx] = HistModifier
					pendingRecipCand = false
					pendingRecipIdx = -1
				}
			}
			// Coordination resolution: if "and"/"or" set pendingCoordHead,
			// this noun is a peer conjunct of that slot. Inherit its role.
			if pendingCoordHead >= 0 && pendingCoordHead < newIdx {
				slotHeads[newIdx] = pendingCoordHead
				slotModKinds[newIdx] = MKCoord
				roles[newIdx] = roles[pendingCoordHead]
				pendingCoordHead = -1
			}
			// Copula resolution: if a verb-aux staged copula state, this noun
			// is the predicate. Bind Head=subject, ModKind=MKCop, merge morph.
			// This runs after coordination resolution, so it overwrites an
			// MKCoord head/kind set just above.
			if pendingCop {
				slotHeads[newIdx] = pendingCopHead
				slotModKinds[newIdx] = MKCop
				slotMorphs[newIdx] |= pendingCopMorph
				pendingCop = false
				pendingCopHead = -1
				pendingCopMorph = 0
				pendingCopAuxIsBe = false
			}
			// Resolve a pending POSS/ATTR modifier from a preceding determiner
			// or ATTR-detected adjective.
			if pendingHead >= 0 && pendingHead < newIdx {
				slotHeads[pendingHead] = newIdx
				slotModKinds[pendingHead] = pendingModKind
				// Transfer pending def/morph from modifier to head (the
				// determiner applied to the noun phrase, whose head is this).
				if pendingModKind == MKAttr {
					// Move MetaDefDef from modifier to head if present.
					if slotMorphs[pendingHead]&MetaDefDef != 0 {
						slotMorphs[newIdx] |= MetaDefDef
						slotMorphs[pendingHead] &^= MetaDefDef
					}
					// If the modifier was emitted as subject, the head
					// becomes the subject.
					if roles[pendingHead] == HistSubject {
						roles[newIdx] = HistSubject
					}
				}
				pendingHead = -1
				pendingModKind = MKNone
			}
			// ATTR detection (forward-looking): if the lookahead identified
			// the current slot as an ATTR modifier of the next noun, set up
			// pendingHead so the next slot emission resolves it.
			if isAttr {
				pendingHead = newIdx
				pendingModKind = MKAttr
			}
			pendingMark = 0
			pendingOblRole = ORNone
			// After the first post-verb noun, later bare nouns are staged
			// as complements rather than objects.
			if sawVerb && pendingRole == HistObject {
				pendingRole = HistComplement
			}
		}
	}

	// Finalize the last clause. This runs unconditionally, so clauses is
	// never empty below even when no slot was emitted.
	finalSet := buildSetFromSlices(
		slots, roles, slotMorphs,
		slotMarks, slotOblRoles, slotHeads, slotModKinds,
	)
	clauses = append(clauses, Clause{
		Set: finalSet, Relation: clauseRel,
		Parent: clauseParent, HostIdx: -1,
	})

	// Flatten Slots/Roles across all clauses so the atom-link layer sees
	// every word. With punct-aware tokenization, clauses are finalized
	// mid-input and the per-clause `slots` array gets reset; only the final
	// clause would otherwise be visible in ExtractResult.Slots.
	// NOTE(review): the two literals below presumably create zero-length
	// slices with capacity len(slots) (the final clause's slot count) -
	// confirm against the dialect's slice-literal syntax.
	flatSlots := []string{:0:len(slots)}
	flatRoles := []int32{:0:len(slots)}
	for _, c := range clauses {
		for _, e := range c.Set {
			flatSlots = append(flatSlots, e.Atom)
			flatRoles = append(flatRoles, e.Role)
		}
	}

	// Set exposes only the first clause; Discourse carries all of them.
	return ExtractResult{
		Pattern: pat, Slots: flatSlots, Roles: flatRoles,
		DeepPat: buildDeepPat(flatRoles), Set: clauses[0].Set,
		Discourse: clauses,
	}
}
1516  
1517  // ExtractCode takes code tokens and produces pattern + slots.
1518  // Structural keywords become markers. Identifiers/literals become slots.
1519  func ExtractCode(tokens []string) ExtractResult {
1520  	var pat []byte
1521  	var slots []string
1522  	var roles []int32
1523  
1524  	for _, tok := range tokens {
1525  		mk := codeTokenToMarker(tok)
1526  		if mk != 0 {
1527  			pat = append(pat, mk)
1528  		} else {
1529  			pat = append(pat, SlotNoun)
1530  			slots = append(slots, tok)
1531  			roles = append(roles, HistComplement)
1532  		}
1533  	}
1534  	return ExtractResult{Pattern: pat, Slots: slots, Roles: roles, DeepPat: buildDeepPat(roles)}
1535  }
1536  
1537  func codeTokenToMarker(tok string) uint8 {
1538  	switch tok {
1539  	case "if":
1540  		return MkIf
1541  	case "else":
1542  		return MkElse
1543  	case "for", "range":
1544  		return MkFor_C
1545  	case "return":
1546  		return MkReturn
1547  	case "{":
1548  		return MkLBrace
1549  	case "}":
1550  		return MkRBrace
1551  	case "(":
1552  		return MkLParen
1553  	case ")":
1554  		return MkRParen
1555  	case "=", ":=":
1556  		return MkAssign
1557  	case ".":
1558  		return MkDot
1559  	case ",":
1560  		return MkComma
1561  	case ":":
1562  		return MkColon
1563  	case "->", "<-":
1564  		return MkArrow
1565  	case "case":
1566  		return MkCase
1567  	case "select":
1568  		return MkSelect
1569  	case "spawn":
1570  		return MkSpawn
1571  	case "chan":
1572  		return MkChan
1573  	}
1574  	return 0
1575  }
1576  
// EN marker functional classes.
//
// These are the values returned by enMarkerClass, which groups English
// marker bytes by grammatical function. The words after each constant are
// examples of the class; the markers actually assigned to a class are the
// ones listed in the corresponding case of enMarkerClass.
const (
	enMarkUnknown     = 0 // fallback: marker is not listed in any enMarkerClass case
	enMarkDeterminer  = 1 // the, a, an
	enMarkPossDet     = 2 // my, your, his, her, its, our, their (POSS modifiers)
	enMarkVerbAux     = 3 // is, are, was, do, have...
	enMarkPreposition = 4 // in, on, at, with, by...
	enMarkNegation    = 5 // not, n't
	enMarkConjunction = 6 // and, but, or
	enMarkRelative    = 7 // that, which, who
	enMarkPronoun     = 8 // i, you, he, she, it, we, they (subject pronouns)
)
1589  
1590  func enMarkerClass(mk uint8) int32 {
1591  	switch mk {
1592  	case MkThe, MkA:
1593  		return enMarkDeterminer
1594  	case MkMy, MkYour, MkHis, MkHerP, MkIts, MkOurP, MkTheirP:
1595  		return enMarkPossDet
1596  	case MkIs, MkAre, MkWas, MkDo:
1597  		return enMarkVerbAux
1598  	case MkIn, MkOn, MkAt, MkWith, MkBy, MkFor, MkTo_EN, MkOf, MkFrom, MkAs, MkThan:
1599  		return enMarkPreposition
1600  	case MkNot:
1601  		return enMarkNegation
1602  	case MkAnd, MkBut:
1603  		return enMarkConjunction
1604  	case MkThat:
1605  		return enMarkRelative
1606  	case MkI, MkYou, MkPron3, MkIt_EN, MkWe_EN, MkThey_:
1607  		return enMarkPronoun
1608  	}
1609  	return enMarkUnknown
1610  }
1611  
1612  // looksLikeAdverb returns true if w is likely an adverb: -ly suffix or in
1613  // a hardcoded list of common irregular adverbs that aren't morphologically
1614  // derivable.
1615  func looksLikeAdverb(w string) bool {
1616  	if len(w) > 3 && hasSuffix(w, "ly") {
1617  		return true
1618  	}
1619  	switch w {
1620  	case "fast", "well", "hard", "here", "there", "now", "then",
1621  		"today", "yesterday", "tomorrow", "always", "often", "never",
1622  		"sometimes", "usually", "rarely", "still", "already", "yet",
1623  		"soon", "later", "early", "late", "ever", "again":
1624  		return true
1625  	}
1626  	return false
1627  }
1628  
1629  // causativeBareVFollows returns true if tokens after tokIdx contain a bare
1630  // infinitive verb (no intervening "to") within the next NP-shaped lookahead
1631  // window. Used to detect "make/let + NP + bare-V" causative pattern.
1632  // NP shape: at most 3 tokens (det + adj + noun, or possessive + noun, or
1633  // single pronoun) before the bare verb.
1634  func causativeBareVFollows(tokens []string, tokIdx int32) bool {
1635  	for k := 1; k <= 4 && tokIdx+k < len(tokens); k++ {
1636  		t := toLowerEN(tokens[tokIdx+k])
1637  		if t == "to" {
1638  			return false // "to V" infinitival, not bare-V causative
1639  		}
1640  		if mk, isMk := enWordToMarker()[t]; isMk {
1641  			switch enMarkerClass(mk) {
1642  			case enMarkPreposition, enMarkConjunction:
1643  				return false
1644  			}
1645  			continue // determiner, pronoun, possessive - part of the NP
1646  		}
1647  		if looksLikeVerb(t) {
1648  			return true
1649  		}
1650  	}
1651  	return false
1652  }
1653  
// isEnDitransitive reports whether lemma is a verb that takes a bare-NP
// recipient before the patient object: "give X Y" = "give Y to X". When such
// a verb is emitted and the next noun has no preceding preposition, that
// noun is the recipient (HistModifier + ORRecip), not a second direct
// object.
//
// The set is closed; verbs that always take a prepositional dative (e.g.
// "explain X to Y") are deliberately excluded. Any lemma outside the list,
// including the empty string, yields false.
func isEnDitransitive(lemma string) bool {
	// Each lemma is listed exactly once: a repeated constant in a case list
	// is rejected by Go compilers and would be dead in any event.
	switch lemma {
	case "give", "send", "tell", "show", "offer", "hand", "pass",
		"teach", "write", "read", "sell", "buy", "bring",
		"mail", "lend", "owe", "pay", "throw", "ask":
		return true
	}
	return false
}
1669  
1670  // looksLikePredicateAdj returns true for words that, when following a copula
1671  // (is/are/was/were), are predicative adjectives rather than verbs or nouns.
1672  // Used to disambiguate "is interesting" (predicate-adj) from "is V-ing"
1673  // (progressive verb) or "is X" (copular noun).
1674  //
1675  // Detection: three layers
1676  //   1. Common short adjective whitelist (big, small, fast, hungry, ...)
1677  //   2. Deverbal -ing predicate adjectives (interesting, exciting, ...)
1678  //   3. Adjective-shape suffixes (-ful, -ous, -ic, -able, -ible)
1679  //
1680  // Conservative; false negatives fall through to MKCop (noun-complement) which
1681  // preserves semantics but loses the JA-side MKAdj-parity for round-trip.
1682  func looksLikePredicateAdj(w string) bool {
1683  	if len(w) < 2 {
1684  		return false
1685  	}
1686  	// Common short adjective whitelist - parity with JA i-adj predicates.
1687  	switch w {
1688  	case "big", "small", "tall", "short", "long", "wide", "narrow",
1689  		"thick", "thin", "deep", "shallow", "high", "low",
1690  		"hot", "cold", "warm", "cool", "wet", "dry",
1691  		"fast", "slow", "quick", "old", "new", "young",
1692  		"good", "bad", "nice", "fine", "great", "poor",
1693  		"happy", "sad", "angry", "tired", "hungry", "thirsty",
1694  		"sleepy", "busy", "lazy", "easy", "hard", "soft",
1695  		"loud", "quiet", "clean", "dirty", "empty", "full",
1696  		"rich", "weak", "strong", "smart", "kind", "mean",
1697  		"red", "blue", "green", "yellow", "white", "black",
1698  		"pink", "brown", "gray", "grey", "purple", "orange",
1699  		"heavy", "light", "free", "cheap", "expensive",
1700  		"safe", "sick", "well", "ill", "ready", "right", "wrong",
1701  		"true", "false", "real", "fake", "open", "closed",
1702  		"bright", "dark", "sweet", "sour", "salty", "bitter",
1703  		"round", "square", "flat", "sharp", "dull",
1704  		"strange", "weird", "normal", "common", "rare",
1705  		"important", "famous", "popular", "different", "similar",
1706  		"alive", "dead", "alone", "together":
1707  		return true
1708  	}
1709  	if len(w) < 4 {
1710  		return false
1711  	}
1712  	// Deverbal -ing predicate adjectives.
1713  	switch w {
1714  	case "interesting", "exciting", "boring", "tiring", "amazing",
1715  		"frightening", "surprising", "confusing", "disappointing",
1716  		"satisfying", "encouraging", "pleasing", "fascinating",
1717  		"depressing", "embarrassing", "shocking", "thrilling",
1718  		"charming", "annoying", "relaxing", "stunning",
1719  		"missing", "willing", "outstanding", "promising":
1720  		return true
1721  	}
1722  	// Adjective-shape suffixes.
1723  	if hasSuffix(w, "ful") || hasSuffix(w, "ous") || hasSuffix(w, "ic") ||
1724  		hasSuffix(w, "able") || hasSuffix(w, "ible") {
1725  		return true
1726  	}
1727  	return false
1728  }
1729  
1730  // possDetSurface returns the surface form of an EN possessive determiner
1731  // for round-trip rendering.
1732  func possDetSurface(mk uint8) string {
1733  	switch mk {
1734  	case MkMy:
1735  		return "my"
1736  	case MkYour:
1737  		return "your"
1738  	case MkHis:
1739  		return "his"
1740  	case MkHerP:
1741  		return "her"
1742  	case MkIts:
1743  		return "its"
1744  	case MkOurP:
1745  		return "our"
1746  	case MkTheirP:
1747  		return "their"
1748  	}
1749  	return ""
1750  }
1751  
// toLowerEN returns s with the ASCII letters 'A'-'Z' folded to lower case.
// Every other byte, including the bytes of multi-byte UTF-8 sequences, is
// copied through unchanged.
func toLowerEN(s string) string {
	buf := []byte(s)
	for i := 0; i < len(buf); i++ {
		if buf[i] >= 'A' && buf[i] <= 'Z' {
			// Upper and lower case ASCII letters are a fixed distance apart.
			buf[i] = buf[i] + ('a' - 'A')
		}
	}
	return string(buf)
}
1761  
// isENPunct reports whether s is exactly one byte long and that byte is one
// of the punctuation characters . , ! ? ; : " '
func isENPunct(s string) bool {
	if len(s) != 1 {
		return false
	}
	switch s[0] {
	case '.', ',', '!', '?', ';', ':', '"', '\'':
		return true
	}
	return false
}
1769  
1770  // looksLikeVerb is a heuristic for EN verb detection.
1771  // Uses common verb endings and a high-frequency set.
1772  // Min-length guards on suffix detection prevent short-word false positives
1773  // (red/fed/led/bed all end in -ed; sing/king/ring all end in -ing).
1774  func looksLikeVerb(w string) bool {
1775  	if len(w) < 2 {
1776  		return false
1777  	}
1778  	if (hasSuffix(w, "ing") && len(w) > 4) ||
1779  		(hasSuffix(w, "ed") && len(w) > 3) ||
1780  		hasSuffix(w, "ize") || hasSuffix(w, "ise") || hasSuffix(w, "ate") {
1781  		return true
1782  	}
1783  	if hasSuffix(w, "fy") || (hasSuffix(w, "en") && len(w) > 3) {
1784  		return true
1785  	}
1786  	switch w {
1787  	case
1788  		"go", "went", "gone", "goes",
1789  		"eat", "eats", "drink", "drinks", "read", "reads",
1790  		"write", "writes", "walk", "walks", "talk", "talks",
1791  		"sleep", "sleeps", "wake", "wakes", "sit", "sits",
1792  		"stand", "stands", "lie", "lies", "live", "lives",
1793  		"die", "dies", "chew", "chews", "fly", "flies",
1794  		"swim", "swims", "jump", "jumps", "throw", "throws",
1795  		"catch", "catches", "kick", "kicks", "hit", "hits",
1796  		"push", "pushes", "pull", "pulls", "grab", "grabs",
1797  		"bite", "bites", "chase", "chases",
1798  		"get", "got", "gotten", "gets",
1799  		"make", "made", "makes",
1800  		"take", "took", "taken", "takes",
1801  		"come", "came", "comes",
1802  		"see", "saw", "seen", "sees",
1803  		"know", "knew", "known", "knows",
1804  		"give", "gave", "given", "gives",
1805  		"say", "said", "says",
1806  		"tell", "told", "tells",
1807  		"think", "thought", "thinks",
1808  		"find", "found", "finds",
1809  		"leave", "left", "leaves",
1810  		"call", "calls",
1811  		"ask", "asks",
1812  		"seem", "seems",
1813  		"feel", "felt", "feels",
1814  		"become", "became", "becomes",
1815  		"keep", "kept", "keeps",
1816  		"begin", "began", "begun", "begins",
1817  		"show", "shows",
1818  		"hear", "heard", "hears",
1819  		"play", "plays",
1820  		"move", "moves",
1821  		"live", "lives",
1822  		"believe", "believes",
1823  		"hold", "held", "holds",
1824  		"bring", "brought", "brings",
1825  		"happen", "happens",
1826  		"write", "wrote", "writes",
1827  		"provide", "provides",
1828  		"sit", "sat", "sits",
1829  		"stand", "stood", "stands",
1830  		"lose", "lost", "loses",
1831  		"pay", "paid", "pays",
1832  		"meet", "met", "meets",
1833  		"include", "includes",
1834  		"continue", "continues",
1835  		"learn", "learns",
1836  		"change", "changes",
1837  		"lead", "led", "leads",
1838  		"understand", "understood",
1839  		"watch", "watches",
1840  		"follow", "follows",
1841  		"stop", "stops",
1842  		"create", "creates",
1843  		"speak", "spoke", "speaks",
1844  		"read", "reads",
1845  		"allow", "allows",
1846  		"add", "adds",
1847  		"spend", "spent", "spends",
1848  		"grow", "grew", "grows",
1849  		"open", "opens",
1850  		"walk", "walks",
1851  		"win", "won", "wins",
1852  		"teach", "taught",
1853  		"offer", "offers",
1854  		"remember", "remembers",
1855  		"love", "loves",
1856  		"consider", "considers",
1857  		"appear", "appears",
1858  		"buy", "bought", "buys",
1859  		"wait", "waits",
1860  		"serve", "serves",
1861  		"die", "died", "dies",
1862  		"send", "sent", "sends",
1863  		"expect", "expects",
1864  		"build", "built", "builds",
1865  		"stay", "stays",
1866  		"fall", "fell", "falls",
1867  		"cut", "cuts",
1868  		"reach", "reaches",
1869  		"kill", "kills",
1870  		"remain", "remains",
1871  		"suggest", "suggests",
1872  		"raise", "raises",
1873  		"pass", "passes",
1874  		"sell", "sold", "sells",
1875  		"require", "requires",
1876  		"report", "reports",
1877  		"decide", "decides",
1878  		"pull", "pulls",
1879  		"develop", "develops",
1880  		"use", "uses", "put", "puts", "set", "sets", "run", "runs",
1881  		"let", "lets", "try", "tries", "need", "needs", "want", "wants",
1882  		"start", "starts",
1883  		"help", "helps",
1884  		"turn", "turns",
1885  		"work", "works",
1886  		"like", "likes",
1887  		"look", "looks",
1888  		"mean", "means", "meant",
1889  		"can", "could", "will", "would", "shall", "should", "may", "might", "must":
1890  		return true
1891  	}
1892  	return false
1893  }
1894  
// hasSuffix reports whether s ends with suffix. An empty suffix matches
// every string; a suffix longer than s never matches.
func hasSuffix(s, suffix string) bool {
	start := len(s) - len(suffix)
	return start >= 0 && s[start:] == suffix
}
1901  
1902  // buildDeepPat creates a canonical (sorted, normalized) role sequence from roles.
1903  func buildDeepPat(roles []int32) []uint8 {
1904  	if len(roles) == 0 {
1905  		return nil
1906  	}
1907  	dp := []uint8{:len(roles):len(roles)}
1908  	for i, r := range roles {
1909  		nr := r
1910  		if nr == HistTopic {
1911  			nr = HistSubject
1912  		}
1913  		dp[i] = uint8(nr)
1914  	}
1915  	// Insertion sort (patterns are short, 3-8 elements).
1916  	for i := 1; i < len(dp); i++ {
1917  		key := dp[i]
1918  		j := i - 1
1919  		for j >= 0 && dp[j] > key {
1920  			dp[j+1] = dp[j]
1921  			j--
1922  		}
1923  		dp[j+1] = key
1924  	}
1925  	return dp
1926  }
1927