package iskra

// UntranslatedMarker is the placeholder emitted by cross-language
// translation when no atom-link mapping exists for the source atom.
// Renderers emit it verbatim and skip inflection. The marker is ASCII-
// safe (no language-specific bytes) and visible in output for diagnosis.
const UntranslatedMarker = "[missing]"

// SetEntry represents one element of the sentence-as-set abstraction.
//
// Lossless canonical form: Role + Atom + Morph + Class + OblRole + Head + ModKind.
// Mark is carried alongside; its field comment describes it as
// within-language preservation of the original particle/preposition.
//
// Three-layer role schema:
//
//	Role         - core grammatical role (Subject/Object/Verb/Topic/etc) from
//	               the histogram. RRG macroroles.
//	OblRole      - thematic/oblique role (Goal/Loc/Instr/etc) for adjunct slots.
//	               Language-independent.
//	Head/ModKind - structural relation to another entry in the Set. Entry is
//	               a top-level argument when Head=-1; otherwise it modifies the
//	               entry at index Head with relation ModKind (POSS, ATTR, etc).
//
// Head is an index into the current Set. Valid only when the slot order is
// the canonical extraction order for the language (modifier-before-head for
// POSS and ATTR in both JA and EN). If translation ever reorders entries, or
// if relative-clause modifiers introduce post-head ordering, switch to stable
// IDs.
type SetEntry struct {
	Role    int32  // macrorole (Subject/Object/Verb/Topic/...)
	Atom    string // region center (lemma/stem)
	Morph   uint16 // tense|aspect|polarity|formality|number|def|mood|3sg|passive|causative|...
	Class   uint8  // verb class (for verbs): 1=ichidan, 2-10=godan variants
	Mark    uint8  // original particle/preposition (within-language preservation)
	OblRole uint8  // thematic role: one of the OR* constants; ORNone for core args
	Head    int16  // index of head entry; -1 if top-level argument
	ModKind uint8  // modification kind: MKNone (top-level) or one of the MK* constants below
}

// Modification kinds (for Head/ModKind nesting).
//
// Layered semantics:
//   - POSS/ATTR: structural modifier of another argument (Head points at the
//     modified entry). The modifier doesn't have a clause-level role on its own.
//   - COP: copular predicate of the subject. The complement asserts an identity
//     or attribution about the entry at Head. No verb slot exists in the clause.
//     The complement's Morph carries tense/aspect/polite (だ vs です vs だった).
//   - ADJ: predicative adjective; like COP, Head points at the subject.
//   - ADV: adverbial; Head points at the verb.
//   - COORD: coordination peer; Head points at the first conjunct.
//   - REL: relative-clause predicate; Head points at the host noun.
//   - APP: reserved, not yet implemented.
//
// The numeric values are explicit because they are stored in SetEntry.ModKind.
const (
	MKNone  uint8 = 0
	MKPoss  uint8 = 1 // possessive (私の魚 / my fish)
	MKAttr  uint8 = 2 // attributive (赤い車 / red car) - adjective modifies noun
	MKCop   uint8 = 3 // copular predicate (学生だ / is a student); Head = subject
	MKAdv   uint8 = 4 // adverbial (速く走る / runs fast); Head = verb
	MKCoord uint8 = 5 // coordination peer (猫と犬 / cats and dogs); Head = first conjunct
	MKRel   uint8 = 6 // relative clause; modifier verb's Head=host noun
	MKApp   uint8 = 7 // RESERVED: apposition; not yet implemented
	MKAdj   uint8 = 8 // predicative adjective (面白い / is interesting); Head=subject
)

// Oblique role values. Language-independent thematic relations.
// Use these for adjunct slots; ORNone means the slot's role is purely macro
// (Subject/Object/Verb).
const (
	ORNone    uint8 = 0
	ORGoal    uint8 = 1  // to, へ, に-motion
	ORLoc     uint8 = 2  // in/on/at, で-location, に-stative
	ORSource  uint8 = 3  // from, から
	ORLimit   uint8 = 4  // until, まで
	ORInstr   uint8 = 5  // with-instrument, で-instrumental
	ORComit   uint8 = 6  // with-companion, と
	ORBenef   uint8 = 7  // for, のために
	ORAgent   uint8 = 8  // by, によって (passive agent)
	ORRecip   uint8 = 9  // to-recipient, に-dative
	ORPart    uint8 = 10 // of, の-partitive/genitive
	ORCompare uint8 = 11 // than, より (standard of comparison)
)

// ExtractResult is the output of pattern extraction from a token sequence.
//
// Set vs Discourse: Set holds the root clause's role-set; Discourse holds all
// clauses (root + subordinate/coord). For single-clause sentences, Discourse
// has exactly one element whose Set == ExtractResult.Set. Multi-clause
// sentences (clause coordination, conditional, relative clause) populate
// Discourse with multiple Clause entries.
type ExtractResult struct {
	Pattern []byte // encoded pattern (markers + slots)
	// Slots holds the content word for each slot. As returned by ExtractJA
	// it is each entry's Atom, flattened across every clause in Discourse,
	// not the raw surface token.
	Slots     []string
	Roles     []int32    // hist index for each slot (assigned by following marker); parallel to Slots
	DeepPat   []uint8    // canonical role sequence (sorted, normalized)
	Set       []SetEntry // root-clause role-set (== Discourse[0].Set when populated)
	Discourse []Clause   // all clauses; len 1 for single-clause inputs
}

// Clause is one complete predication within a Discourse.
//
// Single-clause sentences produce one Clause with Relation=ClauseRoot,
// Parent=-1, HostIdx=-1. Multi-clause sentences add more Clauses with
// Relation/Parent/HostIdx specifying how each subordinate or peer clause
// relates to its anchor.
type Clause struct {
	Set      []SetEntry     // role-set of this clause (modifier nesting etc. live inside)
	Relation ClauseRelation // how this clause relates to its Parent
	Parent   int16          // index of parent clause in ExtractResult.Discourse; -1 for root
	HostIdx  int16          // for REL: index of modified entry in parent's Set; -1 otherwise
}

// ClauseRelation enumerates inter-clause relations in a Discourse.
//
// Asymmetric relations (IF, BECAUSE, REL) point from the subordinate clause
// to its parent. Peer relations (AND, OR, BUT) point from the second clause
// to the first; commutativity is implicit at the semantic level.
type ClauseRelation uint8

// Clause relation values. ClauseRoot is the zero value, so a zero Clause
// has Relation == ClauseRoot.
const (
	ClauseRoot    ClauseRelation = 0 // root clause; no parent
	ClauseAnd     ClauseRelation = 1 // X and Y - peer
	ClauseOr      ClauseRelation = 2 // X or Y - peer
	ClauseBut     ClauseRelation = 3 // X but Y - peer with contrast
	ClauseIf      ClauseRelation = 4 // if X (then parent) - condition
	ClauseBecause ClauseRelation = 5 // because X (then parent) - cause
	ClauseRel     ClauseRelation = 6 // relative clause modifying parent.Set[HostIdx]
)

// ExtractJA takes JA tokens (already split on particles) and produces
// the structural pattern + content slots.
func ExtractJA(tokens []string) ExtractResult { var pat []byte var slots []string var roles []int32 var slotMarkers []uint8 var slotMorphs []uint16 var slotOblRoles []uint8 var slotHeads []int16 var slotModKinds []uint8 pendingRole := HistVerb pendingHead := int16(-1) pendingModKind := uint8(MKNone) pendingCoordHeadJA := int16(-1) // Multi-clause accumulator for JA. Comma 、 signals clause boundary. var clausesJA []Clause clauseRelJA := ClauseRoot clauseParentJA := int16(-1) // Skip-tokens index for the の-relational-noun-に locative compound: // when the pattern is detected at の, we consume the next two tokens // (the relational noun + に) and apply ORLoc to the preceding base noun. skipUntilJA := -1 for i, tok := range tokens { if i <= skipUntilJA { continue } // もし at clause start signals a conditional clause (ClauseIf). // Consume it and mark the current clause's relation. if len(slots) == 0 && tok == "\xe3\x82\x82\xe3\x81\x97" { clauseRelJA = ClauseIf continue } // JA comma 、 (E3 80 81) signals a clause boundary. Finalize the // current clause, reset per-slot state, mark next clause as ClauseAnd. if tok == "\xe3\x80\x81" { if len(slots) > 0 { lastIdx := len(slots) - 1 // Apply same predicate-shape detection as end-of-input does: // copula strip (学生だ → student MKCop) then predicate-i-adj // (面白い → MKAdj) then fall back to last-slot=HistVerb. 
appliedPred := false if slotModKinds[lastIdx] != MKCop { if stripped, ok, copMorph := stripJACopula(slots[lastIdx]); ok && len(stripped) > 0 { slots[lastIdx] = stripped slotMorphs[lastIdx] |= copMorph slotModKinds[lastIdx] = MKCop slotHeads[lastIdx] = findSubjectIdx(roles, slotHeads) roles[lastIdx] = HistComplement appliedPred = true } else if endsInIKana(slots[lastIdx]) && len(slots[lastIdx]) > 3 && !endsInNaiSuffix(slots[lastIdx]) && !endsInTaiSuffix(slots[lastIdx]) && slotHeads[lastIdx] < 0 && slotModKinds[lastIdx] == MKNone { slotModKinds[lastIdx] = MKAdj slotHeads[lastIdx] = findSubjectIdx(roles, slotHeads) roles[lastIdx] = HistComplement appliedPred = true } } if !appliedPred && len(roles) > 0 && slotModKinds[lastIdx] != MKCop { roles[lastIdx] = HistVerb } clauseSet := buildSetFromSlices( slots, roles, slotMorphs, slotMarkers, slotOblRoles, slotHeads, slotModKinds, ) nextParent := int16(len(clausesJA)) clausesJA = append(clausesJA, Clause{ Set: clauseSet, Relation: clauseRelJA, Parent: clauseParentJA, HostIdx: -1, }) slots = nil roles = nil slotMarkers = nil slotMorphs = nil slotOblRoles = nil slotHeads = nil slotModKinds = nil pendingRole = HistVerb pendingHead = -1 pendingModKind = MKNone pendingCoordHeadJA = -1 clauseRelJA = ClauseAnd clauseParentJA = nextParent - 1 } continue } mk, isMk := jaParticleToMarker()[tok] if isMk { pat = append(pat, mk) // Synthetic morph markers attach to PRECEDING slot's morph. switch mk { case MkDef: if len(slotMorphs) > 0 { slotMorphs[len(slotMorphs)-1] |= MetaDefDef } continue case MkPlural: if len(slotMorphs) > 0 { slotMorphs[len(slotMorphs)-1] |= MetaNumPlural } continue case Mk3Sg: if len(slotMorphs) > 0 { slotMorphs[len(slotMorphs)-1] |= Meta3Sg } continue case MkCopula: // Copula marker - reserved. continue case MkNo: // Locative-compound disambiguation: の followed by a relational // noun (中/上/下/前/後/横/隣/間/内/外) and then に collapses to // an ORLoc oblique on the base noun. 
箱の中に = "in the box"; // the 中 (inside) is implicit in ORLoc, and the prior 箱 takes // the locative role. Skip the next two tokens (relNoun + に). if i+2 < len(tokens) && len(slots) > 0 && isJARelationalNoun(tokens[i+1]) && tokens[i+2] == "\xe3\x81\xab" { lastIdx := len(slots) - 1 if lastIdx < len(slotOblRoles) { slotOblRoles[lastIdx] = jaRelationalNounToOblRole(tokens[i+1]) } if lastIdx < len(roles) { roles[lastIdx] = HistScope } if lastIdx < len(slotMarkers) { slotMarkers[lastIdx] = MkNi } skipUntilJA = i + 2 continue } // の: preceding slot is a POSS modifier of the next slot. if len(slots) > 0 { pendingHead = int16(len(slots) - 1) pendingModKind = MKPoss } if len(roles) > 0 { roles[len(roles)-1] = MarkerToRole(mk) if len(slotMarkers) == len(slots) { slotMarkers[len(slotMarkers)-1] = mk } } continue case MkDe: // で is ambiguous: locative/instrumental particle (家で本を読む // = "read book at home") or te-form of copula で joining two // copular clauses (学生で彼は先生だ = "[I'm a student] and // [he is a teacher]"). Disambiguator: if で is followed by // [noun][は/が], it's te-copula clause-coord. if i+2 < len(tokens) && len(slots) > 0 { next2 := tokens[i+2] if next2 == "\xe3\x81\xaf" || next2 == "\xe3\x81\x8c" { // Te-form copula: mark the preceding noun as MKCop // predicate of the current clause's subject, then // finalize the clause and start a new one. 
lastIdx := len(slots) - 1 slotModKinds[lastIdx] = MKCop slotHeads[lastIdx] = findSubjectIdx(roles, slotHeads) roles[lastIdx] = HistComplement clauseSet := buildSetFromSlices( slots, roles, slotMorphs, slotMarkers, slotOblRoles, slotHeads, slotModKinds, ) nextParent := int16(len(clausesJA)) clausesJA = append(clausesJA, Clause{ Set: clauseSet, Relation: clauseRelJA, Parent: clauseParentJA, HostIdx: -1, }) slots = nil roles = nil slotMarkers = nil slotMorphs = nil slotOblRoles = nil slotHeads = nil slotModKinds = nil pendingRole = HistVerb pendingHead = -1 pendingModKind = MKNone pendingCoordHeadJA = -1 clauseRelJA = ClauseAnd clauseParentJA = nextParent - 1 continue } } // Fall through to default for instrumental/locative で. case MkTo: // と is ambiguous: comitative (友達と) or coordination // (猫と犬). Coord heuristic: preceding slot has no Mark yet. // For chained coord (X と Y と Z), all peers point at the // FIRST conjunct, not the previous one - walk up the chain. if len(slots) > 0 && len(slotMarkers) == len(slots) && (slotMarkers[len(slotMarkers)-1] == 0 || slotMarkers[len(slotMarkers)-1] == MkTo) { prev := int16(len(slots) - 1) if slotModKinds[prev] == MKCoord { prev = slotHeads[prev] } pendingCoordHeadJA = prev if slotMarkers[len(slotMarkers)-1] == 0 { slotMarkers[len(slotMarkers)-1] = mk } continue } // Fall through to default marker handling for comitative. } if len(roles) > 0 { newRole := MarkerToRole(mk) roles[len(roles)-1] = newRole // Propagate the role backward through any coord chain so // the head conjunct gets the same role as the particle-marked // conjunct (猫と犬が = both subjects, marked via が on 犬). j := len(roles) - 1 for j > 0 && slotHeads[j] >= 0 && slotModKinds[j] == MKCoord { j = int32(slotHeads[j]) roles[j] = newRole } if len(slotMarkers) == len(slots) { // Keep the existing と Mark on the coord head; only update // non-coord-marker slots' Mark. 
if slotMarkers[len(slotMarkers)-1] != MkTo { slotMarkers[len(slotMarkers)-1] = mk } } if len(slotOblRoles) == len(slots) { if or := MarkerToOblRole(mk); or != ORNone { slotOblRoles[len(slotOblRoles)-1] = or } } } if i < len(tokens)-1 { pendingRole = HistVerb } } else { pat = append(pat, SlotNoun) slots = append(slots, tok) roles = append(roles, pendingRole) slotMarkers = append(slotMarkers, 0) slotMorphs = append(slotMorphs, 0) slotOblRoles = append(slotOblRoles, ORNone) slotHeads = append(slotHeads, -1) slotModKinds = append(slotModKinds, MKNone) newIdx := int16(len(slots) - 1) // Temporal-noun adverbial: 昨日/今日/明日/etc. - sentence-initial // adjuncts that surface as bare nouns but semantically modify the // clause's verb. Mark as MKAdv with head=-1; resolved at the // final-pass below (binds to the verb slot once it's identified). if isJATemporalNoun(tok) { roles[newIdx] = HistModifier slotModKinds[newIdx] = MKAdv slotHeads[newIdx] = -1 } // ば-ending token signals conditional clause: mark this clause // as ClauseIf. The lemmatizer strips ば from the verb separately. if len(tok) >= 3 { tb := []byte(tok) if tb[len(tb)-3] == 0xe3 && tb[len(tb)-2] == 0x81 && tb[len(tb)-1] == 0xb0 { clauseRelJA = ClauseIf } } // Coordination resolution: と connected this slot to the previous. if pendingCoordHeadJA >= 0 && pendingCoordHeadJA < newIdx { slotHeads[newIdx] = pendingCoordHeadJA slotModKinds[newIdx] = MKCoord roles[newIdx] = roles[pendingCoordHeadJA] pendingCoordHeadJA = -1 } // Resolve pending POSS modifier from a preceding の. if pendingHead >= 0 && pendingHead < newIdx { slotHeads[pendingHead] = newIdx slotModKinds[pendingHead] = pendingModKind pendingHead = -1 pendingModKind = MKNone } // ATTR detection: i-adjective immediately preceding this noun // (no particle between them - we'd have continued out via the // marker branch otherwise). 
Heuristic: previous slot's atom ends // in い with no intervening particle, and previous slot didn't // already get a modifier role from a particle. Known false // positives: な-adjectives ending in い (きれい), nouns ending // in い (兄). Logged limitation; not silently corrupted because // the comparison metric will catch any resulting drift. if newIdx >= 1 { prev := newIdx - 1 if slotHeads[prev] < 0 && slotMarkers[prev] == 0 { prevAtom := slots[prev] if endsInIKana(prevAtom) && !endsInNaiSuffix(prevAtom) && !endsInTaiSuffix(prevAtom) { slotHeads[prev] = newIdx slotModKinds[prev] = MKAttr } else if endsInKuKana(prevAtom) { slotHeads[prev] = newIdx slotModKinds[prev] = MKAdv } else if isJABareKanjiAdj(prevAtom) { slotHeads[prev] = newIdx slotModKinds[prev] = MKAttr } else if endsInTaKana(prevAtom) { // た-form REL: 食べた猫 (the cat that ate). A past-tense // verb immediately preceding a noun (no particle) is a // relative-clause predicate modifying the noun. slotHeads[prev] = newIdx slotModKinds[prev] = MKRel roles[prev] = HistModifier } } } pendingRole = HistVerb } } // Copula detection: if the last slot is a noun ending in だ/です/だった/でした, // it's a copular predicate, not a verb. Strip the copula suffix, mark the // slot with MKCop, point Head at the subject, and DO NOT apply the // last-slot-verb-override. copulaApplied := false if len(slots) > 0 { lastIdx := len(slots) - 1 if stripped, ok, copMorph := stripJACopula(slots[lastIdx]); ok { // Verify the stripped result isn't a verb-like stem. // (A verb past form like 食べた must keep た as past suffix, not // be treated as copula. The disambiguator: if stripping leaves // only hiragana that looks like a verb stem, skip copula.) // For now, accept any non-empty stripped result on the last slot. 
if len(stripped) > 0 { slots[lastIdx] = stripped slotMorphs[lastIdx] |= copMorph slotModKinds[lastIdx] = MKCop slotHeads[lastIdx] = findSubjectIdx(roles, slotHeads) roles[lastIdx] = HistComplement copulaApplied = true } } } // Locative-existence copula: if the final verb lemmatizes to いる/ある // and the clause has an ORLoc-marked slot, the いる is the existence // verb (be located). Promote the locative slot to MKCop (parallel to EN // "is in X" representation: [X SCOPE ORLoc MKCop h=subj]) and drop the // existence verb - its semantics is absorbed by the copular link. if !copulaApplied && len(slots) > 1 { lastIdx := len(slots) - 1 lastAtom := slots[lastIdx] // いる stem after lemmatization is い (ichidan); ある stem is あ. lem := LemmatizeJA(lastAtom, true) if (lem.Lemma == "\xe3\x81\x84" || lem.Lemma == "\xe3\x81\x82") && lem.Class == VClassIchidan { for i := 0; i < lastIdx; i++ { if i < len(slotOblRoles) && slotOblRoles[i] == ORLoc { slotModKinds[i] = MKCop slotHeads[i] = findSubjectIdx(roles, slotHeads) // Inherit both the verb's lemma morph AND any synthetic // morph markers attached to the verb slot (Meta3Sg via ◯). slotMorphs[i] |= lem.Morph | slotMorphs[lastIdx] // Drop the existence-verb slot. slots = slots[:lastIdx] roles = roles[:lastIdx] slotMarkers = slotMarkers[:lastIdx] slotMorphs = slotMorphs[:lastIdx] slotOblRoles = slotOblRoles[:lastIdx] slotHeads = slotHeads[:lastIdx] slotModKinds = slotModKinds[:lastIdx] copulaApplied = true break } } } } // Predicative i-adjective detection: when the last slot ends in い and the // copula stripper didn't fire, treat it as an adjectival predicate // (面白い / "is interesting"). Set ModKind=MKAdj, Head=subject, role= // Complement. The default last-slot=HistVerb override below is skipped. // False positives: な-adjectives ending in い (きれい), nouns ending in い // (兄). Accepted limitation, same profile as ATTR detection. 
predAdjApplied := false if !copulaApplied && len(slots) > 0 { lastIdx := len(slots) - 1 atom := slots[lastIdx] if slotHeads[lastIdx] < 0 && slotModKinds[lastIdx] == MKNone && endsInIKana(atom) && len(atom) > 3 && !endsInNaiSuffix(atom) && !endsInTaiSuffix(atom) { slotModKinds[lastIdx] = MKAdj slotHeads[lastIdx] = findSubjectIdx(roles, slotHeads) roles[lastIdx] = HistComplement // Comparative: if any earlier slot is ORCompare-marked (より), // the predicate adjective carries MetaCompare. for i := 0; i < lastIdx; i++ { if i < len(slotOblRoles) && slotOblRoles[i] == ORCompare { slotMorphs[lastIdx] |= MetaCompare break } } predAdjApplied = true } } if !copulaApplied && !predAdjApplied && len(roles) > 0 { roles[len(roles)-1] = HistVerb } // Passive/causative agent reinterpretation: に defaults to ORLoc (locative) // or ORGoal (motion goal) depending on verb semantics. When the final verb // carries MetaPassive, a に-marked slot is the agent ("by X") - flip its // OblRole to ORAgent and role to HistModifier so EN renders it as "by X". // MetaCausative reinterprets に-marked slot as the causer-agent similarly. // Pre-compute the verb morph by lemmatizing here; buildSetFromSlices below // will re-do the same lemmatization, but the cost is one extra strip on the // last slot - cheap compared to scanning every slot for に at render time. if !copulaApplied && !predAdjApplied && len(slots) > 0 { lastIdx := len(slots) - 1 verbLem := LemmatizeJA(slots[lastIdx], true) verbMorph := verbLem.Morph if lastIdx < len(slotMorphs) { verbMorph |= slotMorphs[lastIdx] } if verbMorph&(MetaPassive|MetaCausative) != 0 { for i := 0; i < len(slots); i++ { if i < len(slotMarkers) && slotMarkers[i] == MkNi { if i < len(slotOblRoles) { slotOblRoles[i] = ORAgent } if i < len(roles) { roles[i] = HistModifier } } } } else if isJADitransitive(verbLem.Lemma) { // Ditransitive: に-marked slot is the recipient (彼に本をあげる). 
// Flip from default ORGoal/ORLoc to ORRecip; role HistScope to // HistModifier for cross-language parity with EN ditransitive // extraction. for i := 0; i < len(slots); i++ { if i < len(slotMarkers) && slotMarkers[i] == MkNi { if i < len(slotOblRoles) { slotOblRoles[i] = ORRecip } if i < len(roles) { roles[i] = HistModifier } } } } } // Temporal-adverbial binding: any slot marked MKAdv with head=-1 from // the temporal-noun detection above gets bound to the clause's verb // (last slot if !copulaApplied && !predAdjApplied, otherwise no binding // since copular/adjectival clauses have no verb to modify). if !copulaApplied && !predAdjApplied && len(slots) > 0 { verbIdx := int16(len(slots) - 1) for i := 0; i < int32(verbIdx); i++ { if i < len(slotModKinds) && slotModKinds[i] == MKAdv && i < len(slotHeads) && slotHeads[i] < 0 { slotHeads[i] = verbIdx } } } // Modifier role propagation: ATTR modifiers copy their head noun's role // (EN gives "red" the same role as "car"); ADV modifiers get HistModifier. for i := 0; i < len(slots); i++ { if i < len(slotModKinds) && i < len(slotHeads) && slotHeads[i] >= 0 { h := int32(slotHeads[i]) switch slotModKinds[i] { case MKAttr: if h < len(roles) { roles[i] = roles[h] } case MKAdv: roles[i] = HistModifier } } } var set []SetEntry for i, word := range slots { role := HistComplement if i < len(roles) { role = roles[i] } // Modifier entries (Head>=0) skip verb lemmatization regardless of role- // override at sentence-final position. A POSS/ATTR modifier on the // last slot is still a modifier, not the clause's verb. Exception: // MKRel modifiers ARE verbs (relative-clause predicates) and must be // lemmatized to recover tense morph + verb class. 
isVerb := role == HistVerb head := int16(-1) modKind := uint8(MKNone) if i < len(slotHeads) { head = slotHeads[i] } if i < len(slotModKinds) { modKind = slotModKinds[i] } if modKind != MKNone && modKind != MKRel { isVerb = false } if modKind == MKRel { isVerb = true } lem := LemmatizeJA(word, isVerb) mark := uint8(0) extraMorph := uint16(0) obl := uint8(ORNone) if i < len(slotMarkers) { mark = slotMarkers[i] } if i < len(slotMorphs) { extraMorph = slotMorphs[i] } if i < len(slotOblRoles) { obl = slotOblRoles[i] } set = append(set, SetEntry{ Role: role, Atom: lem.Lemma, Morph: lem.Morph | extraMorph, Class: lem.Class, Mark: mark, OblRole: obl, Head: head, ModKind: modKind, }) } // Finalize the last (or only) clause. clausesJA = append(clausesJA, Clause{ Set: set, Relation: clauseRelJA, Parent: clauseParentJA, HostIdx: -1, }) // Flatten Slots/Roles across all clauses (see ExtractEN for rationale). flatSlotsJA := []string{:0:len(slots)} flatRolesJA := []int32{:0:len(slots)} for _, c := range clausesJA { for _, e := range c.Set { flatSlotsJA = append(flatSlotsJA, e.Atom) flatRolesJA = append(flatRolesJA, e.Role) } } return ExtractResult{ Pattern: pat, Slots: flatSlotsJA, Roles: flatRolesJA, DeepPat: buildDeepPat(flatRolesJA), Set: clausesJA[0].Set, Discourse: clausesJA, } } // findSubjectIdx returns the index of the entry that should serve as the // subject of a copular predicate: the first entry whose role is Topic or // Subject and that is not itself a modifier (Head=-1). // Returns -1 if no candidate found. func findSubjectIdx(roles []int32, heads []int16) int16 { for i, r := range roles { if i >= len(heads) { break } if heads[i] >= 0 { continue // modifier, skip } if r == HistTopic || r == HistSubject { return int16(i) } } return -1 } // buildSetFromSlices converts parallel per-slot slices into a []SetEntry. // Used by ExtractEN both at clause boundaries and end-of-input. 
func buildSetFromSlices( slots []string, roles []int32, slotMorphs []uint16, slotMarks, slotOblRoles []uint8, slotHeads []int16, slotModKinds []uint8, ) []SetEntry { // Role propagation before set construction: ATTR copies head's role, // ADV gets HistModifier. Mutates roles[] in place (caller's slice). for i := 0; i < len(slots); i++ { if i < len(slotModKinds) && i < len(slotHeads) && slotHeads[i] >= 0 { h := int32(slotHeads[i]) switch slotModKinds[i] { case MKAttr: if h < len(roles) { roles[i] = roles[h] } case MKAdv: roles[i] = HistModifier } } } var set []SetEntry for i, lemma := range slots { role := HistComplement if i < len(roles) { role = roles[i] } m := uint16(0) if i < len(slotMorphs) { m = slotMorphs[i] } mark := uint8(0) if i < len(slotMarks) { mark = slotMarks[i] } obl := uint8(ORNone) if i < len(slotOblRoles) { obl = slotOblRoles[i] } head := int16(-1) modKind := uint8(MKNone) if i < len(slotHeads) { head = slotHeads[i] } if i < len(slotModKinds) { modKind = slotModKinds[i] } set = append(set, SetEntry{ Role: role, Atom: lemma, Morph: m, Class: 0, Mark: mark, OblRole: obl, Head: head, ModKind: modKind, }) } return set } // hasPredication returns true when the current per-slot state contains a // completed predication (a real verb slot or a copular complement). // Used to disambiguate "and" between NP coordination and clause coordination. func hasPredication(roles []int32, slotModKinds []uint8) bool { for i, r := range roles { if r == HistVerb { return true } if i < len(slotModKinds) && slotModKinds[i] == MKCop { return true } } return false } // ExtractEN takes EN tokens and produces pattern + slots. // Handles determiners (the/a) as morph hints, prepositions as role markers, // verb auxiliaries as fillers of the verb slot, and pronouns as content nouns. 
func ExtractEN(tokens []string) ExtractResult { var pat []byte var slots []string var roles []int32 var slotMorphs []uint16 var slotMarks []uint8 var slotOblRoles []uint8 var slotHeads []int16 var slotModKinds []uint8 sawVerb := false contentCount := 0 pendingRole := HistSubject pendingDef := false pendingNeg := false pendingMark := uint8(0) pendingOblRole := uint8(ORNone) pendingHead := int16(-1) pendingModKind := uint8(MKNone) pendingCop := false pendingCopHead := int16(-1) pendingCopMorph := uint16(0) // Track whether the staged copula aux is a form of "be" (vs do/have/will). // Only be-aux + past-participle yields passive voice; do/will + bare-V // is do-support or modal, not passive. pendingCopAuxIsBe := false // Volitional state: "let X V" / "let's V" means the V slot is volitional. pendingVol := false // Coordination state: when "and"/"or" is seen between content words, the // next content word becomes a MKCoord peer of the most recent content. pendingCoordHead := int16(-1) // REL state: when "that/which/who" follows a noun, the next emitted verb // becomes a MKRel modifier of that noun (intransitive REL only - flat-Set // representation; transitive REL with its own subject needs sub-clauses). pendingRelHead := int16(-1) // Ditransitive state: when a verb in the enDitransitive set is emitted, // the FIRST bare-NP (no preceding preposition) is the candidate recipient. // It only commits to ORRecip when a SECOND object follows; otherwise the // single object is a plain patient (read a book, write a letter, etc.). pendingRecipCand := false pendingRecipIdx := int16(-1) // Causative state: when "make"/"let" appears as the first verb and a // bare-V follows after an NP (made me wait, let him go), the auxiliary // is not emitted. Instead MetaCausative is staged for the embedded verb // emission. The intervening NP is the causee, emitted as plain object. 
pendingCausative := false pendingCausativeMorph := uint16(0) // resetPending: single reset point for ALL pending-state variables above. // Every clause-boundary site (comma/semicolon split, "and"/"or"/"but" // clause-coord, subordinator boundary, second-verb-no-comma boundary) // calls this. Adding a new pending var requires exactly one new line // here - never twelve scattered reset blocks to keep in sync. resetPending := func() { pendingRole = HistSubject pendingDef = false pendingNeg = false pendingMark = 0 pendingOblRole = ORNone pendingHead = -1 pendingModKind = MKNone pendingCop = false pendingCopHead = -1 pendingCopMorph = 0 pendingCopAuxIsBe = false pendingVol = false pendingCoordHead = -1 pendingRelHead = -1 pendingRecipCand = false pendingRecipIdx = -1 pendingCausative = false pendingCausativeMorph = 0 } _ = resetPending // Multi-clause discourse accumulator. clauseRel/clauseParent track how // the CURRENT (being-built) clause relates to its parent in clauses[]. var clauses []Clause clauseRel := ClauseRoot clauseParent := int16(-1) for tokIdx, tok := range tokens { // Clause-boundary token from tokenizeEN punctuation classification // (synthetic 、 emitted for ,/;/:/./?/!/—/…/♫). Finalize the current // clause as a ClauseAnd peer, reset per-slot state, continue. 
if tok == "\xe3\x80\x81" { if len(slots) > 0 { clauseSet := buildSetFromSlices( slots, roles, slotMorphs, slotMarks, slotOblRoles, slotHeads, slotModKinds, ) clauseRelLocal := clauseRel if clauseRelLocal == ClauseRoot && len(clauses) > 0 { clauseRelLocal = ClauseAnd } nextParent := int16(len(clauses)) clauses = append(clauses, Clause{ Set: clauseSet, Relation: clauseRelLocal, Parent: clauseParent, HostIdx: -1, }) slots = nil roles = nil slotMorphs = nil slotMarks = nil slotOblRoles = nil slotHeads = nil slotModKinds = nil sawVerb = false contentCount = 0 resetPending() clauseRel = ClauseAnd clauseParent = nextParent - 1 } continue } low := toLowerEN(tok) mk, isMk := enWordToMarker()[low] if isMk { pat = append(pat, mk) switch enMarkerClass(mk) { case enMarkDeterminer: // Clause boundary detection: in a subordinate clause that // has a completed verb, a new determiner starts the main // clause. "if it rains the cat runs" - "the" begins main. if sawVerb && (clauseRel == ClauseIf || clauseRel == ClauseBecause) { clauseSet := buildSetFromSlices( slots, roles, slotMorphs, slotMarks, slotOblRoles, slotHeads, slotModKinds, ) nextParent := int16(len(clauses)) clauses = append(clauses, Clause{ Set: clauseSet, Relation: clauseRel, Parent: nextParent, HostIdx: -1, }) slots = nil roles = nil slotMorphs = nil slotMarks = nil slotOblRoles = nil slotHeads = nil slotModKinds = nil sawVerb = false contentCount = 0 resetPending() clauseRel = ClauseRoot clauseParent = -1 } if mk == MkThe { pendingDef = true } // "a" leaves pendingDef=false (indefinite) case enMarkNegation: pendingNeg = true case enMarkPronoun: // Pronouns are content nouns; emit as slot. 
contentCount++ role := pendingRole if !sawVerb { role = HistSubject } else { role = pendingRole } pat[len(pat)-1] = SlotNoun pronAtom := tok plem := LemmatizeEN(toLowerEN(tok)) if plem.Lemma != "" { pronAtom = plem.Lemma } slots = append(slots, pronAtom) roles = append(roles, role) m := uint16(0) if pendingDef { m |= MetaDefDef } slotMorphs = append(slotMorphs, m) slotMarks = append(slotMarks, pendingMark) slotOblRoles = append(slotOblRoles, ORNone) slotHeads = append(slotHeads, -1) slotModKinds = append(slotModKinds, MKNone) newPronIdx := int16(len(slots) - 1) // Ditransitive recipient (two-noun pattern, mirrors the noun- // emit branch). Pronouns are common recipients (give him X). if pendingRecipCand { if pendingRecipIdx < 0 { if pendingMark == 0 && pendingOblRole == ORNone { pendingRecipIdx = newPronIdx } else { pendingRecipCand = false } } else { slotOblRoles[pendingRecipIdx] = ORRecip roles[pendingRecipIdx] = HistModifier pendingRecipCand = false pendingRecipIdx = -1 } } pendingMark = 0 pendingDef = false case enMarkPossDet: // Possessive determiner: emit as POSS modifier of the next noun. // The next slot emission resolves the Head pointer. pat[len(pat)-1] = SlotNoun slots = append(slots, possDetSurface(mk)) roles = append(roles, HistOperator) // flattened role for the modifier slotMorphs = append(slotMorphs, 0) slotMarks = append(slotMarks, mk) slotOblRoles = append(slotOblRoles, ORNone) slotHeads = append(slotHeads, -1) slotModKinds = append(slotModKinds, MKNone) pendingHead = int16(len(slots) - 1) pendingModKind = MKPoss case enMarkVerbAux: // Verb-auxiliaries (be/is/are/was/were/am/do/etc) are not // content verbs. For copular constructions ("he is a student"), // the next noun becomes the copular predicate, attached via // MKCop to the most recent subject. No verb slot is emitted. // // If we've already seen a real content verb (e.g. "is eating"), // the aux just carries tense/aspect onto that verb. 
if sawVerb && len(slotMorphs) > 0 { lem := LemmatizeEN(low) slotMorphs[len(slotMorphs)-1] |= lem.Morph } else { // No real verb. Stage copula info for the next noun. subj := findSubjectIdx(roles, slotHeads) if subj >= 0 { pendingCop = true pendingCopHead = subj lem := LemmatizeEN(low) pendingCopMorph = lem.Morph pendingCopAuxIsBe = lem.Lemma == "be" if pendingNeg { pendingCopMorph |= MetaPolarNeg pendingNeg = false } } pendingRole = HistComplement } case enMarkPreposition: pendingRole = MarkerToRole(mk) pendingMark = mk pendingOblRole = MarkerToOblRole(mk) case enMarkConjunction: // "and"/"or" is either NP-coord (between nouns at same position) // or CLAUSE-coord (between two complete predications). // Disambiguator: if a predication has been completed // (sawVerb || any MKCop), this is clause-coord. if hasPredication(roles, slotModKinds) { // CLAUSE-COORD: finalize the current clause, reset // per-slot state, mark next clause as ClauseAnd peer. clauseSet := buildSetFromSlices( slots, roles, slotMorphs, slotMarks, slotOblRoles, slotHeads, slotModKinds, ) clauseRelLocal := clauseRel clauseParentLocal := clauseParent if clauseRelLocal == ClauseRoot && len(clauses) > 0 { // Shouldn't happen but guard clauseRelLocal = ClauseAnd } nextParent := int16(len(clauses)) clauses = append(clauses, Clause{ Set: clauseSet, Relation: clauseRelLocal, Parent: clauseParentLocal, HostIdx: -1, }) // Reset per-slot state. slots = nil roles = nil slotMorphs = nil slotMarks = nil slotOblRoles = nil slotHeads = nil slotModKinds = nil sawVerb = false contentCount = 0 resetPending() // Next clause inherits AND relation, parent points at the // previous root clause (or its index). if mk == MkBut { clauseRel = ClauseBut } else { clauseRel = ClauseAnd } clauseParent = nextParent - 1 continue } // NP-COORD: existing behavior. 
if len(slots) > 0 { prev := int16(len(slots) - 1) if int32(prev) < len(slotModKinds) && slotModKinds[prev] == MKCoord { prev = slotHeads[prev] } pendingCoordHead = prev } else { pendingRole = HistSubject sawVerb = false contentCount = 0 } case enMarkRelative: // "that/which/who" after a noun: capture the noun's index so // the next verb is emitted as a MKRel modifier of that noun. // Intransitive REL only - if a subject follows ("that I saw"), // the verb attaches to that subject and pendingRelHead does // not fire. Transitive REL needs the sub-clause path; not yet. if len(slots) > 0 && roles[len(roles)-1] == HistSubject { pendingRelHead = int16(len(slots) - 1) } } continue } if isENPunct(tok) { continue } // "let" / "let's" as a volitional auxiliary: marks the subsequent verb // as MetaMoodVol. "let" only fires at clause start (else it's the verb // meaning "allow/permit"). "let's" is unambiguously volitional and // fires anywhere; the renderer may emit "let's" after a subject like // "we let's go" so we can't gate on contentCount==0. if !sawVerb && low == "let's" { pendingVol = true continue } if !sawVerb && contentCount == 0 && low == "let" { pendingVol = true continue } // Subordinating conjunctions at clause start: "if" / "because" mark // the upcoming clause as subordinate (ClauseIf / ClauseBecause). // Don't emit a slot for the conjunction; consume into clauseRel. if !sawVerb && contentCount == 0 && len(slots) == 0 { switch low { case "if": clauseRel = ClauseIf continue case "because": clauseRel = ClauseBecause continue } } // Adverb detection: -ly suffix or hardcoded list. Adverbs modify // the verb in the clause. Emit as a slot with ModKind=MKAdv. Head // resolves at verb-emission time (forward) or via the last-emitted // verb slot (backward). // // When pendingCop is set, predicate-adjective check takes precedence // (fast is both an adverb and a predicate-adj; "is fast" wants the // adj reading, "runs fast" wants the adverb reading). 
isAdv := looksLikeAdverb(low) if pendingCop && looksLikePredicateAdj(low) { isAdv = false } if isAdv { contentCount++ pat = append(pat, SlotNoun) slots = append(slots, low) roles = append(roles, HistModifier) slotMorphs = append(slotMorphs, 0) slotMarks = append(slotMarks, 0) slotOblRoles = append(slotOblRoles, ORNone) advIdx := int16(len(slots) - 1) // Find the verb to bind to. Backward: most-recent verb slot. // Forward: stage pendingAdvHead, resolved at next verb emission. boundHead := int16(-1) for i := int32(advIdx) - 1; i >= 0; i-- { if roles[i] == HistVerb { boundHead = int16(i) break } } if boundHead >= 0 { slotHeads = append(slotHeads, boundHead) slotModKinds = append(slotModKinds, MKAdv) } else { // No prior verb; wait for the next verb emission. slotHeads = append(slotHeads, -1) slotModKinds = append(slotModKinds, MKAdv) } continue } // Content word. contentCount++ isVerb := looksLikeVerb(low) // Second-verb-no-comma clause boundary: if we've already seen a // verb in this clause and the current word is also a verb (with no // preposition or conjunction between), the current clause ends and // a new clause begins. Common after subordinators: "if it rains // stay home" - "rains" verb-of-condition, "stay" verb-of-main. // Suppress when pendingCausative is staged - the upcoming verb is // the embedded action of an analytic causative, not a new clause. if sawVerb && isVerb && pendingMark == 0 && pendingOblRole == ORNone && !pendingCausative { clauseSet := buildSetFromSlices( slots, roles, slotMorphs, slotMarks, slotOblRoles, slotHeads, slotModKinds, ) nextParent := int16(len(clauses)) // Condition/cause clauses come BEFORE the main clause in surface // order; Parent points forward to the next root clause (its index // will be len(clauses), the index this new clause is about to take). 
parent := clauseParent if clauseRel == ClauseIf || clauseRel == ClauseBecause { parent = nextParent } clauses = append(clauses, Clause{ Set: clauseSet, Relation: clauseRel, Parent: parent, HostIdx: -1, }) slots = nil roles = nil slotMorphs = nil slotMarks = nil slotOblRoles = nil slotHeads = nil slotModKinds = nil sawVerb = false contentCount = 1 // we're about to emit this verb resetPending() // Main clause after subordinator: ClauseRoot clauseRel = ClauseRoot clauseParent = -1 } // ATTR detection: pattern is [current content][next content][verb or end]. // If current and next are both content non-verbs, and the position // after next is a verb (or end-of-clause), then current modifies next. // Examples: "red car runs" - red is ATTR of car (runs is the verb). isAttr := false if !isVerb && tokIdx+1 < len(tokens) { next := toLowerEN(tokens[tokIdx+1]) if _, nextIsMk := enWordToMarker()[next]; !nextIsMk && !isENPunct(tokens[tokIdx+1]) && !looksLikeVerb(next) { // Two-token check: position after next must be verb or end. if tokIdx+2 >= len(tokens) { isAttr = true } else { afterNext := toLowerEN(tokens[tokIdx+2]) if _, anIsMk := enWordToMarker()[afterNext]; anIsMk { // Marker (preposition/etc) after the noun phrase - NP ends here. isAttr = true } else if looksLikeVerb(afterNext) { isAttr = true } } } } // Sister check: if previous slot was ATTR-tagged at extract time, or a // pendingHead is awaiting this slot as its head, or a copular predicate // is pending, the current word is the predicate / noun, not a verb. 
prevWasAttrOrPending := pendingHead >= 0 || pendingCop || pendingCoordHead >= 0 if len(slotModKinds) > 0 && slotModKinds[len(slotModKinds)-1] == MKAttr { prevWasAttrOrPending = true } if !isVerb && !sawVerb && contentCount == 2 && pendingRole == HistObject { // pendingRole was set to Object by a preposition; not a verb position } else if !isVerb && !sawVerb && contentCount == 2 && !isAttr && !prevWasAttrOrPending { isVerb = true } // Predicative adjective: when a copula is staged ("is/are/was/were") // and the current word looks like a predicate adjective (deverbal -ing, // -ful, -ous, -ic, -able, -ible) or its lemma does (bigger→big), emit // as HistComplement with MKAdj pointing at the subject. The copula's // morph (tense, 3sg) merges with the lemma's morph (e.g. MetaCompare // for comparatives) into the adjective's morph; no separate verb slot. if pendingCop { // Surface-form check first (interesting, big, etc.) - atom stays // as the surface. Only when surface miss AND lemma-form matches // (comparatives: bigger→big) do we use the lemma+lem.Morph. useLemma := false var lem LemmaResult if !looksLikePredicateAdj(low) { lem = LemmatizeEN(low) if looksLikePredicateAdj(lem.Lemma) { useLemma = true } } if looksLikePredicateAdj(low) || useLemma { pat = append(pat, SlotNoun) atom := low m := pendingCopMorph if useLemma { atom = lem.Lemma m |= lem.Morph } slots = append(slots, atom) roles = append(roles, HistComplement) if pendingNeg { m |= MetaPolarNeg pendingNeg = false } slotMorphs = append(slotMorphs, m) slotMarks = append(slotMarks, 0) slotOblRoles = append(slotOblRoles, ORNone) slotHeads = append(slotHeads, pendingCopHead) slotModKinds = append(slotModKinds, MKAdj) pendingCop = false pendingCopHead = -1 pendingCopMorph = 0 pendingCopAuxIsBe = false sawVerb = true continue } } // REL-intransitive: if a relative pronoun staged pendingRelHead and // the current word is a verb, emit it as a MKRel modifier of the // host noun. 
Do NOT set sawVerb so the next real verb still becomes // the clause's main predicate. if pendingRelHead >= 0 && isVerb { pat = append(pat, SlotVerb) lem := LemmatizeEN(low) slots = append(slots, lem.Lemma) roles = append(roles, HistModifier) m := lem.Morph if m&MetaNumPlural != 0 { m = (m &^ MetaNumPlural) | Meta3Sg } if pendingNeg { m |= MetaPolarNeg pendingNeg = false } slotMorphs = append(slotMorphs, m) slotMarks = append(slotMarks, 0) slotOblRoles = append(slotOblRoles, ORNone) slotHeads = append(slotHeads, pendingRelHead) slotModKinds = append(slotModKinds, MKRel) pendingRelHead = -1 continue } // Causative: emit the embedded verb with MetaCausative. The aux // (make/let) was suppressed at its position; its tense morph rides // on the embedded verb. if pendingCausative && isVerb { pat = append(pat, SlotVerb) lem := LemmatizeEN(low) slots = append(slots, lem.Lemma) roles = append(roles, HistVerb) m := lem.Morph | MetaCausative | pendingCausativeMorph if m&MetaNumPlural != 0 { m = (m &^ MetaNumPlural) | Meta3Sg } if pendingNeg { m |= MetaPolarNeg pendingNeg = false } slotMorphs = append(slotMorphs, m) slotMarks = append(slotMarks, 0) slotOblRoles = append(slotOblRoles, ORNone) slotHeads = append(slotHeads, -1) slotModKinds = append(slotModKinds, MKNone) pendingCausative = false pendingCausativeMorph = 0 continue } if !sawVerb && isVerb { lem := LemmatizeEN(low) // Causative aux detection: "make"/"let" + (NP) + bare-V. Suppress // the aux emission, stage MetaCausative for the next verb. 
if (lem.Lemma == "make" || lem.Lemma == "let") && causativeBareVFollows(tokens, tokIdx) { pendingCausative = true pendingCausativeMorph = lem.Morph sawVerb = true pendingRole = HistObject continue } pat = append(pat, SlotVerb) slots = append(slots, lem.Lemma) roles = append(roles, HistVerb) m := lem.Morph if m&MetaNumPlural != 0 { m = (m &^ MetaNumPlural) | Meta3Sg } if pendingNeg { m |= MetaPolarNeg pendingNeg = false } if pendingVol { m |= MetaMoodVol pendingVol = false } // If a verb-aux had staged copula info but the next content turns // out to be a real verb, the aux is an auxiliary helper, not a // copula. The grammar of the verb form determines which: // "is/was + V-ing" -> progressive aspect on V // "is/was + V-ed/en" (past participle, not progressive) -> passive // LemmatizeEN sets MetaAspectProg for -ing forms, MetaTensePast // for -ed/irregulars. Aspect bit distinguishes prog from passive. if pendingCop { m |= pendingCopMorph // Passive voice requires a form of "be" as auxiliary plus // past-participle form on the embedded verb. "did + V" is // do-support (do-support + bare V = emphatic/negative/ // question), not passive. Only be-aux + past tense (with // no progressive) yields passive. if pendingCopAuxIsBe && m&MetaAspectProg == 0 && m&MetaTensePast != 0 { m |= MetaPassive // The aux's tense ("is"=non-past, "was"=past) overrides // the participle's "past" reading - "was bitten" is past // passive, "is bitten" is non-past passive. if pendingCopMorph&MetaTensePast == 0 { m &^= MetaTensePast } } pendingCop = false pendingCopHead = -1 pendingCopMorph = 0 pendingCopAuxIsBe = false } slotMorphs = append(slotMorphs, m) slotMarks = append(slotMarks, 0) slotOblRoles = append(slotOblRoles, ORNone) slotHeads = append(slotHeads, -1) slotModKinds = append(slotModKinds, MKNone) verbIdx := int16(len(slots) - 1) // Resolve any pre-verb adverbs that were waiting for a verb head. 
for i := 0; i < int32(verbIdx); i++ { if slotModKinds[i] == MKAdv && slotHeads[i] < 0 { slotHeads[i] = verbIdx } } // Ditransitive: if this verb takes a bare-NP recipient before the // patient ("give X Y"), flag the next-noun-with-no-preposition as // a recipient candidate. Commits to ORRecip only if a second // object follows ("give him a book"); single-object uses keep // the noun as plain patient ("read a book"). if isEnDitransitive(lem.Lemma) { pendingRecipCand = true pendingRecipIdx = -1 } sawVerb = true pendingRole = HistObject } else { pat = append(pat, SlotNoun) lem := LemmatizeEN(low) slots = append(slots, lem.Lemma) role := pendingRole if !sawVerb && !pendingCop { role = HistSubject } roles = append(roles, role) m := lem.Morph if pendingDef { m |= MetaDefDef pendingDef = false } slotMorphs = append(slotMorphs, m) slotMarks = append(slotMarks, pendingMark) slotOblRoles = append(slotOblRoles, pendingOblRole) slotHeads = append(slotHeads, -1) slotModKinds = append(slotModKinds, MKNone) newIdx := int16(len(slots) - 1) // Ditransitive recipient (two-noun pattern): the first bare-NP // after a ditransitive verb is staged as a candidate; the second // noun's arrival promotes the first to ORRecip + HistModifier. // A preposition on the first noun cancels (existing prep path // handles "give book to him"). Single-object uses leave the // candidate uncommitted so it stays a plain object. if pendingRecipCand { if pendingRecipIdx < 0 { // This is the first noun. Stage as candidate unless a // preposition fired. if pendingMark == 0 && pendingOblRole == ORNone { pendingRecipIdx = newIdx } else { pendingRecipCand = false } } else { // Second noun arrives - promote the candidate. slotOblRoles[pendingRecipIdx] = ORRecip roles[pendingRecipIdx] = HistModifier pendingRecipCand = false pendingRecipIdx = -1 } } // Coordination resolution: if "and"/"or" set pendingCoordHead, // this noun is a peer conjunct of that slot. Inherit its role. 
if pendingCoordHead >= 0 && pendingCoordHead < newIdx { slotHeads[newIdx] = pendingCoordHead slotModKinds[newIdx] = MKCoord roles[newIdx] = roles[pendingCoordHead] pendingCoordHead = -1 } // Copula resolution: if a verb-aux staged copula state, this noun // is the predicate. Bind Head=subject, ModKind=MKCop, merge morph. if pendingCop { slotHeads[newIdx] = pendingCopHead slotModKinds[newIdx] = MKCop slotMorphs[newIdx] |= pendingCopMorph pendingCop = false pendingCopHead = -1 pendingCopMorph = 0 pendingCopAuxIsBe = false } // Resolve a pending POSS/ATTR modifier from a preceding determiner // or ATTR-detected adjective. if pendingHead >= 0 && pendingHead < newIdx { slotHeads[pendingHead] = newIdx slotModKinds[pendingHead] = pendingModKind // Transfer pending def/morph from modifier to head (the // determiner applied to the noun phrase, whose head is this). if pendingModKind == MKAttr { // Move MetaDefDef and other phrase-level morph from // modifier to head if present. if slotMorphs[pendingHead]&MetaDefDef != 0 { slotMorphs[newIdx] |= MetaDefDef slotMorphs[pendingHead] &^= MetaDefDef } // The head takes the clause role; modifier loses it. if roles[pendingHead] == HistSubject { roles[newIdx] = HistSubject } } pendingHead = -1 pendingModKind = MKNone } // ATTR detection (forward-looking): if the lookahead identified // the current slot as an ATTR modifier of the next noun, set up // pendingHead so the next slot emission resolves it. if isAttr { pendingHead = newIdx pendingModKind = MKAttr } pendingMark = 0 pendingOblRole = ORNone if sawVerb && pendingRole == HistObject { pendingRole = HistComplement } } } // Finalize the last clause. finalSet := buildSetFromSlices( slots, roles, slotMorphs, slotMarks, slotOblRoles, slotHeads, slotModKinds, ) clauses = append(clauses, Clause{ Set: finalSet, Relation: clauseRel, Parent: clauseParent, HostIdx: -1, }) // Flatten Slots/Roles across all clauses so the atom-link layer sees // every word. 
With punct-aware tokenization, clauses are finalized // mid-input and the per-clause `slots` array gets reset; only the final // clause would otherwise be visible in ExtractResult.Slots. flatSlots := []string{:0:len(slots)} flatRoles := []int32{:0:len(slots)} for _, c := range clauses { for _, e := range c.Set { flatSlots = append(flatSlots, e.Atom) flatRoles = append(flatRoles, e.Role) } } return ExtractResult{ Pattern: pat, Slots: flatSlots, Roles: flatRoles, DeepPat: buildDeepPat(flatRoles), Set: clauses[0].Set, Discourse: clauses, } } // ExtractCode takes code tokens and produces pattern + slots. // Structural keywords become markers. Identifiers/literals become slots. func ExtractCode(tokens []string) ExtractResult { var pat []byte var slots []string var roles []int32 for _, tok := range tokens { mk := codeTokenToMarker(tok) if mk != 0 { pat = append(pat, mk) } else { pat = append(pat, SlotNoun) slots = append(slots, tok) roles = append(roles, HistComplement) } } return ExtractResult{Pattern: pat, Slots: slots, Roles: roles, DeepPat: buildDeepPat(roles)} } func codeTokenToMarker(tok string) uint8 { switch tok { case "if": return MkIf case "else": return MkElse case "for", "range": return MkFor_C case "return": return MkReturn case "{": return MkLBrace case "}": return MkRBrace case "(": return MkLParen case ")": return MkRParen case "=", ":=": return MkAssign case ".": return MkDot case ",": return MkComma case ":": return MkColon case "->", "<-": return MkArrow case "case": return MkCase case "select": return MkSelect case "spawn": return MkSpawn case "chan": return MkChan } return 0 } // EN marker functional classes. const ( enMarkUnknown = 0 enMarkDeterminer = 1 // the, a, an enMarkPossDet = 2 // my, your, his, her, its, our, their (POSS modifiers) enMarkVerbAux = 3 // is, are, was, do, have... enMarkPreposition = 4 // in, on, at, with, by... 
enMarkNegation = 5 // not, n't enMarkConjunction = 6 // and, but, or enMarkRelative = 7 // that, which, who enMarkPronoun = 8 // i, you, he, she, it, we, they (subject pronouns) ) func enMarkerClass(mk uint8) int32 { switch mk { case MkThe, MkA: return enMarkDeterminer case MkMy, MkYour, MkHis, MkHerP, MkIts, MkOurP, MkTheirP: return enMarkPossDet case MkIs, MkAre, MkWas, MkDo: return enMarkVerbAux case MkIn, MkOn, MkAt, MkWith, MkBy, MkFor, MkTo_EN, MkOf, MkFrom, MkAs, MkThan: return enMarkPreposition case MkNot: return enMarkNegation case MkAnd, MkBut: return enMarkConjunction case MkThat: return enMarkRelative case MkI, MkYou, MkPron3, MkIt_EN, MkWe_EN, MkThey_: return enMarkPronoun } return enMarkUnknown } // looksLikeAdverb returns true if w is likely an adverb: -ly suffix or in // a hardcoded list of common irregular adverbs that aren't morphologically // derivable. func looksLikeAdverb(w string) bool { if len(w) > 3 && hasSuffix(w, "ly") { return true } switch w { case "fast", "well", "hard", "here", "there", "now", "then", "today", "yesterday", "tomorrow", "always", "often", "never", "sometimes", "usually", "rarely", "still", "already", "yet", "soon", "later", "early", "late", "ever", "again": return true } return false } // causativeBareVFollows returns true if tokens after tokIdx contain a bare // infinitive verb (no intervening "to") within the next NP-shaped lookahead // window. Used to detect "make/let + NP + bare-V" causative pattern. // NP shape: at most 3 tokens (det + adj + noun, or possessive + noun, or // single pronoun) before the bare verb. 
func causativeBareVFollows(tokens []string, tokIdx int32) bool { for k := 1; k <= 4 && tokIdx+k < len(tokens); k++ { t := toLowerEN(tokens[tokIdx+k]) if t == "to" { return false // "to V" infinitival, not bare-V causative } if mk, isMk := enWordToMarker()[t]; isMk { switch enMarkerClass(mk) { case enMarkPreposition, enMarkConjunction: return false } continue // determiner, pronoun, possessive - part of the NP } if looksLikeVerb(t) { return true } } return false } // isEnDitransitive returns true for verbs that take a bare-NP recipient // before the patient object: "give X Y" = "give Y to X". When such a verb // is emitted and the next noun has no preceding preposition, that noun is // the recipient (HistModifier + ORRecip), not a second direct object. // Closed set; verbs that always take prepositional dative (e.g., "explain // X to Y") are excluded. func isEnDitransitive(lemma string) bool { switch lemma { case "give", "send", "tell", "show", "offer", "hand", "pass", "teach", "write", "read", "sell", "buy", "bring", "mail", "lend", "owe", "pay", "throw", "hand", "ask": return true } return false } // looksLikePredicateAdj returns true for words that, when following a copula // (is/are/was/were), are predicative adjectives rather than verbs or nouns. // Used to disambiguate "is interesting" (predicate-adj) from "is V-ing" // (progressive verb) or "is X" (copular noun). // // Detection: three layers // 1. Common short adjective whitelist (big, small, fast, hungry, ...) // 2. Deverbal -ing predicate adjectives (interesting, exciting, ...) // 3. Adjective-shape suffixes (-ful, -ous, -ic, -able, -ible) // // Conservative; false negatives fall through to MKCop (noun-complement) which // preserves semantics but loses the JA-side MKAdj-parity for round-trip. func looksLikePredicateAdj(w string) bool { if len(w) < 2 { return false } // Common short adjective whitelist - parity with JA i-adj predicates. 
switch w { case "big", "small", "tall", "short", "long", "wide", "narrow", "thick", "thin", "deep", "shallow", "high", "low", "hot", "cold", "warm", "cool", "wet", "dry", "fast", "slow", "quick", "old", "new", "young", "good", "bad", "nice", "fine", "great", "poor", "happy", "sad", "angry", "tired", "hungry", "thirsty", "sleepy", "busy", "lazy", "easy", "hard", "soft", "loud", "quiet", "clean", "dirty", "empty", "full", "rich", "weak", "strong", "smart", "kind", "mean", "red", "blue", "green", "yellow", "white", "black", "pink", "brown", "gray", "grey", "purple", "orange", "heavy", "light", "free", "cheap", "expensive", "safe", "sick", "well", "ill", "ready", "right", "wrong", "true", "false", "real", "fake", "open", "closed", "bright", "dark", "sweet", "sour", "salty", "bitter", "round", "square", "flat", "sharp", "dull", "strange", "weird", "normal", "common", "rare", "important", "famous", "popular", "different", "similar", "alive", "dead", "alone", "together": return true } if len(w) < 4 { return false } // Deverbal -ing predicate adjectives. switch w { case "interesting", "exciting", "boring", "tiring", "amazing", "frightening", "surprising", "confusing", "disappointing", "satisfying", "encouraging", "pleasing", "fascinating", "depressing", "embarrassing", "shocking", "thrilling", "charming", "annoying", "relaxing", "stunning", "missing", "willing", "outstanding", "promising": return true } // Adjective-shape suffixes. if hasSuffix(w, "ful") || hasSuffix(w, "ous") || hasSuffix(w, "ic") || hasSuffix(w, "able") || hasSuffix(w, "ible") { return true } return false } // possDetSurface returns the surface form of an EN possessive determiner // for round-trip rendering. 
func possDetSurface(mk uint8) string { switch mk { case MkMy: return "my" case MkYour: return "your" case MkHis: return "his" case MkHerP: return "her" case MkIts: return "its" case MkOurP: return "our" case MkTheirP: return "their" } return "" } func toLowerEN(s string) string { b := []byte(s) for i, c := range b { if c >= 'A' && c <= 'Z' { b[i] = c + 32 } } return string(b) } func isENPunct(s string) bool { if len(s) != 1 { return false } c := s[0] return c == '.' || c == ',' || c == '!' || c == '?' || c == ';' || c == ':' || c == '"' || c == '\'' } // looksLikeVerb is a heuristic for EN verb detection. // Uses common verb endings and a high-frequency set. // Min-length guards on suffix detection prevent short-word false positives // (red/fed/led/bed all end in -ed; sing/king/ring all end in -ing). func looksLikeVerb(w string) bool { if len(w) < 2 { return false } if (hasSuffix(w, "ing") && len(w) > 4) || (hasSuffix(w, "ed") && len(w) > 3) || hasSuffix(w, "ize") || hasSuffix(w, "ise") || hasSuffix(w, "ate") { return true } if hasSuffix(w, "fy") || (hasSuffix(w, "en") && len(w) > 3) { return true } switch w { case "go", "went", "gone", "goes", "eat", "eats", "drink", "drinks", "read", "reads", "write", "writes", "walk", "walks", "talk", "talks", "sleep", "sleeps", "wake", "wakes", "sit", "sits", "stand", "stands", "lie", "lies", "live", "lives", "die", "dies", "chew", "chews", "fly", "flies", "swim", "swims", "jump", "jumps", "throw", "throws", "catch", "catches", "kick", "kicks", "hit", "hits", "push", "pushes", "pull", "pulls", "grab", "grabs", "bite", "bites", "chase", "chases", "get", "got", "gotten", "gets", "make", "made", "makes", "take", "took", "taken", "takes", "come", "came", "comes", "see", "saw", "seen", "sees", "know", "knew", "known", "knows", "give", "gave", "given", "gives", "say", "said", "says", "tell", "told", "tells", "think", "thought", "thinks", "find", "found", "finds", "leave", "left", "leaves", "call", "calls", "ask", "asks", "seem", 
"seems", "feel", "felt", "feels", "become", "became", "becomes", "keep", "kept", "keeps", "begin", "began", "begun", "begins", "show", "shows", "hear", "heard", "hears", "play", "plays", "move", "moves", "live", "lives", "believe", "believes", "hold", "held", "holds", "bring", "brought", "brings", "happen", "happens", "write", "wrote", "writes", "provide", "provides", "sit", "sat", "sits", "stand", "stood", "stands", "lose", "lost", "loses", "pay", "paid", "pays", "meet", "met", "meets", "include", "includes", "continue", "continues", "learn", "learns", "change", "changes", "lead", "led", "leads", "understand", "understood", "watch", "watches", "follow", "follows", "stop", "stops", "create", "creates", "speak", "spoke", "speaks", "read", "reads", "allow", "allows", "add", "adds", "spend", "spent", "spends", "grow", "grew", "grows", "open", "opens", "walk", "walks", "win", "won", "wins", "teach", "taught", "offer", "offers", "remember", "remembers", "love", "loves", "consider", "considers", "appear", "appears", "buy", "bought", "buys", "wait", "waits", "serve", "serves", "die", "died", "dies", "send", "sent", "sends", "expect", "expects", "build", "built", "builds", "stay", "stays", "fall", "fell", "falls", "cut", "cuts", "reach", "reaches", "kill", "kills", "remain", "remains", "suggest", "suggests", "raise", "raises", "pass", "passes", "sell", "sold", "sells", "require", "requires", "report", "reports", "decide", "decides", "pull", "pulls", "develop", "develops", "use", "uses", "put", "puts", "set", "sets", "run", "runs", "let", "lets", "try", "tries", "need", "needs", "want", "wants", "start", "starts", "help", "helps", "turn", "turns", "work", "works", "like", "likes", "look", "looks", "mean", "means", "meant", "can", "could", "will", "would", "shall", "should", "may", "might", "must": return true } return false } func hasSuffix(s, suffix string) bool { if len(s) < len(suffix) { return false } return s[len(s)-len(suffix):] == suffix } // buildDeepPat creates a 
canonical (sorted, normalized) role sequence from roles. func buildDeepPat(roles []int32) []uint8 { if len(roles) == 0 { return nil } dp := []uint8{:len(roles):len(roles)} for i, r := range roles { nr := r if nr == HistTopic { nr = HistSubject } dp[i] = uint8(nr) } // Insertion sort (patterns are short, 3-8 elements). for i := 1; i < len(dp); i++ { key := dp[i] j := i - 1 for j >= 0 && dp[j] > key { dp[j+1] = dp[j] j-- } dp[j+1] = key } return dp }