package iskra

// Korean (KO) extractor.
//
// Korean is the JA-close cousin in the typological matrix: SOV, agglutinative,
// particles, topic-comment, pro-drop. The extractor reuses the JA marker-ID
// space because the structural roles are cognate - KO 은/는 plays the same
// role as JA は (HistTopic), KO 이/가 is JA が (HistSubject), etc.
//
// Key surface differences from JA:
//  1. KO uses spaces between words (easier tokenization than JA)
//  2. KO particles are attached to the preceding word as one orthographic
//     token: 태초에 = 태초+에. The tokenizer splits these.
//  3. KO has phonetically conditioned particle allomorphs:
//     이/가 (SUBJ): 가 after vowel-final, 이 after consonant-final
//     은/는 (TOPIC): 는 after vowel-final, 은 after consonant-final
//     을/를 (OBJ): 를 after vowel-final, 을 after consonant-final
//     The marker table maps both allomorphs to the same JA-cognate marker.
//
// Initial version handles the core role-marking system. Predicate-shape
// detection (copula, predicate-adj), relative clauses, conditional clauses,
// and clause coordination will be added as the round-trip metric demands.

// koParticleToMarker maps a KO particle surface form to the JA-cognate
// marker ID. Both allomorphs of each particle point at the same marker
// because they encode the same semantic role.
func koParticleToMarker() map[string]uint8 { return map[string]uint8{ // SUBJ "\xec\x9d\xb4": MkGa, // 이 (consonant-final) "\xea\xb0\x80": MkGa, // 가 (vowel-final) // TOPIC "\xec\x9d\x80": MkWa, // 은 "\xeb\x8a\x94": MkWa, // 는 // OBJ "\xec\x9d\x84": MkWo, // 을 "\xeb\xa5\xbc": MkWo, // 를 // LOC/GOAL (≈ JA に) "\xec\x97\x90": MkNi, // 에 // LOC-action / FROM (≈ JA で / から - using で here, source captured by 부터) "\xec\x97\x90\xec\x84\x9c": MkDe, // 에서 // POSSESSIVE (≈ JA の) "\xec\x9d\x98": MkNo, // 의 // AND/WITH (≈ JA と) "\xec\x99\x80": MkTo, // 와 (vowel-final) "\xea\xb3\xbc": MkTo, // 과 (consonant-final) // INSTRUMENTAL/DIRECTION (≈ JA で) "\xeb\xa1\x9c": MkDe, // 로 "\xec\x9c\xbc\xeb\xa1\x9c": MkDe, // 으로 // SOURCE (≈ JA から) "\xeb\xb6\x80\xed\x84\xb0": MkKara, // 부터 // UNTIL (≈ JA まで) "\xea\xb9\x8c\xec\xa7\x80": MkMade, // 까지 // ALSO/INCLUSIVE (≈ JA も) "\xeb\x8f\x84": MkMo, // 도 } } // koParticleTails returns the particles that can appear as a SUFFIX of an // orthographic word. Listed longest-first so the tokenizer matches greedily // (에서 wins over 에, 으로 wins over 로, etc.). The byte sequences must // match koParticleToMarker keys exactly. func koParticleTails() []string { return []string{ "\xec\x97\x90\xec\x84\x9c", // 에서 (6 bytes) "\xec\x9c\xbc\xeb\xa1\x9c", // 으로 (6 bytes) "\xeb\xb6\x80\xed\x84\xb0", // 부터 (6 bytes) "\xea\xb9\x8c\xec\xa7\x80", // 까지 (6 bytes) "\xec\x9d\x98", // 의 (3 bytes) "\xec\x9d\xb4", // 이 "\xea\xb0\x80", // 가 "\xec\x9d\x80", // 은 "\xeb\x8a\x94", // 는 "\xec\x9d\x84", // 을 "\xeb\xa5\xbc", // 를 "\xec\x97\x90", // 에 "\xec\x99\x80", // 와 "\xea\xb3\xbc", // 과 "\xeb\xa1\x9c", // 로 "\xeb\x8f\x84", // 도 } } // ExtractKO takes pre-tokenized Korean tokens (space-split, then particle- // split via tokenizeKO) and produces the same ExtractResult shape as // ExtractJA: Pattern, Slots, Roles, Discourse. 
// // Structural recipe (same as ExtractJA in spirit): // - non-particle content token → emit slot; pendingRole defaults to Verb, // overridden retroactively by the next particle's MarkerToRole // - particle token → MarkerToRole assigns role to preceding slot; // MarkerToOblRole assigns oblique role // - last slot becomes HistVerb if no copula/adjective predication detected func ExtractKO(tokens []string) ExtractResult { var pat []byte var slots []string var roles []int32 var slotMarkers []uint8 var slotMorphs []uint16 var slotOblRoles []uint8 var slotHeads []int16 var slotModKinds []uint8 pendingRole := HistVerb pendingHead := int16(-1) pendingModKind := uint8(MKNone) var clausesKO []Clause clauseRelKO := ClauseRoot clauseParentKO := int16(-1) _ = pendingHead _ = pendingModKind for _, tok := range tokens { // Clause-boundary token (synthetic 、 from tokenizer punctuation). if tok == "\xe3\x80\x81" { if len(slots) > 0 { if len(roles) > 0 && slotModKinds[len(roles)-1] != MKCop { roles[len(roles)-1] = HistVerb } clauseSet := buildSetFromSlices( slots, roles, slotMorphs, slotMarkers, slotOblRoles, slotHeads, slotModKinds, ) nextParent := int16(len(clausesKO)) clausesKO = append(clausesKO, Clause{ Set: clauseSet, Relation: clauseRelKO, Parent: clauseParentKO, HostIdx: -1, }) slots = nil roles = nil slotMarkers = nil slotMorphs = nil slotOblRoles = nil slotHeads = nil slotModKinds = nil pendingRole = HistVerb clauseRelKO = ClauseAnd clauseParentKO = nextParent - 1 } continue } mk, isMk := koParticleToMarker()[tok] if isMk { pat = append(pat, mk) if len(roles) > 0 { newRole := MarkerToRole(mk) roles[len(roles)-1] = newRole if len(slotMarkers) == len(slots) { slotMarkers[len(slotMarkers)-1] = mk } if len(slotOblRoles) == len(slots) { if or := MarkerToOblRole(mk); or != ORNone { slotOblRoles[len(slotOblRoles)-1] = or } } } pendingRole = HistVerb continue } // Content token: emit as slot. 
pat = append(pat, SlotNoun) slots = append(slots, tok) roles = append(roles, pendingRole) slotMarkers = append(slotMarkers, 0) slotMorphs = append(slotMorphs, 0) slotOblRoles = append(slotOblRoles, ORNone) slotHeads = append(slotHeads, -1) slotModKinds = append(slotModKinds, MKNone) pendingRole = HistVerb } // Final-slot verb override (no predicate-shape detection yet). if len(roles) > 0 { roles[len(roles)-1] = HistVerb } set := buildSetFromSlices( slots, roles, slotMorphs, slotMarkers, slotOblRoles, slotHeads, slotModKinds, ) clausesKO = append(clausesKO, Clause{ Set: set, Relation: clauseRelKO, Parent: clauseParentKO, HostIdx: -1, }) // Flatten Slots/Roles across all clauses (same as ExtractJA). flatSlots := []string{:0:len(slots)} flatRoles := []int32{:0:len(slots)} for _, c := range clausesKO { for _, e := range c.Set { flatSlots = append(flatSlots, e.Atom) flatRoles = append(flatRoles, e.Role) } } return ExtractResult{ Pattern: pat, Slots: flatSlots, Roles: flatRoles, DeepPat: buildDeepPat(flatRoles), Set: clausesKO[0].Set, Discourse: clausesKO, } }