extract_ko.mx raw
1 package iskra
2
3 // Korean (KO) extractor.
4 //
5 // Korean is the JA-close cousin in the typological matrix: SOV, agglutinative,
6 // particles, topic-comment, pro-drop. The extractor reuses the JA marker-ID
7 // space because the structural roles are cognate - KO 은/는 plays the same
8 // role as JA は (HistTopic), KO 이/가 is JA が (HistSubject), etc.
9 //
10 // Key surface differences from JA:
11 // 1. KO uses spaces between words (easier tokenization than JA)
12 // 2. KO particles are attached to the preceding word as one orthographic
13 // token: 태초에 = 태초+에. The tokenizer splits these.
14 // 3. KO has phonetically conditioned particle allomorphs:
15 // 이/가 (SUBJ): 가 after vowel-final, 이 after consonant-final
16 // 은/는 (TOPIC): 는 after vowel-final, 은 after consonant-final
17 // 을/를 (OBJ): 를 after vowel-final, 을 after consonant-final
18 // The marker table maps both allomorphs to the same JA-cognate marker.
19 //
20 // Initial version handles the core role-marking system. Predicate-shape
21 // detection (copula, predicate-adj), relative clauses, conditional clauses,
22 // and clause coordination will be added as the round-trip metric demands.
23
24 // koParticleToMarker maps a KO particle surface form to the JA-cognate
25 // marker ID. Both allomorphs of each particle point at the same marker
26 // because they encode the same semantic role.
27 func koParticleToMarker() map[string]uint8 {
28 return map[string]uint8{
29 // SUBJ
30 "\xec\x9d\xb4": MkGa, // 이 (consonant-final)
31 "\xea\xb0\x80": MkGa, // 가 (vowel-final)
32 // TOPIC
33 "\xec\x9d\x80": MkWa, // 은
34 "\xeb\x8a\x94": MkWa, // 는
35 // OBJ
36 "\xec\x9d\x84": MkWo, // 을
37 "\xeb\xa5\xbc": MkWo, // 를
38 // LOC/GOAL (≈ JA に)
39 "\xec\x97\x90": MkNi, // 에
40 // LOC-action / FROM (≈ JA で / から - using で here, source captured by 부터)
41 "\xec\x97\x90\xec\x84\x9c": MkDe, // 에서
42 // POSSESSIVE (≈ JA の)
43 "\xec\x9d\x98": MkNo, // 의
44 // AND/WITH (≈ JA と)
45 "\xec\x99\x80": MkTo, // 와 (vowel-final)
46 "\xea\xb3\xbc": MkTo, // 과 (consonant-final)
47 // INSTRUMENTAL/DIRECTION (≈ JA で)
48 "\xeb\xa1\x9c": MkDe, // 로
49 "\xec\x9c\xbc\xeb\xa1\x9c": MkDe, // 으로
50 // SOURCE (≈ JA から)
51 "\xeb\xb6\x80\xed\x84\xb0": MkKara, // 부터
52 // UNTIL (≈ JA まで)
53 "\xea\xb9\x8c\xec\xa7\x80": MkMade, // 까지
54 // ALSO/INCLUSIVE (≈ JA も)
55 "\xeb\x8f\x84": MkMo, // 도
56 }
57 }
58
// koParticleTails lists every particle that may end an orthographic word,
// ordered longest-first so that a greedy suffix match prefers 에서 to 에
// and 으로 to 로. Each entry must be byte-identical to a key of
// koParticleToMarker.
func koParticleTails() []string {
	// Two-syllable particles: 6 bytes of UTF-8 each, tried first.
	tails := []string{
		"\xec\x97\x90\xec\x84\x9c", // 에서
		"\xec\x9c\xbc\xeb\xa1\x9c", // 으로
		"\xeb\xb6\x80\xed\x84\xb0", // 부터
		"\xea\xb9\x8c\xec\xa7\x80", // 까지
	}

	// Single-syllable particles: 3 bytes of UTF-8 each.
	tails = append(tails,
		"\xec\x9d\x98", // 의
		"\xec\x9d\xb4", // 이
		"\xea\xb0\x80", // 가
		"\xec\x9d\x80", // 은
		"\xeb\x8a\x94", // 는
		"\xec\x9d\x84", // 을
		"\xeb\xa5\xbc", // 를
		"\xec\x97\x90", // 에
		"\xec\x99\x80", // 와
		"\xea\xb3\xbc", // 과
		"\xeb\xa1\x9c", // 로
		"\xeb\x8f\x84", // 도
	)
	return tails
}
83
84 // ExtractKO takes pre-tokenized Korean tokens (space-split, then particle-
85 // split via tokenizeKO) and produces the same ExtractResult shape as
86 // ExtractJA: Pattern, Slots, Roles, Discourse.
87 //
88 // Structural recipe (same as ExtractJA in spirit):
89 // - non-particle content token → emit slot; pendingRole defaults to Verb,
90 // overridden retroactively by the next particle's MarkerToRole
91 // - particle token → MarkerToRole assigns role to preceding slot;
92 // MarkerToOblRole assigns oblique role
93 // - last slot becomes HistVerb if no copula/adjective predication detected
94 func ExtractKO(tokens []string) ExtractResult {
95 var pat []byte
96 var slots []string
97 var roles []int32
98 var slotMarkers []uint8
99 var slotMorphs []uint16
100 var slotOblRoles []uint8
101 var slotHeads []int16
102 var slotModKinds []uint8
103 pendingRole := HistVerb
104 pendingHead := int16(-1)
105 pendingModKind := uint8(MKNone)
106 var clausesKO []Clause
107 clauseRelKO := ClauseRoot
108 clauseParentKO := int16(-1)
109 _ = pendingHead
110 _ = pendingModKind
111
112 for _, tok := range tokens {
113 // Clause-boundary token (synthetic 、 from tokenizer punctuation).
114 if tok == "\xe3\x80\x81" {
115 if len(slots) > 0 {
116 if len(roles) > 0 && slotModKinds[len(roles)-1] != MKCop {
117 roles[len(roles)-1] = HistVerb
118 }
119 clauseSet := buildSetFromSlices(
120 slots, roles, slotMorphs,
121 slotMarkers, slotOblRoles, slotHeads, slotModKinds,
122 )
123 nextParent := int16(len(clausesKO))
124 clausesKO = append(clausesKO, Clause{
125 Set: clauseSet, Relation: clauseRelKO,
126 Parent: clauseParentKO, HostIdx: -1,
127 })
128 slots = nil
129 roles = nil
130 slotMarkers = nil
131 slotMorphs = nil
132 slotOblRoles = nil
133 slotHeads = nil
134 slotModKinds = nil
135 pendingRole = HistVerb
136 clauseRelKO = ClauseAnd
137 clauseParentKO = nextParent - 1
138 }
139 continue
140 }
141
142 mk, isMk := koParticleToMarker()[tok]
143 if isMk {
144 pat = append(pat, mk)
145 if len(roles) > 0 {
146 newRole := MarkerToRole(mk)
147 roles[len(roles)-1] = newRole
148 if len(slotMarkers) == len(slots) {
149 slotMarkers[len(slotMarkers)-1] = mk
150 }
151 if len(slotOblRoles) == len(slots) {
152 if or := MarkerToOblRole(mk); or != ORNone {
153 slotOblRoles[len(slotOblRoles)-1] = or
154 }
155 }
156 }
157 pendingRole = HistVerb
158 continue
159 }
160
161 // Content token: emit as slot.
162 pat = append(pat, SlotNoun)
163 slots = append(slots, tok)
164 roles = append(roles, pendingRole)
165 slotMarkers = append(slotMarkers, 0)
166 slotMorphs = append(slotMorphs, 0)
167 slotOblRoles = append(slotOblRoles, ORNone)
168 slotHeads = append(slotHeads, -1)
169 slotModKinds = append(slotModKinds, MKNone)
170 pendingRole = HistVerb
171 }
172
173 // Final-slot verb override (no predicate-shape detection yet).
174 if len(roles) > 0 {
175 roles[len(roles)-1] = HistVerb
176 }
177
178 set := buildSetFromSlices(
179 slots, roles, slotMorphs,
180 slotMarkers, slotOblRoles, slotHeads, slotModKinds,
181 )
182 clausesKO = append(clausesKO, Clause{
183 Set: set, Relation: clauseRelKO,
184 Parent: clauseParentKO, HostIdx: -1,
185 })
186
187 // Flatten Slots/Roles across all clauses (same as ExtractJA).
188 flatSlots := []string{:0:len(slots)}
189 flatRoles := []int32{:0:len(slots)}
190 for _, c := range clausesKO {
191 for _, e := range c.Set {
192 flatSlots = append(flatSlots, e.Atom)
193 flatRoles = append(flatRoles, e.Role)
194 }
195 }
196
197 return ExtractResult{
198 Pattern: pat, Slots: flatSlots, Roles: flatRoles,
199 DeepPat: buildDeepPat(flatRoles), Set: clausesKO[0].Set,
200 Discourse: clausesKO,
201 }
202 }
203