extract_ko.mx raw

   1  package iskra
   2  
   3  // Korean (KO) extractor.
   4  //
   5  // Korean is the JA-close cousin in the typological matrix: SOV, agglutinative,
   6  // particles, topic-comment, pro-drop. The extractor reuses the JA marker-ID
   7  // space because the structural roles are cognate - KO 은/는 plays the same
   8  // role as JA は (HistTopic), KO 이/가 is JA が (HistSubject), etc.
   9  //
  10  // Key surface differences from JA:
  11  //   1. KO uses spaces between words (easier tokenization than JA)
  12  //   2. KO particles are attached to the preceding word as one orthographic
  13  //      token: 태초에 = 태초+에. The tokenizer splits these.
  14  //   3. KO has phonetically conditioned particle allomorphs:
  15  //      이/가 (SUBJ): 가 after vowel-final, 이 after consonant-final
  16  //      은/는 (TOPIC): 는 after vowel-final, 은 after consonant-final
  17  //      을/를 (OBJ):  를 after vowel-final, 을 after consonant-final
  18  //      The marker table maps both allomorphs to the same JA-cognate marker.
  19  //
  20  // Initial version handles the core role-marking system. Predicate-shape
  21  // detection (copula, predicate-adj), relative clauses, conditional clauses,
  22  // and clause coordination will be added as the round-trip metric demands.
  23  
  24  // koParticleToMarker maps a KO particle surface form to the JA-cognate
  25  // marker ID. Both allomorphs of each particle point at the same marker
  26  // because they encode the same semantic role.
  27  func koParticleToMarker() map[string]uint8 {
  28  	return map[string]uint8{
  29  	// SUBJ
  30  	"\xec\x9d\xb4": MkGa, // 이 (consonant-final)
  31  	"\xea\xb0\x80": MkGa, // 가 (vowel-final)
  32  	// TOPIC
  33  	"\xec\x9d\x80": MkWa, // 은
  34  	"\xeb\x8a\x94": MkWa, // 는
  35  	// OBJ
  36  	"\xec\x9d\x84": MkWo, // 을
  37  	"\xeb\xa5\xbc": MkWo, // 를
  38  	// LOC/GOAL (≈ JA に)
  39  	"\xec\x97\x90": MkNi, // 에
  40  	// LOC-action / FROM (≈ JA で / から - using で here, source captured by 부터)
  41  	"\xec\x97\x90\xec\x84\x9c": MkDe, // 에서
  42  	// POSSESSIVE (≈ JA の)
  43  	"\xec\x9d\x98": MkNo, // 의
  44  	// AND/WITH (≈ JA と)
  45  	"\xec\x99\x80": MkTo, // 와 (vowel-final)
  46  	"\xea\xb3\xbc": MkTo, // 과 (consonant-final)
  47  	// INSTRUMENTAL/DIRECTION (≈ JA で)
  48  	"\xeb\xa1\x9c":         MkDe, // 로
  49  	"\xec\x9c\xbc\xeb\xa1\x9c": MkDe, // 으로
  50  	// SOURCE (≈ JA から)
  51  	"\xeb\xb6\x80\xed\x84\xb0": MkKara, // 부터
  52  	// UNTIL (≈ JA まで)
  53  	"\xea\xb9\x8c\xec\xa7\x80": MkMade, // 까지
  54  	// ALSO/INCLUSIVE (≈ JA も)
  55  	"\xeb\x8f\x84": MkMo, // 도
  56  	}
  57  }
  58  
  59  // koParticleTails returns the particles that can appear as a SUFFIX of an
  60  // orthographic word. Listed longest-first so the tokenizer matches greedily
  61  // (에서 wins over 에, 으로 wins over 로, etc.). The byte sequences must
  62  // match koParticleToMarker keys exactly.
  63  func koParticleTails() []string {
  64  	return []string{
  65  	"\xec\x97\x90\xec\x84\x9c",         // 에서 (6 bytes)
  66  	"\xec\x9c\xbc\xeb\xa1\x9c",         // 으로 (6 bytes)
  67  	"\xeb\xb6\x80\xed\x84\xb0",         // 부터 (6 bytes)
  68  	"\xea\xb9\x8c\xec\xa7\x80",         // 까지 (6 bytes)
  69  	"\xec\x9d\x98",                     // 의 (3 bytes)
  70  	"\xec\x9d\xb4",                     // 이
  71  	"\xea\xb0\x80",                     // 가
  72  	"\xec\x9d\x80",                     // 은
  73  	"\xeb\x8a\x94",                     // 는
  74  	"\xec\x9d\x84",                     // 을
  75  	"\xeb\xa5\xbc",                     // 를
  76  	"\xec\x97\x90",                     // 에
  77  	"\xec\x99\x80",                     // 와
  78  	"\xea\xb3\xbc",                     // 과
  79  	"\xeb\xa1\x9c",                     // 로
  80  	"\xeb\x8f\x84",                     // 도
  81  	}
  82  }
  83  
  84  // ExtractKO takes pre-tokenized Korean tokens (space-split, then particle-
  85  // split via tokenizeKO) and produces the same ExtractResult shape as
  86  // ExtractJA: Pattern, Slots, Roles, Discourse.
  87  //
  88  // Structural recipe (same as ExtractJA in spirit):
  89  //   - non-particle content token → emit slot; pendingRole defaults to Verb,
  90  //     overridden retroactively by the next particle's MarkerToRole
  91  //   - particle token → MarkerToRole assigns role to preceding slot;
  92  //     MarkerToOblRole assigns oblique role
  93  //   - last slot becomes HistVerb if no copula/adjective predication detected
  94  func ExtractKO(tokens []string) ExtractResult {
  95  	var pat []byte
  96  	var slots []string
  97  	var roles []int32
  98  	var slotMarkers []uint8
  99  	var slotMorphs []uint16
 100  	var slotOblRoles []uint8
 101  	var slotHeads []int16
 102  	var slotModKinds []uint8
 103  	pendingRole := HistVerb
 104  	pendingHead := int16(-1)
 105  	pendingModKind := uint8(MKNone)
 106  	var clausesKO []Clause
 107  	clauseRelKO := ClauseRoot
 108  	clauseParentKO := int16(-1)
 109  	_ = pendingHead
 110  	_ = pendingModKind
 111  
 112  	for _, tok := range tokens {
 113  		// Clause-boundary token (synthetic 、 from tokenizer punctuation).
 114  		if tok == "\xe3\x80\x81" {
 115  			if len(slots) > 0 {
 116  				if len(roles) > 0 && slotModKinds[len(roles)-1] != MKCop {
 117  					roles[len(roles)-1] = HistVerb
 118  				}
 119  				clauseSet := buildSetFromSlices(
 120  					slots, roles, slotMorphs,
 121  					slotMarkers, slotOblRoles, slotHeads, slotModKinds,
 122  				)
 123  				nextParent := int16(len(clausesKO))
 124  				clausesKO = append(clausesKO, Clause{
 125  					Set: clauseSet, Relation: clauseRelKO,
 126  					Parent: clauseParentKO, HostIdx: -1,
 127  				})
 128  				slots = nil
 129  				roles = nil
 130  				slotMarkers = nil
 131  				slotMorphs = nil
 132  				slotOblRoles = nil
 133  				slotHeads = nil
 134  				slotModKinds = nil
 135  				pendingRole = HistVerb
 136  				clauseRelKO = ClauseAnd
 137  				clauseParentKO = nextParent - 1
 138  			}
 139  			continue
 140  		}
 141  
 142  		mk, isMk := koParticleToMarker()[tok]
 143  		if isMk {
 144  			pat = append(pat, mk)
 145  			if len(roles) > 0 {
 146  				newRole := MarkerToRole(mk)
 147  				roles[len(roles)-1] = newRole
 148  				if len(slotMarkers) == len(slots) {
 149  					slotMarkers[len(slotMarkers)-1] = mk
 150  				}
 151  				if len(slotOblRoles) == len(slots) {
 152  					if or := MarkerToOblRole(mk); or != ORNone {
 153  						slotOblRoles[len(slotOblRoles)-1] = or
 154  					}
 155  				}
 156  			}
 157  			pendingRole = HistVerb
 158  			continue
 159  		}
 160  
 161  		// Content token: emit as slot.
 162  		pat = append(pat, SlotNoun)
 163  		slots = append(slots, tok)
 164  		roles = append(roles, pendingRole)
 165  		slotMarkers = append(slotMarkers, 0)
 166  		slotMorphs = append(slotMorphs, 0)
 167  		slotOblRoles = append(slotOblRoles, ORNone)
 168  		slotHeads = append(slotHeads, -1)
 169  		slotModKinds = append(slotModKinds, MKNone)
 170  		pendingRole = HistVerb
 171  	}
 172  
 173  	// Final-slot verb override (no predicate-shape detection yet).
 174  	if len(roles) > 0 {
 175  		roles[len(roles)-1] = HistVerb
 176  	}
 177  
 178  	set := buildSetFromSlices(
 179  		slots, roles, slotMorphs,
 180  		slotMarkers, slotOblRoles, slotHeads, slotModKinds,
 181  	)
 182  	clausesKO = append(clausesKO, Clause{
 183  		Set: set, Relation: clauseRelKO,
 184  		Parent: clauseParentKO, HostIdx: -1,
 185  	})
 186  
 187  	// Flatten Slots/Roles across all clauses (same as ExtractJA).
 188  	flatSlots := []string{:0:len(slots)}
 189  	flatRoles := []int32{:0:len(slots)}
 190  	for _, c := range clausesKO {
 191  		for _, e := range c.Set {
 192  			flatSlots = append(flatSlots, e.Atom)
 193  			flatRoles = append(flatRoles, e.Role)
 194  		}
 195  	}
 196  
 197  	return ExtractResult{
 198  		Pattern: pat, Slots: flatSlots, Roles: flatRoles,
 199  		DeepPat: buildDeepPat(flatRoles), Set: clausesKO[0].Set,
 200  		Discourse: clausesKO,
 201  	}
 202  }
 203