package iskra

// Korean (KO) extractor.
//
// Korean is the JA-close cousin in the typological matrix: SOV, agglutinative,
// particles, topic-comment, pro-drop. The extractor reuses the JA marker-ID
// space because the structural roles are cognate - KO 은/는 plays the same
// role as JA は (HistTopic), KO 이/가 is JA が (HistSubject), etc.
//
// Key surface differences from JA:
//   1. KO uses spaces between words (easier tokenization than JA)
//   2. KO particles are attached to the preceding word as one orthographic
//      token: 태초에 = 태초+에. The tokenizer splits these.
//   3. KO has phonetically conditioned particle allomorphs:
//      이/가 (SUBJ): 가 after vowel-final, 이 after consonant-final
//      은/는 (TOPIC): 는 after vowel-final, 은 after consonant-final
//      을/를 (OBJ):  를 after vowel-final, 을 after consonant-final
//      The marker table maps both allomorphs to the same JA-cognate marker.
//
// Initial version handles the core role-marking system. Predicate-shape
// detection (copula, predicate-adj), relative clauses, conditional clauses,
// and clause coordination will be added as the round-trip metric demands.

// koParticleToMarker builds the lookup table from a Korean particle's
// surface form (raw UTF-8 bytes) to the JA-cognate marker ID. Phonologically
// conditioned allomorphs share one marker because they encode the same
// role. A fresh map is returned on every call, so callers that look up
// many tokens should call this once and reuse the result.
func koParticleToMarker() map[string]uint8 {
	groups := []struct {
		marker uint8
		forms  []string
	}{
		// SUBJ: 이 (after consonant-final), 가 (after vowel-final).
		{marker: MkGa, forms: []string{"\xec\x9d\xb4", "\xea\xb0\x80"}},
		// TOPIC: 은, 는.
		{marker: MkWa, forms: []string{"\xec\x9d\x80", "\xeb\x8a\x94"}},
		// OBJ: 을, 를.
		{marker: MkWo, forms: []string{"\xec\x9d\x84", "\xeb\xa5\xbc"}},
		// LOC/GOAL (≈ JA に): 에.
		{marker: MkNi, forms: []string{"\xec\x97\x90"}},
		// ≈ JA で: 에서 (location of action; the "from" reading is left to
		// 부터), plus instrumental/direction 로 and 으로.
		{marker: MkDe, forms: []string{
			"\xec\x97\x90\xec\x84\x9c",
			"\xeb\xa1\x9c",
			"\xec\x9c\xbc\xeb\xa1\x9c",
		}},
		// POSSESSIVE (≈ JA の): 의.
		{marker: MkNo, forms: []string{"\xec\x9d\x98"}},
		// AND/WITH (≈ JA と): 와 (after vowel-final), 과 (after consonant-final).
		{marker: MkTo, forms: []string{"\xec\x99\x80", "\xea\xb3\xbc"}},
		// SOURCE (≈ JA から): 부터.
		{marker: MkKara, forms: []string{"\xeb\xb6\x80\xed\x84\xb0"}},
		// UNTIL (≈ JA まで): 까지.
		{marker: MkMade, forms: []string{"\xea\xb9\x8c\xec\xa7\x80"}},
		// ALSO/INCLUSIVE (≈ JA も): 도.
		{marker: MkMo, forms: []string{"\xeb\x8f\x84"}},
	}

	table := make(map[string]uint8, 16)
	for _, g := range groups {
		for _, form := range g.forms {
			table[form] = g.marker
		}
	}
	return table
}

// koParticleTails lists every particle that may occur as the SUFFIX of an
// orthographic word, ordered longest-first so that a first-match scan is
// greedy (에서 is tried before 에, 으로 before 로). Each entry is the raw
// UTF-8 byte sequence and must stay identical to the corresponding key in
// koParticleToMarker.
func koParticleTails() []string {
	// Two-syllable particles, 6 bytes each: 에서, 으로, 부터, 까지.
	twoSyllable := []string{
		"\xec\x97\x90\xec\x84\x9c",
		"\xec\x9c\xbc\xeb\xa1\x9c",
		"\xeb\xb6\x80\xed\x84\xb0",
		"\xea\xb9\x8c\xec\xa7\x80",
	}
	// One-syllable particles, 3 bytes each:
	// 의, 이, 가, 은, 는, 을, 를, 에, 와, 과, 로, 도.
	oneSyllable := []string{
		"\xec\x9d\x98",
		"\xec\x9d\xb4",
		"\xea\xb0\x80",
		"\xec\x9d\x80",
		"\xeb\x8a\x94",
		"\xec\x9d\x84",
		"\xeb\xa5\xbc",
		"\xec\x97\x90",
		"\xec\x99\x80",
		"\xea\xb3\xbc",
		"\xeb\xa1\x9c",
		"\xeb\x8f\x84",
	}

	tails := make([]string, 0, len(twoSyllable)+len(oneSyllable))
	tails = append(tails, twoSyllable...)
	tails = append(tails, oneSyllable...)
	return tails
}

// ExtractKO takes pre-tokenized Korean tokens (space-split, then particle-
// split via tokenizeKO) and produces the same ExtractResult shape as
// ExtractJA: Pattern, Slots, Roles, Discourse.
//
// Structural recipe (same as ExtractJA in spirit):
//   - non-particle content token → emit slot; pendingRole defaults to Verb,
//     overridden retroactively by the next particle's MarkerToRole
//   - particle token → MarkerToRole assigns role to preceding slot;
//     MarkerToOblRole assigns oblique role
//   - last slot becomes HistVerb if no copula/adjective predication detected
func ExtractKO(tokens []string) ExtractResult {
	var pat []byte
	var slots []string
	var roles []int32
	var slotMarkers []uint8
	var slotMorphs []uint16
	var slotOblRoles []uint8
	var slotHeads []int16
	var slotModKinds []uint8
	pendingRole := HistVerb
	pendingHead := int16(-1)
	pendingModKind := uint8(MKNone)
	var clausesKO []Clause
	clauseRelKO := ClauseRoot
	clauseParentKO := int16(-1)
	_ = pendingHead
	_ = pendingModKind

	for _, tok := range tokens {
		// Clause-boundary token (synthetic 、 from tokenizer punctuation).
		if tok == "\xe3\x80\x81" {
			if len(slots) > 0 {
				if len(roles) > 0 && slotModKinds[len(roles)-1] != MKCop {
					roles[len(roles)-1] = HistVerb
				}
				clauseSet := buildSetFromSlices(
					slots, roles, slotMorphs,
					slotMarkers, slotOblRoles, slotHeads, slotModKinds,
				)
				nextParent := int16(len(clausesKO))
				clausesKO = append(clausesKO, Clause{
					Set: clauseSet, Relation: clauseRelKO,
					Parent: clauseParentKO, HostIdx: -1,
				})
				slots = nil
				roles = nil
				slotMarkers = nil
				slotMorphs = nil
				slotOblRoles = nil
				slotHeads = nil
				slotModKinds = nil
				pendingRole = HistVerb
				clauseRelKO = ClauseAnd
				clauseParentKO = nextParent - 1
			}
			continue
		}

		mk, isMk := koParticleToMarker()[tok]
		if isMk {
			pat = append(pat, mk)
			if len(roles) > 0 {
				newRole := MarkerToRole(mk)
				roles[len(roles)-1] = newRole
				if len(slotMarkers) == len(slots) {
					slotMarkers[len(slotMarkers)-1] = mk
				}
				if len(slotOblRoles) == len(slots) {
					if or := MarkerToOblRole(mk); or != ORNone {
						slotOblRoles[len(slotOblRoles)-1] = or
					}
				}
			}
			pendingRole = HistVerb
			continue
		}

		// Content token: emit as slot.
		pat = append(pat, SlotNoun)
		slots = append(slots, tok)
		roles = append(roles, pendingRole)
		slotMarkers = append(slotMarkers, 0)
		slotMorphs = append(slotMorphs, 0)
		slotOblRoles = append(slotOblRoles, ORNone)
		slotHeads = append(slotHeads, -1)
		slotModKinds = append(slotModKinds, MKNone)
		pendingRole = HistVerb
	}

	// Final-slot verb override (no predicate-shape detection yet).
	if len(roles) > 0 {
		roles[len(roles)-1] = HistVerb
	}

	set := buildSetFromSlices(
		slots, roles, slotMorphs,
		slotMarkers, slotOblRoles, slotHeads, slotModKinds,
	)
	clausesKO = append(clausesKO, Clause{
		Set: set, Relation: clauseRelKO,
		Parent: clauseParentKO, HostIdx: -1,
	})

	// Flatten Slots/Roles across all clauses (same as ExtractJA).
	flatSlots := []string{:0:len(slots)}
	flatRoles := []int32{:0:len(slots)}
	for _, c := range clausesKO {
		for _, e := range c.Set {
			flatSlots = append(flatSlots, e.Atom)
			flatRoles = append(flatRoles, e.Role)
		}
	}

	return ExtractResult{
		Pattern: pat, Slots: flatSlots, Roles: flatRoles,
		DeepPat: buildDeepPat(flatRoles), Set: clausesKO[0].Set,
		Discourse: clausesKO,
	}
}