package iskra // ClusterType classifies a phrase cluster by syntactic role. type ClusterType uint8 const ( ClusterNPSubj ClusterType = 0 ClusterNPObj ClusterType = 1 ClusterVP ClusterType = 2 ClusterPP ClusterType = 3 ClusterMod ClusterType = 4 ) // Cluster is a phrase unit produced by ParseClusters. type Cluster struct { Kind ClusterType Tokens []string // content tokens after boundary particles consumed Flags uint64 // accumulated semantic flags from tokens Role uint8 // particle role code (RoleNP*, RolePP*) Nested []*Cluster // relative clause VP, if present Trans string // target-language string, filled by TranslateCluster Copular bool // VP cluster with no finite verb = copular sentence } // ParticleDetector returns true if tok is a phrase-boundary particle in this domain. type ParticleDetector func(tok string) bool // RoleLookup maps (particle, npFlags) to a role code using the domain's disambiguation rules. type RoleLookup func(particle string, npFlags uint64) uint8 // VerbDetector returns true if tok is a finite verb in this domain. type VerbDetector func(tok string) bool // FlagAccumulator extracts and ORs semantic flags from a token slice. type FlagAccumulator func(tokens []string) uint64 // ContentFilter returns only content words from a token slice (strips function words). type ContentFilter func(tokens []string) []string // ClusterHooks groups the language-specific callbacks for ParseClusters. type ClusterHooks struct { IsParticle ParticleDetector LookupRole RoleLookup HasVerb VerbDetector AccumFlags FlagAccumulator FilterContent ContentFilter DefaultRole func(particle string) uint8 // fallback when LookupRole returns RoleNone } // KindFromRole maps a particle role code to a ClusterType. func KindFromRole(role uint8) ClusterType { switch role { case RoleNPSubjTopic, RoleNPSubjGram: return ClusterNPSubj case RoleNPObjDirect: return ClusterNPObj case RolePPLocative, RolePPTemporal, RoleNPDative, RolePPSource, RolePPLimit, RolePPComitative, RolePPLocStatic, RolePPInstrumental: return ClusterPP } return ClusterMod } // splitAtFirstVerb splits tokens at the first verb token: vpTokens ends at the verb // (inclusive), headTokens is everything after. func splitAtFirstVerb(tokens []string, isVerb VerbDetector) (vpTokens, headTokens []string) { for i, tok := range tokens { if isVerb(tok) { return tokens[:i+1], tokens[i+1:] } } return tokens, nil } // hasAnyVerb returns true if any token satisfies isVerb. func hasAnyVerb(tokens []string, isVerb VerbDetector) bool { for _, tok := range tokens { if isVerb(tok) { return true } } return false } // ParseClusters segments a pre-tokenized sequence into phrase clusters. // Particle-bounded mode (JA, KO): each particle closes the preceding accumulation. // hooks provides all language-specific detection and classification functions. // Must be called on already-tokenized input — compound forms must be pre-collapsed. func ParseClusters(tokens []string, hooks ClusterHooks) []*Cluster { var clusters []*Cluster var acc []string for _, tok := range tokens { if !hooks.IsParticle(tok) { acc = append(acc, tok) continue } if len(acc) == 0 { continue } var flags uint64 if hooks.AccumFlags != nil { flags = hooks.AccumFlags(acc) } role := hooks.LookupRole(tok, flags) if role == RoleNone && hooks.DefaultRole != nil { role = hooks.DefaultRole(tok) } c := &Cluster{ Kind: KindFromRole(role), Flags: flags, Role: role, } if hasAnyVerb(acc, hooks.HasVerb) { vp, head := splitAtFirstVerb(acc, hooks.HasVerb) c.Tokens = head c.Nested = ParseClusters(vp, hooks) } else { c.Tokens = acc } clusters = append(clusters, c) acc = nil } if len(acc) > 0 { var flags uint64 if hooks.AccumFlags != nil { flags = hooks.AccumFlags(acc) } clusters = append(clusters, &Cluster{ Kind: ClusterVP, Tokens: acc, Flags: flags, Copular: !hasAnyVerb(acc, hooks.HasVerb), }) } return clusters } // ReorderClusters rearranges clusters from source word order to destination word order. // PP and Mod clusters are appended after the SVO/SOV core in source order. func ReorderClusters(clusters []*Cluster, srcOrder, dstOrder uint8) []*Cluster { var subj, obj, vp *Cluster var pps, mods []*Cluster for _, c := range clusters { switch c.Kind { case ClusterNPSubj: subj = c case ClusterNPObj: obj = c case ClusterVP: vp = c case ClusterPP: pps = append(pps, c) default: mods = append(mods, c) } } var result []*Cluster switch dstOrder { case OrderSVO: if subj != nil { result = append(result, subj) } if vp != nil { result = append(result, vp) } if obj != nil { result = append(result, obj) } case OrderSOV: if subj != nil { result = append(result, subj) } if obj != nil { result = append(result, obj) } if vp != nil { result = append(result, vp) } case OrderVSO: if vp != nil { result = append(result, vp) } if subj != nil { result = append(result, subj) } if obj != nil { result = append(result, obj) } default: if subj != nil { result = append(result, subj) } if vp != nil { result = append(result, vp) } if obj != nil { result = append(result, obj) } } result = append(result, pps...) result = append(result, mods...) return result } // InsertMarkers assembles translated clusters into a target-language string. // Inserts prepositions (prepositional target), postpositions (postpositional target), // and copula ("is") for overt-copula languages. func InsertMarkers(clusters []*Cluster, dstDesc LangDesc, dstDomain uint8) string { copular := false for _, c := range clusters { if c.Kind == ClusterVP && c.Copular { copular = true break } } var out []byte addSpace := false emitEN := func(s string) { if s == "" { return } if addSpace { out = append(out, ' ') } out = append(out, []byte(s)...) addSpace = true } for _, c := range clusters { if c.Trans == "" { continue } if dstDesc.Markers == MarkerPostpositional { out = append(out, []byte(c.Trans)...) marker := LookupTargetMarker(dstDomain, c.Role) if marker != "" { out = append(out, []byte(marker)...) } } else { if c.Kind == ClusterPP { marker := LookupTargetMarker(dstDomain, c.Role) if marker != "" { emitEN(marker) } } else if c.Kind == ClusterVP && copular && !dstDesc.ZeroCopula { emitEN("is") } emitEN(c.Trans) } } return string(out) }