cluster.mx raw
1 package iskra
2
// ClusterType identifies the syntactic role of a phrase cluster.
type ClusterType uint8

// Cluster kinds. The values are assigned by declaration order (0 through 4),
// so any new kind has to be appended after ClusterMod to keep them stable.
const (
	ClusterNPSubj ClusterType = iota // kind KindFromRole assigns to subject roles
	ClusterNPObj                     // kind KindFromRole assigns to the direct-object role
	ClusterVP                        // kind ParseClusters assigns to the trailing span
	ClusterPP                        // kind KindFromRole assigns to the RolePP* roles and RoleNPDative
	ClusterMod                       // kind KindFromRole assigns to every other role
)
13
14 // Cluster is a phrase unit produced by ParseClusters.
15 type Cluster struct {
16 Kind ClusterType
17 Tokens []string // content tokens after boundary particles consumed
18 Flags uint64 // accumulated semantic flags from tokens
19 Role uint8 // particle role code (RoleNP*, RolePP*)
20 Nested []*Cluster // relative clause VP, if present
21 Trans string // target-language string, filled by TranslateCluster
22 Copular bool // VP cluster with no finite verb = copular sentence
23 }
24
// Callback signatures through which language-specific rules are supplied
// to the cluster parser.
type (
	// ParticleDetector reports whether tok is a phrase-boundary particle in
	// this domain.
	ParticleDetector func(tok string) bool

	// RoleLookup maps a (particle, npFlags) pair to a role code using the
	// domain's disambiguation rules.
	RoleLookup func(particle string, npFlags uint64) uint8

	// VerbDetector reports whether tok is a finite verb in this domain.
	VerbDetector func(tok string) bool

	// FlagAccumulator extracts the semantic flags of every token in the
	// slice and ORs them together.
	FlagAccumulator func(tokens []string) uint64

	// ContentFilter returns only the content words of a token slice,
	// stripping function words.
	ContentFilter func(tokens []string) []string
)
39
40 // ClusterHooks groups the language-specific callbacks for ParseClusters.
41 type ClusterHooks struct {
42 IsParticle ParticleDetector
43 LookupRole RoleLookup
44 HasVerb VerbDetector
45 AccumFlags FlagAccumulator
46 FilterContent ContentFilter
47 DefaultRole func(particle string) uint8 // fallback when LookupRole returns RoleNone
48 }
49
50 // KindFromRole maps a particle role code to a ClusterType.
51 func KindFromRole(role uint8) ClusterType {
52 switch role {
53 case RoleNPSubjTopic, RoleNPSubjGram:
54 return ClusterNPSubj
55 case RoleNPObjDirect:
56 return ClusterNPObj
57 case RolePPLocative, RolePPTemporal, RoleNPDative,
58 RolePPSource, RolePPLimit, RolePPComitative,
59 RolePPLocStatic, RolePPInstrumental:
60 return ClusterPP
61 }
62 return ClusterMod
63 }
64
65 // splitAtFirstVerb splits tokens at the first verb token: vpTokens ends at the verb
66 // (inclusive), headTokens is everything after.
67 func splitAtFirstVerb(tokens []string, isVerb VerbDetector) (vpTokens, headTokens []string) {
68 for i, tok := range tokens {
69 if isVerb(tok) {
70 return tokens[:i+1], tokens[i+1:]
71 }
72 }
73 return tokens, nil
74 }
75
76 // hasAnyVerb returns true if any token satisfies isVerb.
77 func hasAnyVerb(tokens []string, isVerb VerbDetector) bool {
78 for _, tok := range tokens {
79 if isVerb(tok) {
80 return true
81 }
82 }
83 return false
84 }
85
86 // ParseClusters segments a pre-tokenized sequence into phrase clusters.
87 // Particle-bounded mode (JA, KO): each particle closes the preceding accumulation.
88 // hooks provides all language-specific detection and classification functions.
89 // Must be called on already-tokenized input — compound forms must be pre-collapsed.
90 func ParseClusters(tokens []string, hooks ClusterHooks) []*Cluster {
91 var clusters []*Cluster
92 var acc []string
93
94 for _, tok := range tokens {
95 if !hooks.IsParticle(tok) {
96 acc = append(acc, tok)
97 continue
98 }
99 if len(acc) == 0 {
100 continue
101 }
102 var flags uint64
103 if hooks.AccumFlags != nil {
104 flags = hooks.AccumFlags(acc)
105 }
106 role := hooks.LookupRole(tok, flags)
107 if role == RoleNone && hooks.DefaultRole != nil {
108 role = hooks.DefaultRole(tok)
109 }
110 c := &Cluster{
111 Kind: KindFromRole(role),
112 Flags: flags,
113 Role: role,
114 }
115 if hasAnyVerb(acc, hooks.HasVerb) {
116 vp, head := splitAtFirstVerb(acc, hooks.HasVerb)
117 c.Tokens = head
118 c.Nested = ParseClusters(vp, hooks)
119 } else {
120 c.Tokens = acc
121 }
122 clusters = append(clusters, c)
123 acc = nil
124 }
125
126 if len(acc) > 0 {
127 var flags uint64
128 if hooks.AccumFlags != nil {
129 flags = hooks.AccumFlags(acc)
130 }
131 clusters = append(clusters, &Cluster{
132 Kind: ClusterVP,
133 Tokens: acc,
134 Flags: flags,
135 Copular: !hasAnyVerb(acc, hooks.HasVerb),
136 })
137 }
138 return clusters
139 }
140
141 // ReorderClusters rearranges clusters from source word order to destination word order.
142 // PP and Mod clusters are appended after the SVO/SOV core in source order.
143 func ReorderClusters(clusters []*Cluster, srcOrder, dstOrder uint8) []*Cluster {
144 var subj, obj, vp *Cluster
145 var pps, mods []*Cluster
146
147 for _, c := range clusters {
148 switch c.Kind {
149 case ClusterNPSubj:
150 subj = c
151 case ClusterNPObj:
152 obj = c
153 case ClusterVP:
154 vp = c
155 case ClusterPP:
156 pps = append(pps, c)
157 default:
158 mods = append(mods, c)
159 }
160 }
161
162 var result []*Cluster
163 switch dstOrder {
164 case OrderSVO:
165 if subj != nil { result = append(result, subj) }
166 if vp != nil { result = append(result, vp) }
167 if obj != nil { result = append(result, obj) }
168 case OrderSOV:
169 if subj != nil { result = append(result, subj) }
170 if obj != nil { result = append(result, obj) }
171 if vp != nil { result = append(result, vp) }
172 case OrderVSO:
173 if vp != nil { result = append(result, vp) }
174 if subj != nil { result = append(result, subj) }
175 if obj != nil { result = append(result, obj) }
176 default:
177 if subj != nil { result = append(result, subj) }
178 if vp != nil { result = append(result, vp) }
179 if obj != nil { result = append(result, obj) }
180 }
181 result = append(result, pps...)
182 result = append(result, mods...)
183 return result
184 }
185
186 // InsertMarkers assembles translated clusters into a target-language string.
187 // Inserts prepositions (prepositional target), postpositions (postpositional target),
188 // and copula ("is") for overt-copula languages.
189 func InsertMarkers(clusters []*Cluster, dstDesc LangDesc, dstDomain uint8) string {
190 copular := false
191 for _, c := range clusters {
192 if c.Kind == ClusterVP && c.Copular {
193 copular = true
194 break
195 }
196 }
197
198 var out []byte
199 addSpace := false
200
201 emitEN := func(s string) {
202 if s == "" {
203 return
204 }
205 if addSpace {
206 out = append(out, ' ')
207 }
208 out = append(out, []byte(s)...)
209 addSpace = true
210 }
211
212 for _, c := range clusters {
213 if c.Trans == "" {
214 continue
215 }
216 if dstDesc.Markers == MarkerPostpositional {
217 out = append(out, []byte(c.Trans)...)
218 marker := LookupTargetMarker(dstDomain, c.Role)
219 if marker != "" {
220 out = append(out, []byte(marker)...)
221 }
222 } else {
223 if c.Kind == ClusterPP {
224 marker := LookupTargetMarker(dstDomain, c.Role)
225 if marker != "" {
226 emitEN(marker)
227 }
228 } else if c.Kind == ClusterVP && copular && !dstDesc.ZeroCopula {
229 emitEN("is")
230 }
231 emitEN(c.Trans)
232 }
233 }
234 return string(out)
235 }
236