cluster.mx raw
1 package transdb
2
3 import (
4 "git.mleku.dev/iskra"
5 "git.smesh.lol/iskradb/lattice"
6 )
7
8 // Re-export cluster types from iskra.
9 type ClusterType = iskra.ClusterType
10 type Cluster = iskra.Cluster
11
12 const (
13 ClusterNPSubj = iskra.ClusterNPSubj
14 ClusterNPObj = iskra.ClusterNPObj
15 ClusterVP = iskra.ClusterVP
16 ClusterPP = iskra.ClusterPP
17 ClusterMod = iskra.ClusterMod
18 )
19
// jaParticleSet is the set of JA particles that the IsParticle hook treats as
// cluster boundary markers during ParseClusters. Lookups of any other token
// yield false.
var jaParticleSet = map[string]bool{
	"は":  true,
	"が":  true,
	"を":  true,
	"に":  true,
	"で":  true,
	"と":  true,
	"へ":  true,
	"も":  true,
	"から": true,
	"まで": true,
	"より": true,
}
27
28 // jaDefaultRole: fallback roles when Bcooccur is not yet populated.
29 var jaDefaultRole = map[string]uint8{
30 "は": RoleNPSubjTopic,
31 "が": RoleNPSubjGram,
32 "を": RoleNPObjDirect,
33 "から": RolePPSource,
34 "まで": RolePPLimit,
35 "と": RolePPComitative,
36 "に": RoleNPDative,
37 "で": RolePPInstrumental,
38 "へ": RolePPLocative,
39 "より": RolePPSource,
40 "も": RoleNPSubjTopic,
41 }
42
43 // jaClusterHooks constructs the ClusterHooks for JA parsing.
44 func jaClusterHooks(tree *lattice.Tree) iskra.ClusterHooks {
45 return iskra.ClusterHooks{
46 IsParticle: func(tok string) bool { return jaParticleSet[tok] },
47 LookupRole: func(tok string, npFlags uint64) uint8 {
48 return LookupParticleRole(tree, LangJA, tok, npFlags)
49 },
50 HasVerb: func(tok string) bool { return isJAVerb(tree, tok) },
51 AccumFlags: func(tokens []string) uint64 {
52 var flags uint64
53 for _, tok := range tokens {
54 for _, b := range ActiveBranches {
55 if ri := tree.LookupRecIdx(b, MakeKey(LangJA, 0, tok)); ri != lattice.NullRec {
56 if rec := tree.GetRecord(ri); rec != nil {
57 flags |= GetSemanticFromDataFile(rec)
58 }
59 break
60 }
61 }
62 }
63 return flags
64 },
65 FilterContent: func(tokens []string) []string {
66 var out []string
67 for _, tok := range tokens {
68 if !jaFunctionWord[tok] {
69 out = append(out, tok)
70 }
71 }
72 return out
73 },
74 DefaultRole: func(tok string) uint8 { return jaDefaultRole[tok] },
75 }
76 }
77
78 // ParseClusters segments a pre-tokenized JA sequence into phrase clusters.
79 func ParseClusters(tokens []string, tree *lattice.Tree, lang uint8) []*Cluster {
80 hooks := jaClusterHooks(tree)
81 return iskra.ParseClusters(tokens, hooks)
82 }
83
84 // clusterHeadLookup finds the target form and morph state for a source token.
85 func clusterHeadLookup(tree *lattice.Tree, pool []byte, tok string, srcLang uint8, coord uint64) (form string, state uint8) {
86 order := defaultBranchOrder
87 if srcLang == LangJA {
88 order = branchOrderJA(coord)
89 }
90 for _, c := range RelaxCoord(coord) {
91 key := MakeKey(srcLang, c, tok)
92 for _, b := range order {
93 ri := tree.LookupRecIdx(lattice.Branch(b), key)
94 if ri == lattice.NullRec {
95 continue
96 }
97 rec := tree.GetRecord(ri)
98 if rec == nil {
99 continue
100 }
101 if rec.Link[0] == lattice.NullRec {
102 break
103 }
104 dst := tree.GetRecord(rec.Link[0])
105 if dst == nil {
106 break
107 }
108 if f := FormFromInline(dst, pool); f != "" {
109 return f, GetMorphState(rec)
110 }
111 break
112 }
113 }
114 if srcLang == LangJA {
115 ms := inferMorphState(tok)
116 for _, stem := range verbStems(tok) {
117 sCoord := PackCoord(0, 0, 0, uint64(ms), 0, 0, 0)
118 if f, _ := clusterHeadLookup(tree, pool, stem, srcLang, sCoord); f != "" {
119 return f, ms
120 }
121 }
122 }
123 return "", 0
124 }
125
126 // TranslateCluster translates a cluster's tokens and fills c.Trans.
127 func TranslateCluster(c *Cluster, tree *lattice.Tree, pool []byte, srcLang, dstLang uint8) {
128 for _, nc := range c.Nested {
129 TranslateCluster(nc, tree, pool, srcLang, dstLang)
130 }
131
132 hooks := jaClusterHooks(tree)
133 content := hooks.FilterContent(c.Tokens)
134
135 var morphState uint8
136 if c.Kind == ClusterVP && !c.Copular && len(content) > 0 {
137 head := content[len(content)-1]
138 if srcLang == LangJA {
139 morphState = inferMorphState(head)
140 }
141 }
142 coord := PackCoord(c.Flags, 0, 0, uint64(morphState), 0, 0, 0)
143
144 headIdx := len(content) - 1
145 if srcLang == LangEN || headIdx < 0 {
146 headIdx = 0
147 }
148
149 headTrans := ""
150 headState := uint8(0)
151 if len(content) > 0 {
152 headTrans, headState = clusterHeadLookup(tree, pool, content[headIdx], srcLang, coord)
153 if headTrans == "" {
154 headTrans = content[headIdx]
155 } else if dstLang == LangEN {
156 headTrans = applyMorphEN(headTrans, headState)
157 }
158 }
159
160 modCoord := PackCoord(c.Flags, 0, 0, 0, 0, 0, 0)
161 var modTrans []string
162 for i, tok := range content {
163 if i == headIdx {
164 continue
165 }
166 t, _ := clusterHeadLookup(tree, pool, tok, srcLang, modCoord)
167 if t == "" {
168 t = tok
169 } else if dstLang == LangEN {
170 t = stripTo(t)
171 }
172 modTrans = append(modTrans, t)
173 }
174
175 rcTrans := ""
176 if len(c.Nested) > 0 {
177 var parts []string
178 for _, nc := range c.Nested {
179 if nc.Trans != "" {
180 parts = append(parts, nc.Trans)
181 }
182 }
183 rcTrans = joinWords(parts, dstLang)
184 }
185
186 var parts []string
187 if dstLang == LangEN {
188 if rcTrans != "" {
189 parts = append(parts, rcTrans)
190 }
191 parts = append(parts, modTrans...)
192 if headTrans != "" {
193 parts = append(parts, headTrans)
194 }
195 } else {
196 parts = append(parts, modTrans...)
197 if rcTrans != "" {
198 parts = append(parts, rcTrans)
199 }
200 if headTrans != "" {
201 parts = append(parts, headTrans)
202 }
203 }
204
205 c.Trans = joinWords(parts, dstLang)
206 }
207
208 // joinWords joins parts with spaces (EN) or no separator (JA).
209 func joinWords(parts []string, lang uint8) string {
210 var out []byte
211 first := true
212 for _, p := range parts {
213 if p == "" {
214 continue
215 }
216 if lang == LangEN && !first {
217 out = append(out, ' ')
218 }
219 out = append(out, []byte(p)...)
220 first = false
221 }
222 return string(out)
223 }
224
225 // ReorderClusters rearranges clusters from src to dst word order.
226 func ReorderClusters(clusters []*Cluster, srcOrder, dstOrder uint8) []*Cluster {
227 return iskra.ReorderClusters(clusters, srcOrder, dstOrder)
228 }
229
230 // InsertMarkers assembles clusters into a target-language string.
231 func InsertMarkers(clusters []*Cluster, dstDesc LangDesc, dstLang uint8) string {
232 return iskra.InsertMarkers(clusters, dstDesc, dstLang)
233 }
234