package transdb import ( "git.mleku.dev/iskra" "git.smesh.lol/iskradb/lattice" ) // Re-export cluster types from iskra. type ClusterType = iskra.ClusterType type Cluster = iskra.Cluster const ( ClusterNPSubj = iskra.ClusterNPSubj ClusterNPObj = iskra.ClusterNPObj ClusterVP = iskra.ClusterVP ClusterPP = iskra.ClusterPP ClusterMod = iskra.ClusterMod ) // jaParticleSet: JA boundary-marking particles for ParseClusters. var jaParticleSet = map[string]bool{ "は": true, "が": true, "を": true, "に": true, "で": true, "と": true, "から": true, "まで": true, "へ": true, "より": true, "も": true, } // jaDefaultRole: fallback roles when Bcooccur is not yet populated. var jaDefaultRole = map[string]uint8{ "は": RoleNPSubjTopic, "が": RoleNPSubjGram, "を": RoleNPObjDirect, "から": RolePPSource, "まで": RolePPLimit, "と": RolePPComitative, "に": RoleNPDative, "で": RolePPInstrumental, "へ": RolePPLocative, "より": RolePPSource, "も": RoleNPSubjTopic, } // jaClusterHooks constructs the ClusterHooks for JA parsing. func jaClusterHooks(tree *lattice.Tree) iskra.ClusterHooks { return iskra.ClusterHooks{ IsParticle: func(tok string) bool { return jaParticleSet[tok] }, LookupRole: func(tok string, npFlags uint64) uint8 { return LookupParticleRole(tree, LangJA, tok, npFlags) }, HasVerb: func(tok string) bool { return isJAVerb(tree, tok) }, AccumFlags: func(tokens []string) uint64 { var flags uint64 for _, tok := range tokens { for _, b := range ActiveBranches { if ri := tree.LookupRecIdx(b, MakeKey(LangJA, 0, tok)); ri != lattice.NullRec { if rec := tree.GetRecord(ri); rec != nil { flags |= GetSemanticFromDataFile(rec) } break } } } return flags }, FilterContent: func(tokens []string) []string { var out []string for _, tok := range tokens { if !jaFunctionWord[tok] { out = append(out, tok) } } return out }, DefaultRole: func(tok string) uint8 { return jaDefaultRole[tok] }, } } // ParseClusters segments a pre-tokenized JA sequence into phrase clusters. func ParseClusters(tokens []string, tree *lattice.Tree, lang uint8) []*Cluster { hooks := jaClusterHooks(tree) return iskra.ParseClusters(tokens, hooks) } // clusterHeadLookup finds the target form and morph state for a source token. func clusterHeadLookup(tree *lattice.Tree, pool []byte, tok string, srcLang uint8, coord uint64) (form string, state uint8) { order := defaultBranchOrder if srcLang == LangJA { order = branchOrderJA(coord) } for _, c := range RelaxCoord(coord) { key := MakeKey(srcLang, c, tok) for _, b := range order { ri := tree.LookupRecIdx(lattice.Branch(b), key) if ri == lattice.NullRec { continue } rec := tree.GetRecord(ri) if rec == nil { continue } if rec.Link[0] == lattice.NullRec { break } dst := tree.GetRecord(rec.Link[0]) if dst == nil { break } if f := FormFromInline(dst, pool); f != "" { return f, GetMorphState(rec) } break } } if srcLang == LangJA { ms := inferMorphState(tok) for _, stem := range verbStems(tok) { sCoord := PackCoord(0, 0, 0, uint64(ms), 0, 0, 0) if f, _ := clusterHeadLookup(tree, pool, stem, srcLang, sCoord); f != "" { return f, ms } } } return "", 0 } // TranslateCluster translates a cluster's tokens and fills c.Trans. func TranslateCluster(c *Cluster, tree *lattice.Tree, pool []byte, srcLang, dstLang uint8) { for _, nc := range c.Nested { TranslateCluster(nc, tree, pool, srcLang, dstLang) } hooks := jaClusterHooks(tree) content := hooks.FilterContent(c.Tokens) var morphState uint8 if c.Kind == ClusterVP && !c.Copular && len(content) > 0 { head := content[len(content)-1] if srcLang == LangJA { morphState = inferMorphState(head) } } coord := PackCoord(c.Flags, 0, 0, uint64(morphState), 0, 0, 0) headIdx := len(content) - 1 if srcLang == LangEN || headIdx < 0 { headIdx = 0 } headTrans := "" headState := uint8(0) if len(content) > 0 { headTrans, headState = clusterHeadLookup(tree, pool, content[headIdx], srcLang, coord) if headTrans == "" { headTrans = content[headIdx] } else if dstLang == LangEN { headTrans = applyMorphEN(headTrans, headState) } } modCoord := PackCoord(c.Flags, 0, 0, 0, 0, 0, 0) var modTrans []string for i, tok := range content { if i == headIdx { continue } t, _ := clusterHeadLookup(tree, pool, tok, srcLang, modCoord) if t == "" { t = tok } else if dstLang == LangEN { t = stripTo(t) } modTrans = append(modTrans, t) } rcTrans := "" if len(c.Nested) > 0 { var parts []string for _, nc := range c.Nested { if nc.Trans != "" { parts = append(parts, nc.Trans) } } rcTrans = joinWords(parts, dstLang) } var parts []string if dstLang == LangEN { if rcTrans != "" { parts = append(parts, rcTrans) } parts = append(parts, modTrans...) if headTrans != "" { parts = append(parts, headTrans) } } else { parts = append(parts, modTrans...) if rcTrans != "" { parts = append(parts, rcTrans) } if headTrans != "" { parts = append(parts, headTrans) } } c.Trans = joinWords(parts, dstLang) } // joinWords joins parts with spaces (EN) or no separator (JA). func joinWords(parts []string, lang uint8) string { var out []byte first := true for _, p := range parts { if p == "" { continue } if lang == LangEN && !first { out = append(out, ' ') } out = append(out, []byte(p)...) first = false } return string(out) } // ReorderClusters rearranges clusters from src to dst word order. func ReorderClusters(clusters []*Cluster, srcOrder, dstOrder uint8) []*Cluster { return iskra.ReorderClusters(clusters, srcOrder, dstOrder) } // InsertMarkers assembles clusters into a target-language string. func InsertMarkers(clusters []*Cluster, dstDesc LangDesc, dstLang uint8) string { return iskra.InsertMarkers(clusters, dstDesc, dstLang) }