wordlist.mx raw
1 package transdb
2
3 import (
4 "bufio"
5 "fmt"
6 "io"
7 "os"
8 "slices"
9
10 "git.smesh.lol/iskradb/lattice"
11 )
12
// WordEntry is one row of a wordlist: a surface form paired with the
// frequency value used to rank it.
type WordEntry struct {
	// Form is the surface form text.
	Form string
	// Freq is the ranking weight. ExtractWordlist fills it from the
	// record's DataLen (accumulated co-occurrence evidence).
	Freq uint32
}
18
19 // ExtractWordlist collects all surface forms for lang from the lattice,
20 // ranked by corpus frequency descending, then form length descending.
21 // Only coord=0 base forms are collected (not morph-coord conjugations).
22 func ExtractWordlist(tree *lattice.Tree, pool []byte, lang uint8) []WordEntry {
23 var entries []WordEntry
24 seen := map[string]bool{}
25
26 for recIdx := range tree.RecKey {
27 rec := tree.GetRecord(recIdx)
28 if rec == nil {
29 continue
30 }
31 // Skip morph-coord entries (state != 0) — they're derived forms.
32 if GetMorphState(rec) != 0 {
33 continue
34 }
35 form := FormFromInline(rec, pool)
36 if form == "" || seen[form] {
37 continue
38 }
39 // Language filter by script content.
40 if Detect(form) != lang {
41 continue
42 }
43 seen[form] = true
44 entries = append(entries, WordEntry{form, rec.DataLen})
45 }
46
47 // Sort: frequency descending, then length descending (prefer longer matches).
48 slices.SortFunc(entries, func(a, b WordEntry) int {
49 if a.Freq != b.Freq {
50 if a.Freq > b.Freq {
51 return -1
52 }
53 return 1
54 }
55 if len(a.Form) != len(b.Form) {
56 if len(a.Form) > len(b.Form) {
57 return -1
58 }
59 return 1
60 }
61 return 0
62 })
63
64 return entries
65 }
66
67 // SaveWordlist writes a wordlist to path, one form per line, tab-separated with frequency.
68 // Format: "form\tfreq\n"
69 func SaveWordlist(entries []WordEntry, path string) error {
70 tmp := path | ".tmp"
71 f, err := os.Create(tmp)
72 if err != nil {
73 return err
74 }
75 w := bufio.NewWriter(f)
76 for _, e := range entries {
77 if _, err := fmt.Fprintf(w, "%s\t%d\n", e.Form, e.Freq); err != nil {
78 f.Close()
79 os.Remove(tmp)
80 return err
81 }
82 }
83 if err := w.Flush(); err != nil {
84 f.Close()
85 os.Remove(tmp)
86 return err
87 }
88 if err := f.Close(); err != nil {
89 os.Remove(tmp)
90 return err
91 }
92 return os.Rename(tmp, path)
93 }
94
95 // LoadWordlist reads a wordlist file and returns a map[form]freq for fast lookup.
96 // Returns nil on error.
97 func LoadWordlist(path string) (map[string]uint32, error) {
98 f, err := os.Open(path)
99 if err != nil {
100 return nil, err
101 }
102 defer f.Close()
103 return ReadWordlist(f)
104 }
105
// ReadWordlist parses a wordlist from r and returns a form -> frequency map.
//
// Each line is either "form\tfreq" or just "form":
//   - A line without a tab is a bare form and is stored with frequency 0.
//   - Otherwise the form is the text before the first tab and the frequency
//     is built from the decimal digits found after it. Non-digit bytes in
//     that tail are ignored, so malformed tails are tolerated, not rejected.
//   - A frequency too large for uint32 saturates at the maximum uint32 value
//     instead of silently wrapping around.
//   - Lines whose form is empty are skipped.
//   - If a form occurs more than once, the last occurrence wins.
//
// The returned error is the scanner's error, if any; the map then holds the
// entries parsed before the failure.
func ReadWordlist(r io.Reader) (map[string]uint32, error) {
	// Largest frequency a uint32 can hold; larger parsed values clamp here.
	const maxFreq = 1<<32 - 1

	m := map[string]uint32{}
	sc := bufio.NewScanner(r)
	for sc.Scan() {
		line := sc.Text()

		// Locate the first tab separator, if any.
		tab := -1
		for i := 0; i < len(line); i++ {
			if line[i] == '\t' {
				tab = i
				break
			}
		}

		if tab < 0 {
			// No frequency column — just the form.
			// NOTE(review): the explicit byte copy is kept from the original;
			// presumably it detaches the key from the scanner's buffer — confirm.
			form := string(append([]byte(nil), []byte(line)...))
			if form != "" {
				m[form] = 0
			}
			continue
		}

		form := string(append([]byte(nil), []byte(line[:tab])...))
		if form == "" {
			continue
		}

		// Accumulate in 64 bits and clamp after every digit: the running
		// value never exceeds maxFreq, so freq*10+9 cannot overflow uint64.
		var freq uint64
		for _, c := range []byte(line[tab+1:]) {
			if c < '0' || c > '9' {
				continue
			}
			freq = freq*10 + uint64(c-'0')
			if freq > maxFreq {
				freq = maxFreq
			}
		}
		m[form] = uint32(freq)
	}
	return m, sc.Err()
}
141