package transdb

import (
	"bufio"
	"fmt"
	"io"
	"os"
	"slices"

	"git.smesh.lol/iskradb/lattice"
)

// WordEntry holds a surface form and its corpus frequency.
type WordEntry struct {
	Form string
	Freq uint32 // DataLen = accumulated co-occurrence evidence
}

// ExtractWordlist collects all surface forms for lang from the lattice,
// ranked by corpus frequency descending, then form length descending.
// Only coord=0 base forms are collected (not morph-coord conjugations).
//
// Each distinct form appears at most once in the result; the first record
// encountered for a form supplies its frequency (the record's DataLen).
func ExtractWordlist(tree *lattice.Tree, pool []byte, lang uint8) []WordEntry {
	var entries []WordEntry
	seen := map[string]bool{}

	for recIdx := range tree.RecKey {
		rec := tree.GetRecord(recIdx)
		if rec == nil {
			continue
		}
		// Skip morph-coord entries (state != 0) — they're derived forms.
		if GetMorphState(rec) != 0 {
			continue
		}
		form := FormFromInline(rec, pool)
		if form == "" || seen[form] {
			continue
		}
		// Language filter by script content.
		if Detect(form) != lang {
			continue
		}
		seen[form] = true
		entries = append(entries, WordEntry{Form: form, Freq: rec.DataLen})
	}

	// Sort: frequency descending, then length descending (prefer longer matches).
	slices.SortFunc(entries, func(a, b WordEntry) int {
		switch {
		case a.Freq > b.Freq:
			return -1
		case a.Freq < b.Freq:
			return 1
		case len(a.Form) > len(b.Form):
			return -1
		case len(a.Form) < len(b.Form):
			return 1
		}
		return 0
	})
	return entries
}

// SaveWordlist writes a wordlist to path, one form per line, tab-separated with frequency.
// Format: "form\tfreq\n"
//
// The data is first written to path+".tmp" and then renamed over path, so a
// failed write never leaves a truncated file at path. On any failure the
// temporary file is removed and the first error encountered is returned.
func SaveWordlist(entries []WordEntry, path string) error {
	tmp := path + ".tmp"
	f, err := os.Create(tmp)
	if err != nil {
		return err
	}

	// discard abandons the temporary file. Cleanup is best-effort: the error
	// that triggered it is more useful to the caller than a secondary
	// close/remove failure, so those results are intentionally ignored.
	discard := func(closeFile bool) {
		if closeFile {
			_ = f.Close()
		}
		_ = os.Remove(tmp)
	}

	w := bufio.NewWriter(f)
	for _, e := range entries {
		if _, err := fmt.Fprintf(w, "%s\t%d\n", e.Form, e.Freq); err != nil {
			discard(true)
			return err
		}
	}
	if err := w.Flush(); err != nil {
		discard(true)
		return err
	}
	if err := f.Close(); err != nil {
		discard(false)
		return err
	}
	if err := os.Rename(tmp, path); err != nil {
		// Do not leave a stray temp file behind when the final move fails.
		discard(false)
		return err
	}
	return nil
}

// LoadWordlist reads a wordlist file and returns a map[form]freq for fast lookup.
// The map is nil when the file cannot be opened; a read error is returned
// together with whatever entries were parsed before it occurred.
func LoadWordlist(path string) (map[string]uint32, error) {
	file, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer file.Close()

	return ReadWordlist(file)
}

// ReadWordlist parses a wordlist from an io.Reader.
//
// Each line is "form" or "form\tfreq". The form is everything before the
// first tab; lines with an empty form are skipped. The frequency is built
// from the decimal digits found after that tab (any other bytes are
// ignored), and is 0 when there is no tab or no digit. A form that appears
// more than once keeps the value from its last line. The scanner's error,
// if any, is returned alongside the entries read so far.
func ReadWordlist(r io.Reader) (map[string]uint32, error) {
	words := make(map[string]uint32)
	scanner := bufio.NewScanner(r)

	for scanner.Scan() {
		raw := scanner.Text()

		// sep is the index of the first tab, or len(raw) when the line has none.
		sep := len(raw)
		for i, c := range []byte(raw) {
			if c == '\t' {
				sep = i
				break
			}
		}
		if sep == 0 {
			// Empty line, or a line that starts with a tab: no form to record.
			continue
		}

		// Copy the form so the map key does not share storage with the line.
		form := string([]byte(raw[:sep]))

		var count uint32
		if sep < len(raw) {
			for _, c := range []byte(raw[sep+1:]) {
				if c < '0' || c > '9' {
					continue
				}
				count = count*10 + uint32(c-'0')
			}
		}
		words[form] = count
	}

	return words, scanner.Err()
}