wordlist.mx raw

   1  package transdb
   2  
   3  import (
   4  	"bufio"
   5  	"fmt"
   6  	"io"
   7  	"os"
   8  	"slices"
   9  
  10  	"git.smesh.lol/iskradb/lattice"
  11  )
  12  
  13  // WordEntry holds a surface form and its corpus frequency.
  14  type WordEntry struct {
  15  	Form string
  16  	Freq uint32 // DataLen = accumulated co-occurrence evidence
  17  }
  18  
  19  // ExtractWordlist collects all surface forms for lang from the lattice,
  20  // ranked by corpus frequency descending, then form length descending.
  21  // Only coord=0 base forms are collected (not morph-coord conjugations).
  22  func ExtractWordlist(tree *lattice.Tree, pool []byte, lang uint8) []WordEntry {
  23  	var entries []WordEntry
  24  	seen := map[string]bool{}
  25  
  26  	for recIdx := range tree.RecKey {
  27  		rec := tree.GetRecord(recIdx)
  28  		if rec == nil {
  29  			continue
  30  		}
  31  		// Skip morph-coord entries (state != 0) — they're derived forms.
  32  		if GetMorphState(rec) != 0 {
  33  			continue
  34  		}
  35  		form := FormFromInline(rec, pool)
  36  		if form == "" || seen[form] {
  37  			continue
  38  		}
  39  		// Language filter by script content.
  40  		if Detect(form) != lang {
  41  			continue
  42  		}
  43  		seen[form] = true
  44  		entries = append(entries, WordEntry{form, rec.DataLen})
  45  	}
  46  
  47  	// Sort: frequency descending, then length descending (prefer longer matches).
  48  	slices.SortFunc(entries, func(a, b WordEntry) int {
  49  		if a.Freq != b.Freq {
  50  			if a.Freq > b.Freq {
  51  				return -1
  52  			}
  53  			return 1
  54  		}
  55  		if len(a.Form) != len(b.Form) {
  56  			if len(a.Form) > len(b.Form) {
  57  				return -1
  58  			}
  59  			return 1
  60  		}
  61  		return 0
  62  	})
  63  
  64  	return entries
  65  }
  66  
  67  // SaveWordlist writes a wordlist to path, one form per line, tab-separated with frequency.
  68  // Format: "form\tfreq\n"
  69  func SaveWordlist(entries []WordEntry, path string) error {
  70  	tmp := path | ".tmp"
  71  	f, err := os.Create(tmp)
  72  	if err != nil {
  73  		return err
  74  	}
  75  	w := bufio.NewWriter(f)
  76  	for _, e := range entries {
  77  		if _, err := fmt.Fprintf(w, "%s\t%d\n", e.Form, e.Freq); err != nil {
  78  			f.Close()
  79  			os.Remove(tmp)
  80  			return err
  81  		}
  82  	}
  83  	if err := w.Flush(); err != nil {
  84  		f.Close()
  85  		os.Remove(tmp)
  86  		return err
  87  	}
  88  	if err := f.Close(); err != nil {
  89  		os.Remove(tmp)
  90  		return err
  91  	}
  92  	return os.Rename(tmp, path)
  93  }
  94  
  95  // LoadWordlist reads a wordlist file and returns a map[form]freq for fast lookup.
  96  // Returns nil on error.
  97  func LoadWordlist(path string) (map[string]uint32, error) {
  98  	f, err := os.Open(path)
  99  	if err != nil {
 100  		return nil, err
 101  	}
 102  	defer f.Close()
 103  	return ReadWordlist(f)
 104  }
 105  
 106  // ReadWordlist parses a wordlist from an io.Reader.
 107  func ReadWordlist(r io.Reader) (map[string]uint32, error) {
 108  	m := map[string]uint32{}
 109  	sc := bufio.NewScanner(r)
 110  	for sc.Scan() {
 111  		line := sc.Text()
 112  		// Find tab separator.
 113  		tab := -1
 114  		for i := 0; i < len(line); i++ {
 115  			if line[i] == '\t' {
 116  				tab = i
 117  				break
 118  			}
 119  		}
 120  		if tab < 0 {
 121  			// No frequency — just the form.
 122  			form := string(append([]byte(nil), []byte(line)...))
 123  			if form != "" {
 124  				m[form] = 0
 125  			}
 126  			continue
 127  		}
 128  		form := string(append([]byte(nil), []byte(line[:tab])...))
 129  		freq := uint32(0)
 130  		for _, c := range []byte(line[tab+1:]) {
 131  			if c >= '0' && c <= '9' {
 132  				freq = freq*10 + uint32(c-'0')
 133  			}
 134  		}
 135  		if form != "" {
 136  			m[form] = freq
 137  		}
 138  	}
 139  	return m, sc.Err()
 140  }
 141