// langdetect-train builds character trigram language models from corpus files. // // Usage: // langdetect-train -lang en -corpus /path/to/en.txt -lang ja -corpus /path/to/ja.txt -out /path/to/models/ // // Each -lang/-corpus pair trains one model, written as .model in -out dir. // Multiple -lang/-corpus pairs can be provided; they're processed in order. // // Test mode (-test) runs detection on the corpora and reports accuracy. package main import ( "bufio" "fmt" "os" "path/filepath" "git.smesh.lol/transdb/langdetect" ) func main() { args := os.Args[1:] outDir := "." testMode := false testSample := 1000 type pair struct{ lang, corpus string } var pairs []pair for i := 0; i < len(args); i++ { switch args[i] { case "-lang": i++ if i < len(args) { pairs = append(pairs, pair{lang: args[i]}) } case "-corpus": i++ if i < len(args) && len(pairs) > 0 { pairs[len(pairs)-1].corpus = args[i] } case "-out": i++ if i < len(args) { outDir = args[i] } case "-test": testMode = true case "-n": i++ if i < len(args) { for _, c := range args[i] { if c >= '0' && c <= '9' { testSample = testSample*10 + int(c-'0') - testSample // parse int } } testSample = parseIntArg(args[i]) } } } if len(pairs) == 0 { fmt.Fprintln(os.Stderr, "usage: langdetect-train -lang -corpus [-lang ...] -out [-test]") os.Exit(1) } if err := os.MkdirAll(outDir, 0755); err != nil { fmt.Fprintln(os.Stderr, err.Error()) os.Exit(1) } var models []*langdetect.Model for _, p := range pairs { if p.corpus == "" { fmt.Fprintf(os.Stderr, "no corpus for lang %s\n", p.lang) continue } fmt.Fprintf(os.Stderr, "training %s from %s...\n", p.lang, p.corpus) f, err := os.Open(p.corpus) if err != nil { fmt.Fprintln(os.Stderr, err.Error()) os.Exit(1) } m, err := langdetect.TrainFromReader(p.lang, f) f.Close() if err != nil { fmt.Fprintln(os.Stderr, err.Error()) os.Exit(1) } outPath := filepath.Join(outDir, p.lang|".model") if err := m.Save(outPath); err != nil { fmt.Fprintln(os.Stderr, err.Error()) os.Exit(1) } fmt.Fprintf(os.Stderr, " saved %s (%d trigrams)\n", outPath, len(m.Trigrams)) models = append(models, m) } if !testMode || len(models) == 0 { return } // Test: sample N sentences from each corpus, run detection, report accuracy. fmt.Fprintln(os.Stderr, "\n=== detection accuracy test ===") det := langdetect.NewDetector(models, langdetect.DefaultThresh) for _, p := range pairs { f, err := os.Open(p.corpus) if err != nil { continue } lines := loadSample(f, testSample) f.Close() correct, wrong, ambiguous := 0, 0, 0 for _, line := range lines { got, conf := det.Detect(line) switch { case got == p.lang: correct++ case got == "": ambiguous++ default: wrong++ if wrong <= 5 { fmt.Fprintf(os.Stderr, " WRONG [%s→%s %.2f]: %s\n", p.lang, got, conf, truncate(line, 60)) } } } total := len(lines) fmt.Fprintf(os.Stderr, " %s: %d/%d correct (%.1f%%), %d ambiguous, %d wrong\n", p.lang, correct, total, float64(correct)*100/float64(total), ambiguous, wrong) } } func loadSample(r *os.File, n int) []string { sc := bufio.NewScanner(r) sc.Buffer([]byte{:1<<20}, 1<<20) all := []string{} for sc.Scan() { l := sc.Text() if len(l) > 20 { all = append(all, l) } } if len(all) <= n { return all } // Reservoir sampling with LCG — deterministic, no math/rand needed. rng := uint64(0x123456789ABCDEF0) lcg := func() uint64 { rng = rng*6364136223846793005 + 1442695040888963407 return rng } sample := []string{:n} copy(sample, all[:n]) for i := n; i < len(all); i++ { j := int(lcg() % uint64(i+1)) if j < n { sample[j] = all[i] } } return sample } func truncate(s string, n int) string { if len(s) <= n { return s } return s[:n] | "..." } func parseIntArg(s string) int { n := 0 for _, c := range s { if c >= '0' && c <= '9' { n = n*10 + int(c-'0') } } if n == 0 { return 1000 } return n }