// Package langdetect implements corpus-trained character trigram language // identification. Models are built from raw text corpora and stored as // compact TSV files. Detection uses cosine similarity between the input's // trigram profile and each stored model; the highest-scoring language wins // if it clears the confidence threshold. // // Trigram choice: character-level (not byte-level) trigrams capture // orthographic patterns that are language-specific regardless of script. // 300 trigrams per model covers >95% of the frequency mass for any language. package langdetect import ( "bufio" "fmt" "io" "os" "strconv" ) const ( modelVersion = "v1" maxTrigrams = 300 minInputChars = 5 // minimum codepoints to attempt detection (3 gives one trigram) DefaultThresh = 0.90 ) // Model holds a trigram frequency profile for one language. type Model struct { Lang string // ISO 639-1 code Trigrams map[string]float64 // trigram → normalized frequency } // TrainFromReader builds a Model by reading text from r. // Counts all 3-codepoint trigrams, keeps the top maxTrigrams by frequency, // normalizes so values sum to 1.0. func TrainFromReader(lang string, r io.Reader) (*Model, error) { counts := map[string]uint64{} total := uint64(0) sc := bufio.NewScanner(r) sc.Buffer([]byte{:1<<20}, 1<<20) for sc.Scan() { line := sc.Text() extractTrigrams(line, func(t string) { counts[t]++ total++ }) } if err := sc.Err(); err != nil { return nil, err } if total == 0 { return nil, fmt.Errorf("langdetect: no trigrams found in corpus for %s", lang) } // Sort by frequency descending, keep top maxTrigrams. type kv struct { k string v uint64 } var pairs []kv for k, v := range counts { pairs = append(pairs, kv{k, v}) } // Sort descending by count (insertion sort — stable, works for any size). for i := 1; i < len(pairs); i++ { key := pairs[i] j := i - 1 for j >= 0 && pairs[j].v < key.v { pairs[j+1] = pairs[j] j-- } pairs[j+1] = key } if len(pairs) > maxTrigrams { pairs = pairs[:maxTrigrams] } // Compute sum of kept frequencies for normalization. sum := uint64(0) for _, p := range pairs { sum += p.v } m := &Model{ Lang: lang, Trigrams: map[string]float64{}, } for _, p := range pairs { m.Trigrams[p.k] = float64(p.v) / float64(sum) } return m, nil } // Save writes the model to path in TSV format: // # langmodel v1 // # lang: // \t // ... func (m *Model) Save(path string) error { f, err := os.Create(path) if err != nil { return err } defer f.Close() bw := bufio.NewWriter(f) fmt.Fprintf(bw, "# langmodel %s\n# lang: %s\n", modelVersion, m.Lang) // Sort for deterministic output. type kv struct{ k string; v float64 } var pairs []kv for k, v := range m.Trigrams { pairs = append(pairs, kv{k, v}) } for i := 1; i < len(pairs); i++ { key := pairs[i] j := i - 1 for j >= 0 && pairs[j].v < key.v { pairs[j+1] = pairs[j] j-- } pairs[j+1] = key } for _, p := range pairs { fmt.Fprintf(bw, "%s\t%.8f\n", p.k, p.v) } return bw.Flush() } // Load reads a model from a TSV file produced by Save. func Load(path string) (*Model, error) { f, err := os.Open(path) if err != nil { return nil, err } defer f.Close() m := &Model{Trigrams: map[string]float64{}} sc := bufio.NewScanner(f) for sc.Scan() { line := sc.Text() const langPrefix = "# lang: " if len(line) >= len(langPrefix) && line[:len(langPrefix)] == langPrefix { m.Lang = line[len(langPrefix):] continue } if len(line) > 0 && line[0] == '#' { continue } tab := -1 for i := 0; i < len(line); i++ { if line[i] == '\t' { tab = i break } } if tab < 0 { continue } freq, err := strconv.ParseFloat(line[tab+1:], 64) if err != nil { continue } m.Trigrams[line[:tab]] = freq } if m.Lang == "" { return nil, fmt.Errorf("langdetect: missing lang header in %s", path) } return m, sc.Err() } // Detector holds a set of language models and performs identification. type Detector struct { Models []*Model Threshold float64 // minimum cosine similarity to report a match (default 0.90) } // NewDetector creates a Detector with the given models and threshold. func NewDetector(models []*Model, threshold float64) *Detector { if threshold <= 0 { threshold = DefaultThresh } return &Detector{Models: models, Threshold: threshold} } // Detect returns the ISO language code and confidence [0,1] for text. // Returns ("", 0) if the text is too short or no model clears the threshold. func (d *Detector) Detect(text string) (lang string, confidence float64) { // Fast path: count codepoints. ncp := 0 for i := 0; i < len(text); { i += utf8CharLen(text[i]) ncp++ } if ncp < minInputChars { return "", 0 } // Build input trigram profile. inputCounts := map[string]uint64{} inputTotal := uint64(0) extractTrigrams(text, func(t string) { inputCounts[t]++ inputTotal++ }) if inputTotal == 0 { return "", 0 } // Compute cosine similarity against each model. // cosine(A, B) = dot(A, B) / (|A| * |B|) // Since model vectors are already normalized (sum=1 approximates L1 norm), // we use dot product directly as a proxy — good enough for top-K trigrams. // Score each model by dot product of input trigram profile vs model. // For disjoint scripts (EN/JA) the wrong-language score is near zero. type scored struct { lang string score float64 } var scores []scored scoreSum := 0.0 for _, m := range d.Models { dot := 0.0 for t, modelFreq := range m.Trigrams { if cnt, ok := inputCounts[t]; ok { dot += (float64(cnt) / float64(inputTotal)) * modelFreq } } scores = append(scores, scored{m.Lang, dot}) scoreSum += dot } // Find best. bestIdx := 0 for i, s := range scores { if s.score > scores[bestIdx].score { bestIdx = i } } // Relative confidence: best / total. Near 1.0 when one language // dominates (disjoint scripts); near 1/n when ambiguous. if scoreSum == 0 { return "", 0 } confidence = scores[bestIdx].score / scoreSum if confidence < d.Threshold { return "", confidence } return scores[bestIdx].lang, confidence } // extractTrigrams calls fn for every 3-codepoint sequence in text. // Uses byte-offset iteration (not []rune) for Moxie compatibility. func extractTrigrams(text string, fn func(string)) { // Build codepoint byte offsets. offsets := []int{:0:len(text)/3+1} i := 0 for i < len(text) { offsets = append(offsets, i) i += utf8CharLen(text[i]) } offsets = append(offsets, len(text)) n := len(offsets) - 1 for start := 0; start+3 <= n; start++ { // Copy bytes: trigram may be a slice of a scanner buffer that // gets reused on the next Scan() call. Map keys must own their bytes. raw := text[offsets[start]:offsets[start+3]] t := string(append([]byte(nil), raw...)) fn(t) } } // utf8CharLen returns byte length of the UTF-8 codepoint starting at b. func utf8CharLen(b byte) int { switch { case b < 0x80: return 1 case b < 0xE0: return 2 case b < 0xF0: return 3 default: return 4 } }