package transdb // Detect returns LangJA if the text contains a significant fraction of // Japanese script codepoints (hiragana, katakana, or CJK unified ideographs). // Returns LangEN for Latin-dominant or ambiguous text. // Returns LangUnknown for empty or pure-punctuation input. // // Threshold: if Japanese-script codepoints exceed 5% of total non-whitespace // codepoints, the text is classified as Japanese. This is conservative enough // to avoid false positives from occasional kanji in mixed text. func Detect(text string) uint8 { var hiragana, katakana, cjk, latin, total int for i := 0; i < len(text); { r, size := decodeRune(text, i) i += size if isSpace(r) { continue } total++ switch { case r >= 0x3040 && r <= 0x309F: hiragana++ case r >= 0x30A0 && r <= 0x30FF: katakana++ case r >= 0x4E00 && r <= 0x9FFF: cjk++ case (r >= 0x0041 && r <= 0x005A) || (r >= 0x0061 && r <= 0x007A): latin++ } } if total == 0 { return LangUnknown } jaScore := hiragana + katakana + cjk // 5% threshold: even one hiragana in a short word is decisive. if jaScore*20 > total { return LangJA } if latin > 0 || total > 0 { return LangEN } return LangUnknown } // decodeRune decodes one UTF-8 codepoint from s starting at offset i. // Returns the rune and the number of bytes consumed. func decodeRune(s string, i int) (rune, int) { b := s[i] if b < 0x80 { return rune(b), 1 } if b < 0xE0 { if i+1 >= len(s) { return 0xFFFD, 1 } return rune(b&0x1F)<<6 | rune(s[i+1]&0x3F), 2 } if b < 0xF0 { if i+2 >= len(s) { return 0xFFFD, 1 } return rune(b&0x0F)<<12 | rune(s[i+1]&0x3F)<<6 | rune(s[i+2]&0x3F), 3 } if i+3 >= len(s) { return 0xFFFD, 1 } return rune(b&0x07)<<18 | rune(s[i+1]&0x3F)<<12 | rune(s[i+2]&0x3F)<<6 | rune(s[i+3]&0x3F), 4 } func isSpace(r rune) bool { return r == ' ' || r == '\t' || r == '\n' || r == '\r' || r == ' ' }