1 package transdb
2 3 // Detect returns LangJA if the text contains a significant fraction of
4 // Japanese script codepoints (hiragana, katakana, or CJK unified ideographs).
5 // Returns LangEN for Latin-dominant or ambiguous text.
6 // Returns LangUnknown for empty or pure-punctuation input.
7 //
8 // Threshold: if Japanese-script codepoints exceed 5% of total non-whitespace
9 // codepoints, the text is classified as Japanese. This is conservative enough
10 // to avoid false positives from occasional kanji in mixed text.
11 func Detect(text string) uint8 {
12 var hiragana, katakana, cjk, latin, total int
13 14 for i := 0; i < len(text); {
15 r, size := decodeRune(text, i)
16 i += size
17 if isSpace(r) {
18 continue
19 }
20 total++
21 switch {
22 case r >= 0x3040 && r <= 0x309F:
23 hiragana++
24 case r >= 0x30A0 && r <= 0x30FF:
25 katakana++
26 case r >= 0x4E00 && r <= 0x9FFF:
27 cjk++
28 case (r >= 0x0041 && r <= 0x005A) || (r >= 0x0061 && r <= 0x007A):
29 latin++
30 }
31 }
32 33 if total == 0 {
34 return LangUnknown
35 }
36 37 jaScore := hiragana + katakana + cjk
38 // 5% threshold: even one hiragana in a short word is decisive.
39 if jaScore*20 > total {
40 return LangJA
41 }
42 if latin > 0 || total > 0 {
43 return LangEN
44 }
45 return LangUnknown
46 }
// decodeRune decodes one UTF-8 codepoint from s starting at byte offset i.
//
// It returns the decoded rune and the number of bytes consumed. For any
// malformed sequence it returns (U+FFFD, 1) so the caller can resynchronize
// on the next byte. Malformed means: a stray continuation byte, an invalid
// lead byte, a truncated sequence, a trailing byte that is not of the form
// 10xxxxxx, an overlong encoding, a surrogate, or a value above U+10FFFF.
//
// The caller must ensure 0 <= i < len(s); otherwise the index expression
// panics.
func decodeRune(s string, i int) (rune, int) {
	const replacement = 0xFFFD

	lead := s[i]
	if lead < 0x80 {
		return rune(lead), 1
	}

	var (
		trail  int  // number of continuation bytes expected after the lead
		lowest rune // smallest value this length may encode (rejects overlongs)
		r      rune // payload bits accumulated so far
	)
	switch {
	case lead >= 0xC2 && lead <= 0xDF:
		trail, lowest, r = 1, 0x80, rune(lead&0x1F)
	case lead >= 0xE0 && lead <= 0xEF:
		trail, lowest, r = 2, 0x800, rune(lead&0x0F)
	case lead >= 0xF0 && lead <= 0xF4:
		trail, lowest, r = 3, 0x10000, rune(lead&0x07)
	default:
		// 0x80-0xBF are continuation bytes, 0xC0/0xC1 can only start overlong
		// forms, and 0xF5-0xFF never appear in UTF-8.
		return replacement, 1
	}

	if i+trail >= len(s) {
		return replacement, 1 // truncated sequence
	}
	for k := 1; k <= trail; k++ {
		c := s[i+k]
		if c&0xC0 != 0x80 {
			return replacement, 1 // not a continuation byte
		}
		r = r<<6 | rune(c&0x3F)
	}

	if r < lowest || r > 0x10FFFF || (r >= 0xD800 && r <= 0xDFFF) {
		return replacement, 1
	}
	return r, trail + 1
}
// isSpace reports whether r is a whitespace codepoint that should be ignored
// when counting characters: ASCII space, tab, line feed, carriage return,
// no-break space (U+00A0), or ideographic space (U+3000). The non-ASCII spaces
// are written as escapes so they stay visible and survive re-encoding.
func isSpace(r rune) bool {
	switch r {
	case ' ', '\t', '\n', '\r', '\u00A0', '\u3000':
		return true
	}
	return false
}
76