detect.mx raw

   1  package transdb
   2  
   3  // Detect returns LangJA if the text contains a significant fraction of
   4  // Japanese script codepoints (hiragana, katakana, or CJK unified ideographs).
   5  // Returns LangEN for Latin-dominant or ambiguous text.
   6  // Returns LangUnknown for empty or pure-punctuation input.
   7  //
   8  // Threshold: if Japanese-script codepoints exceed 5% of total non-whitespace
   9  // codepoints, the text is classified as Japanese. This is conservative enough
  10  // to avoid false positives from occasional kanji in mixed text.
  11  func Detect(text string) uint8 {
  12  	var hiragana, katakana, cjk, latin, total int
  13  
  14  	for i := 0; i < len(text); {
  15  		r, size := decodeRune(text, i)
  16  		i += size
  17  		if isSpace(r) {
  18  			continue
  19  		}
  20  		total++
  21  		switch {
  22  		case r >= 0x3040 && r <= 0x309F:
  23  			hiragana++
  24  		case r >= 0x30A0 && r <= 0x30FF:
  25  			katakana++
  26  		case r >= 0x4E00 && r <= 0x9FFF:
  27  			cjk++
  28  		case (r >= 0x0041 && r <= 0x005A) || (r >= 0x0061 && r <= 0x007A):
  29  			latin++
  30  		}
  31  	}
  32  
  33  	if total == 0 {
  34  		return LangUnknown
  35  	}
  36  
  37  	jaScore := hiragana + katakana + cjk
  38  	// 5% threshold: even one hiragana in a short word is decisive.
  39  	if jaScore*20 > total {
  40  		return LangJA
  41  	}
  42  	if latin > 0 || total > 0 {
  43  		return LangEN
  44  	}
  45  	return LangUnknown
  46  }
  47  
  48  // decodeRune decodes one UTF-8 codepoint from s starting at offset i.
  49  // Returns the rune and the number of bytes consumed.
  50  func decodeRune(s string, i int) (rune, int) {
  51  	b := s[i]
  52  	if b < 0x80 {
  53  		return rune(b), 1
  54  	}
  55  	if b < 0xE0 {
  56  		if i+1 >= len(s) {
  57  			return 0xFFFD, 1
  58  		}
  59  		return rune(b&0x1F)<<6 | rune(s[i+1]&0x3F), 2
  60  	}
  61  	if b < 0xF0 {
  62  		if i+2 >= len(s) {
  63  			return 0xFFFD, 1
  64  		}
  65  		return rune(b&0x0F)<<12 | rune(s[i+1]&0x3F)<<6 | rune(s[i+2]&0x3F), 3
  66  	}
  67  	if i+3 >= len(s) {
  68  		return 0xFFFD, 1
  69  	}
  70  	return rune(b&0x07)<<18 | rune(s[i+1]&0x3F)<<12 | rune(s[i+2]&0x3F)<<6 | rune(s[i+3]&0x3F), 4
  71  }
  72  
  73  func isSpace(r rune) bool {
  74  	return r == ' ' || r == '\t' || r == '\n' || r == '\r' || r == '　'
  75  }
  76