package transdb

// Detect returns LangJA if the text contains a significant fraction of
// Japanese script codepoints (hiragana, katakana, or CJK unified ideographs).
// Returns LangEN for Latin-dominant or ambiguous text.
// Returns LangUnknown for empty or pure-punctuation input.
//
// Threshold: if Japanese-script codepoints exceed 5% of total non-whitespace
// codepoints, the text is classified as Japanese. This is conservative enough
// to avoid false positives from occasional kanji in mixed text.
func Detect(text string) uint8 {
	var hiragana, katakana, cjk, latin, total int

	for i := 0; i < len(text); {
		r, size := decodeRune(text, i)
		i += size
		if isSpace(r) {
			continue
		}
		total++
		switch {
		case r >= 0x3040 && r <= 0x309F:
			hiragana++
		case r >= 0x30A0 && r <= 0x30FF:
			katakana++
		case r >= 0x4E00 && r <= 0x9FFF:
			cjk++
		case (r >= 0x0041 && r <= 0x005A) || (r >= 0x0061 && r <= 0x007A):
			latin++
		}
	}

	if total == 0 {
		return LangUnknown
	}

	jaScore := hiragana + katakana + cjk
	// 5% threshold: even one hiragana in a short word is decisive.
	if jaScore*20 > total {
		return LangJA
	}
	if latin > 0 || total > 0 {
		return LangEN
	}
	return LangUnknown
}

// decodeRune decodes one UTF-8 codepoint from s starting at byte offset i.
// It returns the rune and the number of bytes consumed.
//
// The offset must satisfy 0 <= i < len(s); anything else panics with an
// index-out-of-range error.
//
// Malformed input never consumes more than one byte: for a stray continuation
// byte, an invalid lead byte, a truncated sequence, a bad continuation byte,
// an overlong encoding, a surrogate half, or a value above U+10FFFF the result
// is (U+FFFD, 1), so callers can resynchronize on the very next byte.
func decodeRune(s string, i int) (rune, int) {
	const replacement = 0xFFFD

	lead := s[i]
	if lead < 0x80 {
		return rune(lead), 1
	}

	// Classify the lead byte: how many continuation bytes follow, which
	// payload bits it carries, and the smallest codepoint that legitimately
	// needs a sequence of this length (used to reject overlong encodings).
	var (
		extra  int
		r      rune
		lowest rune
	)
	switch {
	case lead&0xE0 == 0xC0:
		extra, r, lowest = 1, rune(lead&0x1F), 0x80
	case lead&0xF0 == 0xE0:
		extra, r, lowest = 2, rune(lead&0x0F), 0x800
	case lead&0xF8 == 0xF0:
		extra, r, lowest = 3, rune(lead&0x07), 0x10000
	default:
		// Stray continuation byte (0x80-0xBF) or invalid lead (0xF8-0xFF).
		return replacement, 1
	}

	// Truncated sequence at the end of the string.
	if i+extra >= len(s) {
		return replacement, 1
	}

	for k := 1; k <= extra; k++ {
		c := s[i+k]
		// Every continuation byte must look like 10xxxxxx.
		if c&0xC0 != 0x80 {
			return replacement, 1
		}
		r = r<<6 | rune(c&0x3F)
	}

	// Reject overlong forms, UTF-16 surrogate halves and out-of-range values.
	if r < lowest || r > 0x10FFFF || (r >= 0xD800 && r <= 0xDFFF) {
		return replacement, 1
	}
	return r, extra + 1
}

// isSpace reports whether r is one of the whitespace codepoints ignored during
// detection: ASCII space, tab, line feed, carriage return, or the ideographic
// space U+3000.
func isSpace(r rune) bool {
	switch r {
	case ' ', '\t', '\n', '\r', '\u3000':
		return true
	default:
		return false
	}
}