tokenize.mx raw

   1  package transdb
   2  
   3  import (
   4  	"bytes"
   5  
   6  	"git.smesh.lol/iskradb/lattice"
   7  )
   8  
   9  // TokenizeEN splits English text on whitespace and punctuation, lowercases
  10  // each token, and strips trailing possessive 's.
  11  func TokenizeEN(text string) []string {
  12  	var tokens []string
  13  	b := []byte(text)
  14  	start := -1
  15  	for i := 0; i <= len(b); i++ {
  16  		var c byte
  17  		if i < len(b) {
  18  			c = b[i]
  19  		}
  20  		isAlnum := (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9')
  21  		if isAlnum {
  22  			if start < 0 {
  23  				start = i
  24  			}
  25  		} else {
  26  			if start >= 0 {
  27  				tok := bytes.ToLower(b[start:i])
  28  				// Strip trailing 's or s'
  29  				if len(tok) > 2 && tok[len(tok)-2] == '\'' && tok[len(tok)-1] == 's' {
  30  					tok = tok[:len(tok)-2]
  31  				}
  32  				tokens = append(tokens, string(tok))
  33  				start = -1
  34  			}
  35  		}
  36  	}
  37  	return tokens
  38  }
  39  
  40  // TokenizeJA segments Japanese text using forward maximum-match against the
  41  // translation lattice. Branch search order adapts to syntactic context so
  42  // particles (Bmodifier) are preferred over noun readings in particle positions.
  43  func TokenizeJA(text string, tree *lattice.Tree, verbose bool) []string {
  44  	var tokens []string
  45  	var prevPOS uint8
  46  	for len(text) > 0 {
  47  		tok, consumed, branch := maxMatchJA(text, tree, prevPOS)
  48  		if tok != "" {
  49  			tokens = append(tokens, tok)
  50  			if branch < 3 {
  51  				prevPOS = branch + 1 // 1=noun, 2=verb, 3=modifier
  52  			} else {
  53  				prevPOS = 0
  54  			}
  55  		}
  56  		text = text[consumed:]
  57  		if verbose && tok != "" {
  58  			println("segment:", tok)
  59  		}
  60  	}
  61  	return tokens
  62  }
  63  
  64  // maxMatchJA tries to match the longest prefix of text against the JA lattice.
  65  // prevPOS carries the POS of the preceding content token (1=noun, 2=verb,
  66  // 3=modifier, 0=unknown) for context-guided branch ordering.
  67  // Returns (matched_form, bytes_consumed, branch). Branch=255 means no match.
  68  // Punctuation and CJK symbols pass through silently (empty form, 1 codepoint consumed).
  69  func maxMatchJA(text string, tree *lattice.Tree, prevPOS uint8) (string, int, uint8) {
  70  	_, r0size := decodeRune(text, 0)
  71  
  72  	// Fast path: punctuation and non-word characters pass through silently.
  73  	// CJK punctuation: U+3000-U+303F (。、・「」〜 etc.)
  74  	// Full-width: U+FF00-U+FFEF
  75  	// ASCII: < 0x80
  76  	if len(text) >= 3 && text[0] == 0xE3 {
  77  		b1 := text[1]
  78  		if b1 == 0x80 && text[2] <= 0xBF { // U+3000-U+303F
  79  			return "", r0size, 255
  80  		}
  81  		if b1 == 0x83 && text[2] == 0xBC { // ー U+30FC alone (handled by reading aliases in compounds)
  82  			// only skip if no longer match found below
  83  		}
  84  	}
  85  	if text[0] < 0x80 { // ASCII punctuation/spaces
  86  		return "", r0size, 255
  87  	}
  88  	// Full-width punctuation U+FF00-U+FFEF
  89  	if len(text) >= 3 && text[0] == 0xEF && text[1] == 0xBC {
  90  		return "", r0size, 255
  91  	}
  92  
  93  	// Build codepoint boundary offsets.
  94  	offsets := []int{:0:32}
  95  	i := 0
  96  	for i < len(text) {
  97  		offsets = append(offsets, i)
  98  		_, size := decodeRune(text, i)
  99  		i += size
 100  	}
 101  	offsets = append(offsets, len(text))
 102  
 103  	// Branch order: noun-preceding context → try Bmodifier first.
 104  	// prevPOS from POSForWord returns 1/2/3 matching CooccurNominal/Verbal/Function
 105  	coord := PackCoord(0, 0, CoordCooccur(prevPOS, 0), 0, 0, 0, 0)
 106  	order := branchOrderJA(coord)
 107  
 108  	// Try from longest to shortest prefix. At each length, check:
 109  	// 1. coord=0 (base form, most common)
 110  	// 2. morph coord (conjugated form — must check at the SAME length before
 111  	//    shortening, or else "開け"(noun) beats "開けた"(past of 開ける))
 112  	// 3. verbStem de-stem (for forms not yet in lattice)
 113  	for end := len(offsets) - 1; end >= 2; end-- {
 114  		prefix := text[:offsets[end]]
 115  
 116  		// coord=0 lookup.
 117  		key0 := MakeKey(LangJA, 0, prefix)
 118  		for _, b := range order {
 119  			if tree.LookupRecIdx(lattice.Branch(b), key0) != lattice.NullRec {
 120  				return prefix, offsets[end], b
 121  			}
 122  		}
 123  
 124  		// Morph coord lookup — only for ≥3 codepoints (conjugated forms).
 125  		if end >= 3 {
 126  			ms := inferMorphState(prefix)
 127  			if ms != 0 {
 128  				coordM := PackCoord(0, 0, 0, uint64(ms), 0, 0, 0)
 129  				keyM := MakeKey(LangJA, coordM, prefix)
 130  				for _, b := range ActiveBranches {
 131  					if tree.LookupRecIdx(b, keyM) != lattice.NullRec {
 132  						return prefix, offsets[end], 255
 133  					}
 134  				}
 135  			}
 136  			// verbStem de-stem for forms not yet at morph coord.
 137  			for _, stem := range verbStems(prefix) {
 138  				keyS := MakeKey(LangJA, 0, stem)
 139  				for _, b := range ActiveBranches {
 140  					if tree.LookupRecIdx(b, keyS) != lattice.NullRec {
 141  						return prefix, offsets[end], 255
 142  					}
 143  				}
 144  			}
 145  		}
 146  	}
 147  
 148  	// No match - consume first codepoint as an opaque token.
 149  	return text[:r0size], r0size, 255
 150  }
 151