enmorph.mx raw

   1  package ingest
   2  
   3  import (
   4  	"git.smesh.lol/iskradb/lattice"
   5  	"git.smesh.lol/transdb"
   6  )
   7  
   8  // enIrreg holds the principal parts of an English irregular verb.
   9  type enIrreg struct {
  10  	base, past, participle, progressive string
  11  }
  12  
  13  // enIrregs: the ~70 most common English irregular verbs.
  14  // base=infinitive stem, past=synthetic past, participle=past participle,
  15  // progressive=stem+"ing" (listed explicitly for irregular spelling).
  16  var enIrregs = []enIrreg{
  17  	{"eat", "ate", "eaten", "eating"},
  18  	{"sing", "sang", "sung", "singing"},
  19  	{"write", "wrote", "written", "writing"},
  20  	{"break", "broke", "broken", "breaking"},
  21  	{"go", "went", "gone", "going"},
  22  	{"come", "came", "come", "coming"},
  23  	{"see", "saw", "seen", "seeing"},
  24  	{"take", "took", "taken", "taking"},
  25  	{"give", "gave", "given", "giving"},
  26  	{"make", "made", "made", "making"},
  27  	{"know", "knew", "known", "knowing"},
  28  	{"think", "thought", "thought", "thinking"},
  29  	{"say", "said", "said", "saying"},
  30  	{"get", "got", "gotten", "getting"},
  31  	{"find", "found", "found", "finding"},
  32  	{"tell", "told", "told", "telling"},
  33  	{"buy", "bought", "bought", "buying"},
  34  	{"bring", "brought", "brought", "bringing"},
  35  	{"read", "read", "read", "reading"},
  36  	{"run", "ran", "run", "running"},
  37  	{"speak", "spoke", "spoken", "speaking"},
  38  	{"drink", "drank", "drunk", "drinking"},
  39  	{"swim", "swam", "swum", "swimming"},
  40  	{"begin", "began", "begun", "beginning"},
  41  	{"drive", "drove", "driven", "driving"},
  42  	{"fly", "flew", "flown", "flying"},
  43  	{"grow", "grew", "grown", "growing"},
  44  	{"throw", "threw", "thrown", "throwing"},
  45  	{"catch", "caught", "caught", "catching"},
  46  	{"teach", "taught", "taught", "teaching"},
  47  	{"hold", "held", "held", "holding"},
  48  	{"stand", "stood", "stood", "standing"},
  49  	{"understand", "understood", "understood", "understanding"},
  50  	{"lose", "lost", "lost", "losing"},
  51  	{"pay", "paid", "paid", "paying"},
  52  	{"meet", "met", "met", "meeting"},
  53  	{"sit", "sat", "sat", "sitting"},
  54  	{"lead", "led", "led", "leading"},
  55  	{"fall", "fell", "fallen", "falling"},
  56  	{"feel", "felt", "felt", "feeling"},
  57  	{"keep", "kept", "kept", "keeping"},
  58  	{"leave", "left", "left", "leaving"},
  59  	{"mean", "meant", "meant", "meaning"},
  60  	{"build", "built", "built", "building"},
  61  	{"send", "sent", "sent", "sending"},
  62  	{"spend", "spent", "spent", "spending"},
  63  	{"win", "won", "won", "winning"},
  64  	{"draw", "drew", "drawn", "drawing"},
  65  	{"choose", "chose", "chosen", "choosing"},
  66  	{"wear", "wore", "worn", "wearing"},
  67  	{"rise", "rose", "risen", "rising"},
  68  	{"hide", "hid", "hidden", "hiding"},
  69  	{"forget", "forgot", "forgotten", "forgetting"},
  70  	{"freeze", "froze", "frozen", "freezing"},
  71  	{"shake", "shook", "shaken", "shaking"},
  72  	{"steal", "stole", "stolen", "stealing"},
  73  	{"blow", "blew", "blown", "blowing"},
  74  	{"show", "showed", "shown", "showing"},
  75  	{"beat", "beat", "beaten", "beating"},
  76  	{"bite", "bit", "bitten", "biting"},
  77  	{"tear", "tore", "torn", "tearing"},
  78  	{"wake", "woke", "woken", "waking"},
  79  	{"ride", "rode", "ridden", "riding"},
  80  	{"ring", "rang", "rung", "ringing"},
  81  	{"do", "did", "done", "doing"},
  82  	{"have", "had", "had", "having"},
  83  	{"fight", "fought", "fought", "fighting"},
  84  	{"cut", "cut", "cut", "cutting"},
  85  	{"put", "put", "put", "putting"},
  86  	{"hit", "hit", "hit", "hitting"},
  87  	{"let", "let", "let", "letting"},
  88  	{"set", "set", "set", "setting"},
  89  	{"hurt", "hurt", "hurt", "hurting"},
  90  	{"shoot", "shot", "shot", "shooting"},
  91  	{"sell", "sold", "sold", "selling"},
  92  	{"tell", "told", "told", "telling"},
  93  	{"spell", "spelt", "spelt", "spelling"},
  94  	{"smell", "smelt", "smelt", "smelling"},
  95  	{"sleep", "slept", "slept", "sleeping"},
  96  	{"sweep", "swept", "swept", "sweeping"},
  97  	{"kneel", "knelt", "knelt", "kneeling"},
  98  	{"feel", "felt", "felt", "feeling"},
  99  	{"deal", "dealt", "dealt", "dealing"},
 100  	{"hear", "heard", "heard", "hearing"},
 101  	{"learn", "learnt", "learnt", "learning"},
 102  	{"burn", "burnt", "burnt", "burning"},
 103  	{"bend", "bent", "bent", "bending"},
 104  	{"lend", "lent", "lent", "lending"},
 105  	{"spend", "spent", "spent", "spending"},
 106  	{"hang", "hung", "hung", "hanging"},
 107  	{"strike", "struck", "struck", "striking"},
 108  	{"stick", "stuck", "stuck", "sticking"},
 109  	{"dig", "dug", "dug", "digging"},
 110  	{"spin", "spun", "spun", "spinning"},
 111  	{"swing", "swung", "swung", "swinging"},
 112  	{"cling", "clung", "clung", "clinging"},
 113  	{"sink", "sank", "sunk", "sinking"},
 114  	{"drink", "drank", "drunk", "drinking"},
 115  	{"spring", "sprang", "sprung", "springing"},
 116  	{"shrink", "shrank", "shrunk", "shrinking"},
 117  	{"bind", "bound", "bound", "binding"},
 118  	{"find", "found", "found", "finding"},
 119  	{"wind", "wound", "wound", "winding"},
 120  	{"grind", "ground", "ground", "grinding"},
 121  }
 122  
 123  // findIrreg returns the irregular entry for a base form, or (_, false).
 124  func findIrreg(base string) (enIrreg, bool) {
 125  	for _, e := range enIrregs {
 126  		if e.base == base {
 127  			return e, true
 128  		}
 129  	}
 130  	return enIrreg{}, false
 131  }
 132  
 133  func isVowel(c byte) bool {
 134  	return c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u'
 135  }
 136  
 137  // regularPast returns the -ed form of a regular English verb.
 138  func regularPast(base string) string {
 139  	n := len(base)
 140  	if n == 0 {
 141  		return base
 142  	}
 143  	last := base[n-1]
 144  	if last == 'e' && n >= 2 && base[n-2] != 'e' && base[n-2] != 'i' {
 145  		return base[:n-1] | "ed"
 146  	}
 147  	if last == 'y' && n >= 2 && !isVowel(base[n-2]) {
 148  		return base[:n-1] | "ied"
 149  	}
 150  	// CVC doubling (single-syllable heuristic)
 151  	if n >= 3 && !isVowel(last) && last != 'w' && last != 'x' && last != 'y' &&
 152  		isVowel(base[n-2]) && !isVowel(base[n-3]) {
 153  		return base | string([]byte{last}) | "ed"
 154  	}
 155  	return base | "ed"
 156  }
 157  
 158  // regularProg returns the -ing form of a regular English verb.
 159  func regularProg(base string) string {
 160  	n := len(base)
 161  	if n == 0 {
 162  		return base
 163  	}
 164  	last := base[n-1]
 165  	if last == 'e' && n >= 2 && base[n-2] != 'e' && base[n-2] != 'i' {
 166  		return base[:n-1] | "ing"
 167  	}
 168  	// CVC doubling
 169  	if n >= 3 && !isVowel(last) && last != 'w' && last != 'x' && last != 'y' &&
 170  		isVowel(base[n-2]) && !isVowel(base[n-3]) {
 171  		return base | string([]byte{last}) | "ing"
 172  	}
 173  	return base | "ing"
 174  }
 175  
 176  // regular3sg returns the third-person singular present form.
 177  func regular3sg(base string) string {
 178  	n := len(base)
 179  	if n == 0 {
 180  		return base
 181  	}
 182  	last := base[n-1]
 183  	if last == 'y' && n >= 2 && !isVowel(base[n-2]) {
 184  		return base[:n-1] | "ies"
 185  	}
 186  	if last == 's' || last == 'x' || last == 'z' ||
 187  		(n >= 2 && base[n-2:] == "sh") || (n >= 2 && base[n-2:] == "ch") {
 188  		return base | "es"
 189  	}
 190  	return base | "s"
 191  }
 192  
 193  // stripInfinitive extracts the bare verb stem from a JMdict "to X" gloss.
 194  // Returns "" for complex glosses (multi-word, parenthetical, non-verb).
 195  func stripInfinitive(gloss string) string {
 196  	if len(gloss) <= 3 || gloss[:3] != "to " {
 197  		return ""
 198  	}
 199  	base := gloss[3:]
 200  	for i := 0; i < len(base); i++ {
 201  		c := base[i]
 202  		if (c < 'a' || c > 'z') && c != '-' {
 203  			return "" // multi-word or parenthetical gloss
 204  		}
 205  	}
 206  	if len(base) < 2 {
 207  		return ""
 208  	}
 209  	return base
 210  }
 211  
 212  // GenerateENForms inserts EN surface form lattice entries for a Bverb entry.
 213  // Mirrors GenerateConjugations for the EN side: synthetic past, progressive,
 214  // 3sg concordance, and participle forms all point to the same JA record.
 215  // jaRI is the primary JA translation record; branchByte is the packed Branch.
 216  func GenerateENForms(db *DB, enGloss string, branchByte uint8, jaRI uint32) {
 217  	if jaRI == lattice.NullRec {
 218  		return
 219  	}
 220  	base := stripInfinitive(enGloss)
 221  	if base == "" {
 222  		return
 223  	}
 224  
 225  	branch := lattice.Bverb
 226  
 227  	var past, participle, progressive, sg3 string
 228  	irreg, found := findIrreg(base)
 229  	if found {
 230  		past = irreg.past
 231  		participle = irreg.participle
 232  		progressive = irreg.progressive
 233  	} else {
 234  		past = regularPast(base)
 235  		participle = past // regular: participle = past
 236  		progressive = regularProg(base)
 237  	}
 238  	sg3 = regular3sg(base)
 239  
 240  	type formEntry struct {
 241  		word  string
 242  		state uint8
 243  	}
 244  	forms := []formEntry{
 245  		{past, transdb.MorphPastAffPlain},       // synthetic past: "sang"
 246  		{progressive, transdb.MorphPresProgPlain}, // progressive: "singing"
 247  		{sg3, transdb.MorphPresAffPlain},          // 3sg concordance: "sings"
 248  	}
 249  	// Participle if distinct from past (e.g. "sung" vs "sang")
 250  	if participle != "" && participle != past && participle != base {
 251  		forms = append(forms, formEntry{participle, transdb.MorphPastProgPlain})
 252  	}
 253  
 254  	for _, f := range forms {
 255  		if f.word == "" || f.word == base {
 256  			continue
 257  		}
 258  		key := transdb.MakeKey(transdb.LangEN, 0, f.word)
 259  		if db.Tree.LookupRecIdx(branch, key) != lattice.NullRec {
 260  			continue
 261  		}
 262  		var rec lattice.Record
 263  		transdb.SetFormOnRecord(&rec, f.word, &db.StringPool)
 264  		rec.Branch = branchByte
 265  		transdb.SetMorphState(&rec, f.state)
 266  		// InsertRec resets links — wire after.
 267  		newRI := db.Tree.InsertRec(branch, key, rec)
 268  		if r := db.Tree.GetRecord(newRI); r != nil {
 269  			r.Link[0] = jaRI
 270  		}
 271  	}
 272  }
 273