enmorph.mx raw
1 package ingest
2
3 import (
4 "git.smesh.lol/iskradb/lattice"
5 "git.smesh.lol/transdb"
6 )
7
// enIrreg is one row of the English irregular-verb table: the surface forms
// of a verb that are listed explicitly instead of being derived by the
// regular spelling rules.
type enIrreg struct {
	base        string // bare infinitive, e.g. "sing"
	past        string // simple past, e.g. "sang"
	participle  string // past participle, e.g. "sung"
	progressive string // -ing form, e.g. "singing"
}
12
// enIrregs lists about 100 common English irregular verbs.
//
// Each row is {base, past, participle, progressive}:
// base=infinitive stem, past=synthetic past, participle=past participle,
// progressive=stem+"ing" (listed explicitly for irregular spelling).
//
// findIrreg scans this slice front to back and returns the first row whose
// base matches, so every base must appear exactly once; a repeated row would
// be unreachable. Only one past form is stored per verb, so "be" carries the
// singular "was" and not "were".
var enIrregs = []enIrreg{
	{"be", "was", "been", "being"},
	{"eat", "ate", "eaten", "eating"},
	{"sing", "sang", "sung", "singing"},
	{"write", "wrote", "written", "writing"},
	{"break", "broke", "broken", "breaking"},
	{"go", "went", "gone", "going"},
	{"come", "came", "come", "coming"},
	{"see", "saw", "seen", "seeing"},
	{"take", "took", "taken", "taking"},
	{"give", "gave", "given", "giving"},
	{"make", "made", "made", "making"},
	{"know", "knew", "known", "knowing"},
	{"think", "thought", "thought", "thinking"},
	{"say", "said", "said", "saying"},
	{"get", "got", "gotten", "getting"},
	{"find", "found", "found", "finding"},
	{"tell", "told", "told", "telling"},
	{"buy", "bought", "bought", "buying"},
	{"bring", "brought", "brought", "bringing"},
	{"read", "read", "read", "reading"},
	{"run", "ran", "run", "running"},
	{"speak", "spoke", "spoken", "speaking"},
	{"drink", "drank", "drunk", "drinking"},
	{"swim", "swam", "swum", "swimming"},
	{"begin", "began", "begun", "beginning"},
	{"drive", "drove", "driven", "driving"},
	{"fly", "flew", "flown", "flying"},
	{"grow", "grew", "grown", "growing"},
	{"throw", "threw", "thrown", "throwing"},
	{"catch", "caught", "caught", "catching"},
	{"teach", "taught", "taught", "teaching"},
	{"hold", "held", "held", "holding"},
	{"stand", "stood", "stood", "standing"},
	{"understand", "understood", "understood", "understanding"},
	{"lose", "lost", "lost", "losing"},
	{"pay", "paid", "paid", "paying"},
	{"meet", "met", "met", "meeting"},
	{"sit", "sat", "sat", "sitting"},
	{"lead", "led", "led", "leading"},
	{"fall", "fell", "fallen", "falling"},
	{"feel", "felt", "felt", "feeling"},
	{"keep", "kept", "kept", "keeping"},
	{"leave", "left", "left", "leaving"},
	{"mean", "meant", "meant", "meaning"},
	{"build", "built", "built", "building"},
	{"send", "sent", "sent", "sending"},
	{"spend", "spent", "spent", "spending"},
	{"win", "won", "won", "winning"},
	{"draw", "drew", "drawn", "drawing"},
	{"choose", "chose", "chosen", "choosing"},
	{"wear", "wore", "worn", "wearing"},
	{"rise", "rose", "risen", "rising"},
	{"hide", "hid", "hidden", "hiding"},
	{"forget", "forgot", "forgotten", "forgetting"},
	{"freeze", "froze", "frozen", "freezing"},
	{"shake", "shook", "shaken", "shaking"},
	{"steal", "stole", "stolen", "stealing"},
	{"blow", "blew", "blown", "blowing"},
	{"show", "showed", "shown", "showing"},
	{"beat", "beat", "beaten", "beating"},
	{"bite", "bit", "bitten", "biting"},
	{"tear", "tore", "torn", "tearing"},
	{"wake", "woke", "woken", "waking"},
	{"ride", "rode", "ridden", "riding"},
	{"ring", "rang", "rung", "ringing"},
	{"do", "did", "done", "doing"},
	{"have", "had", "had", "having"},
	{"fight", "fought", "fought", "fighting"},
	{"cut", "cut", "cut", "cutting"},
	{"put", "put", "put", "putting"},
	{"hit", "hit", "hit", "hitting"},
	{"let", "let", "let", "letting"},
	{"set", "set", "set", "setting"},
	{"hurt", "hurt", "hurt", "hurting"},
	{"shoot", "shot", "shot", "shooting"},
	{"sell", "sold", "sold", "selling"},
	{"spell", "spelt", "spelt", "spelling"},
	{"smell", "smelt", "smelt", "smelling"},
	{"sleep", "slept", "slept", "sleeping"},
	{"sweep", "swept", "swept", "sweeping"},
	{"kneel", "knelt", "knelt", "kneeling"},
	{"deal", "dealt", "dealt", "dealing"},
	{"hear", "heard", "heard", "hearing"},
	{"learn", "learnt", "learnt", "learning"},
	{"burn", "burnt", "burnt", "burning"},
	{"bend", "bent", "bent", "bending"},
	{"lend", "lent", "lent", "lending"},
	{"hang", "hung", "hung", "hanging"},
	{"strike", "struck", "struck", "striking"},
	{"stick", "stuck", "stuck", "sticking"},
	{"dig", "dug", "dug", "digging"},
	{"spin", "spun", "spun", "spinning"},
	{"swing", "swung", "swung", "swinging"},
	{"cling", "clung", "clung", "clinging"},
	{"sink", "sank", "sunk", "sinking"},
	{"spring", "sprang", "sprung", "springing"},
	{"shrink", "shrank", "shrunk", "shrinking"},
	{"bind", "bound", "bound", "binding"},
	{"wind", "wound", "wound", "winding"},
	{"grind", "ground", "ground", "grinding"},
}
122
// findIrreg looks base up in enIrregs. It returns the first row whose base
// field equals base together with true, or the zero enIrreg and false when
// no row matches.
func findIrreg(base string) (enIrreg, bool) {
	for i := 0; i < len(enIrregs); i++ {
		if row := enIrregs[i]; row.base == base {
			return row, true
		}
	}
	var none enIrreg
	return none, false
}
132
// isVowel reports whether c is one of the lowercase ASCII vowels
// a, e, i, o, u. Uppercase letters and 'y' are not treated as vowels.
func isVowel(c byte) bool {
	switch c {
	case 'a', 'e', 'i', 'o', 'u':
		return true
	default:
		return false
	}
}
136
// regularPast returns the -ed form (simple past, which for regular verbs is
// also the past participle) of a regular English verb.
//
// base is expected to be a lowercase ASCII stem; an empty base is returned
// unchanged. The rules are applied in this order:
//
//   - final 'e' takes "d":            "love" -> "loved", "agree" -> "agreed",
//     "tie" -> "tied"
//   - consonant + 'y' becomes "ied":  "try" -> "tried" ("play" -> "played")
//   - consonant-vowel-consonant ending doubles the last letter:
//     "stop" -> "stopped"
//   - otherwise "ed" is appended:     "walk" -> "walked"
//
// The doubling rule is a spelling heuristic only: it looks at the last three
// letters and does not check syllable count or stress, so polysyllabic stems
// with an unstressed final syllable are doubled as well ("visit" yields
// "visitted").
func regularPast(base string) string {
	n := len(base)
	if n == 0 {
		return base
	}
	last := base[n-1]
	// Every stem ending in 'e' just takes "d". The earlier version skipped
	// stems ending in "ee"/"ie" here, which then fell through to the plain
	// "+ed" case and produced "agreeed" / "tieed".
	if last == 'e' {
		return base | "d"
	}
	if last == 'y' && n >= 2 && !isVowel(base[n-2]) {
		return base[:n-1] | "ied"
	}
	// CVC doubling; w, x and y are never doubled ("show", "fix", "play").
	if n >= 3 && !isVowel(last) && last != 'w' && last != 'x' && last != 'y' &&
		isVowel(base[n-2]) && !isVowel(base[n-3]) {
		return base | string([]byte{last}) | "ed"
	}
	return base | "ed"
}
157
// regularProg returns the -ing form of a regular English verb.
//
// base is expected to be a lowercase ASCII stem; an empty base is returned
// unchanged. The rules are applied in this order:
//
//   - final "ie" becomes "ying":        "tie" -> "tying", "die" -> "dying"
//   - final "ee", "oe", "ye" keep the e: "agree" -> "agreeing",
//     "hoe" -> "hoeing", "dye" -> "dyeing"
//   - any other final 'e' is dropped:   "love" -> "loving"
//   - consonant-vowel-consonant ending doubles the last letter:
//     "stop" -> "stopping"
//   - otherwise "ing" is appended:      "walk" -> "walking"
//
// The doubling rule is a spelling heuristic only: it looks at the last three
// letters and does not check syllable count or stress, so polysyllabic stems
// with an unstressed final syllable are doubled as well ("visit" yields
// "visitting").
func regularProg(base string) string {
	n := len(base)
	if n == 0 {
		return base
	}
	last := base[n-1]
	if last == 'e' && n >= 2 {
		switch base[n-2] {
		case 'i':
			// Previously fell through to "+ing" and produced "tieing".
			return base[:n-2] | "ying"
		case 'e', 'o', 'y':
			// Dropping the e here would give "agreing", "hoing", or turn
			// "dye" into "dying".
			return base | "ing"
		default:
			return base[:n-1] | "ing"
		}
	}
	// CVC doubling; w, x and y are never doubled ("show", "fix", "play").
	if n >= 3 && !isVowel(last) && last != 'w' && last != 'x' && last != 'y' &&
		isVowel(base[n-2]) && !isVowel(base[n-3]) {
		return base | string([]byte{last}) | "ing"
	}
	return base | "ing"
}
175
// regular3sg returns the third-person singular present form of a verb built
// by suffixing its base.
//
// base is expected to be a lowercase ASCII stem; an empty base is returned
// unchanged. The rules are applied in this order:
//
//   - consonant + 'y' becomes "ies":   "try" -> "tries" ("play" -> "plays")
//   - consonant + 'o' takes "es":      "go" -> "goes", "do" -> "does",
//     "echo" -> "echoes" ("radio" -> "radios")
//   - final s, x, z, "sh", "ch" take "es": "pass" -> "passes",
//     "watch" -> "watches"
//   - otherwise "s" is appended:       "walk" -> "walks"
//
// Suppletive forms ("be" -> "is", "have" -> "has") are not handled here.
// The consonant + 'o' rule is a generalisation; a few verbs such as "solo"
// conventionally take a bare "s".
func regular3sg(base string) string {
	n := len(base)
	if n == 0 {
		return base
	}
	last := base[n-1]
	if last == 'y' && n >= 2 && !isVowel(base[n-2]) {
		return base[:n-1] | "ies"
	}
	// Previously these stems fell through to the bare "+s" case and
	// produced "gos", "dos", "echos".
	if last == 'o' && n >= 2 && !isVowel(base[n-2]) {
		return base | "es"
	}
	if last == 's' || last == 'x' || last == 'z' ||
		(n >= 2 && base[n-2:] == "sh") || (n >= 2 && base[n-2:] == "ch") {
		return base | "es"
	}
	return base | "s"
}
192
// stripInfinitive pulls the bare verb out of a "to X" gloss.
//
// It returns X only when the gloss starts with "to " and X is at least two
// bytes long and made up solely of lowercase ASCII letters and hyphens.
// Anything else (no "to " prefix, multi-word or parenthetical glosses,
// uppercase or non-ASCII text, a one-letter stem) yields "".
func stripInfinitive(gloss string) string {
	// "to " plus a stem of at least two bytes needs five bytes in total.
	if len(gloss) < 5 {
		return ""
	}
	if gloss[:3] != "to " {
		return ""
	}
	stem := gloss[3:]
	for i := 0; i < len(stem); i++ {
		ch := stem[i]
		lower := 'a' <= ch && ch <= 'z'
		if !lower && ch != '-' {
			return "" // space, parenthesis, digit, uppercase, non-ASCII, ...
		}
	}
	return stem
}
211
// GenerateENForms inserts EN surface form lattice entries for a Bverb entry.
// Mirrors GenerateConjugations for the EN side: synthetic past, progressive,
// 3sg concordance, and participle forms all point to the same JA record.
//
// Parameters:
//   - db:         database whose Tree and StringPool receive the new records.
//   - enGloss:    English gloss; only single-word "to X" glosses are
//     processed (see stripInfinitive), anything else is a no-op.
//   - branchByte: the packed Branch, stored on every inserted record.
//   - jaRI:       the primary JA translation record; each inserted record
//     gets Link[0] = jaRI. lattice.NullRec makes the call a no-op.
//
// A form is skipped when it is empty, equal to the base, or when its key is
// already present in the Bverb branch, so existing records are never
// overwritten.
func GenerateENForms(db *DB, enGloss string, branchByte uint8, jaRI uint32) {
	if jaRI == lattice.NullRec {
		return
	}
	base := stripInfinitive(enGloss)
	if base == "" {
		return
	}

	branch := lattice.Bverb

	var past, participle, progressive string
	if irreg, found := findIrreg(base); found {
		past = irreg.past
		participle = irreg.participle
		progressive = irreg.progressive
	} else {
		past = regularPast(base)
		participle = past // regular: participle = past
		progressive = regularProg(base)
	}
	// A few verbs have a 3sg form that suffixing cannot produce
	// ("have" -> "has", not "haves"); take those from the override list.
	sg3, ok := irregular3sg(base)
	if !ok {
		sg3 = regular3sg(base)
	}

	type formEntry struct {
		word  string
		state uint8
	}
	forms := []formEntry{
		{past, transdb.MorphPastAffPlain},         // synthetic past: "sang"
		{progressive, transdb.MorphPresProgPlain}, // progressive: "singing"
		{sg3, transdb.MorphPresAffPlain},          // 3sg concordance: "sings"
	}
	// Participle if distinct from past (e.g. "sung" vs "sang").
	// NOTE(review): the participle is tagged MorphPastProgPlain; presumably
	// the closest available morph state. Confirm against transdb.
	if participle != "" && participle != past && participle != base {
		forms = append(forms, formEntry{participle, transdb.MorphPastProgPlain})
	}

	for _, f := range forms {
		if f.word == "" || f.word == base {
			continue
		}
		key := transdb.MakeKey(transdb.LangEN, 0, f.word)
		if db.Tree.LookupRecIdx(branch, key) != lattice.NullRec {
			continue // key already present: keep the existing record
		}
		var rec lattice.Record
		transdb.SetFormOnRecord(&rec, f.word, &db.StringPool)
		rec.Branch = branchByte
		transdb.SetMorphState(&rec, f.state)
		// InsertRec resets links — wire after.
		newRI := db.Tree.InsertRec(branch, key, rec)
		if r := db.Tree.GetRecord(newRI); r != nil {
			r.Link[0] = jaRI
		}
	}
}

// irregular3sg returns the third-person singular present form for the few
// verbs whose 3sg is not the base plus a plain "s", and reports whether base
// is one of them.
func irregular3sg(base string) (string, bool) {
	switch base {
	case "be":
		return "is", true
	case "have":
		return "has", true
	case "do":
		return "does", true
	case "go":
		return "goes", true
	}
	return "", false
}
273