package ingest

import (
	"git.smesh.lol/iskradb/lattice"
	"git.smesh.lol/transdb"
)

// enIrreg holds the principal parts of an English irregular verb.
type enIrreg struct {
	base, past, participle, progressive string
}

// enIrregs lists roughly 100 common English irregular verbs, one entry per
// base form.
//
// base=infinitive stem, past=synthetic past, participle=past participle,
// progressive=stem+"ing" (listed explicitly for irregular spelling).
var enIrregs = []enIrreg{
	{"eat", "ate", "eaten", "eating"},
	{"sing", "sang", "sung", "singing"},
	{"write", "wrote", "written", "writing"},
	{"break", "broke", "broken", "breaking"},
	{"go", "went", "gone", "going"},
	{"come", "came", "come", "coming"},
	{"see", "saw", "seen", "seeing"},
	{"take", "took", "taken", "taking"},
	{"give", "gave", "given", "giving"},
	{"make", "made", "made", "making"},
	{"know", "knew", "known", "knowing"},
	{"think", "thought", "thought", "thinking"},
	{"say", "said", "said", "saying"},
	{"get", "got", "gotten", "getting"},
	{"find", "found", "found", "finding"},
	{"tell", "told", "told", "telling"},
	{"buy", "bought", "bought", "buying"},
	{"bring", "brought", "brought", "bringing"},
	{"read", "read", "read", "reading"},
	{"run", "ran", "run", "running"},
	{"speak", "spoke", "spoken", "speaking"},
	{"drink", "drank", "drunk", "drinking"},
	{"swim", "swam", "swum", "swimming"},
	{"begin", "began", "begun", "beginning"},
	{"drive", "drove", "driven", "driving"},
	{"fly", "flew", "flown", "flying"},
	{"grow", "grew", "grown", "growing"},
	{"throw", "threw", "thrown", "throwing"},
	{"catch", "caught", "caught", "catching"},
	{"teach", "taught", "taught", "teaching"},
	{"hold", "held", "held", "holding"},
	{"stand", "stood", "stood", "standing"},
	{"understand", "understood", "understood", "understanding"},
	{"lose", "lost", "lost", "losing"},
	{"pay", "paid", "paid", "paying"},
	{"meet", "met", "met", "meeting"},
	{"sit", "sat", "sat", "sitting"},
	{"lead", "led", "led", "leading"},
	{"fall", "fell", "fallen", "falling"},
	{"feel", "felt", "felt", "feeling"},
	{"keep", "kept", "kept", "keeping"},
	{"leave", "left", "left", "leaving"},
	{"mean", "meant", "meant", "meaning"},
	{"build", "built", "built", "building"},
	{"send", "sent", "sent", "sending"},
	{"spend", "spent", "spent", "spending"},
	{"win", "won", "won", "winning"},
	{"draw", "drew", "drawn", "drawing"},
	{"choose", "chose", "chosen", "choosing"},
	{"wear", "wore", "worn", "wearing"},
	{"rise", "rose", "risen", "rising"},
	{"hide", "hid", "hidden", "hiding"},
	{"forget", "forgot", "forgotten", "forgetting"},
	{"freeze", "froze", "frozen", "freezing"},
	{"shake", "shook", "shaken", "shaking"},
	{"steal", "stole", "stolen", "stealing"},
	{"blow", "blew", "blown", "blowing"},
	{"show", "showed", "shown", "showing"},
	{"beat", "beat", "beaten", "beating"},
	{"bite", "bit", "bitten", "biting"},
	{"tear", "tore", "torn", "tearing"},
	{"wake", "woke", "woken", "waking"},
	{"ride", "rode", "ridden", "riding"},
	{"ring", "rang", "rung", "ringing"},
	{"do", "did", "done", "doing"},
	{"have", "had", "had", "having"},
	{"fight", "fought", "fought", "fighting"},
	{"cut", "cut", "cut", "cutting"},
	{"put", "put", "put", "putting"},
	{"hit", "hit", "hit", "hitting"},
	{"let", "let", "let", "letting"},
	{"set", "set", "set", "setting"},
	{"hurt", "hurt", "hurt", "hurting"},
	{"shoot", "shot", "shot", "shooting"},
	{"sell", "sold", "sold", "selling"},
	{"spell", "spelt", "spelt", "spelling"},
	{"smell", "smelt", "smelt", "smelling"},
	{"sleep", "slept", "slept", "sleeping"},
	{"sweep", "swept", "swept", "sweeping"},
	{"kneel", "knelt", "knelt", "kneeling"},
	{"deal", "dealt", "dealt", "dealing"},
	{"hear", "heard", "heard", "hearing"},
	{"learn", "learnt", "learnt", "learning"},
	{"burn", "burnt", "burnt", "burning"},
	{"bend", "bent", "bent", "bending"},
	{"lend", "lent", "lent", "lending"},
	{"hang", "hung", "hung", "hanging"},
	{"strike", "struck", "struck", "striking"},
	{"stick", "stuck", "stuck", "sticking"},
	{"dig", "dug", "dug", "digging"},
	{"spin", "spun", "spun", "spinning"},
	{"swing", "swung", "swung", "swinging"},
	{"cling", "clung", "clung", "clinging"},
	{"sink", "sank", "sunk", "sinking"},
	{"spring", "sprang", "sprung", "springing"},
	{"shrink", "shrank", "shrunk", "shrinking"},
	{"bind", "bound", "bound", "binding"},
	{"wind", "wound", "wound", "winding"},
	{"grind", "ground", "ground", "grinding"},
}

// findIrreg returns the irregular entry for a base form, or (_, false).
// The table is scanned linearly and the first match wins.
func findIrreg(base string) (enIrreg, bool) {
	for _, e := range enIrregs {
		if e.base == base {
			return e, true
		}
	}
	return enIrreg{}, false
}

// isVowel reports whether c is one of the lowercase ASCII vowels a, e, i, o, u.
// 'y' is never treated as a vowel.
func isVowel(c byte) bool {
	return c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u'
}

// doublesFinalConsonant reports whether base ends in a
// consonant-vowel-consonant pattern whose final consonant is doubled before a
// suffix ("stop" -> "stopped", "run" -> "running"). Final 'w', 'x' and 'y'
// are never doubled.
//
// NOTE: the check looks only at the last three bytes and does not count
// syllables, so multi-syllable stems with an unstressed final syllable are
// doubled as well ("visit" -> "visitted").
func doublesFinalConsonant(base string) bool {
	n := len(base)
	if n < 3 {
		return false
	}
	last := base[n-1]
	if isVowel(last) || last == 'w' || last == 'x' || last == 'y' {
		return false
	}
	return isVowel(base[n-2]) && !isVowel(base[n-3])
}

// regularPast returns the -ed form of a regular English verb.
// An empty base is returned unchanged.
func regularPast(base string) string {
	n := len(base)
	if n == 0 {
		return base
	}
	last := base[n-1]
	switch {
	case last == 'e':
		// A final "e" only ever needs "d", including after "e"/"i":
		// "bake" -> "baked", "agree" -> "agreed", "die" -> "died".
		return base | "d"
	case last == 'y' && n >= 2 && !isVowel(base[n-2]):
		// Consonant + "y": "try" -> "tried" (but "play" -> "played").
		return base[:n-1] | "ied"
	case doublesFinalConsonant(base):
		return base | base[n-1:] | "ed"
	}
	return base | "ed"
}

// regularProg returns the -ing form of a regular English verb.
// An empty base is returned unchanged.
func regularProg(base string) string {
	n := len(base)
	if n == 0 {
		return base
	}
	if n >= 2 && base[n-1] == 'e' {
		switch base[n-2] {
		case 'e':
			// "ee" keeps both letters: "agree" -> "agreeing".
			return base | "ing"
		case 'i':
			// "ie" becomes "y": "die" -> "dying", "tie" -> "tying".
			return base[:n-2] | "ying"
		default:
			// Silent final "e" is dropped: "make" -> "making".
			return base[:n-1] | "ing"
		}
	}
	if doublesFinalConsonant(base) {
		return base | base[n-1:] | "ing"
	}
	return base | "ing"
}

// regular3sg returns the third-person singular present form.
// Consonant+"y" stems become "-ies" ("try" -> "tries"), consonant+"o" stems
// and stems ending in s, x, z, "sh" or "ch" take "-es" ("go" -> "goes",
// "watch" -> "watches"); everything else takes a plain "-s".
// An empty base is returned unchanged.
func regular3sg(base string) string {
	n := len(base)
	if n == 0 {
		return base
	}
	last := base[n-1]
	if n >= 2 {
		prev := base[n-2]
		if last == 'y' && !isVowel(prev) {
			return base[:n-1] | "ies"
		}
		// Vowel+"o" stems ("radio", "video") fall through to the plain "-s".
		if last == 'o' && !isVowel(prev) {
			return base | "es"
		}
		if tail := base[n-2:]; tail == "sh" || tail == "ch" {
			return base | "es"
		}
	}
	if last == 's' || last == 'x' || last == 'z' {
		return base | "es"
	}
	return base | "s"
}

// stripInfinitive extracts the bare verb stem from a JMdict "to X" gloss.
// Returns "" for complex glosses (multi-word, parenthetical, non-verb): the
// gloss must start with "to " and the remainder must be at least two bytes
// long and consist only of lowercase ASCII letters and '-'.
func stripInfinitive(gloss string) string {
	const prefix = "to "
	if len(gloss) <= len(prefix) || gloss[:len(prefix)] != prefix {
		return ""
	}
	stem := gloss[len(prefix):]
	if len(stem) < 2 {
		return ""
	}
	for i := 0; i < len(stem); i++ {
		c := stem[i]
		if c == '-' || (c >= 'a' && c <= 'z') {
			continue
		}
		return "" // multi-word or parenthetical gloss
	}
	return stem
}

// GenerateENForms inserts EN surface form lattice entries for a Bverb entry.
// Mirrors GenerateConjugations for the EN side: synthetic past, progressive,
// 3sg concordance, and participle forms all point to the same JA record.
// jaRI is the primary JA translation record; branchByte is the packed Branch.
//
// Nothing is inserted when jaRI is lattice.NullRec or when enGloss is not a
// simple "to X" gloss. Forms that are empty, equal to the base, or already
// present under lattice.Bverb are skipped.
func GenerateENForms(db *DB, enGloss string, branchByte uint8, jaRI uint32) {
	if jaRI == lattice.NullRec {
		return
	}
	base := stripInfinitive(enGloss)
	if base == "" {
		return
	}
	branch := lattice.Bverb

	var past, participle, progressive string
	if irreg, found := findIrreg(base); found {
		past = irreg.past
		participle = irreg.participle
		progressive = irreg.progressive
	} else {
		past = regularPast(base)
		participle = past // regular: participle = past
		progressive = regularProg(base)
	}

	// The irregular table carries no 3sg column, so the regular rule is used
	// for every verb; "have" is the one listed verb it cannot spell.
	sg3 := regular3sg(base)
	if base == "have" {
		sg3 = "has"
	}

	type formEntry struct {
		word  string
		state uint8
	}
	forms := []formEntry{
		{past, transdb.MorphPastAffPlain},         // synthetic past: "sang"
		{progressive, transdb.MorphPresProgPlain}, // progressive: "singing"
		{sg3, transdb.MorphPresAffPlain},          // 3sg concordance: "sings"
	}
	// Participle if distinct from past (e.g. "sung" vs "sang").
	// NOTE(review): it is tagged MorphPastProgPlain — presumably the closest
	// available morph state; confirm against transdb.
	if participle != "" && participle != past && participle != base {
		forms = append(forms, formEntry{participle, transdb.MorphPastProgPlain})
	}

	for _, f := range forms {
		if f.word == "" || f.word == base {
			continue
		}
		key := transdb.MakeKey(transdb.LangEN, 0, f.word)
		if db.Tree.LookupRecIdx(branch, key) != lattice.NullRec {
			continue // surface form already present; leave it untouched
		}
		var rec lattice.Record
		transdb.SetFormOnRecord(&rec, f.word, &db.StringPool)
		rec.Branch = branchByte
		transdb.SetMorphState(&rec, f.state)
		// InsertRec resets links — wire after.
		newRI := db.Tree.InsertRec(branch, key, rec)
		if r := db.Tree.GetRecord(newRI); r != nil {
			r.Link[0] = jaRI
		}
	}
}