render_en.mx raw
1 package iskra
2
3 // RenderENDiscourse renders a full multi-clause Discourse to EN text.
4 // Single-clause case renders identically to RenderEN(d[0].Set).
5 //
6 // Subordinating relations (ClauseIf, ClauseBecause) are emitted as prefixes
7 // on the subordinate clause itself ("if X, Y" - "if" attaches to X). Peer
8 // relations (ClauseAnd, ClauseOr, ClauseBut) emit between adjacent clauses.
9 func RenderENDiscourse(d []Clause) string {
10 if len(d) == 0 {
11 return ""
12 }
13 out := ""
14 for i, c := range d {
15 if i > 0 {
16 // Peer connective derived from THIS clause's Relation (the one
17 // joining it to the prior clause). For subordinators, the prefix
18 // is emitted below instead of here.
19 switch c.Relation {
20 case ClauseAnd:
21 out = out | " and "
22 case ClauseOr:
23 out = out | " or "
24 case ClauseBut:
25 out = out | " but "
26 case ClauseIf, ClauseBecause:
27 out = out | " "
28 default:
29 out = out | " "
30 }
31 }
32 // Subordinating prefix attached to this clause itself.
33 switch c.Relation {
34 case ClauseIf:
35 out = out | "if "
36 case ClauseBecause:
37 out = out | "because "
38 }
39 out = out | RenderEN(c.Set)
40 }
41 return out
42 }
43
44 // RenderEN converts a Set to EN text using SVO order with modifier-aware traversal.
45 //
46 // Two-pass walk:
47 // 1. Classify top-level entries (Head=-1) by Role into SVO groups.
48 // 2. For each emitted top-level entry, prepend its modifiers (entries whose
49 // Head points at this entry's index).
50 //
51 // EN modifier surface forms:
52 // POSS - possessive determiner directly before head ("my fish", "his book")
53 // ATTR - adjective directly before head ("red car", "small house")
54 func RenderEN(set []SetEntry) string {
55 mods := map[int32][]int32{}
56 var subj, verb, obj, scope, mod, comp, oper []int32
57 var copulas, adjs []int32
58 for i, e := range set {
59 if e.ModKind == MKCop {
60 copulas = append(copulas, i)
61 continue
62 }
63 if e.ModKind == MKAdj {
64 adjs = append(adjs, i)
65 continue
66 }
67 if e.Head >= 0 && int32(e.Head) < len(set) {
68 mods[int32(e.Head)] = append(mods[int32(e.Head)], i)
69 continue
70 }
71 switch e.Role {
72 case HistTopic, HistSubject:
73 subj = append(subj, i)
74 case HistVerb:
75 verb = append(verb, i)
76 case HistObject:
77 obj = append(obj, i)
78 case HistScope:
79 scope = append(scope, i)
80 case HistModifier:
81 mod = append(mod, i)
82 case HistComplement:
83 comp = append(comp, i)
84 case HistOperator:
85 oper = append(oper, i)
86 }
87 }
88
89 var parts []string
90
91 // Determine subject atom for copula agreement (am/are/is/was/were).
92 subjAtom := ""
93 subjPlural := false
94 if len(subj) > 0 {
95 s := set[subj[0]]
96 subjAtom = s.Atom
97 subjPlural = s.Morph&MetaNumPlural != 0
98 }
99
100 for _, i := range subj {
101 parts = appendENWithMods(parts, set, i, mods, "")
102 }
103
104 if len(copulas) > 0 {
105 c := set[copulas[0]]
106 parts = appendEN(parts, enCopulaForm(c.Morph))
107 prep := oblRoleToEnPrep(c.OblRole)
108 parts = appendENWithMods(parts, set, copulas[0], mods, prep)
109 for _, ci := range copulas[1:] {
110 cc := set[ci]
111 parts = appendEN(parts, enCopulaForm(cc.Morph))
112 parts = appendENWithMods(parts, set, ci, mods, oblRoleToEnPrep(cc.OblRole))
113 }
114 }
115
116 if len(adjs) > 0 {
117 a := set[adjs[0]]
118 parts = appendEN(parts, enCopulaForm(a.Morph))
119 parts = appendEN(parts, formatENAdj(a))
120 for _, ai := range adjs[1:] {
121 aa := set[ai]
122 parts = appendEN(parts, enCopulaForm(aa.Morph))
123 parts = appendEN(parts, formatENAdj(aa))
124 }
125 }
126
127 objsEmitted := false
128 if len(verb) > 0 {
129 // Emit adverbs that modify this verb BEFORE the verb form.
130 vIdx := verb[0]
131 for _, mIdx := range mods[vIdx] {
132 if set[mIdx].ModKind == MKAdv {
133 parts = appendEN(parts, set[mIdx].Atom)
134 }
135 }
136 v := set[vIdx]
137 switch {
138 case v.Morph&MetaMoodVol != 0:
139 parts = appendEN(parts, "let's")
140 parts = appendEN(parts, v.Atom)
141 case v.Morph&MetaCausative != 0:
142 past := v.Morph&MetaTensePast != 0
143 third := v.Morph&Meta3Sg != 0
144 switch {
145 case past:
146 parts = appendEN(parts, "made")
147 case third:
148 parts = appendEN(parts, "makes")
149 default:
150 parts = appendEN(parts, "make")
151 }
152 for _, i := range obj {
153 parts = appendENWithMods(parts, set, i, mods, "")
154 }
155 parts = appendEN(parts, v.Atom)
156 objsEmitted = true
157 case v.Morph&MetaPassive != 0:
158 parts = appendEN(parts, enSubjCopula(subjAtom, subjPlural, v.Morph))
159 parts = appendEN(parts, formatENVerbPP(v))
160 case v.Morph&MetaAspectProg != 0:
161 parts = appendEN(parts, enSubjCopula(subjAtom, subjPlural, v.Morph))
162 parts = appendEN(parts, formatENVerbProg(v.Atom))
163 default:
164 parts = appendEN(parts, formatENVerb(v))
165 }
166 for _, vi := range verb[1:] {
167 parts = appendEN(parts, formatENVerb(set[vi]))
168 }
169 }
170
171 if !objsEmitted {
172 for _, i := range obj {
173 parts = appendENWithMods(parts, set, i, mods, "")
174 }
175 }
176 for _, i := range scope {
177 prep := oblRoleToEnPrep(set[i].OblRole)
178 if prep == "" {
179 prep = "in"
180 }
181 parts = appendENWithMods(parts, set, i, mods, prep)
182 }
183 for _, i := range mod {
184 prep := oblRoleToEnPrep(set[i].OblRole)
185 if prep == "" {
186 prep = "with"
187 }
188 parts = appendENWithMods(parts, set, i, mods, prep)
189 }
190 for _, i := range comp {
191 parts = appendENWithMods(parts, set, i, mods, "")
192 }
193 for _, i := range oper {
194 prep := oblRoleToEnPrep(set[i].OblRole)
195 if prep == "" {
196 prep = "of"
197 }
198 parts = appendENWithMods(parts, set, i, mods, prep)
199 }
200
201 return joinSpace(parts)
202 }
203
204 // appendENWithMods emits an optional preposition, then this entry's pre-head
205 // modifiers (POSS, ATTR), then the head noun, then any MKCoord peers joined
206 // with "and", then any MKRel relative clauses prefixed with "that".
207 func appendENWithMods(parts []string, set []SetEntry, idx int32, mods map[int32][]int32, prep string) []string {
208 if prep != "" {
209 parts = appendEN(parts, prep)
210 }
211 // Pre-head modifiers: POSS, ATTR. Skip MKCoord/MKAdv/MKRel - those emit
212 // elsewhere relative to the head.
213 for _, mIdx := range mods[idx] {
214 m := set[mIdx]
215 if m.ModKind == MKCoord || m.ModKind == MKAdv || m.ModKind == MKRel {
216 continue
217 }
218 parts = appendEN(parts, m.Atom)
219 }
220 parts = appendEN(parts, formatENNoun(set[idx]))
221 // Post-head coordination peers: "and" + peer-atom for each MKCoord.
222 for _, mIdx := range mods[idx] {
223 if set[mIdx].ModKind != MKCoord {
224 continue
225 }
226 parts = appendEN(parts, "and")
227 parts = appendEN(parts, formatENNoun(set[mIdx]))
228 }
229 // Post-head relative clauses: "that" + verb-form for each MKRel modifier.
230 // Intransitive REL only - the verb is the sole predicate of the sub-clause.
231 for _, mIdx := range mods[idx] {
232 if set[mIdx].ModKind != MKRel {
233 continue
234 }
235 parts = appendEN(parts, "that")
236 parts = appendEN(parts, formatENVerb(set[mIdx]))
237 }
238 return parts
239 }
240
241 // enSubjCopula selects the copula form based on the subject atom and verb morph.
242 // "i" → am/was, "you"/"we"/"they"/plural → are/were, others → is/was.
243 func enSubjCopula(subjAtom string, subjPlural bool, morph uint16) string {
244 past := morph&MetaTensePast != 0
245 switch {
246 case subjAtom == "i":
247 if past {
248 return "was"
249 }
250 return "am"
251 case subjAtom == "you" || subjAtom == "we" || subjAtom == "they" || subjPlural:
252 if past {
253 return "were"
254 }
255 return "are"
256 default:
257 if past {
258 return "was"
259 }
260 return "is"
261 }
262 }
263
264 // enCopulaForm returns the appropriate "be" form for EN copula rendering.
265 // Selects between is/are/was/were based on tense and 3sg morph bits.
266 // Note: EN doesn't have a politeness distinction; MetaFormalityPol is JA-only.
267 func enCopulaForm(morph uint16) string {
268 past := morph&MetaTensePast != 0
269 thirdSg := morph&Meta3Sg != 0
270 switch {
271 case past && thirdSg:
272 return "was"
273 case past:
274 return "were"
275 case thirdSg:
276 return "is"
277 default:
278 return "are"
279 }
280 }
281
282 // oblRoleToEnPrep maps an oblique semantic role to the canonical EN preposition.
283 // This is the cross-language layer: OblRole is language-independent, prep is EN-specific.
284 func oblRoleToEnPrep(or uint8) string {
285 switch or {
286 case ORGoal:
287 return "to"
288 case ORLoc:
289 return "in"
290 case ORSource:
291 return "from"
292 case ORLimit:
293 return "until"
294 case ORInstr:
295 return "with"
296 case ORComit:
297 return "with"
298 case ORBenef:
299 return "for"
300 case ORAgent:
301 return "by"
302 case ORRecip:
303 return "to"
304 case ORPart:
305 return "of"
306 case ORCompare:
307 return "than"
308 }
309 return ""
310 }
311
// appendEN appends w to parts and returns the result; an empty w is skipped so
// that formatters can signal "emit nothing" by returning "".
func appendEN(parts []string, w string) []string {
	if w != "" {
		parts = append(parts, w)
	}
	return parts
}
318
319 func joinSpace(parts []string) string {
320 out := ""
321 for i, p := range parts {
322 if i > 0 {
323 out = out | " "
324 }
325 out = out | p
326 }
327 return out
328 }
329
330 // pronounCase normalizes EN pronoun surface form based on the slot's role.
331 // Subject role → nominative (i, he, she, we, they); object/oblique roles →
332 // accusative (me, him, her, us, them). Closed set; non-pronoun atoms pass
333 // through unchanged.
334 func pronounCase(atom string, role int32) string {
335 subjRole := role == HistSubject || role == HistTopic
336 switch atom {
337 case "i", "me":
338 if subjRole {
339 return "i"
340 }
341 return "me"
342 case "he", "him":
343 if subjRole {
344 return "he"
345 }
346 return "him"
347 case "she":
348 if subjRole {
349 return "she"
350 }
351 return "her"
352 case "we", "us":
353 if subjRole {
354 return "we"
355 }
356 return "us"
357 case "they", "them":
358 if subjRole {
359 return "they"
360 }
361 return "them"
362 }
363 return atom
364 }
365
366 // formatENNoun emits "the lemma" or "a lemma" with plural if applicable.
367 func formatENNoun(e SetEntry) string {
368 if e.Atom == "" {
369 return ""
370 }
371 // Untranslated marker: emit verbatim, no determiner or plural suffix.
372 if e.Atom == UntranslatedMarker {
373 return e.Atom
374 }
375 // Cross-language leakage guard: if the atom contains non-ASCII bytes,
376 // the EN-side lookup didn't resolve to an English atom. Don't apply
377 // determiner or plural suffix - that produces nonsense like "雨るs".
378 // Pass the atom through unchanged so the failure is visible upstream.
379 if !isASCIIOnly(e.Atom) {
380 return e.Atom
381 }
382 det := ""
383 if e.Morph&MetaDefDef != 0 {
384 det = "the "
385 }
386 noun := pronounCase(e.Atom, e.Role)
387 if e.Morph&MetaNumPlural != 0 {
388 noun = pluralizeEN(noun)
389 }
390 return det | noun
391 }
392
393 // formatENVerb emits the verb with tense/aspect/3sg suffix as appropriate.
394 func formatENVerb(e SetEntry) string {
395 if e.Atom == "" {
396 return ""
397 }
398 if e.Atom == UntranslatedMarker {
399 return e.Atom
400 }
401 // Cross-language leakage guard: non-ASCII atoms are unresolved JA
402 // fragments; don't apply EN inflection.
403 if !isASCIIOnly(e.Atom) {
404 return e.Atom
405 }
406 // Check irregular table first (reverse lookup).
407 if surface, ok := buildEnIrregularReverse()[verbKey(e.Atom, e.Morph)]; ok {
408 return surface
409 }
410 if e.Morph&MetaPolarNeg != 0 {
411 // Emit "not lemma" auxiliary form.
412 if e.Morph&MetaTensePast != 0 {
413 return "did not " | e.Atom
414 }
415 if e.Morph&Meta3Sg != 0 {
416 return "does not " | e.Atom
417 }
418 return "do not " | e.Atom
419 }
420 if e.Morph&MetaTensePast != 0 {
421 return enVerbPast(e.Atom)
422 }
423 if e.Morph&Meta3Sg != 0 {
424 return enVerb3Sg(e.Atom)
425 }
426 return e.Atom
427 }
428
429 // formatENVerbPP returns the past-participle form of a verb for passive voice.
430 // Looks up irregular table with key (lemma, MetaTensePast|MetaPassive) for
431 // PPs that differ from simple-past, then falls back to (lemma, MetaTensePast),
432 // then to regular -ed.
433 func formatENVerbPP(e SetEntry) string {
434 if e.Atom == "" {
435 return ""
436 }
437 if surface, ok := buildEnIrregularReverse()[verbKey(e.Atom, MetaTensePast|MetaPassive)]; ok {
438 return surface
439 }
440 if surface, ok := buildEnIrregularReverse()[verbKey(e.Atom, MetaTensePast)]; ok {
441 return surface
442 }
443 return enVerbPast(e.Atom)
444 }
445
446 func formatENVerbProg(atom string) string {
447 if atom == UntranslatedMarker {
448 return atom
449 }
450 if !isASCIIOnly(atom) {
451 return atom
452 }
453 if surface, ok := buildEnIrregularReverse()[verbKey(atom, MetaAspectProg)]; ok {
454 return surface
455 }
456 return enVerbProg(atom)
457 }
458
459 // formatENAdj returns the comparative surface form when MetaCompare is set
460 // (looked up in enIrregularReverse with key (lemma, MetaCompare)), otherwise
461 // returns the lemma atom unchanged.
462 func formatENAdj(e SetEntry) string {
463 if e.Atom == UntranslatedMarker {
464 return e.Atom
465 }
466 if !isASCIIOnly(e.Atom) {
467 return e.Atom
468 }
469 if e.Morph&MetaCompare != 0 {
470 if surface, ok := buildEnIrregularReverse()[verbKey(e.Atom, MetaCompare)]; ok {
471 return surface
472 }
473 // Uncomparable / intensifier words: never take -er suffix; prefix
474 // with "more" instead. Catches "verier", "morer", "stiller" etc.
475 if enUncomparable()[e.Atom] {
476 return "more " | e.Atom
477 }
478 return adjComparativeEN(e.Atom)
479 }
480 return e.Atom
481 }
482
// enUncomparable returns the set of words that must not receive an -er
// comparative suffix. They are mostly intensifiers, determiners and quantity
// words; callers render them with "more" instead. A fresh map is built on
// every call.
func enUncomparable() map[string]bool {
	words := []string{
		"very", "more", "most", "much", "many",
		"quite", "rather", "just", "only",
		"also", "even", "still", "again",
		"too", "so", "no", "not",
		"a", "an", "the", "this", "that",
		"some", "any", "every", "all", "each",
		"none",
	}
	set := map[string]bool{}
	for _, w := range words {
		set[w] = true
	}
	return set
}
496
497 // adjComparativeEN forms the comparative of a regular adjective stem.
498 func adjComparativeEN(stem string) string {
499 if hasSuffix(stem, "e") {
500 return stem | "r"
501 }
502 if hasSuffix(stem, "y") && len(stem) >= 2 && !isVowel(stem[len(stem)-2]) {
503 return stem[:len(stem)-1] | "ier"
504 }
505 // CVC doubling for one-syllable adjectives: big → bigger, hot → hotter.
506 if isCVCDoubling(stem) {
507 return stem | string([]byte{stem[len(stem)-1]}) | "er"
508 }
509 return stem | "er"
510 }
511
// isASCIIOnly reports whether every byte of s is below 0x80, i.e. s is pure
// ASCII. The empty string counts as ASCII.
// The EN renderer uses it to detect cross-language atom leakage: an "EN atom"
// with non-ASCII bytes means the lookup failed and a JA fragment came through,
// which should then be passed along without EN inflection.
func isASCIIOnly(s string) bool {
	n := len(s)
	for pos := 0; pos < n; pos++ {
		// The high bit is set on every non-ASCII byte.
		if s[pos]&0x80 != 0 {
			return false
		}
	}
	return true
}
524
525 // isCVCDoubling returns true when stem ends in consonant-vowel-consonant
526 // (the final consonant is not w, x, y - those don't double). Used by the
527 // regular -ing/-ed/-er formers to decide whether to double the final
528 // consonant. Approximation: doesn't check syllable count, so multi-
529 // syllable words like "open" → "opener" would double wrongly; this rule
530 // is more conservative by only firing on stems of length <= 5.
531 func isCVCDoubling(stem string) bool {
532 n := len(stem)
533 if n < 3 || n > 5 {
534 return false
535 }
536 c1 := stem[n-3]
537 v := stem[n-2]
538 c2 := stem[n-1]
539 if !isVowel(v) {
540 return false
541 }
542 if isVowel(c1) {
543 return false
544 }
545 if isVowel(c2) {
546 return false
547 }
548 // Excluded final consonants: w, x, y (these don't double in standard English).
549 if c2 == 'w' || c2 == 'x' || c2 == 'y' {
550 return false
551 }
552 return true
553 }
554
555 func verbKey(lemma string, morph uint16) string {
556 return lemma | "|" | string([]byte{byte(morph), byte(morph >> 8)})
557 }
558
559 // Regular EN conjugation rules (deterministic).
560 func enVerbPast(stem string) string {
561 if hasSuffix(stem, "e") {
562 return stem | "d"
563 }
564 if hasSuffix(stem, "y") && len(stem) >= 2 && !isVowel(stem[len(stem)-2]) {
565 return stem[:len(stem)-1] | "ied"
566 }
567 if isCVCDoubling(stem) {
568 return stem | string([]byte{stem[len(stem)-1]}) | "ed"
569 }
570 return stem | "ed"
571 }
572
573 func enVerb3Sg(stem string) string {
574 if hasSuffix(stem, "s") || hasSuffix(stem, "x") || hasSuffix(stem, "z") ||
575 hasSuffix(stem, "sh") || hasSuffix(stem, "ch") {
576 return stem | "es"
577 }
578 if hasSuffix(stem, "y") && len(stem) >= 2 && !isVowel(stem[len(stem)-2]) {
579 return stem[:len(stem)-1] | "ies"
580 }
581 return stem | "s"
582 }
583
584 func enVerbProg(stem string) string {
585 // -ie verbs become -y + ing: lie → lying, die → dying, tie → tying.
586 if hasSuffix(stem, "ie") {
587 return stem[:len(stem)-2] | "ying"
588 }
589 if hasSuffix(stem, "e") && !hasSuffix(stem, "ee") {
590 return stem[:len(stem)-1] | "ing"
591 }
592 if isCVCDoubling(stem) {
593 return stem | string([]byte{stem[len(stem)-1]}) | "ing"
594 }
595 return stem | "ing"
596 }
597
598 func pluralizeEN(stem string) string {
599 if hasSuffix(stem, "s") || hasSuffix(stem, "x") || hasSuffix(stem, "z") ||
600 hasSuffix(stem, "sh") || hasSuffix(stem, "ch") {
601 return stem | "es"
602 }
603 if hasSuffix(stem, "y") && len(stem) >= 2 && !isVowel(stem[len(stem)-2]) {
604 return stem[:len(stem)-1] | "ies"
605 }
606 return stem | "s"
607 }
608
// isVowel reports whether c is one of the lowercase ASCII vowels a, e, i, o, u.
// Uppercase letters and "y" are not treated as vowels.
func isVowel(c byte) bool {
	return c == 'a' || c == 'e' || c == 'i' || c == 'o' || c == 'u'
}
616
617 // buildEnIrregularReverse builds the (lemma, morph) -> surface form map from enIrregular.
618 func buildEnIrregularReverse() map[string]string {
619 m := map[string]string{}
620 for surface, lr := range enIrregular() {
621 key := verbKey(lr.Lemma, lr.Morph)
622 // Prefer first-seen surface for each (lemma, morph) pair.
623 if _, exists := m[key]; !exists {
624 m[key] = surface
625 }
626 }
627 return m
628 }
629