render_en.mx raw

   1  package iskra
   2  
   3  // RenderENDiscourse renders a full multi-clause Discourse to EN text.
   4  // Single-clause case renders identically to RenderEN(d[0].Set).
   5  //
   6  // Subordinating relations (ClauseIf, ClauseBecause) are emitted as prefixes
   7  // on the subordinate clause itself ("if X, Y" - "if" attaches to X). Peer
   8  // relations (ClauseAnd, ClauseOr, ClauseBut) emit between adjacent clauses.
   9  func RenderENDiscourse(d []Clause) string {
  10  	if len(d) == 0 {
  11  		return ""
  12  	}
  13  	out := ""
  14  	for i, c := range d {
  15  		if i > 0 {
  16  			// Peer connective derived from THIS clause's Relation (the one
  17  			// joining it to the prior clause). For subordinators, the prefix
  18  			// is emitted below instead of here.
  19  			switch c.Relation {
  20  			case ClauseAnd:
  21  				out = out | " and "
  22  			case ClauseOr:
  23  				out = out | " or "
  24  			case ClauseBut:
  25  				out = out | " but "
  26  			case ClauseIf, ClauseBecause:
  27  				out = out | " "
  28  			default:
  29  				out = out | " "
  30  			}
  31  		}
  32  		// Subordinating prefix attached to this clause itself.
  33  		switch c.Relation {
  34  		case ClauseIf:
  35  			out = out | "if "
  36  		case ClauseBecause:
  37  			out = out | "because "
  38  		}
  39  		out = out | RenderEN(c.Set)
  40  	}
  41  	return out
  42  }
  43  
// RenderEN converts a Set to EN text using SVO order with modifier-aware traversal.
//
// Two-pass walk:
//   1. Classify entries. MKCop and MKAdj entries are collected as predicates
//      first (before their Head is looked at); any other entry whose Head is
//      a valid index into set is filed as a modifier of that head; the
//      remaining top-level entries are grouped by Role. Top-level entries
//      whose Role matches none of the cases in the switch are not emitted.
//   2. Emit the groups in a fixed order: subjects, copula predicates,
//      adjective predicates, verbs, objects, scope, modifier, complement,
//      operator. Noun-like entries go through appendENWithMods, which places
//      their modifiers (entries whose Head points at this entry's index)
//      around them.
//
// EN modifier surface forms:
//   POSS - possessive determiner directly before head ("my fish", "his book")
//   ATTR - adjective directly before head ("red car", "small house")
//
// The emitted words are joined with single spaces; an empty set yields "".
func RenderEN(set []SetEntry) string {
	// mods maps a head index to the indices of the entries attached to it.
	mods := map[int32][]int32{}
	var subj, verb, obj, scope, mod, comp, oper []int32
	var copulas, adjs []int32
	for i, e := range set {
		// Copula and adjective predicates are pulled out before the Head
		// check, so they never end up in mods or in a role group.
		if e.ModKind == MKCop {
			copulas = append(copulas, i)
			continue
		}
		if e.ModKind == MKAdj {
			adjs = append(adjs, i)
			continue
		}
		// A Head inside the set marks a modifier. A negative or out-of-range
		// Head falls through and the entry is treated as top-level.
		if e.Head >= 0 && int32(e.Head) < len(set) {
			mods[int32(e.Head)] = append(mods[int32(e.Head)], i)
			continue
		}
		switch e.Role {
		case HistTopic, HistSubject:
			subj = append(subj, i)
		case HistVerb:
			verb = append(verb, i)
		case HistObject:
			obj = append(obj, i)
		case HistScope:
			scope = append(scope, i)
		case HistModifier:
			mod = append(mod, i)
		case HistComplement:
			comp = append(comp, i)
		case HistOperator:
			oper = append(oper, i)
		}
	}

	var parts []string

	// Capture the first subject's atom and plural flag for be-agreement
	// (am/are/is/was/were). These are only consumed by enSubjCopula in the
	// passive and progressive verb branches below.
	// NOTE(review): copula and adjective predicates pick their "be" form from
	// the predicate's own Morph via enCopulaForm and ignore the subject, so
	// "am" is never produced there - confirm this is intended.
	subjAtom := ""
	subjPlural := false
	if len(subj) > 0 {
		s := set[subj[0]]
		subjAtom = s.Atom
		subjPlural = s.Morph&MetaNumPlural != 0
	}

	for _, i := range subj {
		parts = appendENWithMods(parts, set, i, mods, "")
	}

	// Copula predicates: each one is emitted as its own "be" form followed by
	// the complement, with a preposition when the complement has an OblRole.
	if len(copulas) > 0 {
		c := set[copulas[0]]
		parts = appendEN(parts, enCopulaForm(c.Morph))
		prep := oblRoleToEnPrep(c.OblRole)
		parts = appendENWithMods(parts, set, copulas[0], mods, prep)
		for _, ci := range copulas[1:] {
			cc := set[ci]
			parts = appendEN(parts, enCopulaForm(cc.Morph))
			parts = appendENWithMods(parts, set, ci, mods, oblRoleToEnPrep(cc.OblRole))
		}
	}

	// Adjective predicates: "be" form plus the adjective surface. These use
	// appendEN directly, so entries attached to an adjective are not emitted.
	if len(adjs) > 0 {
		a := set[adjs[0]]
		parts = appendEN(parts, enCopulaForm(a.Morph))
		parts = appendEN(parts, formatENAdj(a))
		for _, ai := range adjs[1:] {
			aa := set[ai]
			parts = appendEN(parts, enCopulaForm(aa.Morph))
			parts = appendEN(parts, formatENAdj(aa))
		}
	}

	// objsEmitted is set when the verb branch has already placed the objects
	// (causative), so they are not repeated after the verb.
	objsEmitted := false
	if len(verb) > 0 {
		// Emit adverbs that modify this verb BEFORE the verb form.
		// Only MKAdv modifiers of the first verb are emitted; other kinds of
		// entries attached to a verb are skipped.
		vIdx := verb[0]
		for _, mIdx := range mods[vIdx] {
			if set[mIdx].ModKind == MKAdv {
				parts = appendEN(parts, set[mIdx].Atom)
			}
		}
		v := set[vIdx]
		// The first matching morph flag decides the verb shape: volitional,
		// causative, passive, progressive, then plain inflection.
		switch {
		case v.Morph&MetaMoodVol != 0:
			// Volitional: "let's" + bare lemma.
			parts = appendEN(parts, "let's")
			parts = appendEN(parts, v.Atom)
		case v.Morph&MetaCausative != 0:
			// Causative: inflected "make", then the objects, then the bare
			// lemma ("made him go").
			past := v.Morph&MetaTensePast != 0
			third := v.Morph&Meta3Sg != 0
			switch {
			case past:
				parts = appendEN(parts, "made")
			case third:
				parts = appendEN(parts, "makes")
			default:
				parts = appendEN(parts, "make")
			}
			for _, i := range obj {
				parts = appendENWithMods(parts, set, i, mods, "")
			}
			parts = appendEN(parts, v.Atom)
			objsEmitted = true
		case v.Morph&MetaPassive != 0:
			// Passive: subject-agreeing "be" + past participle.
			parts = appendEN(parts, enSubjCopula(subjAtom, subjPlural, v.Morph))
			parts = appendEN(parts, formatENVerbPP(v))
		case v.Morph&MetaAspectProg != 0:
			// Progressive: subject-agreeing "be" + -ing form.
			parts = appendEN(parts, enSubjCopula(subjAtom, subjPlural, v.Morph))
			parts = appendEN(parts, formatENVerbProg(v.Atom))
		default:
			parts = appendEN(parts, formatENVerb(v))
		}
		// Any further verbs get plain inflection only.
		for _, vi := range verb[1:] {
			parts = appendEN(parts, formatENVerb(set[vi]))
		}
	}

	if !objsEmitted {
		for _, i := range obj {
			parts = appendENWithMods(parts, set, i, mods, "")
		}
	}
	// Oblique groups: use the preposition for the entry's OblRole, falling
	// back to a per-group default ("in", "with", "of") when there is none.
	for _, i := range scope {
		prep := oblRoleToEnPrep(set[i].OblRole)
		if prep == "" {
			prep = "in"
		}
		parts = appendENWithMods(parts, set, i, mods, prep)
	}
	for _, i := range mod {
		prep := oblRoleToEnPrep(set[i].OblRole)
		if prep == "" {
			prep = "with"
		}
		parts = appendENWithMods(parts, set, i, mods, prep)
	}
	// Complements are emitted without a preposition.
	for _, i := range comp {
		parts = appendENWithMods(parts, set, i, mods, "")
	}
	for _, i := range oper {
		prep := oblRoleToEnPrep(set[i].OblRole)
		if prep == "" {
			prep = "of"
		}
		parts = appendENWithMods(parts, set, i, mods, prep)
	}

	return joinSpace(parts)
}
 203  
 204  // appendENWithMods emits an optional preposition, then this entry's pre-head
 205  // modifiers (POSS, ATTR), then the head noun, then any MKCoord peers joined
 206  // with "and", then any MKRel relative clauses prefixed with "that".
 207  func appendENWithMods(parts []string, set []SetEntry, idx int32, mods map[int32][]int32, prep string) []string {
 208  	if prep != "" {
 209  		parts = appendEN(parts, prep)
 210  	}
 211  	// Pre-head modifiers: POSS, ATTR. Skip MKCoord/MKAdv/MKRel - those emit
 212  	// elsewhere relative to the head.
 213  	for _, mIdx := range mods[idx] {
 214  		m := set[mIdx]
 215  		if m.ModKind == MKCoord || m.ModKind == MKAdv || m.ModKind == MKRel {
 216  			continue
 217  		}
 218  		parts = appendEN(parts, m.Atom)
 219  	}
 220  	parts = appendEN(parts, formatENNoun(set[idx]))
 221  	// Post-head coordination peers: "and" + peer-atom for each MKCoord.
 222  	for _, mIdx := range mods[idx] {
 223  		if set[mIdx].ModKind != MKCoord {
 224  			continue
 225  		}
 226  		parts = appendEN(parts, "and")
 227  		parts = appendEN(parts, formatENNoun(set[mIdx]))
 228  	}
 229  	// Post-head relative clauses: "that" + verb-form for each MKRel modifier.
 230  	// Intransitive REL only - the verb is the sole predicate of the sub-clause.
 231  	for _, mIdx := range mods[idx] {
 232  		if set[mIdx].ModKind != MKRel {
 233  			continue
 234  		}
 235  		parts = appendEN(parts, "that")
 236  		parts = appendEN(parts, formatENVerb(set[mIdx]))
 237  	}
 238  	return parts
 239  }
 240  
 241  // enSubjCopula selects the copula form based on the subject atom and verb morph.
 242  // "i" → am/was, "you"/"we"/"they"/plural → are/were, others → is/was.
 243  func enSubjCopula(subjAtom string, subjPlural bool, morph uint16) string {
 244  	past := morph&MetaTensePast != 0
 245  	switch {
 246  	case subjAtom == "i":
 247  		if past {
 248  			return "was"
 249  		}
 250  		return "am"
 251  	case subjAtom == "you" || subjAtom == "we" || subjAtom == "they" || subjPlural:
 252  		if past {
 253  			return "were"
 254  		}
 255  		return "are"
 256  	default:
 257  		if past {
 258  			return "was"
 259  		}
 260  		return "is"
 261  	}
 262  }
 263  
 264  // enCopulaForm returns the appropriate "be" form for EN copula rendering.
 265  // Selects between is/are/was/were based on tense and 3sg morph bits.
 266  // Note: EN doesn't have a politeness distinction; MetaFormalityPol is JA-only.
 267  func enCopulaForm(morph uint16) string {
 268  	past := morph&MetaTensePast != 0
 269  	thirdSg := morph&Meta3Sg != 0
 270  	switch {
 271  	case past && thirdSg:
 272  		return "was"
 273  	case past:
 274  		return "were"
 275  	case thirdSg:
 276  		return "is"
 277  	default:
 278  		return "are"
 279  	}
 280  }
 281  
 282  // oblRoleToEnPrep maps an oblique semantic role to the canonical EN preposition.
 283  // This is the cross-language layer: OblRole is language-independent, prep is EN-specific.
 284  func oblRoleToEnPrep(or uint8) string {
 285  	switch or {
 286  	case ORGoal:
 287  		return "to"
 288  	case ORLoc:
 289  		return "in"
 290  	case ORSource:
 291  		return "from"
 292  	case ORLimit:
 293  		return "until"
 294  	case ORInstr:
 295  		return "with"
 296  	case ORComit:
 297  		return "with"
 298  	case ORBenef:
 299  		return "for"
 300  	case ORAgent:
 301  		return "by"
 302  	case ORRecip:
 303  		return "to"
 304  	case ORPart:
 305  		return "of"
 306  	case ORCompare:
 307  		return "than"
 308  	}
 309  	return ""
 310  }
 311  
 312  func appendEN(parts []string, w string) []string {
 313  	if w == "" {
 314  		return parts
 315  	}
 316  	return append(parts, w)
 317  }
 318  
 319  func joinSpace(parts []string) string {
 320  	out := ""
 321  	for i, p := range parts {
 322  		if i > 0 {
 323  			out = out | " "
 324  		}
 325  		out = out | p
 326  	}
 327  	return out
 328  }
 329  
 330  // pronounCase normalizes EN pronoun surface form based on the slot's role.
 331  // Subject role → nominative (i, he, she, we, they); object/oblique roles →
 332  // accusative (me, him, her, us, them). Closed set; non-pronoun atoms pass
 333  // through unchanged.
 334  func pronounCase(atom string, role int32) string {
 335  	subjRole := role == HistSubject || role == HistTopic
 336  	switch atom {
 337  	case "i", "me":
 338  		if subjRole {
 339  			return "i"
 340  		}
 341  		return "me"
 342  	case "he", "him":
 343  		if subjRole {
 344  			return "he"
 345  		}
 346  		return "him"
 347  	case "she":
 348  		if subjRole {
 349  			return "she"
 350  		}
 351  		return "her"
 352  	case "we", "us":
 353  		if subjRole {
 354  			return "we"
 355  		}
 356  		return "us"
 357  	case "they", "them":
 358  		if subjRole {
 359  			return "they"
 360  		}
 361  		return "them"
 362  	}
 363  	return atom
 364  }
 365  
 366  // formatENNoun emits "the lemma" or "a lemma" with plural if applicable.
 367  func formatENNoun(e SetEntry) string {
 368  	if e.Atom == "" {
 369  		return ""
 370  	}
 371  	// Untranslated marker: emit verbatim, no determiner or plural suffix.
 372  	if e.Atom == UntranslatedMarker {
 373  		return e.Atom
 374  	}
 375  	// Cross-language leakage guard: if the atom contains non-ASCII bytes,
 376  	// the EN-side lookup didn't resolve to an English atom. Don't apply
 377  	// determiner or plural suffix - that produces nonsense like "雨るs".
 378  	// Pass the atom through unchanged so the failure is visible upstream.
 379  	if !isASCIIOnly(e.Atom) {
 380  		return e.Atom
 381  	}
 382  	det := ""
 383  	if e.Morph&MetaDefDef != 0 {
 384  		det = "the "
 385  	}
 386  	noun := pronounCase(e.Atom, e.Role)
 387  	if e.Morph&MetaNumPlural != 0 {
 388  		noun = pluralizeEN(noun)
 389  	}
 390  	return det | noun
 391  }
 392  
 393  // formatENVerb emits the verb with tense/aspect/3sg suffix as appropriate.
 394  func formatENVerb(e SetEntry) string {
 395  	if e.Atom == "" {
 396  		return ""
 397  	}
 398  	if e.Atom == UntranslatedMarker {
 399  		return e.Atom
 400  	}
 401  	// Cross-language leakage guard: non-ASCII atoms are unresolved JA
 402  	// fragments; don't apply EN inflection.
 403  	if !isASCIIOnly(e.Atom) {
 404  		return e.Atom
 405  	}
 406  	// Check irregular table first (reverse lookup).
 407  	if surface, ok := buildEnIrregularReverse()[verbKey(e.Atom, e.Morph)]; ok {
 408  		return surface
 409  	}
 410  	if e.Morph&MetaPolarNeg != 0 {
 411  		// Emit "not lemma" auxiliary form.
 412  		if e.Morph&MetaTensePast != 0 {
 413  			return "did not " | e.Atom
 414  		}
 415  		if e.Morph&Meta3Sg != 0 {
 416  			return "does not " | e.Atom
 417  		}
 418  		return "do not " | e.Atom
 419  	}
 420  	if e.Morph&MetaTensePast != 0 {
 421  		return enVerbPast(e.Atom)
 422  	}
 423  	if e.Morph&Meta3Sg != 0 {
 424  		return enVerb3Sg(e.Atom)
 425  	}
 426  	return e.Atom
 427  }
 428  
 429  // formatENVerbPP returns the past-participle form of a verb for passive voice.
 430  // Looks up irregular table with key (lemma, MetaTensePast|MetaPassive) for
 431  // PPs that differ from simple-past, then falls back to (lemma, MetaTensePast),
 432  // then to regular -ed.
 433  func formatENVerbPP(e SetEntry) string {
 434  	if e.Atom == "" {
 435  		return ""
 436  	}
 437  	if surface, ok := buildEnIrregularReverse()[verbKey(e.Atom, MetaTensePast|MetaPassive)]; ok {
 438  		return surface
 439  	}
 440  	if surface, ok := buildEnIrregularReverse()[verbKey(e.Atom, MetaTensePast)]; ok {
 441  		return surface
 442  	}
 443  	return enVerbPast(e.Atom)
 444  }
 445  
 446  func formatENVerbProg(atom string) string {
 447  	if atom == UntranslatedMarker {
 448  		return atom
 449  	}
 450  	if !isASCIIOnly(atom) {
 451  		return atom
 452  	}
 453  	if surface, ok := buildEnIrregularReverse()[verbKey(atom, MetaAspectProg)]; ok {
 454  		return surface
 455  	}
 456  	return enVerbProg(atom)
 457  }
 458  
 459  // formatENAdj returns the comparative surface form when MetaCompare is set
 460  // (looked up in enIrregularReverse with key (lemma, MetaCompare)), otherwise
 461  // returns the lemma atom unchanged.
 462  func formatENAdj(e SetEntry) string {
 463  	if e.Atom == UntranslatedMarker {
 464  		return e.Atom
 465  	}
 466  	if !isASCIIOnly(e.Atom) {
 467  		return e.Atom
 468  	}
 469  	if e.Morph&MetaCompare != 0 {
 470  		if surface, ok := buildEnIrregularReverse()[verbKey(e.Atom, MetaCompare)]; ok {
 471  			return surface
 472  		}
 473  		// Uncomparable / intensifier words: never take -er suffix; prefix
 474  		// with "more" instead. Catches "verier", "morer", "stiller" etc.
 475  		if enUncomparable()[e.Atom] {
 476  			return "more " | e.Atom
 477  		}
 478  		return adjComparativeEN(e.Atom)
 479  	}
 480  	return e.Atom
 481  }
 482  
 483  // enUncomparable lists adjectives/adverbs that don't form -er comparatives.
 484  // Most are intensifiers and quantity words that should take "more" instead.
 485  func enUncomparable() map[string]bool {
 486  	return map[string]bool{
 487  		"very": true, "more": true, "most": true, "much": true, "many": true,
 488  		"quite": true, "rather": true, "just": true, "only": true,
 489  		"also": true, "even": true, "still": true, "again": true,
 490  		"too": true, "so": true, "no": true, "not": true,
 491  		"a": true, "an": true, "the": true, "this": true, "that": true,
 492  		"some": true, "any": true, "every": true, "all": true, "each": true,
 493  		"none": true,
 494  	}
 495  }
 496  
 497  // adjComparativeEN forms the comparative of a regular adjective stem.
 498  func adjComparativeEN(stem string) string {
 499  	if hasSuffix(stem, "e") {
 500  		return stem | "r"
 501  	}
 502  	if hasSuffix(stem, "y") && len(stem) >= 2 && !isVowel(stem[len(stem)-2]) {
 503  		return stem[:len(stem)-1] | "ier"
 504  	}
 505  	// CVC doubling for one-syllable adjectives: big → bigger, hot → hotter.
 506  	if isCVCDoubling(stem) {
 507  		return stem | string([]byte{stem[len(stem)-1]}) | "er"
 508  	}
 509  	return stem | "er"
 510  }
 511  
 512  // isASCIIOnly returns true when every byte of s is in the ASCII range.
 513  // Used by the EN renderer to detect cross-language atom leakage: if the
 514  // looked-up "EN atom" contains non-ASCII bytes, the lattice lookup failed
 515  // and we have a JA fragment that should pass through without EN inflection.
 516  func isASCIIOnly(s string) bool {
 517  	for i := 0; i < len(s); i++ {
 518  		if s[i] >= 0x80 {
 519  			return false
 520  		}
 521  	}
 522  	return true
 523  }
 524  
 525  // isCVCDoubling returns true when stem ends in consonant-vowel-consonant
 526  // (the final consonant is not w, x, y - those don't double). Used by the
 527  // regular -ing/-ed/-er formers to decide whether to double the final
 528  // consonant. Approximation: doesn't check syllable count, so multi-
 529  // syllable words like "open" → "opener" would double wrongly; this rule
 530  // is more conservative by only firing on stems of length <= 5.
 531  func isCVCDoubling(stem string) bool {
 532  	n := len(stem)
 533  	if n < 3 || n > 5 {
 534  		return false
 535  	}
 536  	c1 := stem[n-3]
 537  	v := stem[n-2]
 538  	c2 := stem[n-1]
 539  	if !isVowel(v) {
 540  		return false
 541  	}
 542  	if isVowel(c1) {
 543  		return false
 544  	}
 545  	if isVowel(c2) {
 546  		return false
 547  	}
 548  	// Excluded final consonants: w, x, y (these don't double in standard English).
 549  	if c2 == 'w' || c2 == 'x' || c2 == 'y' {
 550  		return false
 551  	}
 552  	return true
 553  }
 554  
 555  func verbKey(lemma string, morph uint16) string {
 556  	return lemma | "|" | string([]byte{byte(morph), byte(morph >> 8)})
 557  }
 558  
 559  // Regular EN conjugation rules (deterministic).
 560  func enVerbPast(stem string) string {
 561  	if hasSuffix(stem, "e") {
 562  		return stem | "d"
 563  	}
 564  	if hasSuffix(stem, "y") && len(stem) >= 2 && !isVowel(stem[len(stem)-2]) {
 565  		return stem[:len(stem)-1] | "ied"
 566  	}
 567  	if isCVCDoubling(stem) {
 568  		return stem | string([]byte{stem[len(stem)-1]}) | "ed"
 569  	}
 570  	return stem | "ed"
 571  }
 572  
 573  func enVerb3Sg(stem string) string {
 574  	if hasSuffix(stem, "s") || hasSuffix(stem, "x") || hasSuffix(stem, "z") ||
 575  		hasSuffix(stem, "sh") || hasSuffix(stem, "ch") {
 576  		return stem | "es"
 577  	}
 578  	if hasSuffix(stem, "y") && len(stem) >= 2 && !isVowel(stem[len(stem)-2]) {
 579  		return stem[:len(stem)-1] | "ies"
 580  	}
 581  	return stem | "s"
 582  }
 583  
 584  func enVerbProg(stem string) string {
 585  	// -ie verbs become -y + ing: lie → lying, die → dying, tie → tying.
 586  	if hasSuffix(stem, "ie") {
 587  		return stem[:len(stem)-2] | "ying"
 588  	}
 589  	if hasSuffix(stem, "e") && !hasSuffix(stem, "ee") {
 590  		return stem[:len(stem)-1] | "ing"
 591  	}
 592  	if isCVCDoubling(stem) {
 593  		return stem | string([]byte{stem[len(stem)-1]}) | "ing"
 594  	}
 595  	return stem | "ing"
 596  }
 597  
 598  func pluralizeEN(stem string) string {
 599  	if hasSuffix(stem, "s") || hasSuffix(stem, "x") || hasSuffix(stem, "z") ||
 600  		hasSuffix(stem, "sh") || hasSuffix(stem, "ch") {
 601  		return stem | "es"
 602  	}
 603  	if hasSuffix(stem, "y") && len(stem) >= 2 && !isVowel(stem[len(stem)-2]) {
 604  		return stem[:len(stem)-1] | "ies"
 605  	}
 606  	return stem | "s"
 607  }
 608  
 609  func isVowel(c byte) bool {
 610  	switch c {
 611  	case 'a', 'e', 'i', 'o', 'u':
 612  		return true
 613  	}
 614  	return false
 615  }
 616  
 617  // buildEnIrregularReverse builds the (lemma, morph) -> surface form map from enIrregular.
 618  func buildEnIrregularReverse() map[string]string {
 619  	m := map[string]string{}
 620  	for surface, lr := range enIrregular() {
 621  		key := verbKey(lr.Lemma, lr.Morph)
 622  		// Prefer first-seen surface for each (lemma, morph) pair.
 623  		if _, exists := m[key]; !exists {
 624  			m[key] = surface
 625  		}
 626  	}
 627  	return m
 628  }
 629