label_verbs.py raw

   1  #!/usr/bin/env python3
   2  """
   3  Extract top JA verbs, translate, classify with semantic bitflags, output TSV.
   4  
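   5  Semantic flags (16-bit field, can combine; each category has a subject bit / object bit):
   6    0x0001 / 0x0002  SemanticHuman    - human subjects/agents / human objects
   7    0x0004 / 0x0008  SemanticAnimate  - non-human animate subjects / objects
   8    0x0010 / 0x0020  SemanticAbstract - abstract/conceptual subjects / objects
   9    0x0040 / 0x0080  SemanticPlace    - place/location subjects / objects
  10    0x0100 / 0x0200  SemanticArtifact - artifact/made-object subjects / objects
  11    0x0400 / 0x0800  SemanticNatural  - natural inanimate subjects / objects
  12    0x1000 / 0x2000  SemanticEvent    - event subjects / objects
  13    0x4000 / 0x8000  SemanticCollect  - collective/group subjects / objects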
  14  """
  15  import sys, os, subprocess, re
  16  
  17  DB      = os.environ.get("DB", os.path.expanduser("~/tmp/transdb-data"))
  18  WLIST   = os.environ.get("WLIST", os.path.join(DB, "wordlist.ja"))
  19  TRANSDB = os.environ.get("TRANSDB", "/tmp/transdb")
  20  TOP_N   = int(os.environ.get("TOP_N", "1500"))  # scan more to get 500 verbs
  21  OUT     = os.environ.get("OUT", os.path.join(DB, "verb_semantic_labels.tsv"))
  22  
  23  # Semantic flag bits
  24  # Semantic flags — 2 bits per category (subject | object).
  25  # Use subject bits for nouns (what they are), use both for verbs (what they take).
  26  Hs = 0x0001  # human subject
  27  Ho = 0x0002  # human object
  28  As = 0x0004  # animate subject
  29  Ao = 0x0008  # animate object
  30  Abs = 0x0010 # abstract subject
  31  Abo = 0x0020 # abstract object
  32  Ps = 0x0040  # place subject (rare)
  33  Po = 0x0080  # place object
  34  Ars = 0x0100 # artifact subject
  35  Aro = 0x0200 # artifact object
  36  Ns = 0x0400  # natural subject
  37  No = 0x0800  # natural object
  38  Es = 0x1000  # event subject
  39  Eo = 0x2000  # event object
  40  Cs = 0x4000  # collective subject
  41  Co = 0x8000  # collective object
  42  
  43  # EN keyword → semantic flags encoding SUBJECT and OBJECT types for the verb.
  44  SUBJECT_RULES = [
  45      # Human-subject, human-object (social/mental verbs)
  46      (Hs|Ho,  ["teach", "help", "meet", "greet", "thank", "praise", "blame",
  47                 "forgive", "marry", "divorce", "hire", "fire", "elect", "lead",
  48                 "follow", "trust", "love", "hate", "envy", "respect", "fear"]),
  49      # Human-subject only (cognitive/volitional, no specific object type)
  50      (Hs,     ["think", "believe", "know", "feel", "want", "decide", "promise",
  51                 "dream", "remember", "forget", "understand", "realize", "wonder",
  52                 "speak", "say", "ask", "answer", "read", "write", "study",
  53                 "work", "vote", "hope", "wish", "intend", "succeed", "fail",
  54                 "laugh", "cry", "smile", "apologize", "create", "invent",
  55                 "discover", "learn", "obey", "disobey", "doubt", "regret"]),
  56      # Human-subject, animate-object (hunting, training)
  57      (Hs|Ao,  ["hunt", "tame", "raise", "rescue", "chase", "pet", "domesticate"]),
  58      # Human+animate subject (basic animate actions)
  59      (Hs|As,  ["eat", "drink", "sleep", "wake", "run", "walk", "jump", "swim",
  60                 "breathe", "live", "die", "grow", "attack", "escape", "hide",
  61                 "seek", "fight", "bite", "scratch", "gather", "scatter", "feed",
  62                 "nurse", "bear", "age", "play", "rest"]),
  63      # Human+animate subject, artifact|natural object
  64      (Hs|As|Aro|No, ["carry", "hold", "touch", "take", "give", "receive"]),
  65      # Animate-only subject (animal sounds/behaviors)
  66      (As,     ["bark", "meow", "chirp", "roar", "neigh", "moo", "crow", "hiss",
  67                 "purr", "howl", "twitter", "coo", "quack", "grunt", "squeak",
  68                 "buzz", "sting", "shed", "hibernate", "migrate", "nest", "hatch",
  69                 "graze", "peck", "crawl", "slither", "leap", "pounce", "molt"]),
  70      # Artifact|natural subject (things breaking/changing state)
  71      (Ars|Ns, ["break", "crack", "shatter", "bend", "melt", "burn", "rust",
  72                 "rot", "decay", "corrode", "tear", "fold", "explode", "collapse",
  73                 "sink", "float", "bounce", "roll", "spin", "slide", "freeze",
  74                 "thaw", "evaporate", "condense", "dissolve", "oxidize"]),
  75      # Natural subject (phenomena)
  76      (Ns,     ["flow", "blow", "rain", "snow", "shine", "thunder", "erupt",
  77                 "quake", "flood", "bloom", "wither", "sprout", "ripen",
  78                 "rise", "set", "dawn", "wave", "ripple", "glitter", "fade",
  79                 "darken", "sparkle", "gleam"]),
  80      # Event/abstract subject
  81      (Es|Abs, ["occur", "happen", "begin", "end", "continue", "stop", "start",
  82                 "change", "develop", "spread", "increase", "decrease", "expand",
  83                 "contract", "emerge", "disappear", "exist", "remain", "pass",
  84                 "last", "elapse", "improve", "worsen", "fall", "drop"]),
  85      # Abstract subject (meaning/relation)
  86      (Abs,    ["mean", "represent", "indicate", "suggest", "imply", "matter",
  87                 "concern", "relate", "differ", "resemble", "equal", "exceed",
  88                 "lack", "include", "exclude", "depend", "precede"]),
  89      # Artifact subject (machines operating)
  90      (Ars,    ["operate", "function", "compute", "print", "record", "ring", "tick"]),
  91      # Collective/human subject, various objects
  92      (Hs|Cs,  ["organize", "cooperate", "compete", "agree", "disagree",
  93                 "negotiate", "trade", "govern", "rule", "protest", "celebrate",
  94                 "assemble", "demonstrate", "mourn", "unite"]),
  95  ]
  96  
  97  # Dictionary endings that indicate JA dictionary-form verbs
  98  VERB_ENDINGS = ['る', 'く', 'ぐ', 'す', 'つ', 'ぬ', 'ぶ', 'む', 'う',
  99                  'する', 'くる', 'いる', 'ある', 'なる', 'れる', 'せる']
 100  
 101  def is_likely_verb(form):
 102      """Heuristic: does the form look like a JA dictionary-form verb?"""
 103      for e in VERB_ENDINGS:
 104          if form.endswith(e) and len(form) >= 2:
 105              return True
 106      return False
 107  
 108  def classify_verb(en_gloss):
 109      """Classify semantic flags from EN gloss."""
 110      text = en_gloss.lower()
 111      words = set(re.findall(r'[a-z]+', text))
 112      flags = 0
 113      for flag, keywords in SUBJECT_RULES:
 114          if any(kw in words or any(w.startswith(kw) for w in words) for kw in keywords):
 115              flags |= flag
 116      return flags
 117  
 118  def translate(ja_word):
 119      """Translate JA→EN via transdb CLI."""
 120      try:
 121          r = subprocess.run(
 122              [TRANSDB, "translate", "-src", "ja", "-dst", "en", "-db", DB, ja_word],
 123              capture_output=True, text=True, timeout=5)
 124          return r.stdout.strip()
 125      except Exception:
 126          return ""
 127  
 128  def main():
 129      entries = []
 130      with open(WLIST) as f:
 131          for line in f:
 132              line = line.rstrip('\n')
 133              tab = line.find('\t')
 134              if tab < 0:
 135                  continue
 136              form = line[:tab]
 137              freq = int(line[tab+1:]) if tab >= 0 else 0
 138              if is_likely_verb(form):
 139                  entries.append((form, freq))
 140              if len(entries) >= TOP_N:
 141                  break
 142  
 143      print(f"Found {len(entries)} verb-like forms, processing top 500...", file=sys.stderr)
 144  
 145      results = []
 146      for i, (form, freq) in enumerate(entries[:500]):
 147          en = translate(form)
 148          flags = classify_verb(en) if en else 0
 149          results.append((form, flags, en, freq))
 150          if (i+1) % 50 == 0:
 151              print(f"  {i+1}/500: {form} → {en!r} → flags=0x{flags:02x}", file=sys.stderr)
 152  
 153      with open(OUT, 'w') as f:
 154          f.write("# JA_verb\tsemantic_flags_hex\ten_translation\tfrequency\n")
 155          for form, flags, en, freq in results:
 156              f.write(f"{form}\t0x{flags:02x}\t{en}\t{freq}\n")
 157  
 158      print(f"\nWrote {len(results)} entries to {OUT}", file=sys.stderr)
 159      total = sum(1 for _, f, _, _ in results if f != 0)
 160      print(f"  {total}/{len(results)} labeled (non-zero flags)", file=sys.stderr)
 161  
 162      # Show distribution
 163      flag_names = {0x01:"human",0x02:"animate",0x04:"abstract",0x08:"place",
 164                    0x10:"artifact",0x20:"natural",0x40:"event",0x80:"collective"}
 165      for bit, name in flag_names.items():
 166          count = sum(1 for _, f, _, _ in results if f & bit)
 167          if count:
 168              print(f"  {name}: {count}", file=sys.stderr)
 169  
 170  if __name__ == "__main__":
 171      main()
 172