label.py raw

   1  #!/usr/bin/env python3
   2  """
   3  Extract top JA nouns, translate to EN, classify semantically, output TSV.
   4  Semantic codes:
   5    0 = unspecified
   6    1 = human (person, teacher, child)
   7    2 = animate non-human (animal, bird, fish)
   8    3 = abstract concept (freedom, idea, love)
   9    4 = place/location (city, country, room)
  10    5 = artifact (tool, vehicle, building)
  11    6 = natural inanimate (mountain, water, stone)
  12    7 = event (war, meeting, accident)
  13    8 = collective/organization (group, company, team)
  14  """
  15  import sys, os, subprocess, re
  16  
# Runtime configuration. Every value can be overridden through the
# environment variable of the same name.
DB      = os.environ.get("DB", os.path.expanduser("~/tmp/transdb-data"))  # passed to the CLI as the -db argument
WLIST   = os.environ.get("WLIST", os.path.join(DB, "wordlist.ja"))  # word list read by main(), one "form<TAB>frequency" per line
TRANSDB = os.environ.get("TRANSDB", "/tmp/transdb")  # executable invoked by translate()
TOP_N   = int(os.environ.get("TOP_N", "500"))  # upper bound on the number of word-list entries processed
OUT     = os.environ.get("OUT", os.path.join(DB, "semantic_labels.tsv"))  # TSV file written by main()
  22  
# Simple EN keyword rules — covers most unambiguous cases.
# Maps each semantic code (1-8, see the module docstring) to the lower-case
# English keywords that vote for it in classify_en(). Code 0 ("unspecified")
# has no keywords; it is the fallback when nothing matches.
#
# NOTE(review): some keywords appear under two codes ("police" in 1 and 8,
# "rice" in 2 and 5, "nation"/"school"/"church" in 4 and 8). Each occurrence
# scores for its own category, and classify_en() resolves ties in favour of
# the category listed first here — confirm that is the intended outcome.
# NOTE(review): the module docstring gives "building" as an example of
# artifact (5) and "mountain" as natural inanimate (6), but both keywords
# are listed under place (4) below — confirm which is intended.
RULES = {
    1: [  # human
        "person", "people", "man", "woman", "child", "boy", "girl", "human",
        "teacher", "student", "doctor", "nurse", "soldier", "king", "queen",
        "father", "mother", "parent", "friend", "enemy", "citizen", "worker",
        "author", "writer", "singer", "actor", "player", "athlete", "artist",
        "politician", "official", "minister", "police", "guard", "servant",
        "husband", "wife", "brother", "sister", "son", "daughter", "baby",
        "adult", "youth", "elder", "ancestor", "descendant", "neighbor",
        "stranger", "guest", "host", "leader", "follower", "hero", "villain",
        "criminal", "victim", "survivor", "refugee", "immigrant",
    ],
    2: [  # animate non-human (this list also holds plants: "plant", "tree", ...)
        "animal", "creature", "beast", "bird", "fish", "insect", "bug",
        "cat", "dog", "horse", "cow", "pig", "sheep", "goat", "rabbit",
        "mouse", "rat", "bear", "wolf", "fox", "deer", "lion", "tiger",
        "elephant", "monkey", "snake", "frog", "turtle", "crab", "shrimp",
        "whale", "dolphin", "shark", "eagle", "crow", "sparrow", "dove",
        "butterfly", "bee", "ant", "spider", "worm", "dragon", "monster",
        "plant", "tree", "flower", "grass", "rice", "seed",
    ],
    3: [  # abstract
        "idea", "concept", "thought", "mind", "spirit", "soul", "feeling",
        "emotion", "love", "hate", "fear", "hope", "dream", "memory",
        "freedom", "peace", "justice", "truth", "beauty", "good", "evil",
        "right", "wrong", "law", "rule", "theory", "principle", "belief",
        "knowledge", "wisdom", "culture", "tradition", "custom", "habit",
        "language", "word", "meaning", "reason", "cause", "effect",
        "value", "worth", "purpose", "goal", "plan", "method",
    ],
    4: [  # place
        "place", "location", "area", "region", "country", "nation", "state",
        "city", "town", "village", "district", "street", "road", "path",
        "building", "house", "room", "hall", "floor", "office", "school",
        "hospital", "church", "temple", "shrine", "park", "garden",
        "forest", "mountain", "hill", "valley", "river", "lake", "sea",
        "ocean", "island", "desert", "field", "farm", "port", "station",
        "airport", "market", "shop", "store", "restaurant", "hotel",
        "universe", "world", "earth", "sky", "heaven", "hell",
    ],
    5: [  # artifact
        "tool", "device", "machine", "engine", "car", "vehicle", "ship",
        "plane", "train", "bicycle", "boat", "weapon", "sword", "gun",
        "knife", "book", "paper", "pen", "pencil", "letter", "document",
        "phone", "computer", "screen", "camera", "television", "radio",
        "clock", "watch", "key", "lock", "door", "window", "chair",
        "table", "bed", "clothes", "dress", "shoe", "hat", "bag",
        "food", "drink", "bread", "rice", "medicine", "drug",
        "money", "coin", "ticket", "card", "flag",
    ],
    6: [  # natural inanimate
        "stone", "rock", "sand", "soil", "mud", "dust", "ash",
        "water", "rain", "snow", "ice", "fire", "smoke", "air", "wind",
        "cloud", "star", "sun", "moon", "light", "dark", "shadow",
        "wave", "tide", "storm", "thunder", "lightning", "earthquake",
        "metal", "gold", "silver", "iron", "copper", "wood", "coal",
        "oil", "gas", "salt", "sugar", "blood", "bone",
    ],
    7: [  # event
        "event", "incident", "accident", "disaster", "war", "battle",
        "fight", "conflict", "attack", "defense", "victory", "defeat",
        "meeting", "conference", "ceremony", "festival", "wedding",
        "election", "vote", "trial", "judgment", "punishment",
        "birth", "death", "marriage", "divorce", "departure", "arrival",
        "change", "revolution", "reform", "movement", "crisis",
    ],
    8: [  # collective/organization
        "group", "team", "party", "club", "union", "organization",
        "company", "firm", "corporation", "government", "army", "navy",
        "police", "family", "tribe", "nation", "community", "society",
        "committee", "council", "parliament", "congress", "court",
        "school", "university", "church", "religion",
    ],
}
  98  
  99  def classify_en(en_words):
 100      """Match EN translation words against category rules."""
 101      text = " ".join(en_words).lower()
 102      words = re.findall(r'[a-z]+', text)
 103      scores = {cat: 0 for cat in RULES}
 104      for cat, keywords in RULES.items():
 105          for kw in keywords:
 106              if kw in words or any(w.startswith(kw) for w in words):
 107                  scores[cat] += 1
 108      best_cat = max(scores, key=lambda c: scores[c])
 109      return best_cat if scores[best_cat] > 0 else 0
 110  
 111  def translate(ja_word):
 112      """Translate JA→EN via transdb CLI."""
 113      try:
 114          r = subprocess.run(
 115              [TRANSDB, "translate", "-src", "ja", "-dst", "en", "-db", DB, ja_word],
 116              capture_output=True, text=True, timeout=5)
 117          return r.stdout.strip()
 118      except Exception:
 119          return ""
 120  
 121  def main():
 122      # Load wordlist, filter to nouns (skip short hiragana-only strings — mostly particles).
 123      entries = []
 124      with open(WLIST) as f:
 125          for line in f:
 126              line = line.rstrip('\n')
 127              tab = line.find('\t')
 128              if tab < 0:
 129                  continue
 130              form, freq = line[:tab], int(line[tab+1:]) if tab >= 0 else 0
 131              # Skip pure hiragana (usually particles/function words) and very short forms.
 132              has_kanji = any('一' <= c <= '鿿' or '㐀' <= c <= '䶿' for c in form)
 133              has_kata  = any('゠' <= c <= 'ヿ' for c in form)
 134              if not (has_kanji or has_kata):
 135                  continue
 136              if len(form) < 2:
 137                  continue
 138              entries.append((form, freq))
 139              if len(entries) >= TOP_N:
 140                  break
 141  
 142      print(f"Processing {len(entries)} nouns...", file=sys.stderr)
 143  
 144      results = []
 145      for i, (form, freq) in enumerate(entries):
 146          en = translate(form)
 147          cat = classify_en(en.split()) if en else 0
 148          results.append((form, cat, en, freq))
 149          if (i+1) % 50 == 0:
 150              print(f"  {i+1}/{len(entries)}: {form} → {en} → cat={cat}", file=sys.stderr)
 151  
 152      with open(OUT, 'w') as f:
 153          f.write("# JA_form\tsemantic_code\ten_translation\tfrequency\n")
 154          for form, cat, en, freq in results:
 155              f.write(f"{form}\t{cat}\t{en}\t{freq}\n")
 156  
 157      cats = {}
 158      for _, cat, _, _ in results:
 159          cats[cat] = cats.get(cat, 0) + 1
 160      print(f"\nWrote {len(results)} entries to {OUT}", file=sys.stderr)
 161      names = {0:"unspecified",1:"human",2:"animate",3:"abstract",4:"place",
 162               5:"artifact",6:"natural",7:"event",8:"collective"}
 163      for c in sorted(cats):
 164          print(f"  {names.get(c,str(c))}: {cats[c]}", file=sys.stderr)
 165  
# Run the labelling pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()
 168