#!/usr/bin/env python3
"""
Extract top JA nouns, translate to EN, classify semantically, output TSV.

Semantic codes:
  0 = unspecified
  1 = human (person, teacher, child)
  2 = animate non-human (animal, bird, fish)
  3 = abstract concept (freedom, idea, love)
  4 = place/location (city, country, room)
  5 = artifact (tool, vehicle, building)
  6 = natural inanimate (mountain, water, stone)
  7 = event (war, meeting, accident)
  8 = collective/organization (group, company, team)

Configuration is read from environment variables when the module is loaded:
  DB       data directory passed to the translator (default ~/tmp/transdb-data)
  WLIST    input wordlist, one "form<TAB>frequency" entry per line
           (default $DB/wordlist.ja)
  TRANSDB  path of the translator executable (default /tmp/transdb)
  TOP_N    maximum number of wordlist entries to keep (default 500)
  OUT      output TSV path (default $DB/semantic_labels.tsv)
"""

import os
import re
import subprocess
import sys
from collections import Counter
from typing import Iterable, List, Set, Tuple

DB = os.environ.get("DB", os.path.expanduser("~/tmp/transdb-data"))
WLIST = os.environ.get("WLIST", os.path.join(DB, "wordlist.ja"))
TRANSDB = os.environ.get("TRANSDB", "/tmp/transdb")
TOP_N = int(os.environ.get("TOP_N", "500"))
OUT = os.environ.get("OUT", os.path.join(DB, "semantic_labels.tsv"))

# Simple EN keyword rules — covers most unambiguous cases.
# NOTE: a few keywords appear under more than one code ("rice", "police",
# "nation", "school", "church"); classify_en() resolves ties in favour of the
# code listed first, i.e. the lowest one.
RULES = {
    1: [  # human
        "person", "people", "man", "woman", "child", "boy", "girl", "human",
        "teacher", "student", "doctor", "nurse", "soldier", "king", "queen",
        "father", "mother", "parent", "friend", "enemy", "citizen", "worker",
        "author", "writer", "singer", "actor", "player", "athlete", "artist",
        "politician", "official", "minister", "police", "guard", "servant",
        "husband", "wife", "brother", "sister", "son", "daughter", "baby",
        "adult", "youth", "elder", "ancestor", "descendant", "neighbor",
        "stranger", "guest", "host", "leader", "follower", "hero", "villain",
        "criminal", "victim", "survivor", "refugee", "immigrant",
    ],
    2: [  # animate non-human
        "animal", "creature", "beast", "bird", "fish", "insect", "bug",
        "cat", "dog", "horse", "cow", "pig", "sheep", "goat", "rabbit",
        "mouse", "rat", "bear", "wolf", "fox", "deer", "lion", "tiger",
        "elephant", "monkey", "snake", "frog", "turtle", "crab", "shrimp",
        "whale", "dolphin", "shark", "eagle", "crow", "sparrow", "dove",
        "butterfly", "bee", "ant", "spider", "worm", "dragon", "monster",
        "plant", "tree", "flower", "grass", "rice", "seed",
    ],
    3: [  # abstract
        "idea", "concept", "thought", "mind", "spirit", "soul", "feeling",
        "emotion", "love", "hate", "fear", "hope", "dream", "memory",
        "freedom", "peace", "justice", "truth", "beauty", "good", "evil",
        "right", "wrong", "law", "rule", "theory", "principle", "belief",
        "knowledge", "wisdom", "culture", "tradition", "custom", "habit",
        "language", "word", "meaning", "reason", "cause", "effect",
        "value", "worth", "purpose", "goal", "plan", "method",
    ],
    4: [  # place
        "place", "location", "area", "region", "country", "nation", "state",
        "city", "town", "village", "district", "street", "road", "path",
        "building", "house", "room", "hall", "floor", "office", "school",
        "hospital", "church", "temple", "shrine", "park", "garden", "forest",
        "mountain", "hill", "valley", "river", "lake", "sea", "ocean",
        "island", "desert", "field", "farm", "port", "station", "airport",
        "market", "shop", "store", "restaurant", "hotel", "universe",
        "world", "earth", "sky", "heaven", "hell",
    ],
    5: [  # artifact
        "tool", "device", "machine", "engine", "car", "vehicle", "ship",
        "plane", "train", "bicycle", "boat", "weapon", "sword", "gun",
        "knife", "book", "paper", "pen", "pencil", "letter", "document",
        "phone", "computer", "screen", "camera", "television", "radio",
        "clock", "watch", "key", "lock", "door", "window", "chair", "table",
        "bed", "clothes", "dress", "shoe", "hat", "bag", "food", "drink",
        "bread", "rice", "medicine", "drug", "money", "coin", "ticket",
        "card", "flag",
    ],
    6: [  # natural inanimate
        "stone", "rock", "sand", "soil", "mud", "dust", "ash", "water",
        "rain", "snow", "ice", "fire", "smoke", "air", "wind", "cloud",
        "star", "sun", "moon", "light", "dark", "shadow", "wave", "tide",
        "storm", "thunder", "lightning", "earthquake", "metal", "gold",
        "silver", "iron", "copper", "wood", "coal", "oil", "gas", "salt",
        "sugar", "blood", "bone",
    ],
    7: [  # event
        "event", "incident", "accident", "disaster", "war", "battle",
        "fight", "conflict", "attack", "defense", "victory", "defeat",
        "meeting", "conference", "ceremony", "festival", "wedding",
        "election", "vote", "trial", "judgment", "punishment", "birth",
        "death", "marriage", "divorce", "departure", "arrival", "change",
        "revolution", "reform", "movement", "crisis",
    ],
    8: [  # collective/organization
        "group", "team", "party", "club", "union", "organization",
        "company", "firm", "corporation", "government", "army", "navy",
        "police", "family", "tribe", "nation", "community", "society",
        "committee", "council", "parliament", "congress", "court", "school",
        "university", "church", "religion",
    ],
}

# Human-readable names for the semantic codes, used in the final summary.
CATEGORY_NAMES = {
    0: "unspecified", 1: "human", 2: "animate", 3: "abstract", 4: "place",
    5: "artifact", 6: "natural", 7: "event", 8: "collective",
}

_WORD_RE = re.compile(r"[a-z]+")

# Plurals that no suffix rule in _singular_forms() can recover.
_IRREGULAR_PLURALS = {
    "men": "man",
    "women": "woman",
    "children": "child",
    "mice": "mouse",
    "crises": "crisis",
}

# Stem endings after which English forms the plural with "-es".
_ES_STEM_ENDINGS = ("s", "x", "z", "ch", "sh", "o")


def _singular_forms(word: str) -> Set[str]:
    """Return *word* plus the singular spellings it could be a plural of."""
    forms = {word}
    irregular = _IRREGULAR_PLURALS.get(word)
    if irregular is not None:
        forms.add(irregular)
    if len(word) > 3 and word.endswith("ies"):
        forms.add(word[:-3] + "y")  # cities -> city
    if len(word) > 3 and word.endswith("ves"):
        stem = word[:-3]
        forms.update((stem + "f", stem + "fe"))  # wolves -> wolf, knives -> knife
    if word.endswith("es") and word[:-2].endswith(_ES_STEM_ENDINGS):
        forms.add(word[:-2])  # churches -> church, foxes -> fox
    if len(word) > 1 and word.endswith("s"):
        forms.add(word[:-1])  # dogs -> dog
    return forms


def classify_en(en_words: Iterable[str]) -> int:
    """Match EN translation words against category rules.

    The words are lower-cased and split into alphabetic tokens. A keyword
    counts as matched when it equals a token or a singular form of it. Bare
    prefix matching is deliberately avoided: it classified "many" as "man",
    "song" as "son" and "season" as "sea", while still missing plurals such
    as "cities" or "women".

    Args:
        en_words: words of the English translation.

    Returns:
        The semantic code whose keyword list has the most distinct matches,
        the lowest code winning ties, or 0 when nothing matches.
    """
    text = " ".join(en_words).lower()
    forms: Set[str] = set()
    for token in _WORD_RE.findall(text):
        forms |= _singular_forms(token)
    if not forms:
        return 0

    scores = {
        cat: sum(1 for kw in keywords if kw in forms)
        for cat, keywords in RULES.items()
    }
    # max() keeps the first maximal key, so ties go to the code listed first.
    best_cat = max(scores, key=scores.get)
    return best_cat if scores[best_cat] > 0 else 0


def translate(ja_word: str) -> str:
    """Translate JA→EN via transdb CLI.

    Args:
        ja_word: the Japanese form to translate.

    Returns:
        The translator's stdout with every run of whitespace collapsed to a
        single space, so the value always fits in one TSV field. Returns ""
        when the translator cannot be started, times out (5 s), or is given
        an argument it cannot be invoked with.
    """
    try:
        proc = subprocess.run(
            [TRANSDB, "translate", "-src", "ja", "-dst", "en", "-db", DB, ja_word],
            capture_output=True,
            encoding="utf-8",
            errors="replace",  # undecodable output must not abort the run
            timeout=5,
        )
    except (OSError, subprocess.SubprocessError, ValueError):
        # Best effort: a missing binary, a timeout or an unusable argument
        # yields "no translation" rather than a crash.
        return ""
    # NOTE(review): the exit status is not inspected, as before — confirm
    # whether transdb can print to stdout on failure.
    return " ".join(proc.stdout.split())


def _is_noun_candidate(form: str) -> bool:
    """Tell whether *form* has 2+ characters and some kanji or katakana."""
    if len(form) < 2:
        return False
    return any(
        "\u4e00" <= ch <= "\u9fff"      # CJK Unified Ideographs
        or "\u3400" <= ch <= "\u4dbf"   # CJK Unified Ideographs Extension A
        or "\u30a0" <= ch <= "\u30ff"   # Katakana
        for ch in form
    )


def _load_entries(path: str, limit: int) -> List[Tuple[str, int]]:
    """Read up to *limit* (form, frequency) noun candidates from *path*."""
    entries: List[Tuple[str, int]] = []
    if limit <= 0:
        return entries
    with open(path, encoding="utf-8") as f:
        for lineno, raw in enumerate(f, start=1):
            form, sep, freq_text = raw.rstrip("\n").partition("\t")
            if not sep:
                continue  # no tab: not a "form<TAB>frequency" line
            # Skip pure hiragana (usually particles/function words) and very
            # short forms.
            if not _is_noun_candidate(form):
                continue
            try:
                freq = int(freq_text)
            except ValueError:
                print(
                    f"{path}:{lineno}: skipping line with non-integer "
                    f"frequency {freq_text!r}",
                    file=sys.stderr,
                )
                continue
            entries.append((form, freq))
            if len(entries) >= limit:
                break
    return entries


def main() -> None:
    """Label the first TOP_N noun candidates of WLIST and write them to OUT.

    Entries are taken in file order, so "top" assumes the wordlist is sorted
    by descending frequency (TODO confirm against the wordlist producer).
    Progress and a per-category summary go to stderr.

    Raises:
        OSError: if WLIST cannot be read or OUT cannot be written.
    """
    entries = _load_entries(WLIST, TOP_N)
    print(f"Processing {len(entries)} nouns...", file=sys.stderr)

    results = []
    for done, (form, freq) in enumerate(entries, start=1):
        en = translate(form)
        cat = classify_en(en.split()) if en else 0
        results.append((form, cat, en, freq))
        if done % 50 == 0:
            print(f" {done}/{len(entries)}: {form} → {en} → cat={cat}",
                  file=sys.stderr)

    with open(OUT, "w", encoding="utf-8") as f:
        f.write("# JA_form\tsemantic_code\ten_translation\tfrequency\n")
        f.writelines(
            f"{form}\t{cat}\t{en}\t{freq}\n" for form, cat, en, freq in results
        )

    counts = Counter(cat for _, cat, _, _ in results)
    print(f"\nWrote {len(results)} entries to {OUT}", file=sys.stderr)
    for code in sorted(counts):
        print(f" {CATEGORY_NAMES.get(code, str(code))}: {counts[code]}",
              file=sys.stderr)


if __name__ == "__main__":
    main()