#!/usr/bin/env python3
"""
Extract top JA verbs, translate, classify with semantic bitflags, output TSV.

Semantic flags form a 16-bit field whose bits can be combined. Every category
owns two adjacent bits: the lower one describes the verb's SUBJECT, the upper
one describes its OBJECT.

    category     subject   object
    human        0x0001    0x0002
    animate      0x0004    0x0008
    abstract     0x0010    0x0020
    place        0x0040    0x0080
    artifact     0x0100    0x0200
    natural      0x0400    0x0800
    event        0x1000    0x2000
    collective   0x4000    0x8000

Configuration is read from the environment variables DB, WLIST, TRANSDB,
TOP_N, MAX_VERBS and OUT (see the constants below for their defaults).
"""

import os
import re
import subprocess
import sys

DB = os.environ.get("DB", os.path.expanduser("~/tmp/transdb-data"))
WLIST = os.environ.get("WLIST", os.path.join(DB, "wordlist.ja"))
TRANSDB = os.environ.get("TRANSDB", "/tmp/transdb")
# Upper bound on the number of verb-like forms collected from the word list.
TOP_N = int(os.environ.get("TOP_N", "1500"))
# How many of the collected forms are actually translated and labelled.
MAX_VERBS = int(os.environ.get("MAX_VERBS", "500"))
OUT = os.environ.get("OUT", os.path.join(DB, "verb_semantic_labels.tsv"))

# Semantic flags — 2 bits per category (subject | object).
# Use subject bits for nouns (what they are), use both for verbs (what they take).
Hs = 0x0001   # human subject
Ho = 0x0002   # human object
As = 0x0004   # animate subject
Ao = 0x0008   # animate object
Abs = 0x0010  # abstract subject
Abo = 0x0020  # abstract object
Ps = 0x0040   # place subject (rare)
Po = 0x0080   # place object
Ars = 0x0100  # artifact subject
Aro = 0x0200  # artifact object
Ns = 0x0400   # natural subject
No = 0x0800   # natural object
Es = 0x1000   # event subject
Eo = 0x2000   # event object
Cs = 0x4000   # collective subject
Co = 0x8000   # collective object

# Human-readable label for every flag bit; used for the summary that main()
# prints. Must stay in sync with the bit definitions above.
FLAG_NAMES = {
    Hs: "human-subj", Ho: "human-obj",
    As: "animate-subj", Ao: "animate-obj",
    Abs: "abstract-subj", Abo: "abstract-obj",
    Ps: "place-subj", Po: "place-obj",
    Ars: "artifact-subj", Aro: "artifact-obj",
    Ns: "natural-subj", No: "natural-obj",
    Es: "event-subj", Eo: "event-obj",
    Cs: "collective-subj", Co: "collective-obj",
}

# EN keyword → semantic flags encoding SUBJECT and OBJECT types for the verb.
SUBJECT_RULES = [
    # Human-subject, human-object (social/mental verbs)
    (Hs|Ho, ["teach", "help", "meet", "greet", "thank", "praise", "blame",
             "forgive", "marry", "divorce", "hire", "fire", "elect", "lead",
             "follow", "trust", "love", "hate", "envy", "respect", "fear"]),
    # Human-subject only (cognitive/volitional, no specific object type)
    (Hs, ["think", "believe", "know", "feel", "want", "decide", "promise",
          "dream", "remember", "forget", "understand", "realize", "wonder",
          "speak", "say", "ask", "answer", "read", "write", "study", "work",
          "vote", "hope", "wish", "intend", "succeed", "fail", "laugh", "cry",
          "smile", "apologize", "create", "invent", "discover", "learn",
          "obey", "disobey", "doubt", "regret"]),
    # Human-subject, animate-object (hunting, training)
    (Hs|Ao, ["hunt", "tame", "raise", "rescue", "chase", "pet", "domesticate"]),
    # Human+animate subject (basic animate actions)
    (Hs|As, ["eat", "drink", "sleep", "wake", "run", "walk", "jump", "swim",
             "breathe", "live", "die", "grow", "attack", "escape", "hide",
             "seek", "fight", "bite", "scratch", "gather", "scatter", "feed",
             "nurse", "bear", "age", "play", "rest"]),
    # Human+animate subject, artifact|natural object
    (Hs|As|Aro|No, ["carry", "hold", "touch", "take", "give", "receive"]),
    # Animate-only subject (animal sounds/behaviors)
    (As, ["bark", "meow", "chirp", "roar", "neigh", "moo", "crow", "hiss",
          "purr", "howl", "twitter", "coo", "quack", "grunt", "squeak", "buzz",
          "sting", "shed", "hibernate", "migrate", "nest", "hatch", "graze",
          "peck", "crawl", "slither", "leap", "pounce", "molt"]),
    # Artifact|natural subject (things breaking/changing state)
    (Ars|Ns, ["break", "crack", "shatter", "bend", "melt", "burn", "rust",
              "rot", "decay", "corrode", "tear", "fold", "explode", "collapse",
              "sink", "float", "bounce", "roll", "spin", "slide", "freeze",
              "thaw", "evaporate", "condense", "dissolve", "oxidize"]),
    # Natural subject (phenomena)
    (Ns, ["flow", "blow", "rain", "snow", "shine", "thunder", "erupt", "quake",
          "flood", "bloom", "wither", "sprout", "ripen", "rise", "set", "dawn",
          "wave", "ripple", "glitter", "fade", "darken", "sparkle", "gleam"]),
    # Event/abstract subject
    (Es|Abs, ["occur", "happen", "begin", "end", "continue", "stop", "start",
              "change", "develop", "spread", "increase", "decrease", "expand",
              "contract", "emerge", "disappear", "exist", "remain", "pass",
              "last", "elapse", "improve", "worsen", "fall", "drop"]),
    # Abstract subject (meaning/relation)
    (Abs, ["mean", "represent", "indicate", "suggest", "imply", "matter",
           "concern", "relate", "differ", "resemble", "equal", "exceed",
           "lack", "include", "exclude", "depend", "precede"]),
    # Artifact subject (machines operating)
    (Ars, ["operate", "function", "compute", "print", "record", "ring", "tick"]),
    # Collective/human subject, various objects
    (Hs|Cs, ["organize", "cooperate", "compete", "agree", "disagree",
             "negotiate", "trade", "govern", "rule", "protest", "celebrate",
             "assemble", "demonstrate", "mourn", "unite"]),
]

# Dictionary endings that indicate JA dictionary-form verbs
VERB_ENDINGS = ['る', 'く', 'ぐ', 'す', 'つ', 'ぬ', 'ぶ', 'む', 'う',
                'する', 'くる', 'いる', 'ある', 'なる', 'れる', 'せる']

_VOWELS = frozenset("aeiou")
_WORD_RE = re.compile(r'[a-z]+')
# Characters that would break a row if written verbatim into a TSV field.
_TSV_BREAKERS = re.compile(r'[\t\r\n]+')


def _inflections(verb):
    """Return *verb* together with its regular English inflected forms.

    Only rule-based forms are produced (-s/-es, -ed/-d, -ing, e-dropping,
    y→ies/ied, ie→ying, final-consonant doubling); irregular forms such as
    "wrote" are not covered. Doubling is applied liberally, so some generated
    strings are not real words — they simply never match anything.
    """
    forms = {verb, verb + "s", verb + "ed", verb + "ing"}
    if verb.endswith(("s", "x", "z", "ch", "sh", "o")):
        forms.add(verb + "es")                      # teach → teaches
    if verb.endswith("ie"):
        forms.add(verb[:-2] + "ying")               # die → dying
    if verb.endswith("e"):
        forms.add(verb + "d")                       # tame → tamed
        forms.add(verb[:-1] + "ing")                # hide → hiding
    elif verb.endswith("y") and len(verb) > 1 and verb[-2] not in _VOWELS:
        forms.add(verb[:-1] + "ies")                # study → studies
        forms.add(verb[:-1] + "ied")                # carry → carried
    elif (len(verb) >= 2 and verb[-1] not in _VOWELS
          and verb[-1] not in "wxy" and verb[-2] in _VOWELS):
        forms.add(verb + verb[-1] + "ed")           # stop → stopped
        forms.add(verb + verb[-1] + "ing")          # run → running
    return forms


def _build_keyword_index(rules):
    """Map every inflected keyword form to the OR of the flags of its rules."""
    index = {}
    for flag, keywords in rules:
        for keyword in keywords:
            for form in _inflections(keyword):
                index[form] = index.get(form, 0) | flag
    return index


# Built once at import time so classify_verb() is a plain dict lookup per word.
_KEYWORD_FLAGS = _build_keyword_index(SUBJECT_RULES)


def is_likely_verb(form):
    """Heuristic: does the form look like a JA dictionary-form verb?

    Returns True when *form* is at least two characters long and ends with
    one of VERB_ENDINGS. This is purely orthographic; no dictionary lookup
    is involved.
    """
    return len(form) >= 2 and form.endswith(tuple(VERB_ENDINGS))


def classify_verb(en_gloss):
    """Classify semantic flags from EN gloss.

    The gloss is lower-cased and split into ASCII-letter words; each word is
    looked up among the SUBJECT_RULES keywords and their regular inflections.
    Matching is on whole words, so "settle" no longer triggers the "set"
    rule, while "hiding" and "carried" now reach "hide" and "carry".

    Returns the OR of the flags of every matching rule, or 0 if none match.
    """
    flags = 0
    for word in set(_WORD_RE.findall(en_gloss.lower())):
        flags |= _KEYWORD_FLAGS.get(word, 0)
    return flags


def translate(ja_word):
    """Translate JA→EN via transdb CLI.

    Runs the TRANSDB executable with a 5 second timeout and returns its
    stripped standard output. Best effort: if the process cannot be started,
    times out, or its output cannot be decoded, an empty string is returned.
    The exit status is not inspected.
    """
    try:
        proc = subprocess.run(
            [TRANSDB, "translate", "-src", "ja", "-dst", "en", "-db", DB, ja_word],
            capture_output=True, text=True, timeout=5)
    except (OSError, ValueError, subprocess.SubprocessError):
        # OSError: executable missing / not runnable; SubprocessError covers
        # TimeoutExpired; ValueError covers undecodable output and bad args.
        return ""
    return proc.stdout.strip()


def _read_verb_candidates(path, limit):
    """Read "form<TAB>frequency" lines and keep those that look like verbs.

    Lines without a tab are ignored; lines whose frequency is not an integer
    are skipped and counted. Reading stops once *limit* forms are collected.

    Returns (entries, skipped) where entries is a list of (form, freq).
    """
    entries = []
    skipped = 0
    with open(path, encoding="utf-8") as f:
        for line in f:
            form, sep, count = line.rstrip('\n').partition('\t')
            if not sep:
                continue
            try:
                freq = int(count)
            except ValueError:
                skipped += 1
                continue
            if is_likely_verb(form):
                entries.append((form, freq))
                if len(entries) >= limit:
                    break
    return entries, skipped


def _write_tsv(path, results):
    """Write (form, flags, en, freq) rows to *path* as UTF-8 TSV."""
    with open(path, 'w', encoding="utf-8") as f:
        f.write("# JA_verb\tsemantic_flags_hex\ten_translation\tfrequency\n")
        for form, flags, en, freq in results:
            # A tab or newline inside the translation would corrupt the row.
            gloss = _TSV_BREAKERS.sub(" ", en)
            f.write(f"{form}\t0x{flags:02x}\t{gloss}\t{freq}\n")


def _print_distribution(results):
    """Print, per flag bit, how many results carry it (non-zero counts only)."""
    for bit, name in FLAG_NAMES.items():
        count = sum(1 for _, flags, _, _ in results if flags & bit)
        if count:
            print(f"  {name}: {count}", file=sys.stderr)


def main():
    """Collect verb-like forms from WLIST, label them and write OUT.

    Progress and summary information go to stderr; the TSV goes to OUT.
    """
    entries, skipped = _read_verb_candidates(WLIST, TOP_N)
    if skipped:
        print(f"Skipped {skipped} line(s) with a non-integer frequency",
              file=sys.stderr)

    selected = entries[:MAX_VERBS]
    total = len(selected)
    print(f"Found {len(entries)} verb-like forms, processing top {MAX_VERBS}...",
          file=sys.stderr)

    results = []
    for i, (form, freq) in enumerate(selected, start=1):
        en = translate(form)
        flags = classify_verb(en) if en else 0
        results.append((form, flags, en, freq))
        if i % 50 == 0:
            print(f"  {i}/{total}: {form} → {en!r} → flags=0x{flags:02x}",
                  file=sys.stderr)

    _write_tsv(OUT, results)

    print(f"\nWrote {len(results)} entries to {OUT}", file=sys.stderr)
    labeled = sum(1 for _, flags, _, _ in results if flags != 0)
    print(f"  {labeled}/{len(results)} labeled (non-zero flags)", file=sys.stderr)

    # Show distribution
    _print_distribution(results)


if __name__ == "__main__":
    main()