label_verbs.py raw
1 #!/usr/bin/env python3
2 """
3 Extract top JA verbs, translate, classify with semantic bitflags, output TSV.
4
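Semantic flags (16-bit field, can combine). Each category owns two adjacent
bits: the lower bit marks the subject role, the higher bit the object role.
0x0001 / 0x0002 SemanticHuman - human subjects/agents / human objects
0x0004 / 0x0008 SemanticAnimate - non-human animate subjects / objects
0x0010 / 0x0020 SemanticAbstract - abstract/conceptual subjects / objects
0x0040 / 0x0080 SemanticPlace - place/location subjects / objects
0x0100 / 0x0200 SemanticArtifact - artifact/made-object subjects / objects
0x0400 / 0x0800 SemanticNatural - natural inanimate subjects / objects
0x1000 / 0x2000 SemanticEvent - event subjects / objects
0x4000 / 0x8000 SemanticCollect - collective/group subjects / objects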
14 """
15 import sys, os, subprocess, re
16
# Runtime configuration; every value can be overridden through an environment
# variable of the same name.
DB = os.getenv("DB", os.path.expanduser("~/tmp/transdb-data"))
WLIST = os.getenv("WLIST", os.path.join(DB, "wordlist.ja"))
TRANSDB = os.getenv("TRANSDB", "/tmp/transdb")
# scan more to get 500 verbs
TOP_N = int(os.getenv("TOP_N", "1500"))
OUT = os.getenv("OUT", os.path.join(DB, "verb_semantic_labels.tsv"))
22
# Semantic flag bits, two per category: the lower bit of each pair is the
# subject role, the higher bit is the object role.
# Nouns use only the subject bits (what they are); verbs use both
# (what they take).
Hs = 1 << 0    # human subject
Ho = 1 << 1    # human object
As = 1 << 2    # animate subject
Ao = 1 << 3    # animate object
Abs = 1 << 4   # abstract subject
Abo = 1 << 5   # abstract object
Ps = 1 << 6    # place subject (rare)
Po = 1 << 7    # place object
Ars = 1 << 8   # artifact subject
Aro = 1 << 9   # artifact object
Ns = 1 << 10   # natural subject
No = 1 << 11   # natural object
Es = 1 << 12   # event subject
Eo = 1 << 13   # event object
Cs = 1 << 14   # collective subject
Co = 1 << 15   # collective object
42
# Keyword table used to label a verb from its English gloss.
# Each entry is (flags, keywords): when a gloss word starts with any of the
# keywords, the entry's subject/object flags are OR-ed into the verb's label,
# so several entries may contribute to one verb.
SUBJECT_RULES = [
    # Social/mental verbs: human subject acting on a human object.
    (Hs | Ho, [
        "teach", "help", "meet", "greet", "thank", "praise", "blame",
        "forgive", "marry", "divorce", "hire", "fire", "elect", "lead",
        "follow", "trust", "love", "hate", "envy", "respect", "fear",
    ]),
    # Cognitive/volitional verbs: human subject, no specific object type.
    (Hs, [
        "think", "believe", "know", "feel", "want", "decide", "promise",
        "dream", "remember", "forget", "understand", "realize", "wonder",
        "speak", "say", "ask", "answer", "read", "write", "study",
        "work", "vote", "hope", "wish", "intend", "succeed", "fail",
        "laugh", "cry", "smile", "apologize", "create", "invent",
        "discover", "learn", "obey", "disobey", "doubt", "regret",
    ]),
    # Hunting/training verbs: human subject, animate object.
    (Hs | Ao, [
        "hunt", "tame", "raise", "rescue", "chase", "pet", "domesticate",
    ]),
    # Basic animate actions: human or animate subject.
    (Hs | As, [
        "eat", "drink", "sleep", "wake", "run", "walk", "jump", "swim",
        "breathe", "live", "die", "grow", "attack", "escape", "hide",
        "seek", "fight", "bite", "scratch", "gather", "scatter", "feed",
        "nurse", "bear", "age", "play", "rest",
    ]),
    # Handling verbs: human or animate subject, artifact or natural object.
    (Hs | As | Aro | No, [
        "carry", "hold", "touch", "take", "give", "receive",
    ]),
    # Animal sounds and behaviours: animate subject only.
    (As, [
        "bark", "meow", "chirp", "roar", "neigh", "moo", "crow", "hiss",
        "purr", "howl", "twitter", "coo", "quack", "grunt", "squeak",
        "buzz", "sting", "shed", "hibernate", "migrate", "nest", "hatch",
        "graze", "peck", "crawl", "slither", "leap", "pounce", "molt",
    ]),
    # Things breaking or changing state: artifact or natural subject.
    (Ars | Ns, [
        "break", "crack", "shatter", "bend", "melt", "burn", "rust",
        "rot", "decay", "corrode", "tear", "fold", "explode", "collapse",
        "sink", "float", "bounce", "roll", "spin", "slide", "freeze",
        "thaw", "evaporate", "condense", "dissolve", "oxidize",
    ]),
    # Natural phenomena: natural subject.
    (Ns, [
        "flow", "blow", "rain", "snow", "shine", "thunder", "erupt",
        "quake", "flood", "bloom", "wither", "sprout", "ripen",
        "rise", "set", "dawn", "wave", "ripple", "glitter", "fade",
        "darken", "sparkle", "gleam",
    ]),
    # Occurrence and change: event or abstract subject.
    (Es | Abs, [
        "occur", "happen", "begin", "end", "continue", "stop", "start",
        "change", "develop", "spread", "increase", "decrease", "expand",
        "contract", "emerge", "disappear", "exist", "remain", "pass",
        "last", "elapse", "improve", "worsen", "fall", "drop",
    ]),
    # Meaning and relation: abstract subject.
    (Abs, [
        "mean", "represent", "indicate", "suggest", "imply", "matter",
        "concern", "relate", "differ", "resemble", "equal", "exceed",
        "lack", "include", "exclude", "depend", "precede",
    ]),
    # Machines operating: artifact subject.
    (Ars, [
        "operate", "function", "compute", "print", "record", "ring", "tick",
    ]),
    # Group activity: human or collective subject, various objects.
    (Hs | Cs, [
        "organize", "cooperate", "compete", "agree", "disagree",
        "negotiate", "trade", "govern", "rule", "protest", "celebrate",
        "assemble", "demonstrate", "mourn", "unite",
    ]),
]
96
# Suffixes that a Japanese dictionary-form verb may end with: the single-kana
# endings first, then a few common two-kana endings.
VERB_ENDINGS = list("るくぐすつぬぶむう") + [
    "する", "くる", "いる", "ある", "なる", "れる", "せる",
]
100
def is_likely_verb(form):
    """Return True when *form* looks like a JA dictionary-form verb.

    This is a surface heuristic only: the form must be at least two
    characters long and end with one of the suffixes in VERB_ENDINGS.
    """
    if len(form) < 2:
        return False
    # str.endswith accepts a tuple and succeeds if any suffix matches.
    return form.endswith(tuple(VERB_ENDINGS))
107
def classify_verb(en_gloss):
    """Derive semantic flags for a verb from its English gloss.

    The gloss is lower-cased and split into runs of ASCII letters. For every
    SUBJECT_RULES entry, if any gloss word starts with one of the entry's
    keywords, the entry's flags are OR-ed into the result. Matching is by
    prefix, so a keyword such as "set" also matches "settle".

    Returns:
        The combined flag bits as an int; 0 when no keyword matches.
    """
    tokens = set(re.findall(r'[a-z]+', en_gloss.lower()))
    combined = 0
    for mask, stems in SUBJECT_RULES:
        prefixes = tuple(stems)
        # A whole-word hit is just a prefix hit of full length, so one
        # startswith test over all keywords covers both cases.
        if any(token.startswith(prefixes) for token in tokens):
            combined |= mask
    return combined
117
def translate(ja_word):
    """Translate a JA word to EN via the transdb CLI (best effort).

    Runs ``TRANSDB translate -src ja -dst en -db DB <ja_word>`` with a
    5-second timeout.

    Args:
        ja_word: The Japanese word to translate.

    Returns:
        The command's stdout with surrounding whitespace stripped, or ""
        when the command cannot be started, times out, exits with a
        non-zero status, or its output cannot be decoded.
    """
    cmd = [TRANSDB, "translate", "-src", "ja", "-dst", "en", "-db", DB, ja_word]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=5)
    except (OSError, ValueError, subprocess.SubprocessError):
        # Expected runtime failures only: missing/non-executable binary
        # (OSError), timeout (SubprocessError), undecodable output or an
        # embedded NUL in the argument (ValueError). Programming errors are
        # no longer swallowed.
        return ""
    if proc.returncode != 0:
        # A failed run must not have its stdout treated as a translation.
        return ""
    return proc.stdout.strip()
127
def main():
    """Build the verb label TSV from the frequency word list.

    Steps:
      1. Read ``WLIST`` (``form<TAB>frequency`` per line) and collect the
         forms accepted by ``is_likely_verb`` until ``TOP_N`` are found.
      2. Translate and classify the first 500 collected forms.
      3. Write ``form, flags, translation, frequency`` rows to ``OUT``.
      4. Print progress and a per-category summary to stderr.

    Lines without a tab or with a non-integer frequency are skipped.
    """
    max_verbs = 500  # how many collected forms are actually translated

    # Summary categories as (name, subject bit | object bit). These follow
    # the module-level 16-bit layout, so each count reflects the bits that
    # classify_verb can actually set.
    categories = [
        ("human", Hs | Ho),
        ("animate", As | Ao),
        ("abstract", Abs | Abo),
        ("place", Ps | Po),
        ("artifact", Ars | Aro),
        ("natural", Ns | No),
        ("event", Es | Eo),
        ("collective", Cs | Co),
    ]

    entries = []
    # Explicit UTF-8: the word list is Japanese text and must not depend on
    # the platform's default encoding.
    with open(WLIST, encoding="utf-8") as wordlist:
        for line in wordlist:
            form, tab, count = line.rstrip('\n').partition('\t')
            if not tab:
                continue
            try:
                freq = int(count)
            except ValueError:
                # Malformed frequency column: skip it like a tab-less line
                # instead of aborting the whole run.
                continue
            if is_likely_verb(form):
                entries.append((form, freq))
                if len(entries) >= TOP_N:
                    break

    print(f"Found {len(entries)} verb-like forms, processing top {max_verbs}...",
          file=sys.stderr)

    selected = entries[:max_verbs]
    results = []
    for done, (form, freq) in enumerate(selected, start=1):
        en = translate(form)
        flags = classify_verb(en) if en else 0
        results.append((form, flags, en, freq))
        if done % 50 == 0:
            # Report against the real batch size, which may be below 500.
            print(f" {done}/{len(selected)}: {form} → {en!r} → flags=0x{flags:02x}",
                  file=sys.stderr)

    with open(OUT, 'w', encoding="utf-8") as out:
        out.write("# JA_verb\tsemantic_flags_hex\ten_translation\tfrequency\n")
        for form, flags, en, freq in results:
            # Keep every record on one line with exactly four columns even
            # if the translator output contains tabs or line breaks.
            gloss = re.sub(r'[\t\r\n]+', ' ', en)
            # ":02x" is only a minimum width; wider flag values print in full.
            out.write(f"{form}\t0x{flags:02x}\t{gloss}\t{freq}\n")

    print(f"\nWrote {len(results)} entries to {OUT}", file=sys.stderr)
    labeled = sum(1 for _, flags, _, _ in results if flags)
    print(f" {labeled}/{len(results)} labeled (non-zero flags)", file=sys.stderr)

    # Show distribution: a verb counts for a category when either its
    # subject bit or its object bit is set.
    for name, mask in categories:
        count = sum(1 for _, flags, _, _ in results if flags & mask)
        if count:
            print(f" {name}: {count}", file=sys.stderr)
169
# Run the labelling pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
172