label.py raw
1 #!/usr/bin/env python3
2 """
3 Extract top JA nouns, translate to EN, classify semantically, output TSV.
4 Semantic codes:
5 0 = unspecified
6 1 = human (person, teacher, child)
7 2 = animate non-human (animal, bird, fish)
8 3 = abstract concept (freedom, idea, love)
9 4 = place/location (city, country, room)
10 5 = artifact (tool, vehicle, building)
11 6 = natural inanimate (mountain, water, stone)
12 7 = event (war, meeting, accident)
13 8 = collective/organization (group, company, team)
14 """
15 import sys, os, subprocess, re
16
# Runtime settings. Each one may be overridden by the environment variable
# of the same name; otherwise the default shown here is used.

# Executable that translate() invokes.
TRANSDB = os.getenv("TRANSDB", "/tmp/transdb")

# Upper bound on how many word-list entries main() collects.
TOP_N = int(os.getenv("TOP_N", "500"))

# Data directory: handed to the CLI via -db and used as the parent of the
# default word-list and output paths below.
DB = os.getenv("DB", os.path.expanduser("~/tmp/transdb-data"))
WLIST = os.getenv("WLIST", os.path.join(DB, "wordlist.ja"))
OUT = os.getenv("OUT", os.path.join(DB, "semantic_labels.tsv"))
22
# English keyword table: semantic code -> nouns that signal that category.
# Deliberately simple; it only aims to cover the unambiguous cases.
# The codes are inserted in ascending order, and a keyword may appear under
# more than one code (e.g. "rice", "police", "school").
RULES = {
    # 1 = human
    1: """
        person people man woman child boy girl human
        teacher student doctor nurse soldier king queen
        father mother parent friend enemy citizen worker
        author writer singer actor player athlete artist
        politician official minister police guard servant
        husband wife brother sister son daughter baby
        adult youth elder ancestor descendant neighbor
        stranger guest host leader follower hero villain
        criminal victim survivor refugee immigrant
    """.split(),
    # 2 = animate non-human (plant words are filed here as well)
    2: """
        animal creature beast bird fish insect bug
        cat dog horse cow pig sheep goat rabbit
        mouse rat bear wolf fox deer lion tiger
        elephant monkey snake frog turtle crab shrimp
        whale dolphin shark eagle crow sparrow dove
        butterfly bee ant spider worm dragon monster
        plant tree flower grass rice seed
    """.split(),
    # 3 = abstract concept
    3: """
        idea concept thought mind spirit soul feeling
        emotion love hate fear hope dream memory
        freedom peace justice truth beauty good evil
        right wrong law rule theory principle belief
        knowledge wisdom culture tradition custom habit
        language word meaning reason cause effect
        value worth purpose goal plan method
    """.split(),
    # 4 = place/location
    4: """
        place location area region country nation state
        city town village district street road path
        building house room hall floor office school
        hospital church temple shrine park garden
        forest mountain hill valley river lake sea
        ocean island desert field farm port station
        airport market shop store restaurant hotel
        universe world earth sky heaven hell
    """.split(),
    # 5 = artifact
    5: """
        tool device machine engine car vehicle ship
        plane train bicycle boat weapon sword gun
        knife book paper pen pencil letter document
        phone computer screen camera television radio
        clock watch key lock door window chair
        table bed clothes dress shoe hat bag
        food drink bread rice medicine drug
        money coin ticket card flag
    """.split(),
    # 6 = natural inanimate
    6: """
        stone rock sand soil mud dust ash
        water rain snow ice fire smoke air wind
        cloud star sun moon light dark shadow
        wave tide storm thunder lightning earthquake
        metal gold silver iron copper wood coal
        oil gas salt sugar blood bone
    """.split(),
    # 7 = event
    7: """
        event incident accident disaster war battle
        fight conflict attack defense victory defeat
        meeting conference ceremony festival wedding
        election vote trial judgment punishment
        birth death marriage divorce departure arrival
        change revolution reform movement crisis
    """.split(),
    # 8 = collective/organization
    8: """
        group team party club union organization
        company firm corporation government army navy
        police family tribe nation community society
        committee council parliament congress court
        school university church religion
    """.split(),
}
98
# Irregular plurals of keywords that the suffix rules below cannot derive.
_IRREGULAR_PLURALS = {
    "man": "men",
    "woman": "women",
    "child": "children",
    "mouse": "mice",
    "wolf": "wolves",
    "knife": "knives",
    "wife": "wives",
}


def _keyword_forms(kw):
    """Return the surface forms (singular and plural) accepted for keyword *kw*."""
    forms = [kw, kw + "s"]
    if kw.endswith(("s", "x", "z", "ch", "sh", "o")):
        forms.append(kw + "es")
    if len(kw) > 1 and kw.endswith("y") and kw[-2] not in "aeiou":
        forms.append(kw[:-1] + "ies")
    irregular = _IRREGULAR_PLURALS.get(kw)
    if irregular:
        forms.append(irregular)
    return forms


def classify_en(en_words, rules=None):
    """Choose the semantic code whose keywords best match an EN translation.

    The input strings are joined, lower-cased and split into runs of ASCII
    letters. A keyword scores one point for its category when the keyword
    itself or one of its plural forms occurs as a whole word. Whole-word
    matching is deliberate: matching on prefixes would let "man" hit
    "management" or "car" hit "card".

    Args:
        en_words: iterable of strings making up the English translation.
        rules: optional mapping of category code -> list of keywords.
            Defaults to the module-level RULES table.

    Returns:
        The code with the highest score; on a tie, the category that comes
        first in *rules*. Returns 0 when nothing matches.
    """
    if rules is None:
        rules = RULES
    tokens = set(re.findall(r"[a-z]+", " ".join(en_words).lower()))
    if not tokens:
        return 0

    best_cat, best_score = 0, 0
    for cat, keywords in rules.items():
        score = sum(
            1
            for kw in keywords
            if any(form in tokens for form in _keyword_forms(kw))
        )
        # Strict '>' keeps the earliest category when scores are equal.
        if score > best_score:
            best_cat, best_score = cat, score
    return best_cat
110
def translate(ja_word):
    """Translate one Japanese word to English through the transdb CLI.

    Runs ``TRANSDB translate -src ja -dst en -db DB <ja_word>`` with a
    5-second timeout and captures its output.

    Args:
        ja_word: the Japanese form to translate.

    Returns:
        The command's stdout with surrounding whitespace stripped, or ""
        when the command cannot be started, times out, produces undecodable
        output, or exits with a non-zero status. Failures to run the command
        are reported on stderr so that a missing or broken binary does not
        pass unnoticed.
    """
    cmd = [TRANSDB, "translate", "-src", "ja", "-dst", "en", "-db", DB, ja_word]
    try:
        proc = subprocess.run(cmd, capture_output=True, text=True, timeout=5)
    except (OSError, ValueError, subprocess.SubprocessError) as exc:
        # Best effort: one failed lookup must not abort the whole run.
        print(f"translate({ja_word!r}) failed: {exc}", file=sys.stderr)
        return ""
    if proc.returncode != 0:
        # Do not treat the output of a failed invocation as a translation.
        return ""
    return proc.stdout.strip()
120
def _is_candidate_form(form):
    """Return True for forms of 2+ characters that contain kanji or katakana."""
    if len(form) < 2:
        return False
    return any(
        "\u4e00" <= ch <= "\u9fff"  # CJK Unified Ideographs
        or "\u3400" <= ch <= "\u4dbf"  # CJK Extension A
        or "\u30a0" <= ch <= "\u30ff"  # Katakana block
        for ch in form
    )


def _load_entries(path, limit):
    """Read up to *limit* (form, frequency) pairs from a tab-separated word list."""
    entries = []
    # The word list holds Japanese text; read it as UTF-8 regardless of locale.
    with open(path, encoding="utf-8") as f:
        for line in f:
            if len(entries) >= limit:
                break
            form, sep, rest = line.rstrip("\n").partition("\t")
            if not sep or not _is_candidate_form(form):
                continue
            try:
                # Only the first column after the form is the frequency.
                freq = int(rest.split("\t", 1)[0])
            except ValueError:
                # Malformed frequency: skip the line instead of aborting.
                continue
            entries.append((form, freq))
    return entries


def main():
    """Label the first TOP_N candidate forms of WLIST and write them to OUT.

    Candidates are word-list forms of two or more characters that contain at
    least one kanji or katakana character; pure-hiragana forms are skipped
    because they are mostly particles and function words. Each candidate is
    translated with translate(), classified with classify_en(), and written
    to OUT as ``form<TAB>code<TAB>translation<TAB>frequency``. Progress and a
    per-category summary go to stderr.
    """
    entries = _load_entries(WLIST, TOP_N)
    total = len(entries)
    print(f"Processing {total} nouns...", file=sys.stderr)

    results = []
    for i, (form, freq) in enumerate(entries, start=1):
        # Tabs or newlines inside a translation would break the TSV rows.
        en = re.sub(r"[\t\r\n]+", " ", translate(form))
        cat = classify_en(en.split()) if en else 0
        results.append((form, cat, en, freq))
        if i % 50 == 0:
            print(f" {i}/{total}: {form} → {en} → cat={cat}", file=sys.stderr)

    with open(OUT, "w", encoding="utf-8") as f:
        f.write("# JA_form\tsemantic_code\ten_translation\tfrequency\n")
        f.writelines(
            f"{form}\t{cat}\t{en}\t{freq}\n" for form, cat, en, freq in results
        )

    cats = {}
    for _, cat, _, _ in results:
        cats[cat] = cats.get(cat, 0) + 1
    print(f"\nWrote {len(results)} entries to {OUT}", file=sys.stderr)
    names = {0: "unspecified", 1: "human", 2: "animate", 3: "abstract",
             4: "place", 5: "artifact", 6: "natural", 7: "event",
             8: "collective"}
    for c in sorted(cats):
        print(f" {names.get(c, str(c))}: {cats[c]}", file=sys.stderr)


if __name__ == "__main__":
    main()
168