#!/usr/bin/env python3
"""
Quality sampling for round-trip test failures.

Samples min(2%, 1000) pairs from the failure TSV, pipes them in batches
through the local `claude` CLI (no API credits — uses the Claude Code
session), and writes a GOOD/BAD summary report.
"""

import sys
import os
import random
import re
import subprocess
import time

# expanduser("~") instead of os.environ['HOME'] so that importing the module
# does not raise KeyError when HOME is unset.
_HOME = os.path.expanduser("~")

FAIL_TSV = os.environ.get("FAIL_TSV", f"{_HOME}/tmp/roundtrip_fails.tsv")
OUT_REPORT = os.environ.get("OUT_REPORT", f"{_HOME}/tmp/quality_report.txt")
BATCH_SIZE = 40
MAX_SAMPLE = 1000
SAMPLE_PCT = 0.02
BATCH_DELAY = 3  # seconds between batches to avoid rate limiting
CLAUDE_MODEL = "claude-haiku-4-5-20251001"
CLAUDE_TIMEOUT = 120  # seconds allowed for one CLI invocation
MAX_BAD_LISTED = 50  # cap on BAD rows listed in the report

# "<number><. or ) or :> <GOOD|BAD>", tolerating leading bullets and markdown
# emphasis around the verdict (e.g. "3. **BAD** ...").
_RATING_RE = re.compile(r'\W*(\d+)\s*[.):]\s*\W*(GOOD|BAD)\b', re.IGNORECASE)


def load_fails(path):
    """Read the failure TSV at *path* and return its rows.

    The first line is skipped as a header. Lines with fewer than four
    tab-separated fields are dropped; extra fields beyond the fourth are
    discarded. Each returned row is a 4-tuple of strings, which the rest of
    the script unpacks as (direction, source, primary, backs).
    """
    rows = []
    # Explicit UTF-8: the data contains Japanese text and must not depend on
    # the locale's default encoding.
    with open(path, encoding="utf-8") as f:
        next(f, None)  # skip header line (no-op on an empty file)
        for line in f:
            parts = line.rstrip('\n').split('\t')
            if len(parts) < 4:
                continue
            rows.append(tuple(parts[:4]))
    return rows


def sample(rows):
    """Return a random sample of *rows* of size min(MAX_SAMPLE, SAMPLE_PCT of rows).

    The size is truncated to an integer, so small inputs can yield an empty
    sample. The chosen size is reported on stderr.
    """
    n = min(MAX_SAMPLE, int(len(rows) * SAMPLE_PCT))
    print(
        f"Total fails: {len(rows)}, sampling {n} "
        f"({SAMPLE_PCT*100:.0f}% or {MAX_SAMPLE} max)",
        file=sys.stderr,
    )
    return random.sample(rows, n)


def make_prompt(batch):
    """Build the rating prompt for one batch of rows.

    Each row becomes a numbered yes/no question (numbering starts at 1).
    Rows whose direction starts with "JA" are phrased as Japanese-to-English;
    all others as English-to-Japanese. The fourth field of each row is not
    used in the prompt.
    """
    lines = []
    for i, (direction, source, primary, _backs) in enumerate(batch, 1):
        if direction.startswith("JA"):
            q = f"Is '{primary}' a valid English translation of Japanese '{source}'?"
        else:
            q = f"Is '{primary}' a valid Japanese translation of English '{source}'?"
        lines.append(f"{i}. {q}")
    return (
        "Rate each translation GOOD or BAD. GOOD = semantically valid even if not the only "
        "possible translation. BAD = wrong or misleading.\n"
        "One line per item: number, GOOD or BAD, brief reason.\n\n"
        + "\n".join(lines)
    )


def parse_response(text, n):
    """Extract per-item ratings from the model's reply.

    Returns a list of exactly *n* strings, each "GOOD", "BAD" or "UNKNOWN",
    where position ``k - 1`` holds the rating the reply gave for item
    number ``k``.

    Ratings are placed by the item number stated on each line rather than
    by line order, so a skipped or reordered item cannot shift the ratings
    of the items after it. Numbers outside ``1..n`` are ignored, and if a
    number is rated more than once the first rating wins.
    """
    results = ["UNKNOWN"] * n
    for line in text.strip().split('\n'):
        m = _RATING_RE.match(line.strip())
        if not m:
            continue
        idx = int(m.group(1))
        if 1 <= idx <= n and results[idx - 1] == "UNKNOWN":
            results[idx - 1] = m.group(2).upper()
    return results


def call_claude(prompt, model=CLAUDE_MODEL, timeout=CLAUDE_TIMEOUT):
    """Send *prompt* to the `claude` CLI on stdin and return its stdout.

    Returns an empty string (after logging to stderr) when the CLI exits
    non-zero or does not finish within *timeout* seconds, so that one bad
    batch degrades to UNKNOWN ratings instead of aborting the whole run.
    A missing `claude` executable is not caught: it raises from
    ``subprocess.run`` since no batch could succeed.
    """
    try:
        result = subprocess.run(
            ["claude", "--print", "--model", model],
            input=prompt,
            capture_output=True,
            text=True,
            # Pin the pipe encoding; prompts and replies contain Japanese.
            encoding="utf-8",
            errors="replace",
            timeout=timeout,
        )
    except subprocess.TimeoutExpired:
        print(f"claude error: timed out after {timeout}s", file=sys.stderr)
        return ""
    if result.returncode != 0:
        print(f"claude error: {result.stderr[:200]}", file=sys.stderr)
        return ""
    return result.stdout


def _pct(part, total):
    """Integer percentage of *part* in *total*; 0 when *total* is 0."""
    return part * 100 // max(total, 1)


def main():
    """Sample failures, rate them batch by batch, and write the report."""
    random.seed(int(time.time()))
    rows = load_fails(FAIL_TSV)
    sampled = sample(rows)

    all_results = []
    good = 0  # running GOOD count, used for per-batch progress output
    batches = (len(sampled) + BATCH_SIZE - 1) // BATCH_SIZE
    for b in range(batches):
        batch = sampled[b * BATCH_SIZE:(b + 1) * BATCH_SIZE]
        response = call_claude(make_prompt(batch))
        ratings = parse_response(response, len(batch))
        for row, rating in zip(batch, ratings):
            all_results.append(row + (rating,))
            if rating == "GOOD":
                good += 1
        total = len(all_results)
        print(
            f"Batch {b+1}/{batches}: {good}/{total} GOOD ({_pct(good, total)}%)",
            file=sys.stderr,
        )
        if b < batches - 1:
            time.sleep(BATCH_DELAY)

    bad_rows = [r for r in all_results if r[4] == "BAD"]
    bad = len(bad_rows)
    unknown = sum(1 for r in all_results if r[4] == "UNKNOWN")
    total = len(all_results)

    # Create the report directory if needed: by this point all the slow CLI
    # calls are done, so failing on a missing directory would waste the run.
    report_dir = os.path.dirname(OUT_REPORT)
    if report_dir:
        os.makedirs(report_dir, exist_ok=True)

    # Explicit UTF-8: the report contains Japanese text and the arrow glyph.
    with open(OUT_REPORT, 'w', encoding="utf-8") as f:
        f.write(f"Quality sample: {total} pairs assessed\n")
        f.write(f" GOOD: {good:5d} ({_pct(good, total)}%)\n")
        f.write(f" BAD: {bad:5d} ({_pct(bad, total)}%)\n")
        if unknown:
            f.write(f" UNKNOWN: {unknown:5d}\n")
        f.write("\n--- BAD translations (sample) ---\n")
        for direction, source, primary, backs, _rating in bad_rows[:MAX_BAD_LISTED]:
            f.write(f" [{direction}] {source} → {primary} (back: {backs})\n")
        # Only claim truncation when rows were actually left out.
        if bad > MAX_BAD_LISTED:
            f.write(" ... (truncated)\n")

    print(f"\nQuality sample: {total} pairs assessed")
    print(f" GOOD: {good} ({_pct(good, total)}%)")
    print(f" BAD: {bad} ({_pct(bad, total)}%)")
    print(f"Full report: {OUT_REPORT}")


if __name__ == "__main__":
    main()