#!/usr/bin/env python3
"""
Quality sampling for round-trip test failures.
Samples min(2%, 1000) pairs, pipes batches through the local `claude` CLI
(no API credits — uses the Claude Code session).
"""
import sys, os, random, re, subprocess, time

# Input and output locations. Each may be overridden through the environment
# variable of the same name; the defaults live under ~/tmp.
# expanduser() is used instead of os.environ['HOME'] because the default is
# built eagerly: indexing HOME raised KeyError at import time whenever HOME
# was unset, even if the override variable was supplied.
FAIL_TSV = os.environ.get("FAIL_TSV", os.path.expanduser("~/tmp/roundtrip_fails.tsv"))
OUT_REPORT = os.environ.get("OUT_REPORT", os.path.expanduser("~/tmp/quality_report.txt"))
BATCH_SIZE = 40    # pairs sent to the claude CLI per prompt
MAX_SAMPLE = 1000  # upper bound on the number of sampled pairs
SAMPLE_PCT = 0.02  # fraction of all failures to sample
BATCH_DELAY = 3  # seconds between batches to avoid rate limiting

def load_fails(path):
    """Read the failures TSV at *path* and return its usable rows.

    The first line is skipped as a header. Every other line is split on
    tabs; lines with fewer than four fields are dropped and only the first
    four fields of the remaining lines are kept.

    Returns:
        A list of 4-tuples of strings, in file order.

    Raises:
        OSError: if the file cannot be opened.
        UnicodeDecodeError: if the file is not valid UTF-8.
    """
    rows = []
    # The rows carry Japanese text, so decode explicitly instead of relying
    # on the locale's default encoding.
    # NOTE(review): assumes the TSV is written as UTF-8 - confirm against
    # whatever produces it.
    with open(path, encoding="utf-8") as f:
        next(f, None)  # skip the header line (no-op on an empty file)
        for line in f:
            parts = line.rstrip('\n').split('\t')
            if len(parts) >= 4:
                rows.append(tuple(parts[:4]))
    return rows

def sample(rows):
    """Pick a random subset of *rows* for quality assessment.

    The subset size is SAMPLE_PCT of the row count (truncated to an
    integer), capped at MAX_SAMPLE. A one-line summary of the totals is
    printed to stderr.

    Returns:
        A new list of rows chosen without replacement.
    """
    total = len(rows)
    wanted = int(total * SAMPLE_PCT)
    n = wanted if wanted < MAX_SAMPLE else MAX_SAMPLE
    percent = SAMPLE_PCT * 100
    print(
        f"Total fails: {total}, sampling {n} ({percent:.0f}% or {MAX_SAMPLE} max)",
        file=sys.stderr,
    )
    return random.sample(rows, n)

def make_prompt(batch):
    """Build the rating prompt for one batch of translation pairs.

    Args:
        batch: iterable of (direction, source, primary, backs) 4-tuples.
            A direction starting with "JA" is asked as Japanese -> English;
            anything else as English -> Japanese. ``backs`` is not used in
            the prompt.

    Returns:
        The instruction header followed by one numbered question per pair,
        numbered from 1 and joined with newlines.
    """
    def ask(direction, source, primary):
        # Guard clause for the English -> Japanese case; JA* falls through.
        if not direction.startswith("JA"):
            return f"Is '{primary}' a valid Japanese translation of English '{source}'?"
        return f"Is '{primary}' a valid English translation of Japanese '{source}'?"

    numbered = [
        f"{idx}. {ask(direction, source, primary)}"
        for idx, (direction, source, primary, _backs) in enumerate(batch, start=1)
    ]
    instructions = (
        "Rate each translation GOOD or BAD. GOOD = semantically valid even if not the only "
        "possible translation. BAD = wrong or misleading.\n"
        "One line per item: number, GOOD or BAD, brief reason.\n\n"
    )
    return instructions + "\n".join(numbered)

def parse_response(text, n):
    """Extract one rating per item from the model's reply.

    Lines shaped like ``<number>. GOOD ...`` or ``<number>) BAD ...``
    (case-insensitive) are recognised; all other lines are ignored.

    Each rating is stored at the slot named by its item number rather than
    in order of appearance, so a skipped or reordered line cannot shift
    later ratings onto the wrong items. Numbers outside 1..n are ignored,
    and when a number appears more than once the first occurrence wins.

    Args:
        text: raw reply text (may be empty).
        n: number of items that were asked about.

    Returns:
        A list of exactly *n* strings, each "GOOD", "BAD" or "UNKNOWN".
    """
    results = ["UNKNOWN"] * n
    for line in text.strip().split('\n'):
        m = re.match(r'(\d+)[.)]\s+(GOOD|BAD)', line.strip(), re.IGNORECASE)
        if not m:
            continue
        idx = int(m.group(1)) - 1  # item numbers in the prompt are 1-based
        if 0 <= idx < n and results[idx] == "UNKNOWN":
            results[idx] = m.group(2).upper()
    return results

def call_claude(prompt):
    """Send *prompt* to the local ``claude`` CLI and return its stdout.

    The prompt is passed on stdin. Failures are best-effort: a non-zero
    exit status or a timeout is reported on stderr and an empty string is
    returned, so the caller can carry on with the remaining batches.

    Args:
        prompt: text to feed to the CLI.

    Returns:
        The CLI's stdout as a string, or "" on error or timeout.

    Raises:
        FileNotFoundError: if the ``claude`` executable is not on PATH
            (left uncaught: no batch could succeed in that case).
    """
    try:
        result = subprocess.run(
            ["claude", "--print", "--model", "claude-haiku-4-5-20251001"],
            # The prompt contains Japanese text; pin the pipe encoding so it
            # does not depend on the locale.
            input=prompt, capture_output=True, text=True, encoding="utf-8",
            timeout=120,
        )
    except subprocess.TimeoutExpired:
        # One slow batch should not abort the run and discard earlier results.
        print("claude error: timed out after 120s", file=sys.stderr)
        return ""
    if result.returncode != 0:
        print(f"claude error: {result.stderr[:200]}", file=sys.stderr)
        return ""
    return result.stdout

def main():
    """Sample failed round-trip pairs, rate them via the claude CLI, and report.

    Loads the failures from FAIL_TSV, draws a random sample, and sends it to
    the CLI in batches of BATCH_SIZE, pausing BATCH_DELAY seconds between
    batches. Progress lines go to stderr. A summary plus up to 50 BAD pairs
    is written to OUT_REPORT, and a short summary is printed to stdout.
    """
    # Seeded from wall-clock seconds: runs started in the same second draw
    # the same sample.
    random.seed(int(time.time()))
    rows = load_fails(FAIL_TSV)
    sampled = sample(rows)

    all_results = []
    batches = (len(sampled) + BATCH_SIZE - 1) // BATCH_SIZE  # ceiling division

    for b in range(batches):
        batch = sampled[b*BATCH_SIZE:(b+1)*BATCH_SIZE]
        prompt = make_prompt(batch)
        response = call_claude(prompt)
        ratings = parse_response(response, len(batch))
        for (direction, source, primary, backs), rating in zip(batch, ratings):
            all_results.append((direction, source, primary, backs, rating))

        # Cumulative progress over every batch processed so far.
        good  = sum(1 for r in all_results if r[4] == "GOOD")
        total = len(all_results)
        print(f"Batch {b+1}/{batches}: {good}/{total} GOOD ({good*100//max(total,1)}%)", file=sys.stderr)

        if b < batches - 1:
            time.sleep(BATCH_DELAY)

    bad_rows = [r for r in all_results if r[4] == "BAD"]
    good    = sum(1 for r in all_results if r[4] == "GOOD")
    bad     = len(bad_rows)
    unknown = sum(1 for r in all_results if r[4] == "UNKNOWN")
    total   = len(all_results)

    max_listed = 50  # cap on BAD examples written to the report

    # The report contains Japanese text and a non-ASCII arrow, so write it as
    # UTF-8 regardless of the locale's default encoding.
    with open(OUT_REPORT, 'w', encoding="utf-8") as f:
        f.write(f"Quality sample: {total} pairs assessed\n")
        # max(total, 1) avoids dividing by zero when nothing was sampled.
        f.write(f"  GOOD:    {good:5d} ({good*100//max(total,1)}%)\n")
        f.write(f"  BAD:     {bad:5d} ({bad*100//max(total,1)}%)\n")
        if unknown:
            f.write(f"  UNKNOWN: {unknown:5d}\n")
        f.write("\n--- BAD translations (sample) ---\n")
        for direction, source, primary, backs, _rating in bad_rows[:max_listed]:
            f.write(f"  [{direction}] {source} → {primary}  (back: {backs})\n")
        # Only claim truncation when entries were actually left out.
        if bad > max_listed:
            f.write("  ... (truncated)\n")

    print(f"\nQuality sample: {total} pairs assessed")
    print(f"  GOOD:      {good} ({good*100//max(total,1)}%)")
    print(f"  BAD:       {bad} ({bad*100//max(total,1)}%)")
    print(f"Full report: {OUT_REPORT}")

# Run the sampler only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()