main.mx raw

   1  // tatoeba-join reads the three Tatoeba per-language export files and writes
   2  // two parallel text files (one sentence per line) that transdb extend can consume.
   3  //
   4  // Usage:
   5  //   tatoeba-join -links eng-jpn_links.tsv -en eng_sentences.tsv -ja jpn_sentences.tsv \
   6  //                -out-en /tmp/tatoeba.en -out-ja /tmp/tatoeba.ja
   7  //
   8  // The links file has two columns: eng_id\tjpn_id
   9  // The sentence files have three columns: id\tlang\ttext
  10  package main
  11  
  12  import (
  13  	"bufio"
  14  	"bytes"
  15  	"fmt"
  16  	"os"
  17  )
  18  
  19  func main() {
  20  	args := os.Args[1:]
  21  	linksPath := ""
  22  	enPath := ""
  23  	jaPath := ""
  24  	outEn := "/tmp/tatoeba.en"
  25  	outJa := "/tmp/tatoeba.ja"
  26  
  27  	for i := 0; i < len(args); i++ {
  28  		switch args[i] {
  29  		case "-links":
  30  			i++
  31  			linksPath = args[i]
  32  		case "-en":
  33  			i++
  34  			enPath = args[i]
  35  		case "-ja":
  36  			i++
  37  			jaPath = args[i]
  38  		case "-out-en":
  39  			i++
  40  			outEn = args[i]
  41  		case "-out-ja":
  42  			i++
  43  			outJa = args[i]
  44  		}
  45  	}
  46  	if linksPath == "" || enPath == "" || jaPath == "" {
  47  		fmt.Fprintln(os.Stderr, "usage: tatoeba-join -links <links.tsv> -en <eng.tsv> -ja <jpn.tsv>")
  48  		os.Exit(1)
  49  	}
  50  
  51  	fmt.Fprintln(os.Stderr, "loading English sentences...")
  52  	enMap, err := loadSentences(enPath)
  53  	if err != nil {
  54  		fmt.Fprintln(os.Stderr, err.Error())
  55  		os.Exit(1)
  56  	}
  57  	fmt.Fprintf(os.Stderr, "  %d English sentences loaded\n", len(enMap))
  58  
  59  	fmt.Fprintln(os.Stderr, "loading Japanese sentences...")
  60  	jaMap, err := loadSentences(jaPath)
  61  	if err != nil {
  62  		fmt.Fprintln(os.Stderr, err.Error())
  63  		os.Exit(1)
  64  	}
  65  	fmt.Fprintf(os.Stderr, "  %d Japanese sentences loaded\n", len(jaMap))
  66  
  67  	wEn, err := os.Create(outEn)
  68  	if err != nil {
  69  		fmt.Fprintln(os.Stderr, err.Error())
  70  		os.Exit(1)
  71  	}
  72  	defer wEn.Close()
  73  
  74  	wJa, err := os.Create(outJa)
  75  	if err != nil {
  76  		fmt.Fprintln(os.Stderr, err.Error())
  77  		os.Exit(1)
  78  	}
  79  	defer wJa.Close()
  80  
  81  	bEn := bufio.NewWriter(wEn)
  82  	bJa := bufio.NewWriter(wJa)
  83  
  84  	fmt.Fprintln(os.Stderr, "joining...")
  85  	lf, err := os.Open(linksPath)
  86  	if err != nil {
  87  		fmt.Fprintln(os.Stderr, err.Error())
  88  		os.Exit(1)
  89  	}
  90  	defer lf.Close()
  91  
  92  	sc := bufio.NewScanner(lf)
  93  	written := 0
  94  	for sc.Scan() {
  95  		line := sc.Bytes()
  96  		tab := bytes.IndexByte(line, '\t')
  97  		if tab < 0 {
  98  			continue
  99  		}
 100  		// Copy: scanner buffer is reused each Scan call.
 101  		engID := string(append([]byte(nil), line[:tab]...))
 102  		jpnID := string(append([]byte(nil), line[tab+1:]...))
 103  
 104  		enText, enOK := enMap[engID]
 105  		jaText, jaOK := jaMap[jpnID]
 106  		if !enOK || !jaOK {
 107  			continue
 108  		}
 109  
 110  		bEn.WriteString(enText)
 111  		bEn.WriteByte('\n')
 112  		bJa.WriteString(jaText)
 113  		bJa.WriteByte('\n')
 114  		written++
 115  	}
 116  
 117  	bEn.Flush()
 118  	bJa.Flush()
 119  	fmt.Fprintf(os.Stderr, "wrote %d sentence pairs → %s, %s\n", written, outEn, outJa)
 120  }
 121  
 122  // loadSentences reads a Tatoeba sentence file (id\tlang\ttext) into a map.
 123  func loadSentences(path string) (map[string]string, error) {
 124  	f, err := os.Open(path)
 125  	if err != nil {
 126  		return nil, err
 127  	}
 128  	defer f.Close()
 129  
 130  	m := map[string]string{}
 131  	sc := bufio.NewScanner(f)
 132  	sc.Buffer([]byte{:0:1<<20}, 1<<20) // 1MB line buffer for long sentences
 133  	for sc.Scan() {
 134  		line := sc.Bytes()
 135  		// Format: id\tlang\ttext
 136  		t1 := bytes.IndexByte(line, '\t')
 137  		if t1 < 0 {
 138  			continue
 139  		}
 140  		t2 := bytes.IndexByte(line[t1+1:], '\t')
 141  		if t2 < 0 {
 142  			continue
 143  		}
 144  		// Explicit copy: string=[]byte in Moxie means string(line[:n]) is a
 145  		// no-op slice alias. bufio.Scanner reuses its buffer on the next Scan,
 146  		// corrupting all "strings" stored from previous calls. Must copy.
 147  		id := string(append([]byte(nil), line[:t1]...))
 148  		raw := line[t1+1+t2+1:]
 149  		text := string(append([]byte(nil), raw...))
 150  		if text != "" {
 151  			m[id] = text
 152  		}
 153  	}
 154  	return m, sc.Err()
 155  }
 156