main.mx raw
1 // tatoeba-join reads the three Tatoeba per-language export files and writes
2 // two parallel text files (one sentence per line) that transdb extend can consume.
3 //
4 // Usage:
5 // tatoeba-join -links eng-jpn_links.tsv -en eng_sentences.tsv -ja jpn_sentences.tsv \
6 // -out-en /tmp/tatoeba.en -out-ja /tmp/tatoeba.ja
7 //
8 // The links file has two columns: eng_id\tjpn_id
9 // The sentence files have three columns: id\tlang\ttext
10 package main
11
12 import (
13 "bufio"
14 "bytes"
15 "fmt"
16 "os"
17 )
18
// main parses the command line, loads the English and Japanese sentence
// tables, then streams the links file and writes one aligned sentence per line
// to each of the two output files.
//
// Flags (each takes exactly one value):
//
//	-links   path to the eng_id\tjpn_id links file (required)
//	-en      path to the English sentence file (required)
//	-ja      path to the Japanese sentence file (required)
//	-out-en  English output path (default /tmp/tatoeba.en)
//	-out-ja  Japanese output path (default /tmp/tatoeba.ja)
//
// Any failure is reported on stderr and the process exits with status 1.
func main() {
	args := os.Args[1:]
	linksPath := ""
	enPath := ""
	jaPath := ""
	outEn := "/tmp/tatoeba.en"
	outJa := "/tmp/tatoeba.ja"

	for i := 0; i < len(args); i++ {
		name := args[i]
		switch name {
		case "-links", "-en", "-ja", "-out-en", "-out-ja":
		default:
			// Unrecognised arguments are ignored.
			continue
		}
		// Every recognised flag consumes the next argument; without this
		// check a trailing flag indexes past the end of args and panics.
		if i+1 >= len(args) {
			fmt.Fprintf(os.Stderr, "tatoeba-join: %s requires a value\n", name)
			os.Exit(1)
		}
		i++
		switch name {
		case "-links":
			linksPath = args[i]
		case "-en":
			enPath = args[i]
		case "-ja":
			jaPath = args[i]
		case "-out-en":
			outEn = args[i]
		case "-out-ja":
			outJa = args[i]
		}
	}
	if linksPath == "" || enPath == "" || jaPath == "" {
		fmt.Fprintln(os.Stderr, "usage: tatoeba-join -links <links.tsv> -en <eng.tsv> -ja <jpn.tsv> [-out-en <path>] [-out-ja <path>]")
		os.Exit(1)
	}

	fmt.Fprintln(os.Stderr, "loading English sentences...")
	enMap, err := loadSentences(enPath)
	if err != nil {
		fmt.Fprintln(os.Stderr, err.Error())
		os.Exit(1)
	}
	fmt.Fprintf(os.Stderr, " %d English sentences loaded\n", len(enMap))

	fmt.Fprintln(os.Stderr, "loading Japanese sentences...")
	jaMap, err := loadSentences(jaPath)
	if err != nil {
		fmt.Fprintln(os.Stderr, err.Error())
		os.Exit(1)
	}
	fmt.Fprintf(os.Stderr, " %d Japanese sentences loaded\n", len(jaMap))

	// Open the links file before creating the outputs, so that a bad -links
	// path cannot truncate output files left over from a previous run.
	lf, err := os.Open(linksPath)
	if err != nil {
		fmt.Fprintln(os.Stderr, err.Error())
		os.Exit(1)
	}
	defer lf.Close()

	wEn, err := os.Create(outEn)
	if err != nil {
		fmt.Fprintln(os.Stderr, err.Error())
		os.Exit(1)
	}
	wJa, err := os.Create(outJa)
	if err != nil {
		fmt.Fprintln(os.Stderr, err.Error())
		os.Exit(1)
	}

	bEn := bufio.NewWriter(wEn)
	bJa := bufio.NewWriter(wJa)

	fmt.Fprintln(os.Stderr, "joining...")
	sc := bufio.NewScanner(lf)
	written := 0
	for sc.Scan() {
		line := sc.Bytes()
		tab := bytes.IndexByte(line, '\t')
		if tab < 0 {
			// Not an id\tid pair; skip.
			continue
		}
		// Copy: scanner buffer is reused each Scan call.
		engID := string(append([]byte(nil), line[:tab]...))
		jpnID := string(append([]byte(nil), line[tab+1:]...))

		// Emit a pair only when both sides of the link were loaded.
		enText, enOK := enMap[engID]
		jaText, jaOK := jaMap[jpnID]
		if !enOK || !jaOK {
			continue
		}

		// Per-write errors are not checked here: bufio.Writer keeps the
		// first error and reports it from Flush below.
		bEn.WriteString(enText)
		bEn.WriteByte('\n')
		bJa.WriteString(jaText)
		bJa.WriteByte('\n')
		written++
	}
	// A scanner failure would otherwise look like a clean end of file and
	// leave silently truncated output.
	if err := sc.Err(); err != nil {
		fmt.Fprintf(os.Stderr, "reading %s: %s\n", linksPath, err.Error())
		os.Exit(1)
	}

	if err := bEn.Flush(); err != nil {
		fmt.Fprintf(os.Stderr, "writing %s: %s\n", outEn, err.Error())
		os.Exit(1)
	}
	if err := bJa.Flush(); err != nil {
		fmt.Fprintf(os.Stderr, "writing %s: %s\n", outJa, err.Error())
		os.Exit(1)
	}
	// Close explicitly so that a failure while closing an output file is
	// reported rather than dropped by a deferred call.
	if err := wEn.Close(); err != nil {
		fmt.Fprintf(os.Stderr, "closing %s: %s\n", outEn, err.Error())
		os.Exit(1)
	}
	if err := wJa.Close(); err != nil {
		fmt.Fprintf(os.Stderr, "closing %s: %s\n", outJa, err.Error())
		os.Exit(1)
	}
	fmt.Fprintf(os.Stderr, "wrote %d sentence pairs → %s, %s\n", written, outEn, outJa)
}
121
// loadSentences reads a Tatoeba sentence file (id\tlang\ttext) into a map
// from sentence id to sentence text.
//
// Lines with fewer than three tab-separated columns, and lines whose text
// column is empty, are skipped. The text is everything after the second tab,
// so any further tabs stay part of the text. When an id occurs more than
// once, the last occurrence wins.
//
// It returns a nil map and a non-nil error if the file cannot be opened or if
// scanning fails, for example on a line longer than the 1MB line buffer. Scan
// errors are wrapped with the file path so the caller can tell which input
// was at fault.
func loadSentences(path string) (map[string]string, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	m := map[string]string{}
	sc := bufio.NewScanner(f)
	sc.Buffer([]byte{:0:1<<20}, 1<<20) // 1MB line buffer for long sentences
	for sc.Scan() {
		line := sc.Bytes()
		// Format: id\tlang\ttext
		t1 := bytes.IndexByte(line, '\t')
		if t1 < 0 {
			continue
		}
		// t2 is relative to the slice that starts just after the first tab.
		t2 := bytes.IndexByte(line[t1+1:], '\t')
		if t2 < 0 {
			continue
		}
		// Explicit copy: string=[]byte in Moxie means string(line[:n]) is a
		// no-op slice alias. bufio.Scanner reuses its buffer on the next Scan,
		// corrupting all "strings" stored from previous calls. Must copy.
		id := string(append([]byte(nil), line[:t1]...))
		raw := line[t1+1+t2+1:]
		text := string(append([]byte(nil), raw...))
		if text != "" {
			m[id] = text
		}
	}
	if err := sc.Err(); err != nil {
		// Do not hand back a partially filled map alongside an error.
		return nil, fmt.Errorf("reading %s: %w", path, err)
	}
	return m, nil
}
156