// tatoeba-join reads the three Tatoeba per-language export files and writes
// two parallel text files (one sentence per line) that transdb extend can consume.
//
// Usage:
//
//	tatoeba-join -links eng-jpn_links.tsv -en eng_sentences.tsv -ja jpn_sentences.tsv \
//	    -out-en /tmp/tatoeba.en -out-ja /tmp/tatoeba.ja
//
// The links file has two columns: eng_id\tjpn_id
// The sentence files have three columns: id\tlang\ttext
package main

import (
	"bufio"
	"bytes"
	"fmt"
	"os"
)

// main runs the join and reports any failure on stderr with exit status 1.
// All real work lives in run so that deferred cleanups execute before the
// process exits.
func main() {
	if err := run(os.Args[1:]); err != nil {
		fmt.Fprintln(os.Stderr, err.Error())
		os.Exit(1)
	}
}

// run parses the command-line arguments (without the program name), loads
// both sentence files, and writes one line per linked sentence pair to the
// two output files. Line N of the English output corresponds to line N of
// the Japanese output.
//
// It returns an error when a flag is missing its value, a required flag is
// absent, or any file cannot be read or written.
func run(args []string) error {
	linksPath, enPath, jaPath := "", "", ""
	outEn := "/tmp/tatoeba.en"
	outJa := "/tmp/tatoeba.ja"

	for i := 0; i < len(args); i++ {
		var dst *string
		switch args[i] {
		case "-links":
			dst = &linksPath
		case "-en":
			dst = &enPath
		case "-ja":
			dst = &jaPath
		case "-out-en":
			dst = &outEn
		case "-out-ja":
			dst = &outJa
		default:
			// Unrecognized arguments are ignored.
			continue
		}
		// A flag given as the final argument has no value to consume;
		// report it instead of indexing past the end of args.
		if i+1 >= len(args) {
			return fmt.Errorf("flag %s requires a value", args[i])
		}
		i++
		*dst = args[i]
	}
	if linksPath == "" || enPath == "" || jaPath == "" {
		return fmt.Errorf("usage: tatoeba-join -links FILE -en FILE -ja FILE [-out-en FILE] [-out-ja FILE]")
	}

	fmt.Fprintln(os.Stderr, "loading English sentences...")
	enMap, err := loadSentences(enPath)
	if err != nil {
		return err
	}
	fmt.Fprintf(os.Stderr, " %d English sentences loaded\n", len(enMap))

	fmt.Fprintln(os.Stderr, "loading Japanese sentences...")
	jaMap, err := loadSentences(jaPath)
	if err != nil {
		return err
	}
	fmt.Fprintf(os.Stderr, " %d Japanese sentences loaded\n", len(jaMap))

	// The deferred Close calls only cover early returns; on the success
	// path each file is closed explicitly below so the error is checked.
	wEn, err := os.Create(outEn)
	if err != nil {
		return err
	}
	defer wEn.Close()
	wJa, err := os.Create(outJa)
	if err != nil {
		return err
	}
	defer wJa.Close()
	bEn := bufio.NewWriter(wEn)
	bJa := bufio.NewWriter(wJa)

	fmt.Fprintln(os.Stderr, "joining...")
	written, err := joinLinks(linksPath, enMap, jaMap, bEn, bJa)
	if err != nil {
		return err
	}

	// bufio.Writer remembers the first write error and returns it from
	// Flush, so checking here covers every write made during the join.
	if err := bEn.Flush(); err != nil {
		return fmt.Errorf("writing %s: %w", outEn, err)
	}
	if err := bJa.Flush(); err != nil {
		return fmt.Errorf("writing %s: %w", outJa, err)
	}
	if err := wEn.Close(); err != nil {
		return fmt.Errorf("closing %s: %w", outEn, err)
	}
	if err := wJa.Close(); err != nil {
		return fmt.Errorf("closing %s: %w", outJa, err)
	}

	fmt.Fprintf(os.Stderr, "wrote %d sentence pairs → %s, %s\n", written, outEn, outJa)
	return nil
}

// joinLinks reads the links file (eng_id\tjpn_id per line) and, for every
// link whose two IDs are both present in enMap and jaMap, appends the English
// text to bEn and the Japanese text to bJa, each followed by a newline.
// Lines without a tab and links with an unknown ID are skipped.
//
// It returns the number of pairs written. The writers are not flushed; the
// caller must Flush them and check the result to observe write errors.
func joinLinks(linksPath string, enMap, jaMap map[string]string, bEn, bJa *bufio.Writer) (int, error) {
	lf, err := os.Open(linksPath)
	if err != nil {
		return 0, err
	}
	defer lf.Close()

	written := 0
	sc := bufio.NewScanner(lf)
	for sc.Scan() {
		line := sc.Bytes()
		tab := bytes.IndexByte(line, '\t')
		if tab < 0 {
			continue
		}
		// Copy: scanner buffer is reused each Scan call.
		engID := copyString(line[:tab])
		jpnID := copyString(line[tab+1:])

		enText, enOK := enMap[engID]
		jaText, jaOK := jaMap[jpnID]
		if !enOK || !jaOK {
			continue
		}
		bEn.WriteString(enText)
		bEn.WriteByte('\n')
		bJa.WriteString(jaText)
		bJa.WriteByte('\n')
		written++
	}
	// A scan failure would otherwise look like a normal end of file and
	// silently truncate the output.
	if err := sc.Err(); err != nil {
		return 0, fmt.Errorf("reading %s: %w", linksPath, err)
	}
	return written, nil
}

// copyString returns a string backed by a freshly allocated copy of b, so
// the result stays valid after the scanner that produced b reuses its buffer.
func copyString(b []byte) string {
	return string(append([]byte(nil), b...))
}

// loadSentences reads a Tatoeba sentence file (id\tlang\ttext) into a map
// from sentence ID to sentence text.
//
// Lines with fewer than three tab-separated columns and sentences with empty
// text are skipped. Everything after the second tab is taken as the text.
// On any open or read error the returned map is nil.
func loadSentences(path string) (map[string]string, error) {
	f, err := os.Open(path)
	if err != nil {
		return nil, err
	}
	defer f.Close()

	m := map[string]string{}
	sc := bufio.NewScanner(f)
	sc.Buffer([]byte{:0:1<<20}, 1<<20) // 1MB line buffer for long sentences
	for sc.Scan() {
		line := sc.Bytes()
		// Format: id\tlang\ttext
		t1 := bytes.IndexByte(line, '\t')
		if t1 < 0 {
			continue
		}
		// t2 is relative to the start of the lang column (line[t1+1:]).
		t2 := bytes.IndexByte(line[t1+1:], '\t')
		if t2 < 0 {
			continue
		}
		// Explicit copy: string=[]byte in Moxie means string(line[:n]) is a
		// no-op slice alias. bufio.Scanner reuses its buffer on the next Scan,
		// corrupting all "strings" stored from previous calls. Must copy.
		id := copyString(line[:t1])
		text := copyString(line[t1+1+t2+1:])
		if text != "" {
			m[id] = text
		}
	}
	if err := sc.Err(); err != nil {
		return nil, fmt.Errorf("reading %s: %w", path, err)
	}
	return m, nil
}