#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

if [ -z "$WORKDIR_ROOT" ] ;
then
        echo "please specify your working directory root in environment variable WORKDIR_ROOT. Exiting..."
        exit 1
fi



set -x -e

# TODO update the workdir and dest dir name
# put fasttext model
WORKDIR=$WORKDIR_ROOT
# put intermediate files
TMP_DIR=$WORKDIR_ROOT/tmp/tmp_wmt20_lowres_download
# output {train,valid,test} files to dest
DEST=$WORKDIR_ROOT/ML50/raw

UTILS=$PWD/utils

# per dataset locations
COMMONCRAWL_DIR=$TMP_DIR/commoncrawl
YANDEX_CORPUS=$WORKDIR_ROOT/wmt20/official/ru/yandex/1mcorpus.zip
# unzipped
CZENG_CORPUS=$WORKDIR_ROOT/wmt20/official/cs/czeng/czeng20-train
CCMT_DIR=$WORKDIR_ROOT/wmt20/official/zh/ccmt/parallel

download_and_select() {
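  # Fetch one parallel corpus and expose it as SUBFOLDER.$LANG / SUBFOLDER.en symlinks.
  #   $1 SUBFOLDER        per-corpus directory name
  #   $2 URL              download URL (wget -nc, so reruns skip finished downloads)
  #   $3 UNCOMPRESS_CMD   command used to unpack the downloaded archive
  #   $4 LANG             non-English language code
  #   $5 INPUT_FILEPATH   path of the extracted bitext inside SUBFOLDER
  #   $6/$7 (optional)    TSV column indices of the $LANG and English sides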
  SUBFOLDER=$1
  URL=$2
  UNCOMPRESS_CMD=$3
  LANG=$4
  INPUT_FILEPATH=$5
  if [[ $# -gt 5 ]]; then
    LANG_COL=$6
    EN_COL=$7
  fi

  mkdir -p $SUBFOLDER
  cd $SUBFOLDER
  wget -nc --content-disposition $URL
  $UNCOMPRESS_CMD

  if [[ $# -gt 5 ]]; then
    cut -f$LANG_COL $INPUT_FILEPATH > $INPUT_FILEPATH.$LANG
    cut -f$EN_COL $INPUT_FILEPATH > $INPUT_FILEPATH.en
  fi
  cd ..

  ln -sf $SUBFOLDER/$INPUT_FILEPATH.$LANG $SUBFOLDER.$LANG
  ln -sf $SUBFOLDER/$INPUT_FILEPATH.en $SUBFOLDER.en
}

prepare_lid() {
  pip install fasttext

  # TODO specify global workdir
  MODEL=$WORKDIR/fasttext/lid.176.bin
  LID_MULTI=$UTILS/fasttext_multi_filter.py

  if [ ! -f "$MODEL" ]; then
    echo "downloading fasttext lid model..."
    mkdir -p $WORKDIR/fasttext
    wget -nc https://dl.fbaipublicfiles.com/fasttext/supervised-models/lid.176.bin -O $MODEL
  fi
}

prepare_moses() {
  pushd $UTILS
  echo 'Cloning Moses github repository (for tokenization scripts)...'
  git clone https://github.com/moses-smt/mosesdecoder.git  
  popd
}

lid_filter() {
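  # Filter a parallel corpus with fastText language identification on both sides.
  #   $1 SRC  source language code    $2 SRC_FILE  input    $3 SRC_OUTPUT  filtered output
  #   $4 TGT  target language code    $5 TGT_FILE  input    $6 TGT_OUTPUT  filtered output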
  # TODO specify global workdir
  MODEL=$WORKDIR/fasttext/lid.176.bin
  LID_MULTI=$UTILS/fasttext_multi_filter.py

  prepare_lid

  SRC=$1
  SRC_FILE=$2
  SRC_OUTPUT=$3
  TGT=$4
  TGT_FILE=$5
  TGT_OUTPUT=$6
  python $LID_MULTI --model $MODEL --inputs $SRC_FILE $TGT_FILE --langs $SRC $TGT --outputs $SRC_OUTPUT $TGT_OUTPUT
}

prepare_ja_ted() {
  mkdir -p ted
  cd ted

  wget -nc https://wit3.fbk.eu/archive/2017-01-trnted//texts/en/ja/en-ja.tgz
  tar -zxvf en-ja.tgz
  cat en-ja/train.tags.en-ja.en | grep -v -P "^[ ]*\<" | sed 's/^[ \t]*//g' | sed 's/[ \t]*$//g' > en-ja/train.en-ja.en
  cat en-ja/train.tags.en-ja.ja | grep -v -P "^[ ]*\<" | sed 's/^[ \t]*//g' | sed 's/[ \t]*$//g' > en-ja/train.en-ja.ja

  cd ..
  ln -sf ted/en-ja/train.en-ja.ja ted.ja
  ln -sf ted/en-ja/train.en-ja.en ted.en
}

prepare_ja() {
  OUTPUT_DIR=$TMP_DIR/ja
  mkdir -p $OUTPUT_DIR
  cd $OUTPUT_DIR

  download_and_select paracrawl "http://www.kecl.ntt.co.jp/icl/lirg/jparacrawl/release/2.0/bitext/en-ja.tar.gz" "tar -zxvf en-ja.tar.gz" ja en-ja/en-ja.bicleaner05.txt 4 3 &
  download_and_select newscommentary "http://data.statmt.org/news-commentary/v15/training/news-commentary-v15.en-ja.tsv.gz" "gunzip -f news-commentary-v15.en-ja.tsv.gz" ja news-commentary-v15.en-ja.tsv 2 1 &
  download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.ja-en.tsv.gz" "gunzip -f wikititles-v2.ja-en.tsv.gz" ja wikititles-v2.ja-en.tsv 1 2 &
  download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.en-ja.langid.tsv.gz" "gunzip -f WikiMatrix.v1.en-ja.langid.tsv.gz" ja WikiMatrix.v1.en-ja.langid.tsv 3 2 &
  download_and_select subtitle "https://nlp.stanford.edu/projects/jesc/data/split.tar.gz" "tar -zxvf split.tar.gz" ja split/train 2 1 &
  download_and_select kftt "http://www.phontron.com/kftt/download/kftt-data-1.0.tar.gz" "tar -zxvf kftt-data-1.0.tar.gz" ja kftt-data-1.0/data/orig/kyoto-train &

  prepare_ja_ted &

  # ted data needs its meta/tag lines stripped; handled in prepare_ja_ted above

  wait

  # remove previous results
  rm -f all.??
  find ./ -maxdepth 1 -name "*.ja" | sort -V | xargs cat > all.ja
  find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
  lid_filter ja all.ja $DEST/train.ja_XX-en_XX.ja_XX en all.en $DEST/train.ja_XX-en_XX.en_XX
}

prepare_ta() {
  OUTPUT_DIR=$TMP_DIR/ta
  mkdir -p $OUTPUT_DIR
  cd $OUTPUT_DIR

  download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.ta-en.tsv.gz" "gunzip -f wikititles-v2.ta-en.tsv.gz" ta wikititles-v2.ta-en.tsv 1 2 &
  download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.en-ta.langid.tsv.gz" "gunzip -f WikiMatrix.v1.en-ta.langid.tsv.gz" ta WikiMatrix.v1.en-ta.langid.tsv 3 2 &
  download_and_select pmindia "http://data.statmt.org/pmindia/v1/parallel/pmindia.v1.ta-en.tsv" "" ta pmindia.v1.ta-en.tsv 2 1 &
  download_and_select tanzil "https://object.pouta.csc.fi/OPUS-Tanzil/v1/moses/en-ta.txt.zip" "unzip en-ta.txt.zip" ta Tanzil.en-ta &
  download_and_select pib "http://preon.iiit.ac.in/~jerin/resources/datasets/pib-v0.tar" "tar -xvf pib-v0.tar" ta pib/en-ta/train &
  download_and_select mkb "http://preon.iiit.ac.in/~jerin/resources/datasets/mkb-v0.tar" "tar -xvf mkb-v0.tar" ta mkb/en-ta/mkb &
  download_and_select ufal "http://ufal.mff.cuni.cz/~ramasamy/parallel/data/v2/en-ta-parallel-v2.tar.gz" "tar -zxvf en-ta-parallel-v2.tar.gz" ta en-ta-parallel-v2/corpus.bcn.train &

  wait

  # need special handling for nlpc
  mkdir -p nlpc
  cd nlpc
  wget -nc https://raw.githubusercontent.com/nlpc-uom/English-Tamil-Parallel-Corpus/master/En-Ta%20Corpus/En-Ta%20English.txt
  wget -nc https://github.com/nlpc-uom/English-Tamil-Parallel-Corpus/raw/master/En-Ta%20Corpus/En-Ta%20Tamil.txt
  tail -n +4 "En-Ta English.txt" > en-ta.en
  tail -n +4 "En-Ta Tamil.txt" > en-ta.ta
  cd ..
  ln -sf nlpc/en-ta.en nlpc.en
  ln -sf nlpc/en-ta.ta nlpc.ta

  # remove previous results
  rm -f all.??
  find ./ -maxdepth 1 -name "*.ta" | sort -V | xargs cat > all.ta
  find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
  lid_filter ta all.ta $DEST/train.ta_IN-en_XX.ta_IN en all.en $DEST/train.ta_IN-en_XX.en_XX
}

prepare_iu() {
  OUTPUT_DIR=$TMP_DIR/iu
  mkdir -p $OUTPUT_DIR
  cd $OUTPUT_DIR
  
  download_and_select nh "https://nrc-digital-repository.canada.ca/eng/view/dataset/?id=c7e34fa7-7629-43c2-bd6d-19b32bf64f60" "tar -zxvf Nunavut-Hansard-Inuktitut-English-Parallel-Corpus-3.0.1.tgz" iu Nunavut-Hansard-Inuktitut-English-Parallel-Corpus-3.0/NunavutHansard > /dev/null &
  download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.iu-en.tsv.gz" "gunzip -f wikititles-v2.iu-en.tsv.gz" iu wikititles-v2.iu-en.tsv 1 2 &

  wait

  # remove previous results
  rm -f all.??
  find ./ -maxdepth 1 -name "*.iu" | sort -V | xargs cat | nh/Nunavut-Hansard-Inuktitut-English-Parallel-Corpus-3.0/scripts/normalize-iu-spelling.pl > all.iu
  find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
  paste all.iu all.en | awk -F $'\t' '$1!=""&&$2!=""' > all.iuen
  cut -f1 all.iuen > $DEST/train.iu_CA-en_XX.iu_CA
  cut -f2 all.iuen > $DEST/train.iu_CA-en_XX.en_XX
}

prepare_km() {
  OUTPUT_DIR=$TMP_DIR/km
  mkdir -p $OUTPUT_DIR
  cd $OUTPUT_DIR

  download_and_select paracrawl "http://data.statmt.org/wmt20/translation-task/ps-km/wmt20-sent.en-km.xz" "unxz wmt20-sent.en-km.zx" km wmt20-sent.en-km 2 1 &

  # km-parallel has multiple sets, concat all of them together
  mkdir -p opus
  cd opus
  wget -nc "http://data.statmt.org/wmt20/translation-task/ps-km/km-parallel.tgz"
  tar -zxvf km-parallel.tgz
  find ./km-parallel -maxdepth 1 -name "*.km" | sort -V | xargs cat > opus.km
  find ./km-parallel -maxdepth 1 -name "*.en" | sort -V | xargs cat > opus.en
  cd ..
  ln -sf opus/opus.km .
  ln -sf opus/opus.en .

  wait

  # remove previous results
  rm -f all.??
  find ./ -maxdepth 1 -name "*.km" | sort -V | xargs cat > all.km
  find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
  lid_filter km all.km $DEST/train.km_KH-en_XX.km_KH en all.en $DEST/train.km_KH-en_XX.en_XX
}

prepare_ps() {
  OUTPUT_DIR=$TMP_DIR/ps
  mkdir -p $OUTPUT_DIR
  cd $OUTPUT_DIR

  download_and_select paracrawl "http://data.statmt.org/wmt20/translation-task/ps-km/wmt20-sent.en-ps.xz" "unxz wmt20-sent.en-ps.xz" ps wmt20-sent.en-ps 2 1 &
  download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.ps-en.tsv.gz" "gunzip -f wikititles-v2.ps-en.tsv.gz" ps wikititles-v2.ps-en.tsv 1 2 &
  # ps-parallel has multiple sets, concat all of them together
  mkdir -p opus
  cd opus
  wget -nc "http://data.statmt.org/wmt20/translation-task/ps-km/ps-parallel.tgz"
  tar -zxvf ps-parallel.tgz
  find ./ps-parallel -maxdepth 1 -name "*.ps" | sort -V | xargs cat > opus.ps
  find ./ps-parallel -maxdepth 1 -name "*.en" | sort -V | xargs cat > opus.en
  cd ..
  ln -sf opus/opus.ps opus.ps
  ln -sf opus/opus.en opus.en

  wait

  # remove previous results
  rm -f all.??
  find ./ -maxdepth 1 -name "*.ps" | sort -V | xargs cat > all.ps
  find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
  lid_filter ps all.ps $DEST/train.ps_AF-en_XX.ps_AF en all.en $DEST/train.ps_AF-en_XX.en_XX
}

download_commoncrawl() {
  mkdir -p $COMMONCRAWL_DIR
  cd $COMMONCRAWL_DIR

  wget -nc "http://www.statmt.org/wmt13/training-parallel-commoncrawl.tgz"
  tar -zxvf training-parallel-commoncrawl.tgz
}
link_commoncrawl() {
  LANG=$1
  ln -sf $COMMONCRAWL_DIR/commoncrawl.$LANG-en.en commoncrawl.en
  ln -sf $COMMONCRAWL_DIR/commoncrawl.$LANG-en.$LANG commoncrawl.$LANG
}

strip_xlf() {
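  # Pull the <source> / <target> segments out of an XLIFF file and write them to
  # $INPUT_FILE.$SRC and $INPUT_FILE.$TGT, one sentence per line.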
  INPUT_FILE=$1
  SRC=$2
  TGT=$3
  grep '<source xml:lang=' $INPUT_FILE | sed 's/^<[^<>]*>//g' | sed 's/<[^<>]*>$//g' > $INPUT_FILE.$SRC
  grep '<target xml:lang=' $INPUT_FILE | sed 's/^<[^<>]*>//g' | sed 's/<[^<>]*>$//g' > $INPUT_FILE.$TGT
}

download_and_process_tilde() {
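  # Download a Tilde RAPID corpus, unpack it, run PROCESS_CMD (e.g. strip_xlf) on it,
  # and expose the result as tilde.$LANG / tilde.en symlinks.
  #   $1 URL   $2 UNCOMPRESS_CMD   $3 FILENAME (without language suffix)   $4 LANG   $5 PROCESS_CMD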
  URL=$1
  UNCOMPRESS_CMD=$2
  FILENAME=$3
  LANG=$4
  PROCESS_CMD=$5

  mkdir -p tilde
  cd tilde
  wget -nc $URL
  $UNCOMPRESS_CMD
  echo "executing cmd"
  echo $PROCESS_CMD
  $PROCESS_CMD
  cd ..
  ln -sf tilde/$FILENAME.$LANG tilde.$LANG
  ln -sf tilde/$FILENAME.en tilde.en
}

prepare_cs() {
  OUTPUT_DIR=$TMP_DIR/cs
  mkdir -p $OUTPUT_DIR
  cd $OUTPUT_DIR

  #download_and_select europarl "http://www.statmt.org/europarl/v10/training/europarl-v10.cs-en.tsv.gz" "gunzip europarl-v10.cs-en.tsv.gz" cs europarl-v10.cs-en.tsv 1 2 &
  #download_and_select paracrawl "https://s3.amazonaws.com/web-language-models/paracrawl/release5.1/en-cs.txt.gz" "gunzip en-cs.txt.gz" cs en-cs.txt 2 1 &
  #link_commoncrawl cs
  #download_and_select newscommentary "http://data.statmt.org/news-commentary/v15/training/news-commentary-v15.cs-en.tsv.gz" "gunzip news-commentary-v15.cs-en.tsv.gz" cs news-commentary-v15.cs-en.tsv 1 2 &
  #download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.cs-en.tsv.gz" "gunzip wikititles-v2.cs-en.tsv.gz" cs wikititles-v2.cs-en.tsv 1 2 &
  #download_and_process_tilde "http://data.statmt.org/wmt20/translation-task/rapid/RAPID_2019.cs-en.xlf.gz" "gunzip RAPID_2019.cs-en.xlf.gz" RAPID_2019.cs-en.xlf cs "strip_xlf RAPID_2019.cs-en.xlf cs en" &
  #download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.cs-en.langid.tsv.gz" "gunzip WikiMatrix.v1.cs-en.langid.tsv.gz" cs WikiMatrix.v1.cs-en.langid.tsv 2 3 &

  #wait

  # remove previous results
  #rm -f all.??
  #find ./ -maxdepth 1 -name "*.cs" | sort -V | xargs cat > all.cs
  #find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
  if [ ! -f "$CZENG_CORPUS" ] ;
  then
          echo "Please download the CzEng corpus manually and place it at $CZENG_CORPUS. Exiting..."
          exit 1
  fi
  cat $CZENG_CORPUS | sed '/^$/d' | cut -f5 > all.cs
  cat $CZENG_CORPUS | sed '/^$/d' | cut -f6 > all.en

  lid_filter cs all.cs $DEST/train.cs_CZ-en_XX.cs_CZ en all.en $DEST/train.cs_CZ-en_XX.en_XX
}

prepare_de() {
  OUTPUT_DIR=$TMP_DIR/de
  mkdir -p $OUTPUT_DIR
  cd $OUTPUT_DIR

  download_and_select europarl "http://www.statmt.org/europarl/v10/training/europarl-v10.de-en.tsv.gz" "gunzip europarl-v10.de-en.tsv.gz" de europarl-v10.de-en.tsv 1 2 &
  download_and_select paracrawl "https://s3.amazonaws.com/web-language-models/paracrawl/release5.1/en-de.txt.gz"  "gunzip en-de.txt.gz" de en-de.txt 2 1 &
  link_commoncrawl de
  download_and_select newscommentary "http://data.statmt.org/news-commentary/v15/training/news-commentary-v15.de-en.tsv.gz" "gunzip news-commentary-v15.de-en.tsv.gz" de news-commentary-v15.de-en.tsv 1 2 &
  download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.de-en.tsv.gz" "gunzip wikititles-v2.de-en.tsv.gz" de wikititles-v2.de-en.tsv 1 2 &
  download_and_process_tilde "http://data.statmt.org/wmt20/translation-task/rapid/RAPID_2019.de-en.xlf.gz" "gunzip RAPID_2019.de-en.xlf.gz" RAPID_2019.de-en.xlf de "strip_xlf RAPID_2019.de-en.xlf de en" &
  download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.de-en.langid.tsv.gz" "gunzip WikiMatrix.v1.de-en.langid.tsv.gz" de WikiMatrix.v1.de-en.langid.tsv 2 3 &

  wait

  # remove previous results
  rm -f all.??
  find ./ -maxdepth 1 -name "*.de" | sort -V | xargs cat > all.de
  find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
  lid_filter de all.de $DEST/train.de_DE-en_XX.de_DE en all.en $DEST/train.de_DE-en_XX.en_XX
}

prepare_tmx() {
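  # Convert a TMX translation memory into plain-text bitext files using TMX2Corpus.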
  TMX_FILE=$1
  git clone https://github.com/amake/TMX2Corpus $UTILS/tmx2corpus
  pip install tinysegmenter

  python $UTILS/tmx2corpus/tmx2corpus.py $TMX_FILE
}

prepare_pl() {
  OUTPUT_DIR=$TMP_DIR/pl
  mkdir -p $OUTPUT_DIR
  cd $OUTPUT_DIR

  # download_and_select europarl "http://www.statmt.org/europarl/v10/training/europarl-v10.pl-en.tsv.gz" "gunzip europarl-v10.pl-en.tsv.gz" pl europarl-v10.pl-en.tsv 1 2 &
  # download_and_select paracrawl "https://s3.amazonaws.com/web-language-models/paracrawl/release5.1/en-pl.txt.gz" "gunzip en-pl.txt.gz" pl en-pl.txt 2 1 &
  # download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.pl-en.tsv.gz" "gunzip wikititles-v2.pl-en.tsv.gz" pl wikititles-v2.pl-en.tsv 1 2 &
  download_and_select tilde "https://tilde-model.s3-eu-west-1.amazonaws.com/rapid2019.en-pl.tmx.zip" "gunzip rapid2019.en-pl.tmx.zip" bitext pl "prepare_tmx RAPID_2019.UNIQUE.en-pl.tmx" &
  # download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.en-pl.langid.tsv.gz" "gunzip WikiMatrix.v1.en-pl.langid.tsv.gz" pl WikiMatrix.v1.en-pl.langid.tsv 3 2 &

  wait

  # remove previous results
  rm -f all.??
  find ./ -maxdepth 1 -name "*.pl" | sort -V | xargs cat > all.pl
  find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
  lid_filter pl all.pl $DEST/train.pl_PL-en_XX.pl_PL en all.en $DEST/train.pl_PL-en_XX.en_XX
}

prepare_uncorpus() {
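  # Download the multi-part UN Parallel Corpus, reassemble the split archive, and
  # expose it as uncorpus.$LANG / uncorpus.en symlinks.
  #   $1 URLS   space-separated list of archive part URLs
  #   $2 FILES  space-separated list of the downloaded part filenames (in cat order)
  #   $3 LANG   non-English language code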
  URLS=$1
  FILES=$2
  LANG=$3

  mkdir -p uncorpus
  cd uncorpus

  for URL in $URLS; do
    wget -nc $URL
  done
  cat $FILES > uncorpus.tar.gz
  tar -zxvf uncorpus.tar.gz

  cd ..
  ln -sf uncorpus/en-$LANG/UNv1.0.en-$LANG.$LANG uncorpus.$LANG
  ln -sf uncorpus/en-$LANG/UNv1.0.en-$LANG.en uncorpus.en
}

prepare_yandex() {
  mkdir -p yandex
  cd yandex
  unzip -o $YANDEX_CORPUS -d ./
  cd ..
  ln -sf yandex/corpus.en_ru.1m.en yandex.en
  ln -sf yandex/corpus.en_ru.1m.ru yandex.ru
}

prepare_ru() {
  OUTPUT_DIR=$TMP_DIR/ru
  mkdir -p $OUTPUT_DIR
  cd $OUTPUT_DIR

  download_and_select paracrawl "https://s3.amazonaws.com/web-language-models/paracrawl/release1/paracrawl-release1.en-ru.zipporah0-dedup-clean.tgz" "tar -zxvf paracrawl-release1.en-ru.zipporah0-dedup-clean.tgz" ru paracrawl-release1.en-ru.zipporah0-dedup-clean &
  link_commoncrawl ru
  download_and_select newscommentary "http://data.statmt.org/news-commentary/v15/training/news-commentary-v15.en-ru.tsv.gz" "gunzip news-commentary-v15.en-ru.tsv.gz" ru news-commentary-v15.en-ru.tsv 2 1 &
  prepare_yandex &
  download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.ru-en.tsv.gz" "gunzip wikititles-v2.ru-en.tsv.gz" ru wikititles-v2.ru-en.tsv 1 2 &
  prepare_uncorpus "https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.00 https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.01 https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-ru.tar.gz.02" "UNv1.0.en-ru.tar.gz.00 UNv1.0.en-ru.tar.gz.01 UNv1.0.en-ru.tar.gz.02" &
  download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.en-ru.langid.tsv.gz" "gunzip WikiMatrix.v1.en-ru.langid.tsv.gz" ru WikiMatrix.v1.en-ru.langid.tsv 3 2 &

  wait

  # remove previous results
  rm -f all.??
  find ./ -maxdepth 1 -name "*.ru" | sort -V | xargs cat > all.ru
  find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
  lid_filter ru all.ru $DEST/train.ru_RU-en_XX.ru_RU en all.en $DEST/train.ru_RU-en_XX.en_XX
}

prepare_ccmt() {
  mkdir -p ccmt
  cd ccmt
  # assume ccmt data is already unzipped under CCMT_DIR folder
  cat $CCMT_DIR/datum2017/Book*_cn.txt | sed 's/ //g' > datum2017.detok.zh
  cat $CCMT_DIR/datum2017/Book*_en.txt > datum2017.detok.en
  cat $CCMT_DIR/casict2011/casict-A_ch.txt $CCMT_DIR/casict2011/casict-B_ch.txt $CCMT_DIR/casict2015/casict2015_ch.txt $CCMT_DIR/datum2015/datum_ch.txt $CCMT_DIR/neu2017/NEU_cn.txt datum2017.detok.zh > ccmt.zh
  cat $CCMT_DIR/casict2011/casict-A_en.txt $CCMT_DIR/casict2011/casict-B_en.txt $CCMT_DIR/casict2015/casict2015_en.txt $CCMT_DIR/datum2015/datum_en.txt $CCMT_DIR/neu2017/NEU_en.txt datum2017.detok.en > ccmt.en
  cd ..
  ln -sf ccmt/ccmt.zh ccmt.zh
  ln -sf ccmt/ccmt.en ccmt.en
}

prepare_zh() {
  OUTPUT_DIR=$TMP_DIR/zh
  mkdir -p $OUTPUT_DIR
  cd $OUTPUT_DIR

  download_and_select newscommentary "http://data.statmt.org/news-commentary/v15/training/news-commentary-v15.en-zh.tsv.gz" "gunzip news-commentary-v15.en-zh.tsv.gz" zh news-commentary-v15.en-zh.tsv 2 1 &
  download_and_select wikititles "http://data.statmt.org/wikititles/v2/wikititles-v2.zh-en.tsv.gz" "gunzip wikititles-v2.zh-en.tsv.gz" zh wikititles-v2.zh-en.tsv 1 2 &
  prepare_uncorpus "https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.00 https://stuncorpusprod.blob.core.windows.net/corpusfiles/UNv1.0.en-zh.tar.gz.01" "UNv1.0.en-zh.tar.gz.00 UNv1.0.en-zh.tar.gz.01" &
  prepare_ccmt &
  download_and_select wikimatrix "http://data.statmt.org/wmt20/translation-task/WikiMatrix/WikiMatrix.v1.en-zh.langid.tsv.gz" "gunzip WikiMatrix.v1.en-zh.langid.tsv.gz" zh WikiMatrix.v1.en-zh.langid.tsv 3 2 &

  wait

  # remove previous results
  rm -f all.??
  find ./ -maxdepth 1 -name "*.zh" | sort -V | xargs cat > all.zh
  find ./ -maxdepth 1 -name "*.en" | sort -V | xargs cat > all.en
  lid_filter zh all.zh $DEST/train.zh_CN-en_XX.zh_CN en all.en $DEST/train.zh_CN-en_XX.en_XX
}

prepare_tests() {
  OUTPUT_DIR=$TMP_DIR
  mkdir -p $OUTPUT_DIR
  cd $OUTPUT_DIR
  wget -nc http://data.statmt.org/wmt20/translation-task/dev.tgz
  tar -zxvf dev.tgz
  cd dev
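  # newsdev2020 dev sets are split round-robin into two halves: the first half is
  # used as the validation set, the second half as the test set.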

  cat newsdev2020-jaen-src.ja.sgm | $UTILS/strip_sgm.sh > newsdev2020-jaen.ja
  cat newsdev2020-jaen-ref.en.sgm | $UTILS/strip_sgm.sh > newsdev2020-jaen.en
  split newsdev2020-jaen.ja -a 0 -n r/1/2 > $DEST/valid.ja_XX-en_XX.ja_XX
  split newsdev2020-jaen.en -a 0 -n r/1/2 > $DEST/valid.ja_XX-en_XX.en_XX
  split newsdev2020-jaen.ja -a 0 -n r/2/2 > $DEST/test.ja_XX-en_XX.ja_XX
  split newsdev2020-jaen.en -a 0 -n r/2/2 > $DEST/test.ja_XX-en_XX.en_XX

  cat newsdev2020-iuen-src.iu.sgm | $UTILS/strip_sgm.sh > newsdev2020-iuen.iu
  cat newsdev2020-iuen-ref.en.sgm | $UTILS/strip_sgm.sh > newsdev2020-iuen.en
  split newsdev2020-iuen.iu -a 0 -n r/1/2 > $DEST/valid.iu_CA-en_XX.iu_CA
  split newsdev2020-iuen.en -a 0 -n r/1/2 > $DEST/valid.iu_CA-en_XX.en_XX
  split newsdev2020-iuen.iu -a 0 -n r/2/2 > $DEST/test.iu_CA-en_XX.iu_CA
  split newsdev2020-iuen.en -a 0 -n r/2/2 > $DEST/test.iu_CA-en_XX.en_XX

  cat newsdev2020-taen-src.ta.sgm | $UTILS/strip_sgm.sh > newsdev2020-taen.ta
  cat newsdev2020-taen-ref.en.sgm | $UTILS/strip_sgm.sh > newsdev2020-taen.en
  split newsdev2020-taen.ta -a 0 -n r/1/2 > $DEST/valid.ta_IN-en_XX.ta_IN
  split newsdev2020-taen.en -a 0 -n r/1/2 > $DEST/valid.ta_IN-en_XX.en_XX
  split newsdev2020-taen.ta -a 0 -n r/2/2 > $DEST/test.ta_IN-en_XX.ta_IN
  split newsdev2020-taen.en -a 0 -n r/2/2 > $DEST/test.ta_IN-en_XX.en_XX

  cp wikipedia.dev.km-en.km $DEST/valid.km_KH-en_XX.km_KH
  cp wikipedia.dev.km-en.en $DEST/valid.km_KH-en_XX.en_XX
  cp wikipedia.devtest.km-en.km $DEST/test.km_KH-en_XX.km_KH
  cp wikipedia.devtest.km-en.en $DEST/test.km_KH-en_XX.en_XX

  cp wikipedia.dev.ps-en.ps $DEST/valid.ps_AF-en_XX.ps_AF
  cp wikipedia.dev.ps-en.en $DEST/valid.ps_AF-en_XX.en_XX
  cp wikipedia.devtest.ps-en.ps $DEST/test.ps_AF-en_XX.ps_AF
  cp wikipedia.devtest.ps-en.en $DEST/test.ps_AF-en_XX.en_XX

  cat newsdev2020-plen-src.pl.sgm | $UTILS/strip_sgm.sh > newsdev2020-plen.pl
  cat newsdev2020-plen-ref.en.sgm | $UTILS/strip_sgm.sh > newsdev2020-plen.en
  split newsdev2020-plen.pl -a 0 -n r/1/2 > $DEST/valid.pl_PL-en_XX.pl_PL
  split newsdev2020-plen.en -a 0 -n r/1/2 > $DEST/valid.pl_PL-en_XX.en_XX
  split newsdev2020-plen.pl -a 0 -n r/2/2 > $DEST/test.pl_PL-en_XX.pl_PL
  split newsdev2020-plen.en -a 0 -n r/2/2 > $DEST/test.pl_PL-en_XX.en_XX

  cat newstest2018-encs-src.en.sgm | $UTILS/strip_sgm.sh > $DEST/valid.en_XX-cs_CZ.en_XX
  cat newstest2018-encs-ref.cs.sgm | $UTILS/strip_sgm.sh > $DEST/valid.en_XX-cs_CZ.cs_CZ
  cat newstest2019-encs-src.en.sgm | $UTILS/strip_sgm.sh > $DEST/test.en_XX-cs_CZ.en_XX
  cat newstest2019-encs-ref.cs.sgm | $UTILS/strip_sgm.sh > $DEST/test.en_XX-cs_CZ.cs_CZ

  cat newstest2018-deen-src.de.sgm | $UTILS/strip_sgm.sh > $DEST/valid.de_DE-en_XX.de_DE
  cat newstest2018-deen-ref.en.sgm | $UTILS/strip_sgm.sh > $DEST/valid.de_DE-en_XX.en_XX
  cat newstest2018-ende-src.en.sgm | $UTILS/strip_sgm.sh > $DEST/valid.en_XX-de_DE.en_XX
  cat newstest2018-ende-ref.de.sgm | $UTILS/strip_sgm.sh > $DEST/valid.en_XX-de_DE.de_DE
  cat newstest2019-deen-src.de.sgm | $UTILS/strip_sgm.sh > $DEST/test.de_DE-en_XX.de_DE
  cat newstest2019-deen-ref.en.sgm | $UTILS/strip_sgm.sh > $DEST/test.de_DE-en_XX.en_XX
  cat newstest2019-ende-src.en.sgm | $UTILS/strip_sgm.sh > $DEST/test.en_XX-de_DE.en_XX
  cat newstest2019-ende-ref.de.sgm | $UTILS/strip_sgm.sh > $DEST/test.en_XX-de_DE.de_DE

  cat newstest2018-ruen-src.ru.sgm | $UTILS/strip_sgm.sh > $DEST/valid.ru_RU-en_XX.ru_RU
  cat newstest2018-ruen-ref.en.sgm | $UTILS/strip_sgm.sh > $DEST/valid.ru_RU-en_XX.en_XX
  cat newstest2018-enru-src.en.sgm | $UTILS/strip_sgm.sh > $DEST/valid.en_XX-ru_RU.en_XX
  cat newstest2018-enru-ref.ru.sgm | $UTILS/strip_sgm.sh > $DEST/valid.en_XX-ru_RU.ru_RU
  cat newstest2019-ruen-src.ru.sgm | $UTILS/strip_sgm.sh > $DEST/test.ru_RU-en_XX.ru_RU
  cat newstest2019-ruen-ref.en.sgm | $UTILS/strip_sgm.sh > $DEST/test.ru_RU-en_XX.en_XX
  cat newstest2019-enru-src.en.sgm | $UTILS/strip_sgm.sh > $DEST/test.en_XX-ru_RU.en_XX
  cat newstest2019-enru-ref.ru.sgm | $UTILS/strip_sgm.sh > $DEST/test.en_XX-ru_RU.ru_RU

  cat newstest2018-zhen-src.zh.sgm | $UTILS/strip_sgm.sh > $DEST/valid.zh_CN-en_XX.zh_CN
  cat newstest2018-zhen-ref.en.sgm | $UTILS/strip_sgm.sh > $DEST/valid.zh_CN-en_XX.en_XX
  cat newstest2018-enzh-src.en.sgm | $UTILS/strip_sgm.sh > $DEST/valid.en_XX-zh_CN.en_XX
  cat newstest2018-enzh-ref.zh.sgm | $UTILS/strip_sgm.sh > $DEST/valid.en_XX-zh_CN.zh_CN
  cat newstest2019-zhen-src.zh.sgm | $UTILS/strip_sgm.sh > $DEST/test.zh_CN-en_XX.zh_CN
  cat newstest2019-zhen-ref.en.sgm | $UTILS/strip_sgm.sh > $DEST/test.zh_CN-en_XX.en_XX
  cat newstest2019-enzh-src.en.sgm | $UTILS/strip_sgm.sh > $DEST/test.en_XX-zh_CN.en_XX
  cat newstest2019-enzh-ref.zh.sgm | $UTILS/strip_sgm.sh > $DEST/test.en_XX-zh_CN.zh_CN
}

mkdir -p $DEST

prepare_lid
prepare_moses
download_commoncrawl

prepare_ja &
prepare_ta &
prepare_km &
prepare_ps &
prepare_iu &
prepare_cs &
prepare_de &
prepare_pl &
prepare_ru &
prepare_zh &

# prepare valid/test set
prepare_tests &

# wait for all background preparation jobs to finish
wait

# TODO remove intermediate files
# rm -rf $TMP_DIR