# nb-nordic-lid/code/prepare_data.sh
# Last commit by versae: "First full version of the models" (486585a)
#wget http://downloads.tatoeba.org/exports/sentences.tar.bz2
#bunzip2 sentences.tar.bz2
#tar xvf sentences.tar
#awk -F"\t" '{print"__label__"$2" "$3}' < sentences.csv | shuf > all.txt
#head -3 all.txt
#head -n 10000 all.txt > validation_tatoeba.txt
#tail -n +10001 all.txt > train_tatoeba.txt
# Abort on the first failing command (or use of an unset variable) so a broken
# generator script cannot silently lead to incomplete *_all.txt files.
set -eu

# NOTE(review): these two scripts presumably write the per-source
# train*/validation*/test* .txt files consumed below -- confirm against them.
python create_fasttext_data.py
python create_tatoeba_data.py

# For every split, concatenate all "<split>*.txt" files in the current
# directory, shuffle the lines and store the result in "<split>_all.txt".
for split in train validation test; do
  # The glob "<split>*.txt" also matches "<split>_all.txt" itself. Remove the
  # output of a previous run first so it can never be fed back in as input
  # (which would duplicate every line on a rerun).
  rm -f -- "${split}_all.txt"
  # Shuffle into a name the glob cannot match and rename afterwards, so the
  # final file does not exist while cat expands and reads its inputs.
  cat -- "${split}"*.txt | shuf > "${split}_all.txt.tmp"
  mv -- "${split}_all.txt.tmp" "${split}_all.txt"
done
# Convert each merged "__label__<lang> <text>" file (<split>_all.txt) into a
# two-column CSV (<split>_all.csv with columns "lang" and "text").
# The heredoc delimiter is quoted so the shell hands the Python source over
# verbatim instead of applying $, backtick and backslash expansion to it.
python <<'EOF'
from pathlib import Path

import pandas as pd

LABEL_PREFIX = "__label__"


def read_records(txt_path):
    """Parse a labelled text file into a list of [lang, text] records.

    Every non-empty line is expected to look like "__label__<lang> <text>".
    The first len(LABEL_PREFIX) characters are dropped and the remainder is
    split at the first space. A line holding only a label produces an empty
    text field rather than a one-element record.
    """
    records = []
    # Decode as UTF-8 explicitly instead of relying on the locale default,
    # and split on "\n" only so other line-boundary characters inside a
    # sentence are left untouched.
    for line in Path(txt_path).read_text(encoding="utf-8").split("\n"):
        if not line:
            continue
        lang, _, text = line[len(LABEL_PREFIX):].partition(" ")
        records.append([lang, text])
    return records


for split in ("train", "validation", "test"):
    frame = pd.DataFrame.from_records(
        read_records(split + "_all.txt"), columns=["lang", "text"]
    )
    frame.to_csv(split + "_all.csv", index=False)
EOF