Spaces:
Runtime error
Runtime error
File size: 1,740 Bytes
ee21b96 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 |
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
# Paths used throughout this script. All of them are relative, so they resolve
# against the directory the script is run from.
DATA="data_tmp"                                   # working dir for the download and per-language outputs
DICT="criss_checkpoints/dict.txt"                 # passed to fairseq-preprocess --srcdict
SPM_MODEL="criss_checkpoints/sentence.bpe.model"  # passed to spm_encode.py --model
SPM_ENCODE="flores/scripts/spm_encode.py"         # encoder script, run with python
#######################################
# Download a file with wget unless it is already present on disk.
# Arguments:
#   $1 - destination path for the downloaded file
#   $2 - URL to fetch
# Outputs:
#   Progress messages on stdout; the failure message on stderr.
# Returns:
#   0 if the file already existed or was downloaded, 1 if the download failed.
#######################################
download_data() {
  # Locals keep the function from leaking CORPORA/URL into the caller's scope.
  local corpora=$1
  local url=$2

  if [[ -f "$corpora" ]]; then
    echo "$corpora already exists, skipping download"
    return 0
  fi

  echo "Downloading $url"
  # NOTE(review): --no-check-certificate turns off TLS certificate validation,
  # so the download is not authenticated. Kept as in the original; confirm it
  # is still required before relying on the fetched data.
  if wget "$url" -O "$corpora" --no-check-certificate && [[ -f "$corpora" ]]; then
    echo "$url successfully downloaded."
    return 0
  fi

  # Remove whatever a failed wget left behind so a later run does not mistake
  # it for a complete download and skip the fetch.
  rm -f -- "$corpora"
  echo "$url not successfully downloaded." >&2
  return 1
}
# Fetch the flores repository (it provides scripts/spm_encode.py) unless a
# checkout is already present. A clone is a directory, so test with -d: -f only
# matches regular files and would make every re-run attempt the clone again.
if [[ -d flores ]]; then
  echo "flores already cloned"
else
  git clone https://github.com/facebookresearch/flores
fi
# Download the Wikipedia en/ne/si test-set archive into $DATA and unpack it there.
mkdir -p "$DATA"
download_data "$DATA/wikipedia_en_ne_si_test_sets.tgz" "https://github.com/facebookresearch/flores/raw/master/data/wikipedia_en_ne_si_test_sets.tgz"
# Stop if the archive is not on disk: without it the extraction and every
# later step would only fail with less obvious errors.
if [[ ! -f "$DATA/wikipedia_en_ne_si_test_sets.tgz" ]]; then
  echo "$DATA/wikipedia_en_ne_si_test_sets.tgz is missing, cannot extract test sets" >&2
  exit 1
fi
# Extract from inside $DATA so the archive contents land there. Abort if the
# directory change fails so tar never runs in some other directory.
pushd "$DATA" || exit 1
pwd
tar -vxf wikipedia_en_ne_si_test_sets.tgz
popd || exit 1
# Encode and binarize the test split of each language paired with English.
# The extracted files are named with the first two characters of $lang
# (${lang:0:2}); the outputs are named with the full $lang code and en_XX.
#
# TEST_PREFIX is the same for every language, so it is set once up front.
# The :? guard aborts when DATA is empty, which would otherwise point the
# rm -rf below at a directory directly under /.
TEST_PREFIX="${DATA:?DATA must be set}/wikipedia_en_ne_si_test_sets/wikipedia.test"
for lang in ne_NP si_LK; do
  datadir="$DATA/${lang}-en_XX-flores"
  # Start from an empty output directory on every run.
  rm -rf -- "${datadir:?}"
  mkdir -p -- "$datadir"
  # apply the sentencepiece model to both sides of the test set
  python "$SPM_ENCODE" \
    --model "${SPM_MODEL}" \
    --output_format=piece \
    --inputs "${TEST_PREFIX}.${lang:0:2}-en.${lang:0:2}" "${TEST_PREFIX}.${lang:0:2}-en.en" \
    --outputs "$datadir/test.bpe.${lang}-en_XX.${lang}" "$datadir/test.bpe.${lang}-en_XX.en_XX" \
    || { echo "spm_encode failed for ${lang}" >&2; exit 1; }
  # binarize data
  fairseq-preprocess \
    --source-lang "${lang}" --target-lang en_XX \
    --testpref "$datadir/test.bpe.${lang}-en_XX" \
    --destdir "$datadir" \
    --srcdict "${DICT}" \
    --joined-dictionary \
    --workers 4 \
    || { echo "fairseq-preprocess failed for ${lang}" >&2; exit 1; }
done
|