JustinLin610
update
10b0761
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
#
source_lang=kk_KZ
target_lang=en_XX
MODEL=criss_checkpoints/criss.3rd.pt
SPM=criss_checkpoints/sentence.bpe.model
SPLIT=test
LANG_DICT=criss_checkpoints/lang_dict.txt
SPM_ENCODE=flores/scripts/spm_encode.py
SAVE_ENCODER=save_encoder.py
ENCODER_SAVE_ROOT=sentence_embeddings/$MODEL
DICT=criss_checkpoints/dict.txt
THRESHOLD=1.02
MIN_COUNT=500
DATA_DIR=data_tmp
SAVE_DIR=mining/${source_lang}_${target_lang}_mined
ENCODER_SAVE_DIR=${ENCODER_SAVE_ROOT}/${source_lang}-${target_lang}
INPUT_DIR=$DATA_DIR/${source_lang}-${target_lang}-tatoeba
mkdir -p $ENCODER_SAVE_DIR/${target_lang}
mkdir -p $ENCODER_SAVE_DIR/${source_lang}
mkdir -p $SAVE_DIR
## Save encoder outputs
# Save encoder outputs for source sentences
python $SAVE_ENCODER \
${INPUT_DIR} \
--path ${MODEL} \
--task translation_multi_simple_epoch \
--lang-pairs ${source_lang}-${target_lang} \
--lang-dict ${LANG_DICT} \
--gen-subset ${SPLIT} \
--bpe 'sentencepiece' \
-s ${source_lang} -t ${target_lang} \
--sentencepiece-model ${SPM} \
--remove-bpe 'sentencepiece' \
--beam 1 \
--lang-tok-style mbart \
--encoder-save-dir ${ENCODER_SAVE_DIR}/${source_lang}
## Save encoder outputs for target sentences
python $SAVE_ENCODER \
${INPUT_DIR} \
--path ${MODEL} \
--lang-pairs ${source_lang}-${target_lang} \
--lang-dict ${LANG_DICT} \
--task translation_multi_simple_epoch \
--gen-subset ${SPLIT} \
--bpe 'sentencepiece' \
-t ${source_lang} -s ${target_lang} \
--sentencepiece-model ${SPM} \
--remove-bpe 'sentencepiece' \
--beam 1 \
--lang-tok-style mbart \
--encoder-save-dir ${ENCODER_SAVE_DIR}/${target_lang}
## Mining
python mining/mine.py \
--src-lang ${source_lang} \
--tgt-lang ${target_lang} \
--dim 1024 \
--mem 10 \
--neighborhood 4 \
--src-dir ${ENCODER_SAVE_DIR}/${source_lang} \
--tgt-dir ${ENCODER_SAVE_DIR}/${target_lang} \
--output $SAVE_DIR \
--threshold ${THRESHOLD} \
--min-count ${MIN_COUNT} \
--valid-size 100 \
--dict-path ${DICT} \
--spm-path ${SPM} \
## Process and binarize mined data
python $SPM_ENCODE \
--model ${SPM} \
--output_format=piece \
--inputs mining/${source_lang}_${target_lang}_mined/train.${source_lang} mining/${source_lang}_${target_lang}_mined/train.${target_lang} \
--outputs mining/${source_lang}_${target_lang}_mined/train.bpe.${source_lang} mining/${source_lang}_${target_lang}_mined/train.bpe.${target_lang}
python $SPM_ENCODE \
--model ${SPM} \
--output_format=piece \
--inputs mining/${source_lang}_${target_lang}_mined/valid.${source_lang} mining/${source_lang}_${target_lang}_mined/valid.${target_lang} \
--outputs mining/${source_lang}_${target_lang}_mined/valid.bpe.${source_lang} mining/${source_lang}_${target_lang}_mined/valid.bpe.${target_lang}
fairseq-preprocess \
--source-lang ${source_lang} \
--target-lang ${target_lang} \
--trainpref mining/${source_lang}_${target_lang}_mined/train.bpe \
--validpref mining/${source_lang}_${target_lang}_mined/valid.bpe \
--destdir mining/${source_lang}_${target_lang}_mined \
--srcdict ${DICT} \
--joined-dictionary \
--workers 8