OFA-OCR / fairseq /examples /roberta /preprocess_GLUE_tasks.sh
JustinLin610's picture
first commit
ee21b96
raw
history blame
5.74 kB
#!/bin/bash
# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
# raw glue data as downloaded by glue download script (https://gist.github.com/W4ngatang/60c2bdb54d156a41194446737ce03e2e)
if [[ $# -ne 2 ]]; then
echo "Run as following:"
echo "./examples/roberta/preprocess_GLUE_tasks.sh <glud_data_folder> <task_name>"
exit 1
fi
GLUE_DATA_FOLDER=$1
# download bpe encoder.json, vocabulary and fairseq dictionary
wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/encoder.json'
wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/vocab.bpe'
wget -N 'https://dl.fbaipublicfiles.com/fairseq/gpt2_bpe/dict.txt'
TASKS=$2 # QQP
if [ "$TASKS" = "ALL" ]
then
TASKS="QQP MNLI QNLI MRPC RTE STS-B SST-2 CoLA"
fi
for TASK in $TASKS
do
echo "Preprocessing $TASK"
TASK_DATA_FOLDER="$GLUE_DATA_FOLDER/$TASK"
echo "Raw data as downloaded from glue website: $TASK_DATA_FOLDER"
SPLITS="train dev test"
INPUT_COUNT=2
if [ "$TASK" = "QQP" ]
then
INPUT_COLUMNS=( 4 5 )
TEST_INPUT_COLUMNS=( 2 3 )
LABEL_COLUMN=6
elif [ "$TASK" = "MNLI" ]
then
SPLITS="train dev_matched dev_mismatched test_matched test_mismatched"
INPUT_COLUMNS=( 9 10 )
TEST_INPUT_COLUMNS=( 9 10 )
DEV_LABEL_COLUMN=16
LABEL_COLUMN=12
elif [ "$TASK" = "QNLI" ]
then
INPUT_COLUMNS=( 2 3 )
TEST_INPUT_COLUMNS=( 2 3 )
LABEL_COLUMN=4
elif [ "$TASK" = "MRPC" ]
then
INPUT_COLUMNS=( 4 5 )
TEST_INPUT_COLUMNS=( 4 5 )
LABEL_COLUMN=1
elif [ "$TASK" = "RTE" ]
then
INPUT_COLUMNS=( 2 3 )
TEST_INPUT_COLUMNS=( 2 3 )
LABEL_COLUMN=4
elif [ "$TASK" = "STS-B" ]
then
INPUT_COLUMNS=( 8 9 )
TEST_INPUT_COLUMNS=( 8 9 )
LABEL_COLUMN=10
# Following are single sentence tasks.
elif [ "$TASK" = "SST-2" ]
then
INPUT_COLUMNS=( 1 )
TEST_INPUT_COLUMNS=( 2 )
LABEL_COLUMN=2
INPUT_COUNT=1
elif [ "$TASK" = "CoLA" ]
then
INPUT_COLUMNS=( 4 )
TEST_INPUT_COLUMNS=( 2 )
LABEL_COLUMN=2
INPUT_COUNT=1
fi
# Strip out header and filter lines that don't have expected number of fields.
rm -rf "$TASK_DATA_FOLDER/processed"
mkdir -p "$TASK_DATA_FOLDER/processed"
for SPLIT in $SPLITS
do
# CoLA train and dev doesn't have header.
if [[ ( "$TASK" = "CoLA") && ( "$SPLIT" != "test" ) ]]
then
cp "$TASK_DATA_FOLDER/$SPLIT.tsv" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp";
else
tail -n +2 "$TASK_DATA_FOLDER/$SPLIT.tsv" > "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp";
fi
# Remove unformatted lines from train and dev files for QQP dataset.
if [[ ( "$TASK" = "QQP") && ( "$SPLIT" != "test" ) ]]
then
awk -F '\t' -v NUM_FIELDS=6 'NF==NUM_FIELDS{print}{}' "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp" > "$TASK_DATA_FOLDER/processed/$SPLIT.tsv";
else
cp "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv";
fi
rm "$TASK_DATA_FOLDER/processed/$SPLIT.tsv.temp";
done
# Split into input0, input1 and label
for SPLIT in $SPLITS
do
for INPUT_TYPE in $(seq 0 $((INPUT_COUNT-1)))
do
if [[ "$SPLIT" != test* ]]
then
COLUMN_NUMBER=${INPUT_COLUMNS[$INPUT_TYPE]}
else
COLUMN_NUMBER=${TEST_INPUT_COLUMNS[$INPUT_TYPE]}
fi
cut -f"$COLUMN_NUMBER" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv" > "$TASK_DATA_FOLDER/processed/$SPLIT.raw.input$INPUT_TYPE";
done
if [[ "$SPLIT" != test* ]]
then
if [ "$TASK" = "MNLI" ] && [ "$SPLIT" != "train" ]
then
cut -f"$DEV_LABEL_COLUMN" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv" > "$TASK_DATA_FOLDER/processed/$SPLIT.label";
else
cut -f"$LABEL_COLUMN" "$TASK_DATA_FOLDER/processed/$SPLIT.tsv" > "$TASK_DATA_FOLDER/processed/$SPLIT.label";
fi
fi
# BPE encode.
for INPUT_TYPE in $(seq 0 $((INPUT_COUNT-1)))
do
LANG="input$INPUT_TYPE"
echo "BPE encoding $SPLIT/$LANG"
python -m examples.roberta.multiprocessing_bpe_encoder \
--encoder-json encoder.json \
--vocab-bpe vocab.bpe \
--inputs "$TASK_DATA_FOLDER/processed/$SPLIT.raw.$LANG" \
--outputs "$TASK_DATA_FOLDER/processed/$SPLIT.$LANG" \
--workers 60 \
--keep-empty;
done
done
# Remove output directory.
rm -rf "$TASK-bin"
DEVPREF="$TASK_DATA_FOLDER/processed/dev.LANG"
TESTPREF="$TASK_DATA_FOLDER/processed/test.LANG"
if [ "$TASK" = "MNLI" ]
then
DEVPREF="$TASK_DATA_FOLDER/processed/dev_matched.LANG,$TASK_DATA_FOLDER/processed/dev_mismatched.LANG"
TESTPREF="$TASK_DATA_FOLDER/processed/test_matched.LANG,$TASK_DATA_FOLDER/processed/test_mismatched.LANG"
fi
# Run fairseq preprocessing:
for INPUT_TYPE in $(seq 0 $((INPUT_COUNT-1)))
do
LANG="input$INPUT_TYPE"
fairseq-preprocess \
--only-source \
--trainpref "$TASK_DATA_FOLDER/processed/train.$LANG" \
--validpref "${DEVPREF//LANG/$LANG}" \
--testpref "${TESTPREF//LANG/$LANG}" \
--destdir "$TASK-bin/$LANG" \
--workers 60 \
--srcdict dict.txt;
done
if [[ "$TASK" != "STS-B" ]]
then
fairseq-preprocess \
--only-source \
--trainpref "$TASK_DATA_FOLDER/processed/train.label" \
--validpref "${DEVPREF//LANG/label}" \
--destdir "$TASK-bin/label" \
--workers 60;
else
# For STS-B output range is converted to be between: [0.0, 1.0]
mkdir -p "$TASK-bin/label"
awk '{print $1 / 5.0 }' "$TASK_DATA_FOLDER/processed/train.label" > "$TASK-bin/label/train.label"
awk '{print $1 / 5.0 }' "$TASK_DATA_FOLDER/processed/dev.label" > "$TASK-bin/label/valid.label"
fi
done