#!/bin/bash
# Train a tokenizer for gpt2-medium-persian on the deduplicated Persian
# OSCAR corpus by delegating to src/train_tokenizer.py.
# Requires: python on PATH, src/train_tokenizer.py relative to the CWD,
# and network access to download the dataset.
set -euo pipefail   # abort on any failed step, unset var, or failed pipeline stage

export LC_ALL=C.UTF-8
export LANG=C.UTF-8

# Destination directory for the trained tokenizer files.
export OUTPUT_DIR=/home/saied/code/gpt2-medium-persian
# HuggingFace `datasets` name + config: deduplicated Persian OSCAR split.
export DATASET_NAME=oscar
export DATASET_CONFIG_NAME=unshuffled_deduplicated_fa
export VOCAB_SIZE=50000
export MIN_FREQUENCY=2
# Comma-separated special-token list (shell quoting is stripped; the value
# passed on is a single comma-joined string). Includes GPT-2-style markers,
# Persian-specific <zwnj>, and 20 reserved [U*] slots.
export SPECIAL_TOKENS='<s>','<pad>','</s>','<unk>','<mask>','<|endoftext|>','<|startoftext|>','<sep>','<cls>','<nl>','<tab>','<zwnj>','[U1]','[U2]','[U3]','[U4]','[U5]','[U6]','[U7]','[U8]','[U9]','[U10]','[U11]','[U12]','[U13]','[U14]','[U15]','[U16]','[U17]','[U18]','[U19]','[U20]'

# NOTE(review): removed a stray trailing '|' after the last argument — it left
# the command piping into nothing, which is a syntax error.
python src/train_tokenizer.py \
  --output_dir="$OUTPUT_DIR" \
  --dataset_name="$DATASET_NAME" \
  --dataset_config_name="$DATASET_CONFIG_NAME" \
  --vocab_size="$VOCAB_SIZE" \
  --min_frequency="$MIN_FREQUENCY" \
  --special_tokens="$SPECIAL_TOKENS"