# Train a SentencePiece vocabulary (BPE or word-level) over one or more text
# files, alongside the pretrained Chinese BERT text tokenizer loaded below.
import tempfile
from argparse import ArgumentParser

import sentencepiece as spm
from transformers.models.bert.tokenization_bert import BertTokenizer

# Load the text tokenizer.
# NOTE(review): loading happens at import time, so importing this module
# triggers a (possibly network-backed) pretrained-model load — confirm
# that is intended for all importers.
tokenizer = BertTokenizer.from_pretrained("bert-base-chinese")

# Special-token constants re-exported for the rest of the project.
# BERT uses [CLS]/[SEP] where seq2seq code expects BOS/EOS, hence the aliasing.
BOS_TOKEN = tokenizer.cls_token      # beginning-of-sequence token ([CLS])
EOS_TOKEN = tokenizer.sep_token      # end-of-sequence token ([SEP])
UNK_TOKEN = tokenizer.unk_token      # unknown-token string ([UNK])
PAD_ID = tokenizer.pad_token_id      # integer id of the padding token
BOS_ID = tokenizer.cls_token_id      # integer id of [CLS]
EOS_ID = tokenizer.sep_token_id      # integer id of [SEP]
UNK_ID = tokenizer.unk_token_id      # integer id of [UNK]
def _train_vocab(input_files, vocab_file, vocab_size, algo):
    """Concatenate *input_files* and train a SentencePiece model on them.

    Produces ``<vocab_file>.model`` and ``<vocab_file>.vocab`` on disk.
    Special-token ids (pad=0, bos=1, eos=2, unk=3) are pinned so they stay
    consistent across retrainings.
    """
    print("Building token vocabulary")
    with tempfile.NamedTemporaryFile("w") as f:
        # Concatenate all input files into one temporary training file.
        for input_fname in input_files:
            with open(input_fname) as input_f:
                f.write(input_f.read() + "\n")
        # BUGFIX: flush buffered writes before training — the trainer reopens
        # the file by name, and without a flush it could see truncated or
        # empty contents.
        f.flush()
        # Run SentencePiece with the requested model type (bpe or word).
        spm.SentencePieceTrainer.Train(
            f"--add_dummy_prefix=false --pad_id=0 --bos_id=1 --eos_id=2 --unk_id=3 "
            f"--vocab_size={vocab_size} "
            f"--model_prefix={vocab_file} --model_type={algo} "
            f"--input={f.name}"
        )


# Build the condition (attribute) tokenizer
if __name__ == "__main__":
    parser = ArgumentParser()
    # fmt: off
    parser.add_argument("--input", nargs="+", required=True)
    parser.add_argument("--vocab-file", type=str, required=True)
    parser.add_argument("--vocab-size", type=int, default=31)
    parser.add_argument("--algo", type=str, default="bpe", choices=["bpe", "word"])
    # fmt: on
    args = parser.parse_args()
    _train_vocab(args.input, args.vocab_file, args.vocab_size, args.algo)