---
language:
- ko
tags:
- roberta
license: mit
---
## Training code
```python
from datasets import load_dataset
from tokenizers import ByteLevelBPETokenizer

tokenizer = ByteLevelBPETokenizer(unicode_normalizer="nfkc", trim_offsets=True)

ds = load_dataset("Bingsu/my-korean-training-corpus", use_auth_token=True)
# to use a public dataset instead:
# ds = load_dataset("cc100", lang="ko")  # 50GB

# This dataset is 35GB; using all of it blew up my machine,
# so only a portion (35%) was used.
ds_sample = ds["train"].train_test_split(0.35, seed=20220819)["test"]

def gen_text(batch_size: int = 5000):
    for i in range(0, len(ds_sample), batch_size):
        yield ds_sample[i : i + batch_size]["text"]

tokenizer.train_from_iterator(
    gen_text(),
    vocab_size=50265,
    min_frequency=2,
    special_tokens=[
        "<s>",
        "<pad>",
        "</s>",
        "<unk>",
        "<mask>",
    ],
)
tokenizer.save("my_tokenizer.json")
```
Training took about 7 hours (i5-12600, non-K).
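To verify the result, the saved file can be loaded back with the `tokenizers` library and used directly (a minimal sketch; the sample sentence is arbitrary):

```python
from tokenizers import Tokenizer

tok = Tokenizer.from_file("my_tokenizer.json")
enc = tok.encode("안녕하세요, 토크나이저 테스트 문장입니다.")
print(enc.tokens)  # byte-level BPE pieces
print(enc.ids)     # matching vocabulary ids
```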
## Usage
1.

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Bingsu/BBPE_tokenizer_test")
# tokenizer will be an instance of RobertaTokenizerFast.
```
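For example, encoding and decoding a sentence (a minimal sketch; the sample text is arbitrary):

```python
enc = tokenizer("안녕하세요, 반갑습니다.")
print(enc.input_ids)                    # ids including the <s> ... </s> special tokens
print(tokenizer.decode(enc.input_ids))  # decodes back, special tokens included
```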
2.

Download the `tokenizer.json` file from this repository first.
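One way to fetch it (a sketch using `huggingface_hub`; downloading the file manually from the repo page works just as well):

```python
from huggingface_hub import hf_hub_download

# downloads tokenizer.json into the local Hugging Face cache and returns its path
path = hf_hub_download(repo_id="Bingsu/BBPE_tokenizer_test", filename="tokenizer.json")
```

If you go this route, pass `path` as the `tokenizer_file` argument below.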
```python
from transformers import BartTokenizerFast, BertTokenizerFast

bart_tokenizer = BartTokenizerFast(tokenizer_file="tokenizer.json")
bert_tokenizer = BertTokenizerFast(tokenizer_file="tokenizer.json")
```
It can of course be loaded into BART, which uses BBPE just like RoBERTa, and it can even be loaded into BERT. Note, however, that tokenizers loaded this way have no `model_max_length` set, so you have to set it yourself.
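For example (a minimal sketch; 512 is an assumed value, use whatever your model expects):

```python
# model_max_length is unset when loading from a bare tokenizer.json,
# so truncation would otherwise have no default length to fall back on
bart_tokenizer.model_max_length = 512  # assumed value
bert_tokenizer.model_max_length = 512
```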