import os
import sys
import argparse

import numpy as np
import torch
import transformers
from datasets import load_dataset, load_from_disk, concatenate_datasets, Dataset
from transformers import (
    AutoConfig,
    AutoModel,
    AutoModelForCausalLM,
    AutoModelForSequenceClassification,
    AutoModelWithLMHead,
    AutoTokenizer,
    GPT2Model,
    GPT2TokenizerFast,
    PreTrainedTokenizerFast,
    Trainer,
    TrainingArguments,
    default_data_collator,
)
from transformers.trainer_utils import get_last_checkpoint

# Tokenizer and output configuration. The tokenizer has no dedicated padding
# token here, so the EOS token is reused for padding.
tokenizer = AutoTokenizer.from_pretrained("/tokenizer/loc")
tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})

out_dir = "/out_dir/ylilauta"
max_length = 1024

# Read the Ylilauta corpus: each line is "<label> <text>".
path = r"/data/ylilauta-corpus/data/100-percent/train.txt"
text = []
labels = []
with open(path, "r", encoding="utf-8") as f:
    for line in f:
        parts = line.split(" ", maxsplit=1)
        labels.append(parts[0])
        text.append(parts[1].rstrip("\n"))

# Build a Hugging Face Dataset and collect the label inventory.
data_dict = {"text": text, "labels": labels}
dataset = Dataset.from_dict(data_dict)
label_names = dataset.unique("labels")
n_labels = len(label_names)

def to_one_hot(examples):
    # Convert the string label into a one-hot vector over the label inventory.
    label = np.zeros(n_labels)
    label[label_names.index(examples["labels"])] = 1
    return {"text": examples["text"], "labels": label.tolist()}


def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=max_length)


# One-hot encode the labels, tokenize the text, and hold out 10% for evaluation.
tokenized = dataset.map(to_one_hot).map(tokenize_function).train_test_split(test_size=0.1)

tokenized.save_to_disk(out_dir)
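
# A minimal sketch (not part of the original script) of how the saved splits
# could be reloaded later, e.g. before fine-tuning a classifier; `out_dir` is
# the directory written above:
#
#     from datasets import load_from_disk
#
#     tokenized = load_from_disk(out_dir)  # DatasetDict with "train"/"test" splits
#     print(tokenized["train"][0]["labels"])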