# Source: gpt-fi/data/fine-tuning/create_ylilauta.py
# Author: Vaino Hatanpaa
# Commit: ceedef8 ("add training and evaluation scripts")
# File size at that commit: 1.86 kB
import os
import sys
from datasets import load_dataset, load_from_disk, concatenate_datasets, Dataset
from transformers import PreTrainedTokenizerFast
import transformers
from transformers import (
AutoConfig,
AutoModelForCausalLM,
Trainer,
TrainingArguments,
default_data_collator,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers import AutoModelWithLMHead, AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoModel
from transformers import GPT2Model
from transformers import GPT2TokenizerFast
import transformers
import torch
import numpy as np
import argparse
# --- Configuration ---------------------------------------------------------
# Raw corpus file, read line by line below; get it from
# https://github.com/spyysalo/ylilauta-corpus
path = r"/data/ylilauta-corpus/data/100-percent/train.txt"
# Directory the processed dataset is written to with save_to_disk().
out_dir = "/out_dir/ylilauta"
# Token length every example is padded / truncated to during tokenization.
max_length = 1024

# Load the tokenizer and reuse its end-of-sequence token as the padding token,
# since tokenization below pads every example to max_length.
tokenizer = AutoTokenizer.from_pretrained("/tokenizer/loc")
tokenizer.add_special_tokens({"pad_token": tokenizer.eos_token})
# --- Corpus parsing --------------------------------------------------------
# Each non-blank line is expected to be "<label> <text>": everything before the
# first space is the label, everything after it is the example text.
text = []
labels = []
with open(path, "r", encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        if not line.strip():
            # Blank lines (e.g. an empty line at end of file) hold no example;
            # skip them instead of failing on the missing text field.
            continue
        label, sep, body = line.partition(" ")
        if not sep:
            # No space at all: there is no text field to store. Fail with the
            # location rather than a bare IndexError.
            raise ValueError(
                f"{path}:{line_no}: expected '<label> <text>', got {line!r}"
            )
        labels.append(label)
        # NOTE: body is stored unmodified, i.e. it still ends with the line's
        # newline character.
        text.append(body)

data_dict = {"text": text, "labels": labels}
dataset = Dataset.from_dict(data_dict)

# Distinct label strings; a label's index in this list is its one-hot position.
label_names = dataset.unique("labels")
n_labels = len(label_names)
def to_one_hot(examples):
    """Replace one example's string label with a one-hot list of floats.

    ``examples`` is a single row with "text" and "labels" keys. The returned
    row keeps "text" unchanged and sets "labels" to a list of ``n_labels``
    floats that is 1.0 at the label's index in ``label_names`` and 0.0
    elsewhere. Raises ValueError if the label is not in ``label_names``.
    """
    hot_index = label_names.index(examples["labels"])
    one_hot = [0.0] * n_labels
    one_hot[hot_index] = 1.0
    return {"text": examples["text"], "labels": one_hot}
def tokenize_function(examples):
    """Tokenize the "text" field, padding or truncating to ``max_length``."""
    encoded = tokenizer(
        examples["text"],
        max_length=max_length,
        truncation=True,
        padding="max_length",
    )
    return encoded
# --- Build and save the dataset --------------------------------------------
# to_one_hot works on one row at a time, so it is mapped per example.
one_hot_dataset = dataset.map(to_one_hot)
tokenized = (
    one_hot_dataset
    # The tokenizer accepts a list of texts, so tokenize in batches rather
    # than issuing one tokenizer call per example.
    .map(tokenize_function, batched=True)
    # Hold out 10% for evaluation; the fixed seed keeps the split identical
    # across runs so results on the test split stay comparable.
    .train_test_split(test_size=0.1, seed=42)
)
tokenized.save_to_disk(out_dir)