# gpt-fi/data/fine-tuning/create_xed.py
# Build a tokenized XED (Finnish emotion) dataset and save it to disk for fine-tuning.
import numpy as np
from datasets import load_dataset, concatenate_datasets
from transformers import AutoTokenizer
# Load the tokenizer from a local checkpoint and reuse the EOS token as the pad
# token, since GPT-style tokenizers do not define one by default.
tokenizer = AutoTokenizer.from_pretrained("/checkpoint/loc")
tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

out_dir = "/out_dir/xed"
max_length = 1024
# XED ships the Finnish data in two configurations: "fi_annotated" (sentences
# with multi-label emotion annotations) and "fi_neutral" (neutral-only sentences).
fi_annotated_raw = load_dataset("xed_en_fi", "fi_annotated")
fi_neutral_raw = load_dataset("xed_en_fi", "fi_neutral")
def to_arr(examples):
    # Wrap each single neutral label in a list so the schema matches the
    # multi-label "fi_annotated" configuration.
    labels = []
    for item in examples["labels"]:
        labels.append([item])
    return {"sentence": examples["sentence"], "labels": labels}
fi_neutral_mapped = fi_neutral_raw["train"].map(to_arr, batched=True)
fi_neutral_mapped_cast = fi_neutral_mapped.cast(fi_annotated_raw["train"].features)
# Combine the neutral and annotated examples into a single dataset.
concat_raw_set = concatenate_datasets([fi_neutral_mapped_cast, fi_annotated_raw["train"]])
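# Optional sanity check (not in the original script): after the cast, both
# splits should share the same feature schema; concatenate_datasets would
# otherwise raise an error.
assert fi_neutral_mapped_cast.features == fi_annotated_raw["train"].features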
def tokenize_function(examples):
    # Tokenize and pad/truncate every sentence to a fixed length.
    return tokenizer(examples["sentence"], padding="max_length", truncation=True, max_length=max_length)
def to_arr_2(examples):
    # Convert each list of label indices into a 9-dimensional multi-hot vector
    # (XED uses 8 emotion classes plus neutral).
    labels = []
    for item in examples["labels"]:
        label = np.zeros(9)
        label[item] = 1
        labels.append(label.tolist())
    return {"sentence": examples["sentence"], "labels": labels}
# Tokenize, convert labels to multi-hot vectors, shuffle, and carve out a 10%
# test split, then save the result for the fine-tuning script.
tokenized_datasets = (
    concat_raw_set.map(tokenize_function, batched=True)
    .map(to_arr_2, batched=True)
    .shuffle(seed=42)
    .train_test_split(test_size=0.1)
)
tokenized_datasets.save_to_disk(out_dir)
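# Quick round-trip check (an addition, not part of the original script): reload
# the saved DatasetDict to confirm it was written correctly. A downstream
# training script would load it the same way with datasets.load_from_disk.
from datasets import load_from_disk

reloaded = load_from_disk(out_dir)
print(reloaded)
print(reloaded["train"][0]["labels"])  # 9-dim multi-hot label vector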