# gpt-fi/data/fine-tuning/create_yle.py
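"""Build a tokenized Yle news dataset for fine-tuning.

Walks the Yle corpus extracted from Kielipankki, collects article text and
subject tags, keeps articles whose subjects appear in a whitelist, attaches a
one-hot label vector, tokenizes the text and saves the result to disk with
Hugging Face ``datasets``.
"""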
import os
import json

import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer
root = r'G:\Data\yle\data'  # Download the Yle corpus from Kielipankki and extract it here.
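# Walk the extracted corpus (a tree of JSON files, each holding a batch of articles)
# and collect the article texts plus their subject annotations into parallel lists.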
texts = []
subjects = []
first_subjects = []
first_ids = []
subject_ids = []
for path, subdirs, files in os.walk(root):
    # The data is split into multiple files.
    for name in files:
        print(os.path.join(path, name))
        with open(os.path.join(path, name), encoding="utf8") as f:
            data = json.load(f)
        # Each file contains JSON with multiple articles.
        for i in range(len(data["data"])):
            try:
                txt = ""
                s = []      # Subjects of this article
                s_ids = []  # IDs of those subjects
                # Loop through the content and keep only heading and text elements,
                # so that no metadata ends up in the text dataset.
                for c in data["data"][i]["content"]:
                    if c["type"] in ("heading", "text"):
                        txt += c["text"]
                        txt += "\n"
                first = ""
                first_id = ""
                # An article has n subjects. Loop through them and also record which
                # one came first; that is kept as a separate column for performance.
                if "subjects" in data["data"][i]:  # Check that the article has any subjects at all.
                    first = data["data"][i]["subjects"][0]["title"]["fi"]
                    first_id = data["data"][i]["subjects"][0]["id"]
                    for subject in data["data"][i]["subjects"]:
                        s.append(subject["title"]["fi"])
                        s_ids.append(subject["id"])
                first_subjects.append(first)
                first_ids.append(first_id)
                texts.append(txt)
                subjects.append(s)
                subject_ids.append(s_ids)
            except Exception:
                # Some articles contain formatting errors; skip them, they are a
                # negligible fraction of all the articles.
                pass
dataset = Dataset.from_dict({"text": texts, "subjects": subjects, "first_subject": first_subjects,
                             "first_ids": first_ids, "subject_ids": subject_ids})
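# Load the tokenizer (the path below is a local placeholder). A GPT-2-style tokenizer
# has no pad token by default, so the EOS token is reused for padding.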
tokenizer_loc = "/tokenizer_loc"
tokenizer = AutoTokenizer.from_pretrained(tokenizer_loc)
tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
def find_major_subject(example):
    # Subjects that are kept as classification labels; articles with none of these are dropped later.
    good_subjects = ["urheilu", "Kotimaan uutiset", "Ulkomaat", "jääkiekko", "talous", "politiikka",
                     "poliisi", "Liikenne ja kuljetus", "kulttuuri", "puolueet", "onnettomuudet",
                     "musiikki", "Koulutus ja kasvatus", "Venäjä", "tieliikenne", "luonto", "autot",
                     "terveys", "Helsinki", "Pohjoismaat", "kunnat", "Eurooppa", "rikokset", "vaalit",
                     "Yhdysvallat", "lainvalvonta"]
    import numpy as np  # Some scopes broke on Windows; re-import here so the parallel map (num_proc) works.
    main_subject = None
    label = np.zeros(len(good_subjects))  # One-hot label vector over good_subjects
    for subject in example["subjects"]:
        if subject in good_subjects:
            main_subject = subject
            label[good_subjects.index(subject)] = 1
            break
    return {"main_subject": main_subject, "labels": label}
filtered = dataset.map(find_major_subject, num_proc=12).filter(lambda example: example["main_subject"] is not None)
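# Tokenize the article text to a fixed length of 800 tokens (padded and truncated).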
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=800)

tokenized_and_filtered_dataset = filtered.map(tokenize_function, batched=True)
tokenized_and_filtered_dataset.save_to_disk("/output/dir")
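# A sketch of how the saved dataset could be reloaded for fine-tuning (the output
# path is the same placeholder as above; the column names follow from this script):
#
#     from datasets import load_from_disk
#     ds = load_from_disk("/output/dir")
#     ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
#     print(ds[0]["input_ids"].shape, ds[0]["labels"])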