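# Preprocessing script for the Yle Finnish news corpus: walk a directory of JSON
# dumps, collect each article's text and subject tags, keep articles whose subjects
# include one of the whitelisted categories, tokenize the texts, and save the result
# to disk as a Hugging Face Dataset for subject classification.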
import os
import json

import numpy as np
from datasets import Dataset
from transformers import AutoTokenizer

root = r'G:\Data\yle\data'

# Per-article accumulators filled while walking the raw JSON dumps.
texts = []
subjects = []
first_subjects = []
first_ids = []
subject_ids = []

for path, subdirs, files in os.walk(root):
    for name in files:
        print(os.path.join(path, name))
        with open(os.path.join(path, name), encoding="utf8") as f:
            data = json.load(f)

        for i in range(len(data["data"])):
            try:
                txt = ""
                s = []
                s_ids = []

                # Concatenate headings and body text into a single newline-separated string.
                for c in data["data"][i]["content"]:
                    if c["type"] in ("heading", "text"):
                        txt += c["text"]
                        txt += "\n"

                first = ""
                first_id = ""
                if "subjects" in data["data"][i]:
                    first = data["data"][i]["subjects"][0]["title"]["fi"]
                    first_id = data["data"][i]["subjects"][0]["id"]
                    for subject in data["data"][i]["subjects"]:
                        s.append(subject["title"]["fi"])
                        s_ids.append(subject["id"])

                first_subjects.append(first)
                first_ids.append(first_id)
                texts.append(txt)
                subjects.append(s)
                subject_ids.append(s_ids)
            except Exception:
                # Skip articles with missing content, subjects, or Finnish titles.
                pass

dataset = Dataset.from_dict(
    {
        "text": texts,
        "subjects": subjects,
        "first_subject": first_subjects,
        "first_ids": first_ids,
        "subject_ids": subject_ids,
    }
)

tokenizer_loc = "/tokenizer_loc"

tokenizer = AutoTokenizer.from_pretrained(tokenizer_loc)
# The tokenizer (GPT-2 style) ships without a pad token, so reuse the EOS token for padding.
tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

def find_major_subject(example):
    # Whitelist of subject tags kept for classification (Finnish Yle subject titles).
    good_subjects = ["urheilu", "Kotimaan uutiset", "Ulkomaat", "jääkiekko", "talous",
                     "politiikka", "poliisi", "Liikenne ja kuljetus", "kulttuuri", "puolueet",
                     "onnettomuudet", "musiikki", "Koulutus ja kasvatus", "Venäjä", "tieliikenne",
                     "luonto", "autot", "terveys", "Helsinki", "Pohjoismaat", "kunnat", "Eurooppa",
                     "rikokset", "vaalit", "Yhdysvallat", "lainvalvonta"]
    main_subject = None
    label = np.zeros(len(good_subjects))
    # The first whitelisted subject becomes the article's main subject and its one-hot label.
    for subject in example["subjects"]:
        if subject in good_subjects:
            main_subject = subject
            label[good_subjects.index(subject)] = 1
            break
    return {"main_subject": main_subject, "labels": label}

# Keep only articles that have at least one whitelisted subject.
filtered = dataset.map(find_major_subject, num_proc=12).filter(
    lambda example: example["main_subject"] is not None
)

def tokenize_function(examples):
    # Pad/truncate every article to a fixed 800 tokens so the saved dataset has uniform-length inputs.
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=800)


tokenized_and_filtered_dataset = filtered.map(tokenize_function, batched=True)

tokenized_and_filtered_dataset.save_to_disk("/output/dir")
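
# Optional sanity check, a minimal sketch assuming the save above succeeded:
# reload the freshly written dataset with load_from_disk and print its schema,
# which should list the text, subject, label, and token-id columns.
from datasets import load_from_disk

reloaded = load_from_disk("/output/dir")
print(reloaded)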