Vaino Hatanpaa
committed on
Commit · ceedef8
1 Parent(s): debeca1
add training and evaluation scripts
- data/fine-tuning/create_online_reviews.py +28 -0
- data/fine-tuning/create_xed.py +57 -0
- data/fine-tuning/create_yle.py +95 -0
- data/fine-tuning/create_ylilauta.py +58 -0
- data/fine-tuning/online_reviews_loading.py +51 -0
- data/tokenize.py +50 -0
- data/train_tokenizer.py +25 -0
- evaluate_and_analyze/evaluate.py +132 -0
- evaluate_and_analyze/few_shot.ipynb +0 -0
- evaluate_and_analyze/generation.ipynb +0 -0
- finetune.py +150 -0
- train.py +97 -0
data/fine-tuning/create_online_reviews.py
ADDED
@@ -0,0 +1,28 @@
from datasets import load_dataset
from transformers import AutoTokenizer
import datasets
import os


def main():
    datasets.set_caching_enabled(False)
    tokenizer = AutoTokenizer.from_pretrained(r"/tokenizer/loc")

    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

    data_loc = "path/to/review/jsons"
    data_files = [fil.path for fil in os.scandir(data_loc)]
    dataset = load_dataset('online_reviews_loading.py', data_files=data_files)

    def tokenize_function(examples):
        return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=512)

    def process_rating(example):
        example["labels"] = [float(item) for item in example["rating"]]
        return example

    dataset = dataset["train"].map(tokenize_function, batched=True).map(process_rating, batched=True, remove_columns=['rating']).shuffle(seed=42).train_test_split(test_size=0.1)


if __name__ == "__main__":
    main()
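Note that, unlike the other create_* scripts below, this one builds the tokenized train/test split but never writes it out. A minimal sketch of persisting the result at the end of main(), assuming the same save_to_disk convention as the sibling scripts and a hypothetical /out_dir/online_reviews path:

    out_dir = "/out_dir/online_reviews"  # assumed path, not part of the original script
    dataset.save_to_disk(out_dir)        # mirrors create_xed.py / create_ylilauta.py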
data/fine-tuning/create_xed.py
ADDED
@@ -0,0 +1,57 @@
import os
import sys
from datasets import load_dataset, concatenate_datasets
from transformers import PreTrainedTokenizerFast
import transformers
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    default_data_collator,
    GPT2Tokenizer
)
from transformers.trainer_utils import get_last_checkpoint
from transformers import AutoModelWithLMHead, AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoModel

from transformers import GPT2Model
from transformers import GPT2TokenizerFast
import transformers
import torch
import numpy as np
import argparse

tokenizer = AutoTokenizer.from_pretrained("/checkpoint/loc")
tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
out_dir = "/out_dir/xed"
max_length = 1024


fi_annotated_raw = load_dataset("xed_en_fi", "fi_annotated")
fi_neutral_raw = load_dataset("xed_en_fi", "fi_neutral")

def to_arr(examples):
    labels = []
    for item in examples["labels"]:
        labels.append([item])
    return {"sentence": examples["sentence"], "labels": labels}

fi_neutral_mapped = fi_neutral_raw["train"].map(to_arr, batched=True)

fi_neutral_mapped_cast = fi_neutral_mapped.cast(fi_annotated_raw["train"].features)
concat_raw_set = concatenate_datasets([fi_neutral_mapped_cast, fi_annotated_raw["train"]])  # combine neutral and the other labels into a single dataset

def tokenize_function(examples):
    return tokenizer(examples["sentence"], padding="max_length", truncation=True, max_length=max_length)

def to_arr_2(examples):
    labels = []
    for item in examples["labels"]:
        label = np.zeros(9)
        label[item] = 1
        labels.append(label.tolist())
    return {"sentence": examples["sentence"], "labels": labels}

tokenized_datasets = concat_raw_set.map(tokenize_function, batched=True).map(to_arr_2, batched=True).shuffle(seed=42).train_test_split(test_size=0.1)
tokenized_datasets.save_to_disk(out_dir)
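The saved DatasetDict is what finetune.py later reads back with load_from_disk. A quick sanity check of the output, assuming the out_dir above:

from datasets import load_from_disk

ds = load_from_disk("/out_dir/xed")
print(ds)                               # DatasetDict with "train" and "test" splits
print(len(ds["train"][0]["labels"]))    # 9-dimensional multi-hot emotion vector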
data/fine-tuning/create_yle.py
ADDED
@@ -0,0 +1,95 @@
import os
import json
import sys
from datasets import load_dataset, load_from_disk, concatenate_datasets, Dataset
from transformers import PreTrainedTokenizerFast
import transformers
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    default_data_collator,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers import AutoModelWithLMHead, AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoModel

from transformers import GPT2Model
from transformers import GPT2TokenizerFast
import transformers
import torch
import numpy as np

root = r'G:\Data\yle\data'  # download from Kielipankki and extract

texts = []
subjects = []
first_subjects = []
first_ids = []
subject_ids = []

for path, subdirs, files in os.walk(root):
    # Data is split into multiple files
    for name in files:
        print(os.path.join(path, name))
        with open(os.path.join(path, name), encoding="utf8") as f:
            data = json.load(f)

        # Each file contains JSON with multiple articles
        for i in range(len(data["data"])):
            try:
                txt = ""
                s = []  # Subjects
                s_ids = []  # Ids for the subjects
                # Loop through the content and keep only headings and text, as we do not want to add metadata to a text dataset
                for c in data["data"][i]["content"]:
                    if c["type"] in ("heading", "text"):
                        txt += c["text"]
                        txt += "\n"
                first = ""
                # An article contains n subjects. Loop through those and also save which one was first. We want that as a distinct column in the dataset for performance.
                if "subjects" in data["data"][i]:  # To know if we have a first subject, first check whether the JSON even has subjects.
                    first = data["data"][i]["subjects"][0]["title"]["fi"]
                    first_id = data["data"][i]["subjects"][0]["id"]
                    for subject in data["data"][i]["subjects"]:
                        s.append(subject["title"]["fi"])
                        s_ids.append(subject["id"])
                first_subjects.append(first)
                first_ids.append(first_id)
                texts.append(txt)
                subjects.append(s)
                subject_ids.append(s_ids)
            except:
                # Some texts contain formatting errors; just skip those as they are a negligible portion of all the articles.
                pass


dataset = Dataset.from_dict({"text": texts, "subjects": subjects, "first_subject": first_subjects, "first_ids": first_ids, "subject_ids": subject_ids})

tokenizer_loc = "/tokenizer_loc"

tokenizer = AutoTokenizer.from_pretrained(tokenizer_loc)
tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

def find_major_subject(example):
    good_subjects = ["urheilu","Kotimaan uutiset","Ulkomaat","jääkiekko","talous","politiikka","poliisi","Liikenne ja kuljetus","kulttuuri","puolueet","onnettomuudet","musiikki","Koulutus ja kasvatus","Venäjä","tieliikenne","luonto","autot","terveys","Helsinki","Pohjoismaat","kunnat","Eurooppa","rikokset","vaalit","Yhdysvallat","lainvalvonta"]
    import numpy as np  # Some scopes were broken on Windows, so import again here to get parallel processing to work...
    example["main_subject"] = None
    label = np.zeros(len(good_subjects))  # sparse label vector, used as a one-hot target below
    for subject in example["subjects"]:
        if subject in good_subjects:
            example["main_subject"] = subject
            label[good_subjects.index(subject)] = 1
            #example["labels"] = label
            break
    return {"labels": label}

filtered = dataset.map(find_major_subject, num_proc=12).filter(lambda example: example['main_subject'] != None)

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=800)
tokenized_and_filtered_dataset = filtered.map(tokenize_function, batched=True)

tokenized_and_filtered_dataset.save_to_disk("/output/dir")
data/fine-tuning/create_ylilauta.py
ADDED
@@ -0,0 +1,58 @@
import os
import sys
from datasets import load_dataset, load_from_disk, concatenate_datasets, Dataset
from transformers import PreTrainedTokenizerFast
import transformers
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    default_data_collator,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers import AutoModelWithLMHead, AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoModel

from transformers import GPT2Model
from transformers import GPT2TokenizerFast
import transformers
import torch
import numpy as np
import argparse

tokenizer = AutoTokenizer.from_pretrained("/tokenizer/loc")
tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
out_dir = "/out_dir/ylilauta"
max_length = 1024

#checkpoint_loc = r"H:\Data_temp\checkpoints\good_large\checkpoint-67400"
#output_dir = r"H:\Data_temp\checkpoints\tests\yle"

path = r"/data/ylilauta-corpus/data/100-percent/train.txt"  # get from https://github.com/spyysalo/ylilauta-corpus
text = []
labels = []
with open(path, "r", encoding="utf-8") as f:
    for line in f:
        parts = line.split(" ", maxsplit=1)
        labels.append(parts[0])
        text.append(parts[1])

data_dict = {"text": text, "labels": labels}
dataset = Dataset.from_dict(data_dict)
label_names = dataset.unique('labels')
n_labels = len(label_names)

def to_one_hot(examples):
    import numpy as np

    label = np.zeros(n_labels)
    label[label_names.index(examples["labels"])] = 1

    return {"text": examples["text"], "labels": label.tolist()}

def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True, max_length=max_length)

tokenized = dataset.map(to_one_hot).map(tokenize_function).train_test_split(test_size=0.1)

tokenized.save_to_disk(out_dir)
data/fine-tuning/online_reviews_loading.py
ADDED
@@ -0,0 +1,51 @@
from datasets import load_dataset
import datasets
import json
import numpy as np
import os
#Dataset loading script that is missing quite a lot of details but works
class NewDataset(datasets.GeneratorBasedBuilder):
    def _info(self):
        return datasets.DatasetInfo(
            description="beep boop",
            features=datasets.Features(
                {
                    "description": datasets.Value("string"),
                    "text": datasets.Value("string"),
                    "rating": datasets.Value("int32")
                }
            ),
            # No default supervised_keys (as we have to pass both question
            # and context as input).
            supervised_keys=None,
            homepage="no",
            citation="no",
        )

    def _split_generators(self, dl_manager):
        files = self.config.data_files
        return [datasets.SplitGenerator(name=datasets.Split.TRAIN, gen_kwargs={"files": files["train"]})]

    def _generate_examples(
        self, files  # method parameters are unpacked from `gen_kwargs` as given in `_split_generators`
    ):
        """ Yields examples as (key, example) tuples. """
        # This method handles input defined in _split_generators to yield (key, example) tuples from the dataset.
        # The `key` is here for legacy reasons (tfds) and is not important in itself.
        #print("files", files)
        key = 0
        for file in files:
            with open(file, encoding="utf-8") as f:
                data = json.load(f)

            for item in data:
                for review in item["reviews"]:
                    yield key, {
                        "description": item["description_raw"],
                        "text": review["reviewText"],
                        "rating": review["rating"],
                    }
                    key += 1
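As in create_online_reviews.py above, the builder is used by pointing load_dataset at this script and passing the review JSON files; a minimal sketch (paths are placeholders):

import os
from datasets import load_dataset

data_loc = "path/to/review/jsons"                      # placeholder, as in create_online_reviews.py
data_files = [f.path for f in os.scandir(data_loc)]
dataset = load_dataset("online_reviews_loading.py", data_files=data_files)
print(dataset["train"][0])                             # {"description": ..., "text": ..., "rating": ...}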
data/tokenize.py
ADDED
@@ -0,0 +1,50 @@
from datasets import Dataset, load_dataset, concatenate_datasets
import datasets
from transformers import GPT2TokenizerFast
from tokenizers.processors import TemplateProcessing

input_dir = "dataset_location"
tokenizer_file = "path/to/file"
output_dir = "output/dir"
tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_file)
#Add eos tokens to the tokenization pipeline as they are not added otherwise
tokenizer._tokenizer.post_processor = TemplateProcessing(
    single="$0 "+tokenizer.eos_token,
    pair="$A "+tokenizer.eos_token+" $B:1 "+tokenizer.eos_token,
    special_tokens=[(tokenizer.eos_token, 0)],
)

def tokenize_function(examples):
    return tokenizer(examples["text"])


def group_texts(examples):
    #Group texts into fixed-size blocks. This is based on the Hugging Face CLM example.
    block_size = 1024
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_len = len(concatenated_examples[list(examples.keys())[0]])
    total_len = (total_len // block_size) * block_size
    result = {
        k: [t[i:i+block_size] for i in range(0, total_len, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

def main():
    num_proc = 12  # set to something appropriate
    dataset = datasets.load_from_disk(input_dir)  # This loads a saved dataset object from disk. You could also create a dataset from an iterable or load one like:
    #dataset = load_dataset("Finnish-NLP/mc4_fi_cleaned", split="train").remove_columns(["timestamp","url"])  # Example usage from the Hugging Face Hub

    #Tokenize, filter out very short texts, and group texts into blocks of attention size
    dataset\
        .shuffle(seed=42, load_from_cache_file=False, writer_batch_size=100000)\
        .map(tokenize_function, batched=True, num_proc=num_proc, remove_columns=dataset.column_names, load_from_cache_file=False, writer_batch_size=100000)\
        .filter(lambda e: len(e["input_ids"]) > 20, num_proc=num_proc, load_from_cache_file=False, writer_batch_size=100000)\
        .map(group_texts, batched=True, num_proc=num_proc, load_from_cache_file=False, writer_batch_size=100000)\
        .train_test_split(test_size=0.05, load_from_cache_file=False, writer_batch_size=100000)\
        .save_to_disk(output_dir)
    print(dataset)

if __name__ == "__main__":
    main()
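After group_texts, every example should be a full 1024-token block whose labels equal its input_ids. A small sanity check on the saved output, assuming the output_dir above:

import datasets

ds = datasets.load_from_disk("output/dir")             # same output_dir as above
sample = ds["train"][0]
assert len(sample["input_ids"]) == 1024                # block_size from group_texts
assert sample["labels"] == sample["input_ids"]         # causal LM targets are the inputs themselves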
data/train_tokenizer.py
ADDED
@@ -0,0 +1,25 @@
from tokenizers import Tokenizer, normalizers, models, pre_tokenizers, processors, ByteLevelBPETokenizer
import tokenizers

from tokenizers.models import WordPiece, BPE
from tokenizers.trainers import WordPieceTrainer, BpeTrainer
from tokenizers.pre_tokenizers import Whitespace, Punctuation, Sequence
from tokenizers.processors import TemplateProcessing
import os

from transformers import AutoTokenizer
old_tokenizer = AutoTokenizer.from_pretrained("gpt2")

import datasets
input_dir = "/dataset/location"
dataset = datasets.load_from_disk(input_dir)

def get_training_corpus():
    for start_idx in range(0, len(dataset), 10000):
        samples = dataset[start_idx : start_idx + 10000]
        yield samples["text"]

print("start")
tokenizer = old_tokenizer.train_new_from_iterator(get_training_corpus(), vocab_size=50000)
print("end")
tokenizer.save_vocabulary("/tokenizer_location")
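The saved vocabulary is later loaded back with GPT2TokenizerFast.from_pretrained (see data/tokenize.py and train.py). A quick round-trip check, assuming the same /tokenizer_location path:

from transformers import GPT2TokenizerFast

tok = GPT2TokenizerFast.from_pretrained("/tokenizer_location")
ids = tok("Tämä on esimerkkilause.")["input_ids"]
print(ids)
print(tok.decode(ids))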
evaluate_and_analyze/evaluate.py
ADDED
@@ -0,0 +1,132 @@
import os
import sys
from datasets import load_dataset, load_from_disk, concatenate_datasets
from transformers import PreTrainedTokenizerFast
import transformers
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    default_data_collator,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers import AutoModelWithLMHead, AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoModel

from transformers import GPT2Model
from transformers import GPT2TokenizerFast
import transformers
import torch
import numpy as np
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('test', type=int)
parser.add_argument('length', type=int)
#parser.add_argument('--input_file', type=int)
args = parser.parse_args()

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    import pickle
    with open("logits_{}.pickle".format("xed"), "wb") as handle:
        pickle.dump(logits, handle, protocol=pickle.HIGHEST_PROTOCOL)
    with open("labels_{}.pickle".format("xed"), "wb") as handle:
        pickle.dump(labels, handle, protocol=pickle.HIGHEST_PROTOCOL)
    #Continue in a Jupyter notebook from here

    return



class MultilabelTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.BCEWithLogitsLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels),
                        labels.float().view(-1, self.model.config.num_labels))
        return (loss, outputs) if return_outputs else loss

def main():
    ds_names = ["yle", "online_review", "xed", "ylilauta"]
    #ds_sizes = [1000, 3000, 10000, 32000, 9999999]
    print("test:", args.test)
    ds_name = ds_names[args.test]
    #ds_size = int(args.test.split()[1])
    ds_size = args.length
    print(ds_name, ds_size)

    metric = compute_metrics

    #print("cuda_avail:", torch.cuda.is_available())
    #checkpoint_loc = "/media/volume/output/checkpoint-275000"
    #output_dir = "/media/volume/fi_nlp/output/finetune"
    #checkpoint_loc = r"H:\Data_temp\checkpoints\good_large\checkpoint-67400"
    output_dir = "/data/loc/"+ds_name

    #Most of the parameters are not used, but pass them anyway to keep the Trainer happy...
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        learning_rate=5e-6,
        adam_beta1=0.95,
        adam_beta2=0.985,
        adam_epsilon=1e-8,
        weight_decay=0.001,
        lr_scheduler_type="linear",
        gradient_accumulation_steps=4,
        max_steps=10000,
        num_train_epochs=20000,
        save_total_limit=2,
        dataloader_num_workers=5,
        save_steps=100000,
        warmup_steps=500,
        do_eval=True,
        eval_steps=500,
        evaluation_strategy="steps",
        logging_strategy="steps",
        logging_steps=50,
        fp16_opt_level="O2",
        half_precision_backend="amp",
        log_on_each_node=False,
        disable_tqdm=True
    )

    print(training_args)

    dataset = load_from_disk(r"/data_loc/"+ds_name)["test"]
    #dataset = load_from_disk(r"C:\Users\vin\Documents\Projects\dippa\tests\ylilauta\tokenized_set").train_test_split(test_size=0.1)

    trainer_class = MultilabelTrainer

    #print("num_labels", num_labels)
    model = AutoModelForSequenceClassification.from_pretrained("/fine_tuning_checkpoint/"+ds_name)
    tokenizer = AutoTokenizer.from_pretrained("/fine_tuning_checkpoint/"+ds_name)
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

    print("init trainer")
    trainer = trainer_class(
        model=model,
        args=training_args,
        train_dataset=dataset,
        eval_dataset=dataset,
        tokenizer=tokenizer,
        compute_metrics=metric,
        data_collator=default_data_collator
    )
    #checkpoint = None
    #checkpoint = get_last_checkpoint(output_dir)
    #checkpoint = None
    #train_result = trainer.train()
    #trainer.save_state()
    metrics = trainer.evaluate()
    print(metrics)
    #trainer.save_model()  # Saves the tokenizer too for easy upload

if __name__ == "__main__":
    main()
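compute_metrics above only dumps the raw logits and labels; the metrics themselves are computed afterwards in a notebook. A minimal sketch of that follow-up step, reusing the same exact-match accuracy as compute_metrics in finetune.py:

import pickle
import numpy as np

with open("logits_xed.pickle", "rb") as handle:
    logits = pickle.load(handle)
with open("labels_xed.pickle", "rb") as handle:
    labels = pickle.load(handle)

predictions = np.zeros(logits.shape)
predictions[np.arange(len(predictions)), logits.argmax(1)] = 1
acc = np.all((predictions > 0.5) == (labels > 0.5), axis=1).mean()
print("exact-match accuracy:", acc)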
evaluate_and_analyze/few_shot.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
evaluate_and_analyze/generation.ipynb
ADDED
The diff for this file is too large to render.
See raw diff
finetune.py
ADDED
@@ -0,0 +1,150 @@
import os
import sys
from datasets import load_dataset, load_from_disk, concatenate_datasets
from transformers import PreTrainedTokenizerFast
import transformers
from transformers import (
    AutoConfig,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    default_data_collator,
)
from transformers.trainer_utils import get_last_checkpoint
from transformers import AutoModelWithLMHead, AutoTokenizer, AutoModelForCausalLM, AutoModelForSequenceClassification, AutoModel

from transformers import GPT2Model
from transformers import GPT2TokenizerFast
import transformers
import torch
import numpy as np
import argparse

parser = argparse.ArgumentParser()
parser.add_argument('test', type=int)
parser.add_argument('length', type=int)
#parser.add_argument('--input_file', type=int)
args = parser.parse_args()

def compute_metrics(eval_pred):
    logits, labels = eval_pred

    predictions = np.zeros(logits.shape)
    predictions[np.arange(len(predictions)), logits.argmax(1)] = 1
    predictions = predictions > 0.5

    #predictions = logits > 0.5
    labels = labels > 0.5
    return {"acc": np.all(predictions == labels, axis=1).sum() / predictions.shape[0]}

def compute_metrics_regression(eval_pred):
    logits, labels = eval_pred

    labels = np.expand_dims(labels, 1)
    val = np.abs(logits - labels).mean()
    perc = ((np.abs(logits - labels).round() < 1).sum() * 100) / (len(labels))
    perc_50 = ((np.abs(logits - labels).round()[0:50] < 1).sum() * 100) / (50)

    return {"dev": val, "perc": perc, "perc_50": perc_50}



class MultilabelTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        loss_fct = torch.nn.BCEWithLogitsLoss()
        loss = loss_fct(logits.view(-1, self.model.config.num_labels),
                        labels.float().view(-1, self.model.config.num_labels))
        return (loss, outputs) if return_outputs else loss

def main():
    ds_names = ["yle", "online_reviews", "xed", "ylilauta"]
    #ds_sizes = [1000, 3000, 10000, 32000, 9999999]
    print("test:", args.test)
    ds_name = ds_names[args.test]
    ds_size = args.length
    print(ds_name, ds_size)

    metric = compute_metrics_regression if ds_name == "online_reviews" else compute_metrics

    #print("cuda_avail:", torch.cuda.is_available())
    #checkpoint_loc = "/media/volume/output/checkpoint-275000"
    #output_dir = "/media/volume/fi_nlp/output/finetune"
    #checkpoint_loc = r"H:\Data_temp\checkpoints\good_large\checkpoint-67400"
    output_dir = "/scratch/project_462000007/hatanpav/output/dippa/gpt/"+ds_name

    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        learning_rate=5e-6,
        adam_beta1=0.95,
        adam_beta2=0.985,
        adam_epsilon=1e-8,
        weight_decay=0.001,
        lr_scheduler_type="linear",
        gradient_accumulation_steps=2,  # This assumes 4x8 GPUs; set to 64 to get the same global batch size with a single GPU
        max_steps=10000,
        num_train_epochs=20000,  # Overridden by max_steps
        save_total_limit=2,
        dataloader_num_workers=5,
        save_steps=100000,
        warmup_steps=500,
        do_eval=True,
        eval_steps=500,
        evaluation_strategy="steps",
        logging_strategy="steps",
        logging_steps=50,
        fp16_opt_level="O2",
        half_precision_backend="amp",
        log_on_each_node=False,
        disable_tqdm=True
    )

    print(training_args)

    dataset = load_from_disk(r"/path/to/data/"+ds_name)

    #Handle the regression-type task:
    n_labels = 1
    trainer_class = MultilabelTrainer
    try:
        n_labels = len(dataset["train"][0]["labels"])
    except:
        #The case of the label being a float.
        n_labels = 1
        trainer_class = Trainer
    if ds_size > len(dataset["train"]):
        ds_size = len(dataset["train"])


    model = AutoModelForSequenceClassification.from_pretrained("/checkpoint/loc", num_labels=n_labels)
    tokenizer = AutoTokenizer.from_pretrained("/checkpoint/loc")
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})

    print("init trainer")
    train_set = dataset["train"].select(range(ds_size))
    test_set = dataset["test"]
    trainer = trainer_class(
        model=model,
        args=training_args,
        train_dataset=train_set,
        eval_dataset=test_set,
        tokenizer=tokenizer,
        compute_metrics=metric,
        data_collator=default_data_collator
    )
    checkpoint = None
    #checkpoint = get_last_checkpoint(output_dir)
    train_result = trainer.train(resume_from_checkpoint=checkpoint)
    #trainer.save_state()
    metrics = trainer.evaluate()
    print(metrics)
    trainer.save_model()  # Saves the tokenizer too for easy upload

if __name__ == "__main__":
    main()
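The script takes two positional arguments: an index into ds_names and the maximum number of training examples. For example, a hypothetical invocation "python finetune.py 2 10000" would select ds_names[2] ("xed") and cap the fine-tuning set at 10000 examples.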
train.py
ADDED
@@ -0,0 +1,97 @@
import transformers
import datasets
from transformers import PreTrainedTokenizerFast
from transformers import (
    GPT2TokenizerFast,
    AutoConfig,
    AutoModelForCausalLM,
    Trainer,
    TrainingArguments,
    default_data_collator
)
from transformers.trainer_utils import get_last_checkpoint
import torch
#from transformers.utils.dummy_tokenizers_objects import PreTrainedTokenizerFast

#config_name = "C:\\Users\\vin\\Documents\\Projects\\NLP\\kielimalli\\config.json"
#tokenizer_file = "C:\\Users\\vin\\Documents\\Projects\\NLP\\models\\tokens.json"
#input_dir = "H:\\Data_temp\\tokenized_dataset"
#output_dir = "H:\\Data_temp\\checkpoints\\model1"

def main():
    import os
    #enable if required by your environment
    #os.environ["CUDA_VISIBLE_DEVICES"] = "0"
    #torch.backends.cuda.matmul.allow_tf32 = True
    #torch.backends.cudnn.allow_tf32 = True

    config_name = "config_large_bpe.json"
    tokenizer_files = "/path/to/tokenizer/files"
    input_dir = "/data/dir"
    output_dir = "/out/dir"

    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=4,
        per_device_eval_batch_size=4,
        learning_rate=2.067e-5,
        lr_scheduler_type="linear",
        adam_beta1=0.95,
        adam_beta2=0.985,
        adam_epsilon=1e-8,
        weight_decay=0.001,
        gradient_accumulation_steps=32,
        num_train_epochs=6.7,
        save_total_limit=2,
        dataloader_num_workers=10,
        save_steps=100,
        warmup_steps=1000,
        do_eval=True,
        eval_steps=1000,
        evaluation_strategy="steps",
        logging_strategy="steps",
        logging_steps=100,
        bf16=True,
        tf32=True,
        fp16_opt_level="O2",
        half_precision_backend="amp",
        bf16_full_eval=True
    )

    print("setting up tokenizer...")
    tokenizer = GPT2TokenizerFast.from_pretrained(tokenizer_files)
    #tokenizer.add_special_tokens({'pad_token': '[PAD]'})  # Probably wrong
    tokenizer.add_special_tokens({'pad_token': tokenizer.eos_token})
    from tokenizers.processors import TemplateProcessing
    tokenizer._tokenizer.post_processor = TemplateProcessing(
        single="$0 "+tokenizer.eos_token,
        pair="$A "+tokenizer.eos_token+" $B:1 "+tokenizer.eos_token,
        special_tokens=[(tokenizer.eos_token, 0)],
    )

    print("loading model...")
    config = AutoConfig.from_pretrained(config_name)
    model = AutoModelForCausalLM.from_config(config)
    #model = AutoModelForCausalLM.from_pretrained("/checkpoint/dir")  # use this when restarting training completely and loading weights from a checkpoint
    model.gradient_checkpointing_enable()  # Optional, affects performance
    print("loading data...")
    dataset = datasets.load_from_disk(input_dir)

    print("starting training...")
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset["train"],
        data_collator=default_data_collator,
        eval_dataset=dataset["test"].select(range(10000)),  # To save time, do not evaluate on the whole test set during training
        tokenizer=tokenizer
    )

    #checkpoint = None
    checkpoint = get_last_checkpoint(output_dir)
    print("checkpoint:", checkpoint)
    trainer.train(resume_from_checkpoint=checkpoint)

if __name__ == "__main__":
    main()
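Once training has produced a checkpoint, it can be loaded back for text generation (as explored in evaluate_and_analyze/generation.ipynb). A minimal sketch, assuming a hypothetical checkpoint path under the output_dir above:

from transformers import AutoModelForCausalLM, AutoTokenizer

checkpoint = "/out/dir/checkpoint-100"            # hypothetical checkpoint produced by the Trainer
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForCausalLM.from_pretrained(checkpoint)

inputs = tokenizer("Suomen kieli on", return_tensors="pt")
outputs = model.generate(**inputs, max_new_tokens=30, do_sample=True, top_p=0.9)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))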