Spaces:
Runtime error
Runtime error
Commit
·
a99b495
1
Parent(s):
021fe7f
lints the code
Browse files- .pylintrc +2 -1
- elise/src/configs/__init__.py +3 -0
- elise/src/configs/logging_config.yaml +3 -1
- elise/src/configs/train_t5.py +4 -0
- elise/src/data/mit_seq2seq_dataset.py +3 -0
- elise/src/excutors/trainer_seq2seq.py +0 -208
- elise/src/train_t5_seq2seq.py +19 -10
- elise/src/utils/logger.py +3 -1
.pylintrc
CHANGED
@@ -428,7 +428,8 @@ disable=raw-checker-failed,
|
|
428 |
suppressed-message,
|
429 |
useless-suppression,
|
430 |
deprecated-pragma,
|
431 |
-
use-symbolic-message-instead
|
|
|
432 |
|
433 |
# Enable the message, report, category or checker with the given id(s). You can
|
434 |
# either give multiple identifier separated by comma (,) or put this option
|
|
|
428 |
suppressed-message,
|
429 |
useless-suppression,
|
430 |
deprecated-pragma,
|
431 |
+
use-symbolic-message-instead,
|
432 |
+
R0902
|
433 |
|
434 |
# Enable the message, report, category or checker with the given id(s). You can
|
435 |
# either give multiple identifier separated by comma (,) or put this option
|
elise/src/configs/__init__.py
CHANGED
@@ -1 +1,4 @@
|
|
|
|
|
|
|
|
1 |
from .train_t5 import T5TrainingConfig
|
|
|
1 |
+
"""
|
2 |
+
All configs for ML project
|
3 |
+
"""
|
4 |
from .train_t5 import T5TrainingConfig
|
elise/src/configs/logging_config.yaml
CHANGED
@@ -1,12 +1,14 @@
|
|
1 |
version: 1
|
|
|
2 |
formatters:
|
3 |
simple:
|
4 |
format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
5 |
handlers:
|
6 |
console:
|
|
|
7 |
class: logging.StreamHandler
|
8 |
formatter: simple
|
9 |
stream: ext://sys.stdout
|
10 |
-
|
11 |
Level: DEBUG
|
12 |
handlers: [console]
|
|
|
1 |
version: 1
|
2 |
+
disable_existing_loggers: False
|
3 |
formatters:
|
4 |
simple:
|
5 |
format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
|
6 |
handlers:
|
7 |
console:
|
8 |
+
level: DEBUG
|
9 |
class: logging.StreamHandler
|
10 |
formatter: simple
|
11 |
stream: ext://sys.stdout
|
12 |
+
root:
|
13 |
Level: DEBUG
|
14 |
handlers: [console]
|
elise/src/configs/train_t5.py
CHANGED
@@ -1,3 +1,7 @@
|
|
|
|
|
|
|
|
|
|
1 |
from dataclasses import dataclass
|
2 |
|
3 |
|
|
|
1 |
+
"""
|
2 |
+
Training config for T5 Seq2Seq training
|
3 |
+
"""
|
4 |
+
|
5 |
from dataclasses import dataclass
|
6 |
|
7 |
|
elise/src/data/mit_seq2seq_dataset.py
CHANGED
@@ -87,6 +87,9 @@ class MITRestaurants:
|
|
87 |
|
88 |
|
89 |
def get_default_transforms():
|
|
|
|
|
|
|
90 |
label_names = {v: k for k, v in MITRestaurants.ner_tags.items()}
|
91 |
|
92 |
def decode_tags(tags, words):
|
|
|
87 |
|
88 |
|
89 |
def get_default_transforms():
|
90 |
+
"""
|
91 |
+
Default transformation to convert the NER dataset to seq2seq format
|
92 |
+
"""
|
93 |
label_names = {v: k for k, v in MITRestaurants.ner_tags.items()}
|
94 |
|
95 |
def decode_tags(tags, words):
|
elise/src/excutors/trainer_seq2seq.py
DELETED
@@ -1,208 +0,0 @@
|
|
1 |
-
from transformers import get_scheduler
|
2 |
-
import torch
|
3 |
-
from torch.utils.data import DataLoader
|
4 |
-
from datasets import load_dataset
|
5 |
-
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
6 |
-
from transformers import DataCollatorForTokenClassification
|
7 |
-
from accelerate import Accelerator
|
8 |
-
import evaluate
|
9 |
-
import datasets
|
10 |
-
|
11 |
-
from tqdm.auto import tqdm
|
12 |
-
|
13 |
-
|
14 |
-
ner_tags = {
|
15 |
-
"O": 0,
|
16 |
-
"B-Rating": 1,
|
17 |
-
"I-Rating": 2,
|
18 |
-
"B-Amenity": 3,
|
19 |
-
"I-Amenity": 4,
|
20 |
-
"B-Location": 5,
|
21 |
-
"I-Location": 6,
|
22 |
-
"B-Restaurant_Name": 7,
|
23 |
-
"I-Restaurant_Name": 8,
|
24 |
-
"B-Price": 9,
|
25 |
-
"B-Hours": 10,
|
26 |
-
"I-Hours": 11,
|
27 |
-
"B-Dish": 12,
|
28 |
-
"I-Dish": 13,
|
29 |
-
"B-Cuisine": 14,
|
30 |
-
"I-Price": 15,
|
31 |
-
"I-Cuisine": 16,
|
32 |
-
}
|
33 |
-
|
34 |
-
|
35 |
-
label_names = {v: k for k, v in ner_tags.items()}
|
36 |
-
|
37 |
-
# dataset aggregation
|
38 |
-
dataset = load_dataset("tner/mit_restaurant")
|
39 |
-
dataset["train"] = datasets.concatenate_datasets([dataset["train"], dataset["validation"]])
|
40 |
-
dataset["train"] = datasets.concatenate_datasets([dataset["train"], dataset["test"]])
|
41 |
-
|
42 |
-
print(dataset)
|
43 |
-
|
44 |
-
|
45 |
-
tokenizer = AutoTokenizer.from_pretrained(
|
46 |
-
'sentence-transformers/all-MiniLM-L6-v2')
|
47 |
-
|
48 |
-
|
49 |
-
def align_labels_with_tokens(labels, word_ids):
|
50 |
-
new_labels = []
|
51 |
-
current_word = None
|
52 |
-
for word_id in word_ids:
|
53 |
-
if word_id != current_word:
|
54 |
-
# Start of a new word!
|
55 |
-
current_word = word_id
|
56 |
-
label = -100 if word_id is None else labels[word_id]
|
57 |
-
new_labels.append(label)
|
58 |
-
elif word_id is None:
|
59 |
-
# Special token
|
60 |
-
new_labels.append(-100)
|
61 |
-
else:
|
62 |
-
# Same word as previous token
|
63 |
-
label = labels[word_id]
|
64 |
-
# If the label is B-XXX we change it to I-XXX
|
65 |
-
label_name = label_names[label]
|
66 |
-
if label_name.startswith("B"):
|
67 |
-
label = ner_tags["I" + label_name[1:]]
|
68 |
-
new_labels.append(label)
|
69 |
-
|
70 |
-
return new_labels
|
71 |
-
|
72 |
-
|
73 |
-
def tokenize_and_align_labels(examples):
|
74 |
-
tokenized_inputs = tokenizer(
|
75 |
-
examples["tokens"], truncation=True, is_split_into_words=True
|
76 |
-
)
|
77 |
-
all_labels = examples["tags"]
|
78 |
-
new_labels = []
|
79 |
-
for i, labels in enumerate(all_labels):
|
80 |
-
word_ids = tokenized_inputs.word_ids(i)
|
81 |
-
new_labels.append(align_labels_with_tokens(labels, word_ids))
|
82 |
-
|
83 |
-
tokenized_inputs["labels"] = new_labels
|
84 |
-
return tokenized_inputs
|
85 |
-
|
86 |
-
|
87 |
-
tokenized_datasets = dataset.map(
|
88 |
-
tokenize_and_align_labels,
|
89 |
-
batched=True,
|
90 |
-
remove_columns=dataset["train"].column_names,
|
91 |
-
)
|
92 |
-
|
93 |
-
|
94 |
-
def train():
|
95 |
-
metric = evaluate.load("seqeval")
|
96 |
-
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
|
97 |
-
|
98 |
-
train_dataloader = DataLoader(
|
99 |
-
tokenized_datasets["train"],
|
100 |
-
shuffle=True,
|
101 |
-
collate_fn=data_collator,
|
102 |
-
batch_size=128,
|
103 |
-
)
|
104 |
-
eval_dataloader = DataLoader(
|
105 |
-
tokenized_datasets["test"],
|
106 |
-
collate_fn=data_collator,
|
107 |
-
batch_size=8
|
108 |
-
)
|
109 |
-
|
110 |
-
model = AutoModelForTokenClassification.from_pretrained(
|
111 |
-
'sentence-transformers/all-MiniLM-L6-v2',
|
112 |
-
id2label=label_names,
|
113 |
-
label2id=ner_tags,
|
114 |
-
)
|
115 |
-
|
116 |
-
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
|
117 |
-
|
118 |
-
accelerator = Accelerator()
|
119 |
-
model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
|
120 |
-
model, optimizer, train_dataloader, eval_dataloader
|
121 |
-
)
|
122 |
-
|
123 |
-
num_train_epochs = 50
|
124 |
-
num_update_steps_per_epoch = len(train_dataloader)
|
125 |
-
num_training_steps = num_train_epochs * num_update_steps_per_epoch
|
126 |
-
|
127 |
-
lr_scheduler = get_scheduler(
|
128 |
-
"linear",
|
129 |
-
optimizer=optimizer,
|
130 |
-
num_warmup_steps=0,
|
131 |
-
num_training_steps=num_training_steps,
|
132 |
-
)
|
133 |
-
|
134 |
-
def postprocess(predictions, labels):
|
135 |
-
predictions = predictions.detach().cpu().clone().numpy()
|
136 |
-
labels = labels.detach().cpu().clone().numpy()
|
137 |
-
|
138 |
-
# Remove ignored index (special tokens) and convert to labels
|
139 |
-
true_labels = [[label_names[l] for l in label if l != -100]
|
140 |
-
for label in labels]
|
141 |
-
true_predictions = [
|
142 |
-
[label_names[p] for (p, l) in zip(prediction, label) if l != -100]
|
143 |
-
for prediction, label in zip(predictions, labels)
|
144 |
-
]
|
145 |
-
return true_labels, true_predictions
|
146 |
-
|
147 |
-
progress_bar = tqdm(range(num_training_steps))
|
148 |
-
|
149 |
-
for epoch in range(num_train_epochs):
|
150 |
-
# Training
|
151 |
-
model.train()
|
152 |
-
for batch in train_dataloader:
|
153 |
-
outputs = model(**batch)
|
154 |
-
loss = outputs.loss
|
155 |
-
accelerator.backward(loss)
|
156 |
-
|
157 |
-
optimizer.step()
|
158 |
-
lr_scheduler.step()
|
159 |
-
optimizer.zero_grad()
|
160 |
-
progress_bar.update(1)
|
161 |
-
|
162 |
-
# Evaluation
|
163 |
-
model.eval()
|
164 |
-
for batch in eval_dataloader:
|
165 |
-
with torch.no_grad():
|
166 |
-
outputs = model(**batch)
|
167 |
-
|
168 |
-
predictions = outputs.logits.argmax(dim=-1)
|
169 |
-
labels = batch["labels"]
|
170 |
-
|
171 |
-
# Necessary to pad predictions and labels for being gathered
|
172 |
-
predictions = accelerator.pad_across_processes(
|
173 |
-
predictions, dim=1, pad_index=-100)
|
174 |
-
labels = accelerator.pad_across_processes(
|
175 |
-
labels, dim=1, pad_index=-100)
|
176 |
-
|
177 |
-
predictions_gathered = accelerator.gather(predictions)
|
178 |
-
labels_gathered = accelerator.gather(labels)
|
179 |
-
|
180 |
-
true_predictions, true_labels = postprocess(
|
181 |
-
predictions_gathered, labels_gathered)
|
182 |
-
metric.add_batch(predictions=true_predictions,
|
183 |
-
references=true_labels)
|
184 |
-
|
185 |
-
results = metric.compute()
|
186 |
-
print(
|
187 |
-
f"epoch {epoch}:",
|
188 |
-
{
|
189 |
-
key: results[f"overall_{key}"]
|
190 |
-
for key in ["precision", "recall", "f1", "accuracy"]
|
191 |
-
},
|
192 |
-
)
|
193 |
-
|
194 |
-
output_dir = "restaurant_ner"
|
195 |
-
# Save and upload
|
196 |
-
accelerator.wait_for_everyone()
|
197 |
-
unwrapped_model = accelerator.unwrap_model(model)
|
198 |
-
unwrapped_model.save_pretrained(
|
199 |
-
output_dir, save_function=accelerator.save)
|
200 |
-
if accelerator.is_main_process:
|
201 |
-
tokenizer.save_pretrained(output_dir)
|
202 |
-
|
203 |
-
accelerator.wait_for_everyone()
|
204 |
-
unwrapped_model = accelerator.unwrap_model(model)
|
205 |
-
unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
|
206 |
-
|
207 |
-
|
208 |
-
train()
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
elise/src/train_t5_seq2seq.py
CHANGED
@@ -1,22 +1,27 @@
|
|
|
|
|
|
|
|
|
|
|
|
1 |
import torch
|
2 |
import evaluate
|
3 |
import datasets
|
4 |
from torch.utils.data import DataLoader
|
5 |
-
from
|
6 |
-
|
7 |
-
|
8 |
-
|
9 |
-
|
|
|
10 |
from accelerate import Accelerator
|
11 |
-
from transformers import get_scheduler
|
12 |
import numpy as np
|
13 |
import mlflow
|
14 |
|
15 |
from tqdm.auto import tqdm
|
16 |
|
17 |
-
from data import MITRestaurants, get_default_transforms
|
18 |
from utils.logger import get_logger
|
19 |
from configs import T5TrainingConfig
|
|
|
20 |
|
21 |
log = get_logger("Flan_T5")
|
22 |
log.debug("heloooooooooooo?")
|
@@ -36,6 +41,7 @@ model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
|
|
36 |
|
37 |
|
38 |
def tokenize(example):
|
|
|
39 |
tokenized = tokenizer(
|
40 |
example["tokens"],
|
41 |
text_target=example["labels"],
|
@@ -57,6 +63,7 @@ metric = evaluate.load("sacrebleu")
|
|
57 |
|
58 |
|
59 |
def postprocess(predictions, labels):
|
|
|
60 |
predictions = predictions.cpu().numpy()
|
61 |
labels = labels.cpu().numpy()
|
62 |
|
@@ -115,7 +122,8 @@ model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
|
|
115 |
progress_bar = tqdm(range(num_training_steps))
|
116 |
|
117 |
|
118 |
-
def train(
|
|
|
119 |
# log.info("Starting Training")
|
120 |
print("Starting Traning")
|
121 |
for epoch in range(config.epochs):
|
@@ -174,10 +182,11 @@ def train(model, dataset, metric):
|
|
174 |
transformers_model={"model": unwrapped_model, "tokenizer": tokenizer},
|
175 |
task="text2text-generation",
|
176 |
artifact_path="seq2seq_model",
|
177 |
-
registered_model_name="FlanT5_MIT"
|
178 |
)
|
179 |
|
|
|
180 |
mlflow.set_tracking_uri("http://127.0.0.1:5000")
|
181 |
with mlflow.start_run() as mlflow_run:
|
182 |
mlflow.log_params(asdict(config))
|
183 |
-
train(
|
|
|
1 |
+
"""
|
2 |
+
Train the Flan-T5 model on the tner/mit_restaurant dataset for a seq2seq task
|
3 |
+
"""
|
4 |
+
from dataclasses import asdict
|
5 |
+
|
6 |
import torch
|
7 |
import evaluate
|
8 |
import datasets
|
9 |
from torch.utils.data import DataLoader
|
10 |
+
from transformers import (
|
11 |
+
AutoTokenizer,
|
12 |
+
AutoModelForSeq2SeqLM,
|
13 |
+
DataCollatorForSeq2Seq,
|
14 |
+
get_scheduler,
|
15 |
+
)
|
16 |
from accelerate import Accelerator
|
|
|
17 |
import numpy as np
|
18 |
import mlflow
|
19 |
|
20 |
from tqdm.auto import tqdm
|
21 |
|
|
|
22 |
from utils.logger import get_logger
|
23 |
from configs import T5TrainingConfig
|
24 |
+
from data import MITRestaurants, get_default_transforms
|
25 |
|
26 |
log = get_logger("Flan_T5")
|
27 |
log.debug("heloooooooooooo?")
|
|
|
41 |
|
42 |
|
43 |
def tokenize(example):
|
44 |
+
"""Tokenizes dataset for seq2seq task"""
|
45 |
tokenized = tokenizer(
|
46 |
example["tokens"],
|
47 |
text_target=example["labels"],
|
|
|
63 |
|
64 |
|
65 |
def postprocess(predictions, labels):
|
66 |
+
"""Post processing to convert model output for evaluation"""
|
67 |
predictions = predictions.cpu().numpy()
|
68 |
labels = labels.cpu().numpy()
|
69 |
|
|
|
122 |
progress_bar = tqdm(range(num_training_steps))
|
123 |
|
124 |
|
125 |
+
def train():
|
126 |
+
"""Training function for fine-tuning Flan-T5"""
|
127 |
# log.info("Starting Training")
|
128 |
print("Starting Traning")
|
129 |
for epoch in range(config.epochs):
|
|
|
182 |
transformers_model={"model": unwrapped_model, "tokenizer": tokenizer},
|
183 |
task="text2text-generation",
|
184 |
artifact_path="seq2seq_model",
|
185 |
+
registered_model_name="FlanT5_MIT",
|
186 |
)
|
187 |
|
188 |
+
|
189 |
mlflow.set_tracking_uri("http://127.0.0.1:5000")
|
190 |
with mlflow.start_run() as mlflow_run:
|
191 |
mlflow.log_params(asdict(config))
|
192 |
+
train()
|
elise/src/utils/logger.py
CHANGED
@@ -4,7 +4,9 @@ Logging helper module
|
|
4 |
import logging.config
|
5 |
import yaml
|
6 |
|
7 |
-
with open(
|
|
|
|
|
8 |
config = yaml.safe_load(f.read())
|
9 |
logging.config.dictConfig(config)
|
10 |
logging.captureWarnings(True)
|
|
|
4 |
import logging.config
|
5 |
import yaml
|
6 |
|
7 |
+
with open(
|
8 |
+
"/home/kave/work/Elise/elise/src/configs/logging_config.yaml", "r", encoding="utf-8"
|
9 |
+
) as f:
|
10 |
config = yaml.safe_load(f.read())
|
11 |
logging.config.dictConfig(config)
|
12 |
logging.captureWarnings(True)
|