|
|
|
|
|
|
|
|
|
|
|
|
|
import sys, argparse |
|
import torch |
|
from transformers import T5ForConditionalGeneration, T5Tokenizer |
|
import pandas as pd |
|
import os |
|
import numpy as np |
|
from tqdm.auto import tqdm, trange |
|
import gc |
|
from datetime import datetime |
|
import time |
|
|
|
# Start-of-run timestamp; the elapsed training time is derived from it
# further down in the script.
st = time.time()

# Command-line interface: both options are optional and fall back to the
# defaults below.
parser = argparse.ArgumentParser()
parser.add_argument(
    "-n",
    "--model_name",
    type=str,
    default="d2t_model",
    required=False,
    help="Specify model name",
)
parser.add_argument(
    "-e",
    "--epochs",
    type=int,
    default=100,
    required=False,
    help="Specify training epochs",
)
args = parser.parse_args()

model_name, epochs = args.model_name, args.epochs

print(f"Model name: {model_name} Epochs: {epochs}")
|
|
|
|
# --- T5 model ---
# Load the pretrained checkpoint and its tokenizer.
checkpoint = 'google/mt5-base'
model = T5ForConditionalGeneration.from_pretrained(checkpoint)
tokenizer = T5Tokenizer.from_pretrained(checkpoint)

# Move the weights to the GPU before the optimizer is built.
model.cuda()

# Only parameters that still require gradients are handed to Adam.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.Adam(params=trainable_params, lr=1e-5)
|
|
|
|
|
|
|
# Read the whole dataset, then cut it by row position: the first
# `n_train_rows` rows are used for training, everything after for testing.
all_data = pd.read_csv('./datasets/dataset-gl.csv', encoding="latin-1")

n_train_rows = 2733
train_split = all_data.iloc[:n_train_rows, :]
test_split = all_data.iloc[n_train_rows:, :]

# Drop training rows with any missing value, then renumber the rows
# (reset_index keeps the previous index as an extra "index" column).
# Only the training split is cleaned this way; test_split is left as read.
train_split = train_split.dropna().reset_index()

print(torch.cuda.list_gpu_processes())
|
|
|
def split_batches(df, batch_size):
    """Split *df* into consecutive slices of at most *batch_size* items.

    Args:
        df: Any sized object that supports positional slicing with
            ``df[start:stop]`` (the script passes a pandas DataFrame;
            lists and strings work the same way).
        batch_size: Maximum number of items per batch; must be >= 1.

    Returns:
        A list of slices in their original order. Every slice has
        ``batch_size`` items except possibly the last one, which holds the
        remainder. An empty *df* yields an empty list.

    Raises:
        ValueError: If *batch_size* is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(
            f"batch_size must be a positive integer, got {batch_size!r}"
        )
    # Slicing clamps at the end of the sequence, so the final (shorter)
    # batch needs no special case.
    return [
        df[start:start + batch_size]
        for start in range(0, len(df), batch_size)
    ]
|
|
|
|
|
def cleanup():
    """Release memory that is no longer referenced.

    Runs a full Python garbage collection first, so objects kept alive only
    by reference cycles are destroyed, and then asks PyTorch to empty its
    CUDA cache.

    Returns:
        None.
    """
    gc.collect()
    torch.cuda.empty_cache()
|
|
|
cleanup()

# Re-apply the learning rate on the optimizer's first parameter group.
optimizer.param_groups[0]['lr'] = 1e-5

# --- Training ---
model.train()

# Hyper-parameters.
batch_size, max_len = 8, 384   # rows per batch / tokenizer truncation length
accumulation_steps = 1         # interval between optimizer updates
save_steps = 1                 # interval between checkpoints

# Progress bar that the training loop iterates over, one tick per epoch.
epochs_tq = trange(epochs)

# Running-loss bookkeeping.
window = 4000   # smoothing window for the running loss; also the report interval
ewm = 0         # exponentially weighted mean of the loss
errors = 0      # count of batches that raised RuntimeError

cleanup()

# The batches are built once and reused, in the same order, by every epoch.
batches = split_batches(train_split, batch_size)
|
|
|
# Fine-tuning loop: every epoch walks over all pre-built `batches` and trains
# the model to produce each row's `caption` from its `table`.
# NOTE(review): the file's indentation was lost, so the nesting below is a
# reconstruction -- per-batch work sits in the inner loop, checkpointing is
# done once per epoch. Confirm against the intended schedule.
step = 0  # successfully processed batches across all epochs
for i in epochs_tq:
    print("Epoch:", i)
    batch_count = 0
    for batch in batches:
        batch_count += 1
        print("Batch:", batch_count)
        xx = batch.table.values.tolist()
        # Targets are the reference captions. Using `table` here as well
        # would only teach the model to copy its input.
        yy = batch.caption.values.tolist()

        # Pre-bind so the error handler never touches an unbound name or a
        # tensor left over from the previous batch.
        x = y = None
        try:
            x = tokenizer(xx, return_tensors='pt', padding=True, truncation=True, max_length=max_len).to(model.device)
            y = tokenizer(yy, return_tensors='pt', padding=True, truncation=True, max_length=max_len).to(model.device)

            # Padding positions in the labels are set to -100 so the loss
            # ignores them.
            y.input_ids[y.input_ids == tokenizer.pad_token_id] = -100

            loss = model(
                input_ids=x.input_ids,
                attention_mask=x.attention_mask,
                labels=y.input_ids,
                decoder_attention_mask=y.attention_mask,
                return_dict=True,
            ).loss
            loss.backward()

        except RuntimeError as e:
            # Typically CUDA out-of-memory on a long batch: log it, skip the
            # batch and keep training.
            errors += 1
            print("ERROR")
            x_len = None if x is None else x.input_ids.shape[1]
            y_len = None if y is None else y.input_ids.shape[1]
            print(i, x_len, y_len, e)
            loss = None
            # Discard gradients a partially completed backward pass may
            # have left behind, so they cannot leak into the next update.
            optimizer.zero_grad()
            cleanup()
            continue

        step += 1

        # Running loss: a plain mean for the first `window` steps, an
        # exponentially weighted mean afterwards.
        w = 1 / min(step, window)
        ewm = ewm * (1 - w) + loss.item() * w
        epochs_tq.set_description(f'loss: {ewm}')

        # Gradient accumulation is counted in steps, not epochs.
        if step % accumulation_steps == 0:
            optimizer.step()
            optimizer.zero_grad()
            cleanup()

        # Periodic report of the running loss and the error count.
        if step % window == 0:
            print(ewm, errors)
            errors = 0
            cleanup()

    # Checkpoint at the end of the epoch (epoch 0 is skipped).
    if i % save_steps == 0 and i > 0:
        model.save_pretrained(model_name + "_" + str(epochs))
        tokenizer.save_pretrained(model_name + "_" + str(epochs))
        print('saving...', i, optimizer.param_groups[0]['lr'])
|
|
|
# Persist the final weights and tokenizer files side by side.
output_dir = model_name + "_" + str(epochs)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Report the elapsed wall-clock time as HH:MM:SS. The fields are computed by
# hand because time.strftime("%H", time.gmtime(...)) wraps around at 24 hours
# and would under-report long runs.
total_time = time.time() - st
hours, remainder = divmod(int(total_time), 3600)
minutes, seconds = divmod(remainder, 60)
print("Training time:", f"{hours:02d}:{minutes:02d}:{seconds:02d}")

# --- Test ---
# Switch to evaluation mode before generating.
model.eval()
|
|
|
def generate(text, num_beams=10, max_length=100):
    """Generate a caption for *text* with deterministic beam search.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        text: Input string to encode. If a list of strings is passed, all
            of them are encoded but only the first output is decoded.
        num_beams: Beam width passed to ``model.generate``.
        max_length: Maximum length passed to ``model.generate``.

    Returns:
        The decoded first output sequence, with special tokens removed.
    """
    x = tokenizer(text, return_tensors='pt', padding=True).to(model.device)
    out = model.generate(
        **x,
        do_sample=False,
        num_beams=num_beams,
        max_length=max_length,
    )
    return tokenizer.decode(out[0], skip_special_tokens=True)
|
|
|
# Write one record per test row (id, input table, prediction, reference
# caption) to a timestamped report file, echoing each record to stdout.

# The timestamp avoids spaces and colons so the file name is valid on every
# platform.
run_stamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# HH:MM:SS computed by hand: strftime("%H") on gmtime() wraps at 24 hours.
elapsed_h, elapsed_rem = divmod(int(total_time), 3600)
elapsed_m, elapsed_s = divmod(elapsed_rem, 60)
elapsed = f"{elapsed_h:02d}:{elapsed_m:02d}:{elapsed_s:02d}"

# Explicit UTF-8 so non-ASCII text is written the same way regardless of the
# platform's default encoding.
with open(f"{model_name}_{epochs}_predictions_{run_stamp}.txt", "w", encoding="utf-8") as f:
    # The header ends with a blank line so it stays separate from the first
    # record.
    f.write("Training time:" + elapsed + "\n\n")
    for _, row in test_split.iterrows():
        text_id = str(row["id"])
        text1 = str(row["table"])
        text2 = str(row["caption"])

        # Beam search is expensive: run it once and reuse the result for
        # both the file and the console.
        prediction = generate(text1)

        f.write(text_id + "\n" + text1 + "\n")
        print(text_id + "\n" + text1)
        f.write("Prediction:\n")
        f.write(prediction + "\n")
        print(prediction)
        f.write("Truth:\n")
        f.write(text2 + "\n\n")
        print(text2)
|
|