gcjavi committed on
Commit bbaf732
1 Parent(s): 81e1389

Upload 8 files

data2text_gl_v1/config.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "_name_or_path": "google/mt5-base",
+   "architectures": [
+     "T5ForConditionalGeneration"
+   ],
+   "d_ff": 2048,
+   "d_kv": 64,
+   "d_model": 768,
+   "decoder_start_token_id": 0,
+   "dense_act_fn": "gelu_new",
+   "dropout_rate": 0.1,
+   "eos_token_id": 1,
+   "feed_forward_proj": "gated-gelu",
+   "initializer_factor": 1.0,
+   "is_encoder_decoder": true,
+   "is_gated_act": true,
+   "layer_norm_epsilon": 1e-06,
+   "model_type": "t5",
+   "num_decoder_layers": 12,
+   "num_heads": 12,
+   "num_layers": 12,
+   "output_past": true,
+   "pad_token_id": 0,
+   "relative_attention_max_distance": 128,
+   "relative_attention_num_buckets": 32,
+   "tie_word_embeddings": false,
+   "tokenizer_class": "T5Tokenizer",
+   "torch_dtype": "float32",
+   "transformers_version": "4.24.0",
+   "use_cache": true,
+   "vocab_size": 250112
+ }
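This config describes google/mt5-base fine-tuned under the T5ForConditionalGeneration architecture (768-dim model, 12 encoder and 12 decoder layers, 250112-token vocabulary). A minimal loading sketch, assuming the uploaded files sit in a local data2text_gl_v1/ directory:

from transformers import T5ForConditionalGeneration

model = T5ForConditionalGeneration.from_pretrained("data2text_gl_v1")
print(model.config.d_model, model.config.num_layers)  # 768 12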
data2text_gl_v1/pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:1d4df627a25b9dddc62330eb29318e77e4f9d480ce49fd0933548f023ff3bf41
+ size 2329702581
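pytorch_model.bin is tracked with Git LFS, so the repository stores a small pointer file (spec version, sha256 object id, byte size, here about 2.3 GB) rather than the weights themselves; git lfs pull fetches the real binary. A minimal sketch of reading the pointer fields, assuming the file is still a pointer and has not been replaced by the downloaded binary:

def parse_lfs_pointer(path):
    # Each pointer line is "key value"; collect them into a dict.
    fields = {}
    with open(path, encoding="utf-8") as f:
        for line in f:
            key, _, value = line.strip().partition(" ")
            fields[key] = value
    return fields

ptr = parse_lfs_pointer("data2text_gl_v1/pytorch_model.bin")
print(ptr["oid"], ptr["size"])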
data2text_gl_v1/special_tokens_map.json ADDED
@@ -0,0 +1,5 @@
+ {
+   "eos_token": "</s>",
+   "pad_token": "<pad>",
+   "unk_token": "<unk>"
+ }
data2text_gl_v1/spiece.model ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:ef78f86560d809067d12bac6c09f19a462cb3af3f54d2b8acbba26e1433125d6
+ size 4309802
data2text_gl_v1/tokenizer_config.json ADDED
@@ -0,0 +1,12 @@
+ {
+   "additional_special_tokens": null,
+   "eos_token": "</s>",
+   "extra_ids": 0,
+   "name_or_path": "google/mt5-base",
+   "pad_token": "<pad>",
+   "sp_model_kwargs": {},
+   "special_tokens_map_file": "/home/patrick/.cache/torch/transformers/685ac0ca8568ec593a48b61b0a3c272beee9bc194a3c7241d15dcadb5f875e53.f76030f3ec1b96a8199b2593390c610e76ca8028ef3d24680000619ffb646276",
+   "tokenizer_class": "T5Tokenizer",
+   "tokenizer_file": null,
+   "unk_token": "<unk>"
+ }
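spiece.model (the SentencePiece vocabulary), special_tokens_map.json, and tokenizer_config.json are together what T5Tokenizer loads. A minimal sketch, again assuming a local data2text_gl_v1/ directory; the sample table string is a hypothetical stand-in for a row of the dataset:

from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("data2text_gl_v1")
# The special tokens come from special_tokens_map.json.
print(tokenizer.eos_token, tokenizer.pad_token, tokenizer.unk_token)  # </s> <pad> <unk>
batch = tokenizer(["<table> ... </table>"], return_tensors="pt", padding=True)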
generate_text.py ADDED
@@ -0,0 +1,59 @@
+ from transformers import T5ForConditionalGeneration, T5Tokenizer
+ import pandas as pd
+ import sys
+ import argparse
+
+
+ """# T5 model
+ Load the pretrained model.
+ """
+
+ """# Corpus
+ Read the test dataset.
+ """
+
+ test_split = pd.read_csv('./test-dataset.csv', encoding="latin-1")
+ test_split = test_split.reset_index()
+
+ def generate(text):
+     print("Tokenizing sequence...")
+     x = tokenizer(text, return_tensors='pt', padding=True).to(model.device)
+     print("Generating description...")
+     # Deterministic beam search, capped at 50 new tokens
+     out = model.generate(**x, do_sample=False, num_beams=10, max_new_tokens=50)
+     return tokenizer.decode(out[0], skip_special_tokens=True)
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("-i", "--input_table", type=int, default=280, required=False, help="Specify data ID")
+ parser.add_argument("-o", "--output", type=str, default="./", required=False, help="Specify output path")
+ args = parser.parse_args()
+
+ data_id = args.input_table
+ output_path = args.output
+
+ if data_id not in range(0, 569):
+     sys.exit("ERROR: ID must be in the range [0,568] (testing IDs)")
+
+ # Load the chosen pre-trained model together with its tokenizer
+ print("Loading model...")
+ model = T5ForConditionalGeneration.from_pretrained('data2text_gl_v1')
+ tokenizer = T5Tokenizer.from_pretrained('data2text_gl_v1')
+
+ print("Loading data... (dataset-id: " + str(test_split.id[int(data_id)]) + ")")
+
+ data = test_split.table[int(data_id)]
+ gold = test_split.caption[int(data_id)]
+ generation = generate(data)
+ img_id = str(test_split.id[int(data_id)])
+
+ pattern = "- Test ID: {} (DB id: {})\n- Data table: {}\n- Generated text: {}\n- Gold text: {}"
+
+ print(pattern.format(data_id, img_id, data[0:100] + "... </table>", generation, gold))
+
+ # The with-block closes the file automatically
+ with open(output_path + "generated_" + str(data_id) + ".txt", "w") as output_file:
+     output_file.write(pattern.format(data_id, img_id, data, generation, gold))
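A typical invocation of the script, sketched with subprocess for consistency with the rest of the code; the test row ID and output directory are arbitrary examples (the directory must already exist):

import subprocess, sys

subprocess.run([sys.executable, "generate_text.py", "-i", "42", "-o", "./out/"], check=True)
# Writes ./out/generated_42.txt with the table, generated text, and gold caption.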
test-dataset.csv ADDED
The diff for this file is too large to render.
 
train.py ADDED
@@ -0,0 +1,169 @@
+ # -*- coding: utf-8 -*-
+ #!pip install transformers
+ #!pip install pandas
+ #!pip install numpy
+ #!pip install SentencePiece
+
+ import argparse
+ import torch
+ from transformers import T5ForConditionalGeneration, T5Tokenizer
+ import pandas as pd
+ from tqdm.auto import trange
+ import gc
+ from datetime import datetime
+ import time
+
+ st = time.time()  # start time
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument("-n", "--model_name", type=str, default="d2t_model", required=False, help="Specify model name")
+ parser.add_argument("-e", "--epochs", type=int, default=100, required=False, help="Specify training epochs")
+ args = parser.parse_args()
+
+ model_name = args.model_name
+ epochs = args.epochs
+
+ print("Model name: " + model_name + " Epochs: " + str(epochs))
+
+
+ """# T5 model
+ Load the pretrained model.
+ """
+
+ model = T5ForConditionalGeneration.from_pretrained('google/mt5-base')
+ tokenizer = T5Tokenizer.from_pretrained('google/mt5-base')
+ model.cuda()
+ optimizer = torch.optim.Adam(params=[p for p in model.parameters() if p.requires_grad], lr=1e-5)
+
+
+ # Load dataset (dataset-gl.csv or dataset-es.csv)
+ all_data = pd.read_csv('./datasets/dataset-gl.csv', encoding="latin-1")
+
+ # Select 2733 records for training (the 70-30 split for dataset-es.csv).
+ # dataset-gl.csv has more records, so its test split holds 500 cases instead of 300.
+ train_split = all_data.iloc[:2733, :]
+ test_split = all_data.iloc[2733:, :]
+
+ # Clean dataset rows: drop rows with missing values and renumber
+ train_split = train_split.dropna()
+ train_split = train_split.reset_index()
+ print(torch.cuda.list_gpu_processes())
+
+ def split_batches(df, batch_size):
+     # Slicing past the end clamps, so the last batch may be smaller.
+     return [df[i: i + batch_size] for i in range(0, len(df), batch_size)]
+
+
+ def cleanup():
+     gc.collect()
+     torch.cuda.empty_cache()
+
+ cleanup()
+
+ optimizer.param_groups[0]['lr'] = 1e-5
+
+ """# Training"""
+
+ model.train()
+ batch_size = 8
+ max_len = 384
+ accumulation_steps = 1
+ save_steps = 1
+ epochs_tq = trange(epochs)  # epochs
+
+ window = 4000
+ ewm = 0
+ errors = 0
+
+ cleanup()
+
+ batches = split_batches(train_split, batch_size)
+
+ for i in epochs_tq:
+     print("Epoch:", i)
+     batch_count = 0
+     for batch in batches:
+         batch_count += 1
+         print("Batch:", batch_count)
+         xx = batch.table.values.tolist()
+         yy = batch.caption.values.tolist()  # targets are the reference captions
+         try:
+             x = tokenizer(xx, return_tensors='pt', padding=True, truncation=True, max_length=max_len).to(model.device)
+             y = tokenizer(yy, return_tensors='pt', padding=True, truncation=True, max_length=max_len).to(model.device)
+             # do not force the model to predict pad tokens
+             y.input_ids[y.input_ids == 0] = -100
+
+             loss = model(
+                 input_ids=x.input_ids,
+                 attention_mask=x.attention_mask,
+                 labels=y.input_ids,
+                 decoder_attention_mask=y.attention_mask,
+                 return_dict=True
+             ).loss
+             loss.backward()
+
+         except RuntimeError as e:
+             # Typically an out-of-memory error: log it, free memory, move on.
+             errors += 1
+             print("ERROR")
+             print(i, x.input_ids.shape[1], y.input_ids.shape[1], e)
+             loss = None
+             cleanup()
+             continue
+
+         # Exponentially weighted moving average of the loss for the progress bar
+         w = 1 / min(i + 1, window)
+         ewm = ewm * (1 - w) + loss.item() * w
+         epochs_tq.set_description(f'loss: {ewm}')
+
+         if i % accumulation_steps == 0:
+             optimizer.step()
+             optimizer.zero_grad()
+             cleanup()
+
+     if i % window == 0 and i > 0:
+         print(ewm, errors)
+         errors = 0
+         cleanup()
+         # optimizer.param_groups[0]['lr'] *= 0.999
+     if i % save_steps == 0 and i > 0:
+         model.save_pretrained(model_name + "_" + str(epochs))
+         tokenizer.save_pretrained(model_name + "_" + str(epochs))
+         print('saving...', i, optimizer.param_groups[0]['lr'])
+
+ model.save_pretrained(model_name + "_" + str(epochs))
+ tokenizer.save_pretrained(model_name + "_" + str(epochs))
+
+ total_time = time.time() - st
+ print("Training time:", time.strftime("%H:%M:%S", time.gmtime(total_time)))
+
+ """# Test"""
+ model.eval()
+
+ def generate(text):
+     x = tokenizer(text, return_tensors='pt', padding=True).to(model.device)
+     out = model.generate(**x, do_sample=False, num_beams=10, max_length=100)
+     return tokenizer.decode(out[0], skip_special_tokens=True)
+
+ with open(f"{model_name}_{epochs}_predictions_{datetime.now()}.txt", "w") as f:
+     f.write("Training time: " + str(time.strftime("%H:%M:%S", time.gmtime(total_time))) + "\n")
+     for index, row in test_split.iterrows():
+         text_id = str(row["id"])
+         text1 = str(row["table"])
+         text2 = str(row["caption"])
+
+         f.write(text_id + "\n" + text1 + "\n")
+         print(text_id + "\n" + text1)
+         prediction = generate(text1)  # generate once, reuse for file and console
+         f.write("Prediction:\n")
+         f.write(prediction + "\n")
+         print(prediction)
+         f.write("Truth:\n")
+         f.write(text2 + "\n\n")
+         print(text2)
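In the loop above the optimizer-step condition keys on the epoch index i, which works because accumulation_steps is 1; with a larger value you would normally key on a per-batch step counter so gradients accumulate across consecutive batches. A self-contained sketch of that variant, using a toy linear model as a stand-in for mT5:

import torch

model = torch.nn.Linear(4, 1)  # stand-in for the real model
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
accumulation_steps = 4

batches = [(torch.randn(8, 4), torch.randn(8, 1)) for _ in range(16)]
for step, (x, y) in enumerate(batches, start=1):
    # Scale the loss so the accumulated gradient matches one large batch.
    loss = torch.nn.functional.mse_loss(model(x), y) / accumulation_steps
    loss.backward()  # gradients add up across backward() calls
    if step % accumulation_steps == 0:
        optimizer.step()
        optimizer.zero_grad()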