BerserkerMother committed
Commit a99b495 · Parent: 021fe7f

lints the code
.pylintrc CHANGED
@@ -428,7 +428,8 @@ disable=raw-checker-failed,
         suppressed-message,
         useless-suppression,
         deprecated-pragma,
-        use-symbolic-message-instead
+        use-symbolic-message-instead,
+        R0902
 
 # Enable the message, report, category or checker with the given id(s). You can
 # either give multiple identifier separated by comma (,) or put this option
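R0902 is the numeric id of pylint's too-many-instance-attributes check, which fires when a class defines more instance attributes than max-attributes allows (7 by default), a limit that config and trainer classes trip easily. Because use-symbolic-message-instead stays in the disable list, the bare numeric id passes without complaint; the equivalent symbolic spelling, which reads better in review, would be:

        deprecated-pragma,
        use-symbolic-message-instead,
        too-many-instance-attributes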
elise/src/configs/__init__.py CHANGED
@@ -1 +1,4 @@
+"""
+All configs for ML project
+"""
 from .train_t5 import T5TrainingConfig
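The module docstring clears pylint's missing-module-docstring (C0114) warning, and the re-export keeps import paths short for callers, as in the from configs import T5TrainingConfig line in train_t5_seq2seq.py further down.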
elise/src/configs/logging_config.yaml CHANGED
@@ -1,12 +1,14 @@
 version: 1
+disable_existing_loggers: False
 formatters:
   simple:
     format: '%(asctime)s - %(name)s - %(levelname)s - %(message)s'
 handlers:
   console:
+    level: DEBUG
     class: logging.StreamHandler
     formatter: simple
     stream: ext://sys.stdout
-Root:
+root:
   Level: DEBUG
   handlers: [console]
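Two of the additions matter at runtime: disable_existing_loggers: False stops dictConfig() from silencing loggers created before the config is loaded, and the handler-level level: DEBUG lets the console handler pass DEBUG records. One caveat: dictConfig() expects a lowercase level key, so the capitalized Level: kept under root: is silently ignored and the root logger stays at its WARNING default. A minimal smoke test of the config as committed (path assumed relative to the repo root):

import logging.config
import yaml

# load the YAML above and hand it to dictConfig
with open("elise/src/configs/logging_config.yaml", "r", encoding="utf-8") as f:
    logging.config.dictConfig(yaml.safe_load(f))

logging.getLogger("smoke").warning("printed: passes root WARNING and the DEBUG handler")
logging.getLogger("smoke").debug("dropped: root never got its level lowered")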
elise/src/configs/train_t5.py CHANGED
@@ -1,3 +1,7 @@
+"""
+Training config for T5 Seq2Seq training
+"""
+
 from dataclasses import dataclass
 
 
elise/src/data/mit_seq2seq_dataset.py CHANGED
@@ -87,6 +87,9 @@ class MITRestaurants:
 
 
 def get_default_transforms():
+    """
+    Default transformation to convert ner dataset to seq2seq
+    """
     label_names = {v: k for k, v in MITRestaurants.ner_tags.items()}
 
     def decode_tags(tags, words):
elise/src/excutors/trainer_seq2seq.py DELETED
@@ -1,208 +0,0 @@
-from transformers import get_scheduler
-import torch
-from torch.utils.data import DataLoader
-from datasets import load_dataset
-from transformers import AutoTokenizer, AutoModelForTokenClassification
-from transformers import DataCollatorForTokenClassification
-from accelerate import Accelerator
-import evaluate
-import datasets
-
-from tqdm.auto import tqdm
-
-
-ner_tags = {
-    "O": 0,
-    "B-Rating": 1,
-    "I-Rating": 2,
-    "B-Amenity": 3,
-    "I-Amenity": 4,
-    "B-Location": 5,
-    "I-Location": 6,
-    "B-Restaurant_Name": 7,
-    "I-Restaurant_Name": 8,
-    "B-Price": 9,
-    "B-Hours": 10,
-    "I-Hours": 11,
-    "B-Dish": 12,
-    "I-Dish": 13,
-    "B-Cuisine": 14,
-    "I-Price": 15,
-    "I-Cuisine": 16,
-}
-
-
-label_names = {v: k for k, v in ner_tags.items()}
-
-# dataset aggregation
-dataset = load_dataset("tner/mit_restaurant")
-dataset["train"] = datasets.concatenate_datasets([dataset["train"], dataset["validation"]])
-dataset["train"] = datasets.concatenate_datasets([dataset["train"], dataset["test"]])
-
-print(dataset)
-
-
-tokenizer = AutoTokenizer.from_pretrained(
-    'sentence-transformers/all-MiniLM-L6-v2')
-
-
-def align_labels_with_tokens(labels, word_ids):
-    new_labels = []
-    current_word = None
-    for word_id in word_ids:
-        if word_id != current_word:
-            # Start of a new word!
-            current_word = word_id
-            label = -100 if word_id is None else labels[word_id]
-            new_labels.append(label)
-        elif word_id is None:
-            # Special token
-            new_labels.append(-100)
-        else:
-            # Same word as previous token
-            label = labels[word_id]
-            # If the label is B-XXX we change it to I-XXX
-            label_name = label_names[label]
-            if label_name.startswith("B"):
-                label = ner_tags["I" + label_name[1:]]
-            new_labels.append(label)
-
-    return new_labels
-
-
-def tokenize_and_align_labels(examples):
-    tokenized_inputs = tokenizer(
-        examples["tokens"], truncation=True, is_split_into_words=True
-    )
-    all_labels = examples["tags"]
-    new_labels = []
-    for i, labels in enumerate(all_labels):
-        word_ids = tokenized_inputs.word_ids(i)
-        new_labels.append(align_labels_with_tokens(labels, word_ids))
-
-    tokenized_inputs["labels"] = new_labels
-    return tokenized_inputs
-
-
-tokenized_datasets = dataset.map(
-    tokenize_and_align_labels,
-    batched=True,
-    remove_columns=dataset["train"].column_names,
-)
-
-
-def train():
-    metric = evaluate.load("seqeval")
-    data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
-
-    train_dataloader = DataLoader(
-        tokenized_datasets["train"],
-        shuffle=True,
-        collate_fn=data_collator,
-        batch_size=128,
-    )
-    eval_dataloader = DataLoader(
-        tokenized_datasets["test"],
-        collate_fn=data_collator,
-        batch_size=8
-    )
-
-    model = AutoModelForTokenClassification.from_pretrained(
-        'sentence-transformers/all-MiniLM-L6-v2',
-        id2label=label_names,
-        label2id=ner_tags,
-    )
-
-    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
-
-    accelerator = Accelerator()
-    model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
-        model, optimizer, train_dataloader, eval_dataloader
-    )
-
-    num_train_epochs = 50
-    num_update_steps_per_epoch = len(train_dataloader)
-    num_training_steps = num_train_epochs * num_update_steps_per_epoch
-
-    lr_scheduler = get_scheduler(
-        "linear",
-        optimizer=optimizer,
-        num_warmup_steps=0,
-        num_training_steps=num_training_steps,
-    )
-
-    def postprocess(predictions, labels):
-        predictions = predictions.detach().cpu().clone().numpy()
-        labels = labels.detach().cpu().clone().numpy()
-
-        # Remove ignored index (special tokens) and convert to labels
-        true_labels = [[label_names[l] for l in label if l != -100]
-                       for label in labels]
-        true_predictions = [
-            [label_names[p] for (p, l) in zip(prediction, label) if l != -100]
-            for prediction, label in zip(predictions, labels)
-        ]
-        return true_labels, true_predictions
-
-    progress_bar = tqdm(range(num_training_steps))
-
-    for epoch in range(num_train_epochs):
-        # Training
-        model.train()
-        for batch in train_dataloader:
-            outputs = model(**batch)
-            loss = outputs.loss
-            accelerator.backward(loss)
-
-            optimizer.step()
-            lr_scheduler.step()
-            optimizer.zero_grad()
-            progress_bar.update(1)
-
-        # Evaluation
-        model.eval()
-        for batch in eval_dataloader:
-            with torch.no_grad():
-                outputs = model(**batch)
-
-            predictions = outputs.logits.argmax(dim=-1)
-            labels = batch["labels"]
-
-            # Necessary to pad predictions and labels for being gathered
-            predictions = accelerator.pad_across_processes(
-                predictions, dim=1, pad_index=-100)
-            labels = accelerator.pad_across_processes(
-                labels, dim=1, pad_index=-100)
-
-            predictions_gathered = accelerator.gather(predictions)
-            labels_gathered = accelerator.gather(labels)
-
-            true_predictions, true_labels = postprocess(
-                predictions_gathered, labels_gathered)
-            metric.add_batch(predictions=true_predictions,
-                             references=true_labels)
-
-        results = metric.compute()
-        print(
-            f"epoch {epoch}:",
-            {
-                key: results[f"overall_{key}"]
-                for key in ["precision", "recall", "f1", "accuracy"]
-            },
-        )
-
-    output_dir = "restaurant_ner"
-    # Save and upload
-    accelerator.wait_for_everyone()
-    unwrapped_model = accelerator.unwrap_model(model)
-    unwrapped_model.save_pretrained(
-        output_dir, save_function=accelerator.save)
-    if accelerator.is_main_process:
-        tokenizer.save_pretrained(output_dir)
-
-    accelerator.wait_for_everyone()
-    unwrapped_model = accelerator.unwrap_model(model)
-    unwrapped_model.save_pretrained(output_dir, save_function=accelerator.save)
-
-
-train()
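The deleted file was the standalone token-classification (NER) trainer, leaving the repo with only the seq2seq pipeline. Its one subtle routine was align_labels_with_tokens, which stretches word-level tags over subword tokens. A worked example (the tokenization itself is illustrative; word_ids is what a fast tokenizer reports):

labels = [7, 8]                    # [B-Restaurant_Name, I-Restaurant_Name]
word_ids = [None, 0, 0, 1, None]   # [CLS], "Pan", "##da", "Express", [SEP]

# align_labels_with_tokens(labels, word_ids) -> [-100, 7, 8, 8, -100]
# Special tokens get -100 (ignored by the loss), and the trailing subword
# of a B- tagged word is relabeled to the matching I- tag.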
elise/src/train_t5_seq2seq.py CHANGED
@@ -1,22 +1,27 @@
+"""
+Training Flant_T5 model on tner/mit_restaurant on seq2seq task
+"""
+from dataclasses import asdict
+
 import torch
 import evaluate
 import datasets
 from torch.utils.data import DataLoader
-from datasets import load_dataset
-from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
-from dataclasses import asdict
-
-from transformers import DataCollatorForSeq2Seq
+from transformers import (
+    AutoTokenizer,
+    AutoModelForSeq2SeqLM,
+    DataCollatorForSeq2Seq,
+    get_scheduler,
+)
 from accelerate import Accelerator
-from transformers import get_scheduler
 import numpy as np
 import mlflow
 
 from tqdm.auto import tqdm
 
-from data import MITRestaurants, get_default_transforms
 from utils.logger import get_logger
 from configs import T5TrainingConfig
+from data import MITRestaurants, get_default_transforms
 
 log = get_logger("Flan_T5")
 log.debug("heloooooooooooo?")
@@ -36,6 +41,7 @@ model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
 
 
 def tokenize(example):
+    """Tokenizes dataset for seq2seq task"""
     tokenized = tokenizer(
         example["tokens"],
         text_target=example["labels"],
@@ -57,6 +63,7 @@ metric = evaluate.load("sacrebleu")
 
 
 def postprocess(predictions, labels):
+    """Post processing to convert model output for evaluation"""
     predictions = predictions.cpu().numpy()
     labels = labels.cpu().numpy()
 
@@ -115,7 +122,8 @@ model, optimizer, train_dataloader, eval_dataloader = accelerator.prepare(
 progress_bar = tqdm(range(num_training_steps))
 
 
-def train(model, dataset, metric):
+def train():
+    """Training function for finetuing flanT5"""
     # log.info("Starting Training")
     print("Starting Traning")
     for epoch in range(config.epochs):
@@ -174,10 +182,11 @@ def train(model, dataset, metric):
         transformers_model={"model": unwrapped_model, "tokenizer": tokenizer},
         task="text2text-generation",
         artifact_path="seq2seq_model",
-        registered_model_name="FlanT5_MIT"
+        registered_model_name="FlanT5_MIT",
     )
 
+
 mlflow.set_tracking_uri("http://127.0.0.1:5000")
 with mlflow.start_run() as mlflow_run:
     mlflow.log_params(asdict(config))
-    train(model, tokenized_datasets, metric)
+    train()
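Dropping the parameters from train() is the usual way to silence pylint's redefined-outer-name (W0621) in a script, since model, dataset, and metric shadowed module-level names; the tradeoff is that train() is now bound to this module's globals. After a run completes, the model registered as FlanT5_MIT can be loaded back from the same tracking server; a sketch, assuming the first registered version:

import mlflow

mlflow.set_tracking_uri("http://127.0.0.1:5000")

# loads the registered transformers model as a text2text-generation pipeline
pipe = mlflow.transformers.load_model("models:/FlanT5_MIT/1")
print(pipe("cheap italian place open after 10 pm"))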
elise/src/utils/logger.py CHANGED
@@ -4,7 +4,9 @@ Logging helper module
 import logging.config
 import yaml
 
-with open("elise/src/configs/logging_config.yaml", "r", encoding="utf-8") as f:
+with open(
+    "/home/kave/work/Elise/elise/src/configs/logging_config.yaml", "r", encoding="utf-8"
+) as f:
     config = yaml.safe_load(f.read())
 logging.config.dictConfig(config)
 logging.captureWarnings(True)
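The absolute /home/kave/... path fixes the old version's dependence on the working directory but breaks on every other machine. A portable sketch (not what the commit does) resolves the config relative to logger.py itself:

from pathlib import Path
import logging.config
import yaml

# utils/logger.py -> ../configs/logging_config.yaml, independent of cwd
CONFIG_PATH = Path(__file__).resolve().parent.parent / "configs" / "logging_config.yaml"

with CONFIG_PATH.open("r", encoding="utf-8") as f:
    logging.config.dictConfig(yaml.safe_load(f))
logging.captureWarnings(True)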