Kevin Fink committed on
Commit d8c9b4e · 1 Parent(s): cfb27a8
Files changed (1)
  1. app.py +11 -8
app.py CHANGED
@@ -109,20 +109,22 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
             #text_target=examples['target'],
             return_tensors='pt',
         )
-
+        labels["input_ids"] = [
+            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
+        ]
         # Add labels to the model inputs
         model_inputs["labels"] = labels["input_ids"]
         return model_inputs
 
     #max_length = 512
     # Load the dataset
-
+    column_names = ['text', 'target']
 
     try:
         saved_dataset = load_from_disk(f'/data/{hub_id.strip()}_train_dataset')
         if os.access(f'/data/{hub_id.strip()}_test_dataset', os.R_OK):
             train_dataset = load_from_disk(f'/data/{hub_id.strip()}_train_dataset3')
-            saved_test_dataset = load_from_disk(f'/data/{hub_id.strip()}_validation_dataset')
+            saved_test_dataset = load_from_disk(f'/data/{hub_id.strip()}_test_dataset')
             print("FOUND TEST")
             # Create Trainer
             data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
@@ -132,14 +134,15 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
                 train_dataset=train_dataset,
                 eval_dataset=saved_test_dataset,
                 compute_metrics=compute_metrics,
-                data_collator=data_collator,
+                data_collator=data_collator,
+                processing_class=tokenizer,
             )
 
         elif os.access(f'/data/{hub_id.strip()}_train_dataset3', os.R_OK):
             dataset = load_dataset(dataset_name.strip())
             del dataset['train']
             del dataset['validation']
-            test_set = dataset.map(tokenize_function, batched=True, batch_size=50)
+            test_set = dataset.map(tokenize_function, batched=True, batch_size=50, remove_columns=column_names,)
             test_set['test'].save_to_disk(f'/data/{hub_id.strip()}_test_dataset')
             return 'TRAINING DONE'
 
@@ -154,7 +157,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
             third_third = dataset['train'].select(range(third_size*2, train_size))
             dataset['train'] = third_third
             #tokenized_second_half = tokenize_function(third_third)
-            tokenized_second_half = dataset.map(tokenize_function, batched=True, batch_size=50)
+            tokenized_second_half = dataset.map(tokenize_function, batched=True, batch_size=50, remove_columns=column_names,)
             dataset['train'] = concatenate_datasets([saved_dataset, tokenized_second_half['train']])
             dataset['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset3')
             return 'THIRD THIRD LOADED'
@@ -167,7 +170,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
             second_third = dataset['train'].select(range(third_size, third_size*2))
             dataset['train'] = second_third
             del dataset['test']
-            tokenized_sh_fq_dataset = dataset.map(tokenize_function, batched=True, batch_size=50)
+            tokenized_sh_fq_dataset = dataset.map(tokenize_function, batched=True, batch_size=50, remove_columns=column_names,)
             dataset['train'] = concatenate_datasets([saved_dataset['train'], tokenized_sh_fq_dataset['train']])
             dataset['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset2')
             dataset['validation'].save_to_disk(f'/data/{hub_id.strip()}_validation_dataset')
@@ -183,7 +186,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
             dataset['train'] = first_third
             del dataset['test']
             del dataset['validation']
-            tokenized_first_third = dataset.map(tokenize_function, batched=True, batch_size=50)
+            tokenized_first_third = dataset.map(tokenize_function, batched=True, batch_size=50, remove_columns=column_names,)
 
             tokenized_first_third.save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
             print('DONE')
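
The substance of the commit: pad token ids in the tokenized labels are replaced with -100 so the seq2seq loss ignores padding positions, the raw 'text'/'target' columns are dropped during tokenization via remove_columns, the test split is now loaded from the saved _test_dataset path, and data_collator plus processing_class=tokenizer are passed to the trainer. Below is a minimal, self-contained sketch of that tokenization pattern; the checkpoint id, dataset id, and padding/truncation settings are placeholders, not values taken from app.py.

from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

# Placeholder checkpoint; app.py receives its model and tokenizer as arguments.
checkpoint = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSeq2SeqLM.from_pretrained(checkpoint)

def tokenize_function(examples):
    # Tokenize inputs and targets; max_length/padding values here are illustrative.
    model_inputs = tokenizer(examples['text'], max_length=512, truncation=True, padding='max_length')
    labels = tokenizer(examples['target'], max_length=512, truncation=True, padding='max_length')
    # Replace pad token ids in the labels with -100 so the loss skips padded positions.
    labels["input_ids"] = [
        [(l if l != tokenizer.pad_token_id else -100) for l in label]
        for label in labels["input_ids"]
    ]
    model_inputs["labels"] = labels["input_ids"]
    return model_inputs

# Drop the raw text columns so only tokenized fields reach the collator.
column_names = ['text', 'target']
dataset = load_dataset('my-org/my-seq2seq-dataset')  # placeholder dataset id with 'text'/'target' columns
tokenized = dataset.map(tokenize_function, batched=True, batch_size=50, remove_columns=column_names)

# The collator pads batches for the model and leaves the -100 label positions untouched.
data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)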