Kevin Fink committed
Commit e7cde01 · 1 Parent(s): 1744a34
Files changed (1)
  1. app.py +16 -12
app.py CHANGED
@@ -116,7 +116,18 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
     max_length = model.get_input_embeddings().weight.shape[0]
     try:
         saved_dataset = load_from_disk(f'/data/{hub_id.strip()}_train_dataset')
-        if 'test' in saved_dataset.keys():
+        if 'test' in saved_dataset.key():
+            print("FOUND TEST")
+            # Create Trainer
+            trainer = Trainer(
+                model=model,
+                args=training_args,
+                train_dataset=tokenized_train_dataset,
+                eval_dataset=tokenized_test_dataset,
+                compute_metrics=compute_metrics,
+            )
+        elif 'validation' in saved_dataset.keys():
+            print("FOUND VALIDATION")
             third_third = dataset['train'].select(range(third_size*2, train_size))
             dataset['train'] = third_third
             tokenized_second_half = dataset.map(tokenize_function, batched=True)
@@ -125,20 +136,13 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
             tokenized_test_dataset = dataset['test']
         else:
             second_third = dataset['train'].select(range(third_size, third_size*2))
-            dataset['train'] = second_third
+            dataset['train'] = second_third
+            del dataset['test']
             tokenized_sh_fq_dataset = tokenize_function(dataset, batched=True)
             dataset['train'] = concatenate_datasets([saved_dataset['train'], tokenized_sh_fq_dataset['train']])
-            tokenized_half.save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
+            dataset.save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
             return
-
-        # Create Trainer
-        trainer = Trainer(
-            model=model,
-            args=training_args,
-            train_dataset=tokenized_train_dataset,
-            eval_dataset=tokenized_test_dataset,
-            compute_metrics=compute_metrics,
-        )
+
     except:
         tokenizer = AutoTokenizer.from_pretrained('google/t5-efficient-tiny-nh8')
         # Tokenize the dataset
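For context, the hunks above implement a resume-between-runs pattern: the raw train split is tokenized one third at a time, the partial result is persisted with save_to_disk, and the next run reloads it with load_from_disk and branches on which splits ('test', 'validation') the saved DatasetDict reports via .keys(). The snippet below is a minimal, self-contained sketch of that pattern under stated assumptions, not the app's actual code: the toy in-memory dataset, the /tmp path, the 32-token max_length, and the FileNotFoundError fallback are illustrative, while tokenize_function, third_size, and the google/t5-efficient-tiny-nh8 tokenizer mirror names that appear in app.py.

from datasets import Dataset, DatasetDict, load_from_disk, concatenate_datasets
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('google/t5-efficient-tiny-nh8')

def tokenize_function(examples):
    # Tokenize a batch of raw text rows (assumed column name: 'text').
    return tokenizer(examples['text'], truncation=True, max_length=32)

save_path = '/tmp/demo_train_dataset'  # app.py uses f'/data/{hub_id.strip()}_train_dataset'
dataset = DatasetDict({'train': Dataset.from_dict({'text': [f'example {i}' for i in range(9)]})})

train_size = len(dataset['train'])
third_size = train_size // 3

try:
    # Resume: a previous run already tokenized and saved the first chunk of the train split.
    saved_dataset = load_from_disk(save_path)
    print('saved splits:', list(saved_dataset.keys()))  # split names are exposed dict-style
    second_third = dataset['train'].select(range(third_size, third_size * 2))
    tokenized_chunk = second_third.map(tokenize_function, batched=True)
    combined = concatenate_datasets([saved_dataset['train'], tokenized_chunk])
    # Save to a fresh directory: datasets refuses to overwrite Arrow files it is still memory-mapping.
    DatasetDict({'train': combined}).save_to_disk(save_path + '_next')
except FileNotFoundError:
    # First run: tokenize only the first third and persist it for the next invocation.
    first_third = dataset['train'].select(range(third_size))
    tokenized_chunk = first_third.map(tokenize_function, batched=True)
    DatasetDict({'train': tokenized_chunk}).save_to_disk(save_path)

Writing each round to a fresh directory is a deliberate choice in this sketch: datasets raises an error if a dataset is asked to overwrite files it is still memory-mapping from the preceding load_from_disk call.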
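The first hunk also moves the Trainer construction into the branch where a saved 'test' split is found. As a reference for that wiring, here is a hedged, minimal Trainer example: the tiny classification model, the synthetic pre-tokenized splits, and the accuracy metric are stand-ins chosen for brevity, while the model, args, train_dataset, eval_dataset, and compute_metrics keyword arguments mirror those used in the hunk.

import numpy as np
from datasets import Dataset
from transformers import (AutoModelForSequenceClassification, AutoTokenizer,
                          Trainer, TrainingArguments)

model_name = 'distilbert-base-uncased'  # stand-in; app.py fine-tunes a T5 variant
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

def make_split(n):
    # Build a tiny, already-tokenized split with alternating labels.
    texts = [f'sample text {i}' for i in range(n)]
    enc = dict(tokenizer(texts, truncation=True, padding='max_length', max_length=16))
    enc['labels'] = [i % 2 for i in range(n)]
    return Dataset.from_dict(enc)

tokenized_train_dataset = make_split(16)
tokenized_test_dataset = make_split(4)

def compute_metrics(eval_pred):
    # Simple accuracy over the eval split.
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {'accuracy': float((preds == labels).mean())}

training_args = TrainingArguments(
    output_dir='/tmp/demo_trainer',
    num_train_epochs=1,
    per_device_train_batch_size=4,
    report_to=[],
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()
print(trainer.evaluate())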