Kevin Fink committed
Commit d8c9b4e · 1 Parent(s): cfb27a8 · dev

app.py CHANGED
@@ -109,20 +109,22 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
             #text_target=examples['target'],
             return_tensors='pt',
         )
-
+        labels["input_ids"] = [
+            [(l if l != tokenizer.pad_token_id else -100) for l in label] for label in labels["input_ids"]
+        ]
         # Add labels to the model inputs
         model_inputs["labels"] = labels["input_ids"]
         return model_inputs
 
     #max_length = 512
     # Load the dataset
-
+    column_names = ['text', 'target']
 
     try:
         saved_dataset = load_from_disk(f'/data/{hub_id.strip()}_train_dataset')
         if os.access(f'/data/{hub_id.strip()}_test_dataset', os.R_OK):
             train_dataset = load_from_disk(f'/data/{hub_id.strip()}_train_dataset3')
-            saved_test_dataset = load_from_disk(f'/data/{hub_id.strip()}
+            saved_test_dataset = load_from_disk(f'/data/{hub_id.strip()}_test_dataset')
             print("FOUND TEST")
             # Create Trainer
             data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
@@ -132,14 +134,15 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
                 train_dataset=train_dataset,
                 eval_dataset=saved_test_dataset,
                 compute_metrics=compute_metrics,
-                data_collator=data_collator,
+                data_collator=data_collator,
+                processing_class=tokenizer,
             )
 
         elif os.access(f'/data/{hub_id.strip()}_train_dataset3', os.R_OK):
             dataset = load_dataset(dataset_name.strip())
             del dataset['train']
             del dataset['validation']
-            test_set = dataset.map(tokenize_function, batched=True, batch_size=50)
+            test_set = dataset.map(tokenize_function, batched=True, batch_size=50, remove_columns=column_names,)
             test_set['test'].save_to_disk(f'/data/{hub_id.strip()}_test_dataset')
             return 'TRAINING DONE'
 
@@ -154,7 +157,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
             third_third = dataset['train'].select(range(third_size*2, train_size))
             dataset['train'] = third_third
             #tokenized_second_half = tokenize_function(third_third)
-            tokenized_second_half = dataset.map(tokenize_function, batched=True, batch_size=50)
+            tokenized_second_half = dataset.map(tokenize_function, batched=True, batch_size=50,remove_columns=column_names,)
             dataset['train'] = concatenate_datasets([saved_dataset, tokenized_second_half['train']])
             dataset['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset3')
             return 'THIRD THIRD LOADED'
@@ -167,7 +170,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
             second_third = dataset['train'].select(range(third_size, third_size*2))
             dataset['train'] = second_third
             del dataset['test']
-            tokenized_sh_fq_dataset = dataset.map(tokenize_function, batched=True, batch_size=50)
+            tokenized_sh_fq_dataset = dataset.map(tokenize_function, batched=True, batch_size=50, remove_columns=column_names,)
             dataset['train'] = concatenate_datasets([saved_dataset['train'], tokenized_sh_fq_dataset['train']])
             dataset['train'].save_to_disk(f'/data/{hub_id.strip()}_train_dataset2')
             dataset['validation'].save_to_disk(f'/data/{hub_id.strip()}_validation_dataset')
@@ -183,7 +186,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
             dataset['train'] = first_third
             del dataset['test']
             del dataset['validation']
-            tokenized_first_third = dataset.map(tokenize_function, batched=True, batch_size=50)
+            tokenized_first_third = dataset.map(tokenize_function, batched=True, batch_size=50, remove_columns=column_names,)
 
             tokenized_first_third.save_to_disk(f'/data/{hub_id.strip()}_train_dataset')
             print('DONE')
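A few notes on what this commit changes. The new label-masking block swaps pad token ids for -100 before the labels are attached to the model inputs. That matters because PyTorch's CrossEntropyLoss, which the Hugging Face seq2seq models use internally, skips targets equal to -100 by default, so padded positions stop contributing to the loss. A minimal sketch of the effect (the shapes and ids below are made up for illustration, none of this is from app.py):

```python
import torch
from torch.nn import CrossEntropyLoss

# CrossEntropyLoss ignores targets equal to -100 (its default ignore_index),
# which is why the commit remaps pad token ids in the labels to -100.
logits = torch.randn(1, 4, 32)                # (batch, seq_len, vocab), dummy values
labels = torch.tensor([[5, 9, -100, -100]])   # last two positions were padding

loss_fn = CrossEntropyLoss()                  # ignore_index defaults to -100
loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1))
print(loss)                                   # computed over the first two tokens only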
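The -100 masking in tokenize_function also lines up with what DataCollatorForSeq2Seq does at batch time: it pads input_ids with the tokenizer's pad token but pads labels with label_pad_token_id, which defaults to -100. A small sketch, with t5-small as a stand-in checkpoint (the app's actual model arrives through the model argument):

```python
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, DataCollatorForSeq2Seq

# t5-small is a placeholder checkpoint for illustration only.
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")
collator = DataCollatorForSeq2Seq(tokenizer, model=model)

features = [
    {"input_ids": [37, 423, 5], "labels": [1820, 5]},
    {"input_ids": [37, 5],      "labels": [1820, 423, 5]},
]
batch = collator(features)
print(batch["labels"])  # the shorter label row is padded with -100, not pad_token_id
```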
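The added processing_class=tokenizer is the current spelling of what used to be Trainer's tokenizer argument; recent transformers releases deprecate tokenizer= in favor of processing_class=, which keeps the tokenizer available for checkpoint saves and Hub pushes without the deprecation warning. A self-contained sketch of the same wiring; the checkpoint and the two-row dataset are placeholders, not the app's real inputs:

```python
from datasets import Dataset
from transformers import (AutoTokenizer, AutoModelForSeq2SeqLM,
                          DataCollatorForSeq2Seq, Seq2SeqTrainer,
                          Seq2SeqTrainingArguments)

tokenizer = AutoTokenizer.from_pretrained("t5-small")   # placeholder checkpoint
model = AutoModelForSeq2SeqLM.from_pretrained("t5-small")

def tokenize_function(batch):
    # toy version of the app's tokenizer: encode source text, attach target ids as labels
    enc = tokenizer(batch["text"], truncation=True)
    enc["labels"] = tokenizer(batch["target"], truncation=True)["input_ids"]
    return enc

data = Dataset.from_dict({"text": ["hello", "goodbye"],
                          "target": ["bonjour", "au revoir"]})
data = data.map(tokenize_function, batched=True, remove_columns=["text", "target"])

trainer = Seq2SeqTrainer(
    model=model,
    args=Seq2SeqTrainingArguments(output_dir="out", num_train_epochs=1),
    train_dataset=data,
    data_collator=DataCollatorForSeq2Seq(tokenizer, model=model),
    processing_class=tokenizer,   # replaces the deprecated tokenizer= kwarg
)
trainer.train()
```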
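Every dataset.map call now passes remove_columns=column_names. Without it, the raw 'text' and 'target' string columns survive tokenization and ride along into the collator, which cannot pad strings; dropping them leaves only the token-id columns. A toy illustration with a fake tokenizer (nothing here comes from app.py):

```python
from datasets import Dataset

ds = Dataset.from_dict({"text": ["a b", "c"], "target": ["x", "y z"]})

def fake_tokenize(batch):
    # stand-in for the real tokenize_function
    n = len(batch["text"])
    return {"input_ids": [[1, 2]] * n, "labels": [[3]] * n}

tok = ds.map(fake_tokenize, batched=True, batch_size=50,
             remove_columns=["text", "target"])
print(tok.column_names)  # ['input_ids', 'labels'], the raw text columns are gone
```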
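For context on the surrounding branches: they tokenize the train split one third at a time across separate runs, each pass selecting a contiguous slice, tokenizing it, and concatenating it onto the previously saved piece. The slicing arithmetic is plain select over index ranges, sketched here with a toy dataset:

```python
from datasets import Dataset, concatenate_datasets

ds = Dataset.from_dict({"text": [f"row {i}" for i in range(9)]})
train_size = len(ds)
third_size = train_size // 3

first_third  = ds.select(range(0, third_size))
second_third = ds.select(range(third_size, third_size * 2))
third_third  = ds.select(range(third_size * 2, train_size))  # tail absorbs any remainder

rebuilt = concatenate_datasets([first_third, second_third, third_third])
assert rebuilt["text"] == ds["text"]   # the three slices cover the split exactly
```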
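Each pass persists its output under /data with save_to_disk, and the next run resumes via load_from_disk behind the os.access checks seen above. A minimal sketch of that resume pattern (the /tmp path stands in for the app's f'/data/{hub_id}_...' paths):

```python
import os
from datasets import Dataset, load_from_disk

path = "/tmp/demo_train_dataset"   # stand-in for f'/data/{hub_id.strip()}_train_dataset'

if os.access(path, os.R_OK):
    ds = load_from_disk(path)      # later run: pick up where the last pass stopped
else:
    ds = Dataset.from_dict({"text": ["first pass output"]})
    ds.save_to_disk(path)          # first run: persist for the next pass
print(ds)
```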