Kevin Fink committed on
Commit
a1b0975
·
1 Parent(s): 57918ff
Files changed (1) hide show
  1. app.py +19 -5
app.py CHANGED
@@ -1,6 +1,7 @@
1
  import spaces
2
  import gradio as gr
3
  from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM, TrainerCallback
 
4
  from datasets import load_dataset
5
  import traceback
6
  from huggingface_hub import login
@@ -41,17 +42,30 @@ def fine_tune_model(model_name, dataset_name, hub_id, api_key, num_epochs, batch
41
  def tokenize_function(examples):
42
  max_length = 16
43
  # Assuming 'text' is the input and 'target' is the expected output
44
- model_inputs = tokenizer(examples['text'], max_length=max_length, truncation=True)
 
 
 
 
 
 
45
 
46
  # Setup the decoder input IDs (shifted right)
47
  with tokenizer.as_target_tokenizer():
48
- labels = tokenizer(examples['target'], max_length=max_length, truncation=True)
 
 
 
 
 
 
49
 
50
  # Add labels to the model inputs
51
  model_inputs["labels"] = labels["input_ids"]
52
  return model_inputs
53
 
54
  tokenized_datasets = dataset.map(tokenize_function, batched=True)
 
55
 
56
  # Set training arguments
57
  training_args = TrainingArguments(
@@ -63,7 +77,7 @@ def fine_tune_model(model_name, dataset_name, hub_id, api_key, num_epochs, batch
63
  per_device_eval_batch_size=1,
64
  num_train_epochs=int(num_epochs),
65
  weight_decay=0.01,
66
- #gradient_accumulation_steps=grad*0.1,
67
  load_best_model_at_end=True,
68
  metric_for_best_model="accuracy",
69
  greater_is_better=True,
@@ -79,8 +93,8 @@ def fine_tune_model(model_name, dataset_name, hub_id, api_key, num_epochs, batch
79
  trainer = Trainer(
80
  model=model,
81
  args=training_args,
82
- train_dataset=tokenized_datasets['train'],
83
- eval_dataset=tokenized_datasets['test'],
84
  #callbacks=[LoggingCallback()],
85
  )
86
 
 
1
  import spaces
2
  import gradio as gr
3
  from transformers import Trainer, TrainingArguments, AutoTokenizer, AutoModelForSeq2SeqLM, TrainerCallback
4
+ from transformers import DataCollatorForSeq2Seq
5
  from datasets import load_dataset
6
  import traceback
7
  from huggingface_hub import login
 
42
  def tokenize_function(examples):
43
  max_length = 16
44
  # Assuming 'text' is the input and 'target' is the expected output
45
+ model_inputs = tokenizer(
46
+ examples['text'],
47
+ max_length=None, # Set to None for dynamic padding
48
+ padding=False, # Disable padding here, we will handle it later
49
+ truncation=True,
50
+ return_tensors="pt" # Return PyTorch tensors
51
+ )
52
 
53
  # Setup the decoder input IDs (shifted right)
54
  with tokenizer.as_target_tokenizer():
55
+ labels = tokenizer(
56
+ examples['target'],
57
+ max_length=None, # Set to None for dynamic padding
58
+ padding=False, # Disable padding here, we will handle it later
59
+ truncation=True,
60
+ return_tensors="pt"
61
+ )
62
 
63
  # Add labels to the model inputs
64
  model_inputs["labels"] = labels["input_ids"]
65
  return model_inputs
66
 
67
  tokenized_datasets = dataset.map(tokenize_function, batched=True)
68
+ data_collator = DataCollatorForSeq2Seq(tokenizer, model=model)
69
 
70
  # Set training arguments
71
  training_args = TrainingArguments(
 
77
  per_device_eval_batch_size=1,
78
  num_train_epochs=int(num_epochs),
79
  weight_decay=0.01,
80
+ gradient_accumulation_steps=int(grad),
81
  load_best_model_at_end=True,
82
  metric_for_best_model="accuracy",
83
  greater_is_better=True,
 
93
  trainer = Trainer(
94
  model=model,
95
  args=training_args,
96
+ train_dataset=data_collator['train'],
97
+ eval_dataset=data_collator['test'],
98
  #callbacks=[LoggingCallback()],
99
  )
100