Kevin Fink committed · commit 94aee2e · 1 parent: 50f7a65 · branch: dev
app.py
CHANGED
@@ -86,23 +86,25 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
 
     tokenizer = AutoTokenizer.from_pretrained('google/t5-efficient-tiny-nh8')
 
+    max_length = model.get_input_embeddings().weight.shape[0]
+
     def tokenize_function(examples):
 
         # Assuming 'text' is the input and 'target' is the expected output
         model_inputs = tokenizer(
             examples['text'],
-
-
-
+            max_length=max_length,  # Set to None for dynamic padding
+            truncation=True,
+            padding=True,
             return_tensors='pt',
         )
 
         # Setup the decoder input IDs (shifted right)
         labels = tokenizer(
             examples['target'],
-
-
-
+            max_length=max_length,  # Set to None for dynamic padding
+            truncation=True,
+            padding=True,
             #text_target=examples['target'],
             return_tensors='pt',
         )
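One thing to flag in this hunk: model.get_input_embeddings().weight.shape[0] is the number of rows in the embedding matrix, i.e. the vocabulary size (roughly 32k for T5 checkpoints), not a sequence-length limit, so truncation at that length will almost never fire. Below is a minimal standalone sketch of the tokenization step, assuming a conventional cap from tokenizer.model_max_length instead; the labels wiring at the end is the usual seq2seq pattern and is an assumption, since the visible hunk ends before labels are attached.

# Hedged sketch, not the app.py code: tokenize 'text'/'target' pairs for a
# T5-style seq2seq model, mirroring the parameters added in this commit.
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('google/t5-efficient-tiny-nh8')

# Assumption: cap length via the tokenizer, not the embedding matrix;
# get_input_embeddings().weight.shape[0] is the vocabulary size.
max_length = min(tokenizer.model_max_length, 512)

def tokenize_function(examples):
    # Tokenize encoder inputs; pad to the longest sequence in the batch.
    model_inputs = tokenizer(
        examples['text'],
        max_length=max_length,
        truncation=True,
        padding=True,
    )
    # Tokenize targets the same way and attach them as labels (assumption:
    # the usual seq2seq pattern; this step is outside the visible hunk).
    labels = tokenizer(
        examples['target'],
        max_length=max_length,
        truncation=True,
        padding=True,
    )
    model_inputs['labels'] = labels['input_ids']
    return model_inputs

If this runs under datasets.Dataset.map(batched=True), return_tensors='pt' is usually dropped as well, since map stores plain lists; it only works here because padding=True makes every sequence in the batch the same length.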
@@ -113,7 +115,7 @@ def fine_tune_model(model, dataset_name, hub_id, api_key, num_epochs, batch_size
 
     #max_length = 512
     # Load the dataset
-
+
 
     try:
         saved_dataset = load_from_disk(f'/data/{hub_id.strip()}_train_dataset')
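The try/load_from_disk pair implies a cache-or-build pattern for the tokenized split under /data/. Here is a minimal sketch of that pattern, assuming the standard datasets API; the helper name get_train_dataset and the except branch are illustrative assumptions, not part of this commit.

# Hedged sketch: reuse a previously tokenized split from persistent storage,
# falling back to tokenizing and caching it on the first run.
from datasets import load_dataset, load_from_disk

def get_train_dataset(dataset_name, hub_id, tokenize_function):
    # Assumption: cache path mirrors the f-string in the diff.
    path = f'/data/{hub_id.strip()}_train_dataset'
    try:
        # Fast path: a tokenized copy already exists on disk.
        return load_from_disk(path)
    except FileNotFoundError:
        # Slow path: download, tokenize, and cache for the next run.
        raw = load_dataset(dataset_name, split='train')
        tokenized = raw.map(tokenize_function, batched=True)
        tokenized.save_to_disk(path)
        return tokenized

Catching FileNotFoundError specifically keeps genuine I/O or schema errors visible instead of silently re-tokenizing on every failure.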