Update app.py
app.py
CHANGED
@@ -9,6 +9,7 @@ dataset = load_dataset("json", data_files="dataset.jsonl")
 model_name = "Salesforce/codegen-2B-multi"
 model = AutoModelForCausalLM.from_pretrained(model_name)
 tokenizer = AutoTokenizer.from_pretrained(model_name)
+tokenizer.pad_token = tokenizer.eos_token
 
 # Tokenize the dataset
 def tokenize_function(examples):
@@ -16,8 +17,8 @@ def tokenize_function(examples):
         examples["input"],
         text_target=examples["output"],
         truncation=True, # Truncate sequences longer than max_length
-        max_length=512, # Adjust
-        padding="max_length" # Pad
+        max_length=512, # Adjust max length if needed
+        padding="max_length" # Pad sequences to max_length
     )
 
 tokenized_dataset = dataset.map(tokenize_function, batched=True)
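For context, below is a minimal sketch of the tokenization portion of app.py as it reads after this commit. The imports, the dataset-loading line quoted in the hunk header, and the "return tokenizer(" opening of the call are not visible in the diff and are assumed here.

from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the JSONL dataset referenced in the hunk header
dataset = load_dataset("json", data_files="dataset.jsonl")

model_name = "Salesforce/codegen-2B-multi"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The CodeGen tokenizer defines no pad token by default, so the commit reuses EOS for padding
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(examples):
    # The opening of this call is not shown in the diff; "return tokenizer(" is assumed
    return tokenizer(
        examples["input"],
        text_target=examples["output"],
        truncation=True,       # Truncate sequences longer than max_length
        max_length=512,        # Adjust max length if needed
        padding="max_length",  # Pad sequences to max_length
    )

tokenized_dataset = dataset.map(tokenize_function, batched=True)

The added pad_token line is what makes the pre-existing padding="max_length" option usable: without a pad token, the tokenizer raises a ValueError when asked to pad.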