Update app.py
app.py
CHANGED
@@ -9,6 +9,7 @@ dataset = load_dataset("json", data_files="dataset.jsonl")
 model_name = "Salesforce/codegen-2B-multi"
 model = AutoModelForCausalLM.from_pretrained(model_name)
 tokenizer = AutoTokenizer.from_pretrained(model_name)
+tokenizer.pad_token = tokenizer.eos_token
 
 # Tokenize the dataset
 def tokenize_function(examples):
@@ -16,8 +17,8 @@ def tokenize_function(examples):
         examples["input"],
         text_target=examples["output"],
         truncation=True, # Truncate sequences longer than max_length
-        max_length=512, # Adjust
-        padding="max_length" # Pad
+        max_length=512, # Adjust max length if needed
+        padding="max_length" # Pad sequences to max_length
     )
 
 tokenized_dataset = dataset.map(tokenize_function, batched=True)
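For context, below is a minimal sketch of the tokenization portion of app.py as it reads after this commit. The imports, the dataset-loading line quoted in the hunk header, and the "return tokenizer(" opening of the call are not visible in the diff and are assumed here.

from datasets import load_dataset
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the JSONL dataset referenced in the hunk header
dataset = load_dataset("json", data_files="dataset.jsonl")

model_name = "Salesforce/codegen-2B-multi"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The CodeGen tokenizer defines no pad token by default, so the commit reuses EOS for padding
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(examples):
    # The opening of this call is not shown in the diff; "return tokenizer(" is assumed
    return tokenizer(
        examples["input"],
        text_target=examples["output"],
        truncation=True,       # Truncate sequences longer than max_length
        max_length=512,        # Adjust max length if needed
        padding="max_length",  # Pad sequences to max_length
    )

tokenized_dataset = dataset.map(tokenize_function, batched=True)

The added pad_token line is what makes the pre-existing padding="max_length" option usable: without a pad token, the tokenizer raises a ValueError when asked to pad.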