amihai85 committed on
Commit 8363f0b · verified · 1 Parent(s): 8b67a67

Update app.py

Files changed (1)
  1. app.py +3 -2
app.py CHANGED
@@ -9,6 +9,7 @@ dataset = load_dataset("json", data_files="dataset.jsonl")
 model_name = "Salesforce/codegen-2B-multi"
 model = AutoModelForCausalLM.from_pretrained(model_name)
 tokenizer = AutoTokenizer.from_pretrained(model_name)
+tokenizer.pad_token = tokenizer.eos_token
 
 # Tokenize the dataset
 def tokenize_function(examples):
@@ -16,8 +17,8 @@ def tokenize_function(examples):
         examples["input"],
         text_target=examples["output"],
         truncation=True,      # Truncate sequences longer than max_length
-        max_length=512,       # Adjust this based on your use case
-        padding="max_length"  # Pad shorter sequences to max_length
+        max_length=512,       # Adjust max length if needed
+        padding="max_length"  # Pad sequences to max_length
     )
 
 tokenized_dataset = dataset.map(tokenize_function, batched=True)
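The substantive change here is `tokenizer.pad_token = tokenizer.eos_token`. The CodeGen tokenizer is a GPT-style BPE tokenizer that ships without a pad token, so the `padding="max_length"` call inside `tokenize_function` would otherwise fail. A minimal sketch of the fix in isolation (the model name comes from the diff above; the input string is purely illustrative):

from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("Salesforce/codegen-2B-multi")

# GPT-style tokenizers define no pad token; without the next line,
# padding raises a ValueError ("Asking to pad but the tokenizer does
# not have a padding token").
tokenizer.pad_token = tokenizer.eos_token

batch = tokenizer(
    "def add(a, b):",   # illustrative input, not from dataset.jsonl
    truncation=True,
    max_length=512,
    padding="max_length",
)
print(len(batch["input_ids"]))  # 512: the short input is padded out with EOS ids

Reusing the EOS id as the pad id is the usual convention for causal LMs; the attention mask still distinguishes real tokens from padding.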