amihai85 committed on
Commit
e0ecae1
·
verified ·
1 Parent(s): b486ce5

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +41 -0
app.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
"""Fine-tune Salesforce/codegen-2B-multi on input/output pairs from dataset.jsonl.

Each record in ``dataset.jsonl`` is expected to carry an ``input`` (prompt)
and an ``output`` (completion) string field. The fine-tuned model and its
tokenizer are written to ``./fine_tuned_codegen``.
"""
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Load the dataset. A single JSON-lines file is exposed as the "train" split.
dataset = load_dataset("json", data_files="dataset.jsonl")

# Hold out part of the data so the per-epoch evaluation loss is measured on
# examples the model was not trained on (evaluating on the training split
# only reports how well the model memorized it).
split_dataset = dataset["train"].train_test_split(test_size=0.1, seed=42)

# Load the model and tokenizer
model_name = "Salesforce/codegen-2B-multi"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The CodeGen tokenizer defines no padding token; batching variable-length
# sequences needs one, so reuse the end-of-sequence token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Text placed between prompt and completion when building a training sequence.
# NOTE(review): assumes prompts do not already end with their own separator;
# adjust to match the format of dataset.jsonl.
PROMPT_COMPLETION_SEPARATOR = "\n"


# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize a batch of records as single prompt+completion sequences.

    A causal LM is trained to predict the next token of one continuous
    sequence, so the prompt and completion are joined (and terminated with
    EOS) instead of being tokenized separately via ``text_target``, which
    would yield labels whose length does not match ``input_ids``.

    Args:
        examples: Batched mapping with ``"input"`` and ``"output"`` lists of
            strings, as passed by ``Dataset.map(..., batched=True)``.

    Returns:
        The tokenizer output (``input_ids`` and ``attention_mask``) for the
        joined sequences; labels are derived later by the data collator.
    """
    texts = [
        prompt + PROMPT_COMPLETION_SEPARATOR + completion + tokenizer.eos_token
        for prompt, completion in zip(examples["input"], examples["output"])
    ]
    return tokenizer(texts, truncation=True)


tokenized_dataset = split_dataset.map(
    tokenize_function,
    batched=True,
    # Drop the raw text columns; only token tensors should reach the collator.
    remove_columns=split_dataset["train"].column_names,
)

# Pads each batch to a common length and copies input_ids into labels
# (padding positions are masked out of the loss). mlm=False selects the
# causal-LM objective rather than masked-LM.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Define training arguments
training_args = TrainingArguments(
    output_dir="./results",
    overwrite_output_dir=True,
    # NOTE(review): recent transformers releases rename this argument to
    # `eval_strategy`; switch if the installed version rejects it.
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    save_strategy="epoch",
    logging_dir="./logs",
)

# Train the model
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator,
)

trainer.train()
trainer.save_model("./fine_tuned_codegen")
tokenizer.save_pretrained("./fine_tuned_codegen")
print("Training complete. Model saved.")