GabrielSalem committed on
Commit
1b1d234
·
verified ·
1 Parent(s): d5e59b9

Create utils.py

Browse files
Files changed (1) hide show
  1. utils.py +28 -0
utils.py ADDED
@@ -0,0 +1,28 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from transformers import Trainer, TrainingArguments, DataCollatorForLanguageModeling
2
+ from datasets import Dataset
3
+
4
def preprocess_data(df, tokenizer, *, max_length=512):
    """Build a tokenized ``Dataset`` from a question/answer DataFrame.

    Every row is rendered as ``"Question: <Question> Answer: <Answer>"``,
    stored in a ``text`` column, and tokenized with truncation and padding
    to ``max_length`` tokens.

    Args:
        df: pandas DataFrame with ``Question`` and ``Answer`` columns. The
            caller's frame is left untouched; the ``text`` column is added
            to a copy.
        tokenizer: Tokenizer callable accepting ``truncation``, ``padding``
            and ``max_length`` keyword arguments. If it has no pad token
            but does have an EOS token, the EOS token is installed as the
            pad token (this mutates the tokenizer).
        max_length: Length every example is truncated/padded to.

    Returns:
        The ``Dataset`` built from the frame, with the tokenizer outputs
        added by ``Dataset.map``.

    Raises:
        KeyError: If ``df`` lacks a ``Question`` or ``Answer`` column.
    """
    # Zip the two columns instead of a row-wise ``apply``: same f-string
    # rendering, no per-row Series construction, and it also works for an
    # empty frame (where ``apply(axis=1)`` does not return a Series).
    texts = [
        f"Question: {question} Answer: {answer}"
        for question, answer in zip(df["Question"], df["Answer"])
    ]
    # ``assign`` returns a new frame, so the caller's DataFrame is not
    # silently given an extra column.
    frame = df.assign(text=texts)

    # ``padding="max_length"`` needs a pad token; many causal-LM tokenizers
    # ship without one. Reusing EOS is the usual fallback.
    if getattr(tokenizer, "pad_token", None) is None:
        eos_token = getattr(tokenizer, "eos_token", None)
        if eos_token is not None:
            tokenizer.pad_token = eos_token

    def _tokenize(batch):
        # Called with a batch of rows because ``batched=True`` below.
        return tokenizer(
            batch["text"],
            truncation=True,
            padding="max_length",
            max_length=max_length,
        )

    return Dataset.from_pandas(frame).map(_tokenize, batched=True)
9
+
10
def train_model(
    model,
    tokenizer,
    dataset,
    output_dir,
    *,
    per_device_train_batch_size=4,
    num_train_epochs=1,
    logging_dir="./logs",
    save_steps=10,
    logging_steps=10,
):
    """Fine-tune ``model`` on ``dataset`` and save model and tokenizer.

    Training uses ``Trainer`` with a ``DataCollatorForLanguageModeling``
    configured with ``mlm=False``. When training finishes, the model and
    the tokenizer are written to ``output_dir``.

    Args:
        model: Model instance handed to ``Trainer``.
        tokenizer: Tokenizer used by the data collator and saved next to
            the model.
        dataset: Tokenized training dataset.
        output_dir: Directory for checkpoints and the final model/tokenizer.
        per_device_train_batch_size: Training batch size per device.
        num_train_epochs: Number of passes over ``dataset``.
        logging_dir: Directory passed to ``TrainingArguments`` for logs.
        save_steps: Interval, in steps, between checkpoint saves.
        logging_steps: Interval, in steps, between log entries.

    Returns:
        None.
    """
    training_args = TrainingArguments(
        output_dir=output_dir,
        per_device_train_batch_size=per_device_train_batch_size,
        num_train_epochs=num_train_epochs,
        logging_dir=logging_dir,
        save_steps=save_steps,
        logging_steps=logging_steps,
    )
    # mlm=False selects the non-masked language-modeling objective.
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=dataset,
        data_collator=data_collator,
    )
    trainer.train()

    # Save through the Trainer rather than ``model.save_pretrained``: it
    # writes from the main process only and handles models the Trainer has
    # wrapped for distributed training.
    trainer.save_model(output_dir)
    # The tokenizer was not handed to the Trainer, so save it explicitly,
    # once, from the main process.
    if trainer.is_world_process_zero():
        tokenizer.save_pretrained(output_dir)