Kiet2302 committed · Commit 2be9e40 · verified · 1 Parent(s): 1b7f245

Create app.py

Files changed (1): app.py +100 -0
app.py ADDED
@@ -0,0 +1,100 @@
+ import streamlit as st
+ import torch
+ from datasets import load_dataset
+ from transformers import GPT2LMHeadModel, GPT2Tokenizer, Trainer, TrainingArguments
+
+ ds = load_dataset("higgsfield/school-math-questions")
+
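+ # Flatten the data into (question, answer) pairs for MathDataset below.
+ # The split name and the column names here are assumptions about this
+ # dataset's schema; adjust them to whatever fields it actually exposes.
+ qa_pairs = [(row["prompt"], row["completion"]) for row in ds["train"]]
+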
+ class MathDataset(torch.utils.data.Dataset):
+     def __init__(self, qa_pairs, tokenizer, max_length=128):
+         self.qa_pairs = qa_pairs
+         self.tokenizer = tokenizer
+         self.max_length = max_length
+
+     def __len__(self):
+         return len(self.qa_pairs)
+
+     def __getitem__(self, idx):
+         question, answer = self.qa_pairs[idx]
+         # For causal LM fine-tuning the labels must line up token-for-token
+         # with the inputs, so encode the full "Q: ... A: ..." string once.
+         full_text = f"Q: {question} A: {answer.strip()}"
+
+         # Tokenize and pad the sequence to a fixed length
+         input_ids = self.tokenizer.encode(full_text, truncation=True, padding="max_length", max_length=self.max_length, return_tensors="pt").squeeze(0)
+
+         # Copy the inputs to the labels and mask padding positions with -100
+         # so they are ignored by the loss
+         labels = input_ids.clone()
+         labels[labels == self.tokenizer.pad_token_id] = -100
+
+         return {
+             "input_ids": input_ids,
+             "labels": labels,
+         }
+
+ model_name = "gpt2"
+ tokenizer = GPT2Tokenizer.from_pretrained(model_name)
+ tokenizer.pad_token = tokenizer.eos_token  # GPT-2 has no pad token; reuse EOS
+ model = GPT2LMHeadModel.from_pretrained(model_name)
+
+ math_dataset = MathDataset(qa_pairs, tokenizer)
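+
+ # Sanity check (not in the original script): decode one encoded example
+ # to confirm the "Q: ... A: ..." formatting survives tokenization.
+ print(tokenizer.decode(math_dataset[0]["input_ids"], skip_special_tokens=True))
+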
+ # Set training arguments
+ training_args = TrainingArguments(
+     output_dir="./results",
+     num_train_epochs=3,
+     per_device_train_batch_size=2,
+     save_steps=10,
+     save_total_limit=2,
+ )
+
+ # Create a Trainer
+ trainer = Trainer(
+     model=model,
+     args=training_args,
+     train_dataset=math_dataset,
+ )
+
+ # Fine-tune the model
+ trainer.train()
+
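+ # Persist the fine-tuned weights so the chatbot below can load them instead
+ # of the base checkpoint ("./results/final" is an arbitrary path choice).
+ # Note: Streamlit reruns this script top-to-bottom on every interaction, so
+ # in practice train offline or cache the model (e.g. with st.cache_resource)
+ # rather than retraining on each rerun.
+ trainer.save_model("./results/final")
+ tokenizer.save_pretrained("./results/final")
+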
+ class MathChatBot:
+     def __init__(self, model_name="gpt2"):
+         self.tokenizer = GPT2Tokenizer.from_pretrained(model_name)
+         self.model = GPT2LMHeadModel.from_pretrained(model_name)
+
+     def get_response(self, question):
+         input_text = f"Q: {question} A:"
+         input_ids = self.tokenizer.encode(input_text, return_tensors="pt")
+
+         # Greedy decoding; pad_token_id silences the missing-pad-token warning
+         output = self.model.generate(input_ids, max_length=50, num_return_sequences=1, pad_token_id=self.tokenizer.eos_token_id)
+         answer = self.tokenizer.decode(output[0], skip_special_tokens=True)
+         return answer.split("A:")[-1].strip()
+
+ # Usage
+ if __name__ == "__main__":
+     # Load the fine-tuned checkpoint saved above rather than the base model
+     bot = MathChatBot("./results/final")
+     user_input = st.text_area("Enter your question:")
+     if user_input:
+         response = bot.get_response(user_input)
+         st.write(f"Bot: {response}")