NoOrdinaryJoy committed on
Commit
cbc539c
·
verified ·
1 Parent(s): 9416946

Create app.py

Browse files
Files changed (1) hide show
  1. app.py +209 -0
app.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
# `!pip install ...` is IPython/Colab cell syntax and a SyntaxError in a plain
# .py file, so the same two installs are run through the current
# interpreter's pip instead. check_call raises CalledProcessError if an
# install fails, rather than continuing with missing packages.
import subprocess
import sys

subprocess.check_call([
    sys.executable, "-m", "pip", "install",
    "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git",
])
subprocess.check_call([
    sys.executable, "-m", "pip", "install", "--no-deps",
    "xformers", "trl", "peft", "accelerate", "bitsandbytes",
])
3
+
4
+ from unsloth import FastLanguageModel
5
+ import torch
6
+ max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
7
+ dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
8
+ load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.
9
+
10
+ # 4bit pre quantized models we support for 4x faster downloading + no OOMs.
11
+ fourbit_models = [
12
+ "unsloth/mistral-7b-v0.3-bnb-4bit", # New Mistral v3 2x faster!
13
+ "unsloth/mistral-7b-instruct-v0.3-bnb-4bit",
14
+ "unsloth/llama-3-8b-bnb-4bit", # Llama-3 15 trillion tokens model 2x faster!
15
+ "unsloth/llama-3-8b-Instruct-bnb-4bit",
16
+ "unsloth/llama-3-70b-bnb-4bit",
17
+ "unsloth/Phi-3-mini-4k-instruct", # Phi-3 2x faster!
18
+ "unsloth/Phi-3-medium-4k-instruct",
19
+ "unsloth/mistral-7b-bnb-4bit",
20
+ "unsloth/gemma-7b-bnb-4bit", # Gemma 2.2x faster!
21
+ ] # More models at https://huggingface.co/unsloth
22
+
23
+ model, tokenizer = FastLanguageModel.from_pretrained(
24
+ model_name = "unsloth/llama-3-8b-bnb-4bit",
25
+ max_seq_length = max_seq_length,
26
+ dtype = dtype,
27
+ load_in_4bit = load_in_4bit,
28
+ # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
29
+ )
30
+
31
+ from google.colab import drive
32
+ drive.mount('/content/drive')
33
+
34
+ import pandas as pd
35
+ df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/qa_examples.csv')
36
+ df.head(5)
37
+
38
+ df.columns = df.columns.str.strip()
39
+ df.columns
40
+
41
+ # Format into new columns
42
+ df['instruction'] = df.apply(lambda row: f"The following question is solved for {row['marks_available']} marks: {row['question']}. Referring to the mark-scheme, award the appropriate amount of marks to the student: {row['mark_scheme']}", axis=1)
43
+ df['input'] = df['student_response']
44
+ df['output'] = df.apply(lambda row: str({'marks': row['marks_award'], 'explanation': row['explanation']}), axis=1)
45
+
46
+ # Create a new DataFrame with the desired structure
47
+ formatted_df = pd.DataFrame({
48
+ 'instruction': df['instruction'],
49
+ 'input': df['input'],
50
+ 'output': df['output']
51
+ })
52
+
53
+ # Display the formatted DataFrame
54
+ formatted_df.head(5)
55
+
56
+ """* We support Llama, Mistral, Phi-3, Gemma, Yi, DeepSeek, Qwen, TinyLlama, Vicuna, Open Hermes etc
57
+ * We support 16bit LoRA or 4bit QLoRA. Both 2x faster.
58
+ * `max_seq_length` can be set to anything, since we do automatic RoPE Scaling via [kaiokendev's](https://kaiokendev.github.io/til) method.
59
+ * With [PR 26037](https://github.com/huggingface/transformers/pull/26037), we support downloading 4bit models **4x faster**! [Our repo](https://huggingface.co/unsloth) has Llama, Mistral 4bit models.
60
+ * [**NEW**] We make Phi-3 Medium / Mini **2x faster**! See our [Phi-3 Medium notebook](https://colab.research.google.com/drive/1hhdhBa1j_hsymiW9m-WzxQtgqTH_NHqi?usp=sharing)
61
+
62
+ We now add LoRA adapters so we only need to update 1 to 10% of all parameters!
63
+ """
64
+
65
+ model = FastLanguageModel.get_peft_model(
66
+ model,
67
+ r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
68
+ target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
69
+ "gate_proj", "up_proj", "down_proj",],
70
+ lora_alpha = 16,
71
+ lora_dropout = 0, # Supports any, but = 0 is optimized
72
+ bias = "none", # Supports any, but = "none" is optimized
73
+ # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
74
+ use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
75
+ random_state = 3407,
76
+ use_rslora = False, # We support rank stabilized LoRA
77
+ loftq_config = None, # And LoftQ
78
+ )
79
+
80
+ """<a name="Data"></a>
81
+ ### Data Prep
82
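+ The original notebook uses the Alpaca dataset from [yahma](https://huggingface.co/datasets/yahma/alpaca-cleaned), which is a filtered version of 52K of the original [Alpaca dataset](https://crfm.stanford.edu/2023/03/13/alpaca.html). This script instead trains on the `qa_examples.csv` data loaded earlier, formatted with the same Alpaca-style instruction/input/response prompt. You can replace this code section with your own data prep.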
83
+
84
+ **[NOTE]** To train only on completions (ignoring the user's input) read TRL's docs [here](https://huggingface.co/docs/trl/sft_trainer#train-on-completions-only).
85
+
86
+ **[NOTE]** Remember to add the **EOS_TOKEN** to the tokenized output!! Otherwise you'll get infinite generations!
87
+
88
+ If you want to use the `llama-3` template for ShareGPT datasets, try our conversational [notebook](https://colab.research.google.com/drive/1XamvWYinY6FOSX9GLvnqSjjsNflxdhNc?usp=sharing).
89
+
90
+ For text completions like novel writing, try this [notebook](https://colab.research.google.com/drive/1ef-tab5bhkvWmBOObepl1WgJvfvSzn5Q?usp=sharing).
91
+
92
+ <a name="Train"></a>
93
+ ### Train the model
94
+ Now let's use Hugging Face TRL's `SFTTrainer`! More docs here: [TRL SFT docs](https://huggingface.co/docs/trl/sft_trainer). We do 60 steps to speed things up, but for a full run you can set `num_train_epochs=1` and turn off the step limit with `max_steps=None`. We also support TRL's `DPOTrainer`!
95
+ """
96
+
97
+ alpaca_prompt = """
98
+ ### Instruction:
99
+ {}
100
+ ### Input:
101
+ {}
102
+ ### Response:
103
+ {}"""
104
+
105
+ EOS_TOKEN = tokenizer.eos_token # Must add EOS_TOKEN
106
def formatting_prompts_func(examples):
    """Render a batch of examples into full training prompts.

    Args:
        examples: Mapping with the keys ``"instruction"``, ``"input"`` and
            ``"output"``, each holding a sequence of the same length (one
            entry per example in the batch).

    Returns:
        A dict with a single key ``"text"`` whose value is a list of prompt
        strings: each example is substituted into the module-level
        ``alpaca_prompt`` template and terminated with ``EOS_TOKEN``.
    """
    rows = zip(examples["instruction"], examples["input"], examples["output"])
    # `student_input` rather than `input`, so the builtin is not shadowed.
    texts = [
        alpaca_prompt.format(instruction, student_input, output) + EOS_TOKEN
        for instruction, student_input, output in rows
    ]
    return {"text": texts}
116
+
117
+ from datasets import Dataset
118
+ dataset = Dataset.from_pandas(formatted_df, split = 'train')
119
+ dataset = dataset.map(formatting_prompts_func, batched = True,)
120
+ print(dataset)
121
+
122
+ from trl import SFTTrainer
123
+ from transformers import TrainingArguments
124
+ from unsloth import is_bfloat16_supported
125
+
126
+ trainer = SFTTrainer(
127
+ model = model,
128
+ tokenizer = tokenizer,
129
+ train_dataset = dataset,
130
+ dataset_text_field = "text",
131
+ max_seq_length = max_seq_length,
132
+ dataset_num_proc = 3,
133
+ packing = False, # Can make training 5x faster for short sequences.
134
+ args = TrainingArguments(
135
+ per_device_train_batch_size = 2,
136
+ gradient_accumulation_steps = 4,
137
+ warmup_steps = 5,
138
+ max_steps = 60,
139
+ learning_rate = 2e-4,
140
+ fp16 = not is_bfloat16_supported(),
141
+ bf16 = is_bfloat16_supported(),
142
+ logging_steps = 1,
143
+ optim = "adamw_8bit",
144
+ weight_decay = 0.01,
145
+ lr_scheduler_type = "linear",
146
+ seed = 3407,
147
+ output_dir = "outputs",
148
+ ),
149
+ )
150
+
151
+ #@title Show current memory stats
152
+ gpu_stats = torch.cuda.get_device_properties(0)
153
+ start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
154
+ max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
155
+ print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
156
+ print(f"{start_gpu_memory} GB of memory reserved.")
157
+
158
+ trainer_stats = trainer.train()
159
+
160
+ #@title Show final memory and time stats
161
+ used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
162
+ used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
163
+ used_percentage = round(used_memory /max_memory*100, 3)
164
+ lora_percentage = round(used_memory_for_lora/max_memory*100, 3)
165
+ print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
166
+ print(f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training.")
167
+ print(f"Peak reserved memory = {used_memory} GB.")
168
+ print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
169
+ print(f"Peak reserved memory % of max memory = {used_percentage} %.")
170
+ print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")
171
+
172
+ """<a name="Inference"></a>
173
+ ### Inference
174
+ Let's run the model! You can change the instruction and input - leave the output blank!
175
+
176
+ You can also use a `TextStreamer` for continuous inference - so you can see the generation token by token, instead of waiting the whole time!
177
+ """
178
+
179
+ from transformers import TextStreamer
180
+ FastLanguageModel.for_inference(model) # Enable native 2x faster inference
181
+
182
+ import json
183
+ import re
184
+
185
def extract_json(text):
    """Return the first dict-like object embedded in ``text``.

    Each ``{`` in ``text`` is tried in order as the start of an object:

    1. as JSON, via ``json.JSONDecoder.raw_decode`` — this handles nested
       objects and braces inside strings, which a non-greedy ``\\{.*?\\}``
       regex truncates at the first ``}``;
    2. as a Python dict literal, via ``ast.literal_eval`` — this covers
       single-quoted ``str(dict)`` output, which is not valid JSON.

    Args:
        text: Arbitrary text (e.g. decoded model output) that may contain an
            object somewhere inside it.

    Returns:
        A list holding the first successfully parsed ``dict``, or an empty
        list when nothing in ``text`` parses.
    """
    import ast  # local import: only this helper needs it

    decoder = json.JSONDecoder()
    # Everything literal_eval is known to raise on malformed or hostile input.
    literal_errors = (ValueError, SyntaxError, TypeError, MemoryError, RecursionError)

    start = text.find("{")
    while start != -1:
        try:
            parsed, _ = decoder.raw_decode(text, start)
        except json.JSONDecodeError:
            parsed = None
        if isinstance(parsed, dict):
            return [parsed]

        # Fallback: grow the candidate to each successive "}" until it
        # evaluates as a Python dict literal. Quadratic in the worst case,
        # which is fine for short generated snippets.
        end = text.find("}", start)
        while end != -1:
            try:
                candidate = ast.literal_eval(text[start:end + 1])
            except literal_errors:
                candidate = None
            if isinstance(candidate, dict):
                return [candidate]
            end = text.find("}", end + 1)

        start = text.find("{", start + 1)
    return []
196
+
197
# Reuses the `alpaca_prompt` template defined earlier in this file.
# The instruction follows the same wording as the training `instruction`
# column ("The following question is solved for N marks: <question>.
# Referring to the mark-scheme, ...: <mark scheme>") so the inference prompt
# matches what the model was fine-tuned on.
inference_instruction = (
    "The following question is solved for 2 marks: "
    "Find the derivative of f(x) = 3x^2 + 4cos(x) - 1. "
    "Referring to the mark-scheme, award the appropriate amount of marks "
    "to the student: Correctly apply differentiation rules."
)
inputs = tokenizer(
    [
        alpaca_prompt.format(
            inference_instruction,  # instruction
            "6x^2 - 4sin(x)",  # input (the student's response)
            "",  # output - leave this blank for generation!
        )
    ],
    return_tensors="pt",
).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
# A bare expression only displays in a notebook; print so a script run shows it.
decoded_outputs = tokenizer.batch_decode(outputs)
print(decoded_outputs)
209
+