lxe committed
Commit
40fab0e
1 Parent(s): 12e7ebc

Initial Commit

Files changed (3)
  1. app.py +227 -0
  2. finetune.ipynb +1220 -0
  3. requirements.txt +5 -0
app.py ADDED
@@ -0,0 +1,227 @@
+ import torch
+ import gradio as gr
+ import re
+ import transformers
+ import peft
+ import traceback
+
+ from queue import Queue
+ from threading import Thread
+ import gc
+
+ CUDA_AVAILABLE = torch.cuda.is_available()
+
+ device = torch.device("cuda" if CUDA_AVAILABLE else "cpu")
+
+ tokenizer = transformers.AutoTokenizer.from_pretrained("cerebras/Cerebras-GPT-2.7B")
+ tokenizer.pad_token_id = 0
+
+ model = transformers.AutoModelForCausalLM.from_pretrained(
+     "cerebras/Cerebras-GPT-2.7B",
+     load_in_8bit=CUDA_AVAILABLE,  # 8-bit loading requires a CUDA-enabled bitsandbytes
+     torch_dtype=torch.float16,
+     device_map={'': 0} if CUDA_AVAILABLE else 'auto',
+ )
+
+ model = peft.PeftModel.from_pretrained(
+     model,
+     'lxe/lora-cerebras-gpt2.7b-alpaca-shortprompt',
+     torch_dtype=torch.float16
+ )
+
+ model.half()
+
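+ # Note: the PeftModel wrapper applies the LoRA adapter on top of the quantized
+ # base model; model.half() casts the remaining floating-point parameters to fp16.
+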
+ # Streaming functionality taken from https://github.com/oobabooga/text-generation-webui/blob/master/modules/text_generation.py#L105
+
+ class Stream(transformers.StoppingCriteria):
+     def __init__(self, callback_func=None):
+         self.callback_func = callback_func
+
+     def __call__(self, input_ids, scores) -> bool:
+         if self.callback_func is not None:
+             self.callback_func(input_ids[0])
+         return False
+
+ class Iteratorize:
+     """
+     Transforms a function that takes a callback
+     into a lazy iterator (generator).
+     """
+     def __init__(self, func, kwargs=None, callback=None):
+         self.mfunc = func
+         self.c_callback = callback
+         self.q = Queue()
+         self.sentinel = object()
+         self.kwargs = kwargs or {}
+         self.stop_now = False
+
+         def _callback(val):
+             if self.stop_now:
+                 raise ValueError
+             self.q.put(val)
+
+         def gentask():
+             ret = None
+             try:
+                 ret = self.mfunc(callback=_callback, **self.kwargs)
+             except ValueError:
+                 # Raised by _callback to abort generation early
+                 traceback.print_exc()
+             except Exception:
+                 traceback.print_exc()
+
+             clear_torch_cache()
+             self.q.put(self.sentinel)
+             if self.c_callback:
+                 self.c_callback(ret)
+
+         self.thread = Thread(target=gentask)
+         self.thread.start()
+
+     def __iter__(self):
+         return self
+
+     def __next__(self):
+         obj = self.q.get(True, None)
+         if obj is self.sentinel:
+             raise StopIteration
+         else:
+             return obj
+
+     def __del__(self):
+         clear_torch_cache()
+
+     def __enter__(self):
+         return self
+
+     def __exit__(self, exc_type, exc_val, exc_tb):
+         self.stop_now = True
+         clear_torch_cache()
+
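+ # A minimal usage sketch for Iteratorize, with a hypothetical callback-style
+ # producer standing in for model.generate:
+ #
+ #     def produce(callback=None):
+ #         for i in range(3):
+ #             callback(i)
+ #
+ #     with Iteratorize(produce) as it:
+ #         for value in it:
+ #             print(value)  # yields 0, 1, 2 as they are produced
+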
+ def clear_torch_cache():
+     gc.collect()
+     if CUDA_AVAILABLE:
+         torch.cuda.empty_cache()
+
+ def generate_text(
+     history,
+     max_new_tokens,
+     do_sample,
+     temperature,
+     top_p,
+     top_k,
+     repetition_penalty,
+     typical_p,
+     num_beams
+ ):
+     # Create a conversation context from the last 4 entries in the history
+     inp = ''.join([
+         f"Human: {h[0]}\n\nAssistant: {'' if h[1] is None else h[1]}\n\n" for h in history[-4:]
+     ]).strip()
+
+     input_ids = tokenizer.encode(
+         inp,
+         return_tensors='pt',
+         truncation=True,
+         add_special_tokens=False
+     ).to(device)  # type: ignore
+
+     generate_params = {
+         "input_ids": input_ids,
+         "max_new_tokens": max_new_tokens,
+         "do_sample": do_sample,
+         "temperature": temperature,
+         "top_p": top_p,
+         "top_k": top_k,
+         "repetition_penalty": repetition_penalty,
+         "typical_p": typical_p,
+         "num_beams": num_beams,
+         "stopping_criteria": transformers.StoppingCriteriaList(),
+         "pad_token_id": tokenizer.pad_token_id,
+     }
+
+     def generate_with_callback(callback=None, **kwargs):
+         kwargs['stopping_criteria'].append(Stream(callback_func=callback))
+         clear_torch_cache()
+         with torch.no_grad():
+             model.generate(**kwargs)  # type: ignore
+
+     def generate_with_streaming(**kwargs):
+         return Iteratorize(generate_with_callback, kwargs, callback=None)
+
+     stop_re = re.compile(r'^(Human|Assistant):', re.MULTILINE)
+
+     with generate_with_streaming(**generate_params) as generator:
+         for output in generator:
+             new_tokens = len(output) - len(input_ids[0])
+             reply = tokenizer.decode(output[-new_tokens:], skip_special_tokens=True)
+
+             # A new 'Human:' or 'Assistant:' line means we have reached the
+             # end of the assistant's response; drop the partial line and stop.
+             if re.search(stop_re, reply):
+                 reply = ''.join(reply.split('\n')[:-1])
+                 history[-1][1] = reply.strip()
+                 yield history
+                 break
+
+             # An EOS token means we have reached the end of the conversation
+             if output[-1] in [tokenizer.eos_token_id]:
+                 yield history
+                 break
+
+             history[-1][1] = reply.strip()
+             yield history
+
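+ # For example, a history of [["Hi", "Hello!"], ["How old is the Earth?", None]]
+ # yields the following context string for the model:
+ #
+ #     Human: Hi
+ #
+ #     Assistant: Hello!
+ #
+ #     Human: How old is the Earth?
+ #
+ #     Assistant:
+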
+ with gr.Blocks() as demo:
+     gr.Markdown("""
+     ## Cerebras GPT-2.7B Alpaca-Shortprompt LoRA Demo
+     This is a very fast and relatively coherent chatbot. It uses the [Cerebras-GPT-2.7B](https://huggingface.co/cerebras/Cerebras-GPT-2.7B) model with a LoRA finetuned on the Alpaca dataset using a shorter prompt. The chatbot keeps a very short conversation context as well.
+     """)
+     with gr.Row():
+         with gr.Column():
+             chatbot = gr.Chatbot()
+             msg = gr.Textbox(value="How old is the Earth?", placeholder="Type a message...")
+             with gr.Row():
+                 clear = gr.Button("Clear")
+
+         with gr.Column():
+             max_new_tokens = gr.Slider(0, 2048, 200, step=1, label="max_new_tokens")
+             do_sample = gr.Checkbox(True, label="do_sample")
+             with gr.Row():
+                 with gr.Column():
+                     temperature = gr.Slider(0, 2, 0.5, step=0.01, label="temperature")
+                     top_p = gr.Slider(0, 1, 0.75, step=0.01, label="top_p")
+                     top_k = gr.Slider(0, 100, 80, step=1, label="top_k")
+                 with gr.Column():
+                     repetition_penalty = gr.Slider(0, 10, 1.5, step=0.01, label="repetition_penalty")
+                     typical_p = gr.Slider(0, 1, 1, step=0.01, label="typical_p")
+                     num_beams = gr.Slider(0, 10, 1, step=1, label="num_beams")
+
+     def user(user_message, history):
+         return "", history + [[user_message, None]]
+
+     def fix_history(history):
+         # Replace unanswered turns so the next prompt has no dangling None
+         update_history = False
+         for i, (user, bot) in enumerate(history):
+             if bot is None:
+                 update_history = True
+                 history[i][1] = "_silence_"
+         if update_history:
+             chatbot.update(history)
+
+     msg.submit(user, [msg, chatbot], [msg, chatbot], queue=False).then(
+         generate_text, inputs=[
+             chatbot,
+             max_new_tokens,
+             do_sample,
+             temperature,
+             top_p,
+             top_k,
+             repetition_penalty,
+             typical_p,
+             num_beams
+         ], outputs=[chatbot],
+     ).then(fix_history, chatbot)
+
+     clear.click(lambda: None, None, chatbot, queue=False)
+
+ demo.queue().launch()
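
The core trick in app.py is turning transformers' callback-based streaming hook into a plain Python iterator. The pattern can be exercised on its own, without a model; this is a minimal sketch of the same Queue/Thread design, where `iterate_callback` and `fake_generate` are hypothetical names standing in for Iteratorize and model.generate:

    from queue import Queue
    from threading import Thread

    def iterate_callback(func, **kwargs):
        # Run `func` on a worker thread; every value it hands to `callback`
        # is queued here and yielded as soon as it arrives.
        q = Queue()
        sentinel = object()

        def worker():
            func(callback=q.put, **kwargs)
            q.put(sentinel)  # signal completion

        Thread(target=worker, daemon=True).start()
        while True:
            item = q.get()
            if item is sentinel:
                break
            yield item

    def fake_generate(callback=None, n=3):
        # Hypothetical producer standing in for model.generate plus a
        # streaming StoppingCriteria that fires once per new token.
        for i in range(n):
            callback(f"token-{i}")

    for tok in iterate_callback(fake_generate, n=3):
        print(tok)  # token-0, token-1, token-2, streamed as produced
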
finetune.ipynb ADDED
@@ -0,0 +1,1220 @@
+ {
+  "cells": [
+   {
+    "cell_type": "code",
+    "execution_count": 1,
+    "id": "20b1e7bd",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import torch"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 2,
+    "id": "4e92fff5",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import transformers\n",
+     "\n",
+     "tokenizer = transformers.AutoTokenizer.from_pretrained('cerebras/Cerebras-GPT-2.7B')\n",
+     "tokenizer.pad_token_id = 0"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 13,
+    "id": "77637440",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stderr",
+      "output_type": "stream",
+      "text": [
+       "Found cached dataset json (/root/.cache/huggingface/datasets/json/default-8d265dbd6f34ccd3/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)\n"
+      ]
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "1f5bceec2f7540f9b46c29f8074c4760",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        " 0%| | 0/1 [00:00<?, ?it/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     }
+    ],
+    "source": [
+     "import datasets\n",
+     "dataset = datasets.load_dataset('json', data_files='alpaca_data_cleaned.json')"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 14,
+    "id": "dc81310c",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "DatasetDict({\n",
+       "    train: Dataset({\n",
+       "        features: ['instruction', 'input', 'output'],\n",
+       "        num_rows: 51942\n",
+       "    })\n",
+       "})\n"
+      ]
+     }
+    ],
+    "source": [
+     "print(dataset)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 15,
+    "id": "660f022e",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "cutoff_len = 512\n",
+     "\n",
+     "def generate_prompt(entry):\n",
+     "    if entry['input']:\n",
+     "        return f\"User: {entry['instruction']}: {entry['input']}\\n\\nAssistant: {entry['output']}\"\n",
+     "    else:\n",
+     "        return f\"User: {entry['instruction']}\\n\\nAssistant: {entry['output']}\"\n",
+     "\n",
+     "def tokenize(item, add_eos_token=True):\n",
+     "    result = tokenizer(\n",
+     "        generate_prompt(item),\n",
+     "        truncation=True,\n",
+     "        max_length=cutoff_len,\n",
+     "        padding=False,\n",
+     "        return_tensors=None,\n",
+     "    )\n",
+     "\n",
+     "    if (\n",
+     "        result[\"input_ids\"][-1] != tokenizer.eos_token_id\n",
+     "        and len(result[\"input_ids\"]) < cutoff_len\n",
+     "        and add_eos_token\n",
+     "    ):\n",
+     "        result[\"input_ids\"].append(tokenizer.eos_token_id)\n",
+     "        result[\"attention_mask\"].append(1)\n",
+     "\n",
+     "    result[\"labels\"] = result[\"input_ids\"].copy()\n",
+     "\n",
+     "    return result"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 16,
+    "id": "28bc5713",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "5ec872a8d87d49d79f0b9ed2f1946af1",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "Map: 0%| | 0/41553 [00:00<?, ? examples/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "application/vnd.jupyter.widget-view+json": {
+        "model_id": "3a2a0426dd664b5e892895cbd06fe02a",
+        "version_major": 2,
+        "version_minor": 0
+       },
+       "text/plain": [
+        "Map: 0%| | 0/10389 [00:00<?, ? examples/s]"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     }
+    ],
+    "source": [
+     "train_val = dataset[\"train\"].train_test_split(test_size=0.2, shuffle=True, seed=42)\n",
+     "train_data = train_val[\"train\"].shuffle().map(tokenize)\n",
+     "val_data = train_val[\"test\"].shuffle().map(tokenize)"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 18,
+    "id": "10d2fc55",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "\n",
+       "===================================BUG REPORT===================================\n",
+       "Welcome to bitsandbytes. For bug reports, please submit your error trace to: https://github.com/TimDettmers/bitsandbytes/issues\n",
+       "================================================================================\n",
+       "CUDA SETUP: CUDA runtime path found: /root/miniconda3/envs/llama/lib/libcudart.so\n",
+       "CUDA SETUP: Highest compute capability among GPUs detected: 8.6\n",
+       "CUDA SETUP: Detected CUDA version 117\n",
+       "CUDA SETUP: Loading binary /root/miniconda3/envs/llama/lib/python3.10/site-packages/bitsandbytes/libbitsandbytes_cuda117.so...\n"
+      ]
+     }
+    ],
+    "source": [
+     "if 'model' in globals():\n",
+     "    del model\n",
+     "    torch.cuda.empty_cache()\n",
+     "\n",
+     "model = transformers.AutoModelForCausalLM.from_pretrained(\n",
+     "    'cerebras/Cerebras-GPT-2.7B',\n",
+     "    load_in_8bit=True,\n",
+     "    torch_dtype=torch.float16,\n",
+     "    device_map={'': 0}\n",
+     ")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 19,
+    "id": "2fd1028c",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import peft\n",
+     "\n",
+     "model = peft.prepare_model_for_int8_training(model)\n",
+     "\n",
+     "model = peft.get_peft_model(model, peft.LoraConfig(\n",
+     "    r=8,\n",
+     "    lora_alpha=16,\n",
+     "    # target_modules=[\"q_proj\", \"v_proj\"],\n",
+     "    target_modules=[\"c_attn\"],\n",
+     "    lora_dropout=0.05,\n",
+     "    bias=\"none\",\n",
+     "    task_type=\"CAUSAL_LM\",\n",
+     "))"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 20,
+    "id": "deb33df4",
+    "metadata": {},
+    "outputs": [
+     {
+      "ename": "ValueError",
+      "evalue": "Can't find config.json at 'lora-cerebras-gpt2.7b-alpaca'",
+      "output_type": "error",
+      "traceback": [
+       "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+       "\u001b[0;31mHTTPError\u001b[0m Traceback (most recent call last)",
+       "File \u001b[0;32m~/miniconda3/envs/llama/lib/python3.10/site-packages/huggingface_hub/utils/_errors.py:259\u001b[0m, in \u001b[0;36mhf_raise_for_status\u001b[0;34m(response, endpoint_name)\u001b[0m\n\u001b[1;32m 258\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m--> 259\u001b[0m response\u001b[39m.\u001b[39;49mraise_for_status()\n\u001b[1;32m 260\u001b[0m \u001b[39mexcept\u001b[39;00m HTTPError \u001b[39mas\u001b[39;00m e:\n",
+       "File \u001b[0;32m~/miniconda3/envs/llama/lib/python3.10/site-packages/requests/models.py:1021\u001b[0m, in \u001b[0;36mResponse.raise_for_status\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 1020\u001b[0m \u001b[39mif\u001b[39;00m http_error_msg:\n\u001b[0;32m-> 1021\u001b[0m \u001b[39mraise\u001b[39;00m HTTPError(http_error_msg, response\u001b[39m=\u001b[39m\u001b[39mself\u001b[39m)\n",
+       "\u001b[0;31mHTTPError\u001b[0m: 404 Client Error: Not Found for url: https://huggingface.co/lora-cerebras-gpt2.7b-alpaca/resolve/main/adapter_config.json",
+       "\nThe above exception was the direct cause of the following exception:\n",
+       "\u001b[0;31mRepositoryNotFoundError\u001b[0m Traceback (most recent call last)",
+       "File \u001b[0;32m~/miniconda3/envs/llama/lib/python3.10/site-packages/peft/utils/config.py:99\u001b[0m, in \u001b[0;36mPeftConfigMixin.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, **kwargs)\u001b[0m\n\u001b[1;32m 98\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m---> 99\u001b[0m config_file \u001b[39m=\u001b[39m hf_hub_download(pretrained_model_name_or_path, CONFIG_NAME)\n\u001b[1;32m 100\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m:\n",
+       "File \u001b[0;32m~/miniconda3/envs/llama/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py:120\u001b[0m, in \u001b[0;36mvalidate_hf_hub_args.<locals>._inner_fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 118\u001b[0m kwargs \u001b[39m=\u001b[39m smoothly_deprecate_use_auth_token(fn_name\u001b[39m=\u001b[39mfn\u001b[39m.\u001b[39m\u001b[39m__name__\u001b[39m, has_token\u001b[39m=\u001b[39mhas_token, kwargs\u001b[39m=\u001b[39mkwargs)\n\u001b[0;32m--> 120\u001b[0m \u001b[39mreturn\u001b[39;00m fn(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
+       "File \u001b[0;32m~/miniconda3/envs/llama/lib/python3.10/site-packages/huggingface_hub/file_download.py:1160\u001b[0m, in \u001b[0;36mhf_hub_download\u001b[0;34m(repo_id, filename, subfolder, repo_type, revision, library_name, library_version, cache_dir, local_dir, local_dir_use_symlinks, user_agent, force_download, force_filename, proxies, etag_timeout, resume_download, token, local_files_only, legacy_cache_layout)\u001b[0m\n\u001b[1;32m 1159\u001b[0m \u001b[39mtry\u001b[39;00m:\n\u001b[0;32m-> 1160\u001b[0m metadata \u001b[39m=\u001b[39m get_hf_file_metadata(\n\u001b[1;32m 1161\u001b[0m url\u001b[39m=\u001b[39;49murl,\n\u001b[1;32m 1162\u001b[0m token\u001b[39m=\u001b[39;49mtoken,\n\u001b[1;32m 1163\u001b[0m proxies\u001b[39m=\u001b[39;49mproxies,\n\u001b[1;32m 1164\u001b[0m timeout\u001b[39m=\u001b[39;49metag_timeout,\n\u001b[1;32m 1165\u001b[0m )\n\u001b[1;32m 1166\u001b[0m \u001b[39mexcept\u001b[39;00m EntryNotFoundError \u001b[39mas\u001b[39;00m http_error:\n\u001b[1;32m 1167\u001b[0m \u001b[39m# Cache the non-existence of the file and raise\u001b[39;00m\n",
+       "File \u001b[0;32m~/miniconda3/envs/llama/lib/python3.10/site-packages/huggingface_hub/utils/_validators.py:120\u001b[0m, in \u001b[0;36mvalidate_hf_hub_args.<locals>._inner_fn\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 118\u001b[0m kwargs \u001b[39m=\u001b[39m smoothly_deprecate_use_auth_token(fn_name\u001b[39m=\u001b[39mfn\u001b[39m.\u001b[39m\u001b[39m__name__\u001b[39m, has_token\u001b[39m=\u001b[39mhas_token, kwargs\u001b[39m=\u001b[39mkwargs)\n\u001b[0;32m--> 120\u001b[0m \u001b[39mreturn\u001b[39;00m fn(\u001b[39m*\u001b[39;49margs, \u001b[39m*\u001b[39;49m\u001b[39m*\u001b[39;49mkwargs)\n",
+       "File \u001b[0;32m~/miniconda3/envs/llama/lib/python3.10/site-packages/huggingface_hub/file_download.py:1501\u001b[0m, in \u001b[0;36mget_hf_file_metadata\u001b[0;34m(url, token, proxies, timeout)\u001b[0m\n\u001b[1;32m 1492\u001b[0m r \u001b[39m=\u001b[39m _request_wrapper(\n\u001b[1;32m 1493\u001b[0m method\u001b[39m=\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mHEAD\u001b[39m\u001b[39m\"\u001b[39m,\n\u001b[1;32m 1494\u001b[0m url\u001b[39m=\u001b[39murl,\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 1499\u001b[0m timeout\u001b[39m=\u001b[39mtimeout,\n\u001b[1;32m 1500\u001b[0m )\n\u001b[0;32m-> 1501\u001b[0m hf_raise_for_status(r)\n\u001b[1;32m 1503\u001b[0m \u001b[39m# Return\u001b[39;00m\n",
+       "File \u001b[0;32m~/miniconda3/envs/llama/lib/python3.10/site-packages/huggingface_hub/utils/_errors.py:291\u001b[0m, in \u001b[0;36mhf_raise_for_status\u001b[0;34m(response, endpoint_name)\u001b[0m\n\u001b[1;32m 283\u001b[0m message \u001b[39m=\u001b[39m (\n\u001b[1;32m 284\u001b[0m \u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39m{\u001b[39;00mresponse\u001b[39m.\u001b[39mstatus_code\u001b[39m}\u001b[39;00m\u001b[39m Client Error.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 285\u001b[0m \u001b[39m+\u001b[39m \u001b[39m\"\u001b[39m\u001b[39m\\n\u001b[39;00m\u001b[39m\\n\u001b[39;00m\u001b[39m\"\u001b[39m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 289\u001b[0m \u001b[39m\"\u001b[39m\u001b[39m make sure you are authenticated.\u001b[39m\u001b[39m\"\u001b[39m\n\u001b[1;32m 290\u001b[0m )\n\u001b[0;32m--> 291\u001b[0m \u001b[39mraise\u001b[39;00m RepositoryNotFoundError(message, response) \u001b[39mfrom\u001b[39;00m \u001b[39me\u001b[39;00m\n\u001b[1;32m 293\u001b[0m \u001b[39melif\u001b[39;00m response\u001b[39m.\u001b[39mstatus_code \u001b[39m==\u001b[39m \u001b[39m400\u001b[39m:\n",
+       "\u001b[0;31mRepositoryNotFoundError\u001b[0m: 404 Client Error. (Request ID: Root=1-6424c7f5-7796bb54152221004f83dc73)\n\nRepository Not Found for url: https://huggingface.co/lora-cerebras-gpt2.7b-alpaca/resolve/main/adapter_config.json.\nPlease make sure you specified the correct `repo_id` and `repo_type`.\nIf you are trying to access a private or gated repo, make sure you are authenticated.",
+       "\nDuring handling of the above exception, another exception occurred:\n",
+       "\u001b[0;31mValueError\u001b[0m Traceback (most recent call last)",
+       "Cell \u001b[0;32mIn[20], line 5\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mpeft\u001b[39;00m\n\u001b[1;32m 3\u001b[0m output_dir \u001b[39m=\u001b[39m \u001b[39m'\u001b[39m\u001b[39mlora-cerebras-gpt2.7b-alpaca\u001b[39m\u001b[39m'\u001b[39m\n\u001b[0;32m----> 5\u001b[0m model \u001b[39m=\u001b[39m peft\u001b[39m.\u001b[39;49mPeftModel\u001b[39m.\u001b[39;49mfrom_pretrained(\n\u001b[1;32m 6\u001b[0m model,\n\u001b[1;32m 7\u001b[0m \u001b[39m# 'lora-cerebras-gpt2.7b-hh-rlhf-helpful-online',\u001b[39;49;00m\n\u001b[1;32m 8\u001b[0m output_dir,\n\u001b[1;32m 9\u001b[0m torch_dtype\u001b[39m=\u001b[39;49mtorch\u001b[39m.\u001b[39;49mfloat16\n\u001b[1;32m 10\u001b[0m )\n",
+       "File \u001b[0;32m~/miniconda3/envs/llama/lib/python3.10/site-packages/peft/peft_model.py:135\u001b[0m, in \u001b[0;36mPeftModel.from_pretrained\u001b[0;34m(cls, model, model_id, **kwargs)\u001b[0m\n\u001b[1;32m 132\u001b[0m \u001b[39mfrom\u001b[39;00m \u001b[39m.\u001b[39;00m\u001b[39mmapping\u001b[39;00m \u001b[39mimport\u001b[39;00m MODEL_TYPE_TO_PEFT_MODEL_MAPPING, PEFT_TYPE_TO_CONFIG_MAPPING\n\u001b[1;32m 134\u001b[0m \u001b[39m# load the config\u001b[39;00m\n\u001b[0;32m--> 135\u001b[0m config \u001b[39m=\u001b[39m PEFT_TYPE_TO_CONFIG_MAPPING[PeftConfig\u001b[39m.\u001b[39;49mfrom_pretrained(model_id)\u001b[39m.\u001b[39mpeft_type]\u001b[39m.\u001b[39mfrom_pretrained(model_id)\n\u001b[1;32m 137\u001b[0m \u001b[39mif\u001b[39;00m \u001b[39mgetattr\u001b[39m(model, \u001b[39m\"\u001b[39m\u001b[39mhf_device_map\u001b[39m\u001b[39m\"\u001b[39m, \u001b[39mNone\u001b[39;00m) \u001b[39mis\u001b[39;00m \u001b[39mnot\u001b[39;00m \u001b[39mNone\u001b[39;00m:\n\u001b[1;32m 138\u001b[0m remove_hook_from_submodules(model)\n",
+       "File \u001b[0;32m~/miniconda3/envs/llama/lib/python3.10/site-packages/peft/utils/config.py:101\u001b[0m, in \u001b[0;36mPeftConfigMixin.from_pretrained\u001b[0;34m(cls, pretrained_model_name_or_path, **kwargs)\u001b[0m\n\u001b[1;32m 99\u001b[0m config_file \u001b[39m=\u001b[39m hf_hub_download(pretrained_model_name_or_path, CONFIG_NAME)\n\u001b[1;32m 100\u001b[0m \u001b[39mexcept\u001b[39;00m \u001b[39mException\u001b[39;00m:\n\u001b[0;32m--> 101\u001b[0m \u001b[39mraise\u001b[39;00m \u001b[39mValueError\u001b[39;00m(\u001b[39mf\u001b[39m\u001b[39m\"\u001b[39m\u001b[39mCan\u001b[39m\u001b[39m'\u001b[39m\u001b[39mt find config.json at \u001b[39m\u001b[39m'\u001b[39m\u001b[39m{\u001b[39;00mpretrained_model_name_or_path\u001b[39m}\u001b[39;00m\u001b[39m'\u001b[39m\u001b[39m\"\u001b[39m)\n\u001b[1;32m 103\u001b[0m loaded_attributes \u001b[39m=\u001b[39m \u001b[39mcls\u001b[39m\u001b[39m.\u001b[39mfrom_json_file(config_file)\n\u001b[1;32m 105\u001b[0m config \u001b[39m=\u001b[39m \u001b[39mcls\u001b[39m(\u001b[39m*\u001b[39m\u001b[39m*\u001b[39mkwargs)\n",
+       "\u001b[0;31mValueError\u001b[0m: Can't find config.json at 'lora-cerebras-gpt2.7b-alpaca'"
+      ]
+     }
+    ],
+    "source": [
+     "import peft\n",
+     "\n",
+     "\n",
+     "\n",
+     "model = peft.PeftModel.from_pretrained(\n",
+     "    model,\n",
+     "    # 'lora-cerebras-gpt2.7b-hh-rlhf-helpful-online',\n",
+     "    output_dir,\n",
+     "    torch_dtype=torch.float16\n",
+     ")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 28,
+    "id": "8ec93ed2",
+    "metadata": {},
+    "outputs": [],
+    "source": [
+     "import os\n",
+     "import wandb\n",
+     "\n",
+     "output_dir = 'lora-cerebras-gpt2.7b-alpaca'\n",
+     "\n",
+     "use_wandb = True\n",
+     "wandb_run_name = f\"{output_dir}-{wandb.util.generate_id()}\"\n",
+     "\n",
+     "# set the wandb project where this run will be logged\n",
+     "os.environ[\"WANDB_PROJECT\"] = output_dir\n",
+     "\n",
+     "# save your trained model checkpoint to wandb\n",
+     "os.environ[\"WANDB_LOG_MODEL\"] = \"true\"\n",
+     "\n",
+     "# turn off watch to log faster\n",
+     "os.environ[\"WANDB_WATCH\"] = \"false\"\n",
+     "\n",
+     "training_args = transformers.TrainingArguments(\n",
+     "    per_device_train_batch_size=16,\n",
+     "    gradient_accumulation_steps=8,\n",
+     "    num_train_epochs=3,\n",
+     "    learning_rate=1e-4,\n",
+     "    fp16=True,\n",
+     "    optim=\"adamw_torch\",\n",
+     "    logging_steps=10,\n",
+     "    evaluation_strategy=\"steps\",\n",
+     "    save_strategy=\"steps\",\n",
+     "    eval_steps=200,\n",
+     "    save_steps=200,\n",
+     "    output_dir=output_dir,\n",
+     "    save_total_limit=3,\n",
+     "\n",
+     "    report_to=\"wandb\" if use_wandb else None,\n",
+     "    run_name=wandb_run_name if use_wandb else None,\n",
+     ")"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 32,
+    "id": "2686ecf2",
+    "metadata": {},
+    "outputs": [
+     {
+      "data": {
+       "text/html": [
+        "\n",
+        " <div>\n",
+        " \n",
+        " <progress value='972' max='972' style='width:300px; height:20px; vertical-align: middle;'></progress>\n",
+        " [972/972 27:33, Epoch 2/3]\n",
+        " </div>\n",
+        " <table border=\"1\" class=\"dataframe\">\n",
+        " <thead>\n",
+        " <tr style=\"text-align: left;\">\n",
+        " <th>Step</th>\n",
+        " <th>Training Loss</th>\n",
+        " <th>Validation Loss</th>\n",
+        " </tr>\n",
+        " </thead>\n",
+        " <tbody>\n",
+        " </tbody>\n",
+        "</table><p>"
+       ],
+       "text/plain": [
+        "<IPython.core.display.HTML object>"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "text/html": [
+        "Waiting for W&B process to finish... <strong style=\"color:green\">(success).</strong>"
+       ],
+       "text/plain": [
+        "<IPython.core.display.HTML object>"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "text/html": [
+        "<style>\n",
+        " table.wandb td:nth-child(1) { padding: 0 10px; text-align: left ; width: auto;} td:nth-child(2) {text-align: left ; width: 100%}\n",
+        " .wandb-row { display: flex; flex-direction: row; flex-wrap: wrap; justify-content: flex-start; width: 100% }\n",
+        " .wandb-col { display: flex; flex-direction: column; flex-basis: 100%; flex: 1; padding: 10px; }\n",
+        " </style>\n",
+        "<div class=\"wandb-row\"><div class=\"wandb-col\"><h3>Run history:</h3><br/><table class=\"wandb\"><tr><td>eval/loss</td><td>█▄▂▁</td></tr><tr><td>eval/runtime</td><td>▅█▄▁</td></tr><tr><td>eval/samples_per_second</td><td>▄▁▅█</td></tr><tr><td>eval/steps_per_second</td><td>▄▁▅█</td></tr><tr><td>train/epoch</td><td>▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇██▇▇▇███</td></tr><tr><td>train/global_step</td><td>▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇██▇▇▇███</td></tr><tr><td>train/learning_rate</td><td>████▇▇▇▇▇▆▆▆▆▆▅▅▅▅▄▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▂▂▂▂▁▁</td></tr><tr><td>train/loss</td><td>█▃▃▂▂▂▂▂▂▂▂▁▂▁▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁</td></tr><tr><td>train/total_flos</td><td>▁█</td></tr><tr><td>train/train_loss</td><td>█▁</td></tr><tr><td>train/train_runtime</td><td>█▁</td></tr><tr><td>train/train_samples_per_second</td><td>▁█</td></tr><tr><td>train/train_steps_per_second</td><td>▁█</td></tr></table><br/></div><div class=\"wandb-col\"><h3>Run summary:</h3><br/><table class=\"wandb\"><tr><td>eval/loss</td><td>1.69353</td></tr><tr><td>eval/runtime</td><td>213.477</td></tr><tr><td>eval/samples_per_second</td><td>48.666</td></tr><tr><td>eval/steps_per_second</td><td>6.085</td></tr><tr><td>train/epoch</td><td>3.0</td></tr><tr><td>train/global_step</td><td>972</td></tr><tr><td>train/learning_rate</td><td>0.0</td></tr><tr><td>train/loss</td><td>1.7007</td></tr><tr><td>train/total_flos</td><td>4.1553623137959936e+17</td></tr><tr><td>train/train_loss</td><td>0.29741</td></tr><tr><td>train/train_runtime</td><td>1642.1473</td></tr><tr><td>train/train_samples_per_second</td><td>75.912</td></tr><tr><td>train/train_steps_per_second</td><td>0.592</td></tr></table><br/></div></div>"
+       ],
+       "text/plain": [
+        "<IPython.core.display.HTML object>"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "text/html": [
+        " View run <strong style=\"color:#cdcd00\">lora-cerebras-gpt2.7b-alpaca-jecyepye</strong> at: <a href='https://wandb.ai/lxelxe/lora-cerebras-gpt2.7b-alpaca/runs/3up74y7g' target=\"_blank\">https://wandb.ai/lxelxe/lora-cerebras-gpt2.7b-alpaca/runs/3up74y7g</a><br/>Synced 6 W&B file(s), 0 media file(s), 2 artifact file(s) and 0 other file(s)"
+       ],
+       "text/plain": [
+        "<IPython.core.display.HTML object>"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     },
+     {
+      "data": {
+       "text/html": [
+        "Find logs at: <code>./wandb/run-20230329_232219-3up74y7g/logs</code>"
+       ],
+       "text/plain": [
+        "<IPython.core.display.HTML object>"
+       ]
+      },
+      "metadata": {},
+      "output_type": "display_data"
+     }
+    ],
+    "source": [
+     "trainer = transformers.Trainer(\n",
+     "    model=model,\n",
+     "    train_dataset=train_data,\n",
+     "    eval_dataset=val_data,\n",
+     "    args=training_args,\n",
+     "    data_collator=transformers.DataCollatorForSeq2Seq(\n",
+     "        tokenizer, pad_to_multiple_of=8, return_tensors=\"pt\", padding=True\n",
+     "    ),\n",
+     ")\n",
+     "\n",
+     "model.config.use_cache = False\n",
+     "result = trainer.train('lora-cerebras-gpt2.7b-alpaca/checkpoint-800')\n",
+     "model.save_pretrained(output_dir)\n",
+     "\n",
+     "wandb.finish()"
+    ]
+   },
+   {
+    "cell_type": "code",
+    "execution_count": 33,
+    "id": "27e9ad70",
+    "metadata": {},
+    "outputs": [
+     {
+      "name": "stdout",
+      "output_type": "stream",
+      "text": [
+       "torch.float16\n"
+      ]
+     },
+     {
+      "data": {
+       "text/plain": [
+ "PeftModelForCausalLM(\n",
432
+ " (base_model): LoraModel(\n",
433
+ " (model): GPT2LMHeadModel(\n",
434
+ " (transformer): GPT2Model(\n",
435
+ " (wte): Embedding(50257, 2560)\n",
436
+ " (wpe): Embedding(2048, 2560)\n",
437
+ " (drop): Dropout(p=0.0, inplace=False)\n",
438
+ " (h): ModuleList(\n",
439
+ " (0): GPT2Block(\n",
440
+ " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
441
+ " (attn): GPT2Attention(\n",
442
+ " (c_attn): MergedLinear(\n",
443
+ " in_features=2560, out_features=7680, bias=True\n",
444
+ " (lora_dropout): Dropout(p=0.05, inplace=False)\n",
445
+ " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
446
+ " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
447
+ " )\n",
448
+ " (c_proj): Conv1D()\n",
449
+ " (attn_dropout): Dropout(p=0.0, inplace=False)\n",
450
+ " (resid_dropout): Dropout(p=0.0, inplace=False)\n",
451
+ " )\n",
452
+ " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
453
+ " (mlp): GPT2MLP(\n",
454
+ " (c_fc): Conv1D()\n",
455
+ " (c_proj): Conv1D()\n",
456
+ " (act): GELUActivation()\n",
457
+ " (dropout): Dropout(p=0.0, inplace=False)\n",
458
+ " )\n",
459
+ " )\n",
460
+ " (1): GPT2Block(\n",
461
+ " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
462
+ " (attn): GPT2Attention(\n",
463
+ " (c_attn): MergedLinear(\n",
464
+ " in_features=2560, out_features=7680, bias=True\n",
465
+ " (lora_dropout): Dropout(p=0.05, inplace=False)\n",
466
+ " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
467
+ " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
468
+ " )\n",
469
+ " (c_proj): Conv1D()\n",
470
+ " (attn_dropout): Dropout(p=0.0, inplace=False)\n",
471
+ " (resid_dropout): Dropout(p=0.0, inplace=False)\n",
472
+ " )\n",
473
+ " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
474
+ " (mlp): GPT2MLP(\n",
475
+ " (c_fc): Conv1D()\n",
476
+ " (c_proj): Conv1D()\n",
477
+ " (act): GELUActivation()\n",
478
+ " (dropout): Dropout(p=0.0, inplace=False)\n",
479
+ " )\n",
480
+ " )\n",
481
+ " (2): GPT2Block(\n",
482
+ " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
483
+ " (attn): GPT2Attention(\n",
484
+ " (c_attn): MergedLinear(\n",
485
+ " in_features=2560, out_features=7680, bias=True\n",
486
+ " (lora_dropout): Dropout(p=0.05, inplace=False)\n",
487
+ " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
488
+ " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
489
+ " )\n",
490
+ " (c_proj): Conv1D()\n",
491
+ " (attn_dropout): Dropout(p=0.0, inplace=False)\n",
492
+ " (resid_dropout): Dropout(p=0.0, inplace=False)\n",
493
+ " )\n",
494
+ " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
495
+ " (mlp): GPT2MLP(\n",
496
+ " (c_fc): Conv1D()\n",
497
+ " (c_proj): Conv1D()\n",
498
+ " (act): GELUActivation()\n",
499
+ " (dropout): Dropout(p=0.0, inplace=False)\n",
500
+ " )\n",
501
+ " )\n",
502
+ " (3): GPT2Block(\n",
503
+ " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
504
+ " (attn): GPT2Attention(\n",
505
+ " (c_attn): MergedLinear(\n",
506
+ " in_features=2560, out_features=7680, bias=True\n",
507
+ " (lora_dropout): Dropout(p=0.05, inplace=False)\n",
508
+ " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
509
+ " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
510
+ " )\n",
511
+ " (c_proj): Conv1D()\n",
512
+ " (attn_dropout): Dropout(p=0.0, inplace=False)\n",
513
+ " (resid_dropout): Dropout(p=0.0, inplace=False)\n",
514
+ " )\n",
515
+ " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
516
+ " (mlp): GPT2MLP(\n",
517
+ " (c_fc): Conv1D()\n",
518
+ " (c_proj): Conv1D()\n",
519
+ " (act): GELUActivation()\n",
520
+ " (dropout): Dropout(p=0.0, inplace=False)\n",
521
+ " )\n",
522
+ " )\n",
523
+ " (4): GPT2Block(\n",
524
+ " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
525
+ " (attn): GPT2Attention(\n",
526
+ " (c_attn): MergedLinear(\n",
527
+ " in_features=2560, out_features=7680, bias=True\n",
528
+ " (lora_dropout): Dropout(p=0.05, inplace=False)\n",
529
+ " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
530
+ " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
531
+ " )\n",
532
+ " (c_proj): Conv1D()\n",
533
+ " (attn_dropout): Dropout(p=0.0, inplace=False)\n",
534
+ " (resid_dropout): Dropout(p=0.0, inplace=False)\n",
535
+ " )\n",
536
+ " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
537
+ " (mlp): GPT2MLP(\n",
538
+ " (c_fc): Conv1D()\n",
539
+ " (c_proj): Conv1D()\n",
540
+ " (act): GELUActivation()\n",
541
+ " (dropout): Dropout(p=0.0, inplace=False)\n",
542
+ " )\n",
543
+ " )\n",
544
+ " (5): GPT2Block(\n",
545
+ " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
546
+ " (attn): GPT2Attention(\n",
547
+ " (c_attn): MergedLinear(\n",
548
+ " in_features=2560, out_features=7680, bias=True\n",
549
+ " (lora_dropout): Dropout(p=0.05, inplace=False)\n",
550
+ " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
551
+ " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
552
+ " )\n",
553
+ " (c_proj): Conv1D()\n",
554
+ " (attn_dropout): Dropout(p=0.0, inplace=False)\n",
555
+ " (resid_dropout): Dropout(p=0.0, inplace=False)\n",
556
+ " )\n",
557
+ " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
558
+ " (mlp): GPT2MLP(\n",
559
+ " (c_fc): Conv1D()\n",
560
+ " (c_proj): Conv1D()\n",
561
+ " (act): GELUActivation()\n",
562
+ " (dropout): Dropout(p=0.0, inplace=False)\n",
563
+ " )\n",
564
+ " )\n",
565
+ " (6): GPT2Block(\n",
566
+ " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
567
+ " (attn): GPT2Attention(\n",
568
+ " (c_attn): MergedLinear(\n",
569
+ " in_features=2560, out_features=7680, bias=True\n",
570
+ " (lora_dropout): Dropout(p=0.05, inplace=False)\n",
571
+ " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
572
+ " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
573
+ " )\n",
574
+ " (c_proj): Conv1D()\n",
575
+ " (attn_dropout): Dropout(p=0.0, inplace=False)\n",
576
+ " (resid_dropout): Dropout(p=0.0, inplace=False)\n",
577
+ " )\n",
578
+ " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
579
+ " (mlp): GPT2MLP(\n",
580
+ " (c_fc): Conv1D()\n",
581
+ " (c_proj): Conv1D()\n",
582
+ " (act): GELUActivation()\n",
583
+ " (dropout): Dropout(p=0.0, inplace=False)\n",
584
+ " )\n",
585
+ " )\n",
586
+ " (7): GPT2Block(\n",
587
+ " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
588
+ " (attn): GPT2Attention(\n",
589
+ " (c_attn): MergedLinear(\n",
590
+ " in_features=2560, out_features=7680, bias=True\n",
591
+ " (lora_dropout): Dropout(p=0.05, inplace=False)\n",
592
+ " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
593
+ " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
594
+ " )\n",
595
+ " (c_proj): Conv1D()\n",
596
+ " (attn_dropout): Dropout(p=0.0, inplace=False)\n",
597
+ " (resid_dropout): Dropout(p=0.0, inplace=False)\n",
598
+ " )\n",
599
+ " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
600
+ " (mlp): GPT2MLP(\n",
601
+ " (c_fc): Conv1D()\n",
602
+ " (c_proj): Conv1D()\n",
603
+ " (act): GELUActivation()\n",
604
+ " (dropout): Dropout(p=0.0, inplace=False)\n",
605
+ " )\n",
606
+ " )\n",
607
+ " (8): GPT2Block(\n",
608
+ " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
609
+ " (attn): GPT2Attention(\n",
610
+ " (c_attn): MergedLinear(\n",
611
+ " in_features=2560, out_features=7680, bias=True\n",
612
+ " (lora_dropout): Dropout(p=0.05, inplace=False)\n",
613
+ " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
614
+ " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
615
+ " )\n",
616
+ " (c_proj): Conv1D()\n",
617
+ " (attn_dropout): Dropout(p=0.0, inplace=False)\n",
618
+ " (resid_dropout): Dropout(p=0.0, inplace=False)\n",
619
+ " )\n",
620
+ " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
621
+ " (mlp): GPT2MLP(\n",
622
+ " (c_fc): Conv1D()\n",
623
+ " (c_proj): Conv1D()\n",
624
+ " (act): GELUActivation()\n",
625
+ " (dropout): Dropout(p=0.0, inplace=False)\n",
626
+ " )\n",
627
+ " )\n",
628
+ " (9): GPT2Block(\n",
629
+ " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
630
+ " (attn): GPT2Attention(\n",
631
+ " (c_attn): MergedLinear(\n",
632
+ " in_features=2560, out_features=7680, bias=True\n",
633
+ " (lora_dropout): Dropout(p=0.05, inplace=False)\n",
634
+ " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
635
+ " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
636
+ " )\n",
637
+ " (c_proj): Conv1D()\n",
638
+ " (attn_dropout): Dropout(p=0.0, inplace=False)\n",
639
+ " (resid_dropout): Dropout(p=0.0, inplace=False)\n",
640
+ " )\n",
641
+ " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
642
+ " (mlp): GPT2MLP(\n",
643
+ " (c_fc): Conv1D()\n",
644
+ " (c_proj): Conv1D()\n",
645
+ " (act): GELUActivation()\n",
646
+ " (dropout): Dropout(p=0.0, inplace=False)\n",
647
+ " )\n",
648
+ " )\n",
649
+ " (10): GPT2Block(\n",
650
+ " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
651
+ " (attn): GPT2Attention(\n",
652
+ " (c_attn): MergedLinear(\n",
653
+ " in_features=2560, out_features=7680, bias=True\n",
654
+ " (lora_dropout): Dropout(p=0.05, inplace=False)\n",
655
+ " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
656
+ " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
657
+ " )\n",
658
+ " (c_proj): Conv1D()\n",
659
+ " (attn_dropout): Dropout(p=0.0, inplace=False)\n",
660
+ " (resid_dropout): Dropout(p=0.0, inplace=False)\n",
661
+ " )\n",
662
+ " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
663
+ " (mlp): GPT2MLP(\n",
664
+ " (c_fc): Conv1D()\n",
665
+ " (c_proj): Conv1D()\n",
666
+ " (act): GELUActivation()\n",
667
+ " (dropout): Dropout(p=0.0, inplace=False)\n",
668
+ " )\n",
669
+ " )\n",
670
+ " (11): GPT2Block(\n",
671
+ " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
672
+ " (attn): GPT2Attention(\n",
673
+ " (c_attn): MergedLinear(\n",
674
+ " in_features=2560, out_features=7680, bias=True\n",
675
+ " (lora_dropout): Dropout(p=0.05, inplace=False)\n",
676
+ " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
677
+ " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
678
+ " )\n",
679
+ " (c_proj): Conv1D()\n",
680
+ " (attn_dropout): Dropout(p=0.0, inplace=False)\n",
681
+ " (resid_dropout): Dropout(p=0.0, inplace=False)\n",
682
+ " )\n",
683
+ " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
684
+ " (mlp): GPT2MLP(\n",
685
+ " (c_fc): Conv1D()\n",
686
+ " (c_proj): Conv1D()\n",
687
+ " (act): GELUActivation()\n",
688
+ " (dropout): Dropout(p=0.0, inplace=False)\n",
689
+ " )\n",
690
+ " )\n",
691
+ " (12): GPT2Block(\n",
692
+ " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
693
+ " (attn): GPT2Attention(\n",
694
+ " (c_attn): MergedLinear(\n",
695
+ " in_features=2560, out_features=7680, bias=True\n",
696
+ " (lora_dropout): Dropout(p=0.05, inplace=False)\n",
697
+ " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
698
+ " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
699
+ " )\n",
700
+ " (c_proj): Conv1D()\n",
701
+ " (attn_dropout): Dropout(p=0.0, inplace=False)\n",
702
+ " (resid_dropout): Dropout(p=0.0, inplace=False)\n",
703
+ " )\n",
704
+ " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
705
+ " (mlp): GPT2MLP(\n",
706
+ " (c_fc): Conv1D()\n",
707
+ " (c_proj): Conv1D()\n",
708
+ " (act): GELUActivation()\n",
709
+ " (dropout): Dropout(p=0.0, inplace=False)\n",
710
+ " )\n",
711
+ " )\n",
712
+ " (13): GPT2Block(\n",
713
+ " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
714
+ " (attn): GPT2Attention(\n",
715
+ " (c_attn): MergedLinear(\n",
716
+ " in_features=2560, out_features=7680, bias=True\n",
717
+ " (lora_dropout): Dropout(p=0.05, inplace=False)\n",
718
+ " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
719
+ " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
720
+ " )\n",
721
+ " (c_proj): Conv1D()\n",
722
+ " (attn_dropout): Dropout(p=0.0, inplace=False)\n",
723
+ " (resid_dropout): Dropout(p=0.0, inplace=False)\n",
724
+ " )\n",
725
+ " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
726
+ " (mlp): GPT2MLP(\n",
727
+ " (c_fc): Conv1D()\n",
728
+ " (c_proj): Conv1D()\n",
729
+ " (act): GELUActivation()\n",
730
+ " (dropout): Dropout(p=0.0, inplace=False)\n",
731
+ " )\n",
732
+ " )\n",
733
+ " (14): GPT2Block(\n",
734
+ " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
735
+ " (attn): GPT2Attention(\n",
736
+ " (c_attn): MergedLinear(\n",
737
+ " in_features=2560, out_features=7680, bias=True\n",
738
+ " (lora_dropout): Dropout(p=0.05, inplace=False)\n",
739
+ " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
740
+ " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
741
+ " )\n",
742
+ " (c_proj): Conv1D()\n",
743
+ " (attn_dropout): Dropout(p=0.0, inplace=False)\n",
744
+ " (resid_dropout): Dropout(p=0.0, inplace=False)\n",
745
+ " )\n",
746
+ " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
747
+ " (mlp): GPT2MLP(\n",
748
+ " (c_fc): Conv1D()\n",
749
+ " (c_proj): Conv1D()\n",
750
+ " (act): GELUActivation()\n",
751
+ " (dropout): Dropout(p=0.0, inplace=False)\n",
752
+ " )\n",
753
+ " )\n",
754
+ " (15): GPT2Block(\n",
755
+ " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
756
+ " (attn): GPT2Attention(\n",
757
+ " (c_attn): MergedLinear(\n",
758
+ " in_features=2560, out_features=7680, bias=True\n",
759
+ " (lora_dropout): Dropout(p=0.05, inplace=False)\n",
760
+ " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
761
+ " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
762
+ " )\n",
763
+ " (c_proj): Conv1D()\n",
764
+ " (attn_dropout): Dropout(p=0.0, inplace=False)\n",
765
+ " (resid_dropout): Dropout(p=0.0, inplace=False)\n",
766
+ " )\n",
767
+ " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
768
+ " (mlp): GPT2MLP(\n",
769
+ " (c_fc): Conv1D()\n",
770
+ " (c_proj): Conv1D()\n",
771
+ " (act): GELUActivation()\n",
772
+ " (dropout): Dropout(p=0.0, inplace=False)\n",
773
+ " )\n",
774
+ " )\n",
775
+ " (16): GPT2Block(\n",
776
+ " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
777
+ " (attn): GPT2Attention(\n",
778
+ " (c_attn): MergedLinear(\n",
779
+ " in_features=2560, out_features=7680, bias=True\n",
780
+ " (lora_dropout): Dropout(p=0.05, inplace=False)\n",
781
+ " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
782
+ " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
783
+ " )\n",
784
+ " (c_proj): Conv1D()\n",
785
+ " (attn_dropout): Dropout(p=0.0, inplace=False)\n",
786
+ " (resid_dropout): Dropout(p=0.0, inplace=False)\n",
787
+ " )\n",
788
+ " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
789
+ " (mlp): GPT2MLP(\n",
790
+ " (c_fc): Conv1D()\n",
791
+ " (c_proj): Conv1D()\n",
792
+ " (act): GELUActivation()\n",
793
+ " (dropout): Dropout(p=0.0, inplace=False)\n",
794
+ " )\n",
795
+ " )\n",
796
+ " (17): GPT2Block(\n",
797
+ " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
798
+ " (attn): GPT2Attention(\n",
799
+ " (c_attn): MergedLinear(\n",
800
+ " in_features=2560, out_features=7680, bias=True\n",
801
+ " (lora_dropout): Dropout(p=0.05, inplace=False)\n",
802
+ " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
803
+ " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
804
+ " )\n",
805
+ " (c_proj): Conv1D()\n",
806
+ " (attn_dropout): Dropout(p=0.0, inplace=False)\n",
807
+ " (resid_dropout): Dropout(p=0.0, inplace=False)\n",
808
+ " )\n",
809
+ " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
810
+ " (mlp): GPT2MLP(\n",
811
+ " (c_fc): Conv1D()\n",
812
+ " (c_proj): Conv1D()\n",
813
+ " (act): GELUActivation()\n",
814
+ " (dropout): Dropout(p=0.0, inplace=False)\n",
815
+ " )\n",
816
+ " )\n",
817
+ " (18): GPT2Block(\n",
818
+ " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
819
+ " (attn): GPT2Attention(\n",
820
+ " (c_attn): MergedLinear(\n",
821
+ " in_features=2560, out_features=7680, bias=True\n",
822
+ " (lora_dropout): Dropout(p=0.05, inplace=False)\n",
823
+ " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
824
+ " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
825
+ " )\n",
826
+ " (c_proj): Conv1D()\n",
827
+ " (attn_dropout): Dropout(p=0.0, inplace=False)\n",
828
+ " (resid_dropout): Dropout(p=0.0, inplace=False)\n",
829
+ " )\n",
830
+ " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
831
+ " (mlp): GPT2MLP(\n",
832
+ " (c_fc): Conv1D()\n",
833
+ " (c_proj): Conv1D()\n",
834
+ " (act): GELUActivation()\n",
835
+ " (dropout): Dropout(p=0.0, inplace=False)\n",
836
+ " )\n",
837
+ " )\n",
838
+ " (19): GPT2Block(\n",
839
+ " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
840
+ " (attn): GPT2Attention(\n",
841
+ " (c_attn): MergedLinear(\n",
842
+ " in_features=2560, out_features=7680, bias=True\n",
843
+ " (lora_dropout): Dropout(p=0.05, inplace=False)\n",
844
+ " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
845
+ " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
846
+ " )\n",
847
+ " (c_proj): Conv1D()\n",
848
+ " (attn_dropout): Dropout(p=0.0, inplace=False)\n",
849
+ " (resid_dropout): Dropout(p=0.0, inplace=False)\n",
850
+ " )\n",
851
+ " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
852
+ " (mlp): GPT2MLP(\n",
853
+ " (c_fc): Conv1D()\n",
854
+ " (c_proj): Conv1D()\n",
855
+ " (act): GELUActivation()\n",
856
+ " (dropout): Dropout(p=0.0, inplace=False)\n",
857
+ " )\n",
858
+ " )\n",
859
+ " (20): GPT2Block(\n",
860
+ " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
861
+ " (attn): GPT2Attention(\n",
862
+ " (c_attn): MergedLinear(\n",
863
+ " in_features=2560, out_features=7680, bias=True\n",
864
+ " (lora_dropout): Dropout(p=0.05, inplace=False)\n",
865
+ " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
866
+ " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
867
+ " )\n",
868
+ " (c_proj): Conv1D()\n",
869
+ " (attn_dropout): Dropout(p=0.0, inplace=False)\n",
870
+ " (resid_dropout): Dropout(p=0.0, inplace=False)\n",
871
+ " )\n",
872
+ " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
873
+ " (mlp): GPT2MLP(\n",
874
+ " (c_fc): Conv1D()\n",
875
+ " (c_proj): Conv1D()\n",
876
+ " (act): GELUActivation()\n",
877
+ " (dropout): Dropout(p=0.0, inplace=False)\n",
878
+ " )\n",
879
+ " )\n",
880
+ " (21): GPT2Block(\n",
881
+ " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
882
+ " (attn): GPT2Attention(\n",
883
+ " (c_attn): MergedLinear(\n",
884
+ " in_features=2560, out_features=7680, bias=True\n",
885
+ " (lora_dropout): Dropout(p=0.05, inplace=False)\n",
886
+ " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
887
+ " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
888
+ " )\n",
889
+ " (c_proj): Conv1D()\n",
890
+ " (attn_dropout): Dropout(p=0.0, inplace=False)\n",
891
+ " (resid_dropout): Dropout(p=0.0, inplace=False)\n",
892
+ " )\n",
893
+ " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
894
+ " (mlp): GPT2MLP(\n",
895
+ " (c_fc): Conv1D()\n",
896
+ " (c_proj): Conv1D()\n",
897
+ " (act): GELUActivation()\n",
898
+ " (dropout): Dropout(p=0.0, inplace=False)\n",
899
+ " )\n",
900
+ " )\n",
901
+ " (22): GPT2Block(\n",
902
+ " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
903
+ " (attn): GPT2Attention(\n",
904
+ " (c_attn): MergedLinear(\n",
905
+ " in_features=2560, out_features=7680, bias=True\n",
906
+ " (lora_dropout): Dropout(p=0.05, inplace=False)\n",
907
+ " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
908
+ " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
909
+ " )\n",
910
+ " (c_proj): Conv1D()\n",
911
+ " (attn_dropout): Dropout(p=0.0, inplace=False)\n",
912
+ " (resid_dropout): Dropout(p=0.0, inplace=False)\n",
913
+ " )\n",
914
+ " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
915
+ " (mlp): GPT2MLP(\n",
916
+ " (c_fc): Conv1D()\n",
917
+ " (c_proj): Conv1D()\n",
918
+ " (act): GELUActivation()\n",
919
+ " (dropout): Dropout(p=0.0, inplace=False)\n",
920
+ " )\n",
921
+ " )\n",
922
+ " (23): GPT2Block(\n",
923
+ " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
924
+ " (attn): GPT2Attention(\n",
925
+ " (c_attn): MergedLinear(\n",
926
+ " in_features=2560, out_features=7680, bias=True\n",
927
+ " (lora_dropout): Dropout(p=0.05, inplace=False)\n",
928
+ " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
929
+ " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
930
+ " )\n",
931
+ " (c_proj): Conv1D()\n",
932
+ " (attn_dropout): Dropout(p=0.0, inplace=False)\n",
933
+ " (resid_dropout): Dropout(p=0.0, inplace=False)\n",
934
+ " )\n",
935
+ " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
936
+ " (mlp): GPT2MLP(\n",
937
+ " (c_fc): Conv1D()\n",
938
+ " (c_proj): Conv1D()\n",
939
+ " (act): GELUActivation()\n",
940
+ " (dropout): Dropout(p=0.0, inplace=False)\n",
941
+ " )\n",
942
+ " )\n",
943
+ " (24): GPT2Block(\n",
944
+ " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
945
+ " (attn): GPT2Attention(\n",
946
+ " (c_attn): MergedLinear(\n",
947
+ " in_features=2560, out_features=7680, bias=True\n",
948
+ " (lora_dropout): Dropout(p=0.05, inplace=False)\n",
949
+ " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
950
+ " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
951
+ " )\n",
952
+ " (c_proj): Conv1D()\n",
953
+ " (attn_dropout): Dropout(p=0.0, inplace=False)\n",
954
+ " (resid_dropout): Dropout(p=0.0, inplace=False)\n",
955
+ " )\n",
956
+ " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
957
+ " (mlp): GPT2MLP(\n",
958
+ " (c_fc): Conv1D()\n",
959
+ " (c_proj): Conv1D()\n",
960
+ " (act): GELUActivation()\n",
961
+ " (dropout): Dropout(p=0.0, inplace=False)\n",
962
+ " )\n",
963
+ " )\n",
964
+ " (25): GPT2Block(\n",
965
+ " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
966
+ " (attn): GPT2Attention(\n",
967
+ " (c_attn): MergedLinear(\n",
968
+ " in_features=2560, out_features=7680, bias=True\n",
969
+ " (lora_dropout): Dropout(p=0.05, inplace=False)\n",
970
+ " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
971
+ " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
972
+ " )\n",
973
+ " (c_proj): Conv1D()\n",
974
+ " (attn_dropout): Dropout(p=0.0, inplace=False)\n",
975
+ " (resid_dropout): Dropout(p=0.0, inplace=False)\n",
976
+ " )\n",
977
+ " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
978
+ " (mlp): GPT2MLP(\n",
979
+ " (c_fc): Conv1D()\n",
980
+ " (c_proj): Conv1D()\n",
981
+ " (act): GELUActivation()\n",
982
+ " (dropout): Dropout(p=0.0, inplace=False)\n",
983
+ " )\n",
984
+ " )\n",
985
+ " (26): GPT2Block(\n",
986
+ " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
987
+ " (attn): GPT2Attention(\n",
988
+ " (c_attn): MergedLinear(\n",
989
+ " in_features=2560, out_features=7680, bias=True\n",
990
+ " (lora_dropout): Dropout(p=0.05, inplace=False)\n",
991
+ " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
992
+ " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
993
+ " )\n",
994
+ " (c_proj): Conv1D()\n",
995
+ " (attn_dropout): Dropout(p=0.0, inplace=False)\n",
996
+ " (resid_dropout): Dropout(p=0.0, inplace=False)\n",
997
+ " )\n",
998
+ " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
999
+ " (mlp): GPT2MLP(\n",
1000
+ " (c_fc): Conv1D()\n",
1001
+ " (c_proj): Conv1D()\n",
1002
+ " (act): GELUActivation()\n",
1003
+ " (dropout): Dropout(p=0.0, inplace=False)\n",
1004
+ " )\n",
1005
+ " )\n",
1006
+ " (27): GPT2Block(\n",
1007
+ " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
1008
+ " (attn): GPT2Attention(\n",
1009
+ " (c_attn): MergedLinear(\n",
1010
+ " in_features=2560, out_features=7680, bias=True\n",
1011
+ " (lora_dropout): Dropout(p=0.05, inplace=False)\n",
1012
+ " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
1013
+ " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
1014
+ " )\n",
1015
+ " (c_proj): Conv1D()\n",
1016
+ " (attn_dropout): Dropout(p=0.0, inplace=False)\n",
1017
+ " (resid_dropout): Dropout(p=0.0, inplace=False)\n",
1018
+ " )\n",
1019
+ " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
1020
+ " (mlp): GPT2MLP(\n",
1021
+ " (c_fc): Conv1D()\n",
1022
+ " (c_proj): Conv1D()\n",
1023
+ " (act): GELUActivation()\n",
1024
+ " (dropout): Dropout(p=0.0, inplace=False)\n",
1025
+ " )\n",
1026
+ " )\n",
1027
+ " (28): GPT2Block(\n",
1028
+ " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
1029
+ " (attn): GPT2Attention(\n",
1030
+ " (c_attn): MergedLinear(\n",
1031
+ " in_features=2560, out_features=7680, bias=True\n",
1032
+ " (lora_dropout): Dropout(p=0.05, inplace=False)\n",
1033
+ " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
1034
+ " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
1035
+ " )\n",
1036
+ " (c_proj): Conv1D()\n",
1037
+ " (attn_dropout): Dropout(p=0.0, inplace=False)\n",
1038
+ " (resid_dropout): Dropout(p=0.0, inplace=False)\n",
1039
+ " )\n",
1040
+ " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
1041
+ " (mlp): GPT2MLP(\n",
1042
+ " (c_fc): Conv1D()\n",
1043
+ " (c_proj): Conv1D()\n",
1044
+ " (act): GELUActivation()\n",
1045
+ " (dropout): Dropout(p=0.0, inplace=False)\n",
1046
+ " )\n",
1047
+ " )\n",
1048
+ " (29): GPT2Block(\n",
1049
+ " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
1050
+ " (attn): GPT2Attention(\n",
1051
+ " (c_attn): MergedLinear(\n",
1052
+ " in_features=2560, out_features=7680, bias=True\n",
1053
+ " (lora_dropout): Dropout(p=0.05, inplace=False)\n",
1054
+ " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
1055
+ " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
1056
+ " )\n",
1057
+ " (c_proj): Conv1D()\n",
1058
+ " (attn_dropout): Dropout(p=0.0, inplace=False)\n",
1059
+ " (resid_dropout): Dropout(p=0.0, inplace=False)\n",
1060
+ " )\n",
1061
+ " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
1062
+ " (mlp): GPT2MLP(\n",
1063
+ " (c_fc): Conv1D()\n",
1064
+ " (c_proj): Conv1D()\n",
1065
+ " (act): GELUActivation()\n",
1066
+ " (dropout): Dropout(p=0.0, inplace=False)\n",
1067
+ " )\n",
1068
+ " )\n",
1069
+ " (30): GPT2Block(\n",
1070
+ " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
1071
+ " (attn): GPT2Attention(\n",
1072
+ " (c_attn): MergedLinear(\n",
1073
+ " in_features=2560, out_features=7680, bias=True\n",
1074
+ " (lora_dropout): Dropout(p=0.05, inplace=False)\n",
1075
+ " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
1076
+ " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
1077
+ " )\n",
1078
+ " (c_proj): Conv1D()\n",
1079
+ " (attn_dropout): Dropout(p=0.0, inplace=False)\n",
1080
+ " (resid_dropout): Dropout(p=0.0, inplace=False)\n",
1081
+ " )\n",
1082
+ " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
1083
+ " (mlp): GPT2MLP(\n",
1084
+ " (c_fc): Conv1D()\n",
1085
+ " (c_proj): Conv1D()\n",
1086
+ " (act): GELUActivation()\n",
1087
+ " (dropout): Dropout(p=0.0, inplace=False)\n",
1088
+ " )\n",
1089
+ " )\n",
1090
+ " (31): GPT2Block(\n",
1091
+ " (ln_1): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
1092
+ " (attn): GPT2Attention(\n",
1093
+ " (c_attn): MergedLinear(\n",
1094
+ " in_features=2560, out_features=7680, bias=True\n",
1095
+ " (lora_dropout): Dropout(p=0.05, inplace=False)\n",
1096
+ " (lora_A): Linear(in_features=2560, out_features=16, bias=False)\n",
1097
+ " (lora_B): Conv1d(16, 5120, kernel_size=(1,), stride=(1,), groups=2, bias=False)\n",
1098
+ " )\n",
1099
+ " (c_proj): Conv1D()\n",
1100
+ " (attn_dropout): Dropout(p=0.0, inplace=False)\n",
1101
+ " (resid_dropout): Dropout(p=0.0, inplace=False)\n",
1102
+ " )\n",
1103
+ " (ln_2): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
1104
+ " (mlp): GPT2MLP(\n",
1105
+ " (c_fc): Conv1D()\n",
1106
+ " (c_proj): Conv1D()\n",
1107
+ " (act): GELUActivation()\n",
1108
+ " (dropout): Dropout(p=0.0, inplace=False)\n",
1109
+ " )\n",
1110
+ " )\n",
1111
+ " )\n",
1112
+ " (ln_f): LayerNorm((2560,), eps=1e-05, elementwise_affine=True)\n",
1113
+ " )\n",
1114
+ " (lm_head): CastOutputToFloat(\n",
1115
+ " (0): Linear(in_features=2560, out_features=50257, bias=False)\n",
1116
+ " )\n",
1117
+ " )\n",
1118
+ " )\n",
1119
+ ")"
1120
+ ]
+ },
+ "execution_count": 33,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "model.config\n",
+ "print(model.dtype)\n",
+ "\n",
+ "model.half()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 35,
+ "id": "9cca3b03",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.\n",
+ "/root/miniconda3/envs/llama/lib/python3.10/site-packages/torch/utils/checkpoint.py:31: UserWarning: None of the inputs have requires_grad=True. Gradients will be None\n",
+ " warnings.warn(\"None of the inputs have requires_grad=True. Gradients will be None\")\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Human: Can I run inference on my local machine?\n",
+ "Assistant: Yes, you can. You should be able to use the same model and data as your local machine for inference. The only difference is that you will need to download the necessary packages from the cloud or install them locally.\n"
+ ]
+ }
+ ],
+ "source": [
+ "text = \"Human: Can I run inference on my local machine?\\nAssistant:\"\n",
+ "\n",
+ "inputs = tokenizer(text, return_tensors=\"pt\")\n",
+ "input_ids = inputs[\"input_ids\"].to(model.device)\n",
+ "\n",
+ "generation_config = transformers.GenerationConfig(\n",
+ " max_new_tokens=100,\n",
+ " temperature=0.2,\n",
+ " top_p=0.75,\n",
+ " top_k=50,\n",
+ " repetition_penalty=1.2,\n",
+ " do_sample=True,\n",
+ " early_stopping=True,\n",
+ "# num_beams=5,\n",
+ " \n",
+ " pad_token_id=model.config.pad_token_id,\n",
+ " eos_token_id=model.config.eos_token_id,\n",
+ ")\n",
+ "\n",
+ "with torch.no_grad():\n",
+ " output = model.generate(\n",
+ " input_ids=input_ids,\n",
+ " attention_mask=torch.ones_like(input_ids),\n",
+ " generation_config=generation_config\n",
+ " )[0].cuda()\n",
1184
+ "\n",
1185
+ "result = tokenizer.decode(output, skip_special_tokens=True).strip()\n",
1186
+ "print(result)"
1187
+ ]
1188
+ },
1189
+ {
1190
+ "cell_type": "code",
1191
+ "execution_count": null,
1192
+ "id": "be542e91",
1193
+ "metadata": {},
1194
+ "outputs": [],
1195
+ "source": []
1196
+ }
1197
+ ],
1198
+ "metadata": {
1199
+ "kernelspec": {
1200
+ "display_name": "Python 3 (ipykernel)",
1201
+ "language": "python",
1202
+ "name": "python3"
1203
+ },
1204
+ "language_info": {
1205
+ "codemirror_mode": {
1206
+ "name": "ipython",
1207
+ "version": 3
1208
+ },
1209
+ "file_extension": ".py",
1210
+ "mimetype": "text/x-python",
1211
+ "name": "python",
1212
+ "nbconvert_exporter": "python",
1213
+ "pygments_lexer": "ipython3",
1214
+ "version": "3.10.9"
1215
+ }
1216
+ },
1217
+ "nbformat": 4,
1218
+ "nbformat_minor": 5
1219
+ }
1220
+
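The module tree printed above shows the adapter layout inside every attention block: `lora_A` projects the 2560-dim hidden state down to 16 features, and `lora_B` (a grouped 1-D convolution) projects back up to 5120 features, i.e. independent low-rank updates for two of the three fused `c_attn` projections (typically query and value), with the rest of the weights frozen. Below is a minimal back-of-the-envelope sketch of the trainable-parameter count this implies; the block count of 32 and all dimensions are read off the printout, and the script itself is illustrative, not part of the notebook.

# Trainable LoRA parameters implied by the printed module tree (sketch).
n_blocks = 32     # GPT2Block (0) .. (31)
d_model = 2560    # hidden size, per LayerNorm((2560,))
r_total = 16      # lora_A: Linear(2560 -> 16, bias=False)
groups = 2        # lora_B: Conv1d(16 -> 5120, kernel_size=1, groups=2)

lora_a = d_model * r_total                    # 40,960 weights per block
lora_b = (2 * d_model) * (r_total // groups)  # Conv1d weight: 5120 * 8 * 1 = 40,960
total = n_blocks * (lora_a + lora_b)

print(f"{total:,} trainable LoRA parameters")  # 2,621,440 -- roughly 0.1% of 2.7B

That sub-0.1% fraction of trainable weights is what lets the 8-bit-base-plus-LoRA recipe fit on a single consumer GPU.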
requirements.txt ADDED
@@ -0,0 +1,5 @@
+ gradio
+ transformers
+ bitsandbytes
+ accelerate
+ peft
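The requirement list is small and unpinned, so installs track the latest releases of each package. As a quick illustrative sanity check (not part of the repo) that a fresh environment built with `pip install -r requirements.txt` is usable:

# Import each requirement and report its version (illustrative post-install check).
import importlib

for pkg in ("gradio", "transformers", "bitsandbytes", "accelerate", "peft"):
    module = importlib.import_module(pkg)
    print(f"{pkg}: {getattr(module, '__version__', 'unknown')}")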