BlinkDL committed on
Commit
a00d592
·
1 Parent(s): 1035e19

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +16 -191
app.py CHANGED
@@ -5,17 +5,17 @@ from huggingface_hub import hf_hub_download
5
  from pynvml import *
6
  nvmlInit()
7
  gpu_h = nvmlDeviceGetHandleByIndex(0)
8
- ctx_limit = 1536
9
- title = "RWKV-4-Raven-14B-v12-Eng98%-Other2%-20230523-ctx8192"
10
 
11
  os.environ["RWKV_JIT_ON"] = '1'
12
  os.environ["RWKV_CUDA_ON"] = '1' # if '1' then use CUDA kernel for seq mode (much faster)
13
 
14
  from rwkv.model import RWKV
15
- model_path = hf_hub_download(repo_id="BlinkDL/rwkv-4-raven", filename=f"{title}.pth")
16
- model = RWKV(model=model_path, strategy='cuda fp16i8 *24 -> cuda fp16')
17
  from rwkv.utils import PIPELINE, PIPELINE_ARGS
18
- pipeline = PIPELINE(model, "20B_tokenizer.json")
19
 
20
  def generate_prompt(instruction, input=None):
21
  instruction = instruction.strip().replace('\r\n','\n').replace('\n\n','\n')
@@ -95,165 +95,27 @@ def evaluate(
95
  yield out_str.strip()
96
 
97
  examples = [
98
- ["Tell me about ravens.", "", 300, 1.2, 0.5, 0.4, 0.4],
99
- ["Write a python function to mine 1 BTC, with details and comments.", "", 300, 1.2, 0.5, 0.4, 0.4],
100
- ["Write a song about ravens.", "", 300, 1.2, 0.5, 0.4, 0.4],
101
- ["Explain the following metaphor: Life is like cats.", "", 300, 1.2, 0.5, 0.4, 0.4],
102
- ["Write a story using the following information", "A man named Alex chops a tree down", 300, 1.2, 0.5, 0.4, 0.4],
103
- ["Generate a list of adjectives that describe a person as brave.", "", 300, 1.2, 0.5, 0.4, 0.4],
104
- ["You have $100, and your goal is to turn that into as much money as possible with AI and Machine Learning. Please respond with detailed plan.", "", 300, 1.2, 0.5, 0.4, 0.4],
105
  ]
106
 
107
  ##########################################################################
108
 
109
- chat_intro = '''The following is a coherent verbose detailed conversation between <|user|> and an AI girl named <|bot|>.
110
-
111
- <|user|>: Hi <|bot|>, Would you like to chat with me for a while?
112
-
113
- <|bot|>: Hi <|user|>. Sure. What would you like to talk about? I'm listening.
114
- '''
115
-
116
- def user(message, chatbot):
117
- chatbot = chatbot or []
118
- # print(f"User: {message}")
119
- return "", chatbot + [[message, None]]
120
-
121
- def alternative(chatbot, history):
122
- if not chatbot or not history:
123
- return chatbot, history
124
-
125
- chatbot[-1][1] = None
126
- history[0] = copy.deepcopy(history[1])
127
-
128
- return chatbot, history
129
-
130
- def chat(
131
- prompt,
132
- user,
133
- bot,
134
- chatbot,
135
- history,
136
- temperature=1.0,
137
- top_p=0.8,
138
- presence_penalty=0.1,
139
- count_penalty=0.1,
140
- ):
141
- args = PIPELINE_ARGS(temperature=max(0.2, float(temperature)), top_p=float(top_p),
142
- alpha_frequency=float(count_penalty),
143
- alpha_presence=float(presence_penalty),
144
- token_ban=[], # ban the generation of some tokens
145
- token_stop=[]) # stop generation whenever you see any token here
146
-
147
- if not chatbot:
148
- return chatbot, history
149
-
150
- message = chatbot[-1][0]
151
- message = message.strip().replace('\r\n','\n').replace('\n\n','\n')
152
- ctx = f"{user}: {message}\n\n{bot}:"
153
-
154
- if not history:
155
- prompt = prompt.replace("<|user|>", user.strip())
156
- prompt = prompt.replace("<|bot|>", bot.strip())
157
- prompt = prompt.strip()
158
- prompt = f"\n{prompt}\n\n"
159
-
160
- out, state = model.forward(pipeline.encode(prompt), None)
161
- history = [state, None, []] # [state, state_pre, tokens]
162
- # print("History reloaded.")
163
-
164
- [state, _, all_tokens] = history
165
- state_pre_0 = copy.deepcopy(state)
166
-
167
- out, state = model.forward(pipeline.encode(ctx)[-ctx_limit:], state)
168
- state_pre_1 = copy.deepcopy(state) # For recovery
169
-
170
- # print("Bot:", end='')
171
-
172
- begin = len(all_tokens)
173
- out_last = begin
174
- out_str: str = ''
175
- occurrence = {}
176
- for i in range(300):
177
- if i <= 0:
178
- nl_bias = -float('inf')
179
- elif i <= 30:
180
- nl_bias = (i - 30) * 0.1
181
- elif i <= 130:
182
- nl_bias = 0
183
- else:
184
- nl_bias = (i - 130) * 0.25
185
- out[187] += nl_bias
186
- for n in occurrence:
187
- out[n] -= (args.alpha_presence + occurrence[n] * args.alpha_frequency)
188
-
189
- token = pipeline.sample_logits(out, temperature=args.temperature, top_p=args.top_p)
190
- next_tokens = [token]
191
- if token == 0:
192
- next_tokens = pipeline.encode('\n\n')
193
- all_tokens += next_tokens
194
- for xxx in occurrence:
195
- occurrence[xxx] *= 0.996
196
- if token not in occurrence:
197
- occurrence[token] = 1
198
- else:
199
- occurrence[token] += 1
200
-
201
- out, state = model.forward(next_tokens, state)
202
-
203
- tmp = pipeline.decode(all_tokens[out_last:])
204
- if '\ufffd' not in tmp:
205
- # print(tmp, end='', flush=True)
206
- out_last = begin + i + 1
207
- out_str += tmp
208
-
209
- chatbot[-1][1] = out_str.strip()
210
- history = [state, all_tokens]
211
- yield chatbot, history
212
-
213
- out_str = pipeline.decode(all_tokens[begin:])
214
- out_str = out_str.replace("\r\n", '\n').replace('\\n', '\n')
215
-
216
- if '\n\n' in out_str:
217
- break
218
-
219
- # State recovery
220
- if f'{user}:' in out_str or f'{bot}:' in out_str:
221
- idx_user = out_str.find(f'{user}:')
222
- idx_user = len(out_str) if idx_user == -1 else idx_user
223
- idx_bot = out_str.find(f'{bot}:')
224
- idx_bot = len(out_str) if idx_bot == -1 else idx_bot
225
- idx = min(idx_user, idx_bot)
226
-
227
- if idx < len(out_str):
228
- out_str = f" {out_str[:idx].strip()}\n\n"
229
- tokens = pipeline.encode(out_str)
230
-
231
- all_tokens = all_tokens[:begin] + tokens
232
- out, state = model.forward(tokens, state_pre_1)
233
- break
234
-
235
- gpu_info = nvmlDeviceGetMemoryInfo(gpu_h)
236
- print(f'vram {gpu_info.total} used {gpu_info.used} free {gpu_info.free}')
237
-
238
- gc.collect()
239
- torch.cuda.empty_cache()
240
-
241
- chatbot[-1][1] = out_str.strip()
242
- history = [state, state_pre_0, all_tokens]
243
- yield chatbot, history
244
-
245
- ##########################################################################
246
-
247
  with gr.Blocks(title=title) as demo:
248
- gr.HTML(f"<div style=\"text-align: center;\">\n<h1>🐦Raven - {title}</h1>\n</div>")
249
  with gr.Tab("Instruct mode"):
250
- gr.Markdown(f"Raven is [RWKV 14B](https://github.com/BlinkDL/ChatRWKV) 100% RNN [RWKV-LM](https://github.com/BlinkDL/RWKV-LM) finetuned to follow instructions. *** Please try examples first (bottom of page) *** (edit them to use your question). Demo limited to ctxlen {ctx_limit}. Finetuned on alpaca, gpt4all, codealpaca and more. For best results, *** keep you prompt short and clear ***. <b>UPDATE: now with Chat (see above, as a tab) ==> turn off as of now due to VRAM leak caused by buggy code.</b>.")
251
  with gr.Row():
252
  with gr.Column():
253
  instruction = gr.Textbox(lines=2, label="Instruction", value="Tell me about ravens.")
254
  input = gr.Textbox(lines=2, label="Input", placeholder="none")
255
- token_count = gr.Slider(10, 300, label="Max Tokens", step=10, value=300)
256
- temperature = gr.Slider(0.2, 2.0, label="Temperature", step=0.1, value=1.2)
257
  top_p = gr.Slider(0.0, 1.0, label="Top P", step=0.05, value=0.5)
258
  presence_penalty = gr.Slider(0.0, 1.0, label="Presence Penalty", step=0.1, value=0.4)
259
  count_penalty = gr.Slider(0.0, 1.0, label="Count Penalty", step=0.1, value=0.4)
@@ -266,43 +128,6 @@ with gr.Blocks(title=title) as demo:
266
  submit.click(evaluate, [instruction, input, token_count, temperature, top_p, presence_penalty, count_penalty], [output])
267
  clear.click(lambda: None, [], [output])
268
  data.click(lambda x: x, [data], [instruction, input, token_count, temperature, top_p, presence_penalty, count_penalty])
269
-
270
- # with gr.Tab("Chat (Experimental - Might be buggy - use ChatRWKV for reference)"):
271
- # gr.Markdown(f'''<b>*** The length of response is restricted in this demo. Use ChatRWKV for longer generations. ***</b> Say "go on" or "continue" can sometimes continue the response. If you'd like to edit the scenario, make sure to follow the exact same format: empty lines between (and only between) different speakers. Changes only take effect after you press [Clear]. <b>The default "Bob" & "Alice" names work the best.</b>''', label="Description")
272
- # with gr.Row():
273
- # with gr.Column():
274
- # chatbot = gr.Chatbot()
275
- # state = gr.State()
276
- # message = gr.Textbox(label="Message", value="Write me a python code to land on moon.")
277
- # with gr.Row():
278
- # send = gr.Button("Send", variant="primary")
279
- # alt = gr.Button("Alternative", variant="secondary")
280
- # clear = gr.Button("Clear", variant="secondary")
281
- # with gr.Column():
282
- # with gr.Row():
283
- # user_name = gr.Textbox(lines=1, max_lines=1, label="User Name", value="Bob")
284
- # bot_name = gr.Textbox(lines=1, max_lines=1, label="Bot Name", value="Alice")
285
- # prompt = gr.Textbox(lines=10, max_lines=50, label="Scenario", value=chat_intro)
286
- # temperature = gr.Slider(0.2, 2.0, label="Temperature", step=0.1, value=1.2)
287
- # top_p = gr.Slider(0.0, 1.0, label="Top P", step=0.05, value=0.5)
288
- # presence_penalty = gr.Slider(0.0, 1.0, label="Presence Penalty", step=0.1, value=0.4)
289
- # count_penalty = gr.Slider(0.0, 1.0, label="Count Penalty", step=0.1, value=0.4)
290
- # chat_inputs = [
291
- # prompt,
292
- # user_name,
293
- # bot_name,
294
- # chatbot,
295
- # state,
296
- # temperature,
297
- # top_p,
298
- # presence_penalty,
299
- # count_penalty
300
- # ]
301
- # chat_outputs = [chatbot, state]
302
- # message.submit(user, [message, chatbot], [message, chatbot], queue=False).then(chat, chat_inputs, chat_outputs)
303
- # send.click(user, [message, chatbot], [message, chatbot], queue=False).then(chat, chat_inputs, chat_outputs)
304
- # alt.click(alternative, [chatbot, state], [chatbot, state], queue=False).then(chat, chat_inputs, chat_outputs)
305
- # clear.click(lambda: ([], None, ""), [], [chatbot, state, message], queue=False)
306
 
307
  demo.queue(concurrency_count=1, max_size=10)
308
  demo.launch(share=False)
 
5
  from pynvml import *
6
  nvmlInit()
7
  gpu_h = nvmlDeviceGetHandleByIndex(0)
8
+ ctx_limit = 3000
9
+ title = "RWKV-5-World-1.5B-v2-OnlyForTest_56%_trained-20231013-ctx4096"
10
 
11
  os.environ["RWKV_JIT_ON"] = '1'
12
  os.environ["RWKV_CUDA_ON"] = '1' # if '1' then use CUDA kernel for seq mode (much faster)
13
 
14
  from rwkv.model import RWKV
15
+ model_path = hf_hub_download(repo_id="BlinkDL/temp", filename=f"{title}.pth")
16
+ model = RWKV(model=model_path, strategy='cuda fp16')
17
  from rwkv.utils import PIPELINE, PIPELINE_ARGS
18
+ pipeline = PIPELINE(model, "rwkv_vocab_v20230424")
19
 
20
  def generate_prompt(instruction, input=None):
21
  instruction = instruction.strip().replace('\r\n','\n').replace('\n\n','\n')
 
95
  yield out_str.strip()
96
 
97
  examples = [
98
+ ["Tell me about ravens.", "", 300, 1, 0.5, 0.4, 0.4],
99
+ ["Write a python function to mine 1 BTC, with details and comments.", "", 300, 1, 0.5, 0.4, 0.4],
100
+ ["Write a song about ravens.", "", 300, 1, 0.5, 0.4, 0.4],
101
+ ["Explain the following metaphor: Life is like cats.", "", 300, 1, 0.5, 0.4, 0.4],
102
+ ["Write a story using the following information", "A man named Alex chops a tree down", 300, 1, 0.5, 0.4, 0.4],
103
+ ["Generate a list of adjectives that describe a person as brave.", "", 300, 1, 0.5, 0.4, 0.4],
104
+ ["You have $100, and your goal is to turn that into as much money as possible with AI and Machine Learning. Please respond with detailed plan.", "", 300, 1, 0.5, 0.4, 0.4],
105
  ]
106
 
107
  ##########################################################################
108
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
109
  with gr.Blocks(title=title) as demo:
110
+ gr.HTML(f"<div style=\"text-align: center;\">\n<h1>RWKV-5 World v2 - {title}</h1>\n</div>")
111
  with gr.Tab("Instruct mode"):
112
+ gr.Markdown(f"This is a 1.5B [RWKV-5 World v2](https://huggingface.co/BlinkDL/rwkv-5-world) 100% RNN [RWKV-LM](https://github.com/BlinkDL/RWKV-LM). *** Please try examples first (bottom of page) *** (edit them to use your question). Demo limited to ctxlen {ctx_limit}. For best results, *** keep you prompt short and clear ***.")
113
  with gr.Row():
114
  with gr.Column():
115
  instruction = gr.Textbox(lines=2, label="Instruction", value="Tell me about ravens.")
116
  input = gr.Textbox(lines=2, label="Input", placeholder="none")
117
+ token_count = gr.Slider(10, 500, label="Max Tokens", step=10, value=500)
118
+ temperature = gr.Slider(0.2, 2.0, label="Temperature", step=0.1, value=1.0)
119
  top_p = gr.Slider(0.0, 1.0, label="Top P", step=0.05, value=0.5)
120
  presence_penalty = gr.Slider(0.0, 1.0, label="Presence Penalty", step=0.1, value=0.4)
121
  count_penalty = gr.Slider(0.0, 1.0, label="Count Penalty", step=0.1, value=0.4)
 
128
  submit.click(evaluate, [instruction, input, token_count, temperature, top_p, presence_penalty, count_penalty], [output])
129
  clear.click(lambda: None, [], [output])
130
  data.click(lambda x: x, [data], [instruction, input, token_count, temperature, top_p, presence_penalty, count_penalty])
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
131
 
132
  demo.queue(concurrency_count=1, max_size=10)
133
  demo.launch(share=False)