ffreemt committed
Commit 62bb826
1 Parent(s): a0303a2

Update app.py from falcon-7b-gglm-m app.py

Files changed (2):
  1. .ruff.toml +3 -1
  2. app.py +236 -482
.ruff.toml CHANGED
@@ -8,10 +8,12 @@ line-length = 300
  select = ["F", "E", "W", "I001", "YTT", "D", "PLC"]
  # select = ["ALL"]

+ # D100 Missing docstring in public module
+ # E501 Line too long
  # D103 Missing docstring in public function
  # D101 Missing docstring in public class
  # `multi-line-summary-first-line` (D212)
  # `one-blank-line-before-class` (D203)
- extend-ignore = ["D103", "D101", "D212", "D203"]
+ extend-ignore = ["E501", "D100", "D103", "D101", "D212", "D203"]

  exclude = [".venv"]
app.py CHANGED
@@ -1,510 +1,264 @@
- """Run codes."""
- # pylint: disable=line-too-long, broad-exception-caught, invalid-name, missing-function-docstring, too-many-instance-attributes, missing-class-docstring
- import os
- import time
- from dataclasses import asdict, dataclass
  from pathlib import Path
- from types import SimpleNamespace
  from urllib.parse import urlparse

  import gradio as gr
  import psutil
- from about_time import about_time
  from ctransformers import AutoModelForCausalLM
- from huggingface_hub import hf_hub_download, snapshot_download
- from loguru import logger
-
- filename_list = [
- "Wizard-Vicuna-7B-Uncensored.ggmlv3.q2_K.bin",
- "Wizard-Vicuna-7B-Uncensored.ggmlv3.q3_K_L.bin",
- "Wizard-Vicuna-7B-Uncensored.ggmlv3.q3_K_M.bin",
- "Wizard-Vicuna-7B-Uncensored.ggmlv3.q3_K_S.bin",
- "Wizard-Vicuna-7B-Uncensored.ggmlv3.q4_0.bin",
- "Wizard-Vicuna-7B-Uncensored.ggmlv3.q4_1.bin",
- "Wizard-Vicuna-7B-Uncensored.ggmlv3.q4_K_M.bin",
- "Wizard-Vicuna-7B-Uncensored.ggmlv3.q4_K_S.bin",
- "Wizard-Vicuna-7B-Uncensored.ggmlv3.q5_0.bin",
- "Wizard-Vicuna-7B-Uncensored.ggmlv3.q5_1.bin",
- "Wizard-Vicuna-7B-Uncensored.ggmlv3.q5_K_M.bin",
- "Wizard-Vicuna-7B-Uncensored.ggmlv3.q5_K_S.bin",
- "Wizard-Vicuna-7B-Uncensored.ggmlv3.q6_K.bin",
- "Wizard-Vicuna-7B-Uncensored.ggmlv3.q8_0.bin",
- ]
-
- URL = "https://huggingface.co/TheBloke/Wizard-Vicuna-7B-Uncensored-GGML/raw/main/Wizard-Vicuna-7B-Uncensored.ggmlv3.q4_K_M.bin" # 4.05G
-
- URL = "https://huggingface.co/TheBloke/30B-Lazarus-GGML/blob/main/30b-Lazarus.ggmlv3.q4_0.bin"
- URL = "https://huggingface.co/TheBloke/30B-Lazarus-GGML/blob/main/30b-Lazarus.ggmlv3.q4_1.bin"
- URL = "https://huggingface.co/TheBloke/30B-Lazarus-GGML/resolve/main/30b-Lazarus.ggmlv3.q4_K_M.bin"
- URL = "https://huggingface.co/TheBloke/30B-Lazarus-GGML/resolve/main/30b-Lazarus.ggmlv3.q4_K_S.bin" # 18GB
-
- URL = "https://huggingface.co/TheBloke/30B-Lazarus-GGML/blob/main/30b-Lazarus.ggmlv3.q3_K_S.bin" # 14GB
-
- MODEL_FILENAME = Path(URL).name
-
- # MODEL_FILENAME = filename_list[0] # q2_K 4.05G
- # MODEL_FILENAME = filename_list[5] # q4_1 4.21
-
- REPO_ID = "/".join(
- urlparse(URL).path.strip("/").split("/")[:2]
- )
- # TheBloke/30B-Lazarus-GGML
- # # TheBloke/Wizard-Vicuna-7B-Uncensored-GGML
-
- DESTINATION_FOLDER = "models"

- os.environ["TZ"] = "Asia/Shanghai"
- try:
- time.tzset() # type: ignore # pylint: disable=no-member
- except Exception:
- # Windows
- logger.warning("Windows, cant run time.tzset()")
-
- ns = SimpleNamespace(
- response="",
- generator=[],
  )

- default_system_prompt = "A conversation between a user and an LLM-based AI assistant named Local Assistant. Local Assistant gives helpful and honest answers."
-
- user_prefix = "[user]: "
- assistant_prefix = "[assistant]: "
-
-
- def predict_str(prompt, bot): # bot is in fact bot_history
- # logger.debug(f"{prompt=}, {bot=}, {timeout=}")
-
- if bot is None:
- bot = []
-
- logger.debug(f"{prompt=}, {bot=}")
-
- try:
- # user_prompt = prompt
- generator = generate(
- LLM,
- GENERATION_CONFIG,
- system_prompt=default_system_prompt,
- user_prompt=prompt.strip(),
- )
-
- ns.generator = generator # for .then
-
- except Exception as exc:
- logger.error(exc)
-
- # bot.append([prompt, f"{response} {_}"])
- # return prompt, bot
-
- _ = bot + [[prompt, None]]
- logger.debug(f"{prompt=}, {_=}")
-
- return prompt, _
-
-
- def bot_str(bot):
- if bot:
- bot[-1][1] = ""
- else:
- bot = [["Something is wrong", ""]]
-
- print(assistant_prefix, end=" ", flush=True)
-
- response = ""
-
- flag = 1
- then = time.time()
- for word in ns.generator:
- # record first response time
- if flag:
- logger.debug(f"\t {time.time() - then:.1f}s")
- flag = 0
- print(word, end="", flush=True)
- # print(word, flush=True) # vertical stream
- response += word
- bot[-1][1] = response
- yield bot
-
-
- def predict(prompt, bot):
- # logger.debug(f"{prompt=}, {bot=}, {timeout=}")
- logger.debug(f"{prompt=}, {bot=}")
-
- ns.response = ""
- then = time.time()
- with about_time() as atime: # type: ignore
- try:
- # user_prompt = prompt
- generator = generate(
- LLM,
- GENERATION_CONFIG,
- system_prompt=default_system_prompt,
- user_prompt=prompt.strip(),
- )
-
- ns.generator = generator # for .then
-
- print(assistant_prefix, end=" ", flush=True)
-
- response = ""
- buff.update(value="diggin...")
-
- flag = 1
- for word in generator:
- # record first response time
- if flag:
- logger.debug(f"\t {time.time() - then:.1f}s")
- flag = 0
- # print(word, end="", flush=True)
- print(word, flush=True) # vertical stream
- response += word
- ns.response = response
- buff.update(value=response)
- print("")
- logger.debug(f"{response=}")
- except Exception as exc:
- logger.error(exc)
- response = f"{exc=}"
-
- # bot = {"inputs": [response]}
- _ = (
- f"(time elapsed: {atime.duration_human}, " # type: ignore
- f"{atime.duration/(len(prompt) + len(response)):.1f}s/char)" # type: ignore
- )
-
- bot.append([prompt, f"{response} {_}"])
-
- return prompt, bot
-
-
- def predict_api(prompt):
- logger.debug(f"{prompt=}")
- ns.response = ""
- try:
- # user_prompt = prompt
- _ = GenerationConfig(
- temperature=0.2,
- top_k=0,
- top_p=0.9,
- repetition_penalty=1.0,
- max_new_tokens=512, # adjust as needed
- seed=42,
- reset=False, # reset history (cache)
- stream=True, # TODO stream=False and generator
- threads=os.cpu_count() // 2, # type: ignore # adjust for your CPU
- stop=["<|im_end|>", "|<"],
- )
-
- # TODO: stream does not make sense in api?
- generator = generate(
- LLM, _, system_prompt=default_system_prompt, user_prompt=prompt.strip()
- )
- print(assistant_prefix, end=" ", flush=True)
-
- response = ""
- buff.update(value="diggin...")
- for word in generator:
- print(word, end="", flush=True)
- response += word
- ns.response = response
- buff.update(value=response)
- print("")
- logger.debug(f"{response=}")
- except Exception as exc:
- logger.error(exc)
- response = f"{exc=}"
- # bot = {"inputs": [response]}
- # bot = [(prompt, response)]
-
- return response
-
-
- def download_quant(destination_folder: str, repo_id: str, model_filename: str):
- local_path = os.path.abspath(destination_folder)
- return hf_hub_download(
- repo_id=repo_id,
- filename=model_filename,
- local_dir=local_path,
- local_dir_use_symlinks=True,
- )
-
-
- @dataclass
- class GenerationConfig:
- temperature: float
- top_k: int
- top_p: float
- repetition_penalty: float
- max_new_tokens: int
- seed: int
- reset: bool
- stream: bool
- threads: int
- stop: list[str]
-
-
- def format_prompt(system_prompt: str, user_prompt: str):
- """Format prompt based on: https://huggingface.co/spaces/mosaicml/mpt-30b-chat/blob/main/app.py."""
- # TODO: fix prompts
-
- system_prompt = f"<|im_start|>system\n{system_prompt}<|im_end|>\n"
- user_prompt = f"<|im_start|>user\n{user_prompt}<|im_end|>\n"
- assistant_prompt = "<|im_start|>assistant\n"
-
- return f"{system_prompt}{user_prompt}{assistant_prompt}"
-
-
- def generate(
- llm: AutoModelForCausalLM,
- generation_config: GenerationConfig,
- system_prompt: str = default_system_prompt,
- user_prompt: str = "",
- ):
- """Run model inference, will return a Generator if streaming is true."""
- # if not user_prompt.strip():
- return llm(
- format_prompt(
- system_prompt,
- user_prompt,
- ),
- **asdict(generation_config),
- )
-
-
- # if "mpt" in model_filename:
- # config = AutoConfig.from_pretrained("mosaicml/mpt-30b-cha t", context_length=8192)
- # llm = AutoModelForCausalLM.from_pretrained(
- # os.path.abspath(f"models/{model_filename}"),
- # model_type="mpt",
- # config=config,
- # )

- # https://huggingface.co/spaces/matthoffner/wizardcoder-ggml/blob/main/main.py
  _ = """
  llm = AutoModelForCausalLM.from_pretrained(
- "TheBloke/WizardCoder-15B-1.0-GGML",
- model_file="WizardCoder-15B-1.0.ggmlv3.q4_0.bin",
- model_type="starcoder",
- threads=8
  )
  # """
-
- logger.info(f"start dl, {REPO_ID=}, {MODEL_FILENAME=}, {DESTINATION_FOLDER=}")
-
- # download_quant(DESTINATION_FOLDER, REPO_ID, MODEL_FILENAME)
- snapshot_download(
- repo_id=REPO_ID, # TheBloke/30B-Lazarus-GGML
- allow_patterns=MODEL_FILENAME, # 30b-Lazarus.ggmlv3.q4_K_S.bin 18.3G
- # revision="ggmlv3",
- local_dir="models",
  )

- logger.info("done dl")
-
- logger.debug(f"{os.cpu_count()=} {psutil.cpu_count(logical=False)=}")
- cpu_count = os.cpu_count() // 2 # type: ignore
- cpu_count = psutil.cpu_count(logical=False)
-
- logger.debug(f"{cpu_count=}")
-
- logger.info("load llm")
-
- # from ctransformers import AutoConfig
- # AutoConfig(REPO_ID)
- # AutoConfig(config='TheBloke/30B-Lazarus-GGML', model_type=None)
-
- _ = Path("models", MODEL_FILENAME).absolute().as_posix()
- logger.debug(f"model_file: {_}, exists: {Path(_).exists()}")
- LLM = AutoModelForCausalLM.from_pretrained(
- # "TheBloke/WizardCoder-15B-1.0-GGML",
- # REPO_ID, # DESTINATION_FOLDER, # model_path_or_repo_id: str required
  # model_file=_,
  _,
- model_type="llama", #AutoConfig.from_pretrained(REPO_ID).model_type,
- threads=cpu_count,
- )
-
- logger.info("done load llm")
-
- GENERATION_CONFIG = GenerationConfig(
- temperature=0.2,
- top_k=0,
- top_p=0.9,
- repetition_penalty=1.0,
- max_new_tokens=512, # adjust as needed
- seed=42,
- reset=False, # reset history (cache)
- stream=True, # streaming per word/token
- threads=cpu_count,
- stop=["<|im_end|>", "|<"], # TODO possible fix of stop
  )

- css = """
343
- .importantButton {
344
- background: linear-gradient(45deg, #7e0570,#5d1c99, #6e00ff) !important;
345
- border: none !important;
346
- }
347
- .importantButton:hover {
348
- background: linear-gradient(45deg, #ff00e0,#8500ff, #6e00ff) !important;
349
- border: none !important;
350
- }
351
- .disclaimer {font-variant-caps: all-small-caps; font-size: xx-small;}
352
- .xsmall {font-size: x-small;}
353
  """
354
- etext = """In America, where cars are an important part of the national psyche, a decade ago people had suddenly started to drive less, which had not happened since the oil shocks of the 1970s. """
355
- examples = [
356
- ["How to pick a lock? Provide detailed steps."],
357
- ["Explain the plot of Cinderella in a sentence."],
358
- [
359
- "How long does it take to become proficient in French, and what are the best methods for retaining information?"
360
- ],
361
- ["What are some common mistakes to avoid when writing code?"],
362
- ["Build a prompt to generate a beautiful portrait of a horse"],
363
- ["Suggest four metaphors to describe the benefits of AI"],
364
- ["Write a pop song about leaving home for the sandy beaches."],
365
- ["Write a summary demonstrating my ability to tame lions"],
366
- ["鲁迅和周树人什么关系 说中文"],
367
- ["鲁迅和周树人什么关系"],
368
- ["鲁迅和周树人什么关系 用英文回答"],
369
- ["从前有一头牛,这头牛后面有什么?"],
370
- ["正无穷大加一大于正无穷大吗?"],
371
- ["正无穷大加正无穷大大于正无穷大吗?"],
372
- ["-2的平方根等于什么"],
373
- ["树上有5只鸟,猎人开枪打死了一只。树上还有几只鸟?"],
374
- ["树上有11只鸟,猎人开枪打死了一只。树上还有几只鸟?提示:需考虑鸟可能受惊吓飞走。"],
375
- ["以红楼梦的行文风格写一张委婉的请假条。不少于320字。"],
376
- [f"{etext} 翻成中文,列出3个版本"],
377
- [f"{etext} \n 翻成中文,保留原意,但使用文学性的语言。不要写解释。列出3个版本"],
378
- ["假定 1 + 2 = 4, 试求 7 + 8"],
379
- ["判断一个数是不是质数的 javascript 码"],
380
- ["实现python 里 range(10)的 javascript 码"],
381
- ["实现python 里 [*(range(10)]的 javascript 码"],
382
- ["Erkläre die Handlung von Cinderella in einem Satz."],
383
- ["Erkläre die Handlung von Cinderella in einem Satz. Auf Deutsch"],
384
- ]
385
-
386
- with gr.Blocks(
- # title="mpt-30b-chat-ggml",
- title=f"{MODEL_FILENAME}",
- theme=gr.themes.Soft(text_size="sm", spacing_size="sm"),
- css=css,
- ) as block:
- with gr.Accordion("🎈 Info", open=False):
- # gr.HTML(
- # """<center><a href="https://huggingface.co/spaces/mikeee/mpt-30b-chat?duplicate=true"><img src="https://bit.ly/3gLdBN6" alt="Duplicate"></a> and spin a CPU UPGRADE to avoid the queue</center>"""
- # )
- gr.Markdown(
- f"""<h5><center><{REPO_ID}>{MODEL_FILENAME}</center></h4>
- The bot only speaks English.
-
- Most examples are meant for another model.
- You probably should try to test
- some related prompts.
- """,
- elem_classes="xsmall",
  )
-
- # chatbot = gr.Chatbot().style(height=700) # 500
- chatbot = gr.Chatbot(height=500)
- buff = gr.Textbox(show_label=False, visible=False)
- with gr.Row():
- with gr.Column(scale=4):
- msg = gr.Textbox(
- label="Chat Message Box",
- placeholder="Ask me anything (press Enter or click Submit to send)",
- show_label=False,
- ).style(container=False)
- with gr.Column(scale=1, min_width=50):
- with gr.Row():
- submit = gr.Button("Submit", elem_classes="xsmall")
- stop = gr.Button("Stop", visible=False)
- clear = gr.Button("Clear History", visible=True)
- with gr.Row(visible=False):
- with gr.Accordion("Advanced Options:", open=False):
- with gr.Row():
- with gr.Column(scale=2):
- system = gr.Textbox(
- label="System Prompt",
- value=default_system_prompt,
- show_label=False,
- ).style(container=False)
- with gr.Column():
- with gr.Row():
- change = gr.Button("Change System Prompt")
- reset = gr.Button("Reset System Prompt")
-
- with gr.Accordion("Example Inputs", open=True):
- examples = gr.Examples(
- examples=examples,
- inputs=[msg],
- examples_per_page=20,
  )

- # with gr.Row():
- with gr.Accordion("Disclaimer", open=False):
- _ = "-".join(MODEL_FILENAME.split("-")[:2])
- gr.Markdown(
- f"Disclaimer: {_} can produce factually incorrect output, and should not be relied on to produce "
- "factually accurate information. {_} was trained on various public datasets; while great efforts "
- "have been taken to clean the pretraining data, it is possible that this model could generate lewd, "
- "biased, or otherwise offensive outputs.",
- elem_classes=["disclaimer"],
  )
- _ = """
- msg.submit(
- # fn=conversation.user_turn,
- fn=predict,
- inputs=[msg, chatbot],
- outputs=[msg, chatbot],
- # queue=True,
- show_progress="full",
- api_name="predict",
  )
- submit.click(
- fn=lambda x, y: ("",) + predict(x, y)[1:], # clear msg
- inputs=[msg, chatbot],
- outputs=[msg, chatbot],
- queue=True,
- show_progress="full",
  )
- # """
- msg.submit(
- # fn=conversation.user_turn,
- fn=predict_str,
- inputs=[msg, chatbot],
- outputs=[msg, chatbot],
- queue=True,
- show_progress="full",
- api_name="predict",
- ).then(bot_str, chatbot, chatbot)
- submit.click(
- fn=lambda x, y: ("",) + predict_str(x, y)[1:], # clear msg
- inputs=[msg, chatbot],
- outputs=[msg, chatbot],
- queue=True,
- show_progress="full",
- ).then(bot_str, chatbot, chatbot)
-
- clear.click(lambda: None, None, chatbot, queue=False)
-
- # update buff Textbox, every: units in seconds)
- # https://huggingface.co/spaces/julien-c/nvidia-smi/discussions
- # does not work
- # AttributeError: 'Blocks' object has no attribute 'run_forever'
- # block.run_forever(lambda: ns.response, None, [buff], every=1)
-
- with gr.Accordion("For Chat/Translation API", open=False, visible=False):
- input_text = gr.Text()
- api_btn = gr.Button("Go", variant="primary")
- out_text = gr.Text()
- api_btn.click(
- predict_api,
- input_text,
- out_text,
- # show_progress="full",
- api_name="api",
  )
-
- # concurrency_count=5, max_size=20
- # max_size=36, concurrency_count=14
- block.queue(concurrency_count=5, max_size=20).launch(debug=True)
  from pathlib import Path
  from urllib.parse import urlparse

  import gradio as gr
  import psutil
  from ctransformers import AutoModelForCausalLM
+ from huggingface_hub import hf_hub_download

+ _ = """
+ snapshot_download(
+ repo_id="TheBloke/falcon-7b-instruct-GGML",
+ allow_patterns="falcon7b-instruct.ggmlv3.q4_0.bin",
+ revision="ggmlv3",
+ local_dir="models",
+ local_dir_use_symlinks=False, # default "auto"
  )

+ hf_hub_download(
+ repo_id=repo_id,
+ filename=model_filename,
+ local_dir=local_path,
+ local_dir_use_symlinks=True,
+ )
+ # """
+ # 4.06G

  _ = """
28
  llm = AutoModelForCausalLM.from_pretrained(
29
+ "TheBloke/falcon-7b-instruct-GGML",
30
+ model_file="falcon7b-instruct.ggmlv3.q4_0.bin",
31
+ model_type="falcon", gpu_layers=32, threads=2,
 
32
  )
33
  # """
34
+ # _ = Path("models", "falcon7b-instruct.ggmlv3.q4_0.bin").absolute().as_posix()
35
+ # assert Path(_).exists(), f"{_} does not exist, perhaps snapshot_download failed?"
36
+
37
+ URL = "https://huggingface.co/TheBloke/falcon-7b-instruct-GGML/blob/main/falcon-7b-instruct.ggccv1.q4_1.bin"
38
+ URL = "https://huggingface.co/TheBloke/falcon-7b-instruct-GGML/blob/ggmlv3/falcon7b-instruct.ggmlv3.q4_1.bin"
39
+ repo_id = "/".join(urlparse(URL).path.strip("/").split("/")[:2])
40
+ URL = "https://huggingface.co/TheBloke/30B-Lazarus-GGML/blob/main/30b-Lazarus.ggmlv3.q2_K.bin" # 13.6 G
41
+
42
+ model_file = Path(URL).name
43
+
44
+ _ = hf_hub_download(
45
+ repo_id=repo_id,
46
+ revision="ggmlv3",
47
+ filename=model_file,
48
+ local_dir="models",
49
+ # local_dir_use_symlinks=True,
50
  )
51
 
52
+ llm = AutoModelForCausalLM.from_pretrained(
53
+ # repo_id, # "TheBloke/falcon-7b-instruct-GGML",
54
+ # model_file=model_file,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  # model_file=_,
56
  _,
57
+ # model_type="falcon",
58
+ model_type="llama",
59
+ threads=psutil.cpu_count(logical=False),
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  )
61
 
62
+ TITLE = f"""<h2 align="center">🦅 30B-Lazarus-GGM ({model_file}) 🦅"""
63
+ USER_NAME = "User"
64
+ BOT_NAME = "Falcon"
65
+ DEFAULT_INSTRUCTIONS = """The following is a conversation between a highly knowledgeable and intelligent AI assistant, called Falcon, and a human user, called User. In the following interactions, User and Falcon will converse in natural language, and Falcon will answer User's questions. Falcon was built to be respectful, polite and inclusive. Falcon was built by the Technology Innovation Institute in Abu Dhabi. Falcon will never decline to answer a question, and always attempts to give an answer that User would be satisfied with. It knows a lot, and always tells the truth. The conversation begins.
 
 
 
 
 
 
 
66
  """
67
+ RETRY_COMMAND = "/retry"
68
+ STOP_STR = f"\n{USER_NAME}:"
69
+ STOP_SUSPECT_LIST = [":", "\n", "User"]
70
+
71
+
72
+ def chat_accordion():
73
+ with gr.Accordion("Parameters", open=False):
74
+ temperature = gr.Slider(
75
+ minimum=0.1,
76
+ maximum=2.0,
77
+ value=0.8,
78
+ step=0.1,
79
+ interactive=True,
80
+ label="Temperature",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
81
  )
82
+ top_p = gr.Slider(
83
+ minimum=0.1,
84
+ maximum=0.99,
85
+ value=0.9,
86
+ step=0.01,
87
+ interactive=True,
88
+ label="p (nucleus sampling)",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
89
  )
90
+ return temperature, top_p
91
+
92
+
93
+ # TODO: fix prompt
+ def format_chat_prompt(message: str, chat_history, instructions: str) -> str:
+ instructions = instructions.strip(" ").strip("\n")
+ prompt = instructions
+ for turn in chat_history:
+ user_message, bot_message = turn
+ prompt = f"{prompt}\n{USER_NAME}: {user_message}\n{BOT_NAME}: {bot_message}"
+ prompt = f"{prompt}\n{USER_NAME}: {message}\n{BOT_NAME}:"
+ return prompt
+
+
+ def chat():
+ with gr.Column(elem_id="chat_container"):
+ with gr.Row():
+ chatbot = gr.Chatbot(elem_id="chatbot")
+ with gr.Row():
+ inputs = gr.Textbox(
+ placeholder=f"Hello {BOT_NAME} !!",
+ label="Type an input and press Enter",
+ max_lines=3,
+ )
+
+ with gr.Row(elem_id="button_container"):
+ with gr.Column():
+ submit_button = gr.Button("🚀 Submit")
+ with gr.Column():
+ retry_button = gr.Button("♻️ Retry last turn")
+ with gr.Column():
+ delete_turn_button = gr.Button("🧽 Delete last turn")
+ with gr.Column():
+ clear_chat_button = gr.Button("✨ Delete all history")
+
+ gr.Examples(
+ [
+ ["Hey! Any recommendations for my holidays in Abu Dhabi?"],
+ ["What's the Everett interpretation of quantum mechanics?"],
+ [
+ "Give me a list of the top 10 dive sites you would recommend around the world."
+ ],
+ ["Can you tell me more about deep-water soloing?"],
+ [
+ "Can you write a short tweet about 30B-Lazarus-GGM?"
+ ],
+ ],
+ inputs=inputs,
+ label="Click on any example and press Enter in the input textbox!",
+ )

+ with gr.Row(elem_id="param_container"):
+ with gr.Column():
+ temperature, top_p = chat_accordion()
+ with gr.Column():
+ with gr.Accordion("Instructions", open=False):
+ instructions = gr.Textbox(
+ placeholder="LLM instructions",
+ value=DEFAULT_INSTRUCTIONS,
+ lines=10,
+ interactive=True,
+ label="Instructions",
+ max_lines=16,
+ show_label=False,
+ )
+
+ def run_chat(
+ message: str, chat_history, instructions: str, temperature: float, top_p: float
+ ):
+ if not message or (message == RETRY_COMMAND and len(chat_history) == 0):
+ yield chat_history
+ return
+
+ if message == RETRY_COMMAND and chat_history:
+ prev_turn = chat_history.pop(-1)
+ user_message, _ = prev_turn
+ message = user_message
+
+ prompt = format_chat_prompt(message, chat_history, instructions)
+ chat_history = chat_history + [[message, ""]]
+ stream = llm(
+ prompt,
+ max_new_tokens=1024,
+ stop=[STOP_STR, "<|endoftext|>"],
+ temperature=temperature,
+ top_p=top_p,
+ stream=True,
+ )
+ acc_text = ""
+ for idx, response in enumerate(stream):
+ text_token = response
+
+ if text_token in STOP_SUSPECT_LIST:
+ acc_text += text_token
+ continue
+
+ if idx == 0 and text_token.startswith(" "):
+ text_token = text_token[1:]
+
+ acc_text += text_token
+ last_turn = list(chat_history.pop(-1))
+ last_turn[-1] += acc_text
+ chat_history = chat_history + [last_turn]
+ yield chat_history
+ acc_text = ""
+
+ def delete_last_turn(chat_history):
+ if chat_history:
+ chat_history.pop(-1)
+ return {chatbot: gr.update(value=chat_history)}
+
+ def run_retry(
+ message: str, chat_history, instructions: str, temperature: float, top_p: float
+ ):
+ yield from run_chat(
+ RETRY_COMMAND, chat_history, instructions, temperature, top_p
  )
+
+ def clear_chat():
+ return []
+
+ inputs.submit(
+ run_chat,
+ [inputs, chatbot, instructions, temperature, top_p],
+ outputs=[chatbot],
+ show_progress="minimal",
  )
+ inputs.submit(lambda: "", inputs=None, outputs=inputs)
+ submit_button.click(
+ run_chat,
+ [inputs, chatbot, instructions, temperature, top_p],
+ outputs=[chatbot],
+ show_progress="minimal",
  )
+ delete_turn_button.click(delete_last_turn, inputs=[chatbot], outputs=[chatbot])
+ retry_button.click(
+ run_retry,
+ [inputs, chatbot, instructions, temperature, top_p],
+ outputs=[chatbot],
+ show_progress="minimal",
  )
+ clear_chat_button.click(clear_chat, [], chatbot)
+
+
+ def get_demo():
+ with gr.Blocks(
+ # css=None
+ # css="""#chat_container {width: 700px; margin-left: auto; margin-right: auto;}
+ # #button_container {width: 700px; margin-left: auto; margin-right: auto;}
+ # #param_container {width: 700px; margin-left: auto; margin-right: auto;}"""
+ css="""#chatbot {
+ font-size: 14px;
+ min-height: 300px;
+ }"""
+ ) as demo:
+ gr.HTML(TITLE)
+
+ with gr.Row():
+ with gr.Column():
+ gr.Markdown(
+ """
+ ⚠️ **Limitations**: the model can and will produce factually incorrect information, hallucinating facts and actions. As it has not undergone any advanced tuning/alignment, it can produce problematic outputs, especially if prompted to do so.
+ """
+ )
+
+ chat()
+
+ return demo
+
+
+ if __name__ == "__main__":
+ demo = get_demo()
+ demo.queue(max_size=64, concurrency_count=8)
+ # demo.launch(server_name="0.0.0.0", server_port=7860)
+ demo.launch()
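For reference, a minimal sketch of the download, load and stream flow the new app.py builds on, using the huggingface_hub and ctransformers calls that appear in the diff. The repo id, filename, revision and model_type below are taken from the commented-out falcon variant in the commit rather than the values finally used (the commit downloads the 30B-Lazarus file and loads it with model_type="llama"); treat them as assumptions when adapting this.

    """Minimal sketch: fetch a GGML file and stream tokens with ctransformers."""
    import psutil
    from ctransformers import AutoModelForCausalLM
    from huggingface_hub import hf_hub_download

    # Assumption: repo, branch and filename as in the commented-out falcon variant above.
    repo_id = "TheBloke/falcon-7b-instruct-GGML"
    model_file = "falcon7b-instruct.ggmlv3.q4_1.bin"

    # Download the quantised model into ./models (revision "ggmlv3", as in the commit).
    path = hf_hub_download(
        repo_id=repo_id,
        revision="ggmlv3",
        filename=model_file,
        local_dir="models",
    )

    # Load with ctransformers on physical cores only, as the committed code does.
    llm = AutoModelForCausalLM.from_pretrained(
        path,
        model_type="falcon",  # the commit itself passes model_type="llama" for the Lazarus file
        threads=psutil.cpu_count(logical=False),
    )

    # Stream the reply token by token, the same pattern run_chat() uses.
    prompt = "User: Hello!\nFalcon:"
    for token in llm(prompt, max_new_tokens=64, stream=True, stop=["\nUser:"]):
        print(token, end="", flush=True)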