Staticaliza committed
Commit 2be57ea · verified · 1 Parent(s): 6c7cccb

Create app.py

Files changed (1)
  1. app.py +138 -0
app.py ADDED
@@ -0,0 +1,138 @@
+ # Imports
+ import gradio as gr
+ import os
+ import random
+ import time
+ import spaces
+ import torch
+ import threading
+ import socket
+ import requests
+
+ from llama_cpp import Llama
+ from huggingface_hub import hf_hub_download
+
+ # Variables
+ HF_TOKEN = os.environ.get("HF_TOKEN")
+
+ REPO = "FUGG-B22x8-2-MLdraziW/ihanaPrayizaM"[::-1]  # repo id stored reversed; [::-1] restores "MaziyarPanahi/WizardLM-2-8x22B-GGUF"
+ FILE = "WizardLM-2-8x22B.IQ1_M.gguf"
+
+ TIMEOUT = 30
+
+ MAX_SEED = 9007199254740991
+ DEFAULT_INPUT = ""
+ DEFAULT_SYSTEM = "Statical is a totally normal human named 'Statical'. Statical responds very briefly, truthfully, and accurately."
+
+ TAG_USER = "USER"
+ TAG_ASSISTANT = "STATICAL"
+
+ DEFAULT_SEPARATOR = ","
+ DEFAULT_STOP_SEQUENCES = f"{TAG_USER}:,{TAG_ASSISTANT}:,</s>"
+
+ # n_gpu_layers=0 keeps every layer on the CPU, hence the slow inference warned about in the UI
+ model = Llama(model_path=hf_hub_download(repo_id=REPO, filename=FILE, token=HF_TOKEN), n_ctx=32768, n_threads=48, n_batch=512, n_gpu_layers=0, verbose=True)
+
+ # Functions
+ def get_seed(seed):
+     seed = (seed or "").strip()  # tolerate None (the function default) as well as an empty textbox
+     if seed.isdigit():
+         return int(seed)
+     else:
+         return random.randint(0, MAX_SEED)
+
+ def generate(input=DEFAULT_INPUT, history=[], system=DEFAULT_SYSTEM, stream=False, temperature=1, top_p=0.95, top_k=50, rep_p=1.2, max_tokens=64, seed=None, separator=DEFAULT_SEPARATOR, stop_sequences=DEFAULT_STOP_SEQUENCES):
+     print("[GENERATE] Model is generating...")
+
+     # Rebuild the conversation as plain text: one "TAG: text" line per turn, assistant turns closed with </s>
+     memory = ""
+     for item in history:
+         if item[0]:
+             memory += f"{TAG_USER}: {item[0].strip()}\n"
+         if item[1]:
+             memory += f"{TAG_ASSISTANT}: {item[1].strip()}</s>\n"
+     prompt = f"{system.strip()}\n{memory}{TAG_USER}: {input.strip()}\n{TAG_ASSISTANT}: "
+
+     print(prompt)
+
+     parameters = {
+         "prompt": prompt,
+         "temperature": temperature,
+         "top_p": top_p,
+         "top_k": top_k,
+         "repeat_penalty": rep_p,
+         "max_tokens": max_tokens,
+         "stop": [seq.strip() for seq in stop_sequences.split(separator)] if stop_sequences else [],
+         "seed": get_seed(seed),
+         "stream": stream
+     }
+
+     event = threading.Event()
+     timer = None  # defined up front so the outer finally cannot hit a NameError when streaming is off
+
+     try:
+         output = model.create_completion(**parameters)
+         print("[GENERATE] Model has generated.")
+         if stream:
+             buffer = ""
+             timer = threading.Timer(TIMEOUT, event.set)
+             timer.start()
+             try:
+                 for item in output:
+                     if event.is_set():
+                         raise TimeoutError("[ERROR] Generation timed out.")
+                     buffer += item["choices"][0]["text"]
+                     yield buffer
+                     timer.cancel()  # re-arm the watchdog after every delivered chunk
+                     timer = threading.Timer(TIMEOUT, event.set)
+                     timer.start()
+             finally:
+                 timer.cancel()
+         else:
+             yield output["choices"][0]["text"]
+     except TimeoutError as e:
+         yield str(e)
+     finally:
+         if timer is not None:
+             timer.cancel()
+
+ @spaces.GPU(duration=15)
+ def gpu():
+     # Placeholder hook for the Spaces GPU runtime; actual inference runs on CPU (see n_gpu_layers=0 above)
+     return
+
+ # Initialize
+ theme = gr.themes.Default(
+     primary_hue="violet",
+     secondary_hue="indigo",
+     neutral_hue="zinc",
+     spacing_size="sm",
+     radius_size="lg",
+     font=[gr.themes.GoogleFont('Kanit'), 'ui-sans-serif', 'system-ui', 'sans-serif'],
+     font_mono=[gr.themes.GoogleFont('Kanit'), 'ui-monospace', 'Consolas', 'monospace'],
+ ).set(background_fill_primary='*neutral_50', background_fill_secondary='*neutral_100')
+
+ # Links shown in the UI: alpindale hosts the full-precision base weights, MaziyarPanahi the GGUF quantization loaded above
+ model_base = "https://huggingface.co/alpindale/WizardLM-2-8x22B"
+ model_quant = "https://huggingface.co/MaziyarPanahi/WizardLM-2-8x22B-GGUF"
+
+ with gr.Blocks(theme=theme) as main:
+     with gr.Column():
+         gr.Markdown("# 👁️‍🗨️ WizardLM")
+         gr.Markdown("⠀⠀• ⚡ A text-generation inference for one of the best open-source text models: WizardLM-2-8x22B.")
+         gr.Markdown("⠀⠀• ⚠️ WARNING! Inference is very slow because the model is huge: it takes about 10 seconds before generation starts. Please avoid high max-token settings and very long inputs. It runs on CPU because I cannot figure out how to run it on GPU without overloading the model.")
+         gr.Markdown(f"⠀⠀• 🔗 Link to the models: {model_base} (BASE), {model_quant} (QUANT)")
+
+     with gr.Column():
+         gr.ChatInterface(
+             fn=generate,
+             additional_inputs_accordion=gr.Accordion(label="⚙️ Configurations", open=False, render=False),
+             additional_inputs=[
+                 gr.Textbox(lines=1, value=DEFAULT_SYSTEM, label="🪄 System", render=False),
+                 gr.Checkbox(label="⚡ Stream", value=True, render=False),
+                 gr.Slider(minimum=0, maximum=2, step=0.01, value=1, label="🌡️ Temperature", render=False),
+                 gr.Slider(minimum=0.01, maximum=0.99, step=0.01, value=0.95, label="🧲 Top P", render=False),
+                 gr.Slider(minimum=1, maximum=2048, step=1, value=50, label="📊 Top K", render=False),
+                 gr.Slider(minimum=0.01, maximum=2, step=0.01, value=1.2, label="📚 Repetition Penalty", render=False),
+                 gr.Slider(minimum=1, maximum=2048, step=1, value=256, label="⏳ Max New Tokens", render=False),
+                 gr.Textbox(lines=1, value="", label="🌱 Seed (Blank for random)", render=False),
+                 gr.Textbox(lines=1, value=DEFAULT_SEPARATOR, label="🏷️ Stop Sequences Separator", render=False),
+                 gr.Textbox(lines=1, value=DEFAULT_STOP_SEQUENCES, label="🛑 Stop Sequences (Blank for none)", render=False),
+             ]
+         )
+
+ main.launch(show_api=False)
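
For reference, a minimal standalone sketch of the prompt layout that generate assembles from the system message, the chat history, and the new input. No model is required; the history shape assumes the [user, assistant] pairs that gr.ChatInterface passes to its fn in the Gradio version this app targets:

    TAG_USER, TAG_ASSISTANT = "USER", "STATICAL"
    system = "Statical is a totally normal human named 'Statical'."
    history = [["Hi!", "Hello."]]
    new_input = "How are you?"

    # Same accumulation loop as generate: user turns are plain lines, assistant turns end with </s>
    memory = ""
    for user_turn, assistant_turn in history:
        if user_turn:
            memory += f"{TAG_USER}: {user_turn.strip()}\n"
        if assistant_turn:
            memory += f"{TAG_ASSISTANT}: {assistant_turn.strip()}</s>\n"

    prompt = f"{system.strip()}\n{memory}{TAG_USER}: {new_input.strip()}\n{TAG_ASSISTANT}: "
    print(prompt)
    # Statical is a totally normal human named 'Statical'.
    # USER: Hi!
    # STATICAL: Hello.</s>
    # USER: How are you?
    # STATICAL:

When wired through gr.ChatInterface, the additional inputs reach generate positionally in the order they are listed: system, stream, temperature, top_p, top_k, rep_p, max_tokens, seed, separator, stop_sequences.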
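
And a self-contained sketch of the watchdog pattern the streaming branch relies on: a threading.Timer sets an Event after TIMEOUT seconds of silence, and the timer is re-armed after every delivered chunk, so only a stall between chunks trips the timeout. The slow generator and the 0.5-second timeout below are illustrative stand-ins for the llama.cpp token stream:

    import threading
    import time

    TIMEOUT = 0.5  # seconds allowed between chunks (illustrative; app.py uses 30)

    def slow_chunks():
        # Emits chunks quickly at first, then stalls on the fourth one
        for i in range(5):
            time.sleep(0.2 if i < 3 else 1.0)
            yield f"chunk-{i} "

    def stream_with_watchdog(chunks):
        event = threading.Event()
        timer = threading.Timer(TIMEOUT, event.set)
        timer.start()
        try:
            for chunk in chunks:
                if event.is_set():
                    raise TimeoutError("Generation timed out.")
                yield chunk
                timer.cancel()  # chunk arrived in time: re-arm the watchdog
                timer = threading.Timer(TIMEOUT, event.set)
                timer.start()
        finally:
            timer.cancel()  # never leave a live timer thread behind

    try:
        for piece in stream_with_watchdog(slow_chunks()):
            print(piece, end="", flush=True)
    except TimeoutError as exc:
        print(f"\n{exc}")  # prints after chunk-0..chunk-2, when the stalled chunk trips the event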