mgoin committed
Commit 0c8feaf · 1 Parent(s): b1d1e11

Initial commit

Files changed (3)
  1. README.md +6 -5
  2. app.py +267 -0
  3. requirements.txt +3 -0
README.md CHANGED
@@ -1,12 +1,13 @@
  ---
- title: Llama 3 8b Deepsparse Chat
- emoji: 🐠
- colorFrom: blue
- colorTo: purple
+ title: Llama 3 8B Chat Deepsparse
+ emoji: 🏃
+ colorFrom: purple
+ colorTo: green
  sdk: gradio
- sdk_version: 4.37.1
+ sdk_version: 4.21.0
  app_file: app.py
  pinned: false
+ license: apache-2.0
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,267 @@
+ import deepsparse
+ import gradio as gr
+ from typing import Tuple, List
+
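+ # Log the CPU ISA features DeepSparse detected (e.g. AVX2, AVX-512, VNNI)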
+ deepsparse.cpu.print_hardware_capability()
+
+ MODEL_ID = "hf:mgoin/Meta-Llama-3-8B-Instruct-pruned50-quant-ds"
+
+ DESCRIPTION = f"""
+ # Chat with an Efficient Llama-3-8B-Instruct Model on CPU with DeepSparse
+
+ Model ID: {MODEL_ID[len("hf:"):]}
+ """
+
+ MAX_MAX_NEW_TOKENS = 1024
+ DEFAULT_MAX_NEW_TOKENS = 200
+
+ # Set up the engine
+ from deepsparse.legacy import Pipeline
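+ # sequence_length bounds prompt + generated tokens; prompt_sequence_length is
+ # how many prompt tokens are processed per forward pass during prefill;
+ # num_cores pins the engine to 8 CPU cores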
+ pipe = Pipeline.create(
+     task="text-generation",
+     model_path=MODEL_ID,
+     sequence_length=MAX_MAX_NEW_TOKENS,
+     prompt_sequence_length=8,
+     num_cores=8,
+ )
+
+
+ def clear_and_save_textbox(message: str) -> Tuple[str, str]:
+     return "", message
+
+
+ def display_input(
+     message: str, history: List[Tuple[str, str]]
+ ) -> List[Tuple[str, str]]:
+     history.append((message, ""))
+     return history
+
+
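+ # Pop the most recent (message, response) pair; used by the Retry and Undo buttons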
+ def delete_prev_fn(history: List[Tuple[str, str]]) -> Tuple[List[Tuple[str, str]], str]:
+     try:
+         message, _ = history.pop()
+     except IndexError:
+         message = ""
+     return history, message or ""
+
+
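+ # NOTE: style.css is not among the three files added in this commit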
+ with gr.Blocks(css="style.css") as demo:
+     gr.Markdown(DESCRIPTION)
+
+     with gr.Group():
+         chatbot = gr.Chatbot(label="Chatbot")
+         with gr.Row():
+             textbox = gr.Textbox(
+                 container=False,
+                 show_label=False,
+                 placeholder="Type a message...",
+                 scale=10,
+             )
+             submit_button = gr.Button("Submit", variant="primary", scale=1, min_width=0)
+
+     with gr.Row():
+         retry_button = gr.Button("🔄 Retry", variant="secondary")
+         undo_button = gr.Button("↩️ Undo", variant="secondary")
+         clear_button = gr.Button("🗑️ Clear", variant="secondary")
+
+     saved_input = gr.State()
+
+     gr.Examples(
+         examples=[
+             "Write a story about sparse neurons.",
+             "Write a story about a summer camp.",
+             "Make a recipe for banana bread.",
+             "Write a cookbook for gluten-free snacks.",
+             "Write about the role of animation in video games.",
+         ],
+         inputs=[textbox],
+     )
+
+     max_new_tokens = gr.Slider(
+         label="Max new tokens",
+         value=DEFAULT_MAX_NEW_TOKENS,
+         minimum=0,
+         maximum=MAX_MAX_NEW_TOKENS,
+         step=1,
+         interactive=True,
+         info="The maximum number of new tokens",
+     )
+     temperature = gr.Slider(
+         label="Temperature",
+         value=0.9,
+         minimum=0.05,
+         maximum=1.0,
+         step=0.05,
+         interactive=True,
+         info="Higher values produce more diverse outputs",
+     )
+     top_p = gr.Slider(
+         label="Top-p (nucleus) sampling",
+         value=0.40,
+         minimum=0.0,
+         maximum=1.0,
+         step=0.05,
+         interactive=True,
+         info="Higher values sample more low-probability tokens",
+     )
+     top_k = gr.Slider(
+         label="Top-k sampling",
+         value=20,
+         minimum=1,
+         maximum=100,
+         step=1,
+         interactive=True,
+         info="Sample from the top_k most likely tokens",
+     )
+     repetition_penalty = gr.Slider(
+         label="Repetition penalty",
+         value=1.2,
+         minimum=1.0,
+         maximum=2.0,
+         step=0.05,
+         interactive=True,
+         info="Penalize repeated tokens",
+     )
+
+     # Generation inference
+     def generate(
+         message,
+         history,
+         max_new_tokens: int,
+         temperature: float,
+         top_p: float,
+         top_k: int,
+         repetition_penalty: float,
+     ):
+         generation_config = {
+             "max_new_tokens": max_new_tokens,
+             "do_sample": True,
+             "temperature": temperature,
+             "top_p": top_p,
+             "top_k": top_k,
+             "repetition_penalty": repetition_penalty,
+         }
+
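+         # Only the newest user message is sent to the model; earlier turns are
+         # shown in the Chatbot UI but not included in the prompt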
+         conversation = []
+         conversation.append({"role": "user", "content": message})
+
+         formatted_conversation = pipe.tokenizer.apply_chat_template(
+             conversation, tokenize=False, add_generation_prompt=True
+         )
+
+         inference = pipe(
+             sequences=formatted_conversation,
+             generation_config=generation_config,
+             streaming=True,
+         )
+
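+         # streaming=True yields partial results; append each new text chunk to
+         # the last chat turn and re-render the Chatbot on every yield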
+         for token in inference:
+             history[-1][1] += token.generations[0].text
+             yield history
+
+         print(pipe.timer_manager)
+
+     # Hooking up all the buttons
+     textbox.submit(
+         fn=clear_and_save_textbox,
+         inputs=textbox,
+         outputs=[textbox, saved_input],
+         api_name=False,
+         queue=False,
+     ).then(
+         fn=display_input,
+         inputs=[saved_input, chatbot],
+         outputs=chatbot,
+         api_name=False,
+         queue=False,
+     ).success(
+         generate,
+         inputs=[
+             saved_input,
+             chatbot,
+             max_new_tokens,
+             temperature,
+             top_p,
+             top_k,
+             repetition_penalty,
+         ],
+         outputs=[chatbot],
+         api_name=False,
+     )
+
+     submit_button.click(
+         fn=clear_and_save_textbox,
+         inputs=textbox,
+         outputs=[textbox, saved_input],
+         api_name=False,
+         queue=False,
+     ).then(
+         fn=display_input,
+         inputs=[saved_input, chatbot],
+         outputs=chatbot,
+         api_name=False,
+         queue=False,
+     ).success(
+         generate,
+         inputs=[
+             saved_input,
+             chatbot,
+             max_new_tokens,
+             temperature,
+             top_p,
+             top_k,
+             repetition_penalty,
+         ],
+         outputs=[chatbot],
+         api_name=False,
+     )
+
+     retry_button.click(
+         fn=delete_prev_fn,
+         inputs=chatbot,
+         outputs=[chatbot, saved_input],
+         api_name=False,
+         queue=False,
+     ).then(
+         fn=display_input,
+         inputs=[saved_input, chatbot],
+         outputs=chatbot,
+         api_name=False,
+         queue=False,
+     ).then(
+         generate,
+         inputs=[
+             saved_input,
+             chatbot,
+             max_new_tokens,
+             temperature,
+             top_p,
+             top_k,
+             repetition_penalty,
+         ],
+         outputs=[chatbot],
+         api_name=False,
+     )
+
+     undo_button.click(
+         fn=delete_prev_fn,
+         inputs=chatbot,
+         outputs=[chatbot, saved_input],
+         api_name=False,
+         queue=False,
+     ).then(
+         fn=lambda x: x,
+         inputs=[saved_input],
+         outputs=textbox,
+         api_name=False,
+         queue=False,
+     )
+
+     clear_button.click(
+         fn=lambda: ([], ""),
+         outputs=[chatbot, saved_input],
+         queue=False,
+         api_name=False,
+     )
+
+ demo.queue().launch(share=True)
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ deepsparse-nightly==1.8.0.20240502
+ transformers
+ gradio
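
For a quick sanity check of the sparse model outside the Gradio UI, here is a minimal sketch (not part of the commit) that reuses the pipeline setup and chat-template calls from app.py. The prompt is one of the Examples above; the non-streaming call and the 200-token limit are illustrative choices, and the output is assumed to expose the same `generations[0].text` field that the streaming chunks do in app.py.

```python
# Minimal sketch: query the same DeepSparse pipeline without Gradio.
# Assumes deepsparse-nightly==1.8.0.20240502 as pinned in requirements.txt.
from deepsparse.legacy import Pipeline

pipe = Pipeline.create(
    task="text-generation",
    model_path="hf:mgoin/Meta-Llama-3-8B-Instruct-pruned50-quant-ds",
    sequence_length=1024,
)

# Format a single-turn conversation with the model's chat template
prompt = pipe.tokenizer.apply_chat_template(
    [{"role": "user", "content": "Make a recipe for banana bread."}],
    tokenize=False,
    add_generation_prompt=True,
)

# Non-streaming call: the whole completion comes back in one response object
# (assumed to expose generations[0].text, as the streaming chunks do in app.py)
output = pipe(sequences=prompt, generation_config={"max_new_tokens": 200})
print(output.generations[0].text)
```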