mgoin committed
Commit 0c8feaf · 1 Parent(s): b1d1e11

Initial commit

Files changed (3)
  1. README.md +6 -5
  2. app.py +267 -0
  3. requirements.txt +3 -0
README.md CHANGED
@@ -1,12 +1,13 @@
  ---
- title: Llama 3 8b Deepsparse Chat
- emoji: 🐠
- colorFrom: blue
- colorTo: purple
+ title: Llama 3 8B Chat Deepsparse
+ emoji: 🏃
+ colorFrom: purple
+ colorTo: green
  sdk: gradio
- sdk_version: 4.37.1
+ sdk_version: 4.21.0
  app_file: app.py
  pinned: false
+ license: apache-2.0
  ---

  Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,267 @@
+ import deepsparse
+ import gradio as gr
+ from typing import Tuple, List
+
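+ # Log the CPU ISA features DeepSparse detected (e.g. AVX2, AVX-512, VNNI)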
+ deepsparse.cpu.print_hardware_capability()
+
+ MODEL_ID = "hf:mgoin/Meta-Llama-3-8B-Instruct-pruned50-quant-ds"
+
+ DESCRIPTION = f"""
+ # Chat with an Efficient Llama-3-8B-Instruct Model on CPU with DeepSparse
+
+ Model ID: {MODEL_ID[len("hf:"):]}
+ """
+
+ MAX_MAX_NEW_TOKENS = 1024
+ DEFAULT_MAX_NEW_TOKENS = 200
+
+ # Set up the engine
+ from deepsparse.legacy import Pipeline
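+ # sequence_length bounds prompt + generated tokens; prompt_sequence_length is
+ # how many prompt tokens are processed per forward pass during prefill;
+ # num_cores pins the engine to 8 CPU cores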
+ pipe = Pipeline.create(
+     task="text-generation",
+     model_path=MODEL_ID,
+     sequence_length=MAX_MAX_NEW_TOKENS,
+     prompt_sequence_length=8,
+     num_cores=8,
+ )
+
+
+ def clear_and_save_textbox(message: str) -> Tuple[str, str]:
+     return "", message
+
+
+ def display_input(
+     message: str, history: List[Tuple[str, str]]
+ ) -> List[Tuple[str, str]]:
+     history.append((message, ""))
+     return history
+
+
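+ # Pop the most recent (message, response) pair; used by the Retry and Undo buttons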
+ def delete_prev_fn(history: List[Tuple[str, str]]) -> Tuple[List[Tuple[str, str]], str]:
+     try:
+         message, _ = history.pop()
+     except IndexError:
+         message = ""
+     return history, message or ""
+
+
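+ # NOTE: style.css is not among the three files added in this commit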
+ with gr.Blocks(css="style.css") as demo:
+     gr.Markdown(DESCRIPTION)
+
+     with gr.Group():
+         chatbot = gr.Chatbot(label="Chatbot")
+         with gr.Row():
+             textbox = gr.Textbox(
+                 container=False,
+                 show_label=False,
+                 placeholder="Type a message...",
+                 scale=10,
+             )
+             submit_button = gr.Button("Submit", variant="primary", scale=1, min_width=0)
+
+     with gr.Row():
+         retry_button = gr.Button("🔄 Retry", variant="secondary")
+         undo_button = gr.Button("↩️ Undo", variant="secondary")
+         clear_button = gr.Button("🗑️ Clear", variant="secondary")
+
+     saved_input = gr.State()
+
+     gr.Examples(
+         examples=[
+             "Write a story about sparse neurons.",
+             "Write a story about a summer camp.",
+             "Make a recipe for banana bread.",
+             "Write a cookbook for gluten-free snacks.",
+             "Write about the role of animation in video games.",
+         ],
+         inputs=[textbox],
+     )
+
+     max_new_tokens = gr.Slider(
+         label="Max new tokens",
+         value=DEFAULT_MAX_NEW_TOKENS,
+         minimum=0,
+         maximum=MAX_MAX_NEW_TOKENS,
+         step=1,
+         interactive=True,
+         info="The maximum number of new tokens",
+     )
+     temperature = gr.Slider(
+         label="Temperature",
+         value=0.9,
+         minimum=0.05,
+         maximum=1.0,
+         step=0.05,
+         interactive=True,
+         info="Higher values produce more diverse outputs",
+     )
+     top_p = gr.Slider(
+         label="Top-p (nucleus) sampling",
+         value=0.40,
+         minimum=0.0,
+         maximum=1.0,
+         step=0.05,
+         interactive=True,
+         info="Higher values sample more low-probability tokens",
+     )
+     top_k = gr.Slider(
+         label="Top-k sampling",
+         value=20,
+         minimum=1,
+         maximum=100,
+         step=1,
+         interactive=True,
+         info="Sample from the top_k most likely tokens",
+     )
+     repetition_penalty = gr.Slider(
+         label="Repetition penalty",
+         value=1.2,
+         minimum=1.0,
+         maximum=2.0,
+         step=0.05,
+         interactive=True,
+         info="Penalize repeated tokens",
+     )
+
+     # Generation inference
+     def generate(
+         message,
+         history,
+         max_new_tokens: int,
+         temperature: float,
+         top_p: float,
+         top_k: int,
+         repetition_penalty: float,
+     ):
+         generation_config = {
+             "max_new_tokens": max_new_tokens,
+             "do_sample": True,
+             "temperature": temperature,
+             "top_p": top_p,
+             "top_k": top_k,
+             "repetition_penalty": repetition_penalty,
+         }
+
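+         # Only the newest user message is sent to the model; earlier turns are
+         # shown in the Chatbot UI but not included in the prompt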
+         conversation = []
+         conversation.append({"role": "user", "content": message})
+
+         formatted_conversation = pipe.tokenizer.apply_chat_template(
+             conversation, tokenize=False, add_generation_prompt=True
+         )
+
+         inference = pipe(
+             sequences=formatted_conversation,
+             generation_config=generation_config,
+             streaming=True,
+         )
+
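+         # streaming=True yields partial results; append each new text chunk to
+         # the last chat turn and re-render the Chatbot on every yield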
+         for token in inference:
+             history[-1][1] += token.generations[0].text
+             yield history
+
+         print(pipe.timer_manager)
+
+     # Hooking up all the buttons
+     textbox.submit(
+         fn=clear_and_save_textbox,
+         inputs=textbox,
+         outputs=[textbox, saved_input],
+         api_name=False,
+         queue=False,
+     ).then(
+         fn=display_input,
+         inputs=[saved_input, chatbot],
+         outputs=chatbot,
+         api_name=False,
+         queue=False,
+     ).success(
+         generate,
+         inputs=[
+             saved_input,
+             chatbot,
+             max_new_tokens,
+             temperature,
+             top_p,
+             top_k,
+             repetition_penalty,
+         ],
+         outputs=[chatbot],
+         api_name=False,
+     )
+
+     submit_button.click(
+         fn=clear_and_save_textbox,
+         inputs=textbox,
+         outputs=[textbox, saved_input],
+         api_name=False,
+         queue=False,
+     ).then(
+         fn=display_input,
+         inputs=[saved_input, chatbot],
+         outputs=chatbot,
+         api_name=False,
+         queue=False,
+     ).success(
+         generate,
+         inputs=[
+             saved_input,
+             chatbot,
+             max_new_tokens,
+             temperature,
+             top_p,
+             top_k,
+             repetition_penalty,
+         ],
+         outputs=[chatbot],
+         api_name=False,
+     )
+
+     retry_button.click(
+         fn=delete_prev_fn,
+         inputs=chatbot,
+         outputs=[chatbot, saved_input],
+         api_name=False,
+         queue=False,
+     ).then(
+         fn=display_input,
+         inputs=[saved_input, chatbot],
+         outputs=chatbot,
+         api_name=False,
+         queue=False,
+     ).then(
+         generate,
+         inputs=[
+             saved_input,
+             chatbot,
+             max_new_tokens,
+             temperature,
+             top_p,
+             top_k,
+             repetition_penalty,
+         ],
+         outputs=[chatbot],
+         api_name=False,
+     )
+
+     undo_button.click(
+         fn=delete_prev_fn,
+         inputs=chatbot,
+         outputs=[chatbot, saved_input],
+         api_name=False,
+         queue=False,
+     ).then(
+         fn=lambda x: x,
+         inputs=[saved_input],
+         outputs=textbox,
+         api_name=False,
+         queue=False,
+     )
+
+     clear_button.click(
+         fn=lambda: ([], ""),
+         outputs=[chatbot, saved_input],
+         queue=False,
+         api_name=False,
+     )
+
+ demo.queue().launch(share=True)
requirements.txt ADDED
@@ -0,0 +1,3 @@
+ deepsparse-nightly==1.8.0.20240502
+ transformers
+ gradio
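
For a quick sanity check of the sparse model outside the Gradio UI, here is a minimal sketch (not part of the commit) that reuses the pipeline setup and chat-template calls from app.py. The prompt is one of the Examples above; the non-streaming call and the 200-token limit are illustrative choices, and the output is assumed to expose the same `generations[0].text` field that the streaming chunks do in app.py.

```python
# Minimal sketch: query the same DeepSparse pipeline without Gradio.
# Assumes deepsparse-nightly==1.8.0.20240502 as pinned in requirements.txt.
from deepsparse.legacy import Pipeline

pipe = Pipeline.create(
    task="text-generation",
    model_path="hf:mgoin/Meta-Llama-3-8B-Instruct-pruned50-quant-ds",
    sequence_length=1024,
)

# Format a single-turn conversation with the model's chat template
prompt = pipe.tokenizer.apply_chat_template(
    [{"role": "user", "content": "Make a recipe for banana bread."}],
    tokenize=False,
    add_generation_prompt=True,
)

# Non-streaming call: the whole completion comes back in one response object
# (assumed to expose generations[0].text, as the streaming chunks do in app.py)
output = pipe(sequences=prompt, generation_config={"max_new_tokens": 200})
print(output.generations[0].text)
```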