import gradio as gr
from llama_cpp import Llama
from huggingface_hub import hf_hub_download

# Force KaTeX display-mode math to render inline within the answer panel.
CSS = """
#qwen-md .katex-display { display: inline; }
#qwen-md .katex-display>.katex { display: inline; }
#qwen-md .katex-display>.katex>.katex-html { display: inline; }
"""

# Fetch the GGUF weights from the Hugging Face Hub into ./models.
hf_hub_download(
    repo_id="bartowski/Qwen2.5-Math-7B-Instruct-GGUF",
    filename="Qwen2.5-Math-7B-Instruct-Q6_K_L.gguf",
    local_dir="./models",
)

llm = Llama(
    model_path="models/Qwen2.5-Math-7B-Instruct-Q6_K_L.gguf",
    flash_attn=True,
    n_ctx=8192,
    n_batch=1024,
    chat_format="chatml",
)

# Gradio components (created with render=False so they can be laid out
# explicitly inside the Blocks context below).
output_md = gr.Markdown(
    label="Answer",
    value="Answer will be presented here",
    latex_delimiters=[
        {"left": "\\(", "right": "\\)", "display": True},
        {"left": "\\begin{equation}", "right": "\\end{equation}", "display": True},
        {"left": "\\begin{align}", "right": "\\end{align}", "display": True},
        {"left": "\\begin{alignat}", "right": "\\end{alignat}", "display": True},
        {"left": "\\begin{gather}", "right": "\\end{gather}", "display": True},
        {"left": "\\begin{CD}", "right": "\\end{CD}", "display": True},
        {"left": "\\[", "right": "\\]", "display": True},
    ],
    elem_id="qwen-md",
    show_copy_button=True,
    container=True,
    render=False,
)
target_lang = gr.Dropdown(
    choices=["Chinese", "English"],
    value="Chinese",
    label="Output Language",
    interactive=True,
    render=False,
)
new_tokens = gr.Slider(
    minimum=1, maximum=8192, value=2048, step=1, label="Max new tokens", render=False
)
temperature = gr.Slider(
    minimum=0, maximum=2.0, value=0.5, step=0.1, label="Temperature", render=False
)
top_p = gr.Slider(
    minimum=0.0, maximum=1.0, value=0.95, step=0.05, label="Top P", render=False
)
input_text = gr.Textbox(label="Ask math questions here", render=False)
submit_btn = gr.Button(value="Ask", render=False)
banner = gr.Markdown(
    value="""
# 📖 Qwen2.5-Math GGUF

This WebUI is based on Qwen2.5-Math-7B-Instruct-GGUF for mathematical reasoning.
You can input mathematical or arithmetic problems as text.
"""
)


# Streaming chat callback: builds a ChatML conversation and yields the
# partial answer as tokens arrive so the Markdown panel updates live.
def respond(
    input_text,
    lang="Chinese",
    max_tokens=2048,
    temperature=0.5,
    top_p=0.95,
):
    if lang == "Chinese":
        # "You are a helpful math assistant. You answer questions in Chinese."
        sys_msg = "你是一个乐于助人的数学助手. 你使用中文回答问题"
    else:
        sys_msg = "You are a helpful math assistant. You should always provide your answer in English."
    messages = [
        {"role": "system", "content": sys_msg},
        {"role": "user", "content": input_text},
    ]
    response = llm.create_chat_completion(
        messages=messages,
        stream=True,
        max_tokens=max_tokens,
        temperature=temperature,
        top_p=top_p,
    )
    # Accumulate the streamed deltas and yield the growing answer.
    message_repl = ""
    for chunk in response:
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            message_repl += delta["content"]
            yield message_repl


with gr.Blocks(css=CSS, theme="NoCrypt/miku") as demo:
    submit_btn.click(
        fn=respond,
        inputs=[input_text, target_lang, new_tokens, temperature, top_p],
        outputs=output_md,
    )
    with gr.Column():
        banner.render()
        with gr.Row():
            with gr.Column():
                input_text.render()
                target_lang.render()
                new_tokens.render()
                temperature.render()
                top_p.render()
                submit_btn.render()
            with gr.Column():
                output_md.render()

if __name__ == "__main__":
    demo.launch()