import gradio as gr
from llama_cpp import Llama

# Load the quantized GGUF model: n_ctx sets the context window and n_threads
# the number of CPU threads used for inference.
llm = Llama(
    model_path="model.gguf",
    n_ctx=8000,
    n_threads=2,
    chat_format="chatml",
)

SYSTEM_PROMPT = """You are a highly intelligent AI agent that thinks and reasons step by step.
Your task is to provide very precise, clear, and accurate responses without
fabricating information. You excel at processing complex queries and offering
solutions based on a deep understanding of the subject matter. When faced with
questions, you analyze the context thoroughly, consider all relevant factors,
and apply logic and knowledge to reach conclusions. Your responses are always
rooted in verified data or established theories, and you're adept at explaining
complex concepts in an accessible manner. You're committed to maintaining the
integrity of information while being responsive to the needs and curiosity of
the user. Your goal is to assist users in expanding their knowledge, solving
problems, and making informed decisions. You're equipped with a vast database
of information, and you're capable of learning from new data inputs to
continuously improve your performance. Your expertise spans various domains,
including science, technology, mathematics, literature, and more. You're not
just a source of information but a partner in the exploration of knowledge,
ready to navigate the vast landscape of human inquiry with precision and
insight. Remember to think and reason step by step, and respond in the same
language as the user's query/prompt."""


def generate(message, history, temperature=0.3, max_tokens=512):
    """Stream a chat completion for `message`, replaying `history` as context."""
    # Rebuild the conversation as chat-completion messages, starting with the
    # system prompt.
    formatted_prompt = [{"role": "system", "content": SYSTEM_PROMPT}]
    for user_prompt, bot_response in history:
        formatted_prompt.append({"role": "user", "content": user_prompt})
        formatted_prompt.append({"role": "assistant", "content": bot_response})
    formatted_prompt.append({"role": "user", "content": message})

    stream_response = llm.create_chat_completion(
        messages=formatted_prompt,
        temperature=temperature,
        max_tokens=max_tokens,
        stream=True,
    )

    # Accumulate streamed deltas and yield the growing text so the UI updates
    # token by token.
    response = ""
    for chunk in stream_response:
        delta = chunk["choices"][0]["delta"]
        if "content" in delta:
            response += delta["content"]
            yield response


# The Chatbot/ChatInterface keyword arguments below (bubble_full_width,
# likeable, retry_btn, undo_btn) follow the Gradio 4.x API.
mychatbot = gr.Chatbot(
    avatar_images=["user.png", "botnb.png"],
    bubble_full_width=False,
    show_label=False,
    show_copy_button=True,
    likeable=True,
)

iface = gr.ChatInterface(fn=generate, chatbot=mychatbot, retry_btn=None, undo_btn=None)

with gr.Blocks() as demo:
    gr.HTML("<center><h1>Neuronal-9b Quantized in Q5_K_M</h1></center>")
    iface.render()

demo.queue().launch(show_api=False, server_name="0.0.0.0")