"""Gradio chat demo that streams replies from a quantized Llama 3.1 model."""

import gradio as gr
from llama_cpp import Llama

# Fetch the quantized GGUF weights from the Hugging Face Hub and load them.
llm = Llama.from_pretrained(
    repo_id="PersonalAILabs/Llama-3.1-Quantized-4bit",
    filename="Meta-Llama-3.1-8B-Instruct.IQ4_XS.gguf",
)

# For local dev
# llm = Llama(
#     model_path="./checkpoints/Llama-3.1-8b-instruct/Meta-Llama-3.1-8B-Instruct.IQ4_XS.gguf",
#     verbose=False
# )

# System prompt placed at the start of every conversation sent to the model.
SYSTEM_PROMPT = (
    "You are a knowledgeable assistant who knows everything about "
    "Personalized AI Labs (or PAIL). PAIL is a research company that "
    "creates unique virtual assistants for businesses."
)


def _history_to_messages(history):
    """Convert Gradio chat history into a list of role/content dicts.

    Two history layouts are accepted:

    * ``type="messages"`` style: each entry is a dict carrying ``"role"``
      and ``"content"`` keys (and possibly others, which are ignored).
    * legacy pair style: each entry is a ``(user_message, bot_message)``
      pair.

    Args:
        history: The previous turns as supplied by the chat interface;
            ``None`` or an empty list yields an empty result.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts in conversation
        order, restricted to non-empty user and assistant text turns.
    """
    messages = []
    for turn in history or []:
        if isinstance(turn, dict):
            role = turn.get("role")
            content = turn.get("content")
            # Forward only plain-text user/assistant turns; non-string
            # content (e.g. file attachments) is skipped.
            if role in ("user", "assistant") and isinstance(content, str) and content:
                messages.append({"role": role, "content": content})
        else:
            user_message, bot_message = turn
            if user_message:
                messages.append({"role": "user", "content": user_message})
            if bot_message:
                messages.append({"role": "assistant", "content": bot_message})
    return messages


def predict(message, history):
    """Stream the assistant's reply to ``message``.

    Builds the prompt from the system prompt, the prior ``history`` and the
    new user ``message``, then streams a chat completion from ``llm``.

    Args:
        message: The latest user message.
        history: Previous conversation turns (see ``_history_to_messages``
            for the accepted layouts).

    Yields:
        The reply accumulated so far, once per streamed chunk that carries
        text, so the UI can render the answer incrementally.
    """
    messages = [{"role": "system", "content": SYSTEM_PROMPT}]
    messages.extend(_history_to_messages(history))
    messages.append({"role": "user", "content": message})

    response = ""
    for chunk in llm.create_chat_completion(
        stream=True,
        messages=messages,
    ):
        # Some chunks have no "content" in their delta; skip those.
        part = chunk["choices"][0]["delta"].get("content", None)
        if part:
            response += part
            yield response


# Create a description and example messages
description = (
    " This is an interactive chat interface powered by Meta's Llama 3.1 "
    "language model quantized to 4 bits. You can ask questions about PAIL "
    "or ask any other question you might have. The goal of this assistant "
    "is to demonstrate how PAIL can help your business achieve a higher "
    "level of user engagement. "
)

# Example messages to populate the initial chat
example_messages = [
    "What does PAIL stand for?",
    "How does PAIL-UVA work?",
    "What services does PAIL offer?",
    "Do virtual assistants on websites create higher user engagement?",
]

# Create the Gradio interface with description and examples.
# `demo` is bound to the interface itself (not to the return value of
# `launch()`), so the name refers to the app object.
demo = gr.ChatInterface(
    predict,
    description=description,
    examples=example_messages,
    title="PAIL Unique Virtual Assistant",
    type="messages",
)
demo.launch()