feat: Added other controls
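
This commit replaces the fixed-parameter chat handler with a configurable one: it adds logging, environment-driven settings (LOG_LEVEL, DEFAULT_SYSTEM), and Gradio controls for the system prompt, temperature, max new tokens, top-p, and repetition penalty.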
app.py
CHANGED
@@ -1,30 +1,142 @@
+"""Python Application Script for AI chatbot using LLAMA CPP."""
+import logging
+import os  # needed for the os.environ lookups below

 import gradio as gr
 from llama_cpp import Llama

+# Setting up environment
+log_level = os.environ.get("LOG_LEVEL", "WARNING")
+logging.basicConfig(encoding="utf-8", level=log_level)
+
+# Default system prompt
+DEFAULT_SYSTEM_PROMPT = os.environ.get("DEFAULT_SYSTEM", "You are Dolphin, a helpful AI assistant.")
+
+# Model path
+model_path = "model.gguf"
+logging.debug("Model Path: %s", model_path)
+
+logging.info("Loading Model")
+llm = Llama(model_path=model_path, n_ctx=4000, n_threads=2, chat_format="chatml")
+
+
+def generate(
+    message: str,
+    history: list[tuple[str, str]],
+    system_prompt: str,
+    temperature: float = 0.1,
+    max_tokens: int = 512,
+    top_p: float = 0.95,
+    repetition_penalty: float = 1.0,
+):
+    """Generate a streamed chat response.
+
+    :param message: The new user prompt.
+    :param history: The history of the chat session.
+    :param system_prompt: The system prompt for the model.
+    :param temperature: The temperature parameter for the model.
+    :param max_tokens: The maximum number of tokens to generate.
+    :param top_p: The top-p value for the model.
+    :param repetition_penalty: The repetition penalty for the model.
+    """
+    logging.info("Generating Text")
+    logging.debug("message: %s", message)
+    logging.debug("history: %s", history)
+    logging.debug("system_prompt: %s", system_prompt)
+    logging.debug("temperature: %s", temperature)
+    logging.debug("max_tokens: %s", max_tokens)
+    logging.debug("top_p: %s", top_p)
+    logging.debug("repetition_penalty: %s", repetition_penalty)
+
+    # Formatting Prompt
+    logging.info("Formatting Prompt")
     formatted_prompt = [{"role": "system", "content": system_prompt}]
-    for user_prompt, bot_response
+    for user_prompt, bot_response in history:
         formatted_prompt.append({"role": "user", "content": user_prompt})
-        formatted_prompt.append({"role": "assistant", "content": bot_response
+        formatted_prompt.append({"role": "assistant", "content": bot_response})
     formatted_prompt.append({"role": "user", "content": message})
+    logging.debug("Formatted Prompt: %s", formatted_prompt)
+
+    # Generating Response
+    logging.info("Generating Response")
+    stream_response = llm.create_chat_completion(
+        messages=formatted_prompt,
+        temperature=temperature,
+        max_tokens=max_tokens,
+        top_p=top_p,
+        repeat_penalty=repetition_penalty,
+        stream=True,
+    )
+
+    # Parsing Response: accumulate streamed chunks, yielding the partial text
+    logging.info("Parsing Response")
+    response = ""
     for chunk in stream_response:
-        if
+        if (
+            len(chunk["choices"][0]["delta"]) != 0
+            and "content" in chunk["choices"][0]["delta"]
+        ):
+            response += chunk["choices"][0]["delta"]["content"]
+            logging.debug("Response: %s", response)
+            yield response
+
+
+additional_inputs = [
+    gr.Textbox(
+        label="System Prompt",
+        max_lines=1,
+        interactive=True,
+        value=DEFAULT_SYSTEM_PROMPT,
+    ),
+    gr.Slider(
+        label="Temperature",
+        value=0.9,
+        minimum=0.0,
+        maximum=1.0,
+        step=0.05,
+        interactive=True,
+        info="Higher values produce more diverse outputs",
+    ),
+    gr.Slider(
+        label="Max new tokens",
+        value=256,
+        minimum=0,
+        maximum=1048,
+        step=64,
+        interactive=True,
+        info="The maximum number of new tokens",
+    ),
+    gr.Slider(
+        label="Top-p (nucleus sampling)",
+        value=0.90,
+        minimum=0.0,
+        maximum=1.0,
+        step=0.05,
+        interactive=True,
+        info="Higher values sample more low-probability tokens",
+    ),
+    gr.Slider(
+        label="Repetition penalty",
+        value=1.2,
+        minimum=1.0,
+        maximum=2.0,
+        step=0.05,
+        interactive=True,
+        info="Penalize repeated tokens",
+    ),
+]
+
+examples = []

-    avatar_images=["user.png", "botsc.png"], bubble_full_width=False, show_label=False, show_copy_button=True, likeable=True,)
+logging.info("Creating Chatbot")
+mychatbot = gr.Chatbot(avatar_images=["user.png", "botsc.png"], bubble_full_width=False, show_label=False, show_copy_button=True, likeable=True)

-iface = gr.ChatInterface(fn=generate, chatbot=mychatbot, retry_btn=None, undo_btn=None)
+logging.info("Creating Chat Interface")
+iface = gr.ChatInterface(
+    fn=generate,
+    chatbot=mychatbot,
+    additional_inputs=additional_inputs,
+    examples=examples,
+    concurrency_limit=20,
+    title="LLAMA CPP Template",
+)
+
+logging.info("Starting Application")
+iface.launch(show_api=False, server_name="0.0.0.0")
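
As a sanity check outside the UI, the same streaming call the script makes can be exercised directly with llama-cpp-python. This is a sketch, not part of the commit; it assumes model.gguf sits next to the script, and the prompt text and sampling values are illustrative only:

# Sketch: drive the same create_chat_completion() call app.py uses, without Gradio.
# Assumes llama-cpp-python is installed and model.gguf is present (both taken
# from the script above).
from llama_cpp import Llama

llm = Llama(model_path="model.gguf", n_ctx=4000, n_threads=2, chat_format="chatml")
stream = llm.create_chat_completion(
    messages=[
        {"role": "system", "content": "You are Dolphin, a helpful AI assistant."},
        {"role": "user", "content": "Hello!"},  # illustrative prompt
    ],
    temperature=0.7,
    max_tokens=64,
    stream=True,
)
response = ""
for chunk in stream:
    delta = chunk["choices"][0]["delta"]
    if "content" in delta:  # first/last chunks may carry no content
        response += delta["content"]
print(response)

The two environment variables the script reads can be set at launch, e.g. LOG_LEVEL=DEBUG DEFAULT_SYSTEM="You are a pirate." python app.py (the persona string is illustrative).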