"""Python Application Script for AI chatbot using LLAMA CPP."""
import logging
import os

import gradio as gr
from llama_cpp import Llama

# Setting up environment
log_level = os.environ.get("LOG_LEVEL", "WARNING")
logging.basicConfig(encoding='utf-8', level=log_level)
# Default System Prompt
DEFAULT_SYSTEM_PROMPT = os.environ.get("DEFAULT_SYSTEM", "You are Dolphin, a helpful AI assistant.")
# Model Path
model_path = "model.gguf"
logging.debug("Model Path: %s", model_path)

logging.info("Loading Moddel")
llm = Llama(model_path=model_path, n_ctx=4000, n_threads=2, chat_format="chatml")


def generate(
    message: str,
    history: list[tuple[str, str]],
    system_prompt: str,
    temperature: float = 0.1,
    max_tokens: int = 512,
    top_p: float = 0.95,
    repetition_penalty: float = 1.0,
):
    """Function to generate text.

    :param message: The new user prompt.
    :param history: The history of the chat session.
    :param system_prompt: The system prompt for the model.
    :param temperature: The temperature parameter for the model.
    :param max_tokens: The maximum amount of tokens to use for the model.
    :param top_p: The top p value for the model.
    :param repetition_penalty: The repetition penalty for the model.
    """
    logging.info("Generating Text")
    logging.debug("message: %s", message)
    logging.debug("history: %s", history)
    logging.debug("system: %s", system)
    logging.debug("temperature: %s", temperature)
    logging.debug("max_tokens: %s", max_tokens)
    logging.debug("top_p: %s", top_p)
    logging.debug("repetion_penalty: %s", repetition_penalty)

    # Formatting Prompt
    logging.info("Formatting Prompt")
    formatted_prompt = [{"role": "system", "content": system_prompt}]
    for user_prompt, bot_response in history:
        formatted_prompt.append({"role": "user", "content": user_prompt})
        formatted_prompt.append({"role": "assistant", "content": bot_response})
    formatted_prompt.append({"role": "user", "content": message})
    logging.debug("Formatted Prompt: %s", formatted_prompt)

    # Generating Response
    logging.info("Generating Response")
    stream_response = llm.create_chat_completion(
        messages=formatted_prompt,
        temperature=temperature,
        max_tokens=max_tokens,
        top_p=top_p,
        repeat_penalty=repetition_penalty,
        stream=True,
    )

    # Parsing Response
    logging.info("Parsing Response")
    response = ""
    for chunk in stream_response:
        if (
            len(chunk["choices"][0]["delta"]) != 0
            and "content" in chunk["choices"][0]["delta"]
        ):
            response += chunk["choices"][0]["delta"]["content"]
        logging.debug("Response: %s", response)
        yield response


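# Extra UI controls that gr.ChatInterface passes to generate() after
# (message, history), in the same order as its remaining parameters.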
additional_inputs = [
    gr.Textbox(
        label="System Prompt",
        max_lines=1,
        interactive=True,
        value=DEFAULT_SYSTEM_PROMPT,
    ),
    gr.Slider(
        label="Temperature",
        value=0.9,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values produce more diverse outputs",
    ),
    gr.Slider(
        label="Max new tokens",
        value=256,
        minimum=0,
        maximum=1048,
        step=64,
        interactive=True,
        info="The maximum numbers of new tokens",
    ),
    gr.Slider(
        label="Top-p (nucleus sampling)",
        value=0.90,
        minimum=0.0,
        maximum=1.0,
        step=0.05,
        interactive=True,
        info="Higher values sample more low-probability tokens",
    ),
    gr.Slider(
        label="Repetition penalty",
        value=1.2,
        minimum=1.0,
        maximum=2.0,
        step=0.05,
        interactive=True,
        info="Penalize repeated tokens",
    )
]

examples = []

logging.info("Creating Chatbot")
mychatbot = gr.Chatbot(
    avatar_images=["user.png", "botsc.png"],
    bubble_full_width=False,
    show_label=False,
    show_copy_button=True,
    likeable=True,
)

logging.info("Creating Chat Interface")
iface = gr.ChatInterface(
    fn=generate,
    chatbot=mychatbot,
    additional_inputs=additional_inputs,
    examples=examples,
    concurrency_limit=20,
    title="LLAMA CPP Template"
)

logging.info("Starting Application")
iface.launch(show_api=False, server_name="0.0.0.0")