Spaces:

MaziyarPanahi
/

Chat-with-Mistral

Running

File size: 7,200 Bytes

import gradio as gr
import time
import requests
import json
import os
from urllib3.util.retry import Retry
from requests.adapters import HTTPAdapter

# Endpoint and credentials are read from the environment so that no secret
# has to live in the source file.
API_URL = os.getenv("API_URL")
API_KEY = os.getenv("API_KEY")

print(f"API_URL: {API_URL}")
# Never echo the secret itself into the logs; only report whether it is set.
print(f"API_KEY: {'set' if API_KEY else 'not set'}")

# Full URL of the chat-completions route that the app posts to.
url = f"{API_URL}/v1/chat/completions"

# The headers for the HTTP request
headers = {
    "accept": "application/json",
    "Content-Type": "application/json",
    "Authorization": f"Bearer {API_KEY}",
}


def is_valid_json(data):
    """Check whether *data* parses as JSON.

    Args:
        data: The candidate JSON text (``str``, ``bytes`` or ``bytearray``).

    Returns:
        A ``(True, parsed_value)`` tuple when parsing succeeds, otherwise
        ``(False, error_message)``.
    """
    try:
        return True, json.loads(data)
    # json.loads raises TypeError (not ValueError) for non-text input such as
    # None; report that as "invalid" too instead of letting it propagate.
    except (TypeError, ValueError) as e:
        return False, str(e)


# Build the UI: a header, the chat window, the message box, a clear button
# and two columns of generation-setting controls.
with gr.Blocks() as demo:

    # Page header shown above the chat window.
    markup = gr.Markdown(
        """
                         # Mistral 7B Instruct v0.2
                         This is a demo of the Mistral 7B Instruct quantized model in GGUF (Q2) hosted on K8s cluster.

                         The original models can be found [MaziyarPanahi/Mistral-7B-Instruct-v0.2-GGUF](https://huggingface.co/MaziyarPanahi/Mistral-7B-Instruct-v0.2-GGUF)"""
    )

    # Conversation display, message input and reset button.
    chatbot = gr.Chatbot(height=500)
    msg = gr.Textbox(label="User Message", lines=1)
    clear = gr.Button("Clear")

    with gr.Row():

        # Left column: system prompt, temperature and output length.
        with gr.Column(scale=2):
            system_prompt_input = gr.Textbox(
                value="You are a helpful assistant.",
                placeholder="Type system prompt here...",
                label="System Prompt",
            )
            temperature_input = gr.Slider(
                label="Temperature",
                minimum=0.0,
                maximum=1.0,
                step=0.01,
                value=0.9,
            )
            max_new_tokens_input = gr.Slider(
                label="Max New Tokens",
                minimum=0,
                maximum=1024,
                step=1,
                value=256,
            )

        # Right column: top-p, top-k and repetition penalty.
        with gr.Column(scale=2):
            top_p_input = gr.Slider(
                label="Top P",
                minimum=0.0,
                maximum=1.0,
                step=0.01,
                value=0.95,
            )
            top_k_input = gr.Slider(
                label="Top K",
                minimum=1,
                maximum=100,
                step=1,
                value=50,
            )
            repetition_penalty_input = gr.Slider(
                label="Repetition Penalty",
                minimum=1.0,
                maximum=2.0,
                step=0.01,
                value=1.1,
            )

    def update_globals(
        system_prompt, temperature, max_new_tokens, top_p, top_k, repetition_penalty
    ):
        """Copy the given generation settings into module-level globals.

        Each argument is stored under the matching ``global_<name>`` variable.
        Returns ``None``.
        """
        global global_system_prompt, global_temperature, global_max_new_tokens
        global global_top_p, global_top_k, global_repetition_penalty

        (
            global_system_prompt,
            global_temperature,
            global_max_new_tokens,
            global_top_p,
            global_top_k,
            global_repetition_penalty,
        ) = (
            system_prompt,
            temperature,
            max_new_tokens,
            top_p,
            top_k,
            repetition_penalty,
        )

    def user(user_message, history):
        """Append the submitted message to the chat history as a pending turn.

        Args:
            user_message: Text the user just submitted.
            history: Existing list of ``[user, assistant]`` turns; ``None`` is
                treated as an empty conversation.

        Returns:
            A tuple ``("", new_history)``: the empty string is the new value
            for the message box, and ``new_history`` is a new list ending with
            ``[user_message, None]`` (the reply is not filled in yet). The
            input ``history`` list is not mutated.
        """
        previous_turns = history or []
        return "", previous_turns + [[user_message, None]]

    def _build_chat_messages(history, system_prompt):
        """Turn the chat history into the message list sent to the API."""
        messages = [
            {
                "content": (
                    system_prompt if system_prompt else "You are a helpful assistant."
                ),
                "role": "system",
            }
        ]
        for turn in history:
            user_text, assistant_text = turn[0], turn[1]
            if user_text:
                messages.append({"content": user_text, "role": "user"})
            # Earlier replies are included so the model sees both sides of the
            # conversation; the pending (None/empty) reply is skipped.
            if assistant_text:
                messages.append({"content": assistant_text, "role": "assistant"})
        return messages

    def _parse_stream_delta(raw_line):
        """Return the text delta carried by one streamed line, or "" if none."""
        text = raw_line.decode("utf-8", errors="replace").strip()
        # Strip the literal "data:" prefix; str.lstrip("data: ") would strip a
        # *set of characters*, not the prefix.
        if text.startswith("data:"):
            text = text[len("data:"):].strip()
        if not text:
            return ""
        is_json, event = is_valid_json(text)
        # Non-JSON lines (for example an end-of-stream marker) carry no text.
        if not is_json or not isinstance(event, dict):
            return ""
        choices = event.get("choices") or [{}]
        first_choice = choices[0]
        if not isinstance(first_choice, dict):
            return ""
        delta = first_choice.get("delta") or {}
        if not isinstance(delta, dict):
            return ""
        return delta.get("content") or ""

    def bot(
        history,
        system_prompt,
        temperature,
        max_new_tokens,
        top_p,
        top_k,
        repetition_penalty,
    ):
        """Stream the model's reply into the last turn of *history*.

        Posts the conversation to the chat-completions endpoint with
        ``stream=True`` and appends each received text delta to
        ``history[-1][1]``, yielding the updated history after every delta.

        Args:
            history: List of ``[user, assistant]`` turns; the last turn is the
                one being answered and is mutated in place.
            system_prompt: System message; a default is used when empty.
            temperature: Sampling temperature sent to the API.
            max_new_tokens: Sent to the API as ``max_tokens``.
            top_p: Sent to the API as ``top_p``.
            top_k: Sent to the API as ``top_k``.
            repetition_penalty: Sent to the API as ``repeat_penalty``.

        Yields:
            The ``history`` list after each non-empty delta is appended.

        Request failures and non-200 responses are printed rather than raised,
        in which case the generator ends without yielding further updates.
        """
        print(f"History in bot: {history}")
        print(f"System Prompt: {system_prompt}")
        print(f"Temperature: {temperature}")
        print(f"Max New Tokens: {max_new_tokens}")
        print(f"Top P: {top_p}")
        print(f"Top K: {top_k}")
        print(f"Repetition Penalty: {repetition_penalty}")

        if not history:
            # Nothing to answer; avoid an IndexError on history[-1].
            return

        history_messages = _build_chat_messages(history, system_prompt)
        history[-1][1] = ""
        print(history_messages)

        # Define the retry strategy
        retries = Retry(
            total=5,  # Total number of retries to allow
            backoff_factor=1,  # A backoff factor to apply between attempts
            status_forcelist=[
                500,
                502,
                503,
                504,
            ],  # A set of HTTP status codes that we should force a retry on
            allowed_methods=[
                "HEAD",
                "GET",
                "OPTIONS",
                "POST",
            ],  # HTTP methods to retry on
        )
        payload = {
            "messages": history_messages,
            "stream": True,
            "temperature": temperature,
            "top_k": top_k,
            "top_p": top_p,
            "seed": 42,
            "repeat_penalty": repetition_penalty,
            "chat_format": "mistral-instruct",
            "max_tokens": max_new_tokens,
        }

        # Making the POST request with increased timeout and retry logic
        try:
            # Context managers release the connection even if the consumer
            # stops iterating this generator early.
            with requests.Session() as session:
                adapter = HTTPAdapter(max_retries=retries)
                # Mount for both schemes so https endpoints are retried too.
                session.mount("http://", adapter)
                session.mount("https://", adapter)

                with session.post(
                    url,
                    headers=headers,
                    data=json.dumps(payload),
                    stream=True,
                    timeout=(10, 30),
                ) as response:
                    if response.status_code != 200:
                        print(f"Unexpected status code: {response.status_code}")
                        return

                    for line in response.iter_lines():
                        # Filter out keep-alive new lines
                        if not line:
                            continue
                        delta_content = _parse_stream_delta(line)
                        if delta_content:  # Ensure there's content to print
                            history[-1][1] += delta_content
                            time.sleep(0.05)
                            yield history
        except requests.exceptions.RequestException as e:
            print(f"An error occurred: {e}")

    # Inputs handed to `bot`: the chat history followed by every
    # generation-setting control.
    bot_inputs = [
        chatbot,
        system_prompt_input,
        temperature_input,
        max_new_tokens_input,
        top_p_input,
        top_k_input,
        repetition_penalty_input,
    ]

    # On submit: first record the user's message, then stream the reply.
    submit_event = msg.submit(
        fn=user,
        inputs=[msg, chatbot],
        outputs=[msg, chatbot],
        queue=True,
        concurrency_limit=10,
    )
    submit_event.then(bot, inputs=bot_inputs, outputs=chatbot)

    # The clear button resets the chat window.
    clear.click(fn=lambda: None, inputs=None, outputs=chatbot, queue=False)


# Configure the request queue at import time; launch the server only when
# this file is executed as a script.
demo.queue(
    default_concurrency_limit=20,
    max_size=20,
    api_open=False,
)
if __name__ == "__main__":
    demo.launch(share=False, show_api=False)