"""Gradio chat UI for a local, CPU-only GGUF-quantized Mistral-7B-Instruct model."""

import time  # noqa: F401 -- currently unused; kept in case streaming/timing is added

import gradio as gr
from ctransformers import AutoModelForCausalLM

# Load the quantized model once at startup.
# gpu_layers=0 -> pure CPU inference; context_length caps the prompt window.
model = AutoModelForCausalLM.from_pretrained(
    "mistral-7b-instruct-v0.1.Q6_K.gguf",
    model_type="mistral",
    gpu_layers=0,
    context_length=2048,
)


def generateResponse(prompt, history):
    """Generate a single assistant reply for *prompt*.

    Parameters
    ----------
    prompt : str
        The user's latest message.
    history : list
        Prior chat turns. Supplied and maintained by ``gr.ChatInterface``;
        we do not mutate it here -- the previous manual ``history.append``
        was redundant (ChatInterface rebuilds history on every call).

    Returns
    -------
    str
        The model's generated reply.
    """
    # Mistral-Instruct models expect the [INST] ... [/INST] prompt template.
    formatted_prompt = f"[INST] {prompt} [/INST]"
    return model(formatted_prompt, max_new_tokens=1024)


examples = [
    "Write a poem",
    "Tell me a joke",
    "Write a marketing catch phrase for an AI app",
]
title = "Mistral-7B-Instruct-v0.1-GGUF"
# Q6_K is a 6-bit quantization; the original text incorrectly said "4 bit".
description = (
    "This space is an attempt to run the GGUF 6 bit quantized version of "
    "'Mistral-7B-Instruct-v0.1'."
)

UI = gr.ChatInterface(
    fn=generateResponse,
    examples=examples,
    title=title,
    description=description,
    submit_btn="Submit",
    stop_btn="Stop generating",
    clear_btn="Clear chat",
)

# NOTE(review): `concurrency_count` was removed in Gradio 4.x (replaced by
# queue(default_concurrency_limit=...)). Confirm the pinned Gradio version is
# 3.x before deploying, or migrate this call.
UI.queue(max_size=10, concurrency_count=16)
UI.launch()