"""Gradio chat UI for a local, CPU-only GGUF-quantized Mistral-7B-Instruct model."""

import time  # noqa: F401 -- currently unused; kept in case streaming/timing is added

import gradio as gr
from ctransformers import AutoModelForCausalLM

# Load the quantized model once at startup.
# gpu_layers=0 -> pure CPU inference; context_length caps the prompt window.
model = AutoModelForCausalLM.from_pretrained(
    "mistral-7b-instruct-v0.1.Q6_K.gguf",
    model_type="mistral",
    gpu_layers=0,
    context_length=2048,
)


def generateResponse(prompt, history):
    """Generate a single assistant reply for *prompt*.

    Parameters
    ----------
    prompt : str
        The user's latest message.
    history : list
        Prior chat turns. Supplied and maintained by ``gr.ChatInterface``;
        we do not mutate it here -- the previous manual ``history.append``
        was redundant (ChatInterface rebuilds history on every call).

    Returns
    -------
    str
        The model's generated reply.
    """
    # Mistral-Instruct models expect the [INST] ... [/INST] prompt template.
    formatted_prompt = f"[INST] {prompt} [/INST]"
    return model(formatted_prompt, max_new_tokens=1024)


examples = [
    "Write a poem",
    "Tell me a joke",
    "Write a marketing catch phrase for an AI app",
]
title = "Mistral-7B-Instruct-v0.1-GGUF"
# Q6_K is a 6-bit quantization; the original text incorrectly said "4 bit".
description = (
    "This space is an attempt to run the GGUF 6 bit quantized version of "
    "'Mistral-7B-Instruct-v0.1'."
)

UI = gr.ChatInterface(
    fn=generateResponse,
    examples=examples,
    title=title,
    description=description,
    submit_btn="Submit",
    stop_btn="Stop generating",
    clear_btn="Clear chat",
)

# NOTE(review): `concurrency_count` was removed in Gradio 4.x (replaced by
# queue(default_concurrency_limit=...)). Confirm the pinned Gradio version is
# 3.x before deploying, or migrate this call.
UI.queue(max_size=10, concurrency_count=16)
UI.launch()