import gradio as gr
import os

from ctransformers import AutoModelForCausalLM

# Load the quantized Mistral 7B Instruct model (GGUF) with streaming enabled.
llm = AutoModelForCausalLM.from_pretrained(
    "TheBloke/Mistral-7B-Instruct-v0.1-GGUF",
    model_file="mistral-7b-instruct-v0.1.Q4_K_M.gguf",
    model_type="mistral",
    temperature=0.7,
    gpu_layers=0,                   # 0 = run entirely on CPU
    stream=True,                    # yield tokens as they are generated
    threads=os.cpu_count() or 1,    # use all available CPU cores
    max_new_tokens=10000,           # upper bound on generated tokens
)

# Function to generate model predictions.
def predict(message, history):
    # Append the new user message (with an empty assistant slot) to the history.
    history_transformer_format = history + [[message, ""]]

    # Formatting the input for the model. Note: this is a simple ad-hoc chat
    # template; Mistral's official format wraps each user turn in [INST]...[/INST].
    messages = "".join(
        "\n<|user|>:" + user_msg + "\n<|assistant|>:" + assistant_msg
        for user_msg, assistant_msg in history_transformer_format
    )
    prompt = f"[INST]{messages}[/INST]"

    # Stream the response back to the UI, token by token.
    message_out = ""
    for text in llm(prompt):
        message_out += text
        yield message_out

# Setting up the Gradio chat interface.
gr.ChatInterface(
    predict,
    title="Test Mistral 7B",
    description="Ask Mistral any questions",
    examples=["How to cook a fish?", "Who is the president of US now?"],
).launch()  # Launching the web interface.
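
# A minimal way to run this script, assuming a fresh Python environment and
# that the file is saved as app.py (a hypothetical name, not set by the script):
#   pip install gradio ctransformers
#   python app.py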