import gradio as gr
import random
import time

from ctransformers import AutoModelForCausalLM

# Load the GGUF model via ctransformers. gpu_layers is the number of layers
# to offload to GPU; set to 0 when no GPU acceleration is available.
# hf=True wraps the model so it is usable with the transformers pipeline API.
model = AutoModelForCausalLM.from_pretrained(
    "manan05/mistral-7b-friends-v0.1.gguf",
    model_file="mistralfriends-7b-v0.1.gguf",
    model_type="mistral",
    gpu_layers=0,
    hf=True,
)

from transformers import AutoTokenizer, pipeline

# Tokenizer comes from the (non-GGUF) fine-tuned repository.
tokenizer = AutoTokenizer.from_pretrained("manan05/mistral-7b-friends")

# Shared text-generation pipeline used by every chat request.
generator = pipeline(
    model=model,
    tokenizer=tokenizer,
    task="text-generation",
    max_new_tokens=50,
    repetition_penalty=1.1,
)

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.ClearButton([msg, chatbot])

    def respond(message, chat_history):
        """Generate Joey's reply to *message* and append the turn to history.

        Parameters
        ----------
        message : str
            The text the user submitted in the textbox.
        chat_history : list[tuple[str, str]]
            Gradio chatbot history of (user, bot) message pairs; mutated in
            place and also returned, per the Gradio callback convention.

        Returns
        -------
        tuple[str, list]
            ("", updated_history) — the empty string clears the textbox.
        """
        prompt = (
            "[INST] Given the following conversation context, generate the "
            "upcomming dialogue of Joey in his style. \n CONTEXT: Me: "
            + message
            + "[/INST]"
        )
        # BUG FIX: the original wrote generator(user_message[0]["generated_text"]),
        # which subscripts the *prompt string* (user_message[0] is the character
        # "[", so ["generated_text"] raises TypeError). The subscript belongs on
        # the pipeline's output list-of-dicts.
        outputs = generator(prompt)
        bot_message = outputs[0]["generated_text"]
        # Show the user's own message in the chat UI, not the raw [INST] prompt
        # template (the original appended the full prompt to the history).
        chat_history.append((message, bot_message))
        # NOTE(review): the original had time.sleep(2) here — leftover from the
        # Gradio random-responder tutorial; removed to avoid pointless latency.
        return "", chat_history

    msg.submit(respond, [msg, chatbot], [msg, chatbot])

demo.launch()