import gradio as gr
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

# bitsandbytes and accelerate must be installed for 4-bit loading, but they
# are used internally by transformers and need no direct import here.
# (A pipeline-based alternative is sketched at the end of the file.)

# 4-bit NF4 quantization: double quantization with bfloat16 compute.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# Only the model takes a quantization config; the tokenizer does not.
tokenizer = AutoTokenizer.from_pretrained("llSourcell/medllama2_7b")
model = AutoModelForCausalLM.from_pretrained(
    "llSourcell/medllama2_7b",
    quantization_config=nf4_config,
    device_map="auto",  # place the quantized weights on the available GPU
)

with gr.Blocks() as demo:
    chatbot = gr.Chatbot()
    msg = gr.Textbox()
    clear = gr.ClearButton([msg, chatbot])

    def respond(message, chat_history):
        # Move the tokenized prompt to the model's device before generating.
        inputs = tokenizer(message, return_tensors="pt").to(model.device)
        # max_new_tokens bounds only the reply; the original max_length=30
        # counted prompt tokens too and truncated most answers.
        generate_ids = model.generate(inputs.input_ids, max_new_tokens=256)
        # Decode only the newly generated tokens, not the echoed prompt.
        bot_message = tokenizer.batch_decode(
            generate_ids[:, inputs.input_ids.shape[1]:],
            skip_special_tokens=True,
            clean_up_tokenization_spaces=False,
        )[0]
        chat_history.append((message, bot_message))
        return "", chat_history

    msg.submit(respond, [msg, chatbot], [msg, chatbot])

demo.launch()
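# The original code imported `pipeline` ("use a pipeline as a high-level
# helper") but never called it. A minimal sketch of that alternative,
# assuming the same model id: pipeline() wraps tokenization, generation,
# and decoding in one call. Left commented out so it does not load a
# second copy of the model alongside the demo above.
#
# from transformers import pipeline
# pipe = pipeline(
#     "text-generation",
#     model="llSourcell/medllama2_7b",
#     model_kwargs={"quantization_config": nf4_config},
#     device_map="auto",
# )
# print(pipe("What are the symptoms of anemia?", max_new_tokens=64)[0]["generated_text"])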