import gradio as gr
from transformers import AutoTokenizer
from auto_gptq import AutoGPTQForCausalLM

# 2-bit GPTQ-quantized Vicuna-7B weights; the tokenizer comes from the original model.
model_path = 'vita-group/vicuna-7b-v1.3_gptq'
tokenizer_path = 'lmsys/vicuna-7b-v1.3'

# disable_exllama=True is required here: the ExLlama kernel only supports 4-bit
# GPTQ weights, and this revision is quantized to 2 bits (group size 128).
model = AutoGPTQForCausalLM.from_quantized(
    model_path,
    disable_exllama=True,
    device_map='auto',
    revision='2bit_128g',
)
tokenizer = AutoTokenizer.from_pretrained(tokenizer_path, trust_remote_code=True)

def generate_response(prompt):
    # Tokenize the prompt and move it to the GPU alongside the model.
    input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to('cuda')
    # max_length caps the total of prompt plus generated tokens at 128.
    outputs = model.generate(input_ids=input_ids, max_length=128)
    decoded_output = tokenizer.decode(outputs[0])
    return decoded_output

# Expose the function as a simple text-in/text-out Gradio web demo.
iface = gr.Interface(fn=generate_response, inputs="text", outputs="text")
iface.launch()
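To sanity-check the model before starting the web UI, `generate_response` can also be called directly; the prompt string below is just an illustrative placeholder:

# Quick smoke test with a hypothetical prompt (run after the setup above).
print(generate_response("List three uses of model quantization."))

Running the script as written instead launches a local Gradio app, by default at http://127.0.0.1:7860.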