# %%
import os

# Force CPU execution by hiding all GPUs from PyTorch.
os.environ["CUDA_VISIBLE_DEVICES"] = ""

import gradio as gr
import torch
from peft import PeftModel
from transformers import GenerationConfig, LlamaForCausalLM, LlamaTokenizer

device = "cuda" if torch.cuda.is_available() else "cpu"
device_map = {"": 0}


def generate_instruction_prompt(instruction, input=None):
    """Build an Alpaca-style prompt, with or without an additional input field."""
    if input:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:"""


def evaluate(
    model,
    tokenizer,
    instruction,
    input=None,
    temperature=0.1,
    top_p=0.75,
    num_beams=4,
    max_token=256,
):
    """Generate a completion and return the text after the "### Response:" marker."""
    generation_config = GenerationConfig(
        temperature=temperature,
        top_p=top_p,
        num_beams=num_beams,
        top_k=40,
        no_repeat_ngram_size=3,
    )
    prompt = generate_instruction_prompt(instruction, input)
    inputs = tokenizer(prompt, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)
    generation_output = model.generate(
        input_ids=input_ids,
        generation_config=generation_config,
        return_dict_in_generate=True,
        output_scores=True,
        max_new_tokens=max_token,
    )
    s = generation_output.sequences[0]
    output = tokenizer.decode(s)
    res = output.split("### Response:")[1].strip()
    print("Response:", res)
    return res


def load_lora(lora_path, base_model="decapoda-research/llama-7b-hf"):
    """Load the base LLaMA model and attach the LoRA adapter on top of it."""
    model = LlamaForCausalLM.from_pretrained(
        base_model,
        # load_in_8bit=True,
        # device_map=device_map,
        low_cpu_mem_usage=True,
        # torch_dtype=torch.float16,
    )
    print("Loading LoRA...")
    lora = PeftModel.from_pretrained(
        model,
        lora_path,
        torch_dtype=torch.float16,
        # device_map=device_map,
    )
    return lora


base_model = "decapoda-research/llama-7b-hf"
tokenizer = LlamaTokenizer.from_pretrained(base_model)
# question = "如果今天是星期五, 那么后天是星期几?"  # "If today is Friday, what day of the week is the day after tomorrow?"
model = load_lora(lora_path="facat/alpaca-lora-cn", base_model=base_model)

# Thin wrapper that binds the loaded model and tokenizer for the Gradio callback.
eval = lambda question, input, temperature, beams, max_token: evaluate(
    model,
    tokenizer,
    question,
    input=input,
    temperature=temperature,
    num_beams=beams,
    max_token=max_token,
)

gr.Interface(
    fn=eval,
    inputs=[
        gr.components.Textbox(
            lines=2, label="Instruction", placeholder="Tell me about alpacas."
        ),
        gr.components.Textbox(lines=2, label="Input", placeholder="none"),
        gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Temperature"),
        # gr.components.Slider(minimum=0, maximum=1, value=0.75, label="Top p"),
        # gr.components.Slider(minimum=0, maximum=100, step=1, value=40, label="Top k"),
        gr.components.Slider(minimum=1, maximum=4, step=1, value=4, label="Beams"),
        gr.components.Slider(
            minimum=1, maximum=512, step=1, value=256, label="Max tokens"
        ),
    ],
    outputs=[
        gr.components.Textbox(
            lines=8,
            label="Output",
        )
    ],
    title="Alpaca-LoRA",
    description="Alpaca-LoRA",
).launch()