import sys
import torch
import torch.nn as nn
import transformers
import gradio as gr
import argparse
import warnings
import os
import quant
from gptq import GPTQ
from datautils import get_loaders

assert (
    "LlamaTokenizer" in transformers._import_structure["models.llama"]
), "LLaMA is now in HuggingFace's main branch.\nPlease reinstall it: pip uninstall transformers && pip install git+https://github.com/huggingface/transformers.git"
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig


def find_layers(module, layers=[nn.Conv2d, nn.Linear], name=''):
    # Recursively collect all sub-modules of the given types, keyed by their dotted attribute path.
    if type(module) in layers:
        return {name: module}
    res = {}
    for name1, child in module.named_children():
        res.update(find_layers(child, layers=layers, name=name + '.' + name1 if name != '' else name1))
    return res


def load_quant(model, checkpoint, wbits, groupsize=-1, fused_mlp=True, eval=True, warmup_autotune=True):
    # Build an empty fp16 LLaMA skeleton, replace its Linear layers with quantized ones,
    # then load the quantized checkpoint into it.
    from transformers import LlamaConfig, LlamaForCausalLM
    config = LlamaConfig.from_pretrained(model)

    def noop(*args, **kwargs):
        pass

    # Skip random weight initialization: the real weights come from the quantized checkpoint.
    torch.nn.init.kaiming_uniform_ = noop
    torch.nn.init.uniform_ = noop
    torch.nn.init.normal_ = noop

    torch.set_default_dtype(torch.half)
    transformers.modeling_utils._init_weights = False
    torch.set_default_dtype(torch.half)
    model = LlamaForCausalLM(config)
    torch.set_default_dtype(torch.float)
    if eval:
        model = model.eval()

    # Replace every Linear layer except the LM head with a quantized linear layer.
    layers = find_layers(model)
    for name in ['lm_head']:
        if name in layers:
            del layers[name]
    quant.make_quant_linear(model, layers, wbits, groupsize)
    del layers

    print('Loading model ...')
    model.load_state_dict(torch.load(checkpoint), strict=False)
    quant.make_quant_attn(model)
    if eval and fused_mlp:
        quant.make_fused_mlp(model)
    if warmup_autotune:
        quant.autotune_warmup_linear(model, transpose=not (eval))
        if eval and fused_mlp:
            quant.autotune_warmup_fused(model)
    model.seqlen = 2048
    print('Done.')
    return model


def generate_prompt(instruction, input=None):
    if input:
        return f"""Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Input:
{input}

### Response:"""
    else:
        return f"""Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
{instruction}

### Response:"""


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_path", type=str, default="decapoda-research/llama-7b-hf", help="llama huggingface model to load")
    parser.add_argument("--quant_path", type=str, default="llama7b-8bit-128g.pt", help="path to the quantized model checkpoint")
    parser.add_argument("--wbits", type=int, default=4, choices=[2, 3, 4, 8], help="bits to use for quantization; use 8 for evaluating base model.")
    parser.add_argument('--text', type=str, default='the mean of life is', help='input text')
    parser.add_argument('--min_length', type=int, default=10, help='The minimum length of the sequence to be generated.')
    parser.add_argument('--max_length', type=int, default=256, help='The maximum length of the sequence to be generated.')
    parser.add_argument('--top_p', type=float, default=0.95, help='If set to float < 1, only the smallest set of most probable tokens with probabilities that add up to top_p or higher are kept for generation.')
    parser.add_argument('--temperature', type=float, default=0.1, help='The value used to modulate the next token probabilities.')
    parser.add_argument('--repetition_penalty', type=float, default=2.0, help='The parameter for repetition penalty. 1.0 means no penalty (range 0~10).')
    parser.add_argument('--groupsize', type=int, default=-1, help='Groupsize to use for quantization; default uses full row.')
    parser.add_argument('--gradio', action='store_true', help='Whether to use gradio to present results.')
    args = parser.parse_args()

    device = "cuda" if torch.cuda.is_available() else "cpu"

    model = load_quant(args.model_path, args.quant_path, args.wbits, args.groupsize)
    model.to(device)
    tokenizer = LlamaTokenizer.from_pretrained(args.model_path)
    model.eval()
    if torch.__version__ >= "2" and sys.platform != "win32":
        model = torch.compile(model)

    # [Way 1]: generate directly from the command-line text
    if not args.gradio:
        input_ids = tokenizer.encode(args.text, return_tensors="pt").to(device)
        with torch.no_grad():
            generated_ids = model.generate(
                input_ids,
                min_new_tokens=args.min_length,
                max_new_tokens=args.max_length,
                top_p=args.top_p,
                temperature=args.temperature,
                repetition_penalty=args.repetition_penalty,
            )
        print("*" * 80)
        print("🦙:", tokenizer.decode([el.item() for el in generated_ids[0]], skip_special_tokens=True))
    # [Way 2]: generate through the gradio interface
    else:
        def evaluate(
            input,
            temperature=0.1,
            top_p=0.75,
            top_k=40,
            num_beams=1,
            max_new_tokens=128,
            repetition_penalty=1.0,
            **kwargs,
        ):
            prompt = generate_prompt(input)
            inputs = tokenizer(prompt, return_tensors="pt")
            input_ids = inputs["input_ids"].to(device)
            generation_config = GenerationConfig(
                temperature=temperature,
                top_p=top_p,
                top_k=top_k,
                num_beams=num_beams,
                **kwargs,
            )
            with torch.no_grad():
                generation_output = model.generate(
                    input_ids=input_ids,
                    generation_config=generation_config,
                    return_dict_in_generate=True,
                    output_scores=True,
                    max_new_tokens=max_new_tokens,
                    repetition_penalty=float(repetition_penalty),
                )
            s = generation_output.sequences[0]
            output = tokenizer.decode(s, skip_special_tokens=True)
            # The prompt is echoed back in the output; keep only the text after "### Response:".
            return output.split("### Response:")[1].strip()

        gr.Interface(
            fn=evaluate,
            inputs=[
                gr.components.Textbox(lines=2, label="Input", placeholder="Tell me about alpacas."),
                gr.components.Slider(minimum=0, maximum=1, value=0.1, label="Temperature"),
                gr.components.Slider(minimum=0, maximum=1, value=0.75, label="Top p"),
                gr.components.Slider(minimum=0, maximum=100, step=1, value=40, label="Top k"),
                gr.components.Slider(minimum=1, maximum=5, step=1, value=1, label="Beams"),
                gr.components.Slider(minimum=1, maximum=2000, step=1, value=256, label="Max tokens"),
                gr.components.Slider(minimum=0.1, maximum=10.0, step=0.1, value=1.0, label="Repetition Penalty"),
            ],
            outputs=[
                gr.components.Textbox(
                    lines=5,
                    label="Output",
                )
            ],
            title="Chinese-Vicuna 中文小羊驼",
            description="Chinese-Vicuna is trained with the Alpaca-lora code on various high-quality open-source instruction datasets. The model is based on the open-source llama7B, and the main contribution is the corresponding lora model. Since training requires relatively few resources, we hope to make a contribution to the llama Chinese lora community.",
        ).launch(share=True)


if __name__ == '__main__':
    main()
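# Example invocations (illustrative sketch; assumes this file is saved as quant_generate.py and
# that a GPTQ-quantized checkpoint matching --wbits/--groupsize already exists locally --
# the llama7b-8bit-128g.pt filename below is just the default from --quant_path, not a file shipped here):
#   python quant_generate.py --model_path decapoda-research/llama-7b-hf \
#       --quant_path llama7b-8bit-128g.pt --wbits 8 --groupsize 128 --text "the mean of life is"
#   python quant_generate.py --model_path decapoda-research/llama-7b-hf \
#       --quant_path llama7b-8bit-128g.pt --wbits 8 --groupsize 128 --gradio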