import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
import torch
from peft import PeftModel, PeftConfig
base_model = "TinyPixel/Llama-2-7B-bf16-sharded"
tuned_adapter = "newronai/llama-2-7b-QLoRA-Trial1"
# Alternative: 4-bit NF4 quantization (pass as quantization_config below).
# bnb_config = BitsAndBytesConfig(
#     load_in_4bit=True,
#     bnb_4bit_quant_type="nf4",
#     bnb_4bit_compute_dtype=torch.float16,
# )
config = PeftConfig.from_pretrained(tuned_adapter)
# Load the sharded base model in 8-bit; device_map="auto" is required for
# 8-bit loading and spreads the weights across the available GPU(s).
model = AutoModelForCausalLM.from_pretrained(
    base_model,
    cache_dir="cache",
    load_in_8bit=True,
    device_map="auto",
    # quantization_config=bnb_config,
)
# Attach the QLoRA adapter weights on top of the quantized base model.
model = PeftModel.from_pretrained(model, tuned_adapter)
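# Inference only: put the adapter-wrapped model in eval mode (disables dropout).
model.eval()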
print("Model Downloaded")
tokenizer = AutoTokenizer.from_pretrained(base_model, cache_dir="cache")
# Llama 2 defines no pad token, so reuse the EOS token for padding.
tokenizer.pad_token = tokenizer.eos_token
print("Tokenizer Ready")
def question_answer(context, question):
    # Fold the context paragraph into the prompt so the model can ground its answer.
    prompt = f"{context}\n\nQuestion: {question}\nAnswer:"
    input_tokens = tokenizer.encode(prompt, return_tensors="pt").to("cuda")
    output = model.generate(input_tokens, max_new_tokens=128)
    output_text = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
    return output_text
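# A minimal smoke test, assuming a CUDA GPU is available (the strings below
# are placeholder examples, not from the training data); uncomment to try
# the pipeline without the UI:
# print(question_answer(
#     "The Eiffel Tower was completed in 1889 and stands in Paris.",
#     "When was the Eiffel Tower completed?",
# ))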
gr.Interface(
    fn=question_answer,
    inputs=[
        gr.Textbox(lines=7, label="Context Paragraph"),
        gr.Textbox(lines=2, label="Question"),
    ],
    outputs=[gr.Textbox(label="Answer")],
).launch()