|
import gradio as gr |
|
from unsloth import FastLanguageModel |
|
from transformers import TextStreamer |
|
import torch |
|
|
|
|
|
def load_model(model_name, max_seq_length, dtype, load_in_4bit, token=None):
    """Fetch a pretrained checkpoint through Unsloth and prep it for generation.

    Args:
        model_name: Hub id or local path of the checkpoint to load.
        max_seq_length: Maximum context window to configure.
        dtype: Torch dtype to load with (None lets Unsloth choose).
        load_in_4bit: Whether to load 4-bit quantized weights.
        token: Optional Hugging Face access token for gated repos.

    Returns:
        Tuple of (model, tokenizer), with Unsloth's inference mode already
        enabled on the model.
    """
    load_kwargs = dict(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=dtype,
        load_in_4bit=load_in_4bit,
        token=token,
    )
    model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)
    # Switch the model into Unsloth's fast-inference path before returning it.
    FastLanguageModel.for_inference(model)
    return model, tokenizer
|
|
|
|
|
# Hugging Face model id for the Phi-3 mini (4k context) checkpoint via Unsloth.
model_name = "unsloth/Phi-3-mini-4k-instruct"

# Optional HF access token; None is fine for public checkpoints like this one.
token = None


# Load once at script start-up. dtype=None lets Unsloth auto-select;
# load_in_4bit=True uses quantized weights to reduce memory use.
model, tokenizer = load_model(model_name, max_seq_length=2048, dtype=None, load_in_4bit=True, token=token)
|
|
|
def generate_response(instruction, input_text, max_new_tokens):
    """Generate a completion for an Alpaca-style prompt and return it as text.

    Args:
        instruction: Task description placed in the "### Instruction:" slot.
        input_text: Supporting context placed in the "### Input:" slot.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The full decoded sequence (prompt echo plus generated continuation)
        with special tokens removed.
    """
    alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

    # Leave the response slot empty so the model fills it in.
    prompt = alpaca_prompt.format(instruction, input_text, "")

    # BUG FIX: inputs were previously forced onto "cpu", which causes a device
    # mismatch when the 4-bit model's weights live on GPU. Using model.device
    # keeps the inputs co-located with the weights on any hardware.
    inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

    # Stream tokens to stdout as they are produced, then decode the final ids
    # for the Gradio textbox.
    text_streamer = TextStreamer(tokenizer)
    output = model.generate(**inputs, streamer=text_streamer, max_new_tokens=max_new_tokens)

    response = tokenizer.decode(output[0], skip_special_tokens=True)
    return response
|
|
|
|
|
# Build the Gradio front-end: two text inputs plus a token-budget slider,
# wired to generate_response, with the answer shown in a textbox.
instruction_box = gr.Textbox(lines=2, label="Instruction", placeholder="Continue the Fibonacci sequence.")
context_box = gr.Textbox(lines=2, label="Input", placeholder="1, 1, 2, 3, 5, 8")
token_slider = gr.Slider(1, 2048, value=128, step=1, label="Max New Tokens")
response_box = gr.Textbox(label="Response", lines=10)

iface = gr.Interface(
    fn=generate_response,
    inputs=[instruction_box, context_box, token_slider],
    outputs=response_box,
    title="Language Model Chat UI",
)

# Start the local web server for the UI.
iface.launch()
|
|