import gradio as gr
import torch
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
TextIteratorStreamer,
)
import os
from threading import Thread
import spaces
import subprocess
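# Install FlashAttention at startup; FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE skips
# the CUDA compilation step so the install succeeds even when no GPU is visible
# at build time (a common pattern on ZeroGPU Spaces).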
subprocess.run(
"pip install flash-attn --no-build-isolation",
env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
shell=True,
)
DESCRIPTION = """\
# Phi-3-mini-4k-instruct
This Space demonstrates the open-source [microsoft/Phi-3-mini-4k-instruct](https://huggingface.co/microsoft/Phi-3-mini-4k-instruct) model.
- Refer to the [Hugging Face paper page for Phi-3](https://huggingface.co/papers/2404.14219) for more information.
"""
token = os.environ["HF_TOKEN"]
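# trust_remote_code is needed because the Phi-3 checkpoint shipped custom
# modeling code when it was released; newer transformers versions also
# support Phi-3 natively.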
model = AutoModelForCausalLM.from_pretrained(
"microsoft/Phi-3-mini-4k-instruct",
token=token,
trust_remote_code=True,
)
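# (Assumption, not in the original: passing torch_dtype=torch.bfloat16 above
# would roughly halve memory use, provided the target GPU supports bf16.)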
tok = AutoTokenizer.from_pretrained("microsoft/Phi-3-mini-4k-instruct", token=token)
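# Token ids that terminate generation; here just the tokenizer's EOS token.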
terminators = [
tok.eos_token_id,
]
if torch.cuda.is_available():
device = torch.device("cuda")
print(f"Using GPU: {torch.cuda.get_device_name(device)}")
else:
device = torch.device("cpu")
print("Using CPU")
model = model.to(device)
# Reserve a ZeroGPU device for each call, for at most 60 seconds.
@spaces.GPU(duration=60)
def chat(message, history, temperature, do_sample, max_tokens):
    # Rebuild the message list from Gradio's (user, assistant) history pairs.
    conversation = []
    for item in history:
        conversation.append({"role": "user", "content": item[0]})
        if item[1] is not None:
            conversation.append({"role": "assistant", "content": item[1]})
    conversation.append({"role": "user", "content": message})
    # Render the conversation into the model's chat format as a single prompt string.
    prompt = tok.apply_chat_template(conversation, tokenize=False, add_generation_prompt=True)
    model_inputs = tok([prompt], return_tensors="pt").to(device)
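    # TextIteratorStreamer yields decoded text as generate() produces tokens;
    # skip_prompt drops the echoed input, and the timeout aborts a stalled stream.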
streamer = TextIteratorStreamer(
tok, timeout=20.0, skip_prompt=True, skip_special_tokens=True
)
generate_kwargs = dict(
model_inputs,
streamer=streamer,
max_new_tokens=max_tokens,
        do_sample=do_sample,  # honor the Sampling checkbox instead of hard-coding True
temperature=temperature,
eos_token_id=terminators,
)
    # A temperature of 0 means greedy decoding regardless of the checkbox.
    if temperature == 0:
        generate_kwargs["do_sample"] = False
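    # Run generate() in a background thread so this generator can yield partial
    # results from the streamer while generation is still running.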
t = Thread(target=model.generate, kwargs=generate_kwargs)
t.start()
    # Accumulate and stream partial output to the UI as tokens arrive.
    partial_text = ""
    for new_text in streamer:
        partial_text += new_text
        yield partial_text
    # Emit the final text once more after the stream is exhausted.
    yield partial_text
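# Build the chat UI. additional_inputs are passed positionally to chat() after
# (message, history), i.e. temperature, do_sample, max_tokens.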
demo = gr.ChatInterface(
fn=chat,
examples=[
["Explain quantum physics in 5 words or less:"],
["Question: What do you call a bear with no teeth?\nAnswer:"],
],
additional_inputs_accordion=gr.Accordion(
label="⚙️ Parameters", open=False, render=False
),
additional_inputs=[
gr.Slider(
minimum=0, maximum=1, step=0.1, value=0.9, label="Temperature", render=False
),
        gr.Checkbox(label="Sampling", value=True, render=False),
gr.Slider(
minimum=128,
maximum=4096,
step=1,
value=512,
label="Max new tokens",
render=False,
),
],
stop_btn="Stop Generation",
title="Chat with Microsoft PHI",
description=DESCRIPTION,
)
demo.launch()