import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import gradio as gr
import os
from threading import Thread
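# Housekeeping before loading the model: clear any leftover ZeroGPU offload
# cache (the /data-nvme/zerogpu-offload path appears to be Space-specific) and
# opt into PyTorch's expandable CUDA segments to reduce memory fragmentation.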
os.system("rm -rf /data-nvme/zerogpu-offload/*")
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'
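# Load the weights onto the GPU; torch_dtype="auto" uses the dtype stored in
# the checkpoint, and trust_remote_code allows any custom modeling code
# shipped with the NyxKrage/Microsoft_Phi-4 repo to run.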
model = AutoModelForCausalLM.from_pretrained(
    "NyxKrage/Microsoft_Phi-4",
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("NyxKrage/Microsoft_Phi-4")
# skip_prompt keeps the echoed input out of the stream so only newly
# generated text is yielded to the UI.
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
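# On Hugging Face ZeroGPU Spaces, @spaces.GPU attaches a GPU to the process
# for the duration of each call to the decorated function.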
@spaces.GPU
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    seed,
):
    # Rebuild the full conversation: system prompt, prior turns, new message.
    messages = [{"role": "system", "content": system_message}]
    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})
    messages.append({"role": "user", "content": message})
    # Render the chat as model input; add_generation_prompt appends the
    # assistant turn header so the model continues as the assistant.
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to("cuda")
    torch.random.manual_seed(seed)
    generation_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=max_tokens,
        temperature=temperature,
        streamer=streamer,
        top_p=top_p,
        do_sample=True,
    )
    response = ""
    # Run generation in a separate thread so the streamer can be consumed
    # here without blocking.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    # Yield the accumulated text as tokens arrive, so the UI updates live.
    for new_text in streamer:
        response += new_text
        yield response
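# Minimal sketch of exercising respond() outside Gradio (assumes a live GPU
# session; the arguments mirror the sliders defined below):
#   for partial in respond("Hello!", [], "You are a friendly Chatbot.", 64, 0.7, 0.95, 42):
#       print(partial)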
with gr.Blocks() as demo:
    with gr.Row():
        gr.LoginButton(min_width=100)
        gr.Markdown("""
This is the space I built.
As of 2025/1/7, this is the first Phi-4 space.
If this helps you and you can spare it, could you send me $1? I am facing a financial crisis.
If you do, I will pass the kindness on.
This is my bank card number: 5592921230414708
Thank you!!
""")
    with gr.Row():
        with gr.Column():
            system_message = gr.Textbox(value="You are a friendly Chatbot.", label="System message")
            max_tokens = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens")
            temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
            top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
            seed = gr.Slider(minimum=0, maximum=20091114, value=42, step=1, label="Seed")
        with gr.Column():
            gr.ChatInterface(
                respond,
                additional_inputs=[system_message, max_tokens, temperature, top_p, seed],
            )
if __name__ == "__main__":
    demo.launch()