import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer
import gradio as gr
import os
from threading import Thread

os.system("rm -rf /data-nvme/zerogpu-offload/*")

# Let the CUDA caching allocator grow segments, which reduces fragmentation-related OOMs
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

model = AutoModelForCausalLM.from_pretrained(
    "NyxKrage/Microsoft_Phi-4", 
    device_map="cuda", 
    torch_dtype="auto", 
    trust_remote_code=True, 
)
tokenizer = AutoTokenizer.from_pretrained("NyxKrage/Microsoft_Phi-4")

# Skip the prompt and special tokens so only newly generated text is streamed back to the UI
streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

# On ZeroGPU Spaces, @spaces.GPU attaches a GPU to this function only while a call is running
@spaces.GPU
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    seed,
):
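    """Rebuild the conversation for the chat template, then stream the model's reply as it is generated."""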
    messages = [{"role": "system", "content": system_message}]

    for val in history:
        if val[0]:
            messages.append({"role": "user", "content": val[0]})
        if val[1]:
            messages.append({"role": "assistant", "content": val[1]})

    messages.append({"role": "user", "content": message})

    # Tokenize the conversation with the model's chat template and open the assistant turn
    input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to("cuda")

    torch.random.manual_seed(seed)

    # Arguments for model.generate(); the streamer receives tokens as they are produced
    generation_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=max_tokens,
        do_sample=True,
        temperature=temperature,
        top_p=top_p,
        streamer=streamer,
    )
    
    response = ""
    # Run the generation in a separate thread, so that we can fetch the generated text in a non-blocking way.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    # Yield the accumulated text as new tokens arrive so the UI updates in real time
    for new_text in streamer:
        response += new_text
        yield response
    thread.join()
    
# Gradio UI: a login button and note on top, sampling controls on the left, the chat interface on the right
with gr.Blocks() as demo:
    with gr.Row():
        gr.LoginButton(min_width=100)
        gr.Markdown("""
                    This is the space I built.
                    As of 2025/1/7, it is the first Phi-4 space.
                    If it helps you and you can spare it, could you send me $1? I am facing a financial crisis.
                    If you do, I will pass the kindness on.
                    This is my bank card number: 5592921230414708
                    Thank you!!
                    """)
    
    with gr.Row():
        with gr.Column():
            system_message = gr.Textbox(value="You are a friendly Chatbot.", label="System message")
            max_tokens = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens")
            temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
            top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
            seed = gr.Slider(minimum=0, maximum=20091114, value=42, step=1, label="Seed")
        
        with gr.Column():
            gr.ChatInterface(
                respond,
                additional_inputs=[system_message, max_tokens, temperature, top_p, seed],
            )


if __name__ == "__main__":
    demo.launch()
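
# Assumed Space dependencies (a sketch, not taken from the original repo's requirements.txt):
# torch, transformers, accelerate (needed for device_map), gradio, and the `spaces` package for ZeroGPU.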