import os
from threading import Thread

import gradio as gr
import spaces
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TextIteratorStreamer

# Clear any stale ZeroGPU offload cache and let the CUDA allocator grow on demand.
os.system("rm -rf /data-nvme/zerogpu-offload/*")
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

model = AutoModelForCausalLM.from_pretrained(
    "NyxKrage/Microsoft_Phi-4",
    device_map="cuda",
    torch_dtype="auto",
    trust_remote_code=True,
)
tokenizer = AutoTokenizer.from_pretrained("NyxKrage/Microsoft_Phi-4")


@spaces.GPU
def respond(
    message,
    history: list[tuple[str, str]],
    system_message,
    max_tokens,
    temperature,
    top_p,
    seed,
):
    # Rebuild the full conversation in the chat-template message format.
    messages = [{"role": "system", "content": system_message}]
    for user_msg, assistant_msg in history:
        if user_msg:
            messages.append({"role": "user", "content": user_msg})
        if assistant_msg:
            messages.append({"role": "assistant", "content": assistant_msg})
    messages.append({"role": "user", "content": message})

    # Apply the model's chat template and move the prompt to the GPU.
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt"
    ).to("cuda")

    torch.random.manual_seed(int(seed))

    # Create a fresh streamer per request so concurrent chats don't interleave.
    # skip_prompt drops the echoed input: generate() has no return_full_text
    # option (that is a pipeline argument) and would reject it.
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)

    generation_kwargs = dict(
        input_ids=input_ids,
        max_new_tokens=int(max_tokens),
        temperature=temperature,
        top_p=top_p,
        do_sample=True,
        streamer=streamer,
    )

    # Run generation in a background thread so the streamer can be consumed
    # here without blocking.
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    # Yield the accumulated response as new tokens arrive.
    response = ""
    for new_text in streamer:
        response += new_text
        yield response
    thread.join()


with gr.Blocks() as demo:
    with gr.Row():
        gr.LoginButton(min_width=100)
        gr.Markdown(
            """
            This is the space I built. As of 2025/1/7, it is the first Phi-4 space.
            If this helps you and you can spare it, could you give me $1? I am facing
            a financial crisis, and if you do, I will pass the kindness on.
            This is my bank card number: 5592921230414708. Thank you!!
            """
        )
    with gr.Row():
        with gr.Column():
            system_message = gr.Textbox(value="You are a friendly Chatbot.", label="System message")
            max_tokens = gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens")
            temperature = gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature")
            top_p = gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)")
            seed = gr.Slider(minimum=0, maximum=20091114, value=42, step=1, label="Seed")
        with gr.Column():
            gr.ChatInterface(
                respond,
                additional_inputs=[system_message, max_tokens, temperature, top_p, seed],
            )

if __name__ == "__main__":
    demo.launch()
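
# Usage note (a sketch, assuming a local run outside Hugging Face Spaces with a
# CUDA GPU available; the `os.system` cleanup line above is ZeroGPU-specific
# and can be skipped locally):
#
#   pip install torch transformers accelerate gradio spaces
#   python app.py
#
# demo.launch() then serves the UI at http://127.0.0.1:7860 by default.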