import gradio as gr
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch
import spaces

# Load the model and tokenizer
model_name = "Qwen/Qwen2-72B-Instruct"

# Load model (without moving to GPU yet)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    trust_remote_code=True
)
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)


@spaces.GPU
def generate_text(prompt):
    # Move model to GPU when function is called
    model.to('cuda')

    messages = [
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": prompt}
    ]
    text = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True
    )
    model_inputs = tokenizer([text], return_tensors="pt").to('cuda')

    with torch.no_grad():
        generated_ids = model.generate(
            model_inputs.input_ids,
            temperature=0.7,
            max_new_tokens=500,
            do_sample=True,
            top_p=0.95
        )
    generated_ids = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
    ]
    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

    # Move model back to CPU to free up GPU resources
    model.to('cpu')
    return response


# Create Gradio interface
iface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(lines=5, label="Input Prompt"),
    outputs=gr.Textbox(label="Generated Text"),
    title="Qwen Text Generator (Spaces GPU)",
    description="Enter a prompt to generate text using the Qwen model. This Space uses Spaces GPU for efficient GPU usage."
)

# Launch the app
iface.launch()
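
# --- Optional tweak (a sketch, not part of the app above) ---
# On ZeroGPU Spaces, the spaces.GPU decorator also accepts a duration hint.
# Moving a large model to the GPU and sampling 500 tokens can take longer than
# the default allocation window, so, assuming this Space runs on ZeroGPU
# hardware, the decorator above could instead be written as, for example:
#
#     @spaces.GPU(duration=120)  # request roughly 120 s of GPU time per call
#     def generate_text(prompt):
#         ...
#
# On a dedicated GPU Space the decorator is effectively a no-op, so the plain
# @spaces.GPU form used above is sufficient there.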