import os

import gradio as gr
import spaces
import torch
from huggingface_hub import login
from transformers import AutoTokenizer, AutoModelForCausalLM

# Authenticate with the Hugging Face Hub; the HF_TOKEN secret is required
# because google/gemma-2-9b-it is a gated model.
access_token = os.getenv("HF_TOKEN")
login(access_token)

model_id = "google/gemma-2-9b-it"

# Load the tokenizer and model once at startup.
print("Model loading started")
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
print("Model loading completed. Device of the model:", model.device)

# Earlier lazy-loading variant, kept for reference but disabled.
"""
tokenizer = None
model = None
model_loaded = False  # Flag to check if the model is loaded


@spaces.GPU
def load_model():
    global tokenizer, model, model_loaded
    if not model_loaded:  # Load the model only if it is not already loaded
        print("Model loading started")
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=torch.bfloat16,
        )
        model_loaded = True
        print("Model loading completed. Device of the model:", model.device)
        return model, tokenizer
    else:
        print("Model is already loaded")
        return model, tokenizer
"""


@spaces.GPU(duration=30)
def ask(prompt):
    if not prompt:
        return "Error: prompt is missing"

    # if not model_loaded:
    #     model, tokenizer = load_model()  # Ensure the model is loaded before processing

    # A ZeroGPU device is only attached inside the decorated function,
    # so move the model to CUDA here.
    model.to("cuda")
    print("Device of the model:", model.device)

    messages = [
        {"role": "user", "content": prompt},
    ]
    print("Messages:", messages)

    print("Tokenizer process started")
    input_ids = tokenizer.apply_chat_template(
        messages, return_tensors="pt", return_dict=True
    ).to("cuda")
    print("Tokenizer process completed")

    print("Model process started")
    outputs = model.generate(**input_ids, max_new_tokens=256)

    print("Tokenizer decode process started")
    answer = tokenizer.decode(outputs[0])
    print("Answer:", answer)

    # Keep only the model's reply: decode just the newly generated tokens,
    # drop special tokens, and strip markdown asterisks.
    answer = tokenizer.decode(
        outputs[0][input_ids["input_ids"].shape[-1]:],
        skip_special_tokens=True,
    ).strip().replace("*", "")
    print("Final answer:", answer)

    return answer


demo = gr.Interface(
    fn=ask,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    outputs=gr.Textbox(),
)
demo.launch()