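# Hugging Face Space that serves google/gemma-2-9b-it behind a minimal Gradio
# text-in/text-out interface. A GPU is attached per request via @spaces.GPU (ZeroGPU).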
import gradio as gr
import spaces
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from huggingface_hub import login
import os
# Authenticate with the Hugging Face Hub; the HF_TOKEN secret is required to
# download the gated Gemma weights.
access_token = os.getenv('HF_TOKEN')
login(access_token)
model_id = "google/gemma-2-9b-it"
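# Load the tokenizer and model eagerly at startup. device_map="auto" lets
# accelerate place the weights, and bfloat16 halves the memory footprint of
# the 9B model compared to float32.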
print("Model loading started")
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)
print("Model loading completed. Device of the model:", model.device)
"""
tokenizer = None
model = None
model_loaded = False # Flag to check if the model is loaded
@spaces.GPU
def load_model():
global tokenizer, model, model_loaded
if not model_loaded: # Load model only if it's not already loaded
print("Model loading started")
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
model_id,
device_map="auto",
torch_dtype=torch.bfloat16,
)
model_loaded = True
print("Model loading completed. Device of the model:", model.device)
return model, tokenizer
else:
print("Model is already loaded")
return model, tokenizer
"""
@spaces.GPU(duration=30)
def ask(prompt):
    if not prompt:
        return "Error: prompt is missing"
    #if not model_loaded:
    #    model, tokenizer = load_model()  # Ensure the model is loaded before processing
    model.to("cuda")
    print("Device of the model:", model.device)
    messages = [
        {"role": "user", "content": f"{prompt}"},
    ]
    print("Messages:", messages)
    print("Tokenizer process started")
    input_ids = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, return_tensors="pt", return_dict=True
    ).to("cuda")
    print("Tokenizer process completed")
    print("Model process started")
    outputs = model.generate(**input_ids, max_new_tokens=256)
    print("Tokenizer decode process started")
    # Decode only the newly generated tokens so the prompt and chat-template
    # markers (e.g. <start_of_turn>) are not echoed back to the user.
    answer = tokenizer.decode(
        outputs[0][input_ids["input_ids"].shape[-1]:], skip_special_tokens=True
    )
    print("Answer:", answer)
    answer = answer.strip().replace("*", "")
    print("Final answer:", answer)
    return answer
demo = gr.Interface(fn=ask, inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."), outputs=gr.Textbox())
demo.launch()