File size: 2,295 Bytes
96292d0
 
 
 
 
 
 
 
 
 
 
208435b
 
 
 
 
 
 
 
 
 
 
96292d0
 
225f228
96292d0
 
 
225f228
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208435b
b2be325
ca386d6
96292d0
 
 
 
208435b
 
 
96292d0
 
 
 
 
 
 
 
 
 
 
 
 
652db8c
 
9660892
652db8c
96292d0
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
import gradio as gr
import spaces
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
from huggingface_hub import login
import os

# Authenticate against the Hugging Face Hub. The lookup yields None when the
# HF_TOKEN environment variable is not set, and that value is passed to
# login() unchanged.
access_token = os.environ.get("HF_TOKEN")
login(access_token)

model_id = "google/gemma-2-9b-it"

# Tokenizer and model are created once at module level; `ask` reads them as
# globals on every request.
print("Model loading started")
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(
    model_id, torch_dtype=torch.bfloat16, device_map="auto"
)
print("Model loading completed. Device of the model:", model.device)

# NOTE: everything between the triple quotes below is a bare string literal,
# not live code -- none of it executes. It preserves a disabled alternative
# that would load the tokenizer/model lazily inside a @spaces.GPU function
# (guarded by a `model_loaded` flag) instead of eagerly at module level.
"""
tokenizer = None
model = None
model_loaded = False  # Flag to check if the model is loaded

@spaces.GPU
def load_model():
    global tokenizer, model, model_loaded
    if not model_loaded:  # Load model only if it's not already loaded
        print("Model loading started")
        tokenizer = AutoTokenizer.from_pretrained(model_id)
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map="auto",
            torch_dtype=torch.bfloat16,
        )
        model_loaded = True
        print("Model loading completed. Device of the model:", model.device)
        return model, tokenizer
    else:
        print("Model is already loaded")
        return model, tokenizer
"""

@spaces.GPU(duration=30)
def ask(prompt):
    """Generate the model's reply to a single user prompt.

    The prompt is wrapped as a one-message chat, rendered through the
    tokenizer's chat template, and passed to ``model.generate`` with a cap
    of 256 new tokens. Only the newly generated tokens are decoded, so the
    reply can never be confused with text taken from the prompt itself.

    Args:
        prompt: User text (from the Gradio textbox).

    Returns:
        The generated reply as a string, stripped of surrounding whitespace
        and with every ``*`` character removed; or the dict
        ``{"error": "Prompt is missing"}`` when ``prompt`` is empty/None.
    """
    if not prompt:
        return {"error": "Prompt is missing"}

    model.to("cuda")
    print("Device of the model:", model.device)
    messages = [
        {"role": "user", "content": f"{prompt}"},
    ]
    print("Messages:", messages)
    print("Tokenizer process started")
    # add_generation_prompt=True makes the chat template append the header
    # that opens the assistant turn, so generation starts directly with the
    # reply rather than with turn markup or a continuation of the user turn.
    inputs = tokenizer.apply_chat_template(
        messages,
        add_generation_prompt=True,
        return_tensors="pt",
        return_dict=True,
    ).to("cuda")
    print("Tokenizer process completed")

    print("Model process started")
    outputs = model.generate(**inputs, max_new_tokens=256)

    print("Tokenizer decode process started")
    # generate() returns prompt tokens followed by the new tokens; drop the
    # prompt by length instead of splitting decoded text on "<end_of_turn>",
    # which picked the wrong segment when the prompt contained that marker.
    prompt_length = inputs["input_ids"].shape[-1]
    generated_tokens = outputs[0][prompt_length:]
    # skip_special_tokens removes end-of-turn/EOS markers from the text.
    answer = tokenizer.decode(generated_tokens, skip_special_tokens=True)
    print("Answer:", answer)
    answer = answer.strip().replace("*", "")
    print("Final answer:", answer)

    return answer

# Minimal Gradio UI: one two-line prompt textbox in, one textbox out, backed
# by `ask`. launch() is called at module level, so running this file starts
# the app.
demo = gr.Interface(
    fn=ask,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    outputs=gr.Textbox(),
)
demo.launch()