from fastapi import FastAPI
import time


from transformers import AutoModelForCausalLM, AutoTokenizer
# Device that request inputs are moved to before generation.
# NOTE(review): the model below is placed with device_map="auto" rather than
# with this value, so the two may disagree on a host that has an
# accelerator -- confirm they match where this is deployed.
device = "cpu"

# Checkpoint name passed to both from_pretrained() calls so the model weights
# and the tokenizer always come from the same source.
_MODEL_ID = "Qwen/Qwen2-0.5B-Instruct"

model = AutoModelForCausalLM.from_pretrained(
    _MODEL_ID, torch_dtype="auto", device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(_MODEL_ID)

# Application object that the route decorators in this module register on.
app = FastAPI()

@app.get("/")
async def read_root():
    """Answer requests for the root path with a fixed greeting payload."""
    greeting = {"Hello": "World!"}
    return greeting
# Run one generation while the module is being imported and print both the
# reply and the number of seconds it took.
start_time = time.time()
messages = [
    {"role": "system", "content": "You are a helpful assistant, Sia. You are developed by Sushma. You will response in polity and brief."},
    {"role": "user", "content": "Who are you?"},
    {"role": "assistant", "content": "I am Sia, a small language model created by Sushma. I am here to assist you."},
    {"role": "user", "content": "Hi, How are you?"},
]
# Render the conversation to a single prompt string that ends with the
# generation prompt.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
# Put the inputs on the device the model was actually loaded on. The model is
# placed with device_map="auto", so a hard-coded device string can disagree
# with it and make generate() fail on a device mismatch.
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# Pass the whole encoding (not only input_ids) so generate() also receives
# the attention mask produced by the tokenizer.
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=64,
)
# generate() returns prompt + continuation; keep only the newly generated
# tokens of each sequence.
generated_ids = [
    output_ids[len(input_ids):]
    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]

response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
end_time = time.time()
time_taken = end_time - start_time
print(time_taken)


def _generate_test_reply():
    """Generate the assistant reply for the fixed test conversation.

    This call is synchronous and blocks until generation has finished.

    Returns:
        The decoded text of the newly generated tokens, with special tokens
        stripped.
    """
    messages = [
        {"role": "system", "content": "You are a helpful assistant, Sia. You are developed by Sushma. You will response in polity and brief."},
        {"role": "user", "content": "Who are you?"},
        {"role": "assistant", "content": "I am Sia, a small language model created by Sushma. I am here to assist you."},
        {"role": "user", "content": "Hi, How are you?"},
    ]
    prompt = tokenizer.apply_chat_template(
        messages,
        tokenize=False,
        add_generation_prompt=True,
    )
    # Put the inputs on the device the model was actually loaded on. The
    # model is placed with device_map="auto", so a hard-coded device string
    # can disagree with it and make generate() fail on a device mismatch.
    model_inputs = tokenizer([prompt], return_tensors="pt").to(model.device)

    # Pass the whole encoding (not only input_ids) so generate() also
    # receives the attention mask produced by the tokenizer.
    output_batch = model.generate(**model_inputs, max_new_tokens=64)
    # generate() returns prompt + continuation; keep only the new tokens.
    new_token_ids = [
        full_ids[len(prompt_ids):]
        for prompt_ids, full_ids in zip(model_inputs.input_ids, output_batch)
    ]
    return tokenizer.batch_decode(new_token_ids, skip_special_tokens=True)[0]


@app.get("/test")
async def read_droot():
    """Run the fixed test conversation through the model and time it.

    The generated reply and the elapsed seconds are printed to stdout; the
    reply is not part of the HTTP response.

    Returns:
        The constant payload ``{"Hello": "World!"}``.
    """
    # Imported here so this handler does not depend on an extra module-level
    # import; fastapi itself is already a dependency of this file.
    from fastapi.concurrency import run_in_threadpool

    # perf_counter() is monotonic, so the measured duration cannot be skewed
    # by wall-clock adjustments.
    starttime = time.perf_counter()
    # Generation is blocking, CPU-heavy work. Running it directly inside an
    # ``async def`` handler would stall the event loop (and every other
    # request) until it finishes, so it is handed to the worker thread pool.
    response = await run_in_threadpool(_generate_test_reply)
    print(response)
    time_taken = time.perf_counter() - starttime
    print(time_taken)
    return {"Hello": "World!"}