# Spaces:
# Runtime error
# Runtime error
from fastapi import FastAPI | |
import time | |
from transformers import AutoModelForCausalLM, AutoTokenizer | |
# Checkpoint identifier shared by the model and tokenizer loads below.
MODEL_ID = "Qwen/Qwen2-0.5B-Instruct"

# Device the model is loaded onto and that tokenized inputs are moved to.
device = "cpu"

# Load the causal LM once at import time. device_map is tied to `device`
# (it was previously "auto") so the weights end up on the same device that
# the tokenized inputs are later sent to with `.to(device)`.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype="auto",
    device_map=device,
)
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# FastAPI application instance.
app = FastAPI()
async def read_root():
    """Return a fixed greeting payload.

    NOTE(review): no route decorator (e.g. ``@app.get(...)``) is visible on
    this coroutine in this view; confirm that it is registered with ``app``.

    Returns:
        The dictionary ``{"Hello": "World!"}``.
    """
    return {"Hello": "World!"}
# One-off generation executed at import time: sends a fixed chat through the
# model, then prints the decoded reply followed by the elapsed seconds.
start_time = time.time()
messages = [
    {"role": "system", "content": "You are a helpful assistant, Sia. You are developed by Sushma. You will response in polity and brief."},
    {"role": "user", "content": "Who are you?"},
    {"role": "assistant", "content": "I am Sia, a small language model created by Sushma. I am here to assist you."},
    {"role": "user", "content": "Hi, How are you?"},
]
# Render the chat as a single prompt string (not token ids), with the
# generation prompt appended.
text = tokenizer.apply_chat_template(
    messages,
    tokenize=False,
    add_generation_prompt=True,
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)
# Forward the whole encoding rather than only input_ids, so that every
# tensor the tokenizer produced (e.g. the attention mask) reaches generate().
generated_ids = model.generate(
    **model_inputs,
    max_new_tokens=64,
)
# Drop the leading prompt-length slice from each output row so that only the
# newly generated tokens are decoded.
generated_ids = [
    output_ids[len(input_ids):]
    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
end_time = time.time()
time_taken = end_time - start_time
print(time_taken)
async def read_droot():
    """Run one fixed chat through the model, print the reply, return a greeting.

    The decoded reply and the elapsed seconds are only printed; the returned
    payload is always the static greeting.

    NOTE(review): no route decorator (e.g. ``@app.get(...)``) is visible on
    this coroutine in this view; confirm that it is registered with ``app``.
    NOTE(review): ``model.generate`` is called without ``await``, so this
    coroutine does not yield while generation runs.

    Returns:
        The dictionary ``{"Hello": "World!"}``.
    """
    started_at = time.time()
    chat = [
        {"role": "system", "content": "You are a helpful assistant, Sia. You are developed by Sushma. You will response in polity and brief."},
        {"role": "user", "content": "Who are you?"},
        {"role": "assistant", "content": "I am Sia, a small language model created by Sushma. I am here to assist you."},
        {"role": "user", "content": "Hi, How are you?"},
    ]
    # Render the chat as a single prompt string (not token ids), with the
    # generation prompt appended.
    prompt = tokenizer.apply_chat_template(
        chat,
        tokenize=False,
        add_generation_prompt=True,
    )
    model_inputs = tokenizer([prompt], return_tensors="pt").to(device)
    # Forward the whole encoding rather than only input_ids, so that every
    # tensor the tokenizer produced (e.g. the attention mask) reaches
    # generate().
    output_rows = model.generate(
        **model_inputs,
        max_new_tokens=64,
    )
    # Drop the leading prompt-length slice from each output row so that only
    # the newly generated tokens are decoded.
    new_token_rows = [
        output_ids[len(input_ids):]
        for input_ids, output_ids in zip(model_inputs.input_ids, output_rows)
    ]
    response = tokenizer.batch_decode(new_token_rows, skip_special_tokens=True)[0]
    print(response)
    time_taken = time.time() - started_at
    print(time_taken)
    return {"Hello": "World!"}