from fastapi import FastAPI
from transformers import AutoModelForCausalLM, AutoTokenizer
import time
import os

# Hugging Face read token from the Space secrets. Currently unused; pass it
# as token=access_token to from_pretrained() if the models become gated.
access_token = os.getenv("read_access")

device = "cpu"  # the device to run inference on
# The Qwen2 instruct models share a tokenizer, so the 0.5B tokenizer also
# works for the 1.5B models loaded below.
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")

# Loaded with the default dtype (float32); used by the /text endpoint.
model1 = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-1.5B-Instruct",
    device_map="auto"
)

# Loaded with torch_dtype="auto" (the dtype stored in the model config);
# used by the /test endpoint.
model = AutoModelForCausalLM.from_pretrained(
    "Qwen/Qwen2-1.5B-Instruct",
    device_map="auto",
    torch_dtype="auto"
)
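# Note: the 1.5B model is loaded twice, so two full copies stay resident in
# memory, roughly 6 GB in float32 plus about 3 GB if "auto" resolves to
# bfloat16; the Space needs enough RAM for both.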
app = FastAPI()
@app.get("/")
async def read_root():
return {"Hello": "World!"}
@app.get("/test")
async def read_droot():
starttime = time.time()
messages = [
{"role": "system", "content": "You are a helpful assistant, Sia, developed by Sushma. You will response in polity and brief."},
{"role": "user", "content": "I'm Alok. Who are you?"},
{"role": "assistant", "content": "I am Sia, a small language model created by Sushma."},
{"role": "user", "content": "How are you?"}
]
text = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)
generated_ids = model.generate(
model_inputs.input_ids,
max_new_tokens=128
)
generated_ids = [
output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
end_time = time.time()
time_taken = end_time - starttime
print(time_taken)
return {"Hello": "World!"}
@app.get("/text")
async def read_droot():
starttime = time.time()
messages = [
{"role": "system", "content": "You are a helpful assistant, Sia, developed by Sushma. You will response in polity and brief."},
{"role": "user", "content": "I'm Alok. Who are you?"},
{"role": "assistant", "content": "I am Sia, a small language model created by Sushma."},
{"role": "user", "content": "How are you?"}
]
text = tokenizer.apply_chat_template(
messages,
tokenize=False,
add_generation_prompt=True
)
model_inputs = tokenizer([text], return_tensors="pt").to(device)
generated_ids = model1.generate(
model_inputs.input_ids,
max_new_tokens=128
)
generated_ids = [
output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
end_time = time.time()
time_taken = end_time - starttime
print(time_taken)
return {"Hello": "World!"}
#return {response: time}
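# A minimal way to try the app locally (a sketch; assumes this file is saved
# as app.py and fastapi, uvicorn, transformers, and torch are installed):
#   uvicorn app:app --host 0.0.0.0 --port 7860
#   curl http://localhost:7860/test
#   curl http://localhost:7860/text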