Spaces:
Runtime error
Runtime error
Update app.py
Browse files
app.py
CHANGED
@@ -8,20 +8,19 @@ access_token = os.getenv("read_access")
|
|
8 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
9 |
device = "cpu" # the device to load the model onto
|
10 |
|
11 |
-
|
12 |
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b-it", token=access_token)
|
19 |
model = AutoModelForCausalLM.from_pretrained(
|
20 |
-
"
|
21 |
-
device_map="auto"
|
22 |
-
|
23 |
)
|
24 |
|
|
|
25 |
app = FastAPI()
|
26 |
|
27 |
@app.get("/")
|
@@ -77,7 +76,7 @@ async def read_droot():
|
|
77 |
|
78 |
generated_ids = model1.generate(
|
79 |
model_inputs.input_ids,
|
80 |
-
max_new_tokens=
|
81 |
)
|
82 |
generated_ids = [
|
83 |
output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
|
@@ -91,34 +90,3 @@ async def read_droot():
|
|
91 |
return {"Hello": "World!"}
|
92 |
#return {response: time}
|
93 |
|
94 |
-
|
95 |
-
@app.get("/tet")
|
96 |
-
async def read_droot():
|
97 |
-
starttime = time.time()
|
98 |
-
messages = [
|
99 |
-
{"role": "user", "content": "I'm Alok. Who are you?"},
|
100 |
-
{"role": "assistant", "content": "I am Sia, a small language model created by Sushma."},
|
101 |
-
{"role": "user", "content": "How are you?"}
|
102 |
-
]
|
103 |
-
text = tokenizer.apply_chat_template(
|
104 |
-
messages,
|
105 |
-
tokenize=False,
|
106 |
-
add_generation_prompt=True
|
107 |
-
)
|
108 |
-
model_inputs = tokenizer([text], return_tensors="pt").to(device)
|
109 |
-
|
110 |
-
generated_ids = model.generate(
|
111 |
-
model_inputs.input_ids,
|
112 |
-
max_new_tokens=64
|
113 |
-
)
|
114 |
-
generated_ids = [
|
115 |
-
output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
|
116 |
-
]
|
117 |
-
|
118 |
-
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
119 |
-
print(response)
|
120 |
-
end_time = time.time()
|
121 |
-
time_taken = end_time - starttime
|
122 |
-
print(time_taken)
|
123 |
-
return {"Hello": "resps"}
|
124 |
-
#return {response: time}
|
|
|
8 |
from transformers import AutoModelForCausalLM, AutoTokenizer
|
9 |
device = "cpu" # the device to load the model onto
|
10 |
|
11 |
+
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
|
12 |
|
13 |
+
model1 = AutoModelForCausalLM.from_pretrained(
|
14 |
+
"Qwen/Qwen2-1.5B-Instruct",
|
15 |
+
device_map="auto"
|
16 |
+
)
|
|
|
|
|
17 |
model = AutoModelForCausalLM.from_pretrained(
|
18 |
+
"Qwen/Qwen2-1.5B-Instruct",
|
19 |
+
device_map="auto"
|
20 |
+
torch_dtype="auto"
|
21 |
)
|
22 |
|
23 |
+
|
24 |
app = FastAPI()
|
25 |
|
26 |
@app.get("/")
|
|
|
76 |
|
77 |
generated_ids = model1.generate(
|
78 |
model_inputs.input_ids,
|
79 |
+
max_new_tokens=128
|
80 |
)
|
81 |
generated_ids = [
|
82 |
output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
|
|
|
90 |
return {"Hello": "World!"}
|
91 |
#return {response: time}
|
92 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|