import json
import os
import time

import gradio as gr
import requests
models = [
    "TinyLLama 1b 4_K_M 2048",
    "TinyLLama 1b OpenOrca 4_K_M 2048",
    "OpenLLama 3b 4_K_M 196k",
    "Phi-2 2.7b 4_K_M 2048",
    "Stable Zephyr 3b 4_K_M 4096",
]
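
# Note: each model name is assumed to end with its context window size
# (e.g. "2048" tokens, or "196k" for 196,000 tokens); change() below
# relies on this naming convention.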
def make_request_to_llm(llm, prompt, max_new_tokens, nctx):
    # Forward the prompt to the CPU inference backend Space.
    headers = {
        "Authorization": f"Bearer {os.getenv('HF_TOKEN')}",
        "Content-Type": "application/json",
    }
    body = {"prompt": prompt, "max_new_tokens": max_new_tokens, "llm": llm, "nctx": nctx}
    response = requests.post(
        "https://daniellefranca96-cpu-inf.hf.space/llm_on_cpu",
        headers=headers,
        json=body,
    )
    return response.text
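
# Illustrative request body (values are examples, not actual defaults):
# {"prompt": "What is your prompt?", "max_new_tokens": 512,
#  "llm": "Phi-2 2.7b 4_K_M 2048", "nctx": 2048}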
def change(llm):
    # The dropdown passes the model name string (not an index); its last token
    # is the context size ("196k" = 196,000 tokens). Scale to the slider's
    # thousands units.
    size = llm.split()[-1]
    return int(size[:-1]) if size.endswith("k") else max(1, int(size) // 1000)
def update(prompt, llm, nctx, max_tokens):
    answer = {}
    # Measure wall-clock processing time of the request.
    start_time = time.time()
    result = make_request_to_llm(llm, prompt, max_tokens, int(nctx) * 1000)
    print(result)
    end_time = time.time()
    duration = end_time - start_time
    answer["Duration"] = duration
    print("Duration: " + str(duration))
    answer["answer"] = result
    return json.dumps(answer)
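
# Illustrative output shown in the UI (values are examples):
# {"Duration": 42.7, "answer": "..."}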
with gr.Blocks() as demo:
    gr.Markdown("Test LLM inference speed on CPU.")
    with gr.Row():
        select = gr.Dropdown(models, label="LLM", value=models[0])
    with gr.Row():
        nctx = gr.Slider(minimum=1, maximum=100, label="Context window (in thousands of tokens)", value=1)
    with gr.Row():
        max_tokens = gr.Slider(minimum=512, maximum=4096, label="Max tokens generated")
    with gr.Row():
        inp = gr.Textbox(placeholder="What is your prompt?", label="Prompt")
    with gr.Row():
        out = gr.Textbox(label="Output", lines=20)
    btn = gr.Button("Run")
    btn.click(fn=update, inputs=[inp, select, nctx, max_tokens], outputs=out)
    select.change(fn=change, inputs=[select], outputs=nctx)
demo.launch()