# chat_cpu_inf / app.py
import time
import gradio as gr
import json
import requests
import os

# Model names shown in the UI, mapped to their context window sizes in tokens.
# The sizes are inferred from the model names ("196k" is read as 196,000).
models = {
    "TinyLLama 1b 4_K_M 2048": 2048,
    "TinyLLama 1b OpenOrca 4_K_M 2048": 2048,
    "OpenLLama 3b 4_K_M 196k": 196000,
    "Phi-2 2.7b 4_K_M 2048": 2048,
    "Stable Zephyr 3b 4_K_M 4096": 4096,
}

def make_request_to_llm(llm, prompt, max_new_tokens, nctx):
    """POST the prompt to the inference backend and return the raw response body."""
    headers = {
        "Authorization": f"Bearer {os.getenv('HF_TOKEN')}",
        "Content-Type": "application/json",
    }
    body = {"prompt": prompt, "max_new_tokens": max_new_tokens, "llm": llm, "nctx": nctx}
    response = requests.post(
        "https://daniellefranca96-cpu-inf.hf.space/llm_on_cpu",
        headers=headers,
        json=body,
    )
    return response.text
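
# A minimal usage sketch, assuming the backing Space is reachable and HF_TOKEN
# is set in the environment (the response format is whatever /llm_on_cpu returns):
#   raw = make_request_to_llm("TinyLLama 1b 4_K_M 2048", "Say hi.", 64, 2048)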

def change(llm):
    # Sync the context slider with the selected model. The slider works in
    # thousands of tokens and is capped at 100, so larger windows are clamped.
    return min(int(models[llm] / 1000), 100)
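
# For instance, with the context sizes mapped above:
#   change("TinyLLama 1b 4_K_M 2048")  -> 2
#   change("OpenLLama 3b 4_K_M 196k")  -> 100 (196 clamped to the slider maximum)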

def update(prompt, llm, nctx, max_tokens):
    answer = {}
    # Measure wall-clock time for the whole request.
    start_time = time.time()
    result = make_request_to_llm(llm, prompt, max_tokens, int(nctx) * 1000)
    print(result)
    end_time = time.time()
    # Only the total duration is reported; computing tokens per second would
    # require the generated token count from the backend.
    duration = end_time - start_time
    answer["Duration"] = duration
    print("Duration: " + str(duration))
    answer["answer"] = result
    return json.dumps(answer)
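
# The UI receives a JSON string; a sketch of its shape (values illustrative):
#   {"Duration": 12.3, "answer": "<raw body returned by the backend>"}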

with gr.Blocks() as demo:
    gr.Markdown("Test LLM inference speeds on CPU.")
    with gr.Row():
        select = gr.Dropdown(list(models), label="LLM", value=list(models)[0])
    with gr.Row():
        nctx = gr.Slider(minimum=1, maximum=100, label="Context window (in thousands of tokens)", value=1)
    with gr.Row():
        max_tokens = gr.Slider(minimum=512, maximum=4096, label="Max Tokens Generated", value=512)
    with gr.Row():
        inp = gr.Textbox(placeholder="What is your prompt?", label="Prompt")
    with gr.Row():
        out = gr.Textbox(label="Output", lines=20)
    btn = gr.Button("Run")
    # Run inference on click; keep the context slider in sync with the model choice.
    btn.click(fn=update, inputs=[inp, select, nctx, max_tokens], outputs=out)
    select.change(fn=change, inputs=[select], outputs=nctx)

demo.launch()
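
# Launch with `python app.py`; Gradio serves the UI on http://127.0.0.1:7860 by default.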