|
import start |
|
import gradio as gr |
|
import pandas as pd |
|
from glob import glob |
|
from pathlib import Path |
|
from tabs.dashboard import df |
|
from tabs.faq import ( |
|
about_olas_predict_benchmark, |
|
about_olas_predict, |
|
about_the_dataset, |
|
about_the_tools, |
|
) |
|
from tabs.howto_benchmark import how_to_run |
|
|
|
|
|
from tabs.run_benchmark import run_benchmark_main |
|
|
|
# Root Blocks container; the layout and event wiring are attached to it in
# the `with demo:` section further down in this file.
demo = gr.Blocks()
|
|
|
|
|
def run_benchmark_gradio(
    tool_name,
    model_name,
    num_questions,
    openai_api_key,
    anthropic_api_key,
    openrouter_api_key,
):
    """Validate the form inputs, run the benchmark and load its CSV output.

    The "Run Benchmark" button wires this handler to two output components
    (the results table and the summary table), so every code path returns
    exactly two values.

    Args:
        tool_name: Tool selected in the "Tool Name" dropdown.
        model_name: Model selected in the "Model Name" dropdown.
        num_questions: Number of questions to run the benchmark on.
        openai_api_key: OpenAI API key, may be blank.
        anthropic_api_key: Anthropic API key, may be blank.
        openrouter_api_key: OpenRouter API key, may be blank.

    Returns:
        On success, ``(gr.Dataframe, gr.Dataframe)`` holding the results and
        summary tables rounded to 4 decimals. Otherwise
        ``(pandas.DataFrame, None)``, where the frame has a single
        ``"message"`` column describing what went wrong and ``None`` leaves
        the summary table empty.
    """

    def _message(text):
        # Both outputs are Dataframe components, so a status message is
        # wrapped in a one-cell frame instead of being returned as a bare
        # string (which would also be one value short of the two outputs).
        return pd.DataFrame({"message": [str(text)]}), None

    # Truthiness, not `is None`: a blank text box submits "" and an
    # untouched dropdown submits None; both must be rejected.
    if not tool_name:
        return _message("Please enter the name of your tool.")
    if not (openai_api_key or anthropic_api_key or openrouter_api_key):
        return _message(
            "Please enter either OpenAI or Anthropic or OpenRouter API key."
        )

    result = run_benchmark_main(
        tool_name,
        model_name,
        num_questions,
        openai_api_key,
        anthropic_api_key,
        openrouter_api_key,
    )

    if result != "completed":
        # Surface whatever status/error text the benchmark runner returned.
        return _message(result)

    fns = glob("results/*.csv")
    print(f"Number of files in results directory: {len(fns)}")

    files = [Path(file) for file in fns]
    results_files = [file for file in files if "results" in file.name]
    summary_files = [file for file in files if "summary" in file.name]
    print(results_files, summary_files)

    if not results_files or not summary_files:
        return _message(
            "Benchmark completed but no results/summary CSV files were "
            "found in the results directory."
        )

    # NOTE(review): the first match in glob order is used; if several runs
    # can leave files behind, confirm this picks the intended one.
    results_df = pd.read_csv(results_files[0]).round(4)
    summary_df = pd.read_csv(summary_files[0]).round(4)

    return gr.Dataframe(value=results_df), gr.Dataframe(value=summary_df)
|
|
|
|
|
# Build the UI inside the Blocks context: a header, then four tabs
# (leaderboard, about, contribution guide, interactive benchmark runner).
with demo:
    gr.HTML("<h1>Olas Predict Benchmark</h1>")
    gr.Markdown(
        "Leaderboard showing the performance of Olas Predict tools on the Autocast dataset and overview of the project."
    )

    with gr.Tabs() as tabs:
        # Leaderboard table, populated from the dataframe built in
        # tabs.dashboard.
        with gr.TabItem("🏅 Benchmark Leaderboard", id=0):
            gr.components.Dataframe(
                value=df,
            )

        # Collapsible background sections; the texts come from tabs.faq.
        with gr.TabItem("ℹ️ About"):
            with gr.Row():
                with gr.Accordion("About the Benchmark", open=False):
                    gr.Markdown(about_olas_predict_benchmark)
            with gr.Row():
                with gr.Accordion("About the Tools", open=False):
                    gr.Markdown(about_the_tools)
            with gr.Row():
                with gr.Accordion("About the Autocast Dataset", open=False):
                    gr.Markdown(about_the_dataset)
            with gr.Row():
                with gr.Accordion("About Olas", open=False):
                    gr.Markdown(about_olas_predict)

        # NOTE(review): the original emoji for this tab was lost to an
        # encoding error; the rocket is a best guess - confirm.
        with gr.TabItem("🚀 Contribute"):
            gr.Markdown(how_to_run)

        # Form that collects the run parameters and shows the output tables.
        with gr.TabItem("🔥 Run the Benchmark"):
            with gr.Row():
                tool_name = gr.Dropdown(
                    [
                        "prediction-offline",
                        "prediction-online",
                        "prediction-request-rag",
                        "prediction-request-reasoning",
                    ],
                    label="Tool Name",
                    info="Choose the tool to run",
                )
                model_name = gr.Dropdown(
                    [
                        "gpt-3.5-turbo-0125",
                        "gpt-4-0125-preview",
                        "claude-3-haiku-20240307",
                        "claude-3-sonnet-20240229",
                        "claude-3-opus-20240229",
                        "databricks/dbrx-instruct:nitro",
                        "nousresearch/nous-hermes-2-mixtral-8x7b-sft",
                    ],
                    label="Model Name",
                    info="Choose the model to use",
                )
            with gr.Row():
                # type="password" masks the keys in the browser.
                openai_api_key = gr.Textbox(
                    label="OpenAI API Key",
                    placeholder="Enter your OpenAI API key here",
                    type="password",
                )
                anthropic_api_key = gr.Textbox(
                    label="Anthropic API Key",
                    placeholder="Enter your Anthropic API key here",
                    type="password",
                )
                openrouter_api_key = gr.Textbox(
                    label="OpenRouter API Key",
                    placeholder="Enter your OpenRouter API key here",
                    type="password",
                )
            with gr.Row():
                num_questions = gr.Slider(
                    minimum=1,
                    maximum=340,
                    value=10,
                    label="Number of questions to run the benchmark on",
                )
            with gr.Row():
                run_button = gr.Button("Run Benchmark")
            with gr.Row():
                with gr.Accordion("Results", open=True):
                    result = gr.Dataframe()
            with gr.Row():
                with gr.Accordion("Summary", open=False):
                    summary = gr.Dataframe()

            # The handler must return one value per entry in `outputs`.
            run_button.click(
                run_benchmark_gradio,
                inputs=[
                    tool_name,
                    model_name,
                    num_questions,
                    openai_api_key,
                    anthropic_api_key,
                    openrouter_api_key,
                ],
                outputs=[result, summary],
            )
|
|
|
|
|
# Enable request queuing with a default per-event concurrency limit of 40,
# then start the web server.
demo.queue(default_concurrency_limit=40).launch()
|
|