import gradio as gr
import pandas as pd
from glob import glob
from pathlib import Path

from tabs.dashboard import df
from tabs.faq import (
    about_olas_predict_benchmark,
    about_olas_predict,
    about_the_dataset,
    about_the_tools,
)
from tabs.howto_benchmark import how_to_run
from tabs.run_benchmark import run_benchmark_main

demo = gr.Blocks()


def run_benchmark_gradio(tool_name, model_name, num_questions, openai_api_key, anthropic_api_key, openrouter_api_key):
    """Run the benchmark with the provided inputs."""
    if tool_name is None:
        return "Please enter the name of your tool."
    if openai_api_key is None and anthropic_api_key is None and openrouter_api_key is None:
        return "Please enter an OpenAI, Anthropic, or OpenRouter API key."

    result = run_benchmark_main(tool_name, model_name, num_questions, openai_api_key, anthropic_api_key, openrouter_api_key)

    if result == "completed":
        # collect the CSV files written to the results directory
        fns = glob("results/*.csv")
        print(f"Number of files in results directory: {len(fns)}")

        # convert to Path objects
        files = [Path(file) for file in fns]

        # split into per-question results files and summary files
        results_files = [file for file in files if "results" in file.name]
        summary_files = [file for file in files if "summary" in file.name]
        print(results_files, summary_files)

        # load both CSVs and round all float values to 4 decimal places
        results_df = pd.read_csv(results_files[0]).round(4)
        summary_df = pd.read_csv(summary_files[0]).round(4)

        return gr.Dataframe(value=results_df), gr.Dataframe(value=summary_df)

    return gr.Textbox(label="Benchmark Result", value=result, interactive=False), gr.Textbox(label="Summary", value="")


with demo:
    gr.HTML("Olas Predict Benchmark")
    gr.Markdown("Leaderboard showing the performance of Olas Predict tools on the Autocast dataset and overview of the project.")

    with gr.Tabs() as tabs:
        # first tab - leaderboard
        with gr.TabItem("🏅 Benchmark Leaderboard", id=0):
            gr.components.Dataframe(
                value=df,
            )

        # second tab - about
        with gr.TabItem("ℹī¸ About"):
            with gr.Row():
                with gr.Accordion("About the Benchmark", open=False):
                    gr.Markdown(about_olas_predict_benchmark)
            with gr.Row():
                with gr.Accordion("About the Tools", open=False):
                    gr.Markdown(about_the_tools)
            with gr.Row():
                with gr.Accordion("About the Autocast Dataset", open=False):
                    gr.Markdown(about_the_dataset)
            with gr.Row():
                with gr.Accordion("About Olas", open=False):
                    gr.Markdown(about_olas_predict)

        # third tab - how to run the benchmark
        with gr.TabItem("🚀 Contribute"):
            gr.Markdown(how_to_run)

        def update_dropdown(tool):
            """Return the model choices matching the selected tool (not currently wired to the tool_name dropdown)."""
            if "claude" in tool:
                return ["claude-3-haiku-20240307", "claude-3-sonnet-20240229", "claude-3-opus-20240229"]
            else:
                return ["gpt-3.5-turbo-0125", "gpt-4-0125-preview"]

        # fourth tab - run the benchmark
        with gr.TabItem("đŸ”Ĩ Run the Benchmark"):
            with gr.Row():
                tool_name = gr.Dropdown(
                    [
                        "prediction-offline",
                        "prediction-online",
                        # "prediction-online-summarized-info",
                        # "prediction-offline-sme",
                        # "prediction-online-sme",
                        "prediction-request-rag",
                        "prediction-request-reasoning",
                        "prediction-url-cot-claude",
                        # "prediction-request-rag-cohere",
                        # "prediction-with-research-conservative",
                        # "prediction-with-research-bold",
                    ],
                    label="Tool Name",
                    info="Choose the tool to run",
                )
                model_name = gr.Dropdown(
                    [
                        "gpt-3.5-turbo-0125",
                        "gpt-4-0125-preview",
                        "claude-3-haiku-20240307",
                        "claude-3-sonnet-20240229",
                        "claude-3-opus-20240229",
                        "databricks/dbrx-instruct:nitro",
                        "nousresearch/nous-hermes-2-mixtral-8x7b-sft",
                        # "cohere/command-r-plus",
                    ],
                    label="Model Name",
                    info="Choose the model to use",
                )
            with gr.Row():
                openai_api_key = gr.Textbox(label="OpenAI API Key", placeholder="Enter your OpenAI API key here", type="password")
                anthropic_api_key = gr.Textbox(label="Anthropic API Key", placeholder="Enter your Anthropic API key here", type="password")
                openrouter_api_key = gr.Textbox(label="OpenRouter API Key", placeholder="Enter your OpenRouter API key here", type="password")
            with gr.Row():
                num_questions = gr.Slider(
                    minimum=1,
                    maximum=340,
                    value=10,
                    label="Number of questions to run the benchmark on",
                )
            with gr.Row():
                run_button = gr.Button("Run Benchmark")
            with gr.Row():
                with gr.Accordion("Results", open=True):
                    result = gr.Dataframe()
            with gr.Row():
                with gr.Accordion("Summary", open=False):
                    summary = gr.Dataframe()

            run_button.click(
                run_benchmark_gradio,
                inputs=[tool_name, model_name, num_questions, openai_api_key, anthropic_api_key, openrouter_api_key],
                outputs=[result, summary],
            )

demo.queue(default_concurrency_limit=40).launch()