Spaces:

valory
/

olas-prediction-leaderboard

Runtime error

File size: 6,200 Bytes

import start
import gradio as gr
import pandas as pd
from glob import glob
from pathlib import Path
from tabs.dashboard import df
from tabs.faq import (
    about_olas_predict_benchmark,
    about_olas_predict,
    about_the_dataset,
    about_the_tools
)
from tabs.howto_benchmark import how_to_run
from tabs.run_benchmark import run_benchmark_main


# Top-level Blocks container for the app; the layout is declared inside the
# `with demo:` context further down and the app is launched at the end of the file.
demo = gr.Blocks()


def _message_outputs(message):
    """Wrap a status message as values for the two Dataframe outputs.

    The click handler is wired to two ``gr.Dataframe`` components, so every
    return path has to yield exactly two values those components can render.
    The message is placed in a one-cell table for the results output and the
    summary output is cleared with ``None``.
    """
    return pd.DataFrame({"Benchmark Result": [str(message)]}), None


def _latest_matching(paths, keyword):
    """Return the most recently modified path whose file name contains *keyword*, or None."""
    matches = [path for path in paths if keyword in path.name]
    if not matches:
        return None
    return max(matches, key=lambda path: path.stat().st_mtime)


def run_benchmark_gradio(tool_name, model_name, num_questions, openai_api_key, anthropic_api_key, openrouter_api_key):
    """Run the benchmark using inputs.

    Args:
        tool_name: Name of the tool selected in the UI.
        model_name: Name of the model selected in the UI.
        num_questions: Number of questions to run the benchmark on.
        openai_api_key: OpenAI API key (may be empty).
        anthropic_api_key: Anthropic API key (may be empty).
        openrouter_api_key: OpenRouter API key (may be empty).

    Returns:
        A pair of values for the results and summary outputs. On success these
        are two ``gr.Dataframe`` updates holding the results and summary CSVs
        rounded to 4 decimal places. On a validation failure, a benchmark run
        that did not report ``'completed'``, or missing output files, the first
        value is a one-cell table carrying the message and the second is None.
    """
    # Empty Gradio inputs arrive as "" (textboxes) or None (dropdowns), so test
    # for falsiness instead of comparing against None only.
    if not tool_name:
        return _message_outputs("Please enter the name of your tool.")
    api_keys = (openai_api_key, anthropic_api_key, openrouter_api_key)
    if not any(key and key.strip() for key in api_keys):
        return _message_outputs("Please enter either OpenAI or Anthropic or OpenRouter API key.")

    result = run_benchmark_main(tool_name, model_name, num_questions, openai_api_key, anthropic_api_key, openrouter_api_key)
    if result != 'completed':
        # Anything other than 'completed' is surfaced to the user as-is.
        return _message_outputs(result)

    # Collect the CSV files written to the results directory.
    csv_paths = [Path(name) for name in glob('results/*.csv')]
    print(f"Number of files in results directory: {len(csv_paths)}")

    # glob order is arbitrary and the directory may hold files from earlier
    # runs, so take the newest file of each kind rather than the first listed.
    results_path = _latest_matching(csv_paths, 'results')
    summary_path = _latest_matching(csv_paths, 'summary')
    print(results_path, summary_path)

    if results_path is None or summary_path is None:
        return _message_outputs(
            "Benchmark completed, but the results and/or summary CSV file was not found in the results directory."
        )

    # Round all float values to 4 decimal places for display.
    results_df = pd.read_csv(results_path).round(4)
    summary_df = pd.read_csv(summary_path).round(4)

    return gr.Dataframe(value=results_df), gr.Dataframe(value=summary_df)


# Declare the UI: a header followed by four tabs (leaderboard, about,
# contribution guide, and an interactive benchmark runner).
with demo:
    gr.HTML("<h1>Olas Predict Benchmark</h1>")
    gr.Markdown("Leaderboard showing the performance of Olas Predict tools on the Autocast dataset and overview of the project.")

    with gr.Tabs() as tabs:
        # first tab - leaderboard
        with gr.TabItem("🏅 Benchmark Leaderboard", id=0):

            # `df` is imported from tabs.dashboard
            gr.components.Dataframe(
                value=df,
            )

        # second tab - about
        with gr.TabItem("ℹ️ About"):
            with gr.Row():
                with gr.Accordion("About the Benchmark", open=False):
                    gr.Markdown(about_olas_predict_benchmark)
            with gr.Row():
                with gr.Accordion("About the Tools", open=False):
                    gr.Markdown(about_the_tools)
            with gr.Row():
                with gr.Accordion("About the Autocast Dataset", open=False):
                    gr.Markdown(about_the_dataset)
            with gr.Row():
                with gr.Accordion("About Olas", open=False):
                    gr.Markdown(about_olas_predict)


        # third tab - how to run the benchmark
        with gr.TabItem("🚀 Contribute"):
            gr.Markdown(how_to_run)

        def update_dropdown(tool):
            """Return the model names that match the given tool name.

            Tools whose name contains "claude" map to the Claude models, all
            other tools map to the GPT models.

            NOTE(review): this helper is not connected to any event in this
            file, so the model dropdown below is not filtered by tool.
            """
            if "claude" in tool:
                return ["claude-3-haiku-20240307", "claude-3-sonnet-20240229", "claude-3-opus-20240229"]
            else:
                return ["gpt-3.5-turbo-0125", "gpt-4-0125-preview"]


        # fourth tab - run the benchmark
        with gr.TabItem("🔥 Run the Benchmark"):
            with gr.Row():
                tool_name = gr.Dropdown(
                    [
                        "prediction-offline",
                        "prediction-online",
                        # "prediction-online-summarized-info",
                        # "prediction-offline-sme",
                        # "prediction-online-sme",
                        'prediction-request-rag',
                        'prediction-request-reasoning',
                        "prediction-url-cot-claude",
                        # "prediction-request-rag-cohere",
                        # "prediction-with-research-conservative",
                        # "prediction-with-research-bold",
                    ], label="Tool Name", info="Choose the tool to run")
                # Every entry needs its own trailing comma: adjacent string
                # literals are silently concatenated into a single choice.
                model_name = gr.Dropdown([
                    "gpt-3.5-turbo-0125",
                    "gpt-4-0125-preview",
                    "claude-3-haiku-20240307",
                    "claude-3-sonnet-20240229",
                    "claude-3-opus-20240229",
                    "databricks/dbrx-instruct:nitro",
                    "nousresearch/nous-hermes-2-mixtral-8x7b-sft",
                    # "cohere/command-r-plus",
                ], label="Model Name", info="Choose the model to use")
            with gr.Row():
                # API keys are entered as password fields so they are masked in the UI.
                openai_api_key = gr.Textbox(label="OpenAI API Key", placeholder="Enter your OpenAI API key here", type="password")
                anthropic_api_key = gr.Textbox(label="Anthropic API Key", placeholder="Enter your Anthropic API key here", type="password")
                openrouter_api_key = gr.Textbox(label="OpenRouter API Key", placeholder="Enter your OpenRouter API key here", type="password")
            with gr.Row():
                num_questions = gr.Slider(
                                    minimum=1,
                                    maximum=340,
                                    value=10,
                                    label="Number of questions to run the benchmark on",
                                )
            with gr.Row():
                run_button = gr.Button("Run Benchmark")
            with gr.Row():
                with gr.Accordion("Results", open=True):
                    result = gr.Dataframe()
            with gr.Row():
                with gr.Accordion("Summary", open=False):
                    summary = gr.Dataframe()

            # The handler must return one value per output component (two here).
            run_button.click(run_benchmark_gradio,
                            inputs=[tool_name, model_name, num_questions, openai_api_key, anthropic_api_key, openrouter_api_key],
                            outputs=[result, summary])

# Enable the request queue with a default concurrency limit of 40, then start
# the app. This runs at import time; there is no `__main__` guard in this file.
demo.queue(default_concurrency_limit=40).launch()