Spaces:

valory
/

olas-prediction-leaderboard

Running

cyberosa

disabling temporarily the run_benchmark tab

a9bd212 5 months ago

6.71 kB

	import start
	import gradio as gr
	import pandas as pd
	from glob import glob
	from pathlib import Path
	from tabs.dashboard import df
	from tabs.faq import (
	about_olas_predict_benchmark,
	about_olas_predict,
	about_the_dataset,
	about_the_tools,
	)
	from tabs.howto_benchmark import how_to_run

	# Temporarily disabled together with the "Run the Benchmark" tab
	# (see the commented-out fourth tab inside the `with demo:` block).
	# from tabs.run_benchmark import run_benchmark_main

	# Top-level Blocks container; the UI is declared inside `with demo:` below
	# and launched at the end of the file.
	demo = gr.Blocks()


	def run_benchmark_gradio(
	    tool_name,
	    model_name,
	    num_questions,
	    openai_api_key,
	    anthropic_api_key,
	    openrouter_api_key,
	):
	    """Run the benchmark using the UI inputs.

	    Args:
	        tool_name: Name of the tool to benchmark; must be non-empty.
	        model_name: Name of the model, passed through to the benchmark.
	        num_questions: Number of questions, passed through to the benchmark.
	        openai_api_key: OpenAI API key, or None/empty when not supplied.
	        anthropic_api_key: Anthropic API key, or None/empty when not supplied.
	        openrouter_api_key: OpenRouter API key, or None/empty when not supplied.

	    Returns:
	        A plain message string when input validation fails. Otherwise a pair
	        of Gradio components: two Dataframes (results, summary) when the
	        benchmark reports "completed" and both CSV files are found, or two
	        Textboxes carrying the benchmark status / an error message.
	    """
	    # NOTE(review): the (currently commented-out) click handler wires two
	    # outputs, while the two validation branches return a single string.
	    # The shape is kept for backward compatibility -- confirm before
	    # re-enabling the tab.
	    # Truthiness instead of `is None`, so a blank field ("") is rejected too.
	    if not tool_name:
	        return "Please enter the name of your tool."
	    if not any((openai_api_key, anthropic_api_key, openrouter_api_key)):
	        return "Please enter either OpenAI or Anthropic or OpenRouter API key."

	    # Imported lazily: the module-level import is commented out, so without
	    # this the call below raises NameError. Keeping it local leaves the
	    # app start-up path untouched while the tab is disabled.
	    from tabs.run_benchmark import run_benchmark_main

	    result = run_benchmark_main(
	        tool_name,
	        model_name,
	        num_questions,
	        openai_api_key,
	        anthropic_api_key,
	        openrouter_api_key,
	    )

	    if result == "completed":
	        # get the results file in the results directory
	        fns = glob("results/*.csv")

	        print(f"Number of files in results directory: {len(fns)}")

	        # convert to Path
	        files = [Path(file) for file in fns]

	        # split the CSVs into results and summary files by file name
	        results_files = [file for file in files if "results" in file.name]
	        summary_files = [file for file in files if "summary" in file.name]

	        print(results_files, summary_files)

	        # Guard against a missing CSV instead of failing with IndexError.
	        if not results_files or not summary_files:
	            return gr.Textbox(
	                label="Benchmark Result",
	                value="Benchmark completed, but the results or summary CSV "
	                "file was not found in the results directory.",
	                interactive=False,
	            ), gr.Textbox(label="Summary", value="")

	        # load the first match of each kind; round floats to 4 decimal places
	        results_df = pd.read_csv(results_files[0]).round(4)
	        summary_df = pd.read_csv(summary_files[0]).round(4)

	        return gr.Dataframe(value=results_df), gr.Dataframe(value=summary_df)

	    # Any other status is shown verbatim to the user.
	    return gr.Textbox(
	        label="Benchmark Result", value=result, interactive=False
	    ), gr.Textbox(label="Summary", value="")


	# Declare the UI: a page header followed by three tabs
	# (leaderboard, about, contribute).
	with demo:
	    # The heading must be closed with a matching </h1> tag.
	    gr.HTML("<h1>Olas Predict Benchmark</h1>")
	    gr.Markdown(
	        "Leaderboard showing the performance of Olas Predict tools on the Autocast dataset and overview of the project."
	    )

	    with gr.Tabs() as tabs:
	        # first tab - leaderboard (table imported from tabs.dashboard)
	        with gr.TabItem("🏅 Benchmark Leaderboard", id=0):
	            gr.components.Dataframe(
	                value=df,
	            )

	        # second tab - about: one collapsed accordion per FAQ text
	        with gr.TabItem("ℹ️ About"):
	            with gr.Row():
	                with gr.Accordion("About the Benchmark", open=False):
	                    gr.Markdown(about_olas_predict_benchmark)
	            with gr.Row():
	                with gr.Accordion("About the Tools", open=False):
	                    gr.Markdown(about_the_tools)
	            with gr.Row():
	                with gr.Accordion("About the Autocast Dataset", open=False):
	                    gr.Markdown(about_the_dataset)
	            with gr.Row():
	                with gr.Accordion("About Olas", open=False):
	                    gr.Markdown(about_olas_predict)

	        # third tab - how to run the benchmark
	        with gr.TabItem("🚀 Contribute"):
	            gr.Markdown(how_to_run)

	# fourth tab - run the benchmark
	# with gr.TabItem("🔥 Run the Benchmark"):
	# with gr.Row():
	# tool_name = gr.Dropdown(
	# [
	# "prediction-offline",
	# "prediction-online",
	# # "prediction-online-summarized-info",
	# # "prediction-offline-sme",
	# # "prediction-online-sme",
	# "prediction-request-rag",
	# "prediction-request-reasoning",
	# # "prediction-url-cot-claude",
	# # "prediction-request-rag-cohere",
	# # "prediction-with-research-conservative",
	# # "prediction-with-research-bold",
	# ],
	# label="Tool Name",
	# info="Choose the tool to run",
	# )
	# model_name = gr.Dropdown(
	# [
	# "gpt-3.5-turbo-0125",
	# "gpt-4-0125-preview",
	# "claude-3-haiku-20240307",
	# "claude-3-sonnet-20240229",
	# "claude-3-opus-20240229",
	# "databricks/dbrx-instruct:nitro",
	# "nousresearch/nous-hermes-2-mixtral-8x7b-sft",
	# # "cohere/command-r-plus",
	# ],
	# label="Model Name",
	# info="Choose the model to use",
	# )
	# with gr.Row():
	# openai_api_key = gr.Textbox(
	# label="OpenAI API Key",
	# placeholder="Enter your OpenAI API key here",
	# type="password",
	# )
	# anthropic_api_key = gr.Textbox(
	# label="Anthropic API Key",
	# placeholder="Enter your Anthropic API key here",
	# type="password",
	# )
	# openrouter_api_key = gr.Textbox(
	# label="OpenRouter API Key",
	# placeholder="Enter your OpenRouter API key here",
	# type="password",
	# )
	# with gr.Row():
	# num_questions = gr.Slider(
	# minimum=1,
	# maximum=340,
	# value=10,
	# label="Number of questions to run the benchmark on",
	# )
	# with gr.Row():
	# run_button = gr.Button("Run Benchmark")
	# with gr.Row():
	# with gr.Accordion("Results", open=True):
	# result = gr.Dataframe()
	# with gr.Row():
	# with gr.Accordion("Summary", open=False):
	# summary = gr.Dataframe()

	# run_button.click(
	# run_benchmark_gradio,
	# inputs=[
	# tool_name,
	# model_name,
	# num_questions,
	# openai_api_key,
	# anthropic_api_key,
	# openrouter_api_key,
	# ],
	# outputs=[result, summary],
	# )


	# Configure the queue with default_concurrency_limit=40, then start the app.
	demo.queue(default_concurrency_limit=40).launch()