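"""Run the Olas prediction-tool benchmark against the filtered Autocast t/f questions.

Each question is sent to the selected mech tool(s), the JSON response is parsed
into per-question rows (probabilities, token usage, cost), and the rows are
written to a timestamped CSV together with an aggregated summary.
"""
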
import csv
import json
import os
import pickle
import time
from pathlib import Path

import openai
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm

from benchmark.utils import TokenCounterCallback, get_logger
from mech.packages.valory.customs.prediction_request import prediction_request

load_dotenv()
logger = get_logger(__name__)
this_dir = Path(__file__).parent


def tool_map(tool):
    """Map a tool name to the module that implements it."""
    tool_dict = {
        "prediction-online": prediction_request,
        "prediction-offline": prediction_request,
    }

    if tool not in tool_dict:
        raise ValueError(f"Tool {tool} not found.")
    return tool_dict[tool]


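# The Autocast question files are loaded from the olas-predict-benchmark data
# directory under this script's folder; only resolved t/f questions are used.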
def prepare_questions(kwargs):
    """Load the filtered Autocast questions and the scraped URL contents."""
    with open(
        this_dir
        / "olas-predict-benchmark/benchmark/data/autocast/autocast_questions_filtered.json"
    ) as f:
        test_questions = json.load(f)
    with open(
        this_dir
        / "olas-predict-benchmark/benchmark/data/autocast/autocast_questions_filtered.pkl",
        "rb",
    ) as f:
        url_to_content = pickle.load(f)
    num_questions = kwargs.pop("num_questions", len(test_questions))

    # Keep only resolved true/false questions, up to the requested number.
    questions = []
    for q in test_questions:
        if q["qtype"] == "t/f" and q["answer"] is not None:
            questions.append(q)
        if len(questions) >= num_questions:
            break

    return questions, url_to_content


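# As used below, a tool response is a tuple where index 0 is the JSON payload
# with the prediction fields, index 1 is the raw prompt response text, and
# index 3 is a TokenCounterCallback whose cost_dict holds token and cost
# counters (index 2 is not consumed here).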
def parse_response(response, test_q):
    """Parse a tool response tuple into the per-question result dict."""
    try:
        result = json.loads(response[0])
    except Exception:
        logger.error("The response is not json-format compatible")
        logger.error(f"response[0] = {response[0]}")
        test_q["Correct"] = False
        test_q["prediction"] = None
        return test_q

    # Copy the numeric prediction fields, tolerating missing keys.
    for key in ("p_yes", "p_no", "confidence", "info_utility"):
        test_q[key] = float(result[key]) if key in result else None

    if response[3] is not None:
        test_q["input_tokens"] = response[3].cost_dict["input_tokens"]
        test_q["output_tokens"] = response[3].cost_dict["output_tokens"]
        test_q["total_tokens"] = response[3].cost_dict["total_tokens"]
        test_q["input_cost"] = response[3].cost_dict["input_cost"]
        test_q["output_cost"] = response[3].cost_dict["output_cost"]
        test_q["total_cost"] = response[3].cost_dict["total_cost"]
    test_q["prompt_response"] = response[1].replace(os.linesep, "")

    # No prediction when either probability is missing or they are tied.
    if (
        test_q["p_yes"] is None
        or test_q["p_no"] is None
        or test_q["p_yes"] == test_q["p_no"]
    ):
        test_q["prediction"] = None
    else:
        test_q["prediction"] = "yes" if test_q["p_yes"] > test_q["p_no"] else "no"
    test_q["Correct"] = test_q["prediction"] == test_q["answer"]
    return test_q


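# Aggregate the per-question rows into a per-(tool, model) summary: accuracy,
# crowd-baseline accuracy, and mean token/cost figures.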
def write_results(csv_file_path):
    """Summarise the per-question results CSV and write summary_<timestamp>.csv."""
    results_path = csv_file_path.parent
    time_string = csv_file_path.stem.split("_", 1)[-1]

    results_df = pd.read_csv(csv_file_path)
    num_errors = results_df["error"].count()
    logger.info(f"Num errors: {num_errors}")
    results_df = results_df.dropna(subset=["prediction"])
    grouped_df = results_df.groupby(["tool", "model"]).agg(
        {
            "Correct": ["mean", "sum", "count"],
            "crowd_correct": ["mean"],
            "input_tokens": ["mean"],
            "output_tokens": ["mean"],
            "total_tokens": ["mean"],
            "input_cost": ["mean"],
            "output_cost": ["mean"],
            "total_cost": ["mean"],
        }
    )

    grouped_df.columns = ["_".join(col).strip() for col in grouped_df.columns.values]
    summary_df = grouped_df.reset_index().rename(
        columns={
            "Correct_mean": "accuracy",
            "Correct_sum": "correct",
            "Correct_count": "total",
            "crowd_correct_mean": "crowd_accuracy",
        }
    )

    logger.info(f"Results:\n\n {results_df}")
    summary_df.to_csv(results_path / f"summary_{time_string}.csv", index=False)


def run_benchmark(kwargs):
    """Start the benchmark tests and write per-question and summary results to CSV."""

    logger.info("Running benchmark tests...")

    tools = kwargs.pop("tools")
    model = kwargs.pop("model")[0]
    MAX_RETRIES = kwargs.pop("max_retries", 3)
    questions, url_to_content = prepare_questions(kwargs)
    logger.info(f"Running {len(questions)} questions for each tool: {tools}")

    results_path = Path("results")
    results_path.mkdir(exist_ok=True)

    start_time = time.time()
    time_string = time.strftime("%y%m%d%H%M%S", time.localtime(start_time))
    csv_file_path = results_path / f"results_{time_string}.csv"

    logger.info("Creating csv files...")
    with open(csv_file_path, mode="a", newline="") as file:
        fieldnames = [
            "prompt",
            "answer",
            "tool",
            "model",
            "p_yes",
            "p_no",
            "confidence",
            "info_utility",
            "prediction",
            "Correct",
            "input_tokens",
            "output_tokens",
            "total_tokens",
            "input_cost",
            "output_cost",
            "total_cost",
            "prompt_response",
            "error",
            "crowd_prediction",
            "crowd_correct",
        ]
        writer = csv.DictWriter(file, fieldnames=fieldnames)

        # The file is opened in append mode, so only write the header once.
        if file.tell() == 0:
            writer.writeheader()

        for t in tools:
            logger.info("Loading the tool...")
            try:
                tool = tool_map(t)
            except Exception as e:
                logger.error(f"Error while loading the tool={t}: {e}")
                continue
            correct_answers = 0
            total_answers = 0
            for test_question in tqdm(
                questions, desc=f"Running tool {t}", total=len(questions)
            ):
                test_q = {
                    "prompt": test_question["question"],
                    "answer": test_question["answer"],
                    "tool": t,
                    "model": model,
                    "counter_callback": TokenCounterCallback(),
                    "prompt_response": None,
                }

                if kwargs["provide_source_links"]:
                    test_q["source_links"] = test_question["source_links"]
                    test_q["source_links"] = {
                        source_link: url_to_content[source_link]
                        for source_link in test_q["source_links"]
                    }

                # The crowd baseline is the last recorded crowd forecast.
                crowd_forecast = test_question["crowd"][-1]["forecast"]
                test_q["crowd_prediction"] = (
                    "yes"
                    if crowd_forecast > 0.5
                    else "no" if crowd_forecast < 0.5 else None
                )
                test_q["crowd_correct"] = test_q["crowd_prediction"] == test_q["answer"]

                CURRENT_RETRIES = 0
                while True:
                    try:
                        response = tool.run(**{**test_q, **kwargs})
                        test_q = parse_response(response, test_q)
                        if test_q["Correct"]:
                            correct_answers += 1
                        if test_q["prediction"] is not None:
                            total_answers += 1
                        if total_answers:
                            print(
                                f"===========ACCURACY============== {correct_answers / total_answers * 100}%"
                            )
                        break
                    except openai.APIError as e:
                        logger.error(f"Error running benchmark for tool {t}: {e}")
                        CURRENT_RETRIES += 1
                        if CURRENT_RETRIES > MAX_RETRIES:
                            logger.error(
                                f"Max retries reached for tool {t}. Skipping question."
                            )
                            test_q["error"] = e
                            break
                        logger.info(f"Retrying tool {t} for question {test_q['prompt']}")
                    except Exception as e:
                        logger.error(f"Error running benchmark for tool {t}: {e}")
                        test_q["error"] = e
                        break

                if kwargs["provide_source_links"]:
                    del test_q["source_links"]
                del test_q["counter_callback"]

                writer.writerow(test_q)

    write_results(csv_file_path)

    end_time = time.time()
    total_time = end_time - start_time
    logger.info(f"Total Time: {total_time} seconds")


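# API keys are read from the environment, typically via a .env file picked up
# by load_dotenv() at import time.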
if __name__ == "__main__":
    kwargs = {}
    kwargs["num_questions"] = 10
    kwargs["tools"] = [
        "prediction-online",
    ]
    kwargs["model"] = [
        "gpt-3.5-turbo-0125",
    ]

    kwargs["api_keys"] = {}
    kwargs["api_keys"]["openai"] = os.getenv("OPENAI_API_KEY")
    kwargs["api_keys"]["anthropic"] = os.getenv("ANTHROPIC_API_KEY")
    kwargs["api_keys"]["openrouter"] = os.getenv("OPENROUTER_API_KEY")

    kwargs["num_urls"] = 3
    kwargs["num_words"] = 300
    kwargs["provide_source_links"] = True
    run_benchmark(kwargs)