cyberosa
cleaning and add main repo of benchmark
d269dc6
raw
history blame
9.81 kB
#!/usr/bin/env python
import os
import csv
import json
import time
import pickle
import openai
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from dotenv import load_dotenv
from mech.packages.valory.customs.prediction_request import prediction_request
from benchmark.utils import get_logger, TokenCounterCallback
# Load variables from a .env file into the environment; the API keys used by
# the script entry point are read with os.getenv.
load_dotenv()
# Module-level logger obtained from the project's benchmark.utils helper.
logger = get_logger(__name__)
# Directory containing this file; the dataset paths are built relative to it.
this_dir = Path(__file__).parent
def tool_map(tool):
    """Map the tool name to the tool implementation.

    Args:
        tool: Name of the benchmark tool, e.g. ``"prediction-online"``.

    Returns:
        The object registered for that name. Both supported names currently
        resolve to ``prediction_request``.

    Raises:
        Exception: If ``tool`` is not a registered tool name.
    """
    tool_dict = {
        "prediction-online": prediction_request,
        "prediction-offline": prediction_request,
    }
    # Keep the lookup result in its own variable so the error message can
    # still report the name that was actually requested.
    tool_impl = tool_dict.get(tool)
    if tool_impl is None:
        raise Exception(f"Tool {tool} not found.")
    return tool_impl
def prepare_questions(kwargs):
    """Load the Autocast questions and the pickled source-link content.

    Args:
        kwargs: Benchmark options. ``"num_questions"`` is popped from this
            dict (the caller's dict is mutated) and defaults to the number
            of questions in the dataset file.

    Returns:
        A tuple ``(questions, url_to_content)``. ``questions`` holds the
        dataset entries whose ``qtype`` is ``"t/f"`` and whose ``answer`` is
        not ``None``, collected in file order until ``num_questions`` have
        been gathered. ``url_to_content`` is the object unpickled from the
        ``.pkl`` file next to the questions file.
    """
    data_dir = this_dir / "olas-predict-benchmark/benchmark/data/autocast"

    # Use context managers so both dataset files are closed deterministically.
    with open(
        data_dir / "autocast_questions_filtered.json", encoding="utf-8"
    ) as f:
        test_questions = json.load(f)

    # NOTE: pickle.load can execute arbitrary code contained in the file;
    # only run this against a dataset file from a trusted source.
    with open(data_dir / "autocast_questions_filtered.pkl", "rb") as f:
        url_to_content = pickle.load(f)

    num_questions = kwargs.pop("num_questions", len(test_questions))

    questions = []
    for q in test_questions:
        if q["qtype"] == "t/f" and q["answer"] is not None:
            questions.append(q)
        if len(questions) >= num_questions:
            break

    return questions, url_to_content
def parse_response(response, test_q):
    """Copy the fields of a tool response into the question record.

    Args:
        response: Indexable result of the tool's ``run`` call. Index 0 is
            parsed as JSON text, index 1 is the prompt/response text, and
            index 3 is either ``None`` or an object exposing a ``cost_dict``
            mapping with the token and cost entries.
        test_q: Dict describing the question; it must contain ``"answer"``.
            It is updated in place.

    Returns:
        The same ``test_q`` dict. ``"prediction"`` is ``"yes"``/``"no"``, or
        ``None`` when the JSON could not be parsed, when either probability
        is missing, or when both probabilities are equal. ``"Correct"`` is
        whether the prediction equals ``test_q["answer"]``.
    """
    try:
        result = json.loads(response[0])
    except Exception:
        print("The response is not json-format compatible")
        print(f"################### response[0] = {response[0]}")
        test_q["Correct"] = False
        test_q["prediction"] = None
        return test_q

    # Numeric fields are optional in the response; absent ones become None.
    for key in ("p_yes", "p_no", "confidence", "info_utility"):
        test_q[key] = float(result[key]) if key in result else None

    counter = response[3]
    if counter is not None:
        for key in (
            "input_tokens",
            "output_tokens",
            "total_tokens",
            "input_cost",
            "output_cost",
            "total_cost",
        ):
            test_q[key] = counter.cost_dict[key]

    # Strip line separators from the stored text.
    test_q["prompt_response"] = response[1].replace(os.linesep, "")

    p_yes = test_q["p_yes"]
    p_no = test_q["p_no"]
    # A prediction needs both probabilities; a tie is treated as no prediction.
    if p_yes is None or p_no is None or p_yes == p_no:
        test_q["prediction"] = None
    else:
        test_q["prediction"] = "yes" if p_yes > p_no else "no"

    test_q["Correct"] = test_q["prediction"] == test_q["answer"]
    return test_q
def write_results(csv_file_path):
    """Summarise a results CSV per (tool, model) and save the summary beside it.

    Args:
        csv_file_path: ``Path`` of the results CSV. The part of its stem
            after the first underscore is reused as the suffix of the
            summary file name.

    The number of non-empty ``error`` cells is logged, rows without a
    prediction are dropped, and the remaining rows are aggregated by tool and
    model. The summary is written to ``summary_<suffix>.csv`` in the same
    directory as the input file.
    """
    output_dir = Path(csv_file_path.parent)
    suffix = csv_file_path.stem.split("_", 1)[-1]

    frame = pd.read_csv(csv_file_path)
    error_count = frame["error"].count()
    logger.info(f"Num errors: {str(error_count)}")

    # Only rows that produced a prediction take part in the statistics.
    scored = frame.dropna(subset=["prediction"])

    aggregations = {"Correct": ["mean", "sum", "count"]}
    for column in (
        "crowd_correct",
        "input_tokens",
        "output_tokens",
        "total_tokens",
        "input_cost",
        "output_cost",
        "total_cost",
    ):
        aggregations[column] = ["mean"]

    per_tool_model = scored.groupby(["tool", "model"]).agg(aggregations)
    # Flatten the (column, statistic) pairs into "column_statistic" names.
    per_tool_model.columns = [
        "_".join(pair).strip() for pair in per_tool_model.columns.values
    ]

    friendly_names = {
        "Correct_mean": "accuracy",
        "Correct_sum": "correct",
        "Correct_count": "total",
        "crowd_correct_mean": "crowd_accuracy",
    }
    summary = per_tool_model.reset_index().rename(columns=friendly_names)

    logger.info(f"Results:\n\n {scored}")
    summary.to_csv(output_dir / f"summary_{suffix}.csv", index=False)
def run_benchmark(kwargs):
    """Run every configured tool over the benchmark questions and save results.

    Args:
        kwargs: Benchmark options; the dict is mutated. ``"tools"`` (list of
            tool names), ``"model"`` (list whose first element is used) and
            the optional ``"max_retries"`` (default 3) are popped here, and
            ``"num_questions"`` is popped by ``prepare_questions``.
            ``"provide_source_links"`` is read to decide whether the source
            link content is attached to each question. Whatever remains is
            forwarded to the tool's ``run`` together with the question fields.

    One row per (tool, question) is appended to
    ``results/results_<timestamp>.csv``; ``write_results`` is then called on
    that file to produce the summary.
    """
    logger.info("Running benchmark tests...")

    tools = kwargs.pop("tools")
    model = kwargs.pop("model")[0]
    max_retries = kwargs.pop("max_retries", 3)
    questions, url_to_content = prepare_questions(kwargs)
    logger.info(f"Running {len(questions)} questions for each tool: {tools}")

    results_path = Path("results")
    results_path.mkdir(exist_ok=True)

    start_time = time.time()
    time_string = time.strftime("%y%m%d%H%M%S", time.localtime(start_time))
    csv_file_path = results_path / f"results_{time_string}.csv"

    logger.info("Creating csv files...")
    # Written as UTF-8 because pandas reads the file back as UTF-8 by default.
    with open(csv_file_path, mode="a", newline="", encoding="utf-8") as file:
        fieldnames = [
            "prompt",
            "answer",
            "tool",
            "model",
            "p_yes",
            "p_no",
            "confidence",
            "info_utility",
            "prediction",
            "Correct",
            "input_tokens",
            "output_tokens",
            "total_tokens",
            "input_cost",
            "output_cost",
            "total_cost",
            "prompt_response",
            "error",
            "crowd_prediction",
            "crowd_correct",
        ]
        writer = csv.DictWriter(file, fieldnames=fieldnames)

        # The file is opened in append mode: only emit the header when empty.
        if file.tell() == 0:
            writer.writeheader()

        for t in tools:
            logger.info("Loading the tool...")
            try:
                tool = tool_map(t)
            except Exception as e:
                # Report the requested name; ``tool`` may be unbound or stale.
                logger.error(f"Error while loading the tool={t}: {e}")
                continue

            correct_answers = 0
            total_answers = 0
            for test_question in tqdm(
                questions, desc=f"Running tool {t}", total=len(questions)
            ):
                test_q = {
                    "prompt": test_question["question"],
                    "answer": test_question["answer"],
                    "crowd_prediction": test_question["crowd"][-1]["forecast"],
                    "tool": t,
                    "model": model,
                    "counter_callback": TokenCounterCallback(),
                    "prompt_response": None,
                }

                if kwargs["provide_source_links"]:
                    test_q["source_links"] = {
                        source_link: url_to_content[source_link]
                        for source_link in test_question["source_links"]
                    }

                # Turn the last crowd forecast into a yes/no label; exactly
                # 0.5 yields no crowd prediction.
                crowd_forecast = test_question["crowd"][-1]["forecast"]
                test_q["crowd_prediction"] = (
                    "yes"
                    if crowd_forecast > 0.5
                    else "no" if crowd_forecast < 0.5 else None
                )
                test_q["crowd_correct"] = (
                    test_q["crowd_prediction"] == test_q["answer"]
                )

                current_retries = 0
                while True:
                    try:
                        response = tool.run(**{**test_q, **kwargs})
                        test_q = parse_response(response, test_q)
                        if test_q["Correct"]:
                            correct_answers += 1
                        if test_q["prediction"] is not None:
                            total_answers += 1

                        # No accuracy can be reported until at least one
                        # question produced a prediction; dividing earlier
                        # would flag a valid response as an error.
                        if total_answers:
                            print(
                                f"===========ACCURACY============== {correct_answers/total_answers*100}%"
                            )
                        break

                    except openai.APIError as e:
                        logger.error(f"Error running benchmark for tool {t}: {e}")
                        current_retries += 1
                        if current_retries > max_retries:
                            logger.error(
                                f"Max retries reached for tool {t}. Skipping question."
                            )
                            test_q["error"] = e
                            break
                        logger.info(
                            f"Retrying tool {t} for question {test_q['prompt']}"
                        )

                    except Exception as e:
                        logger.error(f"Error running benchmark for tool {t}: {e}")
                        test_q["error"] = e
                        break

                # Drop the entries that are not CSV columns before writing.
                if kwargs["provide_source_links"]:
                    del test_q["source_links"]
                del test_q["counter_callback"]

                writer.writerow(test_q)

    # The CSV is closed (and therefore flushed) before it is read back.
    write_results(csv_file_path)

    end_time = time.time()
    total_time = end_time - start_time
    logger.info(f"Total Time: {total_time} seconds")
if __name__ == "__main__":
    # Configuration for a direct run of this script: ten questions, one
    # tool and one model, with API keys taken from the environment.
    kwargs = {
        "num_questions": 10,
        "tools": ["prediction-online"],
        "model": ["gpt-3.5-turbo-0125"],
        "api_keys": {
            "openai": os.getenv("OPENAI_API_KEY"),
            "anthropic": os.getenv("ANTHROPIC_API_KEY"),
            "openrouter": os.getenv("OPENROUTER_API_KEY"),
        },
        "num_urls": 3,
        "num_words": 300,
        "provide_source_links": True,
    }
    run_benchmark(kwargs)