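"""Run the Olas prediction-tool benchmark against the filtered Autocast t/f questions.

Each question is sent to the selected mech tool(s), the JSON response is parsed
into per-question rows (probabilities, token usage, cost), and the rows are
written to a timestamped CSV together with an aggregated summary.
"""
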
import csv
import json
import os
import pickle
import time
from pathlib import Path

import openai
import pandas as pd
from dotenv import load_dotenv
from tqdm import tqdm

from benchmark.utils import TokenCounterCallback, get_logger
from mech.packages.valory.customs.prediction_request import prediction_request

load_dotenv()
logger = get_logger(__name__)
this_dir = Path(__file__).parent


def tool_map(tool):
    """Map a tool name to the module that implements it."""
    tool_dict = {
        "prediction-online": prediction_request,
        "prediction-offline": prediction_request,
    }

    if tool not in tool_dict:
        raise ValueError(f"Tool {tool} not found.")
    return tool_dict[tool]


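# The Autocast question files are loaded from the olas-predict-benchmark data
# directory under this script's folder; only resolved t/f questions are used.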
def prepare_questions(kwargs):
    """Load the filtered Autocast questions and the scraped URL contents."""
    with open(
        this_dir
        / "olas-predict-benchmark/benchmark/data/autocast/autocast_questions_filtered.json"
    ) as f:
        test_questions = json.load(f)
    with open(
        this_dir
        / "olas-predict-benchmark/benchmark/data/autocast/autocast_questions_filtered.pkl",
        "rb",
    ) as f:
        url_to_content = pickle.load(f)
    num_questions = kwargs.pop("num_questions", len(test_questions))

    # Keep only resolved true/false questions, up to the requested number.
    questions = []
    for q in test_questions:
        if q["qtype"] == "t/f" and q["answer"] is not None:
            questions.append(q)
        if len(questions) >= num_questions:
            break

    return questions, url_to_content


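# As used below, a tool response is a tuple where index 0 is the JSON payload
# with the prediction fields, index 1 is the raw prompt response text, and
# index 3 is a TokenCounterCallback whose cost_dict holds token and cost
# counters (index 2 is not consumed here).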
def parse_response(response, test_q):
    """Parse a tool response tuple into the per-question result dict."""
    try:
        result = json.loads(response[0])
    except Exception:
        logger.error("The response is not json-format compatible")
        logger.error(f"response[0] = {response[0]}")
        test_q["Correct"] = False
        test_q["prediction"] = None
        return test_q

    # Copy the numeric prediction fields, tolerating missing keys.
    for key in ("p_yes", "p_no", "confidence", "info_utility"):
        test_q[key] = float(result[key]) if key in result else None

    if response[3] is not None:
        test_q["input_tokens"] = response[3].cost_dict["input_tokens"]
        test_q["output_tokens"] = response[3].cost_dict["output_tokens"]
        test_q["total_tokens"] = response[3].cost_dict["total_tokens"]
        test_q["input_cost"] = response[3].cost_dict["input_cost"]
        test_q["output_cost"] = response[3].cost_dict["output_cost"]
        test_q["total_cost"] = response[3].cost_dict["total_cost"]
    test_q["prompt_response"] = response[1].replace(os.linesep, "")

    # No prediction when either probability is missing or they are tied.
    if (
        test_q["p_yes"] is None
        or test_q["p_no"] is None
        or test_q["p_yes"] == test_q["p_no"]
    ):
        test_q["prediction"] = None
    else:
        test_q["prediction"] = "yes" if test_q["p_yes"] > test_q["p_no"] else "no"
    test_q["Correct"] = test_q["prediction"] == test_q["answer"]
    return test_q


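# Aggregate the per-question rows into a per-(tool, model) summary: accuracy,
# crowd-baseline accuracy, and mean token/cost figures.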
def write_results(csv_file_path):
    """Summarise the per-question results CSV and write summary_<timestamp>.csv."""
    results_path = csv_file_path.parent
    time_string = csv_file_path.stem.split("_", 1)[-1]

    results_df = pd.read_csv(csv_file_path)
    num_errors = results_df["error"].count()
    logger.info(f"Num errors: {num_errors}")
    results_df = results_df.dropna(subset=["prediction"])
    grouped_df = results_df.groupby(["tool", "model"]).agg(
        {
            "Correct": ["mean", "sum", "count"],
            "crowd_correct": ["mean"],
            "input_tokens": ["mean"],
            "output_tokens": ["mean"],
            "total_tokens": ["mean"],
            "input_cost": ["mean"],
            "output_cost": ["mean"],
            "total_cost": ["mean"],
        }
    )

    grouped_df.columns = ["_".join(col).strip() for col in grouped_df.columns.values]
    summary_df = grouped_df.reset_index().rename(
        columns={
            "Correct_mean": "accuracy",
            "Correct_sum": "correct",
            "Correct_count": "total",
            "crowd_correct_mean": "crowd_accuracy",
        }
    )

    logger.info(f"Results:\n\n {results_df}")
    summary_df.to_csv(results_path / f"summary_{time_string}.csv", index=False)


def run_benchmark(kwargs):
    """Start the benchmark tests and write per-question and summary results to CSV."""

    logger.info("Running benchmark tests...")

    tools = kwargs.pop("tools")
    model = kwargs.pop("model")[0]
    MAX_RETRIES = kwargs.pop("max_retries", 3)
    questions, url_to_content = prepare_questions(kwargs)
    logger.info(f"Running {len(questions)} questions for each tool: {tools}")

    results_path = Path("results")
    results_path.mkdir(exist_ok=True)

    start_time = time.time()
    time_string = time.strftime("%y%m%d%H%M%S", time.localtime(start_time))
    csv_file_path = results_path / f"results_{time_string}.csv"

    logger.info("Creating csv files...")
    with open(csv_file_path, mode="a", newline="") as file:
        fieldnames = [
            "prompt",
            "answer",
            "tool",
            "model",
            "p_yes",
            "p_no",
            "confidence",
            "info_utility",
            "prediction",
            "Correct",
            "input_tokens",
            "output_tokens",
            "total_tokens",
            "input_cost",
            "output_cost",
            "total_cost",
            "prompt_response",
            "error",
            "crowd_prediction",
            "crowd_correct",
        ]
        writer = csv.DictWriter(file, fieldnames=fieldnames)

        # The file is opened in append mode, so only write the header once.
        if file.tell() == 0:
            writer.writeheader()

        for t in tools:
            logger.info("Loading the tool...")
            try:
                tool = tool_map(t)
            except Exception as e:
                logger.error(f"Error while loading the tool={t}: {e}")
                continue
            correct_answers = 0
            total_answers = 0
            for test_question in tqdm(
                questions, desc=f"Running tool {t}", total=len(questions)
            ):
                test_q = {
                    "prompt": test_question["question"],
                    "answer": test_question["answer"],
                    "tool": t,
                    "model": model,
                    "counter_callback": TokenCounterCallback(),
                    "prompt_response": None,
                }

                if kwargs["provide_source_links"]:
                    test_q["source_links"] = test_question["source_links"]
                    test_q["source_links"] = {
                        source_link: url_to_content[source_link]
                        for source_link in test_q["source_links"]
                    }

                # The crowd baseline is the last recorded crowd forecast.
                crowd_forecast = test_question["crowd"][-1]["forecast"]
                test_q["crowd_prediction"] = (
                    "yes"
                    if crowd_forecast > 0.5
                    else "no" if crowd_forecast < 0.5 else None
                )
                test_q["crowd_correct"] = test_q["crowd_prediction"] == test_q["answer"]

                CURRENT_RETRIES = 0
                while True:
                    try:
                        response = tool.run(**{**test_q, **kwargs})
                        test_q = parse_response(response, test_q)
                        if test_q["Correct"]:
                            correct_answers += 1
                        if test_q["prediction"] is not None:
                            total_answers += 1
                        if total_answers:
                            print(
                                f"===========ACCURACY============== {correct_answers / total_answers * 100}%"
                            )
                        break
                    except openai.APIError as e:
                        logger.error(f"Error running benchmark for tool {t}: {e}")
                        CURRENT_RETRIES += 1
                        if CURRENT_RETRIES > MAX_RETRIES:
                            logger.error(
                                f"Max retries reached for tool {t}. Skipping question."
                            )
                            test_q["error"] = e
                            break
                        logger.info(f"Retrying tool {t} for question {test_q['prompt']}")
                    except Exception as e:
                        logger.error(f"Error running benchmark for tool {t}: {e}")
                        test_q["error"] = e
                        break

                if kwargs["provide_source_links"]:
                    del test_q["source_links"]
                del test_q["counter_callback"]

                writer.writerow(test_q)

    write_results(csv_file_path)

    end_time = time.time()
    total_time = end_time - start_time
    logger.info(f"Total Time: {total_time} seconds")


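# API keys are read from the environment, typically via a .env file picked up
# by load_dotenv() at import time.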
if __name__ == "__main__":
    kwargs = {}
    kwargs["num_questions"] = 10
    kwargs["tools"] = [
        "prediction-online",
    ]
    kwargs["model"] = [
        "gpt-3.5-turbo-0125",
    ]

    kwargs["api_keys"] = {}
    kwargs["api_keys"]["openai"] = os.getenv("OPENAI_API_KEY")
    kwargs["api_keys"]["anthropic"] = os.getenv("ANTHROPIC_API_KEY")
    kwargs["api_keys"]["openrouter"] = os.getenv("OPENROUTER_API_KEY")

    kwargs["num_urls"] = 3
    kwargs["num_words"] = 300
    kwargs["provide_source_links"] = True
    run_benchmark(kwargs)