Spaces:

valory
/

olas-prediction-leaderboard

Running

File size: 9,758 Bytes

da3b15a

#!/usr/bin/env python

import os
import csv
import json
import time
import pickle
import openai
import pandas as pd
from pathlib import Path
from tqdm import tqdm
from dotenv import load_dotenv
from mech.packages.valory.customs.prediction_request import prediction_request
from benchmark.utils import get_logger, TokenCounterCallback

# Call python-dotenv's loader before anything reads the environment; the
# __main__ block below fetches the API keys with os.getenv.
load_dotenv()
# Module-level logger obtained from the project's benchmark.utils helper.
logger = get_logger(__name__)
# Directory containing this script; the benchmark data paths are built from it.
this_dir = Path(__file__).parent


def tool_map(tool):
    """Map the tool name to the object that implements it.

    Args:
        tool: Tool name; ``"prediction-online"`` and ``"prediction-offline"``
            are the registered names.

    Returns:
        The object registered for ``tool`` (``prediction_request`` for both
        registered names).

    Raises:
        Exception: If ``tool`` is not a registered name. The message contains
            the name that was requested.
    """

    tool_dict = {
        "prediction-online": prediction_request,
        "prediction-offline": prediction_request,
    }

    # Keep the lookup result in its own variable: rebinding `tool` here would
    # replace the requested name with None, and the error below would always
    # read "Tool None not found."
    resolved = tool_dict.get(tool)

    if resolved is None:
        raise Exception(f"Tool {tool} not found.")
    return resolved


def prepare_questions(kwargs):
    """Load the filtered Autocast questions and the pickled source-link contents.

    Args:
        kwargs: Benchmark options. ``num_questions`` is popped from it (the
            dict is mutated); when absent, the cap defaults to the number of
            entries in the JSON file.

    Returns:
        A ``(questions, url_to_content)`` tuple. ``questions`` holds, in file
        order, at most ``num_questions`` entries whose ``qtype`` is ``"t/f"``
        and whose ``answer`` is not None. ``url_to_content`` is the object
        unpickled from the ``.pkl`` file.
    """
    # Context managers so both data files are closed even if parsing fails.
    with open(
        this_dir / "olas-predict-benchmark/benchmark/data/autocast/autocast_questions_filtered.json",
        encoding="utf-8",
    ) as f:
        test_questions = json.load(f)
    # NOTE(security): pickle.load executes arbitrary code from the file. This
    # is only acceptable while the .pkl ships with the benchmark data and is
    # trusted; never point this at a file from an untrusted source.
    with open(
        this_dir / "olas-predict-benchmark/benchmark/data/autocast/autocast_questions_filtered.pkl", "rb"
    ) as f:
        url_to_content = pickle.load(f)
    num_questions = kwargs.pop("num_questions", len(test_questions))

    questions = []
    for q in test_questions:
        # Check the cap before appending so that a cap of 0 yields no
        # questions instead of one.
        if len(questions) >= num_questions:
            break
        if q["qtype"] == "t/f" and q["answer"] is not None:
            questions.append(q)

    return questions, url_to_content


def parse_response(response, test_q):
    """Copy the fields of a tool response into the result row ``test_q``.

    Args:
        response: Indexable tool result. ``response[0]`` is parsed as a JSON
            object, ``response[1]`` is stored as ``prompt_response`` with
            ``os.linesep`` occurrences removed, and ``response[3]``, when not
            None, must expose a ``cost_dict`` mapping holding the token and
            cost entries.
        test_q: Result row for the current question; must contain
            ``"answer"``. It is mutated in place.

    Returns:
        The same ``test_q`` dict. ``prediction`` is ``"yes"``/``"no"`` when
        both probabilities are present and differ, otherwise None;
        ``Correct`` is whether ``prediction`` equals ``answer``.
    """
    try:
        result = json.loads(response[0])
    except Exception:
        # Best effort: a response that cannot be parsed counts as a wrong
        # answer with no prediction rather than aborting the question.
        print("The response is not json-format compatible")
        print(f"################### response[0] = {response[0]}")
        test_q["Correct"] = False
        test_q["prediction"] = None
        return test_q

    # A key that is missing or explicitly null is recorded as None.
    for key in ("p_yes", "p_no", "confidence", "info_utility"):
        value = result.get(key)
        test_q[key] = float(value) if value is not None else None

    counter = response[3]
    if counter is not None:
        for key in (
            "input_tokens",
            "output_tokens",
            "total_tokens",
            "input_cost",
            "output_cost",
            "total_cost",
        ):
            test_q[key] = counter.cost_dict[key]
    test_q["prompt_response"] = response[1].replace(os.linesep, "")

    p_yes = test_q["p_yes"]
    p_no = test_q["p_no"]
    # Both probabilities are required to decide. Previously a response that
    # had p_yes but no p_no raised here instead of yielding "no prediction".
    if p_yes is None or p_no is None or p_yes == p_no:
        test_q["prediction"] = None
    else:
        test_q["prediction"] = "yes" if p_yes > p_no else "no"
    test_q["Correct"] = test_q["prediction"] == test_q["answer"]
    return test_q


def write_results(csv_file_path):
    """Aggregate a per-question results CSV into a per-(tool, model) summary.

    Rows without a ``prediction`` are dropped before aggregating. The summary
    is written next to the input as ``summary_<suffix>.csv``, where
    ``<suffix>`` is whatever follows the first underscore in the input file's
    stem.

    Args:
        csv_file_path: ``pathlib.Path`` of the results CSV to summarise.
    """
    output_dir = Path(csv_file_path.parent)
    stem_suffix = csv_file_path.stem.split("_", 1)[-1]

    frame = pd.read_csv(csv_file_path)
    # count() ignores NaN, so this is the number of rows with an error value.
    error_count = frame["error"].count()
    logger.info(f"Num errors: {error_count}")
    frame = frame.dropna(subset=["prediction"])

    # "Correct" gets three statistics; every other column is only averaged.
    aggregations = {"Correct": ["mean", "sum", "count"]}
    for column in (
        "crowd_correct",
        "input_tokens",
        "output_tokens",
        "total_tokens",
        "input_cost",
        "output_cost",
        "total_cost",
    ):
        aggregations[column] = ["mean"]
    per_tool_model = frame.groupby(["tool", "model"]).agg(aggregations)

    # Flatten the two-level (column, statistic) header into "column_statistic".
    per_tool_model.columns = [
        f"{column}_{statistic}".strip()
        for column, statistic in per_tool_model.columns.values
    ]
    friendly_names = {
        "Correct_mean": "accuracy",
        "Correct_sum": "correct",
        "Correct_count": "total",
        "crowd_correct_mean": "crowd_accuracy",
    }
    summary = per_tool_model.reset_index().rename(columns=friendly_names)

    # The log shows the filtered per-question rows, not the summary table.
    logger.info(f"Results:\n\n {frame}")
    summary.to_csv(output_dir / f"summary_{stem_suffix}.csv", index=False)


def run_benchmark(kwargs):
    """Run each requested tool over the benchmark questions and record results.

    One row per (tool, question) is appended to
    ``results/results_<timestamp>.csv`` (timestamp is local time formatted as
    ``%y%m%d%H%M%S``). Afterwards ``write_results`` is called on that file.

    Args:
        kwargs: Benchmark options; the caller's dict is not modified.
            ``tools`` (iterable of tool names) and ``model`` (sequence, only
            the first entry is used) are required. ``max_retries`` (default
            3), ``num_questions`` and ``provide_source_links`` (default
            False) are optional. Every option other than ``tools``,
            ``model``, ``max_retries`` and ``num_questions`` is forwarded to
            ``tool.run`` together with the per-question fields.
    """

    logger.info("Running benchmark tests...")

    # Shallow copy: the pops below (and the one inside prepare_questions)
    # must not strip keys from the dict the caller passed in.
    kwargs = dict(kwargs)
    tools = kwargs.pop("tools")
    model = kwargs.pop("model")[0]
    max_retries = kwargs.pop("max_retries", 3)
    questions, url_to_content = prepare_questions(kwargs)
    provide_source_links = kwargs.get("provide_source_links", False)
    logger.info(f"Running {len(questions)} questions for each tool: {tools}")

    results_path = Path("results")
    results_path.mkdir(exist_ok=True)

    start_time = time.time()
    time_string = time.strftime("%y%m%d%H%M%S", time.localtime(start_time))
    csv_file_path = results_path / f"results_{time_string}.csv"

    fieldnames = [
        "prompt",
        "answer",
        "tool",
        "model",
        "p_yes",
        "p_no",
        "confidence",
        "info_utility",
        "prediction",
        "Correct",
        "input_tokens",
        "output_tokens",
        "total_tokens",
        "input_cost",
        "output_cost",
        "total_cost",
        "prompt_response",
        "error",
        "crowd_prediction",
        "crowd_correct",
    ]

    logger.info("Creating csv files...")
    with open(csv_file_path, mode="a", newline="") as file:
        writer = csv.DictWriter(file, fieldnames=fieldnames)

        # The file is opened in append mode: only a fresh file gets a header.
        if file.tell() == 0:
            writer.writeheader()

        for t in tools:
            logger.info("Loading the tool...")
            try:
                tool = tool_map(t)
            except Exception as e:
                # Log the requested name `t`; `tool` is unbound (first tool)
                # or stale (later tools) when the lookup fails.
                logger.error(f"Error while loading the tool={t}: {e}")
                continue
            correct_answers = 0
            total_answers = 0
            for test_question in tqdm(
                questions, desc=f"Running tool {t}", total=len(questions)
            ):
                # Binarise the last crowd forecast; exactly 0.5 is undecided.
                crowd_forecast = test_question["crowd"][-1]["forecast"]
                if crowd_forecast > 0.5:
                    crowd_prediction = "yes"
                elif crowd_forecast < 0.5:
                    crowd_prediction = "no"
                else:
                    crowd_prediction = None

                test_q = {
                    "prompt": test_question["question"],
                    "answer": test_question["answer"],
                    "crowd_prediction": crowd_prediction,
                    "crowd_correct": crowd_prediction == test_question["answer"],
                    "tool": t,
                    "model": model,
                    "counter_callback": TokenCounterCallback(),
                    "prompt_response": None,
                }

                if provide_source_links:
                    test_q["source_links"] = {
                        source_link: url_to_content[source_link]
                        for source_link in test_question["source_links"]
                    }

                retries = 0
                while True:
                    try:
                        response = tool.run(**{**test_q, **kwargs})
                        test_q = parse_response(response, test_q)
                        if test_q["Correct"]:
                            correct_answers += 1
                        if test_q["prediction"] is not None:
                            total_answers += 1
                            print(
                                f"===========ACCURACY============== {correct_answers/total_answers*100}%"
                            )
                        break
                    except openai.APIError as e:
                        # Only API errors are retried; after max_retries
                        # retries the error is recorded on the row instead.
                        logger.error(f"Error running benchmark for tool {t}: {e}")
                        retries += 1
                        if retries > max_retries:
                            logger.error(
                                f"Max retries reached for tool {t}. Skipping question."
                            )
                            test_q["error"] = e
                            break
                        logger.info(
                            f"Retrying tool {t} for question {test_q['prompt']}"
                        )
                    except Exception as e:
                        # Any other failure is recorded on the row so one bad
                        # question does not abort the whole run.
                        logger.error(f"Error running benchmark for tool {t}: {e}")
                        test_q["error"] = e
                        break

                # Drop the entries that are not CSV columns before writing.
                if provide_source_links:
                    del test_q["source_links"]
                del test_q["counter_callback"]

                writer.writerow(test_q)

    # Summarise only after the CSV has been closed, so all rows are on disk.
    write_results(csv_file_path)

    end_time = time.time()
    total_time = end_time - start_time
    logger.info(f"Total Time: {total_time} seconds")


if __name__ == "__main__":
    # Configuration for a small run: ten questions, one tool, one model.
    # API keys are read from the environment and are None when unset.
    kwargs = {
        "num_questions": 10,
        "tools": ["prediction-online"],
        "model": ["gpt-3.5-turbo-0125"],
        "api_keys": {
            "openai": os.getenv("OPENAI_API_KEY"),
            "anthropic": os.getenv("ANTHROPIC_API_KEY"),
            "openrouter": os.getenv("OPENROUTER_API_KEY"),
        },
        "num_urls": 3,
        "num_words": 300,
        "provide_source_links": True,
    }
    run_benchmark(kwargs)