olas-prediction-live-dashboard / scripts /update_tools_accuracy.py
rosacastillo's picture
cleaning and refactoring code
6e7e273
raw
history blame
4.26 kB
import os
import pandas as pd
import ipfshttpclient
from pathlib import Path
from utils import INC_TOOLS
from typing import List
# Name of the CSV file (inside DATA_DIR) that stores the per-tool accuracy table.
ACCURACY_FILENAME = "tools_accuracy.csv"
# Address handed to ipfshttpclient.connect() when publishing the accuracy file.
IPFS_SERVER = "/dns/registry.autonolas.tech/tcp/443/https"
# Directory layout, derived from the location of this script:
# <root>/scripts/<this file> and <root>/data/<datasets>.
SCRIPTS_DIR = Path(__file__).parent
ROOT_DIR = SCRIPTS_DIR.parent
DATA_DIR = ROOT_DIR.joinpath("data")
def update_tools_accuracy(
    tools_acc: pd.DataFrame, tools_df: pd.DataFrame, inc_tools: List[str]
) -> pd.DataFrame:
    """Compute or update the latest accuracy information for the mech tools.

    Args:
        tools_acc: Previously saved accuracy table with the columns ``tool``,
            ``tool_accuracy``, ``total_requests`` (and ``min``/``max`` when
            timeline data was available), or ``None`` when no table exists yet.
        tools_df: Tool requests. The columns ``tool``, ``error``,
            ``currentAnswer`` and ``vote`` are read; ``request_time`` is
            optional and only used for the ``min``/``max`` timeline columns.
        inc_tools: Names of the tools to include in the computation.

    Returns:
        The accuracy table. When ``tools_acc`` is ``None`` this is the freshly
        computed table. Otherwise the rows of ``tools_acc`` for already known
        tools are updated in place and rows for tools seen for the first time
        are appended.
    """
    binary_answers = ["Yes", "No"]

    # Keep only the requested tools and drop the requests flagged as errors.
    # Work on a copy so the assignments below never write into a view of
    # the caller's dataframe.
    selected = tools_df["tool"].isin(inc_tools) & (tools_df["error"] != 1)
    tools_non_error = tools_df[selected].copy()
    tools_non_error["currentAnswer"] = tools_non_error["currentAnswer"].replace(
        {"no": "No", "yes": "Yes"}
    )

    # Only requests where both the market answer and the tool vote are binary
    # can be scored.
    scorable = tools_non_error["currentAnswer"].isin(
        binary_answers
    ) & tools_non_error["vote"].isin(binary_answers)
    tools_non_error = tools_non_error[scorable].copy()
    tools_non_error["win"] = (
        tools_non_error["currentAnswer"] == tools_non_error["vote"]
    ).astype(int)
    tools_non_error.columns = tools_non_error.columns.astype(str)
    print("Tools dataset after filtering")
    print(tools_non_error.head())

    # Count losses (0) and wins (1) per tool. The reindex guarantees both
    # columns exist even when no tool has any loss (or any win); without it
    # the lookups below fail with a KeyError.
    outcome_counts = (
        tools_non_error.groupby(["tool", "win"])
        .size()
        .unstack(fill_value=0)
        .reindex(columns=[0, 1], fill_value=0)
    )
    total_requests = outcome_counts[0] + outcome_counts[1]
    wins = pd.DataFrame(
        {
            "tool_accuracy": outcome_counts[1] / total_requests * 100,
            "total_requests": total_requests,
        }
    ).reset_index()
    print("Wins dataset")
    print(wins.head())

    # The timeline columns are optional: a missing "request_time" column
    # surfaces as a KeyError, any other failure is a real error and propagates.
    try:
        timeline = tools_non_error.groupby("tool")["request_time"].agg(
            ["min", "max"]
        )
    except KeyError:
        print("NO REQUEST TIME INFORMATION AVAILABLE")
        acc_info = wins
    else:
        print("timeline dataset")
        print(timeline.head())
        acc_info = wins.merge(timeline, how="left", on="tool")

    if tools_acc is None:
        print("Creating accuracy file for the first time")
        return acc_info

    # update the old information
    print("Updating accuracy information")
    tools_to_update = acc_info["tool"].tolist()
    print("tools to update")
    print(tools_to_update)
    existing_tools = set(tools_acc["tool"])

    for record in acc_info.to_dict("records"):
        tool = record["tool"]
        if tool not in existing_tools:
            continue
        row_mask = tools_acc["tool"] == tool
        tools_acc.loc[row_mask, "tool_accuracy"] = record["tool_accuracy"]
        tools_acc.loc[row_mask, "total_requests"] = record["total_requests"]
        # Without timeline information the keys are absent and None is stored.
        tools_acc.loc[row_mask, "min"] = record.get("min")
        tools_acc.loc[row_mask, "max"] = record.get("max")

    # Tools that were not in the stored table yet must be added, otherwise
    # they would never be recorded once the accuracy file exists.
    new_rows = acc_info[~acc_info["tool"].isin(existing_tools)]
    if not new_rows.empty:
        tools_acc = pd.concat([tools_acc, new_rows], ignore_index=True)

    print(tools_acc)
    return tools_acc
def compute_tools_accuracy():
    """Recompute the tools accuracy table, save it as CSV and publish it to IPFS.

    Reads ``tools.parquet`` from ``DATA_DIR``, updates (or creates) the
    accuracy table stored in ``DATA_DIR / ACCURACY_FILENAME``, writes it back
    to that file and adds the file to the IPFS server at ``IPFS_SERVER``,
    printing the resulting hash.
    """
    print("Computing accuracy of tools")
    print("Reading tools parquet file")
    tools = pd.read_parquet(DATA_DIR / "tools.parquet")
    print(tools.head())

    # Computing tools accuracy information
    print("Computing tool accuracy information")
    accuracy_path = DATA_DIR / ACCURACY_FILENAME
    # A missing file means this is the first run: pass None so that a new
    # table is created instead of updating an existing one.
    acc_data = pd.read_csv(accuracy_path) if accuracy_path.exists() else None
    acc_data = update_tools_accuracy(acc_data, tools, INC_TOOLS)

    # save acc_data into a CSV file
    print("Saving into a csv file")
    acc_data.to_csv(accuracy_path, index=False)
    print(acc_data.head())

    # save the data into IPFS; the context manager closes the client session
    # once the upload is done instead of leaking the connection.
    with ipfshttpclient.connect(IPFS_SERVER) as client:
        result = client.add(accuracy_path)
    print(f"HASH of the tools accuracy file: {result['Hash']}")
if __name__ == "__main__":
    # Script entry point: run the full accuracy update only when this file is
    # executed directly, not when it is imported.
    compute_tools_accuracy()