import os
from pathlib import Path
from typing import List, Optional

import ipfshttpclient
import pandas as pd

from utils import INC_TOOLS

ACCURACY_FILENAME = "tools_accuracy.csv"
# NOTE(review): this address looks incomplete (no host/port) — confirm the
# intended IPFS endpoint before relying on the upload step.
IPFS_SERVER = "/dns/"
SCRIPTS_DIR = Path(__file__).parent
ROOT_DIR = SCRIPTS_DIR.parent
DATA_DIR = ROOT_DIR / "data"

# Format used to serialise the min/max request timestamps when updating.
_TIMELINE_FORMAT = "%Y-%m-%d %H:%M:%S"


def _compute_accuracy_info(
    tools_df: pd.DataFrame, inc_tools: List[str]
) -> pd.DataFrame:
    """Build the per-tool accuracy table from the raw tools dataset.

    The result has the columns ``tool``, ``tool_accuracy`` (percentage) and
    ``total_requests``; ``min`` and ``max`` (earliest/latest ``request_time``
    per tool) are added only when ``tools_df`` has a ``request_time`` column.
    """
    tools_inc = tools_df[tools_df["tool"].isin(inc_tools)]

    # Drop errored requests. Copy so the assignments below act on an
    # independent frame instead of a view of ``tools_df``.
    tools_non_error = tools_inc[tools_inc["error"] != 1].copy()
    tools_non_error["currentAnswer"] = tools_non_error["currentAnswer"].replace(
        {"no": "No", "yes": "Yes"}
    )
    # Keep only the rows where both the answer and the vote are Yes/No.
    tools_non_error = tools_non_error[
        tools_non_error["currentAnswer"].isin(["Yes", "No"])
    ]
    tools_non_error = tools_non_error[
        tools_non_error["vote"].isin(["Yes", "No"])
    ].copy()
    tools_non_error["win"] = (
        tools_non_error["currentAnswer"] == tools_non_error["vote"]
    ).astype(int)
    tools_non_error.columns = tools_non_error.columns.astype(str)
    print("Tools dataset after filtering")
    print(tools_non_error.head())

    wins = tools_non_error.groupby(["tool", "win"]).size().unstack().fillna(0)
    # If every request is a win (or every one a loss) ``unstack`` yields a
    # single column; make sure both outcome columns are always present.
    wins = wins.reindex(columns=[0, 1], fill_value=0)
    wins["tool_accuracy"] = (wins[1] / (wins[0] + wins[1])) * 100
    wins.reset_index(inplace=True)
    wins["total_requests"] = wins[0] + wins[1]
    wins.columns = wins.columns.astype(str)
    wins = wins[["tool", "tool_accuracy", "total_requests"]]
    print("Wins dataset")
    print(wins.head())

    try:
        timeline = tools_non_error.groupby(["tool"])["request_time"].agg(
            ["min", "max"]
        )
    except KeyError:
        # The dataset has no ``request_time`` column.
        print("NO REQUEST TIME INFORMATION AVAILABLE")
        return wins

    print("timeline dataset")
    print(timeline.head())
    return wins.merge(timeline, how="left", on="tool")


def update_tools_accuracy(
    tools_acc: Optional[pd.DataFrame], tools_df: pd.DataFrame, inc_tools: List[str]
) -> pd.DataFrame:
    """To compute/update the latest accuracy information for the different mech tools

    Args:
        tools_acc: previously stored accuracy table, or ``None`` when no
            accuracy information exists yet.
        tools_df: raw tools dataset; the columns ``tool``, ``error``,
            ``currentAnswer`` and ``vote`` are read, plus ``request_time``
            when it is present.
        inc_tools: names of the tools to include in the computation.

    Returns:
        The freshly computed table when ``tools_acc`` is ``None``; otherwise a
        copy of ``tools_acc`` with the rows of the computed tools updated and
        rows for previously unknown tools appended. The input ``tools_acc`` is
        not modified.
    """
    # computation of the accuracy information
    acc_info = _compute_accuracy_info(tools_df, inc_tools)
    no_timeline_info = not {"min", "max"}.issubset(acc_info.columns)

    if tools_acc is None:
        print("Creating accuracy file for the first time")
        return acc_info

    # update the old information
    print("Updating accuracy information")
    tools_to_update = list(acc_info["tool"].values)
    print("tools to update")
    print(tools_to_update)
    # Work on a copy so the caller's frame is never mutated in place.
    tools_acc = tools_acc.copy()
    existing_tools = set(tools_acc["tool"].values)

    if not no_timeline_info:
        for column in ("min", "max"):
            # ``.dt`` is only available on datetime-like columns.
            if pd.api.types.is_datetime64_any_dtype(acc_info[column]):
                acc_info[column] = acc_info[column].dt.strftime(_TIMELINE_FORMAT)

    for record in acc_info.to_dict("records"):
        tool = record["tool"]
        # tool,tool_accuracy,total_requests,min,max
        new_row = {
            "tool": tool,
            "tool_accuracy": record["tool_accuracy"],
            "total_requests": record["total_requests"],
            # Without timeline information both bounds are stored as None.
            "min": None if no_timeline_info else record["min"],
            "max": None if no_timeline_info else record["max"],
        }
        if tool in existing_tools:
            is_tool = tools_acc["tool"] == tool
            for column in ("tool_accuracy", "total_requests", "min", "max"):
                tools_acc.loc[is_tool, column] = new_row[column]
        else:
            # new tool to add to the file; wrap the dict in a list because
            # a dict of scalars cannot be turned into a DataFrame directly
            tools_acc = pd.concat(
                [tools_acc, pd.DataFrame([new_row])], ignore_index=True
            )

    print(tools_acc)
    return tools_acc


def compute_tools_accuracy() -> None:
    """Recompute the tools accuracy file and publish it.

    Reads ``tools.parquet`` from ``DATA_DIR``, updates (or creates) the
    accuracy CSV named ``ACCURACY_FILENAME`` in the same directory and adds
    the resulting file to IPFS, printing the returned hash.
    """
    print("Computing accuracy of tools")
    print("Reading tools parquet file")
    tools = pd.read_parquet(DATA_DIR / "tools.parquet")
    print(tools.head())

    # Computing tools accuracy information
    print("Computing tool accuracy information")
    accuracy_path = DATA_DIR / ACCURACY_FILENAME
    # Load the previous accuracy data only if the file exists
    acc_data = None
    if os.path.exists(accuracy_path):
        acc_data = pd.read_csv(accuracy_path)
    acc_data = update_tools_accuracy(acc_data, tools, INC_TOOLS)

    # save acc_data into a CSV file
    print("Saving into a csv file")
    acc_data.to_csv(accuracy_path, index=False)
    print(acc_data.head())

    # save the data into IPFS
    client = ipfshttpclient.connect(IPFS_SERVER)
    result = client.add(accuracy_path)
    print(f"HASH of the tools accuracy file: {result['Hash']}")


if __name__ == "__main__":
    compute_tools_accuracy()