from email.utils import parseaddr from huggingface_hub import HfApi import os import datetime import pandas as pd import json import evaluate as nlp_evaluate import re import sqlite3 import random from tqdm import tqdm import sys import numpy as np from sqlparse import parse random.seed(10001) bleu = nlp_evaluate.load("bleu") rouge = nlp_evaluate.load('rouge') LEADERBOARD_PATH = "Exploration-Lab/BookSQL-Leaderboard" RESULTS_PATH = "Exploration-Lab/BookSQL-Leaderboard" api = HfApi() TOKEN = os.environ.get("TOKEN", None) YEAR_VERSION = "2024" sqlite_path = "accounting/accounting_for_testing.sqlite" def format_error(msg): return f"
{msg}
" def format_warning(msg): return f"{msg}
" def format_log(msg): return f"{msg}
" def model_hyperlink(link, model_name): return f'{model_name}' def input_verification(method_name, url, path_to_file, organisation, mail): for input in [method_name, url, path_to_file, organisation, mail]: if input == "": return format_warning("Please fill all the fields.") # Very basic email parsing _, parsed_mail = parseaddr(mail) if not "@" in parsed_mail: return format_warning("Please provide a valid email adress.") if path_to_file is None: return format_warning("Please attach a file.") return parsed_mail def replace_current_date_and_now(_sql, _date): _sql = _sql.replace('current_date', "\'"+_date+"\'") _sql = _sql.replace(', now', ", \'"+_date+"\'") return _sql def remove_gold_Non_exec(data,df1, sqlite_path): con = sqlite3.connect(sqlite_path) cur = con.cursor() out, non_exec=[], [] new_df = df1.copy() new_df.loc[:, 'Exec/Non-Exec'] = 0 for i,s in tqdm(enumerate(data)): _sql = str(s).replace('"', "'").lower() _sql = replace_current_date_and_now(_sql, '2022-06-01') _sql = replace_percent_symbol_y(_sql) try: cur.execute(_sql) res = cur.fetchall() out.append(i) except: non_exec.append(i) # print("_sql: ", _sql) new_df.loc[out, 'Exec/Non-Exec'] = 1 con.close() return out, non_exec, new_df def remove_data_from_index(data, ind_list): new_data=[] for i in ind_list: new_data.append(data[i]) return new_data def parse_query(query): parsed = parse(query)[0] return parsed def normalize_query(query): # Remove comments query = re.sub(r'--.*', '', query) query = re.sub(r'/\*.*?\*/', '', query, flags=re.DOTALL) # Remove extra whitespace query = re.sub(r'\s+', ' ', query) # Strip leading and trailing whitespace query = query.strip() return query.lower() def get_exec_match_acc(gold, pred): assert len(gold)==len(pred) correct_sql_count=0 count=0 goldd = [re.sub(' +', ' ', str(g).replace("'", '"').lower()) for g in gold] predd = [re.sub(' +', ' ', str(p).replace("'", '"').lower()) for p in pred] # for g, p in zip(gold, pred): # #extra space, double quotes, lower_case # gg = re.sub(' +', ' ', str(g).replace("'", '"').lower()) # gg = re.sub(' +', ' ', str(p).replace("'", '"').lower()) # if gold==pred: # count+=1 for q1, q2 in zip(goldd, predd): q1 = normalize_query(q1) q2 = normalize_query(q2) parsed_query1 = parse_query(q1) parsed_query2 = parse_query(q2) if str(parsed_query1) == str(parsed_query2): correct_sql_count+=1 return correct_sql_count/len(goldd), 0 def replace_percent_symbol_y(_sql): _sql = _sql.replace('%y', "%Y") return _sql def get_exec_results(sqlite_path, scores, df, flag, gold_sql_map_res={}): con = sqlite3.connect(sqlite_path) cur = con.cursor() i,j,count=0,0,0 out,non_exec={},{} new_df = df.copy() responses=[] for s in tqdm(scores): _sql = str(s).replace('"', "'").lower() _sql = replace_current_date_and_now(_sql, '2022-06-01') _sql = replace_percent_symbol_y(_sql) try: cur.execute(_sql) res = cur.fetchall() out[i] = str(res) except Exception as err: non_exec[i]=err i+=1 if flag=='g': new_df.loc[list(out.keys()), 'GOLD_res'] = list(out.values()) # assert len(gold_sql_map_res)==count if flag=='p': new_df.loc[list(out.keys()), 'PRED_res'] = list(out.values()) if flag=='d': new_df.loc[list(out.keys()), 'DEBUG_res'] = list(out.values()) con.close() return out, non_exec, new_df def get_scores(gold_dict, pred_dict): exec_count, non_exec_count=0, 0 none_count=0 correct_sql, incorrect_sql = [], [] for k, res in pred_dict.items(): if k in gold_dict: if gold_dict[k]==str(None) or str(None) in gold_dict[k]: none_count+=1 continue if res==gold_dict[k]: exec_count+=1 correct_sql.append(k) else: non_exec_count+=1 incorrect_sql.append(k) return exec_count, non_exec_count, none_count, correct_sql, incorrect_sql def get_total_gold_none_count(gold_dict): none_count, ok_count=0, 0 for k, res in gold_dict.items(): if res==str(None) or str(None) in res: none_count+=1 else: ok_count+=1 return ok_count, none_count def Evaluate(df): # df - [id, pred_sql] pred_sql = df['pred_sql'].to_list() ids = df['id'].to_list() f = open(f"tests/test.json") questions_and_ids = json.load(f) ts = open(f"tests/test_sql.json") gold_sql = json.load(ts) gold_sql_list=[] pred_sql_list=[] questions_list=[] for idx, pred in zip(ids, pred_sql): ques = questions_and_ids[idx]['Query'] gd_sql = gold_sql[idx]['SQL'] gold_sql_list.append(gd_sql) pred_sql_list.append(pred) questions_list.append(ques) df = pd.DataFrame({'NLQ':questions_list, 'GOLD SQL':gold_sql_list, 'PREDICTED SQL':pred_sql_list}) test_size = len(df) pred_score = df['PREDICTED SQL'].str.lower().values # debug_score = df['DEBUGGED SQL'].str.lower().values gold_score1 = df['GOLD SQL'].str.lower().values print("Checking non-exec Gold sql query") gold_exec, gold_not_exec, new_df = remove_gold_Non_exec(gold_score1, df, sqlite_path) print("GOLD Total exec SQL query: {}/{}".format(len(gold_exec), test_size)) print("GOLD Total non-exec SQL query: {}/{}".format(len(gold_not_exec), test_size)) prev_non_exec_df = new_df[new_df['Exec/Non-Exec'] == 0] new_df = new_df[new_df['Exec/Non-Exec']==1] prev_non_exec_df.reset_index(inplace=True) new_df.reset_index(inplace=True) #Removing Non-exec sql from data print(f"Removing {len(gold_not_exec)} non-exec sql query from all Gold/Pred/Debug ") gold_score1 = remove_data_from_index(gold_score1, gold_exec) pred_score = remove_data_from_index(pred_score, gold_exec) # debug_score = remove_data_from_index(debug_score, gold_exec) gold_score = [[x] for x in gold_score1] assert len(gold_score) == len(pred_score) #== len(debug_score) pred_bleu_score = bleu.compute(predictions=pred_score, references=gold_score) pred_rouge_score = rouge.compute(predictions=pred_score, references=gold_score) pred_exact_match, pred_partial_f1_score = get_exec_match_acc(gold_score1, pred_score) print("PREDICTED_vs_GOLD Final bleu_score: ", pred_bleu_score['bleu']) print("PREDICTED_vs_GOLD Final rouge_score: ", pred_rouge_score['rougeL']) print("PREDICTED_vs_GOLD Exact Match Accuracy: ", pred_exact_match) # print("PREDICTED_vs_GOLD Partial CM F1 score: ", pred_partial_f1_score) print() new_df.loc[:, 'GOLD_res'] = str(None) new_df.loc[:, 'PRED_res'] = str(None) # new_df.loc[:, 'DEBUG_res'] = str(None) print("Getting Gold results") # gout_res_dict, gnon_exec_err_dict, gold_sql_map_res = get_exec_results(cur, gold_score1, 'g') gout_res_dict, gnon_exec_err_dict, new_df = get_exec_results(sqlite_path, gold_score1, new_df, 'g') total_gold_ok_count, total_gold_none_count = get_total_gold_none_count(gout_res_dict) print("Total Gold None count: ", total_gold_none_count) print("Getting Pred results") pout_res_dict, pnon_exec_err_dict, new_df = get_exec_results(sqlite_path, pred_score, new_df, 'p') # print("Getting Debug results") # dout_res_dict, dnon_exec_err_dict = get_exec_results(cur, debug_score, 'd') print("GOLD Total exec SQL query: {}/{}".format(len(gold_exec), test_size)) print("GOLD Total non-exec SQL query: {}/{}".format(len(gold_not_exec), test_size)) print() print("PRED Total exec SQL query: {}/{}".format(len(pout_res_dict), len(pred_score))) print("PRED Total non-exec SQL query: {}/{}".format(len(pnon_exec_err_dict), len(pred_score))) print() # print("DEBUG Total exec SQL query: {}/{}".format(len(dout_res_dict), len(debug_score))) # print("DEBUG Total non-exec SQL query: {}/{}".format(len(dnon_exec_err_dict), len(debug_score))) # print() pred_correct_exec_acc_count, pred_incorrect_exec_acc_count, pred_none_count, pred_correct_sql, pred_incorrect_sql = get_scores(gout_res_dict, pout_res_dict) # debug_correct_exec_acc_count, debug_incorrect_exec_acc_count, debug_none_count, debug_correct_sql, debug_incorrect_sql = get_scores(gout_res_dict, dout_res_dict) # print("PRED_vs_GOLD None_count: ", total_gold_none_count) print("PRED_vs_GOLD Correct_Exec_count without None: ", pred_correct_exec_acc_count) print("PRED_vs_GOLD Incorrect_Exec_count without None: ", pred_incorrect_exec_acc_count) print("PRED_vs_GOLD Exec_Accuracy: ", pred_correct_exec_acc_count/total_gold_ok_count) print() return pred_exact_match, pred_correct_exec_acc_count/total_gold_ok_count, pred_partial_f1_score, pred_bleu_score['bleu'], pred_rouge_score['rougeL'] def add_new_eval( method_name: str, url: str, path_to_file: str, organisation: str, mail: str, ): parsed_mail = input_verification( method_name, url, path_to_file, organisation, mail, ) # load the file df = pd.read_csv(path_to_file) submission_df = pd.read_csv(path_to_file) # modify the df to include metadata df["Method"] = method_name df["url"] = url df["organisation"] = organisation df["mail"] = parsed_mail df["timestamp"] = datetime.datetime.now() submission_df = pd.read_csv(path_to_file) submission_df["Method"] = method_name submission_df["Submitted By"] = organisation # upload to spaces using the hf api at path_in_repo = f"submissions/{method_name}" file_name = f"{method_name}-{organisation}-{datetime.datetime.now().strftime('%Y-%m-%d')}.csv" EM, EX, PCM_F1, BLEU, ROUGE = Evaluate(submission_df) sub_df = pd.DataFrame() sub_df["Method"] = method_name sub_df["Submitted By"] = organisation sub_df['EMA'] = EM sub_df['EX'] = EX # submission_df['PCM_F1'] = PCM_F1 sub_df['BLEU-4'] = BLEU sub_df['ROUGE-L'] = ROUGE # upload the df to spaces import io buffer = io.BytesIO() df.to_csv(buffer, index=False) # Write the DataFrame to a buffer in CSV format buffer.seek(0) # Rewind the buffer to the beginning api.upload_file( repo_id=RESULTS_PATH, path_in_repo=f"{path_in_repo}/{file_name}", path_or_fileobj=buffer, token=TOKEN, repo_type="space", ) # read the leaderboard leaderboard_df = pd.read_csv(f"submissions/baseline/baseline.csv") # append the new submission_df csv to the leaderboard # leaderboard_df = leaderboard_df._append(submission_df) leaderboard_df = pd.concat([leaderboard_df, sub_df], ignore_index=True) # save the new leaderboard # leaderboard_df.to_csv(f"submissions/baseline/baseline.csv", index=False) leaderboard_buffer = io.BytesIO() leaderboard_df.to_csv(leaderboard_buffer, index=False) leaderboard_buffer.seek(0) api.upload_file( repo_id=LEADERBOARD_PATH, path_in_repo=f"submissions/baseline/baseline.csv", path_or_fileobj=leaderboard_buffer, token=TOKEN, repo_type="space", ) return format_log( f"Method {method_name} submitted by {organisation} successfully. \nPlease refresh the leaderboard, and wait a bit to see the score displayed" )