"""Score commit-message pairs with several text metrics and correlate the results.

Two families of metrics are computed for every row of the input frame:

* "related" metrics (``REL_METRICS``) compare ``commit_msg_start`` with
  ``commit_msg_end``;
* "independent" metrics (``IND_METRICS``) compare ``commit_msg_start`` with
  the ``reference`` column attached by :func:`attach_references`.

Pearson and Spearman correlations between every related/independent metric
pair are then computed both over the whole frame and per
``(end_to_start, start_to_end)`` group.

Every metric function shares the signature ``fn(pred, ref, **kwargs)`` so
that it can be dispatched uniformly; the keyword arguments supplied by
:func:`compute_metrics` are ``edittime`` and ``diff``.
"""
import functools
import operator

import Levenshtein
import evaluate
import pandas as pd
from tqdm import tqdm

import config
from api_wrappers import hf_data_loader
from custom_metrics import gpt_eval

BLEU = evaluate.load('bleu', cache_dir=config.CACHE_DIR)


def bleu_fn(pred, ref, **kwargs):
    """Return the BLEU score of ``pred`` against the single reference ``ref``."""
    return BLEU.compute(predictions=[pred], references=[ref])["bleu"]


METEOR = evaluate.load('meteor', cache_dir=config.CACHE_DIR)


def meteor_fn(pred, ref, **kwargs):
    """Return the METEOR score of ``pred`` against the single reference ``ref``."""
    return METEOR.compute(predictions=[pred], references=[ref])["meteor"]


ROUGE = evaluate.load('rouge', cache_dir=config.CACHE_DIR)


def _rouge_score(pred, ref, rouge_type):
    """Return one ROUGE variant for a single prediction/reference pair.

    Only the requested variant is computed; without ``rouge_types`` the
    metric evaluates all of its default variants on every call.
    """
    scores = ROUGE.compute(
        predictions=[pred],
        references=[ref],
        rouge_types=[rouge_type],
    )
    return scores[rouge_type]


def rouge1_fn(pred, ref, **kwargs):
    """Return the ROUGE-1 score of ``pred`` against ``ref``."""
    return _rouge_score(pred, ref, "rouge1")


def rouge2_fn(pred, ref, **kwargs):
    """Return the ROUGE-2 score of ``pred`` against ``ref``."""
    return _rouge_score(pred, ref, "rouge2")


def rougeL_fn(pred, ref, **kwargs):
    """Return the ROUGE-L score of ``pred`` against ``ref``."""
    return _rouge_score(pred, ref, "rougeL")


BERTSCORE = evaluate.load('bertscore', cache_dir=config.CACHE_DIR)


def bertscore_fn(pred, ref, **kwargs):
    """Return the BERTScore F1 of ``pred`` against ``ref`` (distilbert-base-uncased)."""
    scores = BERTSCORE.compute(
        predictions=[pred],
        references=[ref],
        model_type="distilbert-base-uncased",
    )
    # "f1" holds one value per prediction; exactly one prediction is passed.
    return scores["f1"][0]


# Use the configured cache directory, consistently with the metrics above.
CHRF = evaluate.load("chrf", cache_dir=config.CACHE_DIR)


def chrf_fn(pred, ref, **kwargs):
    """Return the chrF score of ``pred`` against the single reference ``ref``."""
    return CHRF.compute(predictions=[pred], references=[[ref]])["score"]


# Use the configured cache directory, consistently with the metrics above.
TER = evaluate.load("ter", cache_dir=config.CACHE_DIR)


def ter_fn(pred, ref, **kwargs):
    """Return the TER score of ``pred`` against the single reference ``ref``."""
    return TER.compute(predictions=[pred], references=[[ref]])["score"]


def edit_distance_fn(pred, ref, **kwargs):
    """Return the Levenshtein distance between ``pred`` and ``ref``."""
    return Levenshtein.distance(pred, ref)


def edit_time_fn(pred, ref, **kwargs):
    """Return the ``edittime`` keyword argument; ``pred`` and ``ref`` are ignored.

    Raises:
        KeyError: if ``edittime`` was not supplied.
    """
    return kwargs["edittime"]


def gptscore_ref_1_fn(pred, ref, **kwargs):
    """Return ``gpt_eval.compute_ref`` for ``pred`` vs ``ref`` using 1 request."""
    return gpt_eval.compute_ref(prediction=pred, reference=ref, n_requests=1)


def gptscore_ref_3_fn(pred, ref, **kwargs):
    """Return ``gpt_eval.compute_ref`` for ``pred`` vs ``ref`` using 3 requests."""
    return gpt_eval.compute_ref(prediction=pred, reference=ref, n_requests=3)


def gptscore_ref_5_fn(pred, ref, **kwargs):
    """Return ``gpt_eval.compute_ref`` for ``pred`` vs ``ref`` using 5 requests."""
    return gpt_eval.compute_ref(prediction=pred, reference=ref, n_requests=5)


def gptscore_noref_1_fn(pred, ref, **kwargs):
    """Return ``gpt_eval.compute_noref`` for ``pred`` and ``kwargs['diff']`` using 1 request.

    ``ref`` is ignored.

    Raises:
        KeyError: if ``diff`` was not supplied.
    """
    return gpt_eval.compute_noref(prediction=pred, diff=kwargs['diff'], n_requests=1)


def gptscore_noref_3_fn(pred, ref, **kwargs):
    """Return ``gpt_eval.compute_noref`` for ``pred`` and ``kwargs['diff']`` using 3 requests.

    ``ref`` is ignored.

    Raises:
        KeyError: if ``diff`` was not supplied.
    """
    return gpt_eval.compute_noref(prediction=pred, diff=kwargs['diff'], n_requests=3)


def gptscore_noref_5_fn(pred, ref, **kwargs):
    """Return ``gpt_eval.compute_noref`` for ``pred`` and ``kwargs['diff']`` using 5 requests.

    ``ref`` is ignored.

    Raises:
        KeyError: if ``diff`` was not supplied.
    """
    return gpt_eval.compute_noref(prediction=pred, diff=kwargs['diff'], n_requests=5)


# Metrics scored between ``commit_msg_start`` and ``reference``.
# The commented-out entries are optional multi-request GPT variants.
IND_METRICS = {
    "gptscore-ref-1-req": gptscore_ref_1_fn,
    # "gptscore-ref-3-req": gptscore_ref_3_fn,
    # "gptscore-ref-5-req": gptscore_ref_5_fn,
    "gptscore-noref-1-req": gptscore_noref_1_fn,
    # "gptscore-noref-3-req": gptscore_noref_3_fn,
    # "gptscore-noref-5-req": gptscore_noref_5_fn,
    "editdist": edit_distance_fn,
    "bleu": bleu_fn,
    "meteor": meteor_fn,
    "rouge1": rouge1_fn,
    "rouge2": rouge2_fn,
    "rougeL": rougeL_fn,
    "bertscore": bertscore_fn,
    "chrF": chrf_fn,
    "ter": ter_fn,
}

# Metrics scored between ``commit_msg_start`` and ``commit_msg_end``.
REL_METRICS = {
    "editdist": edit_distance_fn,
    "edittime": edit_time_fn,
}


def attach_references(df: pd.DataFrame) -> pd.DataFrame:
    """Left-join the ``reference`` column onto ``df`` by ``(hash, repo)``.

    The references come from ``hf_data_loader.load_full_commit_as_pandas()``.
    Rows of ``df`` without a matching reference are kept (left join).

    Returns:
        A new frame with ``hash`` and ``repo`` restored as ordinary columns.
    """
    reference_df = (
        hf_data_loader.load_full_commit_as_pandas()
        .set_index(["hash", "repo"])[["reference"]]
    )
    indexed = df.set_index(["hash", "repo"])
    return indexed.join(other=reference_df, how="left").reset_index()


def _score_rows(df, metric_fn, col_pred, col_ref):
    """Apply ``metric_fn`` to every row of ``df`` and return the per-row scores.

    Each call receives ``row[col_pred]`` and ``row[col_ref]`` positionally,
    plus ``edittime`` (from ``edit_time``) and ``diff`` (``str`` of ``mods``)
    as keyword arguments. Requires ``tqdm.pandas()`` to have been called.
    """
    def score(row):
        return metric_fn(
            row[col_pred],
            row[col_ref],
            edittime=row['edit_time'],
            diff=str(row['mods']),
        )

    return df.progress_apply(score, axis=1)


def _pairwise_correlations(frame):
    """Return Pearson/Spearman correlations for every related/independent metric pair.

    Keys follow ``rel_<rel>_ind_<ind>_pearson`` / ``..._spearman`` and are
    inserted in ``REL_METRICS`` x ``IND_METRICS`` order, Pearson first.
    ``frame`` must contain the ``<metric>_related`` and
    ``<metric>_independent`` columns.
    """
    correlations = {}
    for rel_metric in REL_METRICS:
        related = frame[f"{rel_metric}_related"]
        for ind_metric in IND_METRICS:
            independent = frame[f"{ind_metric}_independent"]
            prefix = f"rel_{rel_metric}_ind_{ind_metric}"
            correlations[f"{prefix}_pearson"] = related.corr(independent, method="pearson")
            correlations[f"{prefix}_spearman"] = related.corr(independent, method="spearman")
    return correlations


def compute_metrics(df: pd.DataFrame) -> pd.DataFrame:
    """Add metric and correlation columns to ``df`` in place and return it.

    Adds, in order:

    * ``<metric>_related`` for each entry of ``REL_METRICS``;
    * ``<metric>_independent`` for each entry of ``IND_METRICS``;
    * ``rel_<rel>_ind_<ind>_pearson`` / ``..._spearman`` for every metric
      pair. Each of these is one correlation value computed over the whole
      frame and repeated on every row.

    ``df`` must contain the columns ``commit_msg_start``, ``commit_msg_end``,
    ``reference``, ``edit_time`` and ``mods``.
    """
    tqdm.pandas()  # registers ``DataFrame.progress_apply``

    for metric, metric_fn in REL_METRICS.items():
        print(f"Computing {metric} for the related pairs")
        df[f"{metric}_related"] = _score_rows(
            df, metric_fn, col_pred="commit_msg_start", col_ref="commit_msg_end"
        )

    for metric, metric_fn in IND_METRICS.items():
        print(f"Computing {metric} for the independent pairs")
        df[f"{metric}_independent"] = _score_rows(
            df, metric_fn, col_pred="commit_msg_start", col_ref="reference"
        )

    # The correlations only read the *_related / *_independent columns, so
    # computing them all before assigning does not change any value.
    for column, value in _pairwise_correlations(df).items():
        df[column] = value

    return df


def correlations_for_group(group: pd.DataFrame) -> pd.Series:
    """Return the metric-pair correlations of one group as a ``pd.Series``.

    The index holds the ``rel_<rel>_ind_<ind>_pearson`` / ``..._spearman``
    names; intended for use with ``DataFrameGroupBy.apply``.
    """
    return pd.Series(_pairwise_correlations(group))


def compute_correlations(df: pd.DataFrame):
    """Compute the metric-pair correlations per ``(end_to_start, start_to_end)`` group."""
    grouped_df = df.groupby(by=["end_to_start", "start_to_end"])
    return grouped_df.apply(correlations_for_group, include_groups=False)


def transform(df: pd.DataFrame) -> pd.DataFrame:
    """Run the full pipeline and write both CSV artifacts.

    Attaches references, computes all metrics, writes the per-group
    correlations to ``config.METRICS_CORRELATIONS_ARTIFACT`` and the enriched
    frame to ``config.SYNTHETIC_DATASET_ARTIFACT``.

    Returns:
        The enriched frame.
    """
    print("Computing metrics")

    df = attach_references(df)
    df = compute_metrics(df)

    correlations_for_groups = compute_correlations(df)
    correlations_for_groups.to_csv(config.METRICS_CORRELATIONS_ARTIFACT)

    df.to_csv(config.SYNTHETIC_DATASET_ARTIFACT)

    print("Done")
    return df


def main():
    """Load ``config.START_TO_END_ARTIFACT`` and run :func:`transform` on it."""
    df = pd.read_csv(config.START_TO_END_ARTIFACT, index_col=[0])
    transform(df)


if __name__ == '__main__':
    main()