Spaces:

JetBrains-Research
/

commit-message-editing-visualization

Running

App Files Files Community

saridormi committed 20 days ago

Commit

86f1b98

•

1 Parent(s): abb3f0c

some fixes to gradio app

Browse files

Files changed (5) hide show

analysis_util.py +0 -74
api_wrappers/hf_data_loader.py +17 -16
change_visualizer.py +12 -123
dataset_statistics.py +10 -18
generate_annotated_diffs.py +8 -2

analysis_util.py DELETED Viewed

@@ -1,74 +0,0 @@
-import functools
-import operator
-import pandas as pd
-def correlations_for_group(group):
-    REL_METRICS = [col.split("_")[0] for col in group.columns if col.endswith("_related")]
-    IND_METRICS = [col.split("_")[0] for col in group.columns if col.endswith("_independent")]
-    AGGR_METRICS = [col.split("_")[0] for col in group.columns if col.endswith("_aggr")]
-    correlations = []
-    for rel_metric in REL_METRICS:
-        for ind_metric in IND_METRICS:
-            correlations.append({
-                f"rel_{rel_metric}_ind_{ind_metric}_pearson": group[f"{rel_metric}_related"].corr(
-                    group[f"{ind_metric}_independent"], method="pearson"),
-                f"rel_{rel_metric}_ind_{ind_metric}_spearman": group[f"{rel_metric}_related"].corr(
-                    group[f"{ind_metric}_independent"], method="spearman"),
-            })
-        for aggr_metric in AGGR_METRICS:
-            correlations.append({
-                f"rel_{rel_metric}_aggr_{aggr_metric}_pearson": group[f"{rel_metric}_related"].corr(
-                    group[f"{aggr_metric}_aggr"], method="pearson"),
-                f"rel_{rel_metric}_aggr_{aggr_metric}_spearman": group[f"{rel_metric}_related"].corr(
-                    group[f"{aggr_metric}_aggr"], method="spearman"),
-            })
-    return pd.Series(functools.reduce(operator.ior, correlations, {}))
-def split_metrics_string(s):
-    tokens = s.split("_")
-    return tokens[1], tokens[3]
-def get_correlations_df(df, right_side):
-    correlations_raw = correlations_for_group(df)
-    idx = list(set("_".join(col.split("_")[:-1]) for col in correlations_raw.index if right_side in col))
-    data = []
-    for metrics in idx:
-        data.append(
-            {"metrics": metrics,
-             "spearman": correlations_raw[f"{metrics}_spearman"],
-             "pearson": correlations_raw[f"{metrics}_pearson"],
-             }
-        )
-    result = pd.DataFrame.from_records(data=data, index="metrics").sort_index()
-    result.index = pd.MultiIndex.from_tuples(result.index.map(split_metrics_string).tolist())
-    result.index.set_names(["relative", "independent"], inplace=True)
-    return result
-def get_correlations_for_groups(df, right_side):
-    correlations = {"all": get_correlations_df(df, right_side=right_side)}
-    for e2s in (False, True):
-        for s2e in (False, True):
-            group = "golden"
-            if e2s:
-                group += "+e2s"
-            if s2e:
-                group += "+s2e"
-            subdf = df[((df["end_to_start"] == e2s) & (df["start_to_end"] == s2e)) | (
-                    (df["end_to_start"] == False) & (df["start_to_end"] == False))]
-            subdf_corr = get_correlations_df(subdf, right_side=right_side)
-            correlations[group] = subdf_corr
-    correlations = pd.concat(correlations, axis=1)
-    return correlations

api_wrappers/hf_data_loader.py CHANGED Viewed

@@ -2,6 +2,7 @@ import json
 import os
 from datetime import datetime, timedelta
 from datasets import load_dataset
 from huggingface_hub import hf_hub_download, list_repo_tree
@@ -66,7 +67,7 @@ def load_processed_rewriting_as_pandas():
 def load_synthetic_as_pandas():
     return load_dataset(config.HF_SYNTHETIC_DATASET_NAME,
-                        "raw",
                         split=config.HF_SYNTHETIC_DATASET_SPLIT,
                         token=config.HF_TOKEN,
                         cache_dir=config.CACHE_DIR).to_pandas()
@@ -75,21 +76,21 @@ def load_synthetic_as_pandas():
 def load_full_commit_with_predictions_as_pandas():
     full_dataset = load_full_commit_as_pandas()
-    # TODO
-    # for prediction_file in list_repo_tree(repo_id=config.HF_PREDICTIONS_DATASET_NAME,
-    #                                       path=os.path.join("commit_message_generation/predictions", config.HF_PREDICTIONS_MODEL),
-    #                                       repo_type="dataset"):
-    #     hf_hub_download(prediction_file.path,
-    #                     repo_id=config.HF_PREDICTIONS_DATASET_NAME,
-    #                     repo_type="dataset",)
-    predictions_dataset = load_dataset(config.HF_PREDICTIONS_DATASET_NAME,
-                                       config.HF_PREDICTIONS_DATASET_SUBNAME,
-                                       split=config.HF_PREDICTIONS_DATASET_SPLIT,
-                                       cache_dir=config.CACHE_DIR
-                                       ).to_pandas().sample(frac=1, random_state=config.RANDOM_STATE
-                                                            ).set_index(['hash', 'repo'])[["prediction"]]
     predictions_dataset = predictions_dataset[~predictions_dataset.index.duplicated(keep='first')]
     dataset = full_dataset.join(other=predictions_dataset, on=('hash', 'repo'))

 import os
 from datetime import datetime, timedelta
+import pandas as pd
 from datasets import load_dataset
 from huggingface_hub import hf_hub_download, list_repo_tree
 def load_synthetic_as_pandas():
+    """Load the "all_pairs_with_metrics" configuration of the synthetic dataset.
+
+    The dataset name, split, access token and cache directory all come from
+    ``config``; the loaded dataset is converted with ``to_pandas()`` and returned.
+    """
     return load_dataset(config.HF_SYNTHETIC_DATASET_NAME,
+                        "all_pairs_with_metrics",
                         split=config.HF_SYNTHETIC_DATASET_SPLIT,
                         token=config.HF_TOKEN,
                         cache_dir=config.CACHE_DIR).to_pandas()
 def load_full_commit_with_predictions_as_pandas():
     full_dataset = load_full_commit_as_pandas()
+    predictions_paths = []
+    for prediction_file in list_repo_tree(repo_id=config.HF_PREDICTIONS_DATASET_NAME,
+                                          path=os.path.join("commit_message_generation/predictions", config.HF_PREDICTIONS_MODEL),
+                                          repo_type="dataset"):
+        predictions_paths.append(hf_hub_download(prediction_file.path,
+                        repo_id=config.HF_PREDICTIONS_DATASET_NAME,
+                        repo_type="dataset",
+                        cache_dir=config.CACHE_DIR))
+    dfs = []
+    for path in predictions_paths:
+        dfs.append(pd.read_json(path, orient="records", lines=True))
+    predictions_dataset = pd.concat(dfs, axis=0, ignore_index=True)
+    predictions_dataset = predictions_dataset.sample(frac=1,
+                                                     random_state=config.RANDOM_STATE).set_index(['hash', 'repo'])[["prediction"]]
     predictions_dataset = predictions_dataset[~predictions_dataset.index.duplicated(keep='first')]
     dataset = full_dataset.join(other=predictions_dataset, on=('hash', 'repo'))

change_visualizer.py CHANGED Viewed

@@ -1,8 +1,6 @@
 import gradio as gr
-import analysis_util
 import generate_annotated_diffs
-import dataset_statistics
 df_manual = generate_annotated_diffs.manual_data_with_annotated_diffs()
 df_manual["end_to_start"] = False
@@ -14,45 +12,33 @@ n_diffs_synthetic = len(df_synthetic)
 def golden():
-    return df_synthetic[(df_synthetic['end_to_start'] == False) & (df_synthetic['start_to_end'] == False)]
-def e2s():
-    return df_synthetic[(df_synthetic['end_to_start'] == True) & (df_synthetic['start_to_end'] == False)]
-def s2e():
-    return df_synthetic[(df_synthetic['end_to_start'] == False) & (df_synthetic['start_to_end'] == True)]
-def e2s_s2e():
-    return df_synthetic[(df_synthetic['end_to_start'] == True) & (df_synthetic['start_to_end'] == True)]
 def synthetic():
     return df_synthetic[(df_synthetic['end_to_start'] == True) | (df_synthetic['start_to_end'] == True)]
-STATISTICS = {"manual": dataset_statistics.get_statistics_for_df(golden()),
-              "e2s": dataset_statistics.get_statistics_for_df(e2s()),
-              "s2e": dataset_statistics.get_statistics_for_df(s2e()),
-              "e2s_s2e": dataset_statistics.get_statistics_for_df(e2s_s2e()),
-              "synthetic": dataset_statistics.get_statistics_for_df(synthetic()),
-              "all": dataset_statistics.get_statistics_for_df(df_synthetic)}
-STATISTICS_T_TEST = dataset_statistics.t_test(STATISTICS, main_group='manual')
-STAT_NAMES = list(STATISTICS['manual'].keys())
 def update_dataset_view(diff_idx, df):
     diff_idx -= 1
     return (df.iloc[diff_idx]['annotated_diff'],
-            df.iloc[diff_idx]['commit_msg_start'],
-            df.iloc[diff_idx]['commit_msg_end'],
-            df.iloc[diff_idx]['session'],
-            str(df.iloc[diff_idx]['end_to_start']),
-            str(df.iloc[diff_idx]['start_to_end']),
             f"https://github.com/{df.iloc[diff_idx]['repo']}/commit/{df.iloc[diff_idx]['hash']}",)
@@ -119,103 +105,6 @@ if __name__ == '__main__':
             slider_synthetic.change(update_dataset_view_synthetic,
                                     inputs=slider_synthetic,
                                     outputs=view_synthetic)
-        with gr.Tab("Analysis"):
-            def layout_for_statistics(statistics_group_name):
-                gr.Markdown(f"### {statistics_group_name}")
-                stats = STATISTICS[statistics_group_name]
-                gr.Number(label="Count", interactive=False,
-                          value=len(stats['deletions_norm']), min_width=00)
-                gr.Number(label="Avg deletions number (rel to the initial msg length)", interactive=False,
-                          value=stats['deletions_norm'].mean().item(), precision=3, min_width=00)
-                gr.Number(label="Avg insertions number (rel to the result length)", interactive=False,
-                          value=stats['insertions_norm'].mean().item(), precision=3, min_width=00)
-                gr.Number(label="Avg changes number (rel to the initial msg length)", interactive=False,
-                          value=stats['changes_norm'].mean().item(), precision=3, min_width=00)
-                gr.Number(label="Avg deletions number", interactive=False,
-                          value=stats['deletions'].mean().item(), precision=3, min_width=00)
-                gr.Number(label="Avg insertions number", interactive=False,
-                          value=stats['insertions'].mean().item(), precision=3, min_width=00)
-                gr.Number(label="Avg changes number", interactive=False,
-                          value=stats['changes'].mean().item(), precision=3, min_width=00)
-                gr.Number(label="Avg edit distance", interactive=False,
-                          value=stats['editdist'].mean().item(), precision=3, min_width=00)
-                gr.Number(label="Avg length difference", interactive=False,
-                          value=stats['lendiff'].mean().item(), precision=3, min_width=00)
-            def layout_for_statistics_t_test(statistics_group_name):
-                gr.Markdown(f"### {statistics_group_name}")
-                stats = STATISTICS_T_TEST[statistics_group_name]
-                gr.Number(label="Deletions number (rel to the initial msg length)", interactive=False,
-                          value=stats['deletions_norm'], precision=3, min_width=00)
-                gr.Number(label="Insertions number (rel to the result length)", interactive=False,
-                          value=stats['insertions_norm'], precision=3, min_width=00)
-                gr.Number(label="Changes number (rel to the initial msg length)", interactive=False,
-                          value=stats['changes_norm'], precision=3, min_width=00)
-                gr.Number(label="Deletions number", interactive=False,
-                          value=stats['deletions'], precision=3, min_width=00)
-                gr.Number(label="Insertions number", interactive=False,
-                          value=stats['insertions'], precision=3, min_width=00)
-                gr.Number(label="Changes number", interactive=False,
-                          value=stats['changes'], precision=3, min_width=00)
-            with gr.Row():
-                with gr.Column(scale=1, min_width=100):
-                    layout_for_statistics("manual")
-                with gr.Column(scale=1, min_width=100):
-                    layout_for_statistics("e2s")
-                with gr.Column(scale=1, min_width=100):
-                    layout_for_statistics("s2e")
-                with gr.Column(scale=1, min_width=100):
-                    layout_for_statistics("e2s_s2e")
-                with gr.Column(scale=1, min_width=100):
-                    layout_for_statistics("synthetic")
-                with gr.Column(scale=1, min_width=100):
-                    layout_for_statistics("all")
-            # gr.Markdown(f"### Student t-test (p-value)")
-            # with gr.Row():
-            #     with gr.Column(scale=1, min_width=100):
-            #         layout_for_statistics_t_test("manual")
-            #     with gr.Column(scale=1, min_width=100):
-            #         layout_for_statistics_t_test("e2s")
-            #     with gr.Column(scale=1, min_width=100):
-            #         layout_for_statistics_t_test("s2e")
-            #     with gr.Column(scale=1, min_width=100):
-            #         layout_for_statistics_t_test("e2s_s2e")
-            #     with gr.Column(scale=1, min_width=100):
-            #         layout_for_statistics_t_test("synthetic")
-            #     with gr.Column(scale=1, min_width=100):
-            #         layout_for_statistics_t_test("all")
-            with gr.Row():
-                with gr.Column(scale=1):
-                    for stat_name in filter(lambda s: "_norm" not in s, STAT_NAMES):
-                        chart = dataset_statistics.build_plotly_chart(
-                            stat_golden=STATISTICS['manual'][stat_name],
-                            stat_e2s=STATISTICS['e2s'][stat_name],
-                            stat_s2e=STATISTICS['s2e'][stat_name],
-                            stat_e2s_s2e=STATISTICS['e2s_s2e'][stat_name],
-                            stat_name=stat_name
-                        )
-                        gr.Plot(value=chart)
-                with gr.Column(scale=1):
-                    with gr.Column(scale=1):
-                        for stat_name in filter(lambda s: "_norm" in s, STAT_NAMES):
-                            chart = dataset_statistics.build_plotly_chart(
-                                stat_golden=STATISTICS['manual'][stat_name],
-                                stat_e2s=STATISTICS['e2s'][stat_name],
-                                stat_s2e=STATISTICS['s2e'][stat_name],
-                                stat_e2s_s2e=STATISTICS['e2s_s2e'][stat_name],
-                                stat_name=stat_name
-                            )
-                            gr.Plot(value=chart)
-            gr.Markdown(f"### Metrics correlations")
-            gr.Markdown(value=analysis_util.get_correlations_for_groups(df_synthetic, right_side="aggr").to_markdown())
         application.load(update_dataset_view_manual, inputs=slider_manual,
                          outputs=view_manual)

 import gradio as gr
 import generate_annotated_diffs
 df_manual = generate_annotated_diffs.manual_data_with_annotated_diffs()
 df_manual["end_to_start"] = False
 def golden():
+    """Return the rows of ``df_synthetic`` flagged ``is_related`` whose ``G_type``
+    is "initial" and whose ``E_type`` is "expert_labeled"."""
+    # NOTE(review): the second mask is built from the full frame but applied to
+    # the already-filtered one; this relies on pandas aligning the boolean
+    # Series by index label - confirm the index of df_synthetic is unique.
+    return df_synthetic.loc[df_synthetic.is_related].loc[(df_synthetic['G_type'] == "initial") & (df_synthetic['E_type'] == "expert_labeled")]
+def backward():
+    """Return the rows of ``df_synthetic`` flagged ``is_related`` whose ``G_type``
+    is "backward" and whose ``E_type`` is one of the synthetic forward labels."""
+    # Fixed the misspelled label "synthtetic_forward_from_backward", which could
+    # never match the spelling used by forward_from_backward().
+    # NOTE(review): G_type is compared to "backward" here but to
+    # "synthetic_backward" in forward_from_backward(); confirm which label the
+    # dataset actually uses.
+    return df_synthetic.loc[df_synthetic.is_related].loc[(df_synthetic['G_type'] == "backward") & (df_synthetic['E_type'].isin(["synthetic_forward", "synthetic_forward_from_backward"]))]
+def forward():
+    """Return the rows of ``df_synthetic`` flagged ``is_related`` whose ``G_type``
+    is "initial" and whose ``E_type`` is "synthetic_forward"."""
+    return df_synthetic.loc[df_synthetic.is_related].loc[(df_synthetic['G_type'] == "initial") & (df_synthetic['E_type'] == "synthetic_forward")]
+def forward_from_backward():
+    """Return the rows of ``df_synthetic`` flagged ``is_related`` whose ``G_type``
+    is "synthetic_backward" and whose ``E_type`` is
+    "synthetic_forward_from_backward"."""
+    return df_synthetic.loc[df_synthetic.is_related].loc[(df_synthetic['G_type'] == "synthetic_backward") & (df_synthetic['E_type'] == "synthetic_forward_from_backward")]
 def synthetic():
+    """Return the rows of ``df_synthetic`` where ``end_to_start`` or
+    ``start_to_end`` is True."""
+    # NOTE(review): the sibling selectors filter on G_type / E_type, and
+    # update_dataset_view() treats end_to_start / start_to_end as optional
+    # columns; confirm df_synthetic still carries these two columns, otherwise
+    # this lookup raises KeyError.
     return df_synthetic[(df_synthetic['end_to_start'] == True) | (df_synthetic['start_to_end'] == True)]
 def update_dataset_view(diff_idx, df):
+    """Collect the values displayed for one row of a dataset frame.
+
+    ``diff_idx`` is treated as 1-based: the row at position ``diff_idx - 1`` is
+    read. Both column naming schemes are supported: ``commit_msg_start`` /
+    ``commit_msg_end`` are used when present, otherwise ``G_text`` / ``E_text``.
+    ``session``, ``end_to_start`` and ``start_to_end`` fall back to an empty
+    string when the column is missing.
+
+    Returns a 7-tuple: annotated diff, start message, end message, session,
+    end_to_start (as str), start_to_end (as str), GitHub commit URL.
+    """
+    # Fetch the row once instead of repeating the positional lookup per field.
+    row = df.iloc[diff_idx - 1]
+    columns = df.columns
+    return (row['annotated_diff'],
+            row['commit_msg_start'] if "commit_msg_start" in columns else row['G_text'],
+            row['commit_msg_end'] if "commit_msg_end" in columns else row['E_text'],
+            row['session'] if "session" in columns else "",
+            str(row['end_to_start']) if "end_to_start" in columns else "",
+            str(row['start_to_end']) if "start_to_end" in columns else "",
+            f"https://github.com/{row['repo']}/commit/{row['hash']}",)
             slider_synthetic.change(update_dataset_view_synthetic,
                                     inputs=slider_synthetic,
                                     outputs=view_synthetic)
         application.load(update_dataset_view_manual, inputs=slider_manual,
                          outputs=view_manual)

dataset_statistics.py CHANGED Viewed

@@ -29,14 +29,20 @@ def get_statistics_for_sample(start_msg, end_msg, row=None):
         "changes_norm": n_changes / len(end_msg),
         "lendiff": abs(len(start_msg) - len(end_msg)),
-        "editdist": row["editdist_related"] if row is not None else Levenshtein.distance(start_msg, end_msg),
     }
 def get_statistics_for_row(row):
-    start_msg = row["commit_msg_start"]
-    end_msg = row["commit_msg_end"]
-    return get_statistics_for_sample(start_msg, end_msg, row=row)
 def get_statistics_for_df(df: pd.DataFrame):
@@ -63,17 +69,3 @@ def build_plotly_chart(stat_golden, stat_e2s, stat_s2e, stat_e2s_s2e, stat_name)
         pickle.dump(hist_data, f)
     return fig
-def t_test(group_stats, main_group="manual"):
-    results = {}
-    for group in group_stats:
-        results[group] = {}
-        for stat in group_stats[group]:
-            a = group_stats[main_group][stat]
-            b = group_stats[group][stat]
-            p = stats.ttest_ind(a, b, equal_var=False, random_state=config.RANDOM_STATE).pvalue
-            results[group][stat] = p
-    return results

         "changes_norm": n_changes / len(end_msg),
         "lendiff": abs(len(start_msg) - len(end_msg)),
+        "editdist": row["editdist"] if row is not None else Levenshtein.distance(start_msg, end_msg),
     }
 def get_statistics_for_row(row):
+    """Compute the edit statistics for a single row.
+
+    The start/end messages are read from ``commit_msg_start`` /
+    ``commit_msg_end`` when the row has them, otherwise from ``G_text`` /
+    ``E_text``; the row itself is forwarded to ``get_statistics_for_sample``.
+    """
+    start = row['commit_msg_start'] if "commit_msg_start" in row else row["G_text"]
+    end = row['commit_msg_end'] if "commit_msg_end" in row else row["E_text"]
+    return get_statistics_for_sample(start, end, row=row)
 def get_statistics_for_df(df: pd.DataFrame):
         pickle.dump(hist_data, f)
     return fig

generate_annotated_diffs.py CHANGED Viewed

@@ -21,8 +21,14 @@ def get_annotated_diff(start_text, end_text):
 def annotated_diff_for_row(row):
-    start = row['commit_msg_start']
-    end = row['commit_msg_end']
     return get_annotated_diff(start, end)

 def annotated_diff_for_row(row):
+    """Build the annotated diff between a row's start and end messages.
+
+    The messages come from ``commit_msg_start`` / ``commit_msg_end`` when the
+    row has them, otherwise from ``G_text`` / ``E_text``.
+    """
+    start = row['commit_msg_start'] if "commit_msg_start" in row else row["G_text"]
+    end = row['commit_msg_end'] if "commit_msg_end" in row else row["E_text"]
     return get_annotated_diff(start, end)