commit-rewriting-visualization / change_visualizer.py
Petr Tsvetkov
Display average editdist and lendiff
d6b0ac4
import gradio as gr
import analysis_util
import generate_annotated_diffs
import dataset_statistics
# Dataset shown in the "Manual" tab (loader semantics live in generate_annotated_diffs).
df_manual = generate_annotated_diffs.manual_data_with_annotated_diffs()
# Set both synthesis-flag columns to False for every row of this dataset;
# update_dataset_view reads these two columns from whichever dataframe it is given.
df_manual["end_to_start"] = False
df_manual["start_to_end"] = False
# Row count, used as the upper bound of the "Manual" tab slider.
n_diffs_manual = len(df_manual)
# Dataset shown in the "Synthetic" tab and used for all statistics below.
df_synthetic = generate_annotated_diffs.synthetic_data_with_annotated_diffs()
# Row count, used as the upper bound of the "Synthetic" tab slider.
n_diffs_synthetic = len(df_synthetic)
def golden():
    """Return the rows of ``df_synthetic`` where both synthesis flags are False."""
    not_end_to_start = df_synthetic['end_to_start'] == False
    not_start_to_end = df_synthetic['start_to_end'] == False
    return df_synthetic.loc[not_end_to_start & not_start_to_end]
def e2s():
    """Return the rows of ``df_synthetic`` flagged ``end_to_start`` only."""
    is_end_to_start = df_synthetic['end_to_start'] == True
    not_start_to_end = df_synthetic['start_to_end'] == False
    return df_synthetic.loc[is_end_to_start & not_start_to_end]
def s2e():
    """Return the rows of ``df_synthetic`` flagged ``start_to_end`` only."""
    not_end_to_start = df_synthetic['end_to_start'] == False
    is_start_to_end = df_synthetic['start_to_end'] == True
    return df_synthetic.loc[not_end_to_start & is_start_to_end]
def e2s_s2e():
    """Return the rows of ``df_synthetic`` where both synthesis flags are True."""
    is_end_to_start = df_synthetic['end_to_start'] == True
    is_start_to_end = df_synthetic['start_to_end'] == True
    return df_synthetic.loc[is_end_to_start & is_start_to_end]
def synthetic():
    """Return the rows of ``df_synthetic`` where at least one synthesis flag is True."""
    is_end_to_start = df_synthetic['end_to_start'] == True
    is_start_to_end = df_synthetic['start_to_end'] == True
    return df_synthetic.loc[is_end_to_start | is_start_to_end]
# Per-group statistics, keyed by group name. Each entry is whatever
# dataset_statistics.get_statistics_for_df returns for that subset of
# df_synthetic; note that the "manual" group is the golden() subset.
# The pairs are evaluated in the order listed, which is also the key order.
STATISTICS = {
    group_name: dataset_statistics.get_statistics_for_df(select_rows())
    for group_name, select_rows in (
        ("manual", golden),
        ("e2s", e2s),
        ("s2e", s2e),
        ("e2s_s2e", e2s_s2e),
        ("synthetic", synthetic),
        ("all", lambda: df_synthetic),
    )
}

# Result of comparing every group against the "manual" group.
STATISTICS_T_TEST = dataset_statistics.t_test(STATISTICS, main_group="manual")

# Names of the statistics available for each group.
STAT_NAMES = list(STATISTICS["manual"].keys())
def update_dataset_view(diff_idx, df):
    """Collect the values displayed for one sample of a dataset.

    ``diff_idx`` is 1-based: it is decremented by one before the positional
    lookup with ``df.iloc``.

    Returns a 7-tuple: annotated diff, start message, end message, session,
    the ``end_to_start`` and ``start_to_end`` flags rendered with ``str()``,
    and the GitHub commit URL built from the ``repo`` and ``hash`` columns.
    """
    row = df.iloc[diff_idx - 1]
    commit_url = f"https://github.com/{row['repo']}/commit/{row['hash']}"
    return (
        row['annotated_diff'],
        row['commit_msg_start'],
        row['commit_msg_end'],
        row['session'],
        str(row['end_to_start']),
        str(row['start_to_end']),
        commit_url,
    )
def update_dataset_view_manual(diff_idx):
    """Return the view values for the 1-based sample ``diff_idx`` of ``df_manual``."""
    return update_dataset_view(diff_idx=diff_idx, df=df_manual)
def update_dataset_view_synthetic(diff_idx):
    """Return the view values for the 1-based sample ``diff_idx`` of ``df_synthetic``."""
    return update_dataset_view(diff_idx=diff_idx, df=df_synthetic)
# JavaScript source passed as the ``js`` argument of gr.Blocks below.
# The function checks the ``__theme`` query parameter of the current URL and,
# if it is not 'light', sets it to 'light' and navigates to the updated URL.
force_light_theme_js_func = """
function refresh() {
const url = new URL(window.location);
if (url.searchParams.get('__theme') !== 'light') {
url.searchParams.set('__theme', 'light');
window.location.href = url.href;
}
}
"""
if __name__ == '__main__':
    # Statistic groups, in the left-to-right order of their columns in the "Analysis" tab.
    GROUP_NAMES = ("manual", "e2s", "s2e", "e2s_s2e", "synthetic", "all")

    # (label, statistic key) pairs for the per-group averages, in display order.
    AVERAGE_FIELDS = (
        ("Avg deletions number (rel to the initial msg length)", 'deletions_norm'),
        ("Avg insertions number (rel to the result length)", 'insertions_norm'),
        ("Avg changes number (rel to the initial msg length)", 'changes_norm'),
        ("Avg deletions number", 'deletions'),
        ("Avg insertions number", 'insertions'),
        ("Avg changes number", 'changes'),
        ("Avg edit distance", 'editdist'),
        ("Avg length difference", 'lendiff'),
    )

    # (label, statistic key) pairs for the t-test values, in display order.
    T_TEST_FIELDS = (
        ("Deletions number (rel to the initial msg length)", 'deletions_norm'),
        ("Insertions number (rel to the result length)", 'insertions_norm'),
        ("Changes number (rel to the initial msg length)", 'changes_norm'),
        ("Deletions number", 'deletions'),
        ("Insertions number", 'insertions'),
        ("Changes number", 'changes'),
    )

    with gr.Blocks(theme=gr.themes.Soft(), js=force_light_theme_js_func) as application:
        def dataset_view_tab(n_items):
            """Create the widgets of one dataset tab in the current layout context.

            Returns ``(slider, view)``: the sample-number slider (range
            1..``n_items``) and the list of output components. The order of
            ``view`` matches the tuple returned by ``update_dataset_view``.
            """

            def readonly_textbox(label):
                # All text fields of the tab share the same non-editable configuration.
                return gr.Textbox(interactive=False, label=label, container=True)

            slider = gr.Slider(minimum=1, maximum=n_items, step=1, value=1,
                               label=f"Sample number (total: {n_items})")

            diff_view = gr.Highlightedtext(combine_adjacent=True, color_map={'+': "green", '-': "red"})
            start_view = readonly_textbox("Start message")
            end_view = readonly_textbox("End message")
            session_view = readonly_textbox("Session")
            is_end_to_start_view = readonly_textbox("Is generated on the 'end-to-start' synthesis step?")
            is_start_to_end_view = readonly_textbox("Is generated on the 'start-to-end' synthesis step?")
            link_view = gr.Markdown()

            view = [
                diff_view,
                start_view,
                end_view,
                session_view,
                is_end_to_start_view,
                is_start_to_end_view,
                link_view,
            ]

            return slider, view

        with gr.Tab("Manual"):
            slider_manual, view_manual = dataset_view_tab(n_diffs_manual)

            slider_manual.change(update_dataset_view_manual, inputs=slider_manual,
                                 outputs=view_manual)

        with gr.Tab("Synthetic"):
            slider_synthetic, view_synthetic = dataset_view_tab(n_diffs_synthetic)

            slider_synthetic.change(update_dataset_view_synthetic, inputs=slider_synthetic,
                                    outputs=view_synthetic)

        with gr.Tab("Analysis"):
            def layout_for_statistics(statistics_group_name):
                """Render the sample count and the averages of one STATISTICS group."""
                gr.Markdown(f"### {statistics_group_name}")
                stats = STATISTICS[statistics_group_name]

                # The count is the length of the 'deletions_norm' statistic of the group.
                gr.Number(label="Count", interactive=False,
                          value=len(stats['deletions_norm']), min_width=0)
                for label, stat_key in AVERAGE_FIELDS:
                    gr.Number(label=label, interactive=False,
                              value=stats[stat_key].mean().item(), precision=3, min_width=0)

            def layout_for_statistics_t_test(statistics_group_name):
                """Render the STATISTICS_T_TEST values of one group (currently not called)."""
                gr.Markdown(f"### {statistics_group_name}")
                stats = STATISTICS_T_TEST[statistics_group_name]

                for label, stat_key in T_TEST_FIELDS:
                    gr.Number(label=label, interactive=False,
                              value=stats[stat_key], precision=3, min_width=0)

            def plot_statistics(stat_names):
                """Render one chart per statistic name, comparing the four plotted groups."""
                for stat_name in stat_names:
                    chart = dataset_statistics.build_plotly_chart(
                        stat_golden=STATISTICS['manual'][stat_name],
                        stat_e2s=STATISTICS['e2s'][stat_name],
                        stat_s2e=STATISTICS['s2e'][stat_name],
                        stat_e2s_s2e=STATISTICS['e2s_s2e'][stat_name],
                        stat_name=stat_name,
                    )
                    gr.Plot(value=chart)

            # One column of numbers per statistics group.
            with gr.Row():
                for group_name in GROUP_NAMES:
                    with gr.Column(scale=1, min_width=100):
                        layout_for_statistics(group_name)

            # The Student t-test (p-value) row is disabled. To restore it:
            # gr.Markdown("### Student t-test (p-value)")
            # with gr.Row():
            #     for group_name in GROUP_NAMES:
            #         with gr.Column(scale=1, min_width=100):
            #             layout_for_statistics_t_test(group_name)

            # Left column: statistics without "_norm" in their name; right column: the "_norm" ones.
            with gr.Row():
                with gr.Column(scale=1):
                    plot_statistics([name for name in STAT_NAMES if "_norm" not in name])

                with gr.Column(scale=1):
                    # The doubled column nesting is kept so the rendered layout does not change.
                    with gr.Column(scale=1):
                        plot_statistics([name for name in STAT_NAMES if "_norm" in name])

            gr.Markdown("### Reference-only correlations")
            gr.Markdown(value=analysis_util.get_correlations_for_groups(df_synthetic, right_side="ind").to_markdown())

            gr.Markdown("### Aggregated correlations")
            gr.Markdown(value=analysis_util.get_correlations_for_groups(df_synthetic, right_side="aggr").to_markdown())

        # Fill both dataset tabs from the slider values when the page loads.
        application.load(update_dataset_view_manual, inputs=slider_manual,
                         outputs=view_manual)

        application.load(update_dataset_view_synthetic, inputs=slider_synthetic,
                         outputs=view_synthetic)

    application.launch()