commit-message-editing-visualization / change_visualizer.py
Petr Tsvetkov
Use FUS logs (not uploaded to repo) to compare length difference and edit distance distributions in FUS and in our dataset (resulting charts are not included).
5bd86a2
raw
history blame
No virus
10.5 kB
import gradio as gr
import analysis_util
import generate_annotated_diffs
import dataset_statistics
# Load the manual dataset together with its annotated diffs.
df_manual = generate_annotated_diffs.manual_data_with_annotated_diffs()
# Set both synthesis-flag columns to False for every row, so this frame carries
# the 'end_to_start' / 'start_to_end' columns that update_dataset_view reads.
df_manual["end_to_start"] = False
df_manual["start_to_end"] = False
# Row count; used as the maximum of the "Manual" tab's sample slider.
n_diffs_manual = len(df_manual)
# Load the synthetic dataset together with its annotated diffs; the selector
# functions below filter it by its 'end_to_start' / 'start_to_end' columns.
df_synthetic = generate_annotated_diffs.synthetic_data_with_annotated_diffs()
# Row count; used as the maximum of the "Synthetic" tab's sample slider.
n_diffs_synthetic = len(df_synthetic)
def golden():
    """Select the rows of ``df_synthetic`` with neither synthesis flag set.

    Returns:
        The subset of ``df_synthetic`` whose ``end_to_start`` and
        ``start_to_end`` values both compare equal to ``False``.
    """
    not_end_to_start = df_synthetic['end_to_start'].eq(False)
    not_start_to_end = df_synthetic['start_to_end'].eq(False)
    return df_synthetic[not_end_to_start & not_start_to_end]
def e2s():
    """Select the rows of ``df_synthetic`` flagged as end-to-start only.

    Returns:
        The subset of ``df_synthetic`` whose ``end_to_start`` value compares
        equal to ``True`` and whose ``start_to_end`` value compares equal to
        ``False``.
    """
    is_end_to_start = df_synthetic['end_to_start'].eq(True)
    not_start_to_end = df_synthetic['start_to_end'].eq(False)
    return df_synthetic[is_end_to_start & not_start_to_end]
def s2e():
    """Select the rows of ``df_synthetic`` flagged as start-to-end only.

    Returns:
        The subset of ``df_synthetic`` whose ``end_to_start`` value compares
        equal to ``False`` and whose ``start_to_end`` value compares equal to
        ``True``.
    """
    not_end_to_start = df_synthetic['end_to_start'].eq(False)
    is_start_to_end = df_synthetic['start_to_end'].eq(True)
    return df_synthetic[not_end_to_start & is_start_to_end]
def e2s_s2e():
    """Select the rows of ``df_synthetic`` carrying both synthesis flags.

    Returns:
        The subset of ``df_synthetic`` whose ``end_to_start`` and
        ``start_to_end`` values both compare equal to ``True``.
    """
    is_end_to_start = df_synthetic['end_to_start'].eq(True)
    is_start_to_end = df_synthetic['start_to_end'].eq(True)
    return df_synthetic[is_end_to_start & is_start_to_end]
def synthetic():
    """Select the rows of ``df_synthetic`` carrying at least one synthesis flag.

    Returns:
        The subset of ``df_synthetic`` whose ``end_to_start`` or
        ``start_to_end`` value (or both) compares equal to ``True``.
    """
    is_end_to_start = df_synthetic['end_to_start'].eq(True)
    is_start_to_end = df_synthetic['start_to_end'].eq(True)
    return df_synthetic[is_end_to_start | is_start_to_end]
# Statistics per group of samples. Every group is a selection of df_synthetic;
# note that "manual" is the golden() selection (both synthesis flags False),
# not df_manual.
STATISTICS = {
    group_name: dataset_statistics.get_statistics_for_df(group_df)
    for group_name, group_df in (
        ("manual", golden()),
        ("e2s", e2s()),
        ("s2e", s2e()),
        ("e2s_s2e", e2s_s2e()),
        ("synthetic", synthetic()),
        ("all", df_synthetic),
    )
}

# Result of dataset_statistics.t_test over all groups, with "manual" as the main group.
STATISTICS_T_TEST = dataset_statistics.t_test(STATISTICS, main_group='manual')

# Names of the statistics available per group (keys of the "manual" group's statistics).
STAT_NAMES = list(STATISTICS['manual'].keys())
def update_dataset_view(diff_idx, df):
    """Build the values displayed for one sample of a dataset.

    Args:
        diff_idx: 1-based sample number (the value of a tab's slider).
            Integral floats such as ``3.0`` are accepted and converted
            to ``int``.
        df: DataFrame providing the columns ``annotated_diff``,
            ``commit_msg_start``, ``commit_msg_end``, ``session``,
            ``end_to_start``, ``start_to_end``, ``repo`` and ``hash``.

    Returns:
        A 7-tuple: annotated diff, start message, end message, session,
        ``str(end_to_start)``, ``str(start_to_end)`` and the GitHub URL of
        the commit built from ``repo`` and ``hash``.
    """
    # The sample number counts from 1 while ``iloc`` is 0-based. ``iloc``
    # rejects non-integer positions, so normalise to ``int`` first, and fetch
    # the row once instead of once per field.
    row = df.iloc[int(diff_idx) - 1]
    return (
        row['annotated_diff'],
        row['commit_msg_start'],
        row['commit_msg_end'],
        row['session'],
        str(row['end_to_start']),
        str(row['start_to_end']),
        f"https://github.com/{row['repo']}/commit/{row['hash']}",
    )
def update_dataset_view_manual(diff_idx):
    """Return the view values for sample number ``diff_idx`` of ``df_manual``.

    Delegates to ``update_dataset_view`` with the manual dataset.
    """
    return update_dataset_view(diff_idx=diff_idx, df=df_manual)
def update_dataset_view_synthetic(diff_idx):
    """Return the view values for sample number ``diff_idx`` of ``df_synthetic``.

    Delegates to ``update_dataset_view`` with the synthetic dataset.
    """
    return update_dataset_view(diff_idx=diff_idx, df=df_synthetic)
# JavaScript source handed to gr.Blocks(js=...). The function checks the page
# URL's `__theme` query parameter and, when it is not 'light', sets it to
# 'light' and navigates to the updated URL.
force_light_theme_js_func = """
function refresh() {
const url = new URL(window.location);
if (url.searchParams.get('__theme') !== 'light') {
url.searchParams.set('__theme', 'light');
window.location.href = url.href;
}
}
"""
if __name__ == '__main__':
    # Statistics groups shown in the "Analysis" tab, in display order
    # (these are keys of STATISTICS / STATISTICS_T_TEST).
    STATISTICS_GROUPS = ("manual", "e2s", "s2e", "e2s_s2e", "synthetic", "all")

    # (statistic key, description) pairs, in display order. The field labels
    # are derived from the description: "Avg <description>" for the averages
    # and the description with a capitalised first letter for the t-test.
    STATISTICS_FIELDS = (
        ('deletions_norm', "deletions number (rel to the initial msg length)"),
        ('insertions_norm', "insertions number (rel to the result length)"),
        ('changes_norm', "changes number (rel to the initial msg length)"),
        ('deletions', "deletions number"),
        ('insertions', "insertions number"),
        ('changes', "changes number"),
    )

    with gr.Blocks(theme=gr.themes.Soft(), js=force_light_theme_js_func) as application:
        def dataset_view_tab(n_items):
            """Create the widgets of one dataset tab.

            Args:
                n_items: number of samples; used as the slider maximum.

            Returns:
                ``(slider, view)`` where ``view`` lists the output components
                in the same order as the tuple returned by
                ``update_dataset_view``.
            """
            slider = gr.Slider(minimum=1, maximum=n_items, step=1, value=1,
                               label=f"Sample number (total: {n_items})")

            diff_view = gr.Highlightedtext(combine_adjacent=True, color_map={'+': "green", '-': "red"})
            start_view = gr.Textbox(interactive=False, label="Start message", container=True)
            end_view = gr.Textbox(interactive=False, label="End message", container=True)
            session_view = gr.Textbox(interactive=False, label="Session", container=True)
            is_end_to_start_view = gr.Textbox(interactive=False,
                                              label="Is generated on the 'end-to-start' synthesis step?",
                                              container=True)
            is_start_to_end_view = gr.Textbox(interactive=False,
                                              label="Is generated on the 'start-to-end' synthesis step?",
                                              container=True)
            link_view = gr.Markdown()

            view = [
                diff_view,
                start_view,
                end_view,
                session_view,
                is_end_to_start_view,
                is_start_to_end_view,
                link_view
            ]

            return slider, view

        with gr.Tab("Manual"):
            slider_manual, view_manual = dataset_view_tab(n_diffs_manual)

            slider_manual.change(update_dataset_view_manual, inputs=slider_manual,
                                 outputs=view_manual)

        with gr.Tab("Synthetic"):
            slider_synthetic, view_synthetic = dataset_view_tab(n_diffs_synthetic)

            slider_synthetic.change(update_dataset_view_synthetic, inputs=slider_synthetic,
                                    outputs=view_synthetic)

        with gr.Tab("Analysis"):
            def layout_for_statistics(statistics_group_name):
                """Render the sample count and per-statistic averages of one group."""
                gr.Markdown(f"### {statistics_group_name}")
                stats = STATISTICS[statistics_group_name]
                gr.Number(label="Count", interactive=False,
                          value=len(stats['deletions_norm']), min_width=0)
                for stat_key, description in STATISTICS_FIELDS:
                    gr.Number(label=f"Avg {description}", interactive=False,
                              value=stats[stat_key].mean().item(), precision=3, min_width=0)

            def layout_for_statistics_t_test(statistics_group_name):
                """Render the STATISTICS_T_TEST values of one group.

                Currently only referenced from the disabled section below.
                """
                gr.Markdown(f"### {statistics_group_name}")
                stats = STATISTICS_T_TEST[statistics_group_name]
                for stat_key, description in STATISTICS_FIELDS:
                    gr.Number(label=description[:1].upper() + description[1:], interactive=False,
                              value=stats[stat_key], precision=3, min_width=0)

            def plot_statistics(stat_names):
                """Render one chart per statistic name, comparing the
                manual, e2s, s2e and e2s_s2e groups."""
                for stat_name in stat_names:
                    chart = dataset_statistics.build_plotly_chart(
                        stat_golden=STATISTICS['manual'][stat_name],
                        stat_e2s=STATISTICS['e2s'][stat_name],
                        stat_s2e=STATISTICS['s2e'][stat_name],
                        stat_e2s_s2e=STATISTICS['e2s_s2e'][stat_name],
                        stat_name=stat_name
                    )
                    gr.Plot(value=chart)

            # One column of summary numbers per statistics group.
            with gr.Row():
                for group_name in STATISTICS_GROUPS:
                    with gr.Column(scale=1, min_width=100):
                        layout_for_statistics(group_name)

            # Disabled: Student t-test (p-value) section.
            # gr.Markdown("### Student t-test (p-value)")
            # with gr.Row():
            #     for group_name in STATISTICS_GROUPS:
            #         with gr.Column(scale=1, min_width=100):
            #             layout_for_statistics_t_test(group_name)

            # Left column: absolute statistics; right column: "_norm" statistics.
            with gr.Row():
                with gr.Column(scale=1):
                    plot_statistics([s for s in STAT_NAMES if "_norm" not in s])
                with gr.Column(scale=1):
                    # NOTE(review): the doubled Column is kept from the original
                    # layout; it looks redundant — confirm before removing.
                    with gr.Column(scale=1):
                        plot_statistics([s for s in STAT_NAMES if "_norm" in s])

            # NOTE(review): assumed to belong to the "Analysis" tab, below the
            # charts — confirm against the intended layout.
            gr.Markdown("### Reference-only correlations")
            gr.Markdown(value=analysis_util.get_correlations_for_groups(df_synthetic, right_side="ind").to_markdown())

            gr.Markdown("### Aggregated correlations")
            gr.Markdown(value=analysis_util.get_correlations_for_groups(df_synthetic, right_side="aggr").to_markdown())

        # Fill both dataset tabs from the sliders' values when the page loads.
        application.load(update_dataset_view_manual, inputs=slider_manual,
                         outputs=view_manual)

        application.load(update_dataset_view_synthetic, inputs=slider_synthetic,
                         outputs=view_synthetic)

    application.launch()