Spaces:

JetBrains-Research
/

commit-message-editing-visualization

Sleeping

commit-message-editing-visualization / change_visualizer.py

Petr Tsvetkov

Use FUS logs (not uploaded to repo) to compare length difference and edit distance distributions in FUS and in our dataset (resulting charts are not included).

5bd86a2 about 2 months ago

raw

history blame

No virus

10.5 kB

	import gradio as gr

	import analysis_util
	import generate_annotated_diffs
	import dataset_statistics

	# Manually collected samples. Every manual row gets both synthesis flags set to
	# False so that update_dataset_view() can read the same columns from this frame
	# as it does from df_synthetic.
	df_manual = generate_annotated_diffs.manual_data_with_annotated_diffs()
	df_manual["end_to_start"] = False
	df_manual["start_to_end"] = False
	# Row count; used below as the maximum of the "Manual" tab slider.
	n_diffs_manual = len(df_manual)

	# Synthetic dataset; its 'end_to_start' / 'start_to_end' columns are read as-is
	# by the selector functions below.
	df_synthetic = generate_annotated_diffs.synthetic_data_with_annotated_diffs()
	# Row count; used below as the maximum of the "Synthetic" tab slider.
	n_diffs_synthetic = len(df_synthetic)


	def golden():
	    """Return the rows of ``df_synthetic`` where both synthesis flags are False.

	    These are the rows marked as produced by neither the 'end-to-start' nor the
	    'start-to-end' step.
	    """
	    not_end_to_start = df_synthetic['end_to_start'].eq(False)
	    not_start_to_end = df_synthetic['start_to_end'].eq(False)
	    return df_synthetic[not_end_to_start & not_start_to_end]


	def e2s():
	    """Return the rows of ``df_synthetic`` flagged 'end_to_start' only.

	    A row is selected when ``end_to_start`` is True and ``start_to_end`` is False.
	    """
	    is_end_to_start = df_synthetic['end_to_start'].eq(True)
	    not_start_to_end = df_synthetic['start_to_end'].eq(False)
	    return df_synthetic[is_end_to_start & not_start_to_end]


	def s2e():
	    """Return the rows of ``df_synthetic`` flagged 'start_to_end' only.

	    A row is selected when ``end_to_start`` is False and ``start_to_end`` is True.
	    """
	    not_end_to_start = df_synthetic['end_to_start'].eq(False)
	    is_start_to_end = df_synthetic['start_to_end'].eq(True)
	    return df_synthetic[not_end_to_start & is_start_to_end]


	def e2s_s2e():
	    """Return the rows of ``df_synthetic`` where both synthesis flags are True."""
	    is_end_to_start = df_synthetic['end_to_start'].eq(True)
	    is_start_to_end = df_synthetic['start_to_end'].eq(True)
	    return df_synthetic[is_end_to_start & is_start_to_end]


	def synthetic():
	    """Return the rows of ``df_synthetic`` where at least one synthesis flag is True.

	    A row is selected when ``end_to_start`` is True, ``start_to_end`` is True,
	    or both.
	    """
	    # The two masks are combined with the element-wise OR operator ``|``; a
	    # backslash in front of it is not valid Python.
	    return df_synthetic[(df_synthetic['end_to_start'] == True) | (df_synthetic['start_to_end'] == True)]


	# Per-group statistics, keyed by group name. Each selector is called right before
	# its statistics are computed, in the order listed.
	# Note that the "manual" group is built from golden(), i.e. the rows of
	# df_synthetic with neither synthesis flag set, not from df_manual.
	STATISTICS = {
	    group_name: dataset_statistics.get_statistics_for_df(select_group())
	    for group_name, select_group in (
	        ("manual", golden),
	        ("e2s", e2s),
	        ("s2e", s2e),
	        ("e2s_s2e", e2s_s2e),
	        ("synthetic", synthetic),
	        ("all", lambda: df_synthetic),
	    )
	}

	# Comparison of every group against the "manual" group.
	# NOTE(review): presumably p-values per group and statistic - confirm against
	# dataset_statistics.t_test.
	STATISTICS_T_TEST = dataset_statistics.t_test(STATISTICS, main_group='manual')

	# Names of the statistics available for each group, taken from the "manual" group.
	STAT_NAMES = [*STATISTICS['manual'].keys()]


	def update_dataset_view(diff_idx, df):
	    """Collect the values shown for one sample of ``df``.

	    :param diff_idx: 1-based position of the sample; it is converted to a 0-based
	        positional index before the lookup.
	    :param df: frame providing the columns 'annotated_diff', 'commit_msg_start',
	        'commit_msg_end', 'session', 'end_to_start', 'start_to_end', 'repo' and 'hash'.
	    :return: 7-tuple of (annotated diff, start message, end message, session,
	        'end_to_start' as a string, 'start_to_end' as a string, GitHub commit URL).
	    """
	    sample = df.iloc[diff_idx - 1]
	    return (
	        sample['annotated_diff'],
	        sample['commit_msg_start'],
	        sample['commit_msg_end'],
	        sample['session'],
	        str(sample['end_to_start']),
	        str(sample['start_to_end']),
	        f"https://github.com/{sample['repo']}/commit/{sample['hash']}",
	    )


	def update_dataset_view_manual(diff_idx):
	    """Return the view values for the 1-based sample ``diff_idx`` of ``df_manual``."""
	    return update_dataset_view(diff_idx=diff_idx, df=df_manual)


	def update_dataset_view_synthetic(diff_idx):
	    """Return the view values for the 1-based sample ``diff_idx`` of ``df_synthetic``."""
	    return update_dataset_view(diff_idx=diff_idx, df=df_synthetic)


	# JavaScript source handed to gr.Blocks(js=...) below. It checks the page URL
	# and, unless the `__theme` query parameter already equals 'light', sets it to
	# 'light' and navigates to the updated URL.
	force_light_theme_js_func = """
	function refresh() {
	const url = new URL(window.location);

	if (url.searchParams.get('__theme') !== 'light') {
	url.searchParams.set('__theme', 'light');
	window.location.href = url.href;
	}
	}
	"""

	if __name__ == '__main__':
	    # The UI is assembled and served only when this file is executed as a script.
	    with gr.Blocks(theme=gr.themes.Soft(), js=force_light_theme_js_func) as application:
	        def dataset_view_tab(n_items):
	            """Create the widgets of one dataset tab in the current layout context.

	            :param n_items: number of samples; used as the slider maximum and in its label.
	            :return: ``(slider, view)`` where ``view`` lists the output components in
	                the same order as the tuple returned by ``update_dataset_view``.
	            """
	            slider = gr.Slider(minimum=1, maximum=n_items, step=1, value=1,
	                               label=f"Sample number (total: {n_items})")

	            diff_view = gr.HighlightedText(combine_adjacent=True, color_map={'+': "green", '-': "red"})
	            start_view = gr.Textbox(interactive=False, label="Start message", container=True)
	            end_view = gr.Textbox(interactive=False, label="End message", container=True)
	            session_view = gr.Textbox(interactive=False, label="Session", container=True)
	            is_end_to_start_view = gr.Textbox(interactive=False,
	                                              label="Is generated on the 'end-to-start' synthesis step?",
	                                              container=True)
	            is_start_to_end_view = gr.Textbox(interactive=False,
	                                              label="Is generated on the 'start-to-end' synthesis step?",
	                                              container=True)
	            link_view = gr.Markdown()

	            view = [
	                diff_view,
	                start_view,
	                end_view,
	                session_view,
	                is_end_to_start_view,
	                is_start_to_end_view,
	                link_view,
	            ]

	            return slider, view

	        with gr.Tab("Manual"):
	            slider_manual, view_manual = dataset_view_tab(n_diffs_manual)

	            slider_manual.change(update_dataset_view_manual, inputs=slider_manual,
	                                 outputs=view_manual)

	        with gr.Tab("Synthetic"):
	            slider_synthetic, view_synthetic = dataset_view_tab(n_diffs_synthetic)

	            slider_synthetic.change(update_dataset_view_synthetic, inputs=slider_synthetic,
	                                    outputs=view_synthetic)

	        with gr.Tab("Analysis"):
	            def layout_for_statistics(statistics_group_name):
	                """Render the sample count and the mean of each statistic for one group."""
	                gr.Markdown(f"### {statistics_group_name}")
	                stats = STATISTICS[statistics_group_name]
	                gr.Number(label="Count", interactive=False,
	                          value=len(stats['deletions_norm']), min_width=0)

	                # (label, statistic key) pairs, rendered in this order.
	                averaged_fields = (
	                    ("Avg deletions number (rel to the initial msg length)", 'deletions_norm'),
	                    ("Avg insertions number (rel to the result length)", 'insertions_norm'),
	                    ("Avg changes number (rel to the initial msg length)", 'changes_norm'),
	                    ("Avg deletions number", 'deletions'),
	                    ("Avg insertions number", 'insertions'),
	                    ("Avg changes number", 'changes'),
	                )
	                for label, stat_key in averaged_fields:
	                    gr.Number(label=label, interactive=False,
	                              value=stats[stat_key].mean().item(), precision=3, min_width=0)

	            def layout_for_statistics_t_test(statistics_group_name):
	                """Render the STATISTICS_T_TEST value of each statistic for one group.

	                Currently referenced only by the disabled section further below.
	                """
	                gr.Markdown(f"### {statistics_group_name}")
	                stats = STATISTICS_T_TEST[statistics_group_name]

	                # (label, statistic key) pairs, rendered in this order.
	                compared_fields = (
	                    ("Deletions number (rel to the initial msg length)", 'deletions_norm'),
	                    ("Insertions number (rel to the result length)", 'insertions_norm'),
	                    ("Changes number (rel to the initial msg length)", 'changes_norm'),
	                    ("Deletions number", 'deletions'),
	                    ("Insertions number", 'insertions'),
	                    ("Changes number", 'changes'),
	                )
	                for label, stat_key in compared_fields:
	                    gr.Number(label=label, interactive=False,
	                              value=stats[stat_key], precision=3, min_width=0)

	            def plot_statistics(stat_names):
	                """Render one chart per statistic name, comparing the four disjoint groups."""
	                for stat_name in stat_names:
	                    chart = dataset_statistics.build_plotly_chart(
	                        stat_golden=STATISTICS['manual'][stat_name],
	                        stat_e2s=STATISTICS['e2s'][stat_name],
	                        stat_s2e=STATISTICS['s2e'][stat_name],
	                        stat_e2s_s2e=STATISTICS['e2s_s2e'][stat_name],
	                        stat_name=stat_name,
	                    )

	                    gr.Plot(value=chart)

	            with gr.Row():
	                with gr.Column(scale=1, min_width=100):
	                    layout_for_statistics("manual")
	                with gr.Column(scale=1, min_width=100):
	                    layout_for_statistics("e2s")
	                with gr.Column(scale=1, min_width=100):
	                    layout_for_statistics("s2e")
	                with gr.Column(scale=1, min_width=100):
	                    layout_for_statistics("e2s_s2e")
	                with gr.Column(scale=1, min_width=100):
	                    layout_for_statistics("synthetic")
	                with gr.Column(scale=1, min_width=100):
	                    layout_for_statistics("all")

	            # Disabled section: t-test results per group.
	            # gr.Markdown(f"### Student t-test (p-value)")
	            # with gr.Row():
	            #     with gr.Column(scale=1, min_width=100):
	            #         layout_for_statistics_t_test("manual")
	            #     with gr.Column(scale=1, min_width=100):
	            #         layout_for_statistics_t_test("e2s")
	            #     with gr.Column(scale=1, min_width=100):
	            #         layout_for_statistics_t_test("s2e")
	            #     with gr.Column(scale=1, min_width=100):
	            #         layout_for_statistics_t_test("e2s_s2e")
	            #     with gr.Column(scale=1, min_width=100):
	            #         layout_for_statistics_t_test("synthetic")
	            #     with gr.Column(scale=1, min_width=100):
	            #         layout_for_statistics_t_test("all")

	            with gr.Row():
	                # Left column: statistics without "_norm" in their name.
	                with gr.Column(scale=1):
	                    plot_statistics([name for name in STAT_NAMES if "_norm" not in name])
	                # Right column: statistics with "_norm" in their name.
	                with gr.Column(scale=1):
	                    with gr.Column(scale=1):
	                        plot_statistics([name for name in STAT_NAMES if "_norm" in name])

	            gr.Markdown("### Reference-only correlations")
	            gr.Markdown(value=analysis_util.get_correlations_for_groups(df_synthetic, right_side="ind").to_markdown())

	            gr.Markdown("### Aggregated correlations")
	            gr.Markdown(value=analysis_util.get_correlations_for_groups(df_synthetic, right_side="aggr").to_markdown())

	        # Fill both dataset tabs once on page load, using the sliders' current values.
	        application.load(update_dataset_view_manual, inputs=slider_manual,
	                         outputs=view_manual)

	        application.load(update_dataset_view_synthetic, inputs=slider_synthetic,
	                         outputs=view_synthetic)

	    application.launch()