Petr Tsvetkov
Add distribution charts; add more detailed statistics; compute multi-reference TER as mean of TERs for each reference to improve the performance
303303b
raw
history blame
No virus
1.61 kB
import Levenshtein
import numpy as np
import pandas as pd
import plotly.figure_factory as ff
def get_statistics(start_msg, end_msg, annotated_msg):
    """Compute character-level edit statistics between two messages.

    The edit script comes from ``Levenshtein.editops(start_msg, end_msg)``.
    Every replacement is counted once in ``changes`` and, in addition, once
    as a deletion and once as an insertion.

    Args:
        start_msg: The message before editing.
        end_msg: The message after editing.
        annotated_msg: Not used by this function; kept so that existing
            callers passing three arguments continue to work.

    Returns:
        A dict with the raw counts ``deletions``, ``insertions`` and
        ``changes``, plus the normalized values ``deletions_norm``
        (deletions / len(start_msg)), ``insertions_norm``
        (insertions / len(end_msg)) and ``changes_norm``
        (changes / len(end_msg)). An empty message is treated as having
        length 1 for normalization, so the ratios stay finite instead of
        raising ``ZeroDivisionError``.
    """
    edit_ops = Levenshtein.editops(start_msg, end_msg)

    n_deletes = sum(op == 'delete' for op, _, _ in edit_ops)
    n_inserts = sum(op == 'insert' for op, _, _ in edit_ops)
    n_replaces = sum(op == 'replace' for op, _, _ in edit_ops)

    # Total is taken before the replacements are folded into the
    # delete/insert counters, so each replacement is one change.
    n_changes = n_deletes + n_inserts + n_replaces

    # A replacement removes one character and adds another.
    n_deletes += n_replaces
    n_inserts += n_replaces

    # Guard against empty messages: a zero length would make the
    # normalized statistics undefined (division by zero).
    start_len = max(len(start_msg), 1)
    end_len = max(len(end_msg), 1)

    return {
        "deletions": n_deletes,
        "insertions": n_inserts,
        "changes": n_changes,

        "deletions_norm": n_deletes / start_len,
        "insertions_norm": n_inserts / end_len,
        "changes_norm": n_changes / end_len,
    }
def get_statistics_for_df(df: pd.DataFrame):
    """Collect the per-row statistics of *df* into one array per statistic.

    For each row, ``get_statistics`` is called with the row's
    ``commit_msg_start``, ``commit_msg_end`` and ``annotated_diff`` values.

    Args:
        df: DataFrame providing the three columns named above.

    Returns:
        A dict mapping every statistic name produced by ``get_statistics``
        to a NumPy array holding that statistic for each row, in row order.

    Raises:
        AssertionError: If *df* has no rows (when assertions are enabled).
    """
    per_row = []
    for _, record in df.iterrows():
        per_row.append(
            get_statistics(
                record["commit_msg_start"],
                record["commit_msg_end"],
                record["annotated_diff"],
            )
        )

    assert len(per_row) > 0

    # The first row's keys define which statistics are collected.
    collected = {}
    for key in per_row[0]:
        collected[key] = np.asarray([entry[key] for entry in per_row])
    return collected
def build_plotly_chart(stat_golden, stat_e2s, stat_s2e, stat_e2s_s2e, stat_name, bin_size=.1):
    """Build a distribution plot comparing one statistic across data groups.

    Five series are plotted: the four inputs, plus a 'Synthetic' series that
    is the concatenation of ``stat_e2s``, ``stat_s2e`` and ``stat_e2s_s2e``.

    Args:
        stat_golden: Values of the statistic for the 'Golden' group.
        stat_e2s: Values of the statistic for the 'e2s' group.
        stat_s2e: Values of the statistic for the 's2e' group.
        stat_e2s_s2e: Values of the statistic for the 'e2s+s2e' group.
        stat_name: Used as the chart title.
        bin_size: Histogram bin size forwarded to ``ff.create_distplot``.
            Defaults to 0.1, the value previously hard-coded here.

    Returns:
        The figure returned by ``ff.create_distplot`` with its title set.
    """
    synthetic = np.concatenate((stat_e2s, stat_s2e, stat_e2s_s2e), axis=0)
    hist_data = [stat_golden, stat_e2s, stat_s2e, stat_e2s_s2e, synthetic]

    # Labels are positional: one per entry of hist_data, in the same order.
    group_labels = ['Golden', 'e2s', 's2e', 'e2s+s2e', 'Synthetic']

    fig = ff.create_distplot(hist_data, group_labels,
                             bin_size=bin_size, show_rug=False, show_hist=True)

    fig.update_layout(title_text=stat_name)

    return fig