File size: 2,156 Bytes
7ab7be2 303303b f26a894 303303b 7ab7be2 f26a894 347f566 303303b a6b5a66 303303b f26a894 303303b 5bd86a2 86f1b98 f26a894 347f566 86f1b98 a6b5a66 86f1b98 a6b5a66 86f1b98 347f566 c151bb0 a6b5a66 a8a595d 303303b a6b5a66 303303b a6b5a66 303303b a6b5a66 303303b 7ab7be2 303303b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 |
import pickle
import Levenshtein
import numpy as np
import pandas as pd
import plotly.figure_factory as ff
import config
def get_statistics_for_sample(start_msg, end_msg, row=None):
    """Compute edit statistics between a starting and a final message.

    The edit script comes from ``Levenshtein.editops``. A ``replace``
    operation is counted once in ``changes`` but contributes to both
    ``deletions`` and ``insertions``.

    Args:
        start_msg: The initial text.
        end_msg: The final text.
        row: Optional mapping-like record (the callers in this file pass a
            row obtained from ``DataFrame.iterrows``). If it contains an
            ``"editdist"`` entry, that value is reused; otherwise the
            distance is computed with ``Levenshtein.distance``.

    Returns:
        A dict with the keys ``deletions``, ``insertions``, ``changes``,
        ``deletions_norm``, ``insertions_norm``, ``changes_norm``,
        ``lendiff`` and ``editdist``.
    """
    edit_ops = Levenshtein.editops(start_msg, end_msg)
    n_deletes = sum(op == "delete" for op, _, _ in edit_ops)
    n_inserts = sum(op == "insert" for op, _, _ in edit_ops)
    n_replaces = sum(op == "replace" for op, _, _ in edit_ops)

    # Total number of operations; each replacement counts once here.
    n_changes = n_deletes + n_inserts + n_replaces
    # A replacement is also accounted for as one deletion plus one insertion.
    n_deletes += n_replaces
    n_inserts += n_replaces

    # Clamp the denominators so an empty message does not raise
    # ZeroDivisionError; non-empty inputs are normalised exactly as before.
    start_len = max(len(start_msg), 1)
    end_len = max(len(end_msg), 1)

    # Reuse a precomputed distance when the row provides one; fall back to
    # computing it instead of failing with KeyError when the column is absent.
    if row is not None and "editdist" in row:
        edit_distance = row["editdist"]
    else:
        edit_distance = Levenshtein.distance(start_msg, end_msg)

    return {
        "deletions": n_deletes,
        "insertions": n_inserts,
        "changes": n_changes,
        "deletions_norm": n_deletes / start_len,
        "insertions_norm": n_inserts / end_len,
        "changes_norm": n_changes / end_len,
        "lendiff": abs(len(start_msg) - len(end_msg)),
        "editdist": edit_distance,
    }
def get_statistics_for_row(row):
    """Compute edit statistics for a single record.

    The starting text is read from ``commit_msg_start`` when that key is
    present in ``row`` and from ``G_text`` otherwise; the final text is read
    from ``commit_msg_end`` when present and from ``E_text`` otherwise. The
    row itself is forwarded to ``get_statistics_for_sample``.
    """
    initial_text = row["commit_msg_start"] if "commit_msg_start" in row else row["G_text"]
    final_text = row["commit_msg_end"] if "commit_msg_end" in row else row["E_text"]
    return get_statistics_for_sample(initial_text, final_text, row=row)
def get_statistics_for_df(df: pd.DataFrame):
    """Collect per-row edit statistics of ``df`` into one array per statistic.

    Every row is passed through ``get_statistics_for_row``; the resulting
    dicts are then regrouped so that each statistic name maps to a NumPy
    array holding that statistic for all rows, in row order.

    An empty frame trips the assertion below.
    """
    per_row = []
    for _, record in df.iterrows():
        per_row.append(get_statistics_for_row(record))
    assert len(per_row) > 0

    # The keys of the first entry define which statistics are collected.
    collected = {}
    for key in per_row[0]:
        collected[key] = np.asarray([entry[key] for entry in per_row])
    return collected
def build_plotly_chart(stat_golden, stat_e2s, stat_s2e, stat_e2s_s2e, stat_name):
    """Build a distribution plot comparing one statistic across sample groups.

    Five series are plotted: the four inputs plus a "Synthetic" series made
    by concatenating ``stat_e2s``, ``stat_s2e`` and ``stat_e2s_s2e``. The
    figure is titled with ``stat_name``. As a side effect, the list of plotted
    series is pickled to ``<stat_name>_data.pkl`` under
    ``config.OUTPUT_CHARTS_DIR``.

    Returns:
        The figure produced by ``ff.create_distplot``.
    """
    synthetic = np.concatenate((stat_e2s, stat_s2e, stat_e2s_s2e), axis=0)
    series = [stat_golden, stat_e2s, stat_s2e, stat_e2s_s2e, synthetic]
    labels = ["Golden", "e2s", "s2e", "e2s+s2e", "Synthetic"]

    # Histogram and rug layers are switched off in the distplot call.
    figure = ff.create_distplot(
        series,
        labels,
        bin_size=0.05,
        show_rug=False,
        show_hist=False,
    )
    figure.update_layout(title_text=stat_name)

    # Persist the raw series next to the charts.
    dump_path = config.OUTPUT_CHARTS_DIR / f"{stat_name}_data.pkl"
    with open(dump_path, "wb") as out_file:
        pickle.dump(series, out_file)

    return figure
|