|
import pickle |
|
|
|
import Levenshtein |
|
import numpy as np |
|
import pandas as pd |
|
import plotly.figure_factory as ff |
|
from scipy.stats import stats |
|
|
|
import config |
|
|
|
|
|
def get_statistics_for_sample(start_msg, end_msg, row=None):
    """Compute edit statistics between a start message and an end message.

    The operations reported by ``Levenshtein.editops`` are tallied by kind.
    A replacement counts once towards ``changes`` and is additionally
    counted as one deletion plus one insertion.

    Args:
        start_msg: Message the edit script starts from.
        end_msg: Message the edit script ends at.
        row: Optional record. When given, its ``"editdist_related"`` entry is
            reported as ``editdist`` instead of computing
            ``Levenshtein.distance`` for the two messages.

    Returns:
        A dict with the keys ``deletions``, ``insertions``, ``changes``,
        ``deletions_norm`` (divided by ``len(start_msg)``),
        ``insertions_norm`` and ``changes_norm`` (both divided by
        ``len(end_msg)``), ``lendiff`` and ``editdist``.

    Raises:
        ZeroDivisionError: If ``start_msg`` or ``end_msg`` is empty.
    """
    # One pass over the edit script, counting each kind of operation.
    tally = {"delete": 0, "insert": 0, "replace": 0}
    for op, _, _ in Levenshtein.editops(start_msg, end_msg):
        if op in tally:
            tally[op] += 1

    replaced = tally["replace"]
    # The total is taken before replacements are folded into the other two
    # counters, so a replacement contributes exactly one change.
    total_changes = tally["delete"] + tally["insert"] + replaced
    deleted = tally["delete"] + replaced
    inserted = tally["insert"] + replaced

    start_len = len(start_msg)
    end_len = len(end_msg)

    summary = {
        "deletions": deleted,
        "insertions": inserted,
        "changes": total_changes,

        "deletions_norm": deleted / start_len,
        "insertions_norm": inserted / end_len,
        "changes_norm": total_changes / end_len,

        "lendiff": abs(start_len - end_len),
    }

    if row is None:
        summary["editdist"] = Levenshtein.distance(start_msg, end_msg)
    else:
        summary["editdist"] = row["editdist_related"]

    return summary
|
|
|
|
|
def get_statistics_for_row(row):
    """Compute edit statistics for a single record.

    Reads ``row["commit_msg_start"]`` and ``row["commit_msg_end"]`` and
    forwards them, together with the record itself, to
    ``get_statistics_for_sample``.

    Args:
        row: Record providing the two message fields.

    Returns:
        The statistics dict produced by ``get_statistics_for_sample``.
    """
    return get_statistics_for_sample(
        row["commit_msg_start"],
        row["commit_msg_end"],
        row=row,
    )
|
|
|
|
|
def get_statistics_for_df(df: pd.DataFrame):
    """Collect per-row edit statistics of a data frame into arrays.

    Every row of ``df`` is passed to ``get_statistics_for_row``; the
    resulting dicts are then regrouped by statistic name.

    Args:
        df: Data frame whose rows are handed to ``get_statistics_for_row``.

    Returns:
        A dict mapping each statistic name (taken from the first row's
        result) to a NumPy array holding that statistic for every row, in
        row order.

    Raises:
        AssertionError: If ``df`` yields no rows.
    """
    per_row = []
    for _, record in df.iterrows():
        per_row.append(get_statistics_for_row(record))

    assert len(per_row) > 0

    # The first result defines which statistics exist and in which order.
    columns = {}
    for stat_name in per_row[0]:
        values = [entry[stat_name] for entry in per_row]
        columns[stat_name] = np.asarray(values)

    return columns
|
|
|
|
|
def build_plotly_chart(stat_golden, stat_e2s, stat_s2e, stat_e2s_s2e, stat_name):
    """Build a distribution plot for one statistic and dump its raw data.

    Five series are plotted: the four inputs plus a ``Synthetic`` series that
    concatenates ``stat_e2s``, ``stat_s2e`` and ``stat_e2s_s2e``. The plot is
    created with the histogram and rug layers turned off and is titled with
    ``stat_name``.

    As a side effect, the list of plotted series is pickled to
    ``config.OUTPUT_CHARTS_DIR / f"{stat_name}_data.pkl"``.

    Args:
        stat_golden: Values labelled ``Golden``.
        stat_e2s: Values labelled ``e2s``.
        stat_s2e: Values labelled ``s2e``.
        stat_e2s_s2e: Values labelled ``e2s+s2e``.
        stat_name: Name of the statistic; used as the figure title and as
            the prefix of the pickle file name.

    Returns:
        The figure returned by ``ff.create_distplot``.
    """
    synthetic = np.concatenate((stat_e2s, stat_s2e, stat_e2s_s2e), axis=0)
    series = [stat_golden, stat_e2s, stat_s2e, stat_e2s_s2e, synthetic]
    labels = ['Golden', 'e2s', 's2e', 'e2s+s2e', 'Synthetic']

    chart = ff.create_distplot(
        series,
        labels,
        bin_size=.05,
        show_rug=False,
        show_hist=False,
    )
    chart.update_layout(title_text=stat_name)

    # Persist the exact series that were plotted, next to the charts.
    dump_path = config.OUTPUT_CHARTS_DIR / f"{stat_name}_data.pkl"
    with open(dump_path, "wb") as sink:
        pickle.dump(series, sink)

    return chart
|
|
|
|
|
def t_test(group_stats, main_group="manual"):
    """Compare every group's statistics against a reference group.

    For each group and each statistic of that group, an independent
    two-sample t-test without the equal-variance assumption
    (``equal_var=False``) is run between the reference group's values and
    the group's values.

    Args:
        group_stats: Container indexed by group name; each entry is indexed
            by statistic name and holds the sample values for that statistic.
        main_group: Name of the reference group every group is compared to.

    Returns:
        A nested dict ``results[group][stat]`` holding the p-value of the
        test. The reference group is compared with itself as well.

    Raises:
        KeyError: For dict inputs, if ``main_group`` is missing or lacks a
            statistic that another group provides.
    """
    results = {}
    for group in group_stats:
        p_values = {}
        for stat in group_stats[group]:
            reference = group_stats[main_group][stat]
            candidate = group_stats[group][stat]

            # No random_state is passed: SciPy only consults it when a
            # permutation test is requested via `permutations`, which this
            # call does not do, so the p-value is deterministic without it.
            outcome = stats.ttest_ind(reference, candidate, equal_var=False)
            p_values[stat] = outcome.pvalue
        results[group] = p_values

    return results
|
|