import numpy as np | |
import pandas as pd | |
def _ratio(count, length):
    """Return ``count / length``, tolerating an empty reference text."""
    if length:
        return count / length
    # Empty reference text: "nothing changed" is a ratio of 0.0, while any
    # change measured against zero characters is unbounded.
    return 0.0 if count == 0 else float("inf")


def get_statistics(start_msg, end_msg, annotated_msg):
    """Compute normalized edit statistics for one annotated diff.

    Args:
        start_msg: The text before the edit; only its length is used.
        end_msg: The text after the edit; only its length is used.
        annotated_msg: Iterable of ``(text, change_type)`` pairs. Pairs whose
            ``change_type`` is ``'-'`` count as deletions and ``'+'`` as
            insertions; every other marker is ignored.

    Returns:
        A dict with three float entries:

        * ``"deletions"``: deleted characters divided by ``len(start_msg)``.
        * ``"insertions"``: inserted characters divided by ``len(end_msg)``.
        * ``"changes"``: deleted plus inserted characters divided by
          ``len(end_msg)``.

        When the relevant message is empty, the ratio is ``0.0`` if the
        numerator is also zero and ``float("inf")`` otherwise, instead of
        raising ``ZeroDivisionError``.
    """
    sum_deletions = 0
    sum_insertions = 0
    for text, change_type in annotated_msg:
        if change_type == "-":
            sum_deletions += len(text)
        elif change_type == "+":
            sum_insertions += len(text)

    start_length = len(start_msg)
    end_length = len(end_msg)
    return {
        "deletions": _ratio(sum_deletions, start_length),
        "insertions": _ratio(sum_insertions, end_length),
        "changes": _ratio(sum_deletions + sum_insertions, end_length),
    }
def get_statistics_for_df(df: pd.DataFrame):
    """Compute edit statistics for every row of ``df``.

    Each row's ``"commit_msg_start"``, ``"commit_msg_end"`` and
    ``"annotated_diff"`` values are passed to ``get_statistics``.

    Args:
        df: DataFrame providing the three columns named above.

    Returns:
        A dict mapping each statistic name returned by ``get_statistics`` to
        a NumPy array holding that statistic for every row, in row order.

    Raises:
        ValueError: If ``df`` has no rows, since the statistic names are
            taken from the first row's result.
    """
    # Iterating the columns in lockstep avoids building a Series per row,
    # which is what ``iterrows`` does.
    stats = [
        get_statistics(start_msg, end_msg, annotated_diff)
        for start_msg, end_msg, annotated_diff in zip(
            df["commit_msg_start"], df["commit_msg_end"], df["annotated_diff"]
        )
    ]
    # Explicit check instead of ``assert`` so it still runs under ``python -O``.
    if not stats:
        raise ValueError("cannot compute statistics for an empty DataFrame")
    return {
        stat_name: np.asarray([entry[stat_name] for entry in stats])
        for stat_name in stats[0]
    }