File size: 1,278 Bytes
f26a894 a8a595d f26a894 a8a595d f26a894 a8a595d f26a894 4017643 f26a894 e2a35c0 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 |
import numpy as np
import pandas as pd
def get_statistics(start_msg, end_msg, annotated_msg):
    """Compute relative deletion, insertion and total-change ratios for one edit.

    Args:
        start_msg: The message before the edit; only its length is used.
        end_msg: The message after the edit; only its length is used.
        annotated_msg: Iterable of ``(text, change_type)`` pairs. Pairs tagged
            ``'-'`` count towards deletions, pairs tagged ``'+'`` count towards
            insertions, and pairs with any other tag are ignored.

    Returns:
        A dict with three float entries:

        * ``"deletions"``  -- deleted length divided by ``len(start_msg)``
        * ``"insertions"`` -- inserted length divided by ``len(end_msg)``
        * ``"changes"``    -- (deleted + inserted) length divided by
          ``len(end_msg)``

        When a denominator is zero (empty message) the ratio is ``0.0`` if the
        corresponding count is also zero and ``float("inf")`` otherwise,
        instead of raising ``ZeroDivisionError``.
    """
    sum_deletions = 0
    sum_insertions = 0
    # Single pass so that one-shot iterables (generators) are supported.
    for text, change_type in annotated_msg:
        if change_type == '-':
            sum_deletions += len(text)
        elif change_type == '+':
            sum_insertions += len(text)

    start_length = len(start_msg)
    end_length = len(end_msg)

    return {
        "deletions": _safe_ratio(sum_deletions, start_length),
        "insertions": _safe_ratio(sum_insertions, end_length),
        "changes": _safe_ratio(sum_deletions + sum_insertions, end_length),
    }


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator`` without raising on a zero denominator.

    A zero denominator yields ``0.0`` when the numerator is also zero (nothing
    changed) and ``float("inf")`` otherwise (change relative to nothing).
    """
    if denominator:
        return numerator / denominator
    return float("inf") if numerator else 0.0
def get_statistics_for_df(df: pd.DataFrame, start_col, end_col, annotated_col):
    """Run ``get_statistics`` on every row of *df* and gather the results per statistic.

    Args:
        df: Frame to iterate over row by row.
        start_col: Label of the column passed as ``start_msg``.
        end_col: Label of the column passed as ``end_msg``.
        annotated_col: Label of the column passed as ``annotated_msg``.

    Returns:
        A dict mapping each statistic name (the keys of the first row's
        result) to a NumPy array holding that statistic for every row, in
        row order.

    Raises:
        AssertionError: If *df* yields no rows.
    """
    per_row = []
    for _, record in df.iterrows():
        per_row.append(
            get_statistics(record[start_col], record[end_col], record[annotated_col])
        )

    assert len(per_row) > 0

    # The first row's result defines which statistics are collected.
    collected = {}
    for metric in per_row[0]:
        collected[metric] = np.asarray([entry[metric] for entry in per_row])
    return collected
def get_statistics_for_manual_df(df):
    """Collect edit statistics from *df* using the manual-data column names.

    Delegates to ``get_statistics_for_df`` with the start message read from
    ``commit_msg_start``, the end message from ``commit_msg_end`` and the
    annotated pairs from ``annotated_diff``.
    """
    column_names = {
        "start_col": "commit_msg_start",
        "end_col": "commit_msg_end",
        "annotated_col": "annotated_diff",
    }
    return get_statistics_for_df(df, **column_names)
def get_statistics_for_synthetic_df(df):
    """Collect edit statistics from *df* using the synthetic-data column names.

    Delegates to ``get_statistics_for_df`` with the start message read from
    ``initial_msg_pred``, the end message from ``reference`` and the
    annotated pairs from ``annotated_diff``.
    """
    column_names = {
        "start_col": "initial_msg_pred",
        "end_col": "reference",
        "annotated_col": "annotated_diff",
    }
    return get_statistics_for_df(df, **column_names)
|