import numpy as np
import pandas as pd


def _safe_ratio(numerator, denominator):
    """Return ``numerator / denominator`` without raising on a zero denominator.

    When the denominator is zero, 0.0 is returned if the numerator is also
    zero (nothing changed relative to nothing); otherwise ``inf`` is returned
    so the degenerate case stays visible to callers instead of crashing.
    """
    if denominator:
        return numerator / denominator
    return 0.0 if not numerator else float("inf")


def get_statistics(start_msg, end_msg, annotated_msg):
    """Compute relative edit statistics between two versions of a message.

    Args:
        start_msg: The original message; only its length is used.
        end_msg: The edited message; only its length is used.
        annotated_msg: Iterable of ``(text, change_type)`` pairs. Pairs whose
            ``change_type`` is ``'-'`` count as deletions, ``'+'`` as
            insertions; any other change type is ignored.

    Returns:
        A dict with three float entries:
            ``"deletions"``: deleted characters / ``len(start_msg)``
            ``"insertions"``: inserted characters / ``len(end_msg)``
            ``"changes"``: (deleted + inserted characters) / ``len(end_msg)``
        If a denominator is zero, the ratio is 0.0 when its numerator is also
        zero and ``inf`` otherwise (previously this raised
        ``ZeroDivisionError``).
    """
    sum_deletions = 0
    sum_insertions = 0
    for text, change_type in annotated_msg:
        if change_type == '-':
            sum_deletions += len(text)
        elif change_type == '+':
            sum_insertions += len(text)

    sum_changes = sum_deletions + sum_insertions
    start_length = len(start_msg)
    end_length = len(end_msg)

    return {
        "deletions": _safe_ratio(sum_deletions, start_length),
        "insertions": _safe_ratio(sum_insertions, end_length),
        "changes": _safe_ratio(sum_changes, end_length),
    }


def get_statistics_for_df(df: pd.DataFrame):
    """Compute per-row edit statistics for a data frame of message edits.

    Args:
        df: Data frame with the columns ``"commit_msg_start"``,
            ``"commit_msg_end"`` and ``"annotated_diff"``; each row is passed
            to :func:`get_statistics`.

    Returns:
        A dict mapping each statistic name returned by
        :func:`get_statistics` to a NumPy array holding one value per row,
        in row order.

    Raises:
        AssertionError: If ``df`` has no rows.
    """
    # Zipping the columns avoids building a Series for every row, which is
    # what ``DataFrame.iterrows`` does.
    stats = [
        get_statistics(start_msg, end_msg, annotated_msg)
        for start_msg, end_msg, annotated_msg in zip(
            df["commit_msg_start"], df["commit_msg_end"], df["annotated_diff"]
        )
    ]

    # Raised explicitly (rather than via ``assert``) so the check survives
    # ``python -O`` while keeping the exception type callers may rely on.
    if not stats:
        raise AssertionError("cannot compute statistics for an empty data frame")

    return {
        stat_name: np.asarray([entry[stat_name] for entry in stats])
        for stat_name in stats[0]
    }