|
import diff_match_patch as dmp_module |
|
|
|
import hf_data_loader |
|
|
|
|
|
def get_annotated_diff(start_text, end_text):
    """Diff two texts and return the result as a list of tagged segments.

    The diff is computed with diff-match-patch and then passed through its
    semantic cleanup step before being converted.

    Args:
        start_text: The text before the change.
        end_text: The text after the change.

    Returns:
        A list of two-element lists ``[segment, tag]`` in diff order, where
        ``tag`` is ``'-'`` for operation code -1, ``'+'`` for operation
        code 1, and ``None`` for operation code 0 (in diff-match-patch these
        codes mean delete, insert and equal respectively).
    """
    # diff-match-patch operation code -> tag attached to the segment.
    tag_by_op = {-1: '-', 0: None, 1: '+'}

    differ = dmp_module.diff_match_patch()
    segments = differ.diff_main(start_text, end_text)
    # The cleanup call's return value is not used; `segments` is read again
    # afterwards, so the cleanup is expected to act on the list in place.
    differ.diff_cleanupSemantic(segments)

    # Each diff entry is an (operation, text) pair; emit it as [text, tag].
    return [[text, tag_by_op[op]] for op, text in segments]
|
|
|
|
|
def annotated_diff_for_row_manual_df(row):
    """Build the annotated diff for a single row of the manual dataset.

    Args:
        row: A mapping-like record (e.g. a DataFrame row) that provides the
            ``'commit_msg_start'`` and ``'commit_msg_end'`` fields.

    Returns:
        The result of ``get_annotated_diff`` going from the row's
        ``'commit_msg_start'`` value to its ``'commit_msg_end'`` value.
    """
    return get_annotated_diff(row['commit_msg_start'], row['commit_msg_end'])
|
|
|
|
|
def annotated_diff_for_row_synthetic_df(row):
    """Build the annotated diff for a single row of the synthetic dataset.

    Args:
        row: A mapping-like record (e.g. a DataFrame row) that provides the
            ``'initial_msg_pred'`` and ``'reference'`` fields.

    Returns:
        The result of ``get_annotated_diff`` going from the row's
        ``'initial_msg_pred'`` value to its ``'reference'`` value.
    """
    predicted_msg = row['initial_msg_pred']
    reference_msg = row['reference']
    return get_annotated_diff(start_text=predicted_msg, end_text=reference_msg)
|
|
|
|
|
def manual_data_with_annotated_diffs():
    """Load the raw rewriting dataset and attach an annotated diff per row.

    Returns:
        The object returned by
        ``hf_data_loader.load_raw_rewriting_dataset_as_pandas()`` with an
        added ``'annotated_diff'`` column holding the result of
        ``annotated_diff_for_row_manual_df`` for each row.
    """
    manual_df = hf_data_loader.load_raw_rewriting_dataset_as_pandas()
    # axis=1 applies the function row by row; the column is added to the
    # loaded frame itself rather than to a copy.
    manual_df['annotated_diff'] = manual_df.apply(
        annotated_diff_for_row_manual_df,
        axis=1,
    )
    return manual_df
|
|
|
|
|
def synthetic_data_with_annotated_diffs():
    """Load the synthetic dataset and attach an annotated diff per row.

    Returns:
        The object returned by
        ``hf_data_loader.load_synthetic_dataset_as_pandas()`` with an added
        ``'annotated_diff'`` column holding the result of
        ``annotated_diff_for_row_synthetic_df`` for each row.
    """
    synthetic_df = hf_data_loader.load_synthetic_dataset_as_pandas()
    # axis=1 applies the function row by row; the column is added to the
    # loaded frame itself rather than to a copy.
    synthetic_df['annotated_diff'] = synthetic_df.apply(
        annotated_diff_for_row_synthetic_df,
        axis=1,
    )
    return synthetic_df
|
|