import diff_match_patch as dmp_module | |
from api_wrappers import hf_data_loader | |
def get_annotated_diff(start_text, end_text):
    """Diff two texts and return the result as ``[text, label]`` pairs.

    The diff is computed with diff-match-patch and then passed through its
    semantic cleanup step. Each ``(op, text)`` entry of the diff becomes a
    two-element list ``[text, label]`` where the numeric op code is mapped
    to a label: ``-1`` -> ``'-'``, ``0`` -> ``None``, ``1`` -> ``'+'``.

    Args:
        start_text: The text to diff from.
        end_text: The text to diff to.

    Returns:
        A list of ``[text, label]`` lists, in the order produced by the diff.
    """
    differ = dmp_module.diff_match_patch()
    segments = differ.diff_main(start_text, end_text)
    # The return value is ignored on purpose: the code relies on the cleanup
    # rewriting `segments` in place.
    differ.diff_cleanupSemantic(segments)

    op_labels = {-1: '-', 0: None, 1: '+'}
    annotated = []
    for op, text in segments:
        annotated.append([text, op_labels[op]])
    return annotated
def annotated_diff_for_row(row):
    """Return the annotated diff between a row's start and end commit messages.

    Args:
        row: A mapping-like object (e.g. a DataFrame row) providing the keys
            ``'commit_msg_start'`` and ``'commit_msg_end'``.

    Returns:
        Whatever ``get_annotated_diff`` returns for those two values.
    """
    return get_annotated_diff(row['commit_msg_start'], row['commit_msg_end'])
def manual_data_with_annotated_diffs():
    """Load the raw rewriting dataset and add an ``'annotated_diff'`` column.

    The column is filled by applying ``annotated_diff_for_row`` to every row
    of the frame returned by
    ``hf_data_loader.load_raw_rewriting_dataset_as_pandas()``.

    Returns:
        The loaded frame, modified in place with the extra column.
    """
    dataset = hf_data_loader.load_raw_rewriting_dataset_as_pandas()
    # axis=1 hands each row (not each column) to the callback.
    dataset['annotated_diff'] = dataset.apply(annotated_diff_for_row, axis=1)
    return dataset
def synthetic_data_with_annotated_diffs():
    """Load the synthetic dataset and add an ``'annotated_diff'`` column.

    The column is filled by applying ``annotated_diff_for_row`` to every row
    of the frame returned by
    ``hf_data_loader.load_synthetic_dataset_as_pandas()``.

    Returns:
        The loaded frame, modified in place with the extra column.
    """
    dataset = hf_data_loader.load_synthetic_dataset_as_pandas()
    # axis=1 hands each row (not each column) to the callback.
    dataset['annotated_diff'] = dataset.apply(annotated_diff_for_row, axis=1)
    return dataset