File size: 2,156 Bytes
7ab7be2
 
303303b
f26a894
 
303303b
7ab7be2
 
f26a894
 
347f566
303303b
a6b5a66
 
 
303303b
 
 
 
f26a894
 
303303b
 
 
 
 
 
5bd86a2
86f1b98
f26a894
 
 
347f566
86f1b98
a6b5a66
86f1b98
 
 
a6b5a66
86f1b98
 
 
347f566
 
c151bb0
a6b5a66
a8a595d
 
 
 
303303b
 
 
a6b5a66
 
 
 
 
 
 
303303b
a6b5a66
303303b
a6b5a66
303303b
 
 
7ab7be2
 
 
303303b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import pickle

import Levenshtein
import numpy as np
import pandas as pd
import plotly.figure_factory as ff

import config


def get_statistics_for_sample(start_msg, end_msg, row=None):
    """Compute edit statistics between two versions of a message.

    The edit script is obtained from ``Levenshtein.editops``. A replacement
    counts as a single change, but contributes to both the deletion and the
    insertion totals.

    Args:
        start_msg: Message text before editing.
        end_msg: Message text after editing.
        row: Optional mapping-like record. If it contains an ``"editdist"``
            entry, that value is reused; otherwise the distance is computed
            with ``Levenshtein.distance``.

    Returns:
        A dict with the keys ``deletions``, ``insertions``, ``changes``,
        ``deletions_norm``, ``insertions_norm``, ``changes_norm``,
        ``lendiff`` and ``editdist``. ``deletions_norm`` is divided by the
        length of ``start_msg``; ``insertions_norm`` and ``changes_norm`` are
        divided by the length of ``end_msg``. A zero length is treated as 1
        so that empty messages do not raise ``ZeroDivisionError``.
    """
    edit_ops = Levenshtein.editops(start_msg, end_msg)
    n_deletes = sum(op == "delete" for op, _, _ in edit_ops)
    n_inserts = sum(op == "insert" for op, _, _ in edit_ops)
    n_replaces = sum(op == "replace" for op, _, _ in edit_ops)

    # Total is taken before replacements are folded into the other counters,
    # so each replacement is counted once here.
    n_changes = n_deletes + n_inserts + n_replaces
    n_deletes += n_replaces
    n_inserts += n_replaces

    # Clamp denominators: an empty message would otherwise divide by zero.
    start_len = max(len(start_msg), 1)
    end_len = max(len(end_msg), 1)

    # Reuse the stored distance when the record has one; otherwise compute it.
    if row is not None and "editdist" in row:
        edit_dist = row["editdist"]
    else:
        edit_dist = Levenshtein.distance(start_msg, end_msg)

    return {
        "deletions": n_deletes,
        "insertions": n_inserts,
        "changes": n_changes,
        "deletions_norm": n_deletes / start_len,
        "insertions_norm": n_inserts / end_len,
        "changes_norm": n_changes / end_len,
        "lendiff": abs(len(start_msg) - len(end_msg)),
        "editdist": edit_dist,
    }


def get_statistics_for_row(row):
    """Compute edit statistics for one record.

    The starting text is read from ``commit_msg_start`` when that key is
    present in ``row`` and from ``G_text`` otherwise; the ending text is read
    from ``commit_msg_end`` when present and from ``E_text`` otherwise. The
    record itself is forwarded to ``get_statistics_for_sample``.

    Args:
        row: Mapping-like record holding the two message texts.

    Returns:
        The statistics dict produced by ``get_statistics_for_sample``.
    """
    before = row["commit_msg_start"] if "commit_msg_start" in row else row["G_text"]
    after = row["commit_msg_end"] if "commit_msg_end" in row else row["E_text"]
    return get_statistics_for_sample(before, after, row=row)


def get_statistics_for_df(df: pd.DataFrame):
    """Collect per-row edit statistics of a DataFrame into arrays.

    Args:
        df: Frame whose rows are each passed to ``get_statistics_for_row``.

    Returns:
        A dict mapping every statistic name to a NumPy array holding that
        statistic for each row, in row order.

    Raises:
        AssertionError: If ``df`` has no rows.
    """
    per_row = []
    for _, record in df.iterrows():
        per_row.append(get_statistics_for_row(record))

    assert len(per_row) > 0

    # The first row's keys define which statistics are collected.
    columns = {}
    for name in per_row[0]:
        columns[name] = np.asarray([entry[name] for entry in per_row])
    return columns


def build_plotly_chart(stat_golden, stat_e2s, stat_s2e, stat_e2s_s2e, stat_name):
    """Build a distribution plot comparing one statistic across groups.

    Five series are plotted: the four inputs ("Golden", "e2s", "s2e",
    "e2s+s2e") and "Synthetic", which is the concatenation of the three
    non-golden inputs. After the figure is created, the plotted series are
    also pickled to ``config.OUTPUT_CHARTS_DIR / f"{stat_name}_data.pkl"``.

    Args:
        stat_golden: Values of the statistic for the golden group.
        stat_e2s: Values of the statistic for the e2s group.
        stat_s2e: Values of the statistic for the s2e group.
        stat_e2s_s2e: Values of the statistic for the e2s+s2e group.
        stat_name: Name used as the figure title and in the pickle filename.

    Returns:
        The figure returned by ``ff.create_distplot`` with its title set.
    """
    synthetic = np.concatenate((stat_e2s, stat_s2e, stat_e2s_s2e), axis=0)
    series = [stat_golden, stat_e2s, stat_s2e, stat_e2s_s2e, synthetic]
    labels = ["Golden", "e2s", "s2e", "e2s+s2e", "Synthetic"]

    # Curves only: histogram bars and rug marks are switched off.
    figure = ff.create_distplot(
        series,
        labels,
        bin_size=0.05,
        show_hist=False,
        show_rug=False,
    )
    figure.update_layout(title_text=stat_name)

    # Persist the raw series next to the charts.
    dump_path = config.OUTPUT_CHARTS_DIR / f"{stat_name}_data.pkl"
    with open(dump_path, "wb") as sink:
        pickle.dump(series, sink)

    return figure