File size: 2,475 Bytes
7ab7be2
 
303303b
f26a894
 
303303b
7ab7be2
 
 
f26a894
 
347f566
303303b
 
 
 
 
 
 
 
f26a894
 
303303b
 
 
 
 
 
 
5bd86a2
 
347f566
f26a894
 
 
347f566
 
 
 
 
 
c151bb0
347f566
c151bb0
a8a595d
 
 
 
303303b
 
 
 
 
 
7ab7be2
303303b
 
7ab7be2
303303b
 
 
7ab7be2
 
 
303303b
7ab7be2
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
import pickle

import Levenshtein
import numpy as np
import pandas as pd
import plotly.figure_factory as ff
from scipy.stats import stats

import config


def get_statistics_for_sample(start_msg, end_msg, row=None):
    """Compute character-level edit statistics between two messages.

    The edit script is obtained from ``Levenshtein.editops(start_msg, end_msg)``.
    A ``replace`` operation is counted once in ``changes`` but contributes to
    both ``deletions`` and ``insertions`` (it removes one character and adds
    another).

    Args:
        start_msg: The message before editing.
        end_msg: The message after editing.
        row: Optional mapping-like object. When given, its
            ``"editdist_related"`` entry is used as the edit distance instead
            of recomputing it.

    Returns:
        A dict with the keys ``deletions``, ``insertions``, ``changes``,
        ``deletions_norm`` (deletions / len(start_msg)), ``insertions_norm``
        (insertions / len(end_msg)), ``changes_norm`` (changes / len(end_msg)),
        ``lendiff`` (absolute length difference) and ``editdist``.

    Raises:
        KeyError: If ``row`` is given but has no ``"editdist_related"`` entry
            (for dict-like rows).
    """
    # Single pass over the edit script instead of three list-building passes.
    op_counts = {"delete": 0, "insert": 0, "replace": 0}
    for op, _, _ in Levenshtein.editops(start_msg, end_msg):
        if op in op_counts:
            op_counts[op] += 1

    n_replaces = op_counts["replace"]
    n_changes = op_counts["delete"] + op_counts["insert"] + n_replaces
    # A replacement both removes and adds a character.
    n_deletes = op_counts["delete"] + n_replaces
    n_inserts = op_counts["insert"] + n_replaces

    start_len = len(start_msg)
    end_len = len(end_msg)
    # Clamp the denominators so that an empty message does not raise
    # ZeroDivisionError. For non-empty messages the result is unchanged; for
    # an empty message the count is reported un-normalised (divided by 1).
    start_denom = max(start_len, 1)
    end_denom = max(end_len, 1)

    if row is not None:
        edit_distance = row["editdist_related"]
    else:
        edit_distance = Levenshtein.distance(start_msg, end_msg)

    return {
        "deletions": n_deletes,
        "insertions": n_inserts,
        "changes": n_changes,

        "deletions_norm": n_deletes / start_denom,
        "insertions_norm": n_inserts / end_denom,
        "changes_norm": n_changes / end_denom,

        "lendiff": abs(start_len - end_len),
        "editdist": edit_distance,
    }


def get_statistics_for_row(row):
    """Compute edit statistics for one row.

    Reads the ``"commit_msg_start"`` and ``"commit_msg_end"`` entries of
    ``row`` and forwards them, together with the row itself, to
    ``get_statistics_for_sample``.
    """
    return get_statistics_for_sample(
        row["commit_msg_start"],
        row["commit_msg_end"],
        row=row,
    )


def get_statistics_for_df(df: pd.DataFrame):
    """Collect per-row edit statistics of ``df`` into one array per statistic.

    Every row is passed through ``get_statistics_for_row``; the resulting
    dicts are then transposed into a single dict that maps each statistic
    name to a NumPy array holding that statistic for all rows, in row order.

    The frame must contain at least one row (checked with an ``assert``).
    """
    per_row = []
    for _, row in df.iterrows():
        per_row.append(get_statistics_for_row(row))

    assert len(per_row) > 0

    # The first row's keys define which statistics are collected.
    by_statistic = {}
    for key in per_row[0]:
        by_statistic[key] = np.asarray([entry[key] for entry in per_row])
    return by_statistic


def build_plotly_chart(stat_golden, stat_e2s, stat_s2e, stat_e2s_s2e, stat_name):
    """Build a distribution plot comparing one statistic across groups.

    Five series are plotted: the four inputs ('Golden', 'e2s', 's2e',
    'e2s+s2e') plus 'Synthetic', which is the concatenation of the three
    non-golden inputs. The figure title is ``stat_name``.

    Side effect: the plotted series are pickled to
    ``config.OUTPUT_CHARTS_DIR / f"{stat_name}_data.pkl"``
    (presumably ``OUTPUT_CHARTS_DIR`` is a ``pathlib.Path`` - confirm in
    ``config``).

    Returns:
        The figure produced by ``ff.create_distplot``.
    """
    # Pool every non-golden sample into one combined series.
    all_synthetic = np.concatenate((stat_e2s, stat_s2e, stat_e2s_s2e), axis=0)

    hist_data = [stat_golden, stat_e2s, stat_s2e, stat_e2s_s2e, all_synthetic]
    group_labels = ['Golden', 'e2s', 's2e', 'e2s+s2e', 'Synthetic']

    # Curves only: histogram bars and rug marks are switched off.
    fig = ff.create_distplot(
        hist_data,
        group_labels,
        bin_size=.05,
        show_rug=False,
        show_hist=False,
    )
    fig.update_layout(title_text=stat_name)

    # Persist the raw series next to the charts.
    dump_path = config.OUTPUT_CHARTS_DIR / f"{stat_name}_data.pkl"
    with open(dump_path, "wb") as f:
        pickle.dump(hist_data, f)

    return fig


def t_test(group_stats, main_group="manual"):
    """Run Welch's t-test of every group against the main group.

    For each group and each statistic in that group, the main group's sample
    for the same statistic is compared with the group's sample using an
    independent two-sample t-test with ``equal_var=False``.

    Args:
        group_stats: Mapping ``group name -> {statistic name -> sample}``,
            where each sample is array-like.
        main_group: Name of the reference group inside ``group_stats``.

    Returns:
        A dict ``group name -> {statistic name -> p-value}``. The main group
        itself is included (compared with itself).

    Raises:
        KeyError: If ``main_group`` is missing from ``group_stats`` or lacks a
            statistic that another group has.
    """
    # Import from the public namespace: the module-level ``scipy.stats.stats``
    # is a deprecated alias. Kept local so this block does not depend on a
    # change to the file's import section.
    from scipy.stats import ttest_ind

    results = {}
    for group in group_stats:
        group_values = group_stats[group]
        p_values = {}
        for stat_name in group_values:
            reference = group_stats[main_group][stat_name]
            sample = group_values[stat_name]
            # ``random_state`` is intentionally not passed: SciPy only uses it
            # for permutation tests (``permutations`` set), so it was a no-op
            # here, and the argument is deprecated in recent SciPy releases.
            p_values[stat_name] = ttest_ind(reference, sample, equal_var=False).pvalue
        results[group] = p_values

    return results