saridormi committed on
Commit 86f1b98 • 1 Parent(s): abb3f0c

some fixes to gradio app

analysis_util.py DELETED
@@ -1,74 +0,0 @@
-import functools
-import operator
-
-import pandas as pd
-
-
-def correlations_for_group(group):
-    REL_METRICS = [col.split("_")[0] for col in group.columns if col.endswith("_related")]
-    IND_METRICS = [col.split("_")[0] for col in group.columns if col.endswith("_independent")]
-    AGGR_METRICS = [col.split("_")[0] for col in group.columns if col.endswith("_aggr")]
-
-    correlations = []
-    for rel_metric in REL_METRICS:
-        for ind_metric in IND_METRICS:
-            correlations.append({
-                f"rel_{rel_metric}_ind_{ind_metric}_pearson": group[f"{rel_metric}_related"].corr(
-                    group[f"{ind_metric}_independent"], method="pearson"),
-                f"rel_{rel_metric}_ind_{ind_metric}_spearman": group[f"{rel_metric}_related"].corr(
-                    group[f"{ind_metric}_independent"], method="spearman"),
-            })
-        for aggr_metric in AGGR_METRICS:
-            correlations.append({
-                f"rel_{rel_metric}_aggr_{aggr_metric}_pearson": group[f"{rel_metric}_related"].corr(
-                    group[f"{aggr_metric}_aggr"], method="pearson"),
-                f"rel_{rel_metric}_aggr_{aggr_metric}_spearman": group[f"{rel_metric}_related"].corr(
-                    group[f"{aggr_metric}_aggr"], method="spearman"),
-            })
-    return pd.Series(functools.reduce(operator.ior, correlations, {}))
-
-
-def split_metrics_string(s):
-    tokens = s.split("_")
-    return tokens[1], tokens[3]
-
-
-def get_correlations_df(df, right_side):
-    correlations_raw = correlations_for_group(df)
-
-    idx = list(set("_".join(col.split("_")[:-1]) for col in correlations_raw.index if right_side in col))
-
-    data = []
-    for metrics in idx:
-        data.append(
-            {"metrics": metrics,
-             "spearman": correlations_raw[f"{metrics}_spearman"],
-             "pearson": correlations_raw[f"{metrics}_pearson"],
-             }
-        )
-
-    result = pd.DataFrame.from_records(data=data, index="metrics").sort_index()
-    result.index = pd.MultiIndex.from_tuples(result.index.map(split_metrics_string).tolist())
-    result.index.set_names(["relative", "independent"], inplace=True)
-
-    return result
-
-
-def get_correlations_for_groups(df, right_side):
-    correlations = {"all": get_correlations_df(df, right_side=right_side)}
-
-    for e2s in (False, True):
-        for s2e in (False, True):
-            group = "golden"
-            if e2s:
-                group += "+e2s"
-            if s2e:
-                group += "+s2e"
-
-            subdf = df[((df["end_to_start"] == e2s) & (df["start_to_end"] == s2e)) | (
-                    (df["end_to_start"] == False) & (df["start_to_end"] == False))]
-            subdf_corr = get_correlations_df(subdf, right_side=right_side)
-            correlations[group] = subdf_corr
-
-    correlations = pd.concat(correlations, axis=1)
-    return correlations
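Note: the deleted correlations_for_group merged its per-metric dicts with functools.reduce(operator.ior, correlations, {}). A minimal standalone illustration of that idiom (dict |= needs Python 3.9+; the values here are invented):

import functools
import operator

# reduce applies `acc |= part` left to right, starting from an empty dict
parts = [{"a_pearson": 0.9}, {"a_spearman": 0.8}, {"b_pearson": 0.1}]
merged = functools.reduce(operator.ior, parts, {})
print(merged)  # {'a_pearson': 0.9, 'a_spearman': 0.8, 'b_pearson': 0.1}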
api_wrappers/hf_data_loader.py CHANGED
@@ -2,6 +2,7 @@ import json
 import os
 from datetime import datetime, timedelta
 
+import pandas as pd
 from datasets import load_dataset
 from huggingface_hub import hf_hub_download, list_repo_tree
 
@@ -66,7 +67,7 @@ def load_processed_rewriting_as_pandas():
 
 def load_synthetic_as_pandas():
     return load_dataset(config.HF_SYNTHETIC_DATASET_NAME,
-                        "raw",
+                        "all_pairs_with_metrics",
                         split=config.HF_SYNTHETIC_DATASET_SPLIT,
                         token=config.HF_TOKEN,
                         cache_dir=config.CACHE_DIR).to_pandas()
@@ -75,21 +76,21 @@ def load_synthetic_as_pandas():
 def load_full_commit_with_predictions_as_pandas():
     full_dataset = load_full_commit_as_pandas()
 
-    # TODO
-    # for prediction_file in list_repo_tree(repo_id=config.HF_PREDICTIONS_DATASET_NAME,
-    #                                       path=os.path.join("commit_message_generation/predictions", config.HF_PREDICTIONS_MODEL),
-    #                                       repo_type="dataset"):
-    #     hf_hub_download(prediction_file.path,
-    #                     repo_id=config.HF_PREDICTIONS_DATASET_NAME,
-    #                     repo_type="dataset",)
-
-    predictions_dataset = load_dataset(config.HF_PREDICTIONS_DATASET_NAME,
-                                       config.HF_PREDICTIONS_DATASET_SUBNAME,
-                                       split=config.HF_PREDICTIONS_DATASET_SPLIT,
-                                       cache_dir=config.CACHE_DIR
-                                       ).to_pandas().sample(frac=1, random_state=config.RANDOM_STATE
-                                                            ).set_index(['hash', 'repo'])[["prediction"]]
-
+    predictions_paths = []
+    for prediction_file in list_repo_tree(repo_id=config.HF_PREDICTIONS_DATASET_NAME,
+                                          path_in_repo=os.path.join("commit_message_generation/predictions", config.HF_PREDICTIONS_MODEL),
+                                          repo_type="dataset"):
+        predictions_paths.append(hf_hub_download(repo_id=config.HF_PREDICTIONS_DATASET_NAME,
+                                                 filename=prediction_file.path,
+                                                 repo_type="dataset",
+                                                 cache_dir=config.CACHE_DIR))
+
+    dfs = []
+    for path in predictions_paths:
+        dfs.append(pd.read_json(path, orient="records", lines=True))
+    predictions_dataset = pd.concat(dfs, axis=0, ignore_index=True)
+    predictions_dataset = predictions_dataset.sample(frac=1,
+                                                     random_state=config.RANDOM_STATE).set_index(['hash', 'repo'])[["prediction"]]
     predictions_dataset = predictions_dataset[~predictions_dataset.index.duplicated(keep='first')]
 
     dataset = full_dataset.join(other=predictions_dataset, on=('hash', 'repo'))
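Note: the rewritten loader fetches prediction files straight from the predictions dataset repo rather than going through load_dataset. A minimal sketch of that flow, with placeholder repo and path names standing in for the config values:

import pandas as pd
from huggingface_hub import hf_hub_download, list_repo_tree

REPO = "some-org/predictions-dataset"                        # stand-in for config.HF_PREDICTIONS_DATASET_NAME
PREFIX = "commit_message_generation/predictions/some-model"  # stand-in for the model subfolder

# Download every file under the prefix, then stack the JSON-lines shards.
local_paths = [
    hf_hub_download(repo_id=REPO, filename=entry.path, repo_type="dataset")
    for entry in list_repo_tree(repo_id=REPO, path_in_repo=PREFIX, repo_type="dataset")
]
predictions = pd.concat(
    (pd.read_json(path, orient="records", lines=True) for path in local_paths),
    ignore_index=True,
)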
change_visualizer.py CHANGED
@@ -1,8 +1,6 @@
 import gradio as gr
 
-import analysis_util
 import generate_annotated_diffs
-import dataset_statistics
 
 df_manual = generate_annotated_diffs.manual_data_with_annotated_diffs()
 df_manual["end_to_start"] = False
@@ -14,45 +12,33 @@ n_diffs_synthetic = len(df_synthetic)
 
 
 def golden():
-    return df_synthetic[(df_synthetic['end_to_start'] == False) & (df_synthetic['start_to_end'] == False)]
+    return df_synthetic.loc[df_synthetic.is_related].loc[(df_synthetic['G_type'] == "initial") & (df_synthetic['E_type'] == "expert_labeled")]
 
 
-def e2s():
-    return df_synthetic[(df_synthetic['end_to_start'] == True) & (df_synthetic['start_to_end'] == False)]
+def backward():
+    return df_synthetic.loc[df_synthetic.is_related].loc[(df_synthetic['G_type'] == "backward") & (df_synthetic['E_type'].isin(["synthetic_forward", "synthetic_forward_from_backward"]))]
 
 
-def s2e():
-    return df_synthetic[(df_synthetic['end_to_start'] == False) & (df_synthetic['start_to_end'] == True)]
+def forward():
+    return df_synthetic.loc[df_synthetic.is_related].loc[(df_synthetic['G_type'] == "initial") & (df_synthetic['E_type'] == "synthetic_forward")]
 
 
-def e2s_s2e():
-    return df_synthetic[(df_synthetic['end_to_start'] == True) & (df_synthetic['start_to_end'] == True)]
+def forward_from_backward():
+    return df_synthetic.loc[df_synthetic.is_related].loc[(df_synthetic['G_type'] == "synthetic_backward") & (df_synthetic['E_type'] == "synthetic_forward_from_backward")]
 
 
 def synthetic():
     return df_synthetic[(df_synthetic['end_to_start'] == True) | (df_synthetic['start_to_end'] == True)]
 
 
-STATISTICS = {"manual": dataset_statistics.get_statistics_for_df(golden()),
-              "e2s": dataset_statistics.get_statistics_for_df(e2s()),
-              "s2e": dataset_statistics.get_statistics_for_df(s2e()),
-              "e2s_s2e": dataset_statistics.get_statistics_for_df(e2s_s2e()),
-              "synthetic": dataset_statistics.get_statistics_for_df(synthetic()),
-              "all": dataset_statistics.get_statistics_for_df(df_synthetic)}
-
-STATISTICS_T_TEST = dataset_statistics.t_test(STATISTICS, main_group='manual')
-
-STAT_NAMES = list(STATISTICS['manual'].keys())
-
-
 def update_dataset_view(diff_idx, df):
     diff_idx -= 1
     return (df.iloc[diff_idx]['annotated_diff'],
-            df.iloc[diff_idx]['commit_msg_start'],
-            df.iloc[diff_idx]['commit_msg_end'],
-            df.iloc[diff_idx]['session'],
-            str(df.iloc[diff_idx]['end_to_start']),
-            str(df.iloc[diff_idx]['start_to_end']),
+            df.iloc[diff_idx]['commit_msg_start'] if "commit_msg_start" in df.columns else df.iloc[diff_idx]['G_text'],
+            df.iloc[diff_idx]['commit_msg_end'] if "commit_msg_end" in df.columns else df.iloc[diff_idx]['E_text'],
+            df.iloc[diff_idx]['session'] if "session" in df.columns else "",
+            str(df.iloc[diff_idx]['end_to_start']) if "end_to_start" in df.columns else "",
+            str(df.iloc[diff_idx]['start_to_end']) if "start_to_end" in df.columns else "",
             f"https://github.com/{df.iloc[diff_idx]['repo']}/commit/{df.iloc[diff_idx]['hash']}",)
 
 
@@ -119,103 +105,6 @@ if __name__ == '__main__':
         slider_synthetic.change(update_dataset_view_synthetic,
                                 inputs=slider_synthetic,
                                 outputs=view_synthetic)
-        with gr.Tab("Analysis"):
-            def layout_for_statistics(statistics_group_name):
-                gr.Markdown(f"### {statistics_group_name}")
-                stats = STATISTICS[statistics_group_name]
-                gr.Number(label="Count", interactive=False,
-                          value=len(stats['deletions_norm']), min_width=00)
-                gr.Number(label="Avg deletions number (rel to the initial msg length)", interactive=False,
-                          value=stats['deletions_norm'].mean().item(), precision=3, min_width=00)
-                gr.Number(label="Avg insertions number (rel to the result length)", interactive=False,
-                          value=stats['insertions_norm'].mean().item(), precision=3, min_width=00)
-                gr.Number(label="Avg changes number (rel to the initial msg length)", interactive=False,
-                          value=stats['changes_norm'].mean().item(), precision=3, min_width=00)
-                gr.Number(label="Avg deletions number", interactive=False,
-                          value=stats['deletions'].mean().item(), precision=3, min_width=00)
-                gr.Number(label="Avg insertions number", interactive=False,
-                          value=stats['insertions'].mean().item(), precision=3, min_width=00)
-                gr.Number(label="Avg changes number", interactive=False,
-                          value=stats['changes'].mean().item(), precision=3, min_width=00)
-                gr.Number(label="Avg edit distance", interactive=False,
-                          value=stats['editdist'].mean().item(), precision=3, min_width=00)
-                gr.Number(label="Avg length difference", interactive=False,
-                          value=stats['lendiff'].mean().item(), precision=3, min_width=00)
-
-
-            def layout_for_statistics_t_test(statistics_group_name):
-                gr.Markdown(f"### {statistics_group_name}")
-                stats = STATISTICS_T_TEST[statistics_group_name]
-                gr.Number(label="Deletions number (rel to the initial msg length)", interactive=False,
-                          value=stats['deletions_norm'], precision=3, min_width=00)
-                gr.Number(label="Insertions number (rel to the result length)", interactive=False,
-                          value=stats['insertions_norm'], precision=3, min_width=00)
-                gr.Number(label="Changes number (rel to the initial msg length)", interactive=False,
-                          value=stats['changes_norm'], precision=3, min_width=00)
-                gr.Number(label="Deletions number", interactive=False,
-                          value=stats['deletions'], precision=3, min_width=00)
-                gr.Number(label="Insertions number", interactive=False,
-                          value=stats['insertions'], precision=3, min_width=00)
-                gr.Number(label="Changes number", interactive=False,
-                          value=stats['changes'], precision=3, min_width=00)
-
-
-            with gr.Row():
-                with gr.Column(scale=1, min_width=100):
-                    layout_for_statistics("manual")
-                with gr.Column(scale=1, min_width=100):
-                    layout_for_statistics("e2s")
-                with gr.Column(scale=1, min_width=100):
-                    layout_for_statistics("s2e")
-                with gr.Column(scale=1, min_width=100):
-                    layout_for_statistics("e2s_s2e")
-                with gr.Column(scale=1, min_width=100):
-                    layout_for_statistics("synthetic")
-                with gr.Column(scale=1, min_width=100):
-                    layout_for_statistics("all")
-
-            # gr.Markdown(f"### Student t-test (p-value)")
-            # with gr.Row():
-            #     with gr.Column(scale=1, min_width=100):
-            #         layout_for_statistics_t_test("manual")
-            #     with gr.Column(scale=1, min_width=100):
-            #         layout_for_statistics_t_test("e2s")
-            #     with gr.Column(scale=1, min_width=100):
-            #         layout_for_statistics_t_test("s2e")
-            #     with gr.Column(scale=1, min_width=100):
-            #         layout_for_statistics_t_test("e2s_s2e")
-            #     with gr.Column(scale=1, min_width=100):
-            #         layout_for_statistics_t_test("synthetic")
-            #     with gr.Column(scale=1, min_width=100):
-            #         layout_for_statistics_t_test("all")
-
-            with gr.Row():
-                with gr.Column(scale=1):
-                    for stat_name in filter(lambda s: "_norm" not in s, STAT_NAMES):
-                        chart = dataset_statistics.build_plotly_chart(
-                            stat_golden=STATISTICS['manual'][stat_name],
-                            stat_e2s=STATISTICS['e2s'][stat_name],
-                            stat_s2e=STATISTICS['s2e'][stat_name],
-                            stat_e2s_s2e=STATISTICS['e2s_s2e'][stat_name],
-                            stat_name=stat_name
-                        )
-
-                        gr.Plot(value=chart)
-                with gr.Column(scale=1):
-                    with gr.Column(scale=1):
-                        for stat_name in filter(lambda s: "_norm" in s, STAT_NAMES):
-                            chart = dataset_statistics.build_plotly_chart(
-                                stat_golden=STATISTICS['manual'][stat_name],
-                                stat_e2s=STATISTICS['e2s'][stat_name],
-                                stat_s2e=STATISTICS['s2e'][stat_name],
-                                stat_e2s_s2e=STATISTICS['e2s_s2e'][stat_name],
-                                stat_name=stat_name
-                            )
-
-                            gr.Plot(value=chart)
-
-            gr.Markdown(f"### Metrics correlations")
-            gr.Markdown(value=analysis_util.get_correlations_for_groups(df_synthetic, right_side="aggr").to_markdown())
 
     application.load(update_dataset_view_manual, inputs=slider_manual,
                      outputs=view_manual)
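Note: the new golden/backward/forward/forward_from_backward selectors filter on the G_type/E_type schema instead of the old boolean flags. A toy frame showing what the golden selector keeps (rows are invented; the type values mirror the ones above):

import pandas as pd

df_synthetic = pd.DataFrame({
    "is_related": [True, True, True, False],
    "G_type": ["initial", "initial", "synthetic_backward", "initial"],
    "E_type": ["expert_labeled", "synthetic_forward",
               "synthetic_forward_from_backward", "expert_labeled"],
})

# Chained .loc: the first call drops unrelated rows; the second applies a
# boolean mask built on the full frame, which pandas aligns by index.
golden = df_synthetic.loc[df_synthetic.is_related].loc[
    (df_synthetic["G_type"] == "initial") & (df_synthetic["E_type"] == "expert_labeled")]
print(golden.index.tolist())  # [0] -- row 3 also matches the types but fails is_related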
dataset_statistics.py CHANGED
@@ -29,14 +29,20 @@ def get_statistics_for_sample(start_msg, end_msg, row=None):
         "changes_norm": n_changes / len(end_msg),
 
         "lendiff": abs(len(start_msg) - len(end_msg)),
-        "editdist": row["editdist_related"] if row is not None else Levenshtein.distance(start_msg, end_msg),
+        "editdist": row["editdist"] if row is not None else Levenshtein.distance(start_msg, end_msg),
     }
 
 
 def get_statistics_for_row(row):
-    start_msg = row["commit_msg_start"]
-    end_msg = row["commit_msg_end"]
-    return get_statistics_for_sample(start_msg, end_msg, row=row)
+    if "commit_msg_start" in row:
+        start = row['commit_msg_start']
+    else:
+        start = row["G_text"]
+    if "commit_msg_end" in row:
+        end = row['commit_msg_end']
+    else:
+        end = row["E_text"]
+    return get_statistics_for_sample(start, end, row=row)
 
 
 def get_statistics_for_df(df: pd.DataFrame):
@@ -63,17 +69,3 @@ def build_plotly_chart(stat_golden, stat_e2s, stat_s2e, stat_e2s_s2e, stat_name)
         pickle.dump(hist_data, f)
 
     return fig
-
-
-def t_test(group_stats, main_group="manual"):
-    results = {}
-    for group in group_stats:
-        results[group] = {}
-        for stat in group_stats[group]:
-            a = group_stats[main_group][stat]
-            b = group_stats[group][stat]
-
-            p = stats.ttest_ind(a, b, equal_var=False, random_state=config.RANDOM_STATE).pvalue
-            results[group][stat] = p
-
-    return results
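Note: the fallback above leans on the fact that, for a pandas row (pd.Series), `in` tests index membership, i.e. column presence. A quick check (column names mirror the ones above, data invented):

import pandas as pd

# A row from the synthetic frame: no commit_msg_* columns, only G_text/E_text.
row = pd.Series({"G_text": "fix typo", "E_text": "Fix typo in README"})
start = row["commit_msg_start"] if "commit_msg_start" in row else row["G_text"]
print(start)  # fix typo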
generate_annotated_diffs.py CHANGED
@@ -21,8 +21,14 @@ def get_annotated_diff(start_text, end_text):
 
 
 def annotated_diff_for_row(row):
-    start = row['commit_msg_start']
-    end = row['commit_msg_end']
+    if "commit_msg_start" in row:
+        start = row['commit_msg_start']
+    else:
+        start = row["G_text"]
+    if "commit_msg_end" in row:
+        end = row['commit_msg_end']
+    else:
+        end = row["E_text"]
     return get_annotated_diff(start, end)
 
 
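Note: get_annotated_diff itself is outside this hunk; purely as an illustration of the start/end message diffing it is fed, here is a rough difflib-based stand-in (the markup format is an assumption, not the app's actual output):

import difflib

def sketch_annotated_diff(start_text: str, end_text: str) -> str:
    a, b = start_text.split(), end_text.split()
    out = []
    for tag, i1, i2, j1, j2 in difflib.SequenceMatcher(a=a, b=b).get_opcodes():
        if tag in ("delete", "replace"):
            out.append("[-" + " ".join(a[i1:i2]) + "-]")  # removed from the start message
        if tag in ("insert", "replace"):
            out.append("{+" + " ".join(b[j1:j2]) + "+}")  # added in the end message
        if tag == "equal":
            out.append(" ".join(a[i1:i2]))
    return " ".join(out)

print(sketch_annotated_diff("fix typo", "Fix typo in README"))
# [-fix-] {+Fix+} typo {+in README+}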