Petr Tsvetkov committed on
Commit
7ab7be2
•
1 Parent(s): 827777f

Generate charts for the presentation & diploma; some refactoring; add (commented) Student's t-test

Browse files
change_visualizer.py CHANGED
@@ -2,7 +2,7 @@ import gradio as gr
2
 
3
  import analysis_util
4
  import generate_annotated_diffs
5
- import statistics
6
 
7
  df_manual = generate_annotated_diffs.manual_data_with_annotated_diffs()
8
  df_manual["end_to_start"] = False
@@ -33,12 +33,14 @@ def synthetic():
33
  return df_synthetic[(df_synthetic['end_to_start'] == True) | (df_synthetic['start_to_end'] == True)]
34
 
35
 
36
- STATISTICS = {"manual": statistics.get_statistics_for_df(df_manual),
37
- "e2s": statistics.get_statistics_for_df(e2s()),
38
- "s2e": statistics.get_statistics_for_df(s2e()),
39
- "e2s_s2e": statistics.get_statistics_for_df(e2s_s2e()),
40
- "synthetic": statistics.get_statistics_for_df(synthetic()),
41
- "all": statistics.get_statistics_for_df(df_synthetic)}
 
 
42
 
43
  STAT_NAMES = list(STATISTICS['manual'].keys())
44
 
@@ -135,6 +137,23 @@ if __name__ == '__main__':
135
  value=stats['changes'].mean().item(), precision=3, min_width=00)
136
 
137
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  with gr.Row():
139
  with gr.Column(scale=1, min_width=100):
140
  layout_for_statistics("manual")
@@ -149,10 +168,25 @@ if __name__ == '__main__':
149
  with gr.Column(scale=1, min_width=100):
150
  layout_for_statistics("all")
151
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
152
  with gr.Row():
153
  with gr.Column(scale=1):
154
  for stat_name in filter(lambda s: "_norm" not in s, STAT_NAMES):
155
- chart = statistics.build_plotly_chart(
156
  stat_golden=STATISTICS['manual'][stat_name],
157
  stat_e2s=STATISTICS['e2s'][stat_name],
158
  stat_s2e=STATISTICS['s2e'][stat_name],
@@ -164,7 +198,7 @@ if __name__ == '__main__':
164
  with gr.Column(scale=1):
165
  with gr.Column(scale=1):
166
  for stat_name in filter(lambda s: "_norm" in s, STAT_NAMES):
167
- chart = statistics.build_plotly_chart(
168
  stat_golden=STATISTICS['manual'][stat_name],
169
  stat_e2s=STATISTICS['e2s'][stat_name],
170
  stat_s2e=STATISTICS['s2e'][stat_name],
 
2
 
3
  import analysis_util
4
  import generate_annotated_diffs
5
+ import dataset_statistics
6
 
7
  df_manual = generate_annotated_diffs.manual_data_with_annotated_diffs()
8
  df_manual["end_to_start"] = False
 
33
  return df_synthetic[(df_synthetic['end_to_start'] == True) | (df_synthetic['start_to_end'] == True)]
34
 
35
 
36
+ STATISTICS = {"manual": dataset_statistics.get_statistics_for_df(df_manual),
37
+ "e2s": dataset_statistics.get_statistics_for_df(e2s()),
38
+ "s2e": dataset_statistics.get_statistics_for_df(s2e()),
39
+ "e2s_s2e": dataset_statistics.get_statistics_for_df(e2s_s2e()),
40
+ "synthetic": dataset_statistics.get_statistics_for_df(synthetic()),
41
+ "all": dataset_statistics.get_statistics_for_df(df_synthetic)}
42
+
43
+ STATISTICS_T_TEST = dataset_statistics.t_test(STATISTICS, main_group='manual')
44
 
45
  STAT_NAMES = list(STATISTICS['manual'].keys())
46
 
 
137
  value=stats['changes'].mean().item(), precision=3, min_width=00)
138
 
139
 
140
+ def layout_for_statistics_t_test(statistics_group_name):
141
+ gr.Markdown(f"### {statistics_group_name}")
142
+ stats = STATISTICS_T_TEST[statistics_group_name]
143
+ gr.Number(label="Deletions number (rel to the initial msg length)", interactive=False,
144
+ value=stats['deletions_norm'], precision=3, min_width=00)
145
+ gr.Number(label="Insertions number (rel to the result length)", interactive=False,
146
+ value=stats['insertions_norm'], precision=3, min_width=00)
147
+ gr.Number(label="Changes number (rel to the initial msg length)", interactive=False,
148
+ value=stats['changes_norm'], precision=3, min_width=00)
149
+ gr.Number(label="Deletions number", interactive=False,
150
+ value=stats['deletions'], precision=3, min_width=00)
151
+ gr.Number(label="Insertions number", interactive=False,
152
+ value=stats['insertions'], precision=3, min_width=00)
153
+ gr.Number(label="Changes number", interactive=False,
154
+ value=stats['changes'], precision=3, min_width=00)
155
+
156
+
157
  with gr.Row():
158
  with gr.Column(scale=1, min_width=100):
159
  layout_for_statistics("manual")
 
168
  with gr.Column(scale=1, min_width=100):
169
  layout_for_statistics("all")
170
 
171
+ # gr.Markdown(f"### Student t-test (p-value)")
172
+ # with gr.Row():
173
+ # with gr.Column(scale=1, min_width=100):
174
+ # layout_for_statistics_t_test("manual")
175
+ # with gr.Column(scale=1, min_width=100):
176
+ # layout_for_statistics_t_test("e2s")
177
+ # with gr.Column(scale=1, min_width=100):
178
+ # layout_for_statistics_t_test("s2e")
179
+ # with gr.Column(scale=1, min_width=100):
180
+ # layout_for_statistics_t_test("e2s_s2e")
181
+ # with gr.Column(scale=1, min_width=100):
182
+ # layout_for_statistics_t_test("synthetic")
183
+ # with gr.Column(scale=1, min_width=100):
184
+ # layout_for_statistics_t_test("all")
185
+
186
  with gr.Row():
187
  with gr.Column(scale=1):
188
  for stat_name in filter(lambda s: "_norm" not in s, STAT_NAMES):
189
+ chart = dataset_statistics.build_plotly_chart(
190
  stat_golden=STATISTICS['manual'][stat_name],
191
  stat_e2s=STATISTICS['e2s'][stat_name],
192
  stat_s2e=STATISTICS['s2e'][stat_name],
 
198
  with gr.Column(scale=1):
199
  with gr.Column(scale=1):
200
  for stat_name in filter(lambda s: "_norm" in s, STAT_NAMES):
201
+ chart = dataset_statistics.build_plotly_chart(
202
  stat_golden=STATISTICS['manual'][stat_name],
203
  stat_e2s=STATISTICS['e2s'][stat_name],
204
  stat_s2e=STATISTICS['s2e'][stat_name],
chart_processing.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
config.py CHANGED
@@ -35,3 +35,6 @@ START_TO_END_ARTIFACT = OUTPUT_DIR / "start_to_end.csv"
35
  SYNTHETIC_DATASET_ARTIFACT = OUTPUT_DIR / "synthetic.csv"
36
  METRICS_CORRELATIONS_ARTIFACT = OUTPUT_DIR / "metrics_correlations.csv"
37
  DATA_FOR_LABELING_ARTIFACT = OUTPUT_DIR / "data_for_labeling.csv"
 
 
 
 
35
  SYNTHETIC_DATASET_ARTIFACT = OUTPUT_DIR / "synthetic.csv"
36
  METRICS_CORRELATIONS_ARTIFACT = OUTPUT_DIR / "metrics_correlations.csv"
37
  DATA_FOR_LABELING_ARTIFACT = OUTPUT_DIR / "data_for_labeling.csv"
38
+
39
+ OUTPUT_CHARTS_DIR = OUTPUT_DIR / "charts"
40
+ OUTPUT_CHARTS_DIR.mkdir(exist_ok=True)
statistics.py → dataset_statistics.py RENAMED
@@ -1,7 +1,12 @@
 
 
1
  import Levenshtein
2
  import numpy as np
3
  import pandas as pd
4
  import plotly.figure_factory as ff
 
 
 
5
 
6
 
7
  def get_statistics(start_msg, end_msg, annotated_msg):
@@ -38,11 +43,28 @@ def build_plotly_chart(stat_golden, stat_e2s, stat_s2e, stat_e2s_s2e, stat_name)
38
  hist_data = [stat_golden, stat_e2s, stat_s2e, stat_e2s_s2e,
39
  np.concatenate((stat_e2s, stat_s2e, stat_e2s_s2e), axis=0)]
40
 
41
- group_labels = ['Golden', 'e2s', 's2e', 'e2s+s 2e', 'Synthetic']
42
 
43
  fig = ff.create_distplot(hist_data, group_labels,
44
- bin_size=.1, show_rug=False, show_hist=False)
45
 
46
  fig.update_layout(title_text=stat_name)
47
 
 
 
 
48
  return fig
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pickle
2
+
3
  import Levenshtein
4
  import numpy as np
5
  import pandas as pd
6
  import plotly.figure_factory as ff
7
+ from scipy.stats import stats
8
+
9
+ import config
10
 
11
 
12
  def get_statistics(start_msg, end_msg, annotated_msg):
 
43
  hist_data = [stat_golden, stat_e2s, stat_s2e, stat_e2s_s2e,
44
  np.concatenate((stat_e2s, stat_s2e, stat_e2s_s2e), axis=0)]
45
 
46
+ group_labels = ['Golden', 'e2s', 's2e', 'e2s+s2e', 'Synthetic']
47
 
48
  fig = ff.create_distplot(hist_data, group_labels,
49
+ bin_size=.05, show_rug=False, show_hist=False)
50
 
51
  fig.update_layout(title_text=stat_name)
52
 
53
+ with open(config.OUTPUT_CHARTS_DIR / f"{stat_name}_data.pkl", "wb") as f:
54
+ pickle.dump(hist_data, f)
55
+
56
  return fig
57
+
58
+
59
+ def t_test(group_stats, main_group="manual"):
60
+ results = {}
61
+ for group in group_stats:
62
+ results[group] = {}
63
+ for stat in group_stats[group]:
64
+ a = group_stats[main_group][stat]
65
+ b = group_stats[group][stat]
66
+
67
+ p = stats.ttest_ind(a, b, equal_var=False, random_state=config.RANDOM_STATE).pvalue
68
+ results[group][stat] = p
69
+
70
+ return results
generation_steps/synthetic_end_to_start.py CHANGED
@@ -5,7 +5,7 @@ from tqdm import tqdm
5
 
6
  import config
7
  import generate_annotated_diffs
8
- import statistics
9
  from api_wrappers import grazie_wrapper, hf_data_loader
10
  from generation_steps import examples
11
 
 
5
 
6
  import config
7
  import generate_annotated_diffs
8
+ import dataset_statistics
9
  from api_wrappers import grazie_wrapper, hf_data_loader
10
  from generation_steps import examples
11
 
generation_steps/synthetic_start_to_end.py CHANGED
@@ -3,7 +3,7 @@ from tqdm import tqdm
3
 
4
  import config
5
  import generate_annotated_diffs
6
- import statistics
7
  from api_wrappers import grazie_wrapper
8
  from generation_steps import examples
9
 
 
3
 
4
  import config
5
  import generate_annotated_diffs
6
+ import dataset_statistics
7
  from api_wrappers import grazie_wrapper
8
  from generation_steps import examples
9
 
requirements.txt CHANGED
@@ -163,4 +163,5 @@ zipp==3.18.1
163
 
164
  plotly==5.22.0
165
  tenacity==8.2.3
166
- Levenshtein==0.25.1
 
 
163
 
164
  plotly==5.22.0
165
  tenacity==8.2.3
166
+ Levenshtein==0.25.1
167
+ kaleido==0.2.1