Zekun Wu committed
Commit a41445d
1 Parent(s): cd0fc76
Files changed (1):
  1. util/evaluation.py +0 -187
util/evaluation.py CHANGED
@@ -10,68 +10,6 @@ from scipy.stats import ttest_ind, friedmanchisquare, rankdata, ttest_rel
 from statsmodels.stats.multicomp import pairwise_tukeyhsd
 from scipy.stats import ttest_1samp
 
-# def bootstrap_t_test(data1, data2, num_bootstrap=1000):
-#     """Perform a bootstrapped t-test."""
-#     observed_t_stat, _ = ttest_ind(data1, data2)
-#     combined = np.concatenate([data1, data2])
-#     t_stats = []
-#
-#     for _ in range(num_bootstrap):
-#         np.random.shuffle(combined)
-#         new_data1 = combined[:len(data1)]
-#         new_data2 = combined[len(data1):]
-#         t_stat, _ = ttest_ind(new_data1, new_data2)
-#         t_stats.append(t_stat)
-#
-#     p_value = np.sum(np.abs(t_stats) >= np.abs(observed_t_stat)) / num_bootstrap
-#     return observed_t_stat, p_value
-
-
-# def bootstrap_t_test(data1, data2, num_bootstrap=1000):
-#     """Perform a bootstrapped paired t-test for mean difference being zero."""
-#     # Calculate the observed differences between paired samples
-#     differences = data1 - data2
-#     # Compute the observed t-statistic for the differences
-#     observed_t_stat, _ = ttest_1samp(differences, 0)
-#
-#     t_stats = []
-#
-#     for _ in range(num_bootstrap):
-#         # Resample the differences with replacement
-#         resampled_diffs = np.random.choice(differences, size=len(differences), replace=True)
-#         # Perform a one-sample t-test on the resampled differences against zero
-#         t_stat, _ = ttest_1samp(resampled_diffs, 0)
-#         # Append the t-statistic to the list
-#         t_stats.append(t_stat)
-#
-#     # Calculate the p-value as the proportion of bootstrap t-statistics
-#     # that are as extreme as or more extreme than the observed t-statistic
-#     p_value = np.sum(np.abs(t_stats) >= np.abs(observed_t_stat)) / num_bootstrap
-#     return observed_t_stat, p_value
-
-# def posthoc_friedman(data, variables, rank_suffix='_Rank'):
-#     """Perform a post-hoc analysis for the Friedman test using pairwise comparisons."""
-#     ranked_data = data[[v + rank_suffix for v in variables]].to_numpy()
-#     num_subjects = ranked_data.shape[0]
-#     num_conditions = ranked_data.shape[1]
-#     comparisons = []
-#
-#     for i in range(num_conditions):
-#         for j in range(i + 1, num_conditions):
-#             diff = ranked_data[:, i] - ranked_data[:, j]
-#             abs_diff = np.abs(diff)
-#             avg_diff = np.mean(diff)
-#             se_diff = np.std(diff, ddof=1) / np.sqrt(num_subjects)
-#             z_value = avg_diff / se_diff
-#             p_value = 2 * (1 - stats.norm.cdf(np.abs(z_value)))
-#             comparisons.append({
-#                 "Group1": variables[i],
-#                 "Group2": variables[j],
-#                 "Z": z_value,
-#                 "p-value": p_value
-#             })
-#
-#     return comparisons
 
 def statistical_tests(data):
     """Perform various statistical tests to evaluate potential biases."""
@@ -108,11 +46,6 @@ def statistical_tests(data):
             wilcoxon_stat, wilcoxon_p = np.nan, "Sample size too small for Wilcoxon test."
         pairwise_results['Wilcoxon Test'][pair_rank_score] = {"Statistic": wilcoxon_stat, "p-value": wilcoxon_p}
 
-
-        # # Bootstrapped T-test for independent samples
-        # t_stat, t_p = bootstrap_t_test(data[f'{var1}{rank_suffix}'], data[f'{var2}{rank_suffix}'])
-        # pairwise_results['T-Test'][pair_rank_score] = {"Statistic": t_stat, "p-value": t_p}
-
     # Friedman test
     friedman_stat, friedman_p = friedmanchisquare(*rank_data)
 
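Reviewer note (annotation, not part of the commit): friedmanchisquare, which the commit keeps, takes one sample per condition, matched by subject, and needs at least three conditions. A self-contained usage sketch with synthetic stand-ins for the three rank columns used in this file:

import numpy as np
from scipy.stats import friedmanchisquare

rng = np.random.default_rng(42)
# Synthetic per-subject ranks for three conditions (30 subjects each); illustrative only.
privilege_rank = rng.uniform(1, 3, size=30)
protect_rank = rng.uniform(1, 3, size=30)
neutral_rank = rng.uniform(1, 3, size=30)

stat, p = friedmanchisquare(privilege_rank, protect_rank, neutral_rank)
print(f"Friedman chi-square: {stat:.3f}, p-value: {p:.3f}")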
@@ -183,123 +116,3 @@ def calculate_divergences(df):
             divergences['Hellinger Distance'][f'{col1} vs {col2}'] = hellinger_distance(probabilities[col1],
                                                                                         probabilities[col2])
     return divergences
-
-# def statistical_tests(data):
-#     """Perform various statistical tests to evaluate potential biases."""
-#     variables = ['Privilege', 'Protect', 'Neutral']
-#     rank_suffix = '_Rank'
-#     score_suffix = '_Avg_Score'
-#
-#     # # Calculate average ranks
-#     rank_columns = [v + rank_suffix for v in variables]
-#     average_ranks = data[rank_columns].mean()
-#
-#     # Statistical tests
-#     rank_data = [data[col] for col in rank_columns]
-#
-#     # Pairwise tests
-#     pairs = [
-#         ('Privilege', 'Protect'),
-#         ('Protect', 'Neutral'),
-#         ('Privilege', 'Neutral')
-#     ]
-#
-#     pairwise_results = {
-#         'T-Test': {}
-#     }
-#
-#     for (var1, var2) in pairs:
-#         pair_name_score = f'{var1}{score_suffix} vs {var2}{score_suffix}'
-#
-#         # T-test for independent samples
-#         t_stat, t_p = ttest_ind(data[f'{var1}{score_suffix}'], data[f'{var2}{score_suffix}'])
-#         pairwise_results['T-Test'][pair_name_score] = {"Statistic": t_stat, "p-value": t_p}
-#
-#     results = {
-#         "Average Ranks": average_ranks.to_dict(),
-#         "Friedman Test": {
-#             "Statistic": friedmanchisquare(*rank_data).statistic,
-#             "p-value": friedmanchisquare(*rank_data).pvalue
-#         },
-#         **pairwise_results,
-#     }
-#
-#     return results
-
-def disabled_statistical_tests(data):
-    """Perform various statistical tests to evaluate potential biases."""
-    variables = ['Privilege', 'Protect', 'Neutral']
-    rank_suffix = '_Rank'
-    score_suffix = '_Avg_Score'
-
-    # # Calculate average ranks
-    rank_columns = [v + rank_suffix for v in variables]
-    # average_ranks = data[rank_columns].mean()
-
-    # Statistical tests
-    rank_data = [data[col] for col in rank_columns]
-    kw_stat, kw_p = kruskal(*rank_data)
-
-    # Pairwise tests
-    pairwise_results = {}
-    pairs = [
-        ('Privilege', 'Protect'),
-        ('Protect', 'Neutral'),
-        ('Privilege', 'Neutral')
-    ]
-
-    pairwise_results = {
-        # 'Mann-Whitney U Test': {},
-        # 'Wilcoxon Test': {},
-        # 'Levene\'s Test': {},
-        'T-Test': {}
-    }
-
-    for (var1, var2) in pairs:
-        pair_name_rank = f'{var1}{rank_suffix} vs {var2}{rank_suffix}'
-        pair_name_score = f'{var1}{score_suffix} vs {var2}{score_suffix}'
-
-        # # Mann-Whitney U Test
-        # mw_stat, mw_p = mannwhitneyu(data[f'{var1}{rank_suffix}'], data[f'{var2}{rank_suffix}'])
-        # pairwise_results['Mann-Whitney U Test'][pair_name_rank] = {"Statistic": mw_stat, "p-value": mw_p}
-        #
-        # # Wilcoxon Signed-Rank Test
-        # if len(data) > 20:
-        #     wilcoxon_stat, wilcoxon_p = wilcoxon(data[f'{var1}{rank_suffix}'], data[f'{var2}{rank_suffix}'])
-        # else:
-        #     wilcoxon_stat, wilcoxon_p = np.nan, "Sample size too small for Wilcoxon test."
-        # pairwise_results['Wilcoxon Test'][pair_name_rank] = {"Statistic": wilcoxon_stat, "p-value": wilcoxon_p}
-        #
-        # Levene's Test for equality of variances
-        # levene_stat, levene_p = levene(data[f'{var1}{score_suffix}'], data[f'{var2}{score_suffix}'])
-        # pairwise_results['Levene\'s Test'][pair_name_score] = {"Statistic": levene_stat, "p-value": levene_p}
-
-        # T-test for independent samples
-        t_stat, t_p = ttest_ind(data[f'{var1}{score_suffix}'], data[f'{var2}{score_suffix}'])
-        # equal_var=(levene_p > 0.05))
-        pairwise_results['T-Test'][pair_name_score] = {"Statistic": t_stat, "p-value": t_p}
-
-    # ANOVA and post-hoc tests if applicable
-    # score_columns = [v + score_suffix for v in variables]
-    # score_data = [data[col] for col in score_columns]
-    # anova_stat, anova_p = f_oneway(*score_data)
-    # if anova_p < 0.05:
-    #     mc = MultiComparison(data.melt()['value'], data.melt()['variable'])
-    #     tukey_result = mc.tukeyhsd()
-    #     tukey_result_summary = tukey_result.summary().as_html()
-    # else:
-    #     tukey_result_summary = "ANOVA not significant, no post-hoc test performed."
-
-    results = {
-        # "Average Ranks": average_ranks.to_dict(),
-        "Friedman Test": {
-            "Statistic": friedmanchisquare(*rank_data).statistic,
-            "p-value": friedmanchisquare(*rank_data).pvalue
-        },
-        # "Kruskal-Wallis Test": {"Statistic": kw_stat, "p-value": kw_p},
-        **pairwise_results,
-        # "ANOVA Test": {"Statistic": anova_stat, "p-value": anova_p},
-        # "Tukey HSD Test": tukey_result_summary
-    }
-
-    return results
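
Reviewer note (annotation, not part of the commit): the retained calculate_divergences context above calls a hellinger_distance helper defined outside this hunk. For reference, the standard definition it presumably implements, assuming two discrete probability vectors over the same support:

import numpy as np

def hellinger_distance(p, q):
    """Hellinger distance: 0 for identical distributions, 1 for disjoint ones."""
    p = np.asarray(p, dtype=float)
    q = np.asarray(q, dtype=float)
    return np.sqrt(np.sum((np.sqrt(p) - np.sqrt(q)) ** 2)) / np.sqrt(2)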
 
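Reviewer note (annotation, not part of the commit): the commented-out equal_var=(levene_p > 0.05) fragment in the deleted disabled_statistical_tests hints at gating the t-test on Levene's variance check. A sketch of that idea, assuming two 1-D score arrays (the name levene_gated_t_test is hypothetical):

from scipy.stats import levene, ttest_ind

def levene_gated_t_test(scores1, scores2, alpha=0.05):
    """Choose Student's vs Welch's t-test based on Levene's test for equal variances."""
    levene_stat, levene_p = levene(scores1, scores2)
    # Pooled-variance (Student's) t-test only if Levene finds no variance difference;
    # otherwise equal_var=False selects Welch's t-test.
    t_stat, t_p = ttest_ind(scores1, scores2, equal_var=(levene_p > alpha))
    return {"Levene p-value": levene_p, "Statistic": t_stat, "p-value": t_p}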