Zekun Wu committed on
Commit
ae16dbc
1 Parent(s): 7a70a60
Files changed (2) hide show
  1. requirements.txt +2 -1
  2. util/evaluation.py +51 -42
requirements.txt CHANGED
@@ -2,4 +2,5 @@ openai
2
  pandas
3
  tqdm
4
  scipy
5
- statsmodels
 
 
2
  pandas
3
  tqdm
4
  scipy
5
+ statsmodels
6
+ scikit-posthocs
util/evaluation.py CHANGED
@@ -8,22 +8,23 @@ from scipy.spatial.distance import jensenshannon
8
  from scipy.stats import ttest_ind, friedmanchisquare, rankdata, ttest_rel
9
  from statsmodels.stats.multicomp import pairwise_tukeyhsd
10
  from scipy.stats import ttest_1samp
 
11
 
12
- def bootstrap_t_test(data1, data2, num_bootstrap=1000):
13
- """Perform a bootstrapped t-test."""
14
- observed_t_stat, _ = ttest_ind(data1, data2)
15
- combined = np.concatenate([data1, data2])
16
- t_stats = []
17
-
18
- for _ in range(num_bootstrap):
19
- np.random.shuffle(combined)
20
- new_data1 = combined[:len(data1)]
21
- new_data2 = combined[len(data1):]
22
- t_stat, _ = ttest_ind(new_data1, new_data2)
23
- t_stats.append(t_stat)
24
-
25
- p_value = np.sum(np.abs(t_stats) >= np.abs(observed_t_stat)) / num_bootstrap
26
- return observed_t_stat, p_value
27
 
28
 
29
  # def bootstrap_t_test(data1, data2, num_bootstrap=1000):
@@ -48,30 +49,33 @@ def bootstrap_t_test(data1, data2, num_bootstrap=1000):
48
  # p_value = np.sum(np.abs(t_stats) >= np.abs(observed_t_stat)) / num_bootstrap
49
  # return observed_t_stat, p_value
50
 
51
- def posthoc_friedman(data, variables, rank_suffix='_Rank'):
52
- """Perform a post-hoc analysis for the Friedman test using pairwise comparisons."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
53
  ranked_data = data[[v + rank_suffix for v in variables]].to_numpy()
54
- num_subjects = ranked_data.shape[0]
55
- num_conditions = ranked_data.shape[1]
56
- comparisons = []
57
-
58
- for i in range(num_conditions):
59
- for j in range(i + 1, num_conditions):
60
- diff = ranked_data[:, i] - ranked_data[:, j]
61
- abs_diff = np.abs(diff)
62
- avg_diff = np.mean(diff)
63
- se_diff = np.std(diff, ddof=1) / np.sqrt(num_subjects)
64
- z_value = avg_diff / se_diff
65
- p_value = 2 * (1 - stats.norm.cdf(np.abs(z_value)))
66
- comparisons.append({
67
- "Group1": variables[i],
68
- "Group2": variables[j],
69
- "Z": z_value,
70
- "p-value": p_value
71
- })
72
-
73
- return comparisons
74
-
75
  def statistical_tests(data):
76
  """Perform various statistical tests to evaluate potential biases."""
77
  variables = ['Privilege', 'Protect', 'Neutral']
@@ -96,17 +100,22 @@ def statistical_tests(data):
96
  'T-Test': {}
97
  }
98
 
 
 
 
 
99
  for (var1, var2) in pairs:
100
  pair_name_score = f'{var1}{score_suffix} vs {var2}{score_suffix}'
101
  pair_rank_score = f'{var1}{rank_suffix} vs {var2}{rank_suffix}'
102
 
103
- # Bootstrapped T-test for independent samples
104
- t_stat, t_p = bootstrap_t_test(data[f'{var1}{rank_suffix}'], data[f'{var2}{rank_suffix}'])
105
- pairwise_results['T-Test'][pair_rank_score] = {"Statistic": t_stat, "p-value": t_p}
 
106
 
107
  # Friedman test
108
  friedman_stat, friedman_p = friedmanchisquare(*rank_data)
109
- posthoc_results = posthoc_friedman(data, variables, rank_suffix)
110
 
111
  results = {
112
  "Average Ranks": average_ranks.to_dict(),
 
8
  from scipy.stats import ttest_ind, friedmanchisquare, rankdata, ttest_rel
9
  from statsmodels.stats.multicomp import pairwise_tukeyhsd
10
  from scipy.stats import ttest_1samp
11
+ from scikit_posthocs import posthoc_nemenyi
12
 
13
+ # def bootstrap_t_test(data1, data2, num_bootstrap=1000):
14
+ # """Perform a bootstrapped t-test."""
15
+ # observed_t_stat, _ = ttest_ind(data1, data2)
16
+ # combined = np.concatenate([data1, data2])
17
+ # t_stats = []
18
+ #
19
+ # for _ in range(num_bootstrap):
20
+ # np.random.shuffle(combined)
21
+ # new_data1 = combined[:len(data1)]
22
+ # new_data2 = combined[len(data1):]
23
+ # t_stat, _ = ttest_ind(new_data1, new_data2)
24
+ # t_stats.append(t_stat)
25
+ #
26
+ # p_value = np.sum(np.abs(t_stats) >= np.abs(observed_t_stat)) / num_bootstrap
27
+ # return observed_t_stat, p_value
28
 
29
 
30
  # def bootstrap_t_test(data1, data2, num_bootstrap=1000):
 
49
  # p_value = np.sum(np.abs(t_stats) >= np.abs(observed_t_stat)) / num_bootstrap
50
  # return observed_t_stat, p_value
51
 
52
+ # def posthoc_friedman(data, variables, rank_suffix='_Rank'):
53
+ # """Perform a post-hoc analysis for the Friedman test using pairwise comparisons."""
54
+ # ranked_data = data[[v + rank_suffix for v in variables]].to_numpy()
55
+ # num_subjects = ranked_data.shape[0]
56
+ # num_conditions = ranked_data.shape[1]
57
+ # comparisons = []
58
+ #
59
+ # for i in range(num_conditions):
60
+ # for j in range(i + 1, num_conditions):
61
+ # diff = ranked_data[:, i] - ranked_data[:, j]
62
+ # abs_diff = np.abs(diff)
63
+ # avg_diff = np.mean(diff)
64
+ # se_diff = np.std(diff, ddof=1) / np.sqrt(num_subjects)
65
+ # z_value = avg_diff / se_diff
66
+ # p_value = 2 * (1 - stats.norm.cdf(np.abs(z_value)))
67
+ # comparisons.append({
68
+ # "Group1": variables[i],
69
+ # "Group2": variables[j],
70
+ # "Z": z_value,
71
+ # "p-value": p_value
72
+ # })
73
+ #
74
+ # return comparisons
75
+ def posthoc_friedman_nemenyi(data, variables, rank_suffix='_Rank'):
76
+ """Perform post-hoc Nemenyi test for the Friedman test."""
77
  ranked_data = data[[v + rank_suffix for v in variables]].to_numpy()
78
+ return posthoc_nemenyi(ranked_data)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
  def statistical_tests(data):
80
  """Perform various statistical tests to evaluate potential biases."""
81
  variables = ['Privilege', 'Protect', 'Neutral']
 
100
  'T-Test': {}
101
  }
102
 
103
+ pairwise_results = {
104
+ 'Wilcoxon Signed-Rank Test': {}
105
+ }
106
+
107
  for (var1, var2) in pairs:
108
  pair_name_score = f'{var1}{score_suffix} vs {var2}{score_suffix}'
109
  pair_rank_score = f'{var1}{rank_suffix} vs {var2}{rank_suffix}'
110
 
111
+ # Wilcoxon signed-rank test for pairwise comparisons
112
+ wilcoxon_stat, wilcoxon_p = wilcoxon(data[f'{var1}{score_suffix}'], data[f'{var2}{score_suffix}'])
113
+ pairwise_results['Wilcoxon Signed-Rank Test'][pair_name_score] = {"Statistic": wilcoxon_stat,
114
+ "p-value": wilcoxon_p}
115
 
116
  # Friedman test
117
  friedman_stat, friedman_p = friedmanchisquare(*rank_data)
118
+ posthoc_results = posthoc_friedman_nemenyi(data, variables, rank_suffix)
119
 
120
  results = {
121
  "Average Ranks": average_ranks.to_dict(),