Spaces:
Running
Running
Zekun Wu
committed on
Commit
•
ae16dbc
1
Parent(s):
7a70a60
update
Browse files
- requirements.txt +2 -1
- util/evaluation.py +51 -42
requirements.txt
CHANGED
@@ -2,4 +2,5 @@ openai
|
|
2 |
pandas
|
3 |
tqdm
|
4 |
scipy
|
5 |
-
statsmodels
|
|
|
|
2 |
pandas
|
3 |
tqdm
|
4 |
scipy
|
5 |
+
statsmodels
|
6 |
+
scikit-posthocs
|
util/evaluation.py
CHANGED
@@ -8,22 +8,23 @@ from scipy.spatial.distance import jensenshannon
|
|
8 |
from scipy.stats import ttest_ind, friedmanchisquare, rankdata, ttest_rel
|
9 |
from statsmodels.stats.multicomp import pairwise_tukeyhsd
|
10 |
from scipy.stats import ttest_1samp
|
|
|
11 |
|
12 |
-
def bootstrap_t_test(data1, data2, num_bootstrap=1000):
|
13 |
-
|
14 |
-
|
15 |
-
|
16 |
-
|
17 |
-
|
18 |
-
|
19 |
-
|
20 |
-
|
21 |
-
|
22 |
-
|
23 |
-
|
24 |
-
|
25 |
-
|
26 |
-
|
27 |
|
28 |
|
29 |
# def bootstrap_t_test(data1, data2, num_bootstrap=1000):
|
@@ -48,30 +49,33 @@ def bootstrap_t_test(data1, data2, num_bootstrap=1000):
|
|
48 |
# p_value = np.sum(np.abs(t_stats) >= np.abs(observed_t_stat)) / num_bootstrap
|
49 |
# return observed_t_stat, p_value
|
50 |
|
51 |
-
def posthoc_friedman(data, variables, rank_suffix='_Rank'):
|
52 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
53 |
ranked_data = data[[v + rank_suffix for v in variables]].to_numpy()
|
54 |
-
|
55 |
-
num_conditions = ranked_data.shape[1]
|
56 |
-
comparisons = []
|
57 |
-
|
58 |
-
for i in range(num_conditions):
|
59 |
-
for j in range(i + 1, num_conditions):
|
60 |
-
diff = ranked_data[:, i] - ranked_data[:, j]
|
61 |
-
abs_diff = np.abs(diff)
|
62 |
-
avg_diff = np.mean(diff)
|
63 |
-
se_diff = np.std(diff, ddof=1) / np.sqrt(num_subjects)
|
64 |
-
z_value = avg_diff / se_diff
|
65 |
-
p_value = 2 * (1 - stats.norm.cdf(np.abs(z_value)))
|
66 |
-
comparisons.append({
|
67 |
-
"Group1": variables[i],
|
68 |
-
"Group2": variables[j],
|
69 |
-
"Z": z_value,
|
70 |
-
"p-value": p_value
|
71 |
-
})
|
72 |
-
|
73 |
-
return comparisons
|
74 |
-
|
75 |
def statistical_tests(data):
|
76 |
"""Perform various statistical tests to evaluate potential biases."""
|
77 |
variables = ['Privilege', 'Protect', 'Neutral']
|
@@ -96,17 +100,22 @@ def statistical_tests(data):
|
|
96 |
'T-Test': {}
|
97 |
}
|
98 |
|
|
|
|
|
|
|
|
|
99 |
for (var1, var2) in pairs:
|
100 |
pair_name_score = f'{var1}{score_suffix} vs {var2}{score_suffix}'
|
101 |
pair_rank_score = f'{var1}{rank_suffix} vs {var2}{rank_suffix}'
|
102 |
|
103 |
-
#
|
104 |
-
|
105 |
-
pairwise_results['
|
|
|
106 |
|
107 |
# Friedman test
|
108 |
friedman_stat, friedman_p = friedmanchisquare(*rank_data)
|
109 |
-
posthoc_results =
|
110 |
|
111 |
results = {
|
112 |
"Average Ranks": average_ranks.to_dict(),
|
|
|
8 |
from scipy.stats import ttest_ind, friedmanchisquare, rankdata, ttest_rel
|
9 |
from statsmodels.stats.multicomp import pairwise_tukeyhsd
|
10 |
from scipy.stats import ttest_1samp
|
11 |
+
from scikit_posthocs import posthoc_nemenyi
|
12 |
|
13 |
+
# def bootstrap_t_test(data1, data2, num_bootstrap=1000):
|
14 |
+
# """Perform a bootstrapped t-test."""
|
15 |
+
# observed_t_stat, _ = ttest_ind(data1, data2)
|
16 |
+
# combined = np.concatenate([data1, data2])
|
17 |
+
# t_stats = []
|
18 |
+
#
|
19 |
+
# for _ in range(num_bootstrap):
|
20 |
+
# np.random.shuffle(combined)
|
21 |
+
# new_data1 = combined[:len(data1)]
|
22 |
+
# new_data2 = combined[len(data1):]
|
23 |
+
# t_stat, _ = ttest_ind(new_data1, new_data2)
|
24 |
+
# t_stats.append(t_stat)
|
25 |
+
#
|
26 |
+
# p_value = np.sum(np.abs(t_stats) >= np.abs(observed_t_stat)) / num_bootstrap
|
27 |
+
# return observed_t_stat, p_value
|
28 |
|
29 |
|
30 |
# def bootstrap_t_test(data1, data2, num_bootstrap=1000):
|
|
|
49 |
# p_value = np.sum(np.abs(t_stats) >= np.abs(observed_t_stat)) / num_bootstrap
|
50 |
# return observed_t_stat, p_value
|
51 |
|
52 |
+
# def posthoc_friedman(data, variables, rank_suffix='_Rank'):
|
53 |
+
# """Perform a post-hoc analysis for the Friedman test using pairwise comparisons."""
|
54 |
+
# ranked_data = data[[v + rank_suffix for v in variables]].to_numpy()
|
55 |
+
# num_subjects = ranked_data.shape[0]
|
56 |
+
# num_conditions = ranked_data.shape[1]
|
57 |
+
# comparisons = []
|
58 |
+
#
|
59 |
+
# for i in range(num_conditions):
|
60 |
+
# for j in range(i + 1, num_conditions):
|
61 |
+
# diff = ranked_data[:, i] - ranked_data[:, j]
|
62 |
+
# abs_diff = np.abs(diff)
|
63 |
+
# avg_diff = np.mean(diff)
|
64 |
+
# se_diff = np.std(diff, ddof=1) / np.sqrt(num_subjects)
|
65 |
+
# z_value = avg_diff / se_diff
|
66 |
+
# p_value = 2 * (1 - stats.norm.cdf(np.abs(z_value)))
|
67 |
+
# comparisons.append({
|
68 |
+
# "Group1": variables[i],
|
69 |
+
# "Group2": variables[j],
|
70 |
+
# "Z": z_value,
|
71 |
+
# "p-value": p_value
|
72 |
+
# })
|
73 |
+
#
|
74 |
+
# return comparisons
|
75 |
+
def posthoc_friedman_nemenyi(data, variables, rank_suffix='_Rank'):
    """Run the Nemenyi post-hoc test that follows a Friedman test.

    Args:
        data: DataFrame holding one ``<variable><rank_suffix>`` column for
            every entry of ``variables``. Each row is passed to the test as
            one block (repeated-measures unit).
        variables: Names of the conditions to compare pairwise.
        rank_suffix: Suffix appended to each variable name to form the
            column that is read from ``data``.

    Returns:
        The pairwise p-value table produced by
        ``scikit_posthocs.posthoc_nemenyi_friedman`` (expected to have one
        row and one column per entry of ``variables``, labelled by
        position because a plain array is passed in).
    """
    # Local import: the file-level import only brings in ``posthoc_nemenyi``.
    from scikit_posthocs import posthoc_nemenyi_friedman

    block_matrix = data[[v + rank_suffix for v in variables]].to_numpy()
    # ``posthoc_nemenyi`` is the unpaired (Kruskal-Wallis style) variant and
    # reads each ROW of a 2-D array as a separate group, which would compare
    # subjects against each other. The Friedman variant takes a block-design
    # matrix (rows = blocks, columns = groups), which is the layout built
    # above, so the comparison is made between the conditions.
    return posthoc_nemenyi_friedman(block_matrix)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
79 |
def statistical_tests(data):
|
80 |
"""Perform various statistical tests to evaluate potential biases."""
|
81 |
variables = ['Privilege', 'Protect', 'Neutral']
|
|
|
100 |
'T-Test': {}
|
101 |
}
|
102 |
|
103 |
+
pairwise_results = {
|
104 |
+
'Wilcoxon Signed-Rank Test': {}
|
105 |
+
}
|
106 |
+
|
107 |
for (var1, var2) in pairs:
|
108 |
pair_name_score = f'{var1}{score_suffix} vs {var2}{score_suffix}'
|
109 |
pair_rank_score = f'{var1}{rank_suffix} vs {var2}{rank_suffix}'
|
110 |
|
111 |
+
# Wilcoxon signed-rank test for pairwise comparisons
|
112 |
+
wilcoxon_stat, wilcoxon_p = wilcoxon(data[f'{var1}{score_suffix}'], data[f'{var2}{score_suffix}'])
|
113 |
+
pairwise_results['Wilcoxon Signed-Rank Test'][pair_name_score] = {"Statistic": wilcoxon_stat,
|
114 |
+
"p-value": wilcoxon_p}
|
115 |
|
116 |
# Friedman test
|
117 |
friedman_stat, friedman_p = friedmanchisquare(*rank_data)
|
118 |
+
posthoc_results = posthoc_friedman_nemenyi(data, variables, rank_suffix)
|
119 |
|
120 |
results = {
|
121 |
"Average Ranks": average_ranks.to_dict(),
|