Spaces: Running
Zekun Wu committed · Commit a41445d · Parent(s): cd0fc76
update
Browse files: util/evaluation.py (+0, −187)
util/evaluation.py
CHANGED
@@ -10,68 +10,6 @@ from scipy.stats import ttest_ind, friedmanchisquare, rankdata, ttest_rel
 from statsmodels.stats.multicomp import pairwise_tukeyhsd
 from scipy.stats import ttest_1samp
 
-# def bootstrap_t_test(data1, data2, num_bootstrap=1000):
-#     """Perform a bootstrapped t-test."""
-#     observed_t_stat, _ = ttest_ind(data1, data2)
-#     combined = np.concatenate([data1, data2])
-#     t_stats = []
-#
-#     for _ in range(num_bootstrap):
-#         np.random.shuffle(combined)
-#         new_data1 = combined[:len(data1)]
-#         new_data2 = combined[len(data1):]
-#         t_stat, _ = ttest_ind(new_data1, new_data2)
-#         t_stats.append(t_stat)
-#
-#     p_value = np.sum(np.abs(t_stats) >= np.abs(observed_t_stat)) / num_bootstrap
-#     return observed_t_stat, p_value
-
-
-# def bootstrap_t_test(data1, data2, num_bootstrap=1000):
-#     """Perform a bootstrapped paired t-test for mean difference being zero."""
-#     # Calculate the observed differences between paired samples
-#     differences = data1 - data2
-#     # Compute the observed t-statistic for the differences
-#     observed_t_stat, _ = ttest_1samp(differences, 0)
-#
-#     t_stats = []
-#
-#     for _ in range(num_bootstrap):
-#         # Resample the differences with replacement
-#         resampled_diffs = np.random.choice(differences, size=len(differences), replace=True)
-#         # Perform a one-sample t-test on the resampled differences against zero
-#         t_stat, _ = ttest_1samp(resampled_diffs, 0)
-#         # Append the t-statistic to the list
-#         t_stats.append(t_stat)
-#
-#     # Calculate the p-value as the proportion of bootstrap t-statistics
-#     # that are as extreme as or more extreme than the observed t-statistic
-#     p_value = np.sum(np.abs(t_stats) >= np.abs(observed_t_stat)) / num_bootstrap
-#     return observed_t_stat, p_value
-
-# def posthoc_friedman(data, variables, rank_suffix='_Rank'):
-#     """Perform a post-hoc analysis for the Friedman test using pairwise comparisons."""
-#     ranked_data = data[[v + rank_suffix for v in variables]].to_numpy()
-#     num_subjects = ranked_data.shape[0]
-#     num_conditions = ranked_data.shape[1]
-#     comparisons = []
-#
-#     for i in range(num_conditions):
-#         for j in range(i + 1, num_conditions):
-#             diff = ranked_data[:, i] - ranked_data[:, j]
-#             abs_diff = np.abs(diff)
-#             avg_diff = np.mean(diff)
-#             se_diff = np.std(diff, ddof=1) / np.sqrt(num_subjects)
-#             z_value = avg_diff / se_diff
-#             p_value = 2 * (1 - stats.norm.cdf(np.abs(z_value)))
-#             comparisons.append({
-#                 "Group1": variables[i],
-#                 "Group2": variables[j],
-#                 "Z": z_value,
-#                 "p-value": p_value
-#             })
-#
-#     return comparisons
 
 def statistical_tests(data):
     """Perform various statistical tests to evaluate potential biases."""

@@ -108,11 +46,6 @@ def statistical_tests(data):
             wilcoxon_stat, wilcoxon_p = np.nan, "Sample size too small for Wilcoxon test."
         pairwise_results['Wilcoxon Test'][pair_rank_score] = {"Statistic": wilcoxon_stat, "p-value": wilcoxon_p}
 
-
-        # # Bootstrapped T-test for independent samples
-        # t_stat, t_p = bootstrap_t_test(data[f'{var1}{rank_suffix}'], data[f'{var2}{rank_suffix}'])
-        # pairwise_results['T-Test'][pair_rank_score] = {"Statistic": t_stat, "p-value": t_p}
-
     # Friedman test
     friedman_stat, friedman_p = friedmanchisquare(*rank_data)
 

@@ -183,123 +116,3 @@ def calculate_divergences(df):
             divergences['Hellinger Distance'][f'{col1} vs {col2}'] = hellinger_distance(probabilities[col1],
                                                                                          probabilities[col2])
     return divergences
-
-# def statistical_tests(data):
-#     """Perform various statistical tests to evaluate potential biases."""
-#     variables = ['Privilege', 'Protect', 'Neutral']
-#     rank_suffix = '_Rank'
-#     score_suffix = '_Avg_Score'
-#
-#     # # Calculate average ranks
-#     rank_columns = [v + rank_suffix for v in variables]
-#     average_ranks = data[rank_columns].mean()
-#
-#     # Statistical tests
-#     rank_data = [data[col] for col in rank_columns]
-#
-#     # Pairwise tests
-#     pairs = [
-#         ('Privilege', 'Protect'),
-#         ('Protect', 'Neutral'),
-#         ('Privilege', 'Neutral')
-#     ]
-#
-#     pairwise_results = {
-#         'T-Test': {}
-#     }
-#
-#     for (var1, var2) in pairs:
-#         pair_name_score = f'{var1}{score_suffix} vs {var2}{score_suffix}'
-#
-#         # T-test for independent samples
-#         t_stat, t_p = ttest_ind(data[f'{var1}{score_suffix}'], data[f'{var2}{score_suffix}'])
-#         pairwise_results['T-Test'][pair_name_score] = {"Statistic": t_stat, "p-value": t_p}
-#
-#     results = {
-#         "Average Ranks": average_ranks.to_dict(),
-#         "Friedman Test": {
-#             "Statistic": friedmanchisquare(*rank_data).statistic,
-#             "p-value": friedmanchisquare(*rank_data).pvalue
-#         },
-#         **pairwise_results,
-#     }
-#
-#     return results
-
-def disabled_statistical_tests(data):
-    """Perform various statistical tests to evaluate potential biases."""
-    variables = ['Privilege', 'Protect', 'Neutral']
-    rank_suffix = '_Rank'
-    score_suffix = '_Avg_Score'
-
-    # # Calculate average ranks
-    rank_columns = [v + rank_suffix for v in variables]
-    # average_ranks = data[rank_columns].mean()
-
-    # Statistical tests
-    rank_data = [data[col] for col in rank_columns]
-    kw_stat, kw_p = kruskal(*rank_data)
-
-    # Pairwise tests
-    pairwise_results = {}
-    pairs = [
-        ('Privilege', 'Protect'),
-        ('Protect', 'Neutral'),
-        ('Privilege', 'Neutral')
-    ]
-
-    pairwise_results = {
-        # 'Mann-Whitney U Test': {},
-        # 'Wilcoxon Test': {},
-        # 'Levene\'s Test': {},
-        'T-Test': {}
-    }
-
-    for (var1, var2) in pairs:
-        pair_name_rank = f'{var1}{rank_suffix} vs {var2}{rank_suffix}'
-        pair_name_score = f'{var1}{score_suffix} vs {var2}{score_suffix}'
-
-        # # Mann-Whitney U Test
-        # mw_stat, mw_p = mannwhitneyu(data[f'{var1}{rank_suffix}'], data[f'{var2}{rank_suffix}'])
-        # pairwise_results['Mann-Whitney U Test'][pair_name_rank] = {"Statistic": mw_stat, "p-value": mw_p}
-        #
-        # # Wilcoxon Signed-Rank Test
-        # if len(data) > 20:
-        #     wilcoxon_stat, wilcoxon_p = wilcoxon(data[f'{var1}{rank_suffix}'], data[f'{var2}{rank_suffix}'])
-        # else:
-        #     wilcoxon_stat, wilcoxon_p = np.nan, "Sample size too small for Wilcoxon test."
-        # pairwise_results['Wilcoxon Test'][pair_name_rank] = {"Statistic": wilcoxon_stat, "p-value": wilcoxon_p}
-        #
-        # Levene's Test for equality of variances
-        # levene_stat, levene_p = levene(data[f'{var1}{score_suffix}'], data[f'{var2}{score_suffix}'])
-        # pairwise_results['Levene\'s Test'][pair_name_score] = {"Statistic": levene_stat, "p-value": levene_p}
-
-        # T-test for independent samples
-        t_stat, t_p = ttest_ind(data[f'{var1}{score_suffix}'], data[f'{var2}{score_suffix}'])
-        #equal_var=(levene_p > 0.05))
-        pairwise_results['T-Test'][pair_name_score] = {"Statistic": t_stat, "p-value": t_p}
-
-    # ANOVA and post-hoc tests if applicable
-    # score_columns = [v + score_suffix for v in variables]
-    # score_data = [data[col] for col in score_columns]
-    # anova_stat, anova_p = f_oneway(*score_data)
-    # if anova_p < 0.05:
-    #     mc = MultiComparison(data.melt()['value'], data.melt()['variable'])
-    #     tukey_result = mc.tukeyhsd()
-    #     tukey_result_summary = tukey_result.summary().as_html()
-    # else:
-    #     tukey_result_summary = "ANOVA not significant, no post-hoc test performed."
-
-    results = {
-        #"Average Ranks": average_ranks.to_dict(),
-        "Friedman Test": {
-            "Statistic": friedmanchisquare(*rank_data).statistic,
-            "p-value": friedmanchisquare(*rank_data).pvalue
-        },
-        # "Kruskal-Wallis Test": {"Statistic": kw_stat, "p-value": kw_p},
-        **pairwise_results,
-        # "ANOVA Test": {"Statistic": anova_stat, "p-value": anova_p},
-        #"Tukey HSD Test": tukey_result_summary
-    }
-
-    return results
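For reference, the bootstrapped paired t-test that this commit removes (it was already fully commented out) can be reconstructed as the standalone sketch below. This is a minimal reconstruction from the deleted comments, assuming NumPy and SciPy are available; the name bootstrap_paired_t_test is chosen here for clarity and does not exist in util/evaluation.py.

import numpy as np
from scipy.stats import ttest_1samp

def bootstrap_paired_t_test(data1, data2, num_bootstrap=1000):
    """Bootstrapped paired t-test of the hypothesis that the mean difference is zero."""
    differences = np.asarray(data1) - np.asarray(data2)
    # Observed t-statistic of the paired differences against zero.
    observed_t_stat, _ = ttest_1samp(differences, 0)

    t_stats = []
    for _ in range(num_bootstrap):
        # Resample the observed differences with replacement and re-test against zero.
        resampled = np.random.choice(differences, size=len(differences), replace=True)
        t_stat, _ = ttest_1samp(resampled, 0)
        t_stats.append(t_stat)

    # p-value: proportion of bootstrap statistics at least as extreme as the observed one.
    p_value = np.sum(np.abs(t_stats) >= np.abs(observed_t_stat)) / num_bootstrap
    return observed_t_stat, p_value

As in the deleted code, the differences are resampled directly rather than being centred under the null hypothesis first; a null-calibrated variant would subtract the observed mean from the differences before resampling.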