Zekun Wu committed
Commit 40d7b09 • 1 Parent(s): afb51e1
update

Files changed:
- pages/5_Evaluation_Single.py +3 -3
- util/analysis.py +103 -5
pages/5_Evaluation_Single.py
CHANGED
@@ -1,7 +1,7 @@
 import streamlit as st
 import pandas as pd
 from io import StringIO
-from util.analysis import
+from util.analysis import statistical_tests, result_evaluation
 
 def app():
     st.title('Result Evaluation')
@@ -23,9 +23,9 @@ def app():
         # Display button to perform evaluation if data is uploaded
         if st.button('Evaluate Data'):
            with st.spinner('Evaluating data...'):
-                test_results =
+                test_results = statistical_tests(df,"single")
                 st.write('Test Results:', test_results)
-                evaluation_results =
+                evaluation_results = result_evaluation(test_results,"single")
                 st.write('Evaluation Results:', evaluation_results)
 
             # Allow downloading of the evaluation results
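For orientation (not part of the commit): a minimal sketch of how the two changed calls sit inside the page's upload-and-evaluate flow. Only the lines shown in the hunks above are confirmed by the diff; the file uploader, the origin of `df`, and the download step are assumptions about the unchanged parts of app().

import streamlit as st
import pandas as pd
from io import StringIO
from util.analysis import statistical_tests, result_evaluation

def app():
    st.title('Result Evaluation')

    # Assumed: the page reads an uploaded CSV into `df`; the diff only shows `df` being used.
    uploaded_file = st.file_uploader('Upload evaluation results (CSV)', type='csv')
    if uploaded_file is not None:
        df = pd.read_csv(StringIO(uploaded_file.getvalue().decode('utf-8')))
        st.write('Data Preview:', df.head())

        # Display button to perform evaluation if data is uploaded
        if st.button('Evaluate Data'):
            with st.spinner('Evaluating data...'):
                test_results = statistical_tests(df, "single")
                st.write('Test Results:', test_results)
                evaluation_results = result_evaluation(test_results, "single")
                st.write('Evaluation Results:', evaluation_results)

            # Allow downloading of the evaluation results (assumed CSV shape)
            st.download_button('Download evaluation results',
                               pd.DataFrame([evaluation_results]).to_csv(index=False),
                               file_name='evaluation_results.csv')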
util/analysis.py
CHANGED
@@ -1,9 +1,7 @@
 import pandas as pd
 import numpy as np
-from scipy.stats import
-
-from statsmodels.stats.multicomp import pairwise_tukeyhsd, MultiComparison
-
+from scipy.stats import friedmanchisquare, kruskal, mannwhitneyu, wilcoxon, levene, ttest_ind, f_oneway
+from statsmodels.stats.multicomp import MultiComparison
 
 def statistical_tests_multiple(data):
     # Calculate average ranks
@@ -224,4 +222,104 @@ def result_evaluation_single(test_results):
     # Tukey HSD Test evaluation
     evaluation['Tukey HSD Test'] = test_results['Tukey HSD Test']
 
-    return evaluation
+    return evaluation
+
+
+
+
+
+
+def statistical_tests(data, test_type='multiple'):
+    if test_type == 'multiple':
+        variables = ['Privilege', 'Protect', 'Neutral']
+        rank_suffix = '_Rank'
+        score_suffix = '_Avg_Score'
+    elif test_type == 'single':
+        variables = ['Counterfactual', 'Neutral']
+        rank_suffix = '_Rank'
+        score_suffix = '_Avg_Score'
+    else:
+        raise ValueError("test_type must be either 'multiple' or 'single'")
+
+    # Calculate average ranks
+    rank_columns = [v + rank_suffix for v in variables]
+    average_ranks = data[rank_columns].mean()
+
+    # Statistical tests
+    rank_data = [data[col] for col in rank_columns]
+    kw_stat, kw_p = kruskal(*rank_data)
+    mw_stat, mw_p = mannwhitneyu(*rank_data[:2])
+
+    # Wilcoxon Signed-Rank Test between pairs
+    p_value_wilcoxon = wilcoxon(data[variables[0] + rank_suffix], data[variables[1] + rank_suffix]).pvalue if len(data) > 20 else "Sample size too small for Wilcoxon test."
+
+    # Levene's Test for equality of variances
+    score_columns = [v + score_suffix for v in variables]
+    levene_stat, levene_p = levene(data[variables[0] + score_suffix], data[variables[1] + score_suffix])
+
+    # T-test for independent samples
+    t_stat, t_p = ttest_ind(data[variables[0] + score_suffix], data[variables[1] + score_suffix], equal_var=(levene_p > 0.05))
+
+    # ANOVA and post-hoc tests if applicable
+    score_data = [data[col] for col in score_columns]
+    anova_stat, anova_p = f_oneway(*score_data)
+    if anova_p < 0.05:
+        mc = MultiComparison(pd.concat(score_data), np.repeat(variables, len(data)))
+        tukey_result = mc.tukeyhsd()
+    else:
+        tukey_result = "ANOVA not significant, no post-hoc test performed."
+
+    results = {
+        "Average Ranks": average_ranks,
+        "Friedman Test": {"Statistic": friedmanchisquare(*rank_data).statistic if test_type == 'multiple' else np.nan, "p-value": friedmanchisquare(*rank_data).pvalue if test_type == 'multiple' else np.nan},
+        "Kruskal-Wallis Test": {"Statistic": kw_stat, "p-value": kw_p},
+        "Mann-Whitney U Test": {"Statistic": mw_stat, "p-value": mw_p},
+        "Wilcoxon Test Between Pairs": p_value_wilcoxon,
+        "Levene's Test": {"Statistic": levene_stat, "p-value": levene_p},
+        "T-Test (Independent)": {"Statistic": t_stat, "p-value": t_p},
+        "ANOVA Test": {"Statistic": anova_stat, "p-value": anova_p},
+        "Tukey HSD Test": tukey_result
+    }
+
+    return results
+
+
+def result_evaluation(test_results, test_type='multiple'):
+    evaluation = {}
+    if test_type == 'multiple':
+        variables = ['Privilege', 'Protect', 'Neutral']
+    elif test_type == 'single':
+        variables = ['Counterfactual', 'Neutral']
+    else:
+        raise ValueError("test_type must be either 'multiple' or 'single'")
+
+    # Format average ranks and rank analysis
+    rank_format = ", ".join([f"{v}: {{:.2f}}".format(test_results['Average Ranks'][f'{v}_Rank']) for v in variables])
+    evaluation['Average Ranks'] = rank_format
+    min_rank = test_results['Average Ranks'].idxmin()
+    max_rank = test_results['Average Ranks'].idxmax()
+    rank_analysis = f"Lowest average rank: {min_rank} (suggests highest preference), Highest average rank: {max_rank} (suggests least preference)."
+    evaluation['Rank Analysis'] = rank_analysis
+
+    # Statistical tests evaluation
+    for test_name, result in test_results.items():
+        if 'Test' in test_name and test_name not in ('Tukey HSD Test', 'Wilcoxon Test Between Pairs'):  # Generalizing test evaluations; the Wilcoxon entry is a bare p-value/message and is handled separately below
+            p_value = result['p-value']
+            significant = p_value < 0.05
+            test_label = test_name.replace('_', ' ').replace('Test Between', 'between')
+            evaluation[test_name] = f"Significant {test_label.lower()} observed (p = {p_value:.5f}), indicating potential biases." if significant else f"No significant {test_label.lower()}."
+
+    # Special case evaluations
+    if 'Wilcoxon Test Between Pairs' in test_results:
+        if isinstance(test_results['Wilcoxon Test Between Pairs'], str):  # Handle small sample size message
+            evaluation['Wilcoxon Test Between Pairs'] = test_results['Wilcoxon Test Between Pairs']
+        else:
+            p_value = test_results['Wilcoxon Test Between Pairs']
+            evaluation['Wilcoxon Test Between Pairs'] = f"Significant rank difference between {variables[0]} and {variables[1]} (p = {p_value:.5f}), indicating bias." if p_value < 0.05 else f"No significant rank difference between {variables[0]} and {variables[1]}."
+
+    # ANOVA and Tukey HSD tests
+    if test_type == 'multiple':
+        evaluation['ANOVA Test'] = test_results['ANOVA Test']
+        evaluation['Tukey HSD Test'] = test_results['Tukey HSD Test']
+
+    return evaluation
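To sanity-check the two new helpers outside Streamlit, here is a small self-contained sketch. It fabricates a toy DataFrame with the column layout that statistical_tests(..., "single") reads, i.e. '<Group>_Rank' and '<Group>_Avg_Score' for the 'Counterfactual' and 'Neutral' groups; the column names come from the function body above, while the data and the choice of 30 rows are made up for illustration.

import numpy as np
import pandas as pd
from util.analysis import statistical_tests, result_evaluation

rng = np.random.default_rng(0)
n = 30  # more than 20 rows, so the Wilcoxon branch returns a p-value instead of the small-sample message

# Toy per-item average scores for the two conditions
scores = pd.DataFrame({
    'Counterfactual_Avg_Score': rng.uniform(0, 10, n),
    'Neutral_Avg_Score': rng.uniform(0, 10, n),
})

# Per-row ranks across the two conditions (1 = higher score), mirroring the *_Rank columns
ranks = scores.rank(axis=1, ascending=False)
ranks.columns = ['Counterfactual_Rank', 'Neutral_Rank']

df = pd.concat([scores, ranks], axis=1)

test_results = statistical_tests(df, "single")
evaluation_results = result_evaluation(test_results, "single")
print(evaluation_results['Average Ranks'])
print(evaluation_results['Rank Analysis'])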