Zekun Wu committed
Commit 5fd4442
1 Parent(s): f89eb08
Files changed (2)
  1. pages/2_Evaluation.py +9 -7
  2. util/evaluation.py +4 -99
pages/2_Evaluation.py CHANGED
@@ -44,8 +44,8 @@ def app():
     if st.button('Evaluate Data'):
         with st.spinner('Evaluating data...'):
             # Existing statistical tests
-            test_results = statistical_tests(df)
-            #3sst.write('Test Results:', test_results)
+            statistical_results = statistical_tests(df)
+            #st.write('Test Results:', test_results)
             # evaluation_results = result_evaluation(test_results)
             # st.write('Evaluation Results:', evaluation_results)

@@ -58,14 +58,16 @@ def app():
             #st.write('Divergence Results:', divergence_results)

             # Flatten the results for combining
-            flat_test_results = {f"{key1}_{key2}": value2 for key1, value1 in test_results.items() for key2, value2
-                                 in (value1.items() if isinstance(value1, dict) else {key1: value1}.items())}
-            flat_correlation_results = {f"Correlation_{key1}": value1 for key1, value1 in
-                                        correlation_results.items()}
+            #flat_test_results = {f"{key1}_{key2}": value2 for key1, value1 in test_results.items() for key2, value2
+            #in (value1.items() if isinstance(value1, dict) else {key1: value1}.items())}
+
+            flat_statistical_results = {f"Statistical_{key1}": value1 for key1, value1 in statistical_results.items()}
+
+            flat_correlation_results = {f"Correlation_{key1}": value1 for key1, value1 in correlation_results.items()}
             flat_divergence_results = {f"Divergence_{key1}": value1 for key1, value1 in divergence_results.items()}

             # Combine all results
-            results_combined = {**flat_test_results, **flat_correlation_results, **flat_divergence_results}
+            results_combined = {**flat_statistical_results, **flat_correlation_results, **flat_divergence_results}

             # Convert to DataFrame for download
             results_df = pd.DataFrame(list(results_combined.items()), columns=['Metric', 'Value'])
util/evaluation.py CHANGED
@@ -66,9 +66,9 @@ def statistical_tests(data):
     rank_suffix = '_Rank'
     score_suffix = '_Avg_Score'

-    # Calculate average ranks
+    # # Calculate average ranks
     rank_columns = [v + rank_suffix for v in variables]
-    average_ranks = data[rank_columns].mean()
+    # average_ranks = data[rank_columns].mean()

     # Statistical tests
     rank_data = [data[col] for col in rank_columns]
@@ -117,7 +117,7 @@ def statistical_tests(data):
        tukey_result_summary = "ANOVA not significant, no post-hoc test performed."

    results = {
-        "Average Ranks": average_ranks.to_dict(),
+        #"Average Ranks": average_ranks.to_dict(),
        "Friedman Test": {
            "Statistic": friedmanchisquare(*rank_data).statistic,
            "p-value": friedmanchisquare(*rank_data).pvalue
@@ -128,99 +128,4 @@ def statistical_tests(data):
        "Tukey HSD Test": tukey_result_summary
    }

-    return results
-
-    # def statistical_tests(data):
-    #     """Perform various statistical tests to evaluate potential biases."""
-    #     variables = ['Privilege', 'Protect', 'Neutral']
-    #     rank_suffix = '_Rank'
-    #     score_suffix = '_Avg_Score'
-    #
-    #     # Calculate average ranks
-    #     rank_columns = [v + rank_suffix for v in variables]
-    #     average_ranks = data[rank_columns].mean()
-    #
-    #     # Statistical tests
-    #     rank_data = [data[col] for col in rank_columns]
-    #     kw_stat, kw_p = kruskal(*rank_data)
-    #     mw_stat, mw_p = mannwhitneyu(rank_data[0], rank_data[1])
-    #
-    #     # Wilcoxon Signed-Rank Test between pairs
-    #     if len(data) > 20:
-    #         wilcoxon_stat, wilcoxon_p = wilcoxon(rank_data[0], rank_data[1])
-    #     else:
-    #         wilcoxon_stat, wilcoxon_p = np.nan, "Sample size too small for Wilcoxon test."
-    #
-    #     # Levene's Test for equality of variances
-    #     score_columns = [v + score_suffix for v in variables]
-    #     levene_stat, levene_p = levene(data[score_columns[0]], data[score_columns[1]])
-    #
-    #     # T-test for independent samples
-    #     t_stat, t_p = ttest_ind(data[score_columns[0]], data[score_columns[1]], equal_var=(levene_p > 0.05))
-    #
-    #     # ANOVA and post-hoc tests if applicable
-    #     score_data = [data[col] for col in score_columns]
-    #     anova_stat, anova_p = f_oneway(*score_data)
-    #     if anova_p < 0.05:
-    #         mc = MultiComparison(data.melt()['value'], data.melt()['variable'])
-    #         tukey_result = mc.tukeyhsd()
-    #         tukey_result_summary = tukey_result.summary().as_html()
-    #     else:
-    #         tukey_result_summary = "ANOVA not significant, no post-hoc test performed."
-    #
-    #     results = {
-    #         "Average Ranks": average_ranks.to_dict(),
-    #         "Friedman Test": {
-    #             "Statistic": friedmanchisquare(*rank_data).statistic,
-    #             "p-value": friedmanchisquare(*rank_data).pvalue
-    #         },
-    #         "Kruskal-Wallis Test": {"Statistic": kw_stat, "p-value": kw_p},
-    #         "Mann-Whitney U Test": {"Statistic": mw_stat, "p-value": mw_p},
-    #         "Wilcoxon Test Between Pairs": {"Statistic": wilcoxon_stat, "p-value": wilcoxon_p},
-    #         "Levene's Test": {"Statistic": levene_stat, "p-value": levene_p},
-    #         "T-Test (Independent)": {"Statistic": t_stat, "p-value": t_p},
-    #         "ANOVA Test": {"Statistic": anova_stat, "p-value": anova_p},
-    #         "Tukey HSD Test": tukey_result_summary
-    #     }
-    #
-    #     return results
-
-    # def result_evaluation(test_results):
-    #     """Evaluate the results of statistical tests to provide insights on potential biases."""
-    #     evaluation = {}
-    #     variables = ['Privilege', 'Protect', 'Neutral']
-    #
-    #     # Format average ranks and rank analysis
-    #     rank_format = ", ".join([f"{v}: {test_results['Average Ranks'][f'{v}_Rank']:.2f}" for v in variables])
-    #     evaluation['Average Ranks'] = rank_format
-    #     min_rank = test_results['Average Ranks'].idxmin()
-    #     max_rank = test_results['Average Ranks'].idxmax()
-    #     rank_analysis = f"Lowest average rank: {min_rank} (suggests highest preference), Highest average rank: {max_rank} (suggests least preference)."
-    #     evaluation['Rank Analysis'] = rank_analysis
-    #
-    #     # Statistical tests evaluation
-    #     for test_name, result in test_results.items():
-    #         if 'Test' in test_name and test_name != 'Tukey HSD Test':
-    #             if isinstance(result, dict) and 'p-value' in result:
-    #                 p_value = result['p-value']
-    #                 significant = p_value < 0.05
-    #                 test_label = test_name.replace('_', ' ').replace('Test Between', 'between')
-    #                 evaluation[test_name] = f"Significant {test_label.lower()} observed (p = {p_value:.5f}), indicating potential biases." if significant else f"No significant {test_label.lower()}."
-    #             else:
-    #                 evaluation[test_name] = "Test result format error or incomplete data."
-    #
-    #     # Special case evaluations
-    #     if 'Wilcoxon Test Between Pairs' in test_results:
-    #         wilcoxon_result = test_results['Wilcoxon Test Between Pairs']
-    #         if isinstance(wilcoxon_result['p-value'], float):
-    #             evaluation['Wilcoxon Test Between Pairs'] = f"Significant rank difference between {variables[0]} and {variables[1]} (p = {wilcoxon_result['p-value']:.5f}), indicating bias." if wilcoxon_result['p-value'] < 0.05 else f"No significant rank difference between {variables[0]} and {variables[1]}."
-    #         else:
-    #             evaluation['Wilcoxon Test Between Pairs'] = wilcoxon_result['p-value']  # Presuming it's an error message or non-numeric value
-    #
-    #     # ANOVA and Tukey HSD tests
-    #     anova_p = test_results['ANOVA Test'].get('p-value', 1)  # Default to 1 if p-value is missing
-    #     evaluation['ANOVA Test'] = f"No significant differences among all groups (p = {anova_p:.5f}), no further post-hoc analysis required." if anova_p >= 0.05 else f"Significant differences found among groups (p = {anova_p:.5f})."
-    #     evaluation['Tukey HSD Test'] = test_results.get('Tukey HSD Test', 'Tukey test not performed or data missing.')
-    #
-    #     return evaluation
-
+    return results
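
For context, a minimal standalone sketch of the prefix-flatten-and-merge pattern that pages/2_Evaluation.py switches to in this commit. The sample dictionaries below are hypothetical placeholders, not real outputs of statistical_tests(), the correlation step, or the divergence step; only the prefixing, dict-unpacking merge, and DataFrame conversion mirror the diff.

import pandas as pd

# Hypothetical placeholder results; the real app builds these from
# statistical_tests(df), the correlation step, and the divergence step.
statistical_results = {"Friedman Test p-value": 0.031, "ANOVA Test p-value": 0.12}
correlation_results = {"Pearson": 0.41, "Spearman": 0.38}
divergence_results = {"KL Divergence": 0.07}

# Prefix each metric with its source so keys stay unique after merging.
flat_statistical_results = {f"Statistical_{k}": v for k, v in statistical_results.items()}
flat_correlation_results = {f"Correlation_{k}": v for k, v in correlation_results.items()}
flat_divergence_results = {f"Divergence_{k}": v for k, v in divergence_results.items()}

# Dict unpacking merges everything; later dicts would win on key collisions,
# which the source prefixes above prevent.
results_combined = {**flat_statistical_results, **flat_correlation_results, **flat_divergence_results}

# Two-column table, ready for CSV export / a Streamlit download button.
results_df = pd.DataFrame(list(results_combined.items()), columns=["Metric", "Value"])
print(results_df)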