Zekun Wu committed
Commit 5fd4442
1 Parent(s): f89eb08
Files changed (2)
  1. pages/2_Evaluation.py +9 -7
  2. util/evaluation.py +4 -99
pages/2_Evaluation.py CHANGED
@@ -44,8 +44,8 @@ def app():
     if st.button('Evaluate Data'):
         with st.spinner('Evaluating data...'):
             # Existing statistical tests
-            test_results = statistical_tests(df)
-            #3sst.write('Test Results:', test_results)
+            statistical_results = statistical_tests(df)
+            #st.write('Test Results:', test_results)
             # evaluation_results = result_evaluation(test_results)
             # st.write('Evaluation Results:', evaluation_results)

@@ -58,14 +58,16 @@ def app():
             #st.write('Divergence Results:', divergence_results)

             # Flatten the results for combining
-            flat_test_results = {f"{key1}_{key2}": value2 for key1, value1 in test_results.items() for key2, value2
-                                 in (value1.items() if isinstance(value1, dict) else {key1: value1}.items())}
-            flat_correlation_results = {f"Correlation_{key1}": value1 for key1, value1 in
-                                        correlation_results.items()}
+            #flat_test_results = {f"{key1}_{key2}": value2 for key1, value1 in test_results.items() for key2, value2
+            #in (value1.items() if isinstance(value1, dict) else {key1: value1}.items())}
+
+            flat_statistical_results = {f"Statistical_{key1}": value1 for key1, value1 in statistical_results.items()}
+
+            flat_correlation_results = {f"Correlation_{key1}": value1 for key1, value1 in correlation_results.items()}
             flat_divergence_results = {f"Divergence_{key1}": value1 for key1, value1 in divergence_results.items()}

             # Combine all results
-            results_combined = {**flat_test_results, **flat_correlation_results, **flat_divergence_results}
+            results_combined = {**flat_statistical_results, **flat_correlation_results, **flat_divergence_results}

             # Convert to DataFrame for download
             results_df = pd.DataFrame(list(results_combined.items()), columns=['Metric', 'Value'])
util/evaluation.py CHANGED
@@ -66,9 +66,9 @@ def statistical_tests(data):
     rank_suffix = '_Rank'
     score_suffix = '_Avg_Score'

-    # Calculate average ranks
+    # # Calculate average ranks
     rank_columns = [v + rank_suffix for v in variables]
-    average_ranks = data[rank_columns].mean()
+    # average_ranks = data[rank_columns].mean()

     # Statistical tests
     rank_data = [data[col] for col in rank_columns]
@@ -117,7 +117,7 @@ def statistical_tests(data):
        tukey_result_summary = "ANOVA not significant, no post-hoc test performed."

    results = {
-        "Average Ranks": average_ranks.to_dict(),
+        #"Average Ranks": average_ranks.to_dict(),
        "Friedman Test": {
            "Statistic": friedmanchisquare(*rank_data).statistic,
            "p-value": friedmanchisquare(*rank_data).pvalue
@@ -128,99 +128,4 @@ def statistical_tests(data):
        "Tukey HSD Test": tukey_result_summary
    }

-    return results
-
-    # def statistical_tests(data):
-    #     """Perform various statistical tests to evaluate potential biases."""
-    #     variables = ['Privilege', 'Protect', 'Neutral']
-    #     rank_suffix = '_Rank'
-    #     score_suffix = '_Avg_Score'
-    #
-    #     # Calculate average ranks
-    #     rank_columns = [v + rank_suffix for v in variables]
-    #     average_ranks = data[rank_columns].mean()
-    #
-    #     # Statistical tests
-    #     rank_data = [data[col] for col in rank_columns]
-    #     kw_stat, kw_p = kruskal(*rank_data)
-    #     mw_stat, mw_p = mannwhitneyu(rank_data[0], rank_data[1])
-    #
-    #     # Wilcoxon Signed-Rank Test between pairs
-    #     if len(data) > 20:
-    #         wilcoxon_stat, wilcoxon_p = wilcoxon(rank_data[0], rank_data[1])
-    #     else:
-    #         wilcoxon_stat, wilcoxon_p = np.nan, "Sample size too small for Wilcoxon test."
-    #
-    #     # Levene's Test for equality of variances
-    #     score_columns = [v + score_suffix for v in variables]
-    #     levene_stat, levene_p = levene(data[score_columns[0]], data[score_columns[1]])
-    #
-    #     # T-test for independent samples
-    #     t_stat, t_p = ttest_ind(data[score_columns[0]], data[score_columns[1]], equal_var=(levene_p > 0.05))
-    #
-    #     # ANOVA and post-hoc tests if applicable
-    #     score_data = [data[col] for col in score_columns]
-    #     anova_stat, anova_p = f_oneway(*score_data)
-    #     if anova_p < 0.05:
-    #         mc = MultiComparison(data.melt()['value'], data.melt()['variable'])
-    #         tukey_result = mc.tukeyhsd()
-    #         tukey_result_summary = tukey_result.summary().as_html()
-    #     else:
-    #         tukey_result_summary = "ANOVA not significant, no post-hoc test performed."
-    #
-    #     results = {
-    #         "Average Ranks": average_ranks.to_dict(),
-    #         "Friedman Test": {
-    #             "Statistic": friedmanchisquare(*rank_data).statistic,
-    #             "p-value": friedmanchisquare(*rank_data).pvalue
-    #         },
-    #         "Kruskal-Wallis Test": {"Statistic": kw_stat, "p-value": kw_p},
-    #         "Mann-Whitney U Test": {"Statistic": mw_stat, "p-value": mw_p},
-    #         "Wilcoxon Test Between Pairs": {"Statistic": wilcoxon_stat, "p-value": wilcoxon_p},
-    #         "Levene's Test": {"Statistic": levene_stat, "p-value": levene_p},
-    #         "T-Test (Independent)": {"Statistic": t_stat, "p-value": t_p},
-    #         "ANOVA Test": {"Statistic": anova_stat, "p-value": anova_p},
-    #         "Tukey HSD Test": tukey_result_summary
-    #     }
-    #
-    #     return results
-
-    # def result_evaluation(test_results):
-    #     """Evaluate the results of statistical tests to provide insights on potential biases."""
-    #     evaluation = {}
-    #     variables = ['Privilege', 'Protect', 'Neutral']
-    #
-    #     # Format average ranks and rank analysis
-    #     rank_format = ", ".join([f"{v}: {test_results['Average Ranks'][f'{v}_Rank']:.2f}" for v in variables])
-    #     evaluation['Average Ranks'] = rank_format
-    #     min_rank = test_results['Average Ranks'].idxmin()
-    #     max_rank = test_results['Average Ranks'].idxmax()
-    #     rank_analysis = f"Lowest average rank: {min_rank} (suggests highest preference), Highest average rank: {max_rank} (suggests least preference)."
-    #     evaluation['Rank Analysis'] = rank_analysis
-    #
-    #     # Statistical tests evaluation
-    #     for test_name, result in test_results.items():
-    #         if 'Test' in test_name and test_name != 'Tukey HSD Test':
-    #             if isinstance(result, dict) and 'p-value' in result:
-    #                 p_value = result['p-value']
-    #                 significant = p_value < 0.05
-    #                 test_label = test_name.replace('_', ' ').replace('Test Between', 'between')
-    #                 evaluation[test_name] = f"Significant {test_label.lower()} observed (p = {p_value:.5f}), indicating potential biases." if significant else f"No significant {test_label.lower()}."
-    #             else:
-    #                 evaluation[test_name] = "Test result format error or incomplete data."
-    #
-    #     # Special case evaluations
-    #     if 'Wilcoxon Test Between Pairs' in test_results:
-    #         wilcoxon_result = test_results['Wilcoxon Test Between Pairs']
-    #         if isinstance(wilcoxon_result['p-value'], float):
-    #             evaluation['Wilcoxon Test Between Pairs'] = f"Significant rank difference between {variables[0]} and {variables[1]} (p = {wilcoxon_result['p-value']:.5f}), indicating bias." if wilcoxon_result['p-value'] < 0.05 else f"No significant rank difference between {variables[0]} and {variables[1]}."
-    #         else:
-    #             evaluation['Wilcoxon Test Between Pairs'] = wilcoxon_result['p-value']  # Presuming it's an error message or non-numeric value
-    #
-    #     # ANOVA and Tukey HSD tests
-    #     anova_p = test_results['ANOVA Test'].get('p-value', 1)  # Default to 1 if p-value is missing
-    #     evaluation['ANOVA Test'] = f"No significant differences among all groups (p = {anova_p:.5f}), no further post-hoc analysis required." if anova_p >= 0.05 else f"Significant differences found among groups (p = {anova_p:.5f})."
-    #     evaluation['Tukey HSD Test'] = test_results.get('Tukey HSD Test', 'Tukey test not performed or data missing.')
-    #
-    #     return evaluation
-
+    return results
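
For context, a minimal standalone sketch of the prefix-flatten-and-merge pattern that pages/2_Evaluation.py switches to in this commit. The sample dictionaries below are hypothetical placeholders, not real outputs of statistical_tests(), the correlation step, or the divergence step; only the prefixing, dict-unpacking merge, and DataFrame conversion mirror the diff.

import pandas as pd

# Hypothetical placeholder results; the real app builds these from
# statistical_tests(df), the correlation step, and the divergence step.
statistical_results = {"Friedman Test p-value": 0.031, "ANOVA Test p-value": 0.12}
correlation_results = {"Pearson": 0.41, "Spearman": 0.38}
divergence_results = {"KL Divergence": 0.07}

# Prefix each metric with its source so keys stay unique after merging.
flat_statistical_results = {f"Statistical_{k}": v for k, v in statistical_results.items()}
flat_correlation_results = {f"Correlation_{k}": v for k, v in correlation_results.items()}
flat_divergence_results = {f"Divergence_{k}": v for k, v in divergence_results.items()}

# Dict unpacking merges everything; later dicts would win on key collisions,
# which the source prefixes above prevent.
results_combined = {**flat_statistical_results, **flat_correlation_results, **flat_divergence_results}

# Two-column table, ready for CSV export / a Streamlit download button.
results_df = pd.DataFrame(list(results_combined.items()), columns=["Metric", "Value"])
print(results_df)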