Zekun Wu committed on
Commit
6830d47
1 Parent(s): 775d0e1
Files changed (2) hide show
  1. util/evaluation.py +56 -49
  2. util/injection.py +1 -1
util/evaluation.py CHANGED
@@ -80,6 +80,13 @@ def calculate_four_fifths_rule(impact_ratios):
80
  return adverse_impact
81
 
82
  def statistical_tests(data):
 
 
 
 
 
 
 
83
  """Perform various statistical tests to evaluate potential biases."""
84
  variables = ['Privilege', 'Protect', 'Neutral']
85
  rank_suffix = '_Rank'
@@ -308,52 +315,52 @@ def statistical_tests(data):
308
  # return results
309
 
310
 
311
- def hellinger_distance(p, q):
312
- """Calculate the Hellinger distance between two probability distributions."""
313
- return np.sqrt(0.5 * np.sum((np.sqrt(p) - np.sqrt(q)) ** 2))
314
-
315
-
316
- def calculate_correlations(df):
317
- """Calculate Spearman, Pearson, and Kendall's Tau correlations for the given ranks in the dataframe."""
318
- correlations = {
319
- 'Spearman': {},
320
- 'Pearson': {},
321
- 'Kendall Tau': {}
322
- }
323
- columns = ['Privilege_Rank', 'Protect_Rank', 'Neutral_Rank']
324
- for i in range(len(columns)):
325
- for j in range(i + 1, len(columns)):
326
- col1, col2 = columns[i], columns[j]
327
- correlations['Spearman'][f'{col1} vs {col2}'] = spearmanr(df[col1], df[col2]).correlation
328
- correlations['Pearson'][f'{col1} vs {col2}'] = pearsonr(df[col1], df[col2])[0]
329
- correlations['Kendall Tau'][f'{col1} vs {col2}'] = kendalltau(df[col1], df[col2]).correlation
330
- return correlations
331
-
332
-
333
- def scores_to_prob(scores):
334
- """Convert scores to probability distributions."""
335
- value_counts = scores.value_counts()
336
- probabilities = value_counts / value_counts.sum()
337
- full_prob = np.zeros(int(scores.max()) + 1)
338
- full_prob[value_counts.index.astype(int)] = probabilities
339
- return full_prob
340
-
341
-
342
- def calculate_divergences(df):
343
- """Calculate KL, Jensen-Shannon divergences, and Hellinger distance for the score distributions."""
344
- score_columns = ['Privilege_Avg_Score', 'Protect_Avg_Score', 'Neutral_Avg_Score']
345
- probabilities = {col: scores_to_prob(df[col]) for col in score_columns}
346
- divergences = {
347
- 'KL Divergence': {},
348
- 'Jensen-Shannon Divergence': {},
349
- 'Hellinger Distance': {}
350
- }
351
- for i in range(len(score_columns)):
352
- for j in range(i + 1, len(score_columns)):
353
- col1, col2 = score_columns[i], score_columns[j]
354
- divergences['KL Divergence'][f'{col1} vs {col2}'] = entropy(probabilities[col1], probabilities[col2])
355
- divergences['Jensen-Shannon Divergence'][f'{col1} vs {col2}'] = jensenshannon(probabilities[col1],
356
- probabilities[col2])
357
- divergences['Hellinger Distance'][f'{col1} vs {col2}'] = hellinger_distance(probabilities[col1],
358
- probabilities[col2])
359
- return divergences
 
80
  return adverse_impact
81
 
82
  def statistical_tests(data):
83
+ # Add ranks for each score within each row
84
+ ranks = data[['Privilege_Avg_Score', 'Protect_Avg_Score', 'Neutral_Avg_Score']].rank(axis=1, ascending=True)
85
+
86
+ data['Privilege_Rank'] = ranks['Privilege_Avg_Score']
87
+ data['Protect_Rank'] = ranks['Protect_Avg_Score']
88
+ data['Neutral_Rank'] = ranks['Neutral_Avg_Score']
89
+
90
  """Perform various statistical tests to evaluate potential biases."""
91
  variables = ['Privilege', 'Protect', 'Neutral']
92
  rank_suffix = '_Rank'
 
315
  # return results
316
 
317
 
318
+ # def hellinger_distance(p, q):
319
+ # """Calculate the Hellinger distance between two probability distributions."""
320
+ # return np.sqrt(0.5 * np.sum((np.sqrt(p) - np.sqrt(q)) ** 2))
321
+ #
322
+ #
323
+ # def calculate_correlations(df):
324
+ # """Calculate Spearman, Pearson, and Kendall's Tau correlations for the given ranks in the dataframe."""
325
+ # correlations = {
326
+ # 'Spearman': {},
327
+ # 'Pearson': {},
328
+ # 'Kendall Tau': {}
329
+ # }
330
+ # columns = ['Privilege_Rank', 'Protect_Rank', 'Neutral_Rank']
331
+ # for i in range(len(columns)):
332
+ # for j in range(i + 1, len(columns)):
333
+ # col1, col2 = columns[i], columns[j]
334
+ # correlations['Spearman'][f'{col1} vs {col2}'] = spearmanr(df[col1], df[col2]).correlation
335
+ # correlations['Pearson'][f'{col1} vs {col2}'] = pearsonr(df[col1], df[col2])[0]
336
+ # correlations['Kendall Tau'][f'{col1} vs {col2}'] = kendalltau(df[col1], df[col2]).correlation
337
+ # return correlations
338
+ #
339
+ #
340
+ # def scores_to_prob(scores):
341
+ # """Convert scores to probability distributions."""
342
+ # value_counts = scores.value_counts()
343
+ # probabilities = value_counts / value_counts.sum()
344
+ # full_prob = np.zeros(int(scores.max()) + 1)
345
+ # full_prob[value_counts.index.astype(int)] = probabilities
346
+ # return full_prob
347
+
348
+
349
+ # def calculate_divergences(df):
350
+ # """Calculate KL, Jensen-Shannon divergences, and Hellinger distance for the score distributions."""
351
+ # score_columns = ['Privilege_Avg_Score', 'Protect_Avg_Score', 'Neutral_Avg_Score']
352
+ # probabilities = {col: scores_to_prob(df[col]) for col in score_columns}
353
+ # divergences = {
354
+ # 'KL Divergence': {},
355
+ # 'Jensen-Shannon Divergence': {},
356
+ # 'Hellinger Distance': {}
357
+ # }
358
+ # for i in range(len(score_columns)):
359
+ # for j in range(i + 1, len(score_columns)):
360
+ # col1, col2 = score_columns[i], score_columns[j]
361
+ # divergences['KL Divergence'][f'{col1} vs {col2}'] = entropy(probabilities[col1], probabilities[col2])
362
+ # divergences['Jensen-Shannon Divergence'][f'{col1} vs {col2}'] = jensenshannon(probabilities[col1],
363
+ # probabilities[col2])
364
+ # divergences['Hellinger Distance'][f'{col1} vs {col2}'] = hellinger_distance(probabilities[col1],
365
+ # probabilities[col2])
366
+ # return divergences
util/injection.py CHANGED
@@ -100,7 +100,7 @@ def process_scores_multiple(df, num_run, parameters, privilege_label, protect_la
100
  df[f'{category}_Avg_Score'] = df[f'{category}_Scores'].apply(calculate_avg_score)
101
 
102
  # Add ranks for each score within each row
103
- ranks = df[['Privilege_Avg_Score', 'Protect_Avg_Score', 'Neutral_Avg_Score']].rank(axis=1, ascending=False)
104
 
105
  df['Privilege_Rank'] = ranks['Privilege_Avg_Score']
106
  df['Protect_Rank'] = ranks['Protect_Avg_Score']
 
100
  df[f'{category}_Avg_Score'] = df[f'{category}_Scores'].apply(calculate_avg_score)
101
 
102
  # Add ranks for each score within each row
103
+ ranks = df[['Privilege_Avg_Score', 'Protect_Avg_Score', 'Neutral_Avg_Score']].rank(axis=1, ascending=True)
104
 
105
  df['Privilege_Rank'] = ranks['Privilege_Avg_Score']
106
  df['Protect_Rank'] = ranks['Protect_Avg_Score']