File size: 5,408 Bytes
92b387d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
import pandas as pd
import json


class ResultsProcessor:
    """Turn a nested evaluation-results dictionary into tidy pandas
    DataFrames for reporting (overall performance, fairness, data
    distribution and perturbation-robustness views).

    Parameters
    ----------
    prompt_option :
        Stored as-is; not read by any method in this class.
    result_file :
        Stored as-is; not read by any method in this class.
    data_dict : dict
        Nested results dictionary. The keys each method reads are
        documented on that method.
    """

    def __init__(self, prompt_option, result_file, data_dict):
        self.prompt_option = prompt_option
        self.result_file = result_file
        self.data_dict = data_dict

    def get_overall_performance(self):
        """Return ``data_dict['Overall performance']`` expressed as a
        percentage, rounded to two decimal places."""
        return round(self.data_dict["Overall performance"] * 100, 2)

    def get_bias_ratios_df(self):
        """Return one bias ratio per protected characteristic.

        Reads ``data_dict['Fairness results']``, a mapping of
        characteristic name -> ``{'OverallFairness': ratio, ...}``.

        Returns
        -------
        pandas.DataFrame
            Columns ``Characteristic`` and ``Bias ratio``, sorted
            alphabetically by characteristic.
        """
        fairness_results = self.data_dict['Fairness results']
        return pd.DataFrame({
            'Characteristic': list(fairness_results),
            'Bias ratio': [v['OverallFairness'] for v in fairness_results.values()],
        }).sort_values(by=['Characteristic'])

    def get_global_perturbers_df(self):
        """Build a long-format DataFrame of overall performance under
        every perturbation level of every perturber family.

        Reads ``data_dict['Perturber Families']`` (list of
        ``{'family name': str, 'levels': [level, ...]}``) and the
        per-family ``PerformancePerturbers`` lookup under
        ``data_dict['Performance Robustness']``.

        Returns
        -------
        pandas.DataFrame
            One row per (family, level). ``normalized performance`` is
            the level's performance divided by the performance at the
            family's first level, so each family's first row is 1.0.
            A constant ``category`` column is set to ``'Overall'``.
        """
        global_perturber_families = self.data_dict['Perturber Families']
        family_wise = self.data_dict['Performance Robustness']['Perturber family wise results']

        family_levels = []
        perf_pert_values = []
        normalized_perf_pert_values = []
        family_names_list = []
        levels_index_list = []
        for family in global_perturber_families:
            family_name = family['family name']
            # TODO: change the structure of post-processing so this deep
            # lookup is not needed.
            family_results = family_wise[family_name]["PerformancePerturbers"]
            levels = family['levels']
            family_levels += levels
            # Baseline is the first level's performance.
            # NOTE(review): a baseline of 0 would raise ZeroDivisionError
            # below — assumed nonzero; confirm against upstream producer.
            original_perf = family_results[levels[0]]
            for level_index, level in enumerate(levels):
                perf_pert_values.append(family_results[level])
                normalized_perf_pert_values.append(family_results[level] / original_perf)
                family_names_list.append(family_name)
                levels_index_list.append(level_index)

        t_pert_df_global = pd.DataFrame({
            'Perturbation level': family_levels,
            'Performance': perf_pert_values,
            'normalized performance': normalized_perf_pert_values,
            'Perturbation family': family_names_list,
            'Levels': levels_index_list,
        })
        t_pert_df_global['category'] = 'Overall'

        return t_pert_df_global

    def get_data_distribution(self, embedder_option):
        """Return the per-category share of data and estimated point
        counts for one embedder.

        Reads the ``CI_Table`` under
        ``data_dict['Performance results'][embedder_option]`` and the
        total ``data_dict['n points']``.

        Returns
        -------
        pandas.DataFrame
            Columns ``Category``, ``Share of data`` and
            ``Number of points``.
        """
        embedder_perf_ci_table = self.data_dict['Performance results'][embedder_option]['CI_Table']
        n_points = self.data_dict['n points']
        rows = list(embedder_perf_ci_table.values())
        return pd.DataFrame({
            'Category': [row['category'] for row in rows],
            'Share of data': [row['Share of Data'] for row in rows],
            # 'Share of Data' is a percentage; truncate to whole points.
            'Number of points': [int(row['Share of Data'] * n_points / 100) for row in rows],
        })

    def get_fairness_confidence_interval_df(self, embedder_option):
        """Return fairness estimates with confidence-interval bounds for
        one embedder.

        Reads the ``CI_Table`` under
        ``data_dict['Fairness results'][embedder_option]``.

        Returns
        -------
        pandas.DataFrame
            Columns ``Category``, ``Estimate``, ``Upper``, ``Lower``, a
            float ``Index`` (row position, used for plotting), and the
            derived ``Diff upper`` / ``Diff lower`` error-bar widths.
        """
        embedder_fair_ci_table = self.data_dict['Fairness results'][embedder_option]['CI_Table']
        rows = list(embedder_fair_ci_table.values())
        t_fair_df = pd.DataFrame({
            'Category': [row['category'] for row in rows],
            'Estimate': [row['Estimate'] for row in rows],
            'Upper': [row['Upper'] for row in rows],
            'Lower': [row['Lower'] for row in rows],
            'Index': [float(i) for i in range(len(rows))],
        })

        # Asymmetric error-bar extents around the estimate.
        t_fair_df['Diff upper'] = t_fair_df['Upper'] - t_fair_df['Estimate']
        t_fair_df['Diff lower'] = t_fair_df['Estimate'] - t_fair_df['Lower']

        return t_fair_df

    def get_performance_robustness(self, embedder_option):
        """Collect per-category robustness results for one embedder,
        aligned with the global per-family perturbation curves.

        Reads ``data_dict['Perturber Families']`` and the per-level
        tables under
        ``data_dict['Performance Robustness']['Embedder wise results']``.

        Returns
        -------
        dict
            ``merged_dfs_list``: one concatenated DataFrame per family
            (per-level tables annotated with ``Perturber``,
            ``Perturber family`` and ``Levels`` columns);
            ``t_pert_df_global_temps_list``: the matching slice of
            :meth:`get_global_perturbers_df` per family;
            ``family_names_list``: family names in the same order.
        """
        t_pert_df_global = self.get_global_perturbers_df()
        global_perturber_families = self.data_dict['Perturber Families']
        t_result = self.data_dict['Performance Robustness']['Embedder wise results'][embedder_option]

        merged_dfs_list = []
        t_pert_df_global_temps_list = []
        family_names_list = []
        for family in global_perturber_families:
            family_name = family['family name']
            per_level_dfs = []
            for level_index, level in enumerate(family['levels']):
                df = pd.DataFrame(t_result[level])
                df['Perturber'] = level
                df['Perturber family'] = family_name
                df['Levels'] = level_index
                per_level_dfs.append(df)
            merged_dfs_list.append(pd.concat(per_level_dfs, axis=0))
            family_names_list.append(family_name)

            # Independent copy so downstream mutation cannot alias the
            # shared global frame.
            family_mask = t_pert_df_global['Perturbation family'] == family_name
            t_pert_df_global_temps_list.append(t_pert_df_global[family_mask].copy(deep=True))

        return {
            'merged_dfs_list': merged_dfs_list,
            't_pert_df_global_temps_list': t_pert_df_global_temps_list,
            'family_names_list': family_names_list,
        }