File size: 6,967 Bytes
843a5ef dcadab7 d96fdf9 843a5ef ee5ac8e d96fdf9 ee5ac8e d96fdf9 ee5ac8e 843a5ef 31bed1a 843a5ef 794b32b ee5ac8e 794b32b 68bce52 794b32b ee5ac8e abac22e e03b231 abac22e dcadab7 843a5ef 6d41115 e79bcf3 6d41115 68bce52 d96fdf9 68bce52 6d41115 843a5ef ee5ac8e e79bcf3 ee5ac8e 843a5ef ee5ac8e 843a5ef ee5ac8e 843a5ef 7d69bda ee5ac8e 9549fcc dcadab7 7d69bda dcadab7 e03b231 a5840fb 1a1910c dcadab7 31bed1a a5840fb 31bed1a ee5ac8e 843a5ef ee5ac8e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 |
import pandas as pd
import os
import fnmatch
import json
import re
import numpy as np
import logging
# Configure the root logger to write records of level ERROR and above to
# 'error_log.log' (a relative path, so it resolves against the working directory).
logging.basicConfig(filename='error_log.log', level=logging.ERROR)
class ResultDataProcessor:
    """Collect per-model evaluation result files into a single DataFrame.

    Result files are discovered under ``directory`` (one file is kept per
    folder), parsed, and combined into a table with one row per model.
    The table is available as ``data``; ``ranked_data`` holds the same table
    with an extra ``<column>_rank`` column for every column.

    Note: constructing an instance runs the whole pipeline, which reads
    ``contaminated_models.txt`` from the working directory and writes a
    dated ``processed_data_<YYYY-MM-DD>.csv`` file there.
    """

    # A number followed by 'b'/'B' (billions) or 'm'/'M' (millions).
    _PARAMETER_PATTERN = re.compile(r'(\d+\.?\d*)([bBmM])')

    # Columns removed from the final table when present.
    _DROPPED_COLUMNS = ['all', 'truthfulqa:mc|0']

    def __init__(self, directory='results', pattern='results*.json'):
        """Store the search settings and immediately build ``data`` and ``ranked_data``.

        Args:
            directory: Root folder that is walked recursively for result files.
            pattern: ``fnmatch``-style pattern a file's base name must match.
        """
        self.directory = directory
        self.pattern = pattern
        self.data = self.process_data()
        self.ranked_data = self.rank_data()

    def _find_files(self, directory='results', pattern='results*.json'):
        """Return one matching result file per folder found under ``directory``.

        When a folder holds several matching files, the one whose name sorts
        last is kept; sorting makes the choice independent of the order in
        which the file system lists the entries.

        Returns:
            A list of file paths, in the order their folders were visited.
        """
        matching_files = {}
        for root, _dirs, files in os.walk(directory):
            for basename in sorted(files):
                if fnmatch.fnmatch(basename, pattern):
                    # Later (lexicographically greater) names overwrite earlier ones.
                    matching_files[root] = os.path.join(root, basename)
        # TODO decide on removing this since I am catching the error when processing the file
        return [
            filename
            for root, filename in matching_files.items()
            if 'gpt-j-6b' not in root
        ]

    def _read_and_transform_data(self, filename):
        """Load a result file and return its ``'results'`` mapping as a DataFrame.

        The frame is transposed so that the top-level keys of ``'results'``
        become the index and the nested keys become the columns.
        """
        with open(filename, encoding='utf-8') as f:
            data = json.load(f)
        return pd.DataFrame(data['results']).T

    def _cleanup_dataframe(self, df, model_name):
        """Return the ``acc`` column renamed to ``model_name`` with tidied row labels.

        Row labels have ``hendrycksTest-`` replaced by ``MMLU_`` and the
        ``harness|`` and ``|5`` fragments removed.

        Raises:
            KeyError: If ``df`` has no ``acc`` column.
        """
        df = df.rename(columns={'acc': model_name})
        df.index = (df.index.str.replace('hendrycksTest-', 'MMLU_', regex=True)
                    .str.replace(r'harness\|', '', regex=True)
                    .str.replace(r'\|5', '', regex=True))
        return df[[model_name]]

    def _extract_truthfulqa_metric(self, df, model_name, metric):
        """Return the single ``harness|truthfulqa:<metric>`` row for ``model_name``.

        The ``metric`` column is renamed to ``model_name`` and the ``mc|0``
        fragment of the row labels is replaced by ``metric`` (so
        ``harness|truthfulqa:mc|0`` becomes ``harness|truthfulqa:<metric>``).

        Raises:
            KeyError: If the metric column or the expected row is missing.
        """
        df = df.rename(columns={metric: model_name})
        df.index = df.index.str.replace(r'mc\|0', metric, regex=True)
        df = df.loc[[f'harness|truthfulqa:{metric}']]
        return df[[model_name]]

    def _extract_mc1(self, df, model_name):
        """Return the ``harness|truthfulqa:mc1`` row as a one-column frame."""
        return self._extract_truthfulqa_metric(df, model_name, 'mc1')

    def _extract_mc2(self, df, model_name):
        """Return the ``harness|truthfulqa:mc2`` row as a one-column frame."""
        return self._extract_truthfulqa_metric(df, model_name, 'mc2')

    def _remove_mc1_outliers(self, df):
        """Replace ``harness|truthfulqa:mc1`` values equal to 1.0 with NaN.

        The frame is modified in place and also returned.
        """
        mc1 = df['harness|truthfulqa:mc1']
        # A score of exactly 1.0 is treated as an extreme outlier.
        outliers_condition = mc1 == 1.0
        df.loc[outliers_condition, 'harness|truthfulqa:mc1'] = np.nan
        return df

    @staticmethod
    def _extract_parameters(model_name):
        """Extract the parameter count, in billions, from a model name.

        Handles names with 'b/B' for billions and 'm/M' for millions; the
        first such number in the name is used.

        Returns:
            The count as a float (millions are divided by 1000), or NaN when
            the name contains no such number.
        """
        match = ResultDataProcessor._PARAMETER_PATTERN.search(model_name)
        if match is None:
            return np.nan
        num, magnitude = match.groups()
        num = float(num)
        # convert millions to billions
        if magnitude.lower() == 'm':
            num /= 1000
        return num

    def _split_result_path(self, filename):
        """Return ``(organization_name, model_name)`` for a result file path.

        The names are the first two path components below ``self.directory``.
        Working relative to the search root keeps this correct when the root
        has several components and on platforms whose separator is not ``/``.

        Raises:
            IndexError: If the file sits directly inside ``self.directory``.
        """
        relative_parts = os.path.relpath(filename, self.directory).split(os.sep)
        return relative_parts[0], relative_parts[1]

    def process_data(self):
        """Build the combined results table, one row per model.

        Files that cannot be processed are logged and skipped. The resulting
        table starts with the ``organization`` and ``Parameters`` columns,
        contains an ``MMLU_average`` column (row-wise mean of all columns
        whose name contains ``MMLU``), has mc1 scores of exactly 1.0 blanked
        out, and excludes models listed in ``contaminated_models.txt``.

        Side effects:
            Writes ``processed_data_<YYYY-MM-DD>.csv`` to the working directory.

        Raises:
            ValueError: If no result file could be processed.
        """
        dataframes = []
        organization_names = []
        for filename in self._find_files(self.directory, self.pattern):
            try:
                raw_data = self._read_and_transform_data(filename)
                organization_name, model_name = self._split_result_path(filename)
                cleaned_data = pd.concat([
                    self._cleanup_dataframe(raw_data, model_name),
                    self._extract_mc1(raw_data, model_name),
                    self._extract_mc2(raw_data, model_name),
                ])
                # Append both only after every step succeeded so the two
                # lists always stay the same length.
                organization_names.append(organization_name)
                dataframes.append(cleaned_data)
            except Exception as e:
                # Best effort: a single bad file must not abort the whole run.
                logging.error(f'Error processing {filename}')
                logging.error(f'The error is: {e}')
                continue

        if not dataframes:
            raise ValueError(
                f'No result files could be processed from {self.directory!r} '
                f'with pattern {self.pattern!r}'
            )

        # One column per model -> transpose to one row per model.
        data = pd.concat(dataframes, axis=1).transpose()
        data['organization'] = organization_names

        # Add the average column and move it to the third position.
        data['MMLU_average'] = data.filter(regex='MMLU').mean(axis=1)
        cols = data.columns.tolist()
        data = data[cols[:2] + cols[-1:] + cols[2:-1]]

        # Drop the aggregate / raw columns; tolerate their absence.
        data = data.drop(columns=self._DROPPED_COLUMNS, errors='ignore')

        # Add the parameter count, then move 'organization' and 'Parameters'
        # (currently the last two columns) to the front, in that order.
        data['Parameters'] = data.index.to_series().apply(self._extract_parameters)
        cols = data.columns.tolist()
        data = data[cols[-2:] + cols[:-2]]

        # remove extreme outliers from column harness|truthfulqa:mc1
        data = self._remove_mc1_outliers(data)
        data = self.manual_removal_of_models(data)

        # save to csv with the current date as part of the filename
        data.to_csv(f'processed_data_{pd.Timestamp.now().strftime("%Y-%m-%d")}.csv')
        return data

    def manual_removal_of_models(self, df):
        """Drop rows whose index appears in ``contaminated_models.txt``.

        The file is read from the working directory, one model name per line;
        it lists models verified to be trained on evaluation data.

        Raises:
            FileNotFoundError: If ``contaminated_models.txt`` does not exist.
        """
        with open('contaminated_models.txt') as f:
            contaminated_models = f.read().splitlines()
        return df[~df.index.isin(contaminated_models)]

    def rank_data(self):
        """Return a copy of ``self.data`` with a ``<column>_rank`` column per column.

        Ranks are descending (the largest value gets rank 1) and tied values
        share the lowest rank of their group. ``self.data`` is not modified.
        """
        rank_data = self.data.copy()
        # Iterate over a snapshot of the columns because new ones are added.
        for col in list(rank_data.columns):
            rank_data[col + "_rank"] = rank_data[col].rank(ascending=False, method='min')
        return rank_data

    def get_data(self, selected_models):
        """Return the rows of ``self.data`` whose index is in ``selected_models``."""
        return self.data[self.data.index.isin(selected_models)]
|