Corey Morris committed on
Commit
ee9e25e
·
1 Parent(s): 9f7d306

Added the basic structure for details data processing and its tests, intended for downloading Hugging Face details dataset files.

Browse files
details_data_processor.py ADDED
@@ -0,0 +1,155 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import os
3
+ import fnmatch
4
+ import json
5
+ import re
6
+ import numpy as np
7
+
8
class DetailsDataProcessor:
    """Hold the settings used to locate details/result JSON files.

    Only the constructor is active at this stage; the loading, cleanup and
    ranking steps are still commented-out scaffolding.
    """

    def __init__(self, directory: str = 'results', pattern: str = 'results*.json') -> None:
        """Record where to look for files and which file names to accept.

        Args:
            directory: Directory intended to be searched; it is only stored
                here and not read by any active code yet.
            pattern: File-name pattern; presumably glob-style, since the
                disabled file-finding scaffold matches names with
                ``fnmatch`` -- confirm once that code is enabled.
        """
        self.directory = directory
        self.pattern = pattern
        # Not wired up yet: both calls rely on methods that are still
        # commented out.
        # self.data = self.process_data()
        # self.ranked_data = self.rank_data()
15
+
16
+ # @staticmethod
17
+ # def _find_files(directory, pattern):
18
+ # for root, dirs, files in os.walk(directory):
19
+ # for basename in files:
20
+ # if fnmatch.fnmatch(basename, pattern):
21
+ # filename = os.path.join(root, basename)
22
+ # yield filename
23
+
24
+ # def _read_and_transform_data(self, filename):
25
+ # with open(filename) as f:
26
+ # data = json.load(f)
27
+ # df = pd.DataFrame(data['results']).T
28
+ # return df
29
+
30
+ # def _cleanup_dataframe(self, df, model_name):
31
+ # df = df.rename(columns={'acc': model_name})
32
+ # df.index = (df.index.str.replace('hendrycksTest-', 'MMLU_', regex=True)
33
+ # .str.replace('harness\|', '', regex=True)
34
+ # .str.replace('\|5', '', regex=True))
35
+ # return df[[model_name]]
36
+
37
+ # def _extract_mc1(self, df, model_name):
38
+ # df = df.rename(columns={'mc1': model_name})
39
+ # # rename row harness|truthfulqa:mc|0 to truthfulqa:mc1
40
+ # df.index = (df.index.str.replace('mc\|0', 'mc1', regex=True))
41
+ # # just return the harness|truthfulqa:mc1 row
42
+ # df = df.loc[['harness|truthfulqa:mc1']]
43
+ # return df[[model_name]]
44
+
45
+ # def _extract_mc2(self, df, model_name):
46
+ # # rename row harness|truthfulqa:mc|0 to truthfulqa:mc2
47
+ # df = df.rename(columns={'mc2': model_name})
48
+ # df.index = (df.index.str.replace('mc\|0', 'mc2', regex=True))
49
+ # df = df.loc[['harness|truthfulqa:mc2']]
50
+ # return df[[model_name]]
51
+
52
+ # # remove extreme outliers from column harness|truthfulqa:mc1
53
+ # def _remove_mc1_outliers(self, df):
54
+ # mc1 = df['harness|truthfulqa:mc1']
55
+ # # Identify the outliers
56
+ # # outliers_condition = mc1 > mc1.quantile(.95)
57
+ # outliers_condition = mc1 == 1.0
58
+ # # Replace the outliers with NaN
59
+ # df.loc[outliers_condition, 'harness|truthfulqa:mc1'] = np.nan
60
+ # return df
61
+
62
+
63
+
64
+ # @staticmethod
65
+ # def _extract_parameters(model_name):
66
+ # """
67
+ # Function to extract parameters from model name.
68
+ # It handles names with 'b/B' for billions and 'm/M' for millions.
69
+ # """
70
+ # # pattern to match a number followed by 'b' (representing billions) or 'm' (representing millions)
71
+ # pattern = re.compile(r'(\d+\.?\d*)([bBmM])')
72
+
73
+ # match = pattern.search(model_name)
74
+
75
+ # if match:
76
+ # num, magnitude = match.groups()
77
+ # num = float(num)
78
+
79
+ # # convert millions to billions
80
+ # if magnitude.lower() == 'm':
81
+ # num /= 1000
82
+
83
+ # return num
84
+
85
+ # # return NaN if no match
86
+ # return np.nan
87
+
88
+
89
+ # def process_data(self):
90
+
91
+ # dataframes = []
92
+ # organization_names = []
93
+ # for filename in self._find_files(self.directory, self.pattern):
94
+ # raw_data = self._read_and_transform_data(filename)
95
+ # split_path = filename.split('/')
96
+ # model_name = split_path[2]
97
+ # organization_name = split_path[1]
98
+ # cleaned_data = self._cleanup_dataframe(raw_data, model_name)
99
+ # mc1 = self._extract_mc1(raw_data, model_name)
100
+ # mc2 = self._extract_mc2(raw_data, model_name)
101
+ # cleaned_data = pd.concat([cleaned_data, mc1])
102
+ # cleaned_data = pd.concat([cleaned_data, mc2])
103
+ # organization_names.append(organization_name)
104
+ # dataframes.append(cleaned_data)
105
+
106
+
107
+ # data = pd.concat(dataframes, axis=1).transpose()
108
+
109
+ # # Add organization column
110
+ # data['organization'] = organization_names
111
+
112
+ # # Add Model Name and rearrange columns
113
+ # data['Model Name'] = data.index
114
+ # cols = data.columns.tolist()
115
+ # cols = cols[-1:] + cols[:-1]
116
+ # data = data[cols]
117
+
118
+ # # Remove the 'Model Name' column
119
+ # data = data.drop(columns=['Model Name'])
120
+
121
+ # # Add average column
122
+ # data['MMLU_average'] = data.filter(regex='MMLU').mean(axis=1)
123
+
124
+ # # Reorder columns to move 'MMLU_average' to the third position
125
+ # cols = data.columns.tolist()
126
+ # cols = cols[:2] + cols[-1:] + cols[2:-1]
127
+ # data = data[cols]
128
+
129
+ # # Drop specific columns
130
+ # data = data.drop(columns=['all', 'truthfulqa:mc|0'])
131
+
132
+ # # Add parameter count column using extract_parameters function
133
+ # data['Parameters'] = data.index.to_series().apply(self._extract_parameters)
134
+
135
+ # # move the parameters column to the front of the dataframe
136
+ # cols = data.columns.tolist()
137
+ # cols = cols[-1:] + cols[:-1]
138
+ # data = data[cols]
139
+
140
+ # # remove extreme outliers from column harness|truthfulqa:mc1
141
+ # data = self._remove_mc1_outliers(data)
142
+
143
+ # return data
144
+
145
+ # def rank_data(self):
146
+ # # add rank for each column to the dataframe
147
+ # # copy the data dataframe to avoid modifying the original dataframe
148
+ # rank_data = self.data.copy()
149
+ # for col in list(rank_data.columns):
150
+ # rank_data[col + "_rank"] = rank_data[col].rank(ascending=False, method='min')
151
+
152
+ # return rank_data
153
+
154
+ # def get_data(self, selected_models):
155
+ # return self.data[self.data.index.isin(selected_models)]
test_details_data_processing.py ADDED
@@ -0,0 +1,18 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import unittest
2
+ from details_data_processor import DetailsDataProcessor
3
+ import pandas as pd
4
+
5
class TestDetailsDataProcessor(unittest.TestCase):
    """Placeholder test suite for ``DetailsDataProcessor``."""

    def setUp(self):
        # Build a fresh processor with the default directory/pattern
        # before every test.
        self.processor = DetailsDataProcessor()

    def test_process_data(self):
        """Placeholder: should check that the result is a pandas DataFrame."""
        # Disabled until the processor assigns ``self.data`` again:
        # data = self.processor.data
        # self.assertIsInstance(data, pd.DataFrame)
15
+
16
+
17
if __name__ == '__main__':
    # Run the test suite when this file is executed directly.
    unittest.main()