Corey Morris committed on
Commit
dcadab7
1 Parent(s): b94ee8f

Extracting parameter data from the names of the models

Browse files
Files changed (1) hide show
  1. result_data_processor.py +39 -1
result_data_processor.py CHANGED
@@ -2,6 +2,8 @@ import pandas as pd
2
  import os
3
  import fnmatch
4
  import json
 
 
5
 
6
  class ResultDataProcessor:
7
 
@@ -31,6 +33,31 @@ class ResultDataProcessor:
31
  .str.replace('\|5', '', regex=True))
32
  return df[[model_name]]
33
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
34
  def process_data(self):
35
  dataframes = [self._cleanup_dataframe(self._read_and_transform_data(filename), filename.split('/')[2])
36
  for filename in self._find_files(self.directory, self.pattern)]
@@ -55,7 +82,18 @@ class ResultDataProcessor:
55
  data = data[cols]
56
 
57
  # Drop specific columns
58
- return data.drop(columns=['all', 'truthfulqa:mc|0'])
 
 
 
 
 
 
 
 
 
 
 
59
 
60
  def get_data(self, selected_models):
61
  return self.data[self.data.index.isin(selected_models)]
 
2
  import os
3
  import fnmatch
4
  import json
5
+ import re
6
+ import numpy as np
7
 
8
  class ResultDataProcessor:
9
 
 
33
  .str.replace('\|5', '', regex=True))
34
  return df[[model_name]]
35
 
36
+ @staticmethod
37
+ def _extract_parameters(model_name):
38
+ """
39
+ Function to extract parameters from model name.
40
+ It handles names with 'b/B' for billions and 'm/M' for millions.
41
+ """
42
+ # pattern to match a number followed by 'b' (representing billions) or 'm' (representing millions)
43
+ pattern = re.compile(r'(\d+\.?\d*)([bBmM])')
44
+
45
+ match = pattern.search(model_name)
46
+
47
+ if match:
48
+ num, magnitude = match.groups()
49
+ num = float(num)
50
+
51
+ # convert millions to billions
52
+ if magnitude.lower() == 'm':
53
+ num /= 1000
54
+
55
+ return num
56
+
57
+ # return NaN if no match
58
+ return np.nan
59
+
60
+
61
  def process_data(self):
62
  dataframes = [self._cleanup_dataframe(self._read_and_transform_data(filename), filename.split('/')[2])
63
  for filename in self._find_files(self.directory, self.pattern)]
 
82
  data = data[cols]
83
 
84
  # Drop specific columns
85
+ data.drop(columns=['all', 'truthfulqa:mc|0'])
86
+
87
+
88
+ # Add parameter count column using extract_parameters function
89
+ data['Parameters'] = data.index.to_series().apply(self._extract_parameters)
90
+
91
+ # move the parameters column to the front of the dataframe
92
+ cols = data.columns.tolist()
93
+ cols = cols[-1:] + cols[:-1]
94
+ data = data[cols]
95
+
96
+ return data
97
 
98
  def get_data(self, selected_models):
99
  return self.data[self.data.index.isin(selected_models)]