Spaces:

CoreyMorris
/

MMLU-by-task-Leaderboard

Running

App Files Files Community

Corey Morris committed on Aug 20, 2023

Commit

0a77c60

1 Parent(s): 6251f5a

WIP. Updated download file. Can now download all files. Still need to integrate that code: either loop through all the files and download each one, or first combine the files into a single dataframe and then save that.

Browse files

Files changed (2) hide show

details_data_processor.py +22 -31
test_details_data_processing.py +13 -0

details_data_processor.py CHANGED Viewed

@@ -7,6 +7,7 @@ import numpy as np
 import requests
 from urllib.parse import quote
 from datetime import datetime
@@ -17,8 +18,6 @@ class DetailsDataProcessor:
     def __init__(self, directory='results', pattern='results*.json'):
         self.directory = directory
         self.pattern = pattern
-        # self.data = self.process_data()
-        # self.ranked_data = self.rank_data()
     def _find_files(self, directory='results', pattern='results*.json'):
         matching_files = []  # List to hold matching filenames
@@ -29,29 +28,22 @@ class DetailsDataProcessor:
                     matching_files.append(filename)  # Append the matching filename to the list
         return matching_files  # Return the list of matching filenames
-    # download a file from a single url and save it to a local directory
-    # @staticmethod
-    # def download_file(url, file_path):
-    #     #TODO: I may not need to save the file.  I can just read it in and convert to a dataframe
-    #     r = requests.get(url, allow_redirects=True)
-    #     open(file_path, 'wb').write(r.content)
-    #     # return dataframe
-    #     df = pd.DataFrame(r.content)
-    #     return df
     @staticmethod
     def download_file(url, save_file_path):
         # Get the current date and time
-        timestamp = datetime.now()
         # Format the timestamp as a string, suitable for use in a filename
-        filename_timestamp = timestamp.strftime("%Y-%m-%dT%H-%M-%S")
-        # Construct the full save file path
-        save_file_path = save_file_path + filename_timestamp + ".json"
-        print(save_file_path)  # Output will be something like "results_2023-08-20T12-34-56.txt"
         try:
             # Sending a GET request
@@ -62,21 +54,16 @@ class DetailsDataProcessor:
             with open(save_file_path, 'wb') as file:
                 file.write(r.content)
-            print(f"Successfully downloaded file: {save_file_path}")
         except requests.ConnectionError as e:
-            print(f"Failed to connect to the URL: {url}")
-            raise e
         except requests.HTTPError as e:
-            print(f"HTTP error occurred: {e}")
-            raise e
         except FileNotFoundError as e:
-            print(f"File not found at path: {save_file_path}")
-            raise e
         except Exception as e:
-            print(f"An unexpected error occurred: {e}")
-            raise e
-        return None
@@ -95,10 +82,14 @@ class DetailsDataProcessor:
         segments = file_path.split('/')
         bits = segments[1]
         model_name = segments[2]
-        timestamp = segments[3].split('_')[1]
         url = f'https://huggingface.co/datasets/open-llm-leaderboard/details/resolve/main/{bits}/{model_name}/details_harness%7ChendrycksTest-moral_scenarios%7C5_{quote(timestamp, safe="")}'
-        print(url)
         return url
     def pipeline(self):

 import requests
 from urllib.parse import quote
 from datetime import datetime
+import uuid
     def __init__(self, directory='results', pattern='results*.json'):
         self.directory = directory
         self.pattern = pattern
     def _find_files(self, directory='results', pattern='results*.json'):
         matching_files = []  # List to hold matching filenames
                     matching_files.append(filename)  # Append the matching filename to the list
         return matching_files  # Return the list of matching filenames
     @staticmethod
     def download_file(url, save_file_path):
+        #TODO: I may not need to save the file.  I can just read it in and convert to a dataframe
         # Get the current date and time
+        error_count = 0
+        success_count = 0
+        # timestamp = datetime.now()
         # Format the timestamp as a string, suitable for use in a filename
+        # filename_timestamp = timestamp.strftime("%Y-%m-%dT%H-%M-%S")
+        # Generate a unique UUID
+        unique_id = uuid.uuid4()
+        # Append the UUID to the filename
+        save_file_path = save_file_path  + "_" + str(unique_id) + ".json"
         try:
             # Sending a GET request
             with open(save_file_path, 'wb') as file:
                 file.write(r.content)
+            success_count += 1
         except requests.ConnectionError as e:
+            error_count += 1
         except requests.HTTPError as e:
+            error_count += 1
         except FileNotFoundError as e:
+            error_count += 1
         except Exception as e:
+            error_count += 1
+        return error_count, success_count
         segments = file_path.split('/')
         bits = segments[1]
         model_name = segments[2]
+        try:
+            timestamp = segments[3].split('_')[1]
+        except IndexError:
+            print(f"Error: {file_path}")
+            return None
         url = f'https://huggingface.co/datasets/open-llm-leaderboard/details/resolve/main/{bits}/{model_name}/details_harness%7ChendrycksTest-moral_scenarios%7C5_{quote(timestamp, safe="")}'
         return url
     def pipeline(self):

test_details_data_processing.py CHANGED Viewed

@@ -58,6 +58,19 @@ class TestDetailsDataProcessor(unittest.TestCase):
         # print(files)
         self.assertIsInstance(files, list)
 if __name__ == '__main__':
     unittest.main()

         # print(files)
         self.assertIsInstance(files, list)
+    def test_build_url_harness_types(self):
+        test_cases = [
+            ('results/shaohang/Sparse0.5_OPT-1.3/results_2023-07-19T19:10:31.005235.json', 'details',
+             'https://huggingface.co/datasets/open-llm-leaderboard/details/resolve/main/shaohang/Sparse0.5_OPT-1.3/details_harness%7ChendrycksTest-moral_scenarios%7C5_2023-07-19T19%3A10%3A31.005235.json'),
+            ('results/shaohang/Sparse0.5_OPT-1.3/results_2023-07-19T19:10:31.005235.json', 'queries',
+             'https://huggingface.co/datasets/open-llm-leaderboard/details/resolve/main/shaohang/Sparse0.5_OPT-1.3/queries_harness%7ChendrycksTest-moral_scenarios%7C5_2023-07-19T19%3A10%3A31.005235.json')
+        ]
+        for file_path, harness_type, expected in test_cases:
+            self.assertEqual(self.processor.build_url(file_path, harness_type), expected,
+                             f"Test failed for file_path: {file_path}, harness_type: {harness_type}")
 if __name__ == '__main__':
     unittest.main()