Corey Morris
committed on
Commit
•
b58e1f0
1
Parent(s):
cc32c4f
Updated download_file method
Browse files
- details_data_processor.py +46 -3
- test_details_data_processing.py +1 -1
details_data_processor.py
CHANGED
@@ -6,6 +6,9 @@ import re
|
|
6 |
import numpy as np
|
7 |
import requests
|
8 |
from urllib.parse import quote
|
|
|
|
|
|
|
9 |
|
10 |
class DetailsDataProcessor:
|
11 |
# Download
|
@@ -27,10 +30,50 @@ class DetailsDataProcessor:
|
|
27 |
return matching_files # Return the list of matching filenames
|
28 |
|
29 |
# download a file from a single url and save it to a local directory
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
30 |
@staticmethod
|
31 |
-
def download_file(url,
|
32 |
-
|
33 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
34 |
|
35 |
@staticmethod
|
36 |
def single_file_pipeline(url, filename):
|
|
|
6 |
import numpy as np
|
7 |
import requests
|
8 |
from urllib.parse import quote
|
9 |
+
from datetime import datetime
|
10 |
+
|
11 |
+
|
12 |
|
13 |
class DetailsDataProcessor:
|
14 |
# Download
|
|
|
30 |
return matching_files # Return the list of matching filenames
|
31 |
|
32 |
# download a file from a single url and save it to a local directory
|
33 |
+
# @staticmethod
|
34 |
+
# def download_file(url, file_path):
|
35 |
+
# #TODO: I may not need to save the file. I can just read it in and convert to a dataframe
|
36 |
+
# r = requests.get(url, allow_redirects=True)
|
37 |
+
# open(file_path, 'wb').write(r.content)
|
38 |
+
# # return dataframe
|
39 |
+
# df = pd.DataFrame(r.content)
|
40 |
+
# return df
|
41 |
+
|
42 |
+
|
43 |
@staticmethod
|
44 |
+
def download_file(url, save_file_path):
|
45 |
+
# Get the current date and time
|
46 |
+
timestamp = datetime.now()
|
47 |
+
|
48 |
+
# Format the timestamp as a string, suitable for use in a filename
|
49 |
+
filename_timestamp = timestamp.strftime("%Y-%m-%dT%H-%M-%S")
|
50 |
+
|
51 |
+
# Example usage in a filename
|
52 |
+
save_file_path = save_file_path + filename_timestamp + ".json"
|
53 |
+
|
54 |
+
print(save_file_path) # Output will be something like "results_2023-08-20T12-34-56.txt"
|
55 |
+
|
56 |
+
try:
|
57 |
+
# Sending a GET request
|
58 |
+
r = requests.get(url, allow_redirects=True)
|
59 |
+
r.raise_for_status() # Raises an HTTPError if the HTTP request returned an unsuccessful status code
|
60 |
+
|
61 |
+
# Writing the content to the specified file
|
62 |
+
with open(save_file_path, 'wb') as file:
|
63 |
+
file.write(r.content)
|
64 |
+
|
65 |
+
print(f"Successfully downloaded file: {save_file_path}")
|
66 |
+
except requests.ConnectionError:
|
67 |
+
print(f"Failed to connect to the URL: {url}")
|
68 |
+
except requests.HTTPError as e:
|
69 |
+
print(f"HTTP error occurred: {e}")
|
70 |
+
except FileNotFoundError:
|
71 |
+
print(f"File not found at path: {save_file_path}")
|
72 |
+
except Exception as e:
|
73 |
+
print(f"An unexpected error occurred: {e}")
|
74 |
+
|
75 |
+
return None
|
76 |
+
|
77 |
|
78 |
@staticmethod
|
79 |
def single_file_pipeline(url, filename):
|
test_details_data_processing.py
CHANGED
@@ -16,7 +16,7 @@ class TestDetailsDataProcessor(unittest.TestCase):
|
|
16 |
# self.assertIsInstance(data, pd.DataFrame)
|
17 |
|
18 |
def test_download_file(self):
|
19 |
-
DetailsDataProcessor.download_file('https://www.google.com', '
|
20 |
self.assertTrue(os.path.exists('test.html'))
|
21 |
os.remove('test.html')
|
22 |
|
|
|
16 |
# self.assertIsInstance(data, pd.DataFrame)
|
17 |
|
18 |
def test_download_file(self):
|
19 |
+
DetailsDataProcessor.download_file('https://www.google.com', 'test_file_please_remove')
|
20 |
self.assertTrue(os.path.exists('test.html'))
|
21 |
os.remove('test.html')
|
22 |
|