heymenn committed on
Commit
718f3ed
1 Parent(s): ffe8e3b

Update scrape_3gpp.py

Browse files
Files changed (1) hide show
  1. scrape_3gpp.py +21 -44
scrape_3gpp.py CHANGED
@@ -76,50 +76,27 @@ def scrape(url, folder_name, status_list, sorted_files, progress=gr.Progress()):
76
  df = pd.DataFrame() # Initialize df to ensure it's always defined
77
  excel_file = "guide_status.xlsx"
78
 
79
- # Try to process the Excel file if provided and valid
80
- if excel_file and os.path.exists(excel_file):
81
- try:
82
- df = pd.read_excel(excel_file)
83
- print(f"Initial DataFrame size: {len(df)}")
84
-
85
-
86
- if 'TDoc Status' in df.columns and status_list:
87
- df = df[df['TDoc Status'].isin(status_list)]
88
- print(f"Filtered DataFrame size: {len(df)}")
89
-
90
- if not df.empty:
91
- if 'TDoc' in df.columns and not df['TDoc'].isnull().all():
92
- status_filenames = [f"{url}{row['TDoc']}.zip" for index, row in df.iterrows()]
93
- elif 'URL' in df.columns and not df['URL'].isnull().all():
94
- status_filenames = df['URL'].tolist()
95
-
96
- print(f"Filenames from Excel: {status_filenames}")
97
- except Exception as e:
98
- print(f"Error reading Excel file: {e}")
99
-
100
- # If no valid Excel file is given or no status_filenames are found, download zip files directly from the URL
101
- if not excel_file or not status_filenames:
102
- print("Downloading zip files directly from the URL...")
103
- response = requests.get(url)
104
- soup = BeautifulSoup(response.content, 'html.parser')
105
-
106
- # Select all zip files
107
- zip_links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.zip') ]
108
-
109
- sorted_files_tab = []
110
- # Check if the user selected some filters
111
- if len(sorted_files) != 0:
112
- for link in zip_links:
113
- for file in sorted_files:
114
- if file in link:
115
- sorted_files_tab.append(link)
116
-
117
- if len(sorted_files_tab) != 0:
118
- zip_links = sorted_files_tab
119
-
120
- # Construct absolute URLs for zip files
121
- status_filenames = [url + link if not link.startswith('http') else link for link in zip_links]
122
- print(f"Filenames from URL: {status_filenames}")
123
 
124
  download_directory = folder_name
125
  if not os.path.exists(download_directory):
 
76
  df = pd.DataFrame() # Initialize df to ensure it's always defined
77
  excel_file = "guide_status.xlsx"
78
 
79
+ print("Downloading zip files directly from the URL...")
80
+ response = requests.get(url)
81
+ soup = BeautifulSoup(response.content, 'html.parser')
82
+
83
+ # Select all zip files
84
+ zip_links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.zip') ]
85
+
86
+ sorted_files_tab = []
87
+ # Check if the user selected some filters
88
+ if len(sorted_files) != 0:
89
+ for link in zip_links:
90
+ for file in sorted_files:
91
+ if file in link:
92
+ sorted_files_tab.append(link)
93
+
94
+ if len(sorted_files_tab) != 0:
95
+ zip_links = sorted_files_tab
96
+
97
+ # Construct absolute URLs for zip files
98
+ status_filenames = [url + link if not link.startswith('http') else link for link in zip_links]
99
+ print(f"Filenames from URL: {status_filenames}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
  download_directory = folder_name
102
  if not os.path.exists(download_directory):