MaksG committed on
Commit 31f53ef
1 Parent(s): 1f21438

Update scrape_3gpp.py

Files changed (1)
  1. scrape_3gpp.py +36 -106
scrape_3gpp.py CHANGED
@@ -65,135 +65,65 @@ def extract_statuses(url):
 def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
     filenames = []
     status_filenames = []
-    # Check if the excel_file argument is provided and if the file exists.
-    excel_file_path = "guide_status.xlsx"  # Hardcoded path to the Excel file
+    df = pd.DataFrame()  # Initialize df to ensure it's always defined

-    if os.path.exists(excel_file_path):
+    if os.path.exists(excel_file):
         try:
-            df = pd.read_excel(excel_file_path)
+            df = pd.read_excel(excel_file)
             print(f"Initial DataFrame size: {len(df)}")

-            if 'TDoc Status' in df.columns:
-                # Check if status_list is empty; if so, consider all rows valid
-                if not status_list:
-                    print("No status list provided, considering all statuses.")
-                    # No need to filter df based on status_list, as we're considering all statuses
-                else:
-                    # Proceed with filtering if status_list is not empty
-                    df = df[df['TDoc Status'].isin(status_list)]
-                    print(f"Filtered DataFrame size: {len(df)}")
-
-                if df.empty:
-                    print("No files match the specified 'TDoc Status'.")
-
+            if 'TDoc Status' in df.columns and status_list:
+                df = df[df['TDoc Status'].isin(status_list)]
+                print(f"Filtered DataFrame size: {len(df)}")
+            else:
+                # If status_list is empty, consider all statuses
+                print("No filtering applied based on TDoc Status")
+
+            if not df.empty:
+                if 'TDoc' in df.columns and not df['TDoc'].isnull().all():
+                    status_filenames = [f"{url}{row['TDoc']}.zip" for index, row in df.iterrows()]
+                elif 'URL' in df.columns and not df['URL'].isnull().all():
+                    status_filenames = df['URL'].tolist()
                 else:
-                    if 'TDoc' in df.columns and not df['TDoc'].isnull().all():
-                        status_filenames = [f"{url}{row['TDoc']}.zip" for index, row in df.iterrows()]
-                    elif 'URL' in df.columns and not df['URL'].isnull().all():
-                        status_filenames = df['URL'].tolist()
-                    else:
-                        print("No valid 'File' or 'URL' entries found for the filtered statuses.")
-
-                    print(f"Filenames: {status_filenames}")
+                    print("No valid 'TDoc' or 'URL' entries found.")
+
+                print(f"Filenames: {status_filenames}")
             else:
-                print("'TDoc Status' column not found in the Excel file.")
-
-        except Exception as e:
-            print(f"Error reading Excel file: {e}")
-
-    if excel_file and os.path.exists(excel_file):
-        try:
-            df = pd.read_excel(excel_file)
+                print("DataFrame is empty after filtering.")

-            # If an 'Actions' column exists, filter on it; otherwise build URLs from the 'File' or 'URL' columns
-            if 'Actions' in df.columns:
-                df = df[df['Actions'] == 'x']
-
-            elif 'File' in df.columns:
-                filenames = [f"{url}{row['File']}.zip" for index, row in df.iterrows()]
-            elif 'URL' in df.columns:
-                filenames = df['URL'].tolist()
         except Exception as e:
             print(f"Error reading Excel file: {e}")
-            # Optionally, handle the error or return a message if needed

-    # If no Excel file is provided or found, or if it lacks 'TDoc'/'URL', the function can still continue with predefined URLs or other logic
     download_directory = folder_name
     if not os.path.exists(download_directory):
         os.makedirs(download_directory)

     pourcentss = 0.05
-    print(f'filenames: {status_filenames}')
-    if not filenames and not status_filenames:
-        print("No Excel file provided, or no valid URLs found in the file.")
-        # You can either return here or continue with other predefined logic
-        response = requests.get(url)
-
-        # Parse the HTML content of the page
-        soup = BeautifulSoup(response.content, "html.parser")
-
-        # Find all <a> tags with href attributes (links)
-        links = soup.find_all("a", href=True)
-
-        # Keep only the links ending in ".zip"
-        zip_links = [link['href'] for link in links if link['href'].endswith('.zip')]
-
-        # Download each zip file
-        for zip_link in zip_links:
-            progress(pourcentss, desc='Downloading')
-            pourcentss += 0.4 / len(df)
-            # Build the absolute URL of the zip file
-            absolute_url = urljoin(url, zip_link)
-
-            # Extract the filename from the URL
-            filename = os.path.basename(absolute_url)
-
-            # Path where the file will be saved
-            save_path = os.path.join(download_directory, filename)
-
-            # Send a GET request to download the file
-            with requests.get(absolute_url, stream=True) as r:
+
+    if not status_filenames:
+        print("No Excel file provided, or no valid URLs found in the file.")
+        return False, 0
+
+    # Proceed with downloading files using the filenames list
+    for file_url in status_filenames:
+        filename = os.path.basename(file_url)
+        save_path = os.path.join(download_directory, filename)
+        progress(pourcentss, desc='Downloading')
+        # Adjust progress calculation based on the actual number of files
+        pourcentss += 0.4 / len(status_filenames)
+        try:
+            with requests.get(file_url, stream=True) as r:
                 r.raise_for_status()
                 with open(save_path, 'wb') as f:
                     for chunk in r.iter_content(chunk_size=8192):
                         f.write(chunk)
-
-    elif not filenames:
-        # Proceed with downloading files using the status_filenames list
-        for file_url in status_filenames:
-            filename = os.path.basename(file_url)
-            save_path = os.path.join(download_directory, filename)
-            progress(pourcentss, desc='Downloading')
-            pourcentss += 0.4 / len(df)
-            try:
-                with requests.get(file_url, stream=True) as r:
-                    r.raise_for_status()
-                    with open(save_path, 'wb') as f:
-                        for chunk in r.iter_content(chunk_size=8192):
-                            f.write(chunk)
-            except requests.exceptions.HTTPError as e:
-                print(f"skipped file: {file_url}: {e}")
-
-    else:
-        # Proceed with downloading files using the filenames list
-        for file_url in filenames:
-            filename = os.path.basename(file_url)
-            save_path = os.path.join(download_directory, filename)
-            progress(pourcentss, desc='Downloading')
-            pourcentss += 0.4 / len(df)
-            try:
-                with requests.get(file_url, stream=True) as r:
-                    r.raise_for_status()
-                    with open(save_path, 'wb') as f:
-                        for chunk in r.iter_content(chunk_size=8192):
-                            f.write(chunk)
-            except requests.exceptions.HTTPError as e:
-                print(f"HTTP error occurred: {file_url}: {e}")
-                return False, "There is no 'Actions' column, or it is misspelled; expected format: 'Actions'"
-
-    return True, len(df)
+        except requests.exceptions.HTTPError as e:
+            print(f"HTTP error occurred: {file_url}: {e}")
+            # Decide how to handle HTTP errors (e.g., skip this file, stop the process, etc.)
+
+    # Ensure the return value reports success/failure and the number of processed files
+    return True, len(status_filenames)
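For context, a minimal sketch of how the revised scrape() could be driven from a Gradio app, since its progress argument is meant to be injected by a Gradio event. The wrapper function, the import path, and the plain-text inputs below are illustrative assumptions, not part of this commit:

import gradio as gr

from scrape_3gpp import scrape  # assumes scrape_3gpp.py is importable from the working directory

def run(url, excel_file, folder_name, progress=gr.Progress()):
    # An empty status_list means the revised code applies no 'TDoc Status' filtering.
    ok, count = scrape(url, excel_file, folder_name, status_list=[], progress=progress)
    return f"success={ok}, files={count}"

# Hypothetical wiring; the actual app layout is not shown in this commit.
demo = gr.Interface(fn=run, inputs=["text", "text", "text"], outputs="text")
demo.launch()

Note that after this commit scrape() returns (False, 0) when no usable URLs are found and (True, len(status_filenames)) otherwise, so a wrapper like the one above can report the number of downloads directly.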