heymenn commited on
Commit
103847d
1 Parent(s): 7a48811

Add the sorting of files (#2)

Browse files

- Add the sorting of files (621a4e205f9990bac2cf0a0f044a2c82f5d99f4f)

Files changed (1) hide show
  1. scrape_3gpp.py +54 -4
scrape_3gpp.py CHANGED
@@ -69,17 +69,23 @@ from bs4 import BeautifulSoup
69
  import pandas as pd
70
  import gradio as gr
71
 
72
- def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
 
 
 
73
  filenames = []
74
  status_filenames = []
75
  df = pd.DataFrame() # Initialize df to ensure it's always defined
76
 
77
  # Try to process the Excel file if provided and valid
 
78
  if excel_file and os.path.exists(excel_file):
79
  try:
80
  df = pd.read_excel(excel_file)
81
  print(f"Initial DataFrame size: {len(df)}")
82
 
 
 
83
  if 'TDoc Status' in df.columns and status_list:
84
  df = df[df['TDoc Status'].isin(status_list)]
85
  print(f"Filtered DataFrame size: {len(df)}")
@@ -99,7 +105,20 @@ def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
99
  print("Downloading zip files directly from the URL...")
100
  response = requests.get(url)
101
  soup = BeautifulSoup(response.content, 'html.parser')
102
- zip_links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.zip')]
 
 
 
 
 
 
 
 
 
 
 
 
 
103
 
104
  # Construct absolute URLs for zip files
105
  status_filenames = [url + link if not link.startswith('http') else link for link in zip_links]
@@ -111,12 +130,19 @@ def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
111
 
112
  pourcentss = 0.05
113
 
 
 
114
  # Proceed with downloading files
115
  for file_url in status_filenames:
 
 
116
  filename = os.path.basename(file_url)
117
  save_path = os.path.join(download_directory, filename)
118
  progress(pourcentss, desc='Downloading')
119
  pourcentss += 0.4 / max(len(status_filenames), 1) # Ensure non-zero division
 
 
 
120
  try:
121
  with requests.get(file_url, stream=True) as r:
122
  r.raise_for_status()
@@ -243,8 +269,32 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Pro
243
  temp_excel = nom_status
244
 
245
  progress(0.0,desc='Downloading')
246
-
247
- result, count = scrape(url, excel_file, folder_name, status_list)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
248
  if result:
249
  print("Success")
250
  else:
 
69
  import pandas as pd
70
  import gradio as gr
71
 
72
+
73
+ def scrape(url, excel_file, folder_name, status_list, sorted_files, progress=gr.Progress()):
74
+
75
+ print("ENTERING SCRAPE FUNCTION")
76
  filenames = []
77
  status_filenames = []
78
  df = pd.DataFrame() # Initialize df to ensure it's always defined
79
 
80
  # Try to process the Excel file if provided and valid
81
+ print(f"WE ARE TESTING IF OS.PATH.EXISTS WITH THIS FILE : {excel_file}")
82
  if excel_file and os.path.exists(excel_file):
83
  try:
84
  df = pd.read_excel(excel_file)
85
  print(f"Initial DataFrame size: {len(df)}")
86
 
87
+ print(f"WE ARE TRYING TO LOOK AT status_list : {status_list}")
88
+ print(f"WE ARE TRYING TO LOOK AT df.columns : {df.columns.tolist()}")
89
  if 'TDoc Status' in df.columns and status_list:
90
  df = df[df['TDoc Status'].isin(status_list)]
91
  print(f"Filtered DataFrame size: {len(df)}")
 
105
  print("Downloading zip files directly from the URL...")
106
  response = requests.get(url)
107
  soup = BeautifulSoup(response.content, 'html.parser')
108
+
109
+ # Select all zip files
110
+ zip_links = [a['href'] for a in soup.find_all('a', href=True) if a['href'].endswith('.zip') ]
111
+
112
+ # If the user selected specific files, keep only the zip links that match them
113
+ if len(sorted_files) != 0:
114
+ sorted_files_tab = []
115
+ for link in zip_links:
116
+ for file in sorted_files:
117
+ if file in link:
118
+ sorted_files_tab.append(link)
119
+
120
+ if len(sorted_files_tab) != 0:
121
+ zip_links = sorted_files_tab
122
 
123
  # Construct absolute URLs for zip files
124
  status_filenames = [url + link if not link.startswith('http') else link for link in zip_links]
 
130
 
131
  pourcentss = 0.05
132
 
133
+
134
+
135
  # Proceed with downloading files
136
  for file_url in status_filenames:
137
+
138
+
139
  filename = os.path.basename(file_url)
140
  save_path = os.path.join(download_directory, filename)
141
  progress(pourcentss, desc='Downloading')
142
  pourcentss += 0.4 / max(len(status_filenames), 1) # Ensure non-zero division
143
+
144
+
145
+
146
  try:
147
  with requests.get(file_url, stream=True) as r:
148
  r.raise_for_status()
 
269
  temp_excel = nom_status
270
 
271
  progress(0.0,desc='Downloading')
272
+
273
+ # Sort the files: download only those whose status matches the user's selection
274
+ sorted_files = []
275
+
276
+ try:
277
+ guide_file = 'guide.xlsx'
278
+ if os.path.exists(guide_file):
279
+ dfStatus = pd.read_excel(guide_file)
280
+
281
+ # Check whether the user selected only a subset of the available filter statuses
282
+ if len(dfStatus['TDoc Status'].unique().tolist()) != len (status_list):
283
+
284
+
285
+ keys_statuses_filename = dfStatus['TDoc'].tolist()
286
+ values_unique_statuses = dfStatus['TDoc Status'].tolist()
287
+
288
+ doc_statuses = dict(zip(keys_statuses_filename, values_unique_statuses))
289
+ for key in doc_statuses.keys():
290
+ if doc_statuses[key] in status_list:
291
+ sorted_files.append(key)
292
+
293
+ print(sorted_files)
294
+ except Exception as e:
295
+ print(f"Not able to retrieve informations from 'guide.xlsx' ")
296
+
297
+ result, count = scrape(url, excel_file, folder_name, status_list, sorted_files)
298
  if result:
299
  print("Success")
300
  else: