MaksG committed on
Commit
0226df2
1 Parent(s): 6c86532

Update scrape_3gpp.py

Files changed (1)
  1. scrape_3gpp.py +38 -18
scrape_3gpp.py CHANGED
@@ -9,6 +9,31 @@ import textract
 import gradio as gr
 
 
+def count_links(url):
+    # Define common file extensions for downloadable content
+    file_extensions = ('.zip')
+
+    try:
+        # Send a HTTP request to the URL
+        response = requests.get(url)
+        response.raise_for_status()  # Raise an exception for HTTP errors
+
+        # Parse the HTML content of the page
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        # Find all <a> tags in the HTML
+        links = soup.find_all('a')
+
+        # Count the number of links that point to downloadable files
+        count = sum(1 for link in links if any(link.get('href', '').endswith(ext) for ext in file_extensions))
+
+        return count
+    except requests.RequestException as e:
+        print(f"Error fetching the page: {e}")
+        return None
+
+
+
 def browse_folder(url):
     if url.lower().endswith(('docs', 'docs/')):
         return gr.update(choices=[])
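Note on the new count_links helper: file_extensions = ('.zip') is a plain string rather than a one-element tuple, so the any(... for ext in file_extensions) check iterates over the characters '.', 'z', 'i', 'p' and counts any href ending in one of those characters. A minimal sketch of the presumably intended form (hypothetical, not part of this commit):

    file_extensions = ('.zip',)  # trailing comma makes this a tuple of extensions
    count = sum(
        1
        for link in links
        if any(link.get('href', '').endswith(ext) for ext in file_extensions)
    )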
@@ -62,7 +87,7 @@ def extract_statuses(url):
     return []
 
 
-def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
+def scrape(url, excel_file, folder_name, status_list,count, progress=gr.Progress()):
     filenames = []
     status_filenames = []
     # Check if the excel_file argument is provided and if the file exists.
@@ -116,8 +141,8 @@ def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
     download_directory = folder_name
     if not os.path.exists(download_directory):
        os.makedirs(download_directory)
-    download_num = 0
-    pourcentss = 0.1
+
+    pourcentss = 0.05
     print(f'filenames: {status_filenames}')
     if not filenames and not status_filenames:
        print("No Excel file provided, or no valid URLs found in the file.")
@@ -135,11 +160,8 @@ def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
 
     # Download each zip file
     for zip_link in zip_links:
-        if download_num%10 == 0:
-            pourcentss = pourcentss + download_num/500
-            progress(pourcentss,desc='Telechargement')
-            download_num = 0
-        download_num+=1
+        progress(pourcentss,desc='Downloading')
+        pourcentss+=0.4/count
         # Build the absolute URL of the zip file
         absolute_url = urljoin(url, zip_link)
 
@@ -161,11 +183,8 @@ def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
     for file_url in status_filenames:
         filename = os.path.basename(file_url)
         save_path = os.path.join(download_directory, filename)
-        if download_num%10 == 0:
-            pourcentss = pourcentss + download_num/500
-            progress(pourcentss,desc='Telechargement')
-            download_num = 0
-        download_num+=1
+        progress(pourcentss,desc='Downloading')
+        pourcentss+=0.4/count
         try:
             with requests.get(file_url, stream=True) as r:
                 r.raise_for_status()
@@ -283,7 +302,8 @@ def update_excel(data, excel_file, url):
 def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Progress()):
     folder_name = 'nom provisoire'
     temp_excel = url.split("/")[-2] + "_status.xlsx"
-    progress(0.0,desc='Telechargement')
+    progress(0.0,desc='Downloading')
+    count = count_links(url)
     result, message = scrape(url, excel_file, folder_name, status_list)
     if result:
         print("Success:", message)
@@ -294,7 +314,7 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Progress()):
     extractZip(folder_name)
     progress(0.5,desc='Extraction 2')
     excel3gpp(url)
-    progress(0.6,desc='Mise en forme Excel')
+    progress(0.6,desc='Creating Excel File')
 
 
     extract_directory = folder_name +" extraction"
@@ -311,7 +331,7 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Progress()):
         "pptx": ["URL", "File", "Type", "Title", "Source", "Content"]
     }
 
-    num=0.6
+    pourcents2=0.6
     data = []
     errors_count = 0
     processed_count = 0  # Counter for processed files
@@ -328,8 +348,8 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Progress()):
         folder_path = os.path.join(extract_directory, folder)
         if os.path.isdir(folder_path):
             for file in os.listdir(folder_path):
-                num = min(num + 0.001, 0.9)
-                progress(num,desc='Mise en forme Excel')
+                progress(pourcents2,desc='Creating Excel File')
+                pourcents2+=0.4/count
 
 
                 if file == "__MACOSX":
 
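Taken together, the change replaces the ad-hoc download_num/pourcentss bookkeeping with fixed progress bands: downloads start at 0.05 and advance by 0.4/count per file, extraction reports 0.5 and 0.6, and the Excel phase starts at 0.6 and advances by 0.4/count per processed file (reusing count, the number of zip links, rather than the number of extracted files). A small worked example with assumed numbers, for illustration only:

    count = 20                             # hypothetical number of zip links found by count_links
    pourcentss = 0.05 + 20 * (0.4 / 20)    # 0.45 after the download loop
    pourcents2 = 0.6 + 20 * (0.4 / 20)     # 1.0 after 20 files in the Excel loop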