Update scrape_3gpp.py

scrape_3gpp.py  (+38 -18)
@@ -9,6 +9,31 @@ import textract
 import gradio as gr
 
 
+def count_links(url):
+    # Define common file extensions for downloadable content
+    file_extensions = ('.zip')
+
+    try:
+        # Send a HTTP request to the URL
+        response = requests.get(url)
+        response.raise_for_status()  # Raise an exception for HTTP errors
+
+        # Parse the HTML content of the page
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        # Find all <a> tags in the HTML
+        links = soup.find_all('a')
+
+        # Count the number of links that point to downloadable files
+        count = sum(1 for link in links if any(link.get('href', '').endswith(ext) for ext in file_extensions))
+
+        return count
+    except requests.RequestException as e:
+        print(f"Error fetching the page: {e}")
+        return None
+
+
+
 def browse_folder(url):
     if url.lower().endswith(('docs', 'docs/')):
         return gr.update(choices=[])
@@ -62,7 +87,7 @@ def extract_statuses(url):
     return []
 
 
-def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
+def scrape(url, excel_file, folder_name, status_list, count, progress=gr.Progress()):
     filenames = []
     status_filenames = []
     # Check if the excel_file argument is provided and if the file exists.
@@ -116,8 +141,8 @@ def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
     download_directory = folder_name
     if not os.path.exists(download_directory):
         os.makedirs(download_directory)
-
-    pourcentss = 0.
+
+    pourcentss = 0.05
     print(f'filenames: {status_filenames}')
     if not filenames and not status_filenames:
         print("No Excel file provided, or no valid URLs found in the file.")
@@ -135,11 +160,8 @@ def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
 
     # Download each zip file
     for zip_link in zip_links:
-
-
-        progress(pourcentss,desc='Telechargement')
-        download_num = 0
-        download_num+=1
+        progress(pourcentss,desc='Downloading')
+        pourcentss+=0.4/count
         # Build the absolute URL of the zip file
         absolute_url = urljoin(url, zip_link)
 
@@ -161,11 +183,8 @@ def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
     for file_url in status_filenames:
         filename = os.path.basename(file_url)
         save_path = os.path.join(download_directory, filename)
-
-
-        progress(pourcentss,desc='Telechargement')
-        download_num = 0
-        download_num+=1
+        progress(pourcentss,desc='Downloading')
+        pourcentss+=0.4/count
         try:
             with requests.get(file_url, stream=True) as r:
                 r.raise_for_status()
@@ -283,7 +302,8 @@ def update_excel(data, excel_file, url):
 def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Progress()):
     folder_name = 'nom provisoire'
     temp_excel = url.split("/")[-2] + "_status.xlsx"
-    progress(0.0,desc='
+    progress(0.0,desc='Downloading')
+    count = count_links(url)
     result, message = scrape(url, excel_file, folder_name, status_list)
     if result:
         print("Success:", message)
@@ -294,7 +314,7 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Progress()):
     extractZip(folder_name)
     progress(0.5,desc='Extraction 2')
     excel3gpp(url)
-    progress(0.6,desc='
+    progress(0.6,desc='Creating Excel File')
 
 
     extract_directory = folder_name +" extraction"
@@ -311,7 +331,7 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Progress()):
         "pptx": ["URL", "File", "Type", "Title", "Source", "Content"]
     }
 
-
+    pourcents2=0.6
     data = []
     errors_count = 0
     processed_count = 0 # Counter for processed files
@@ -328,8 +348,8 @@ def extractionPrincipale(url, excel_file=None, status_list=None, progress=gr.Progress()):
         folder_path = os.path.join(extract_directory, folder)
         if os.path.isdir(folder_path):
             for file in os.listdir(folder_path):
-
-
+                progress(pourcents2,desc='Creating Excel File')
+                pourcents2+=0.4/count
 
 
                 if file == "__MACOSX":
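
Review note: two details in this change look fragile. First, file_extensions = ('.zip') binds a plain string rather than a one-element tuple, so "for ext in file_extensions" iterates over the characters '.', 'z', 'i', 'p', and count_links will count any href ending in one of those characters. Second, scrape now requires a count parameter, but extractionPrincipale still calls scrape(url, excel_file, folder_name, status_list), which would raise a TypeError; count_links can also return None (on a request error) or 0 (no links), either of which would break the 0.4/count increments. Below is a minimal sketch of both fixes, assuming the surrounding names from this diff; the example URL is a placeholder, and the scrape call is commented out because scrape is not defined in the snippet.

import requests
from bs4 import BeautifulSoup

def count_links(url):
    # Trailing comma makes this a real one-element tuple;
    # ('.zip') without the comma is just the string '.zip'.
    file_extensions = ('.zip',)
    try:
        response = requests.get(url)
        response.raise_for_status()  # Raise an exception for HTTP errors
        soup = BeautifulSoup(response.text, 'html.parser')
        links = soup.find_all('a')
        # Count links whose href ends with a downloadable-file extension
        return sum(1 for link in links
                   if any(link.get('href', '').endswith(ext) for ext in file_extensions))
    except requests.RequestException as e:
        print(f"Error fetching the page: {e}")
        return None

# Hypothetical call site mirroring extractionPrincipale:
url = "https://example.com/Docs/"  # placeholder URL, not from the commit
count = count_links(url) or 1      # guard the 0.4/count divisions against None and 0
# result, message = scrape(url, excel_file, folder_name, status_list, count)

Note also that the per-file loop in extractionPrincipale reuses count, the number of zip links on the page, as its denominator, so pourcents2 can overshoot 1.0 whenever the archives contain more files than there were links.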
|