Update scrape_3gpp.py
scrape_3gpp.py  +40 -0
@@ -22,6 +22,46 @@ def browse_folder(url):
     return gr.update(choices=excel_links)


+
+def extract_statuses(url):
+    # Send a GET request to the webpage
+    response = requests.get(url)
+
+    # Parse the webpage content
+    soup = BeautifulSoup(response.content, 'html.parser')
+
+    # Find all links in the webpage
+    links = soup.find_all('a')
+
+    # Identify and download the Excel file
+    for link in links:
+        href = link.get('href')
+        if href and (href.endswith('.xls') or href.endswith('.xlsx')):
+            excel_url = href if href.startswith('http') else url + href
+            excel_response = requests.get(excel_url)
+            file_name = 'guide_status.xlsx'  # excel_url.split('/')[-1]
+
+            # Save the file
+            with open(file_name, 'wb') as f:
+                f.write(excel_response.content)
+
+            # Read the Excel file
+            df = pd.read_excel(file_name)
+
+            # Check if 'TDoc Status' column exists and extract unique statuses
+            if 'TDoc Status' in df.columns:
+                unique_statuses = df['TDoc Status'].unique().tolist()
+                print(f'Downloaded {file_name} and extracted statuses: {unique_statuses}')
+
+                if 'withdrawn' in unique_statuses:
+                    unique_statuses.remove('withdrawn')
+                return unique_statuses
+            else:
+                print(f"'TDoc Status' column not found in {file_name}")
+                return []
+
+
 def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
     filenames = []
     status_filenames = []
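
For reference, a minimal sketch of how the new helper might be driven on its own. The meeting URL below is a placeholder chosen for illustration, and the imports are assumptions: the hunk uses requests, BeautifulSoup, and pd, so scrape_3gpp.py presumably already imports them at the top of the file.

import requests                   # HTTP GETs for the page and the Excel file
import pandas as pd               # reads the downloaded spreadsheet
from bs4 import BeautifulSoup     # parses the document listing page

# Placeholder 3GPP meeting Docs listing page; substitute a real one.
meeting_url = 'https://www.3gpp.org/ftp/tsg_ran/WG1_RL1/TSGR1_116/Docs/'

statuses = extract_statuses(meeting_url)
print(statuses)   # unique 'TDoc Status' values, with 'withdrawn' filtered out

Two things are worth noting about the new code: the return statements sit inside the for loop, so only the first .xls/.xlsx link on the page is processed (and the function implicitly returns None when the page has no Excel link); and the relative-URL branch url + href only forms a valid link when url ends with a slash, whereas urllib.parse.urljoin(url, href) would handle both cases.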