Update scrape_3gpp.py
scrape_3gpp.py  +40 -0
@@ -22,6 +22,46 @@ def browse_folder(url):
     return gr.update(choices=excel_links)


+
+def extract_statuses(url):
+    # Send a GET request to the webpage
+    response = requests.get(url)
+
+    # Parse the webpage content
+    soup = BeautifulSoup(response.content, 'html.parser')
+
+    # Find all links in the webpage
+    links = soup.find_all('a')
+
+    # Identify and download the Excel file
+    for link in links:
+        href = link.get('href')
+        if href and (href.endswith('.xls') or href.endswith('.xlsx')):
+            excel_url = href if href.startswith('http') else url + href
+            excel_response = requests.get(excel_url)
+            file_name = 'guide_status.xlsx'  # excel_url.split('/')[-1]
+
+            # Save the file
+            with open(file_name, 'wb') as f:
+                f.write(excel_response.content)
+
+            # Read the Excel file
+            df = pd.read_excel(file_name)
+
+            # Check if 'TDoc Status' column exists and extract unique statuses
+            if 'TDoc Status' in df.columns:
+                unique_statuses = df['TDoc Status'].unique().tolist()
+                print(f'Downloaded {file_name} and extracted statuses: {unique_statuses}')
+
+                if 'withdrawn' in unique_statuses:
+                    unique_statuses.remove('withdrawn')
+                return unique_statuses
+            else:
+                print(f"'TDoc Status' column not found in {file_name}")
+                return []
+
+
 def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
     filenames = []
     status_filenames = []
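
For reference, a minimal sketch of how the new helper might be driven on its own. The meeting URL below is a placeholder chosen for illustration, and the imports are assumptions: the hunk uses requests, BeautifulSoup, and pd, so scrape_3gpp.py presumably already imports them at the top of the file.

import requests                   # HTTP GETs for the page and the Excel file
import pandas as pd               # reads the downloaded spreadsheet
from bs4 import BeautifulSoup     # parses the document listing page

# Placeholder 3GPP meeting Docs listing page; substitute a real one.
meeting_url = 'https://www.3gpp.org/ftp/tsg_ran/WG1_RL1/TSGR1_116/Docs/'

statuses = extract_statuses(meeting_url)
print(statuses)   # unique 'TDoc Status' values, with 'withdrawn' filtered out

Two things are worth noting about the new code: the return statements sit inside the for loop, so only the first .xls/.xlsx link on the page is processed (and the function implicitly returns None when the page has no Excel link); and the relative-URL branch url + href only forms a valid link when url ends with a slash, whereas urllib.parse.urljoin(url, href) would handle both cases.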