MaksG committed
Commit 660f700
1 Parent(s): ed31679

Update scrape_3gpp.py

Files changed (1)
  1. scrape_3gpp.py +40 -0
scrape_3gpp.py CHANGED
@@ -22,6 +22,46 @@ def browse_folder(url):
     return gr.update(choices=excel_links)
 
 
+
+def extract_statuses(url):
+    # Send a GET request to the webpage
+    response = requests.get(url)
+
+    # Parse the webpage content
+    soup = BeautifulSoup(response.content, 'html.parser')
+
+    # Find all links in the webpage
+    links = soup.find_all('a')
+
+    # Identify and download the Excel file
+    for link in links:
+        href = link.get('href')
+        if href and (href.endswith('.xls') or href.endswith('.xlsx')):
+            excel_url = href if href.startswith('http') else url + href
+            excel_response = requests.get(excel_url)
+            file_name = 'guide_status.xlsx'  # excel_url.split('/')[-1]
+
+            # Save the file
+            with open(file_name, 'wb') as f:
+                f.write(excel_response.content)
+
+            # Read the Excel file
+            df = pd.read_excel(file_name)
+
+            # Check if 'TDoc Status' column exists and extract unique statuses
+            if 'TDoc Status' in df.columns:
+                unique_statuses = df['TDoc Status'].unique().tolist()
+                print(f'Downloaded {file_name} and extracted statuses: {unique_statuses}')
+
+
+                if 'withdrawn' in unique_statuses:
+                    unique_statuses.remove('withdrawn')
+                return unique_statuses
+            else:
+                print(f"'TDoc Status' column not found in {file_name}")
+                return []
+
+
 def scrape(url, excel_file, folder_name, status_list, progress=gr.Progress()):
     filenames = []
     status_filenames = []
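
For context, a minimal sketch (not part of this commit) of how extract_statuses() could feed the existing Gradio UI. The gr.CheckboxGroup component, the update_status_choices helper, and the event wiring below are illustrative assumptions; only extract_statuses() and the status_list name come from the file itself.

# Hypothetical wiring, not part of this commit: expose the extracted
# statuses as checkbox choices so a user can filter TDocs before scraping.
import gradio as gr

def update_status_choices(url):
    # extract_statuses() downloads the meeting's TDoc Excel sheet and returns
    # the unique 'TDoc Status' values (with 'withdrawn' removed).
    statuses = extract_statuses(url) or []  # guard against an implicit None
    return gr.update(choices=statuses, value=statuses)

with gr.Blocks() as demo:
    url_box = gr.Textbox(label="3GPP meeting folder URL")
    status_list = gr.CheckboxGroup(label="TDoc Status filter")
    url_box.change(update_status_choices, inputs=url_box, outputs=status_list)

Note that when the page contains no .xls/.xlsx link, the loop in extract_statuses() completes without returning, so the function yields None rather than an empty list; the `or []` guard above accounts for that.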