import requests
from bs4 import BeautifulSoup as bs


class LOAD_ONLINE_PDF_IPM_PACKAGES:
    """Collect IPM package names and their PDF links from the IPM packages site.

    The index page at ``base_url`` is scanned for package links; each package
    page (``<base_url>/<package name>``) is then scanned for PDF links.

    Attributes:
        base_url: URL of the index page listing the IPM packages.
        ipm_packages: Package names (last path segment of each package link)
            found by the most recent scrape.
        pdfs_urls: PDF ``href`` values found by the most recent scrape, exactly
            as they appear in the page markup.
        timeout: Seconds to wait for each HTTP request before giving up.
        headers: HTTP headers sent with every request.
    """

    def __init__(self, timeout=30):
        """Set up the scraper.

        Args:
            timeout: Per-request timeout in seconds, passed to ``requests.get``.
        """
        self.base_url = 'https://ppqs.gov.in/ipm-packages'
        self.ipm_packages = []
        self.pdfs_urls = []
        self.timeout = timeout
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        }

    def _fetch_soup(self, url):
        """Download ``url`` and return it parsed with ``html.parser``.

        Raises:
            requests.RequestException: On connection failure, timeout, or an
                HTTP error status (via ``raise_for_status``).
        """
        response = requests.get(
            url,
            headers=self.headers,
            # Without a timeout a stalled server would block the scrape forever.
            timeout=self.timeout,
        )
        # Fail loudly instead of parsing an error page as an empty listing.
        response.raise_for_status()
        return bs(response.text, 'html.parser')

    @staticmethod
    def _link_href(element):
        """Return the ``href`` of the first ``<a>`` inside ``element``, or None."""
        anchor = element.a
        if anchor is None:
            return None
        return anchor.get('href') or None

    def _get_ipm_packages_name_list(self):
        """
        Parse HTML page to get the names of each IPM Package

        Replaces ``self.ipm_packages`` with the names found on the index page,
        so repeated calls do not accumulate duplicates.
        """
        soup = self._fetch_soup(self.base_url)
        spans = soup.find_all('span', {'class': 'field-content region-name'})

        names = []
        for span in spans:
            href = self._link_href(span)
            if not href:
                # Matching element without a usable link: nothing to follow.
                continue
            # Strip a trailing slash first so '/ipm-packages/foo/' yields 'foo'
            # rather than an empty name.
            name = href.rstrip('/').split('/')[-1]
            if name:
                names.append(name)
        self.ipm_packages = names

    def get_ipm_packages_pdfs_list(self):
        """
        Parse HTML page to get the PDF URLs of each IPM Package

        Refreshes the package list, then visits every package page and
        replaces ``self.pdfs_urls`` with the links found, so repeated calls
        do not accumulate duplicates.

        Raises:
            requests.RequestException: If any page cannot be fetched.
        """
        self._get_ipm_packages_name_list()

        pdf_links = []
        for ip in self.ipm_packages:
            source_url = f'{self.base_url}/{ip}'
            print(f'Loading PDFs from: {source_url}')
            soup = self._fetch_soup(source_url)
            cells = soup.find_all('td', {'class': 'views-field views-field-php'})
            for cell in cells:
                href = self._link_href(cell)
                if href:
                    pdf_links.append(href)
        self.pdfs_urls = pdf_links


def get_ipm_packages_pdfs_urls():
    """Scrape all IPM package pages and return the list of PDF links found.

    Prints the total number of links before returning.
    """
    pdf = LOAD_ONLINE_PDF_IPM_PACKAGES()
    pdf.get_ipm_packages_pdfs_list()
    print('Total pdfs:', len(pdf.pdfs_urls))
    return pdf.pdfs_urls