# Page-scrape residue (not part of the program): "Spaces: Runtime error"
import requests | |
from bs4 import BeautifulSoup as bs | |
class LOAD_ONLINE_PDF_IPM_PACKAGES:
    """Collect PDF links from the pages under ``ppqs.gov.in/ipm-packages``.

    The index page lists the IPM packages; each package has its own page,
    reached at ``<base_url>/<package name>``, whose table cells link to PDFs.

    Attributes:
        base_url: URL of the index page; package pages live below it.
        ipm_packages: Package names (last path segment of each package link),
            filled by ``_get_ipm_packages_name_list``.
        pdfs_urls: PDF hrefs exactly as they appear in the pages, filled by
            ``get_ipm_packages_pdfs_list``.
        headers: HTTP headers sent with every request.
        timeout: Seconds passed to ``requests.get`` as ``timeout``.
    """

    def __init__(self, timeout=30):
        """Set up empty result lists and the request settings.

        Args:
            timeout: Timeout in seconds applied to every HTTP request, so a
                stalled connection cannot block the crawl indefinitely.
        """
        self.base_url = 'https://ppqs.gov.in/ipm-packages'
        self.ipm_packages = []
        self.pdfs_urls = []
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        }
        self.timeout = timeout

    def _fetch_soup(self, url):
        """Download *url* and return it parsed with ``html.parser``.

        Raises:
            requests.RequestException: On connection problems, timeouts, or
                a 4xx/5xx status (an error page must not be parsed as data).
        """
        response = requests.get(
            url,
            headers=self.headers,
            timeout=self.timeout,
        )
        response.raise_for_status()
        return bs(response.text, 'html.parser')

    @staticmethod
    def _first_href(tag):
        """Return the href of the first ``<a href=...>`` inside *tag*, or None."""
        anchor = tag.find('a', href=True)
        if anchor is None:
            return None
        return anchor['href']

    def _get_ipm_packages_name_list(self):
        """
        Parse HTML page to get the names of each IPM Package

        The names replace the current content of ``self.ipm_packages`` (the
        list object itself is kept), so repeated calls do not accumulate
        duplicates. Entries without a usable link are skipped.
        """
        soup = self._fetch_soup(self.base_url)
        # Clear only after a successful download so a failed refresh keeps
        # the previously collected names.
        self.ipm_packages.clear()
        for package in soup.find_all('span', {'class': 'field-content region-name'}):
            href = self._first_href(package)
            if not href:
                continue
            # Strip a trailing slash first, otherwise the last segment is ''
            # and the "package" URL would just be the index page again.
            name = href.rstrip('/').split('/')[-1]
            if name:
                self.ipm_packages.append(name)

    def get_ipm_packages_pdfs_list(self):
        """
        Parse HTML page to get the PDF URLs of each IPM Package

        Refreshes ``self.ipm_packages`` first, then visits every package page
        and stores the link found in each ``views-field-php`` table cell in
        ``self.pdfs_urls``. Hrefs are stored unmodified. Cells without a link
        are skipped.

        Raises:
            requests.RequestException: If any page cannot be downloaded.
        """
        self._get_ipm_packages_name_list()
        self.pdfs_urls.clear()
        for ip in self.ipm_packages:
            source_url = f'{self.base_url}/{ip}'
            print(f'Loading PDFs from: {source_url}')
            soup = self._fetch_soup(source_url)
            for cell in soup.find_all('td', {'class': 'views-field views-field-php'}):
                href = self._first_href(cell)
                if href:
                    self.pdfs_urls.append(href)
def get_ipm_packages_pdfs_urls():
    """Run a full crawl and return the collected PDF URLs.

    Creates a ``LOAD_ONLINE_PDF_IPM_PACKAGES`` instance, lets it gather the
    PDF links, prints how many were found, and returns the list.

    Returns:
        The loader's ``pdfs_urls`` list.
    """
    loader = LOAD_ONLINE_PDF_IPM_PACKAGES()
    loader.get_ipm_packages_pdfs_list()
    collected = loader.pdfs_urls
    print('Total pdfs:', len(collected))
    return collected