Spaces:
Runtime error
Runtime error
File size: 1,818 Bytes
b16454e |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 |
import requests
from bs4 import BeautifulSoup as bs
class LOAD_ONLINE_PDF_IPM_PACKAGES:
    """Collect PDF links from the IPM packages listing at ``base_url``.

    The index page is parsed for package names, then each package page
    (``<base_url>/<name>``) is parsed for the PDF links it contains.

    Attributes are filled in by the methods below:
        ipm_packages: package names taken from the index page links.
        pdfs_urls: ``href`` values found on the individual package pages.
    """

    def __init__(self, timeout=30):
        """Set up the crawl state.

        Args:
            timeout: Seconds passed as ``timeout`` to every ``requests.get``
                call so that an unresponsive server cannot hang the crawl.
        """
        self.base_url = 'https://ppqs.gov.in/ipm-packages'
        self.ipm_packages = []
        self.pdfs_urls = []
        self.timeout = timeout
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        }

    @staticmethod
    def _extract_hrefs(elements):
        """Return the ``href`` of the first anchor in each element.

        Elements without an anchor, or whose anchor has no ``href``, are
        skipped instead of raising, so one malformed row does not abort
        the whole scrape.
        """
        hrefs = []
        for element in elements:
            anchor = element.a
            href = anchor.get('href') if anchor is not None else None
            if href:
                hrefs.append(href)
        return hrefs

    @staticmethod
    def _package_name(href):
        """Return the last path segment of *href*, ignoring trailing slashes."""
        return href.rstrip('/').split('/')[-1]

    def _fetch_soup(self, url):
        """Download *url* and return its parsed HTML tree."""
        response = requests.get(
            url,
            headers=self.headers,
            timeout=self.timeout,
        )
        return bs(response.text, 'html.parser')

    def _get_ipm_packages_name_list(self):
        """
        Parse HTML page to get the names of each IPM Package

        Names are appended to ``self.ipm_packages``.
        """
        soup = self._fetch_soup(self.base_url)
        spans = soup.find_all('span', {'class': 'field-content region-name'})
        for href in self._extract_hrefs(spans):
            name = self._package_name(href)
            # An empty name would make the crawler re-fetch the index page.
            if name:
                self.ipm_packages.append(name)

    def get_ipm_packages_pdfs_list(self):
        """
        Parse HTML page to get the PDF URLs of each IPM Package

        URLs are appended to ``self.pdfs_urls`` exactly as they appear in
        the page's ``href`` attributes.
        """
        self._get_ipm_packages_name_list()
        for ip in self.ipm_packages:
            source_url = f'{self.base_url}/{ip}'
            print(f'Loading PDFs from: {source_url}')
            soup = self._fetch_soup(source_url)
            cells = soup.find_all('td', {'class': 'views-field views-field-php'})
            self.pdfs_urls.extend(self._extract_hrefs(cells))
def get_ipm_packages_pdfs_urls():
    """Run a full crawl and return the collected PDF URLs.

    Creates a fresh ``LOAD_ONLINE_PDF_IPM_PACKAGES`` instance, lets it
    gather the PDF links, prints how many were found and returns the
    instance's ``pdfs_urls`` list.
    """
    loader = LOAD_ONLINE_PDF_IPM_PACKAGES()
    loader.get_ipm_packages_pdfs_list()
    collected = loader.pdfs_urls
    print('Total pdfs:', len(collected))
    return collected
|