# KKMS-KSSW-HF / src/web_crawler.py
# Chintan Donda
# Moving kkms_kssw.py to src/
# 04e306a
import requests
from bs4 import BeautifulSoup as bs
class LOAD_ONLINE_PDF_IPM_PACKAGES:
    """
    Crawl the IPM packages listing under ``base_url`` and collect PDF links.

    The index page is parsed for package names (stored in ``ipm_packages``);
    each package page is then parsed for the hrefs of its PDF table cells
    (stored in ``pdfs_urls``).
    """

    def __init__(self, timeout=30):
        """
        Set up crawl state.

        Args:
            timeout: Value passed as ``timeout`` to every ``requests.get``
                call so that a stalled server cannot block the crawl
                forever.
        """
        self.base_url = 'https://ppqs.gov.in/ipm-packages'
        self.ipm_packages = []
        self.pdfs_urls = []
        self.timeout = timeout
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        }

    @staticmethod
    def _anchor_hrefs(tags):
        """
        Return the ``href`` of the first anchor inside each tag.

        Tags without an anchor, or whose anchor has no (or an empty)
        ``href``, are skipped instead of raising.
        """
        hrefs = []
        for tag in tags:
            anchor = tag.a
            href = anchor.get('href') if anchor is not None else None
            if href:
                hrefs.append(href)
        return hrefs

    @staticmethod
    def _package_name_from_href(href):
        """
        Return the last path segment of ``href``.

        Trailing slashes are dropped first so that ``/a/b/`` yields ``b``
        rather than an empty string.
        """
        return href.rstrip('/').split('/')[-1]

    def _fetch_soup(self, url):
        """
        Download ``url`` with the configured headers and timeout and return
        the parsed HTML.
        """
        response = requests.get(
            url,
            headers=self.headers,
            timeout=self.timeout,
        )
        return bs(response.text, 'html.parser')

    def _get_ipm_packages_name_list(self):
        """
        Parse HTML page to get the names of each IPM Package

        ``self.ipm_packages`` is rebuilt on every call, so repeated calls do
        not accumulate duplicate entries.
        """
        soup = self._fetch_soup(self.base_url)
        packages = soup.find_all('span', {'class': 'field-content region-name'})
        names = [
            self._package_name_from_href(href)
            for href in self._anchor_hrefs(packages)
        ]
        # An empty name would make the package URL point back at the index page.
        self.ipm_packages = [name for name in names if name]

    def get_ipm_packages_pdfs_list(self):
        """
        Parse HTML page to get the PDF URLs of each IPM Package

        ``self.pdfs_urls`` is rebuilt on every call, so repeated calls do not
        accumulate duplicate entries.
        """
        self._get_ipm_packages_name_list()
        pdfs_urls = []
        for ip in self.ipm_packages:
            source_url = f'{self.base_url}/{ip}'
            print(f'Loading PDFs from: {source_url}')
            soup = self._fetch_soup(source_url)
            cells = soup.find_all('td', {'class': 'views-field views-field-php'})
            pdfs_urls.extend(self._anchor_hrefs(cells))
        self.pdfs_urls = pdfs_urls
def get_ipm_packages_pdfs_urls():
    """
    Run a full crawl and return the collected PDF URLs.

    Creates a LOAD_ONLINE_PDF_IPM_PACKAGES instance, lets it gather the PDF
    links, prints how many were found and returns the resulting list.
    """
    loader = LOAD_ONLINE_PDF_IPM_PACKAGES()
    loader.get_ipm_packages_pdfs_list()
    collected = loader.pdfs_urls
    print('Total pdfs:', len(collected))
    return collected