File size: 1,818 Bytes
b16454e
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import requests
from bs4 import BeautifulSoup as bs


class LOAD_ONLINE_PDF_IPM_PACKAGES:
    """
        Collect the document links published under the PPQS IPM packages page.

        Attributes:
            base_url: listing page; each package page is '<base_url>/<name>'
            ipm_packages: package names (last path segment of each package link)
            pdfs_urls: href values gathered from the package pages
            headers: HTTP headers sent with every request
            timeout: seconds to wait on each HTTP request before giving up
    """

    def __init__(self, timeout=30):
        """
            Set up the scraper state; no network access happens here

            Args:
                timeout: per-request timeout in seconds handed to requests
        """
        self.base_url = 'https://ppqs.gov.in/ipm-packages'

        self.ipm_packages = []
        self.pdfs_urls = []

        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
        }

        # Without a timeout requests waits indefinitely on a stalled server.
        self.timeout = timeout

    def _fetch_soup(self, url):
        """
            Download `url` and return its parsed HTML

            Raises:
                requests.RequestException: on connection failure, timeout or
                    an HTTP error status (4xx/5xx)
        """
        response = requests.get(
            url,
            headers=self.headers,
            timeout=self.timeout,
        )
        # Fail loudly rather than parsing an error page into an empty result.
        response.raise_for_status()
        return bs(response.text, 'html.parser')

    @staticmethod
    def _extract_hrefs(tags):
        """
            Return the href of the first <a> inside each tag, skipping tags
            that have no anchor or whose anchor has no href
        """
        hrefs = []
        for tag in tags:
            anchor = tag.a
            if anchor is None:
                continue
            href = anchor.get('href')
            if href:
                hrefs.append(href)
        return hrefs

    def _get_ipm_packages_name_list(self):
        """
            Parse HTML page to get the names of each IPM Package

            The result replaces the previous content of `self.ipm_packages`,
            so repeated calls do not accumulate duplicates.
        """
        soup = self._fetch_soup(self.base_url)
        spans = soup.find_all('span', {'class': 'field-content region-name'})

        names = []
        for href in self._extract_hrefs(spans):
            # A trailing slash would otherwise produce an empty name.
            name = href.rstrip('/').split('/')[-1]
            if name:
                names.append(name)

        # Slice-assign so the list object itself stays the same.
        self.ipm_packages[:] = names

    def get_ipm_packages_pdfs_list(self):
        """
            Parse HTML page to get the PDF URLs of each IPM Package

            Refreshes `self.ipm_packages`, then rebuilds `self.pdfs_urls` from
            every package page. URLs are added page by page, so those gathered
            before a failing request remain available on the instance.

            Raises:
                requests.RequestException: if any page cannot be fetched
        """
        self._get_ipm_packages_name_list()

        # Start from a clean slate so a second call does not duplicate URLs.
        self.pdfs_urls.clear()

        for ip in self.ipm_packages:
            source_url = f'{self.base_url}/{ip}'
            print(f'Loading PDFs from: {source_url}')

            soup = self._fetch_soup(source_url)
            cells = soup.find_all('td', {'class': 'views-field views-field-php'})
            self.pdfs_urls.extend(self._extract_hrefs(cells))


def get_ipm_packages_pdfs_urls():
    """
        Run the IPM packages scraper and return the URLs it collected

        Prints the number of collected URLs before returning the list.
    """
    loader = LOAD_ONLINE_PDF_IPM_PACKAGES()
    loader.get_ipm_packages_pdfs_list()

    collected = loader.pdfs_urls
    print('Total pdfs:', len(collected))
    return collected