File size: 4,381 Bytes
2375a67
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import os
import tempfile
from typing import List, Optional, Union
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


class PatentDownloader:
    """
    Automate downloading patent PDFs from Google Patents.

    Given one or more patent numbers (e.g. "US8676427B1"), this class
    fetches the corresponding Google Patents page, locates the direct
    PDF link in the page metadata, and saves the PDF to disk.
    """
    base_url = "https://patents.google.com/patent"
    # Seconds before an HTTP request is abandoned. Without a timeout,
    # requests.get() can block forever on an unresponsive server.
    request_timeout = 30

    def __init__(self, verbose: bool = False):
        """
        Initialize the downloader.

        Parameters
        ----------
        verbose : bool
            If True, print detailed debug information.
        """
        self.verbose = verbose

    def download(self, patents: Union[str, List[str]], output_path: Optional[str] = None) -> List[str]:
        """
        Download single or multiple patent PDFs.

        Parameters
        ----------
        patents : str or List[str]
            Single patent number or a list of patent numbers.
        output_path : Optional[str]
            Directory to save the PDFs. Defaults to a temporary directory.

        Returns
        -------
        List[str]
            Paths to the successfully downloaded PDFs. Patents that fail
            to download are reported on stdout and skipped, so the result
            may be shorter than the input list.
        """
        if isinstance(patents, str):
            patents = [patents]

        # Use a temporary directory if no output path is provided.
        output_dir = output_path or tempfile.gettempdir()
        os.makedirs(output_dir, exist_ok=True)

        downloaded_files = []

        for i, patent in enumerate(patents):
            try:
                if self.verbose:
                    print(f"🔍 Downloading {i+1}/{len(patents)}: {patent}")
                file_path = self._download_single_pdf(patent, output_dir)
                downloaded_files.append(file_path)
                print(f"✅ Successfully downloaded: {file_path}")
            except Exception as e:
                # Best-effort batch semantics: report and continue with the
                # remaining patents rather than aborting the whole run.
                print(f"❌ Failed to download {patent}: {e}")

        return downloaded_files

    def _download_single_pdf(self, patent_number: str, output_dir: str) -> str:
        """
        Download a single patent PDF.

        Parameters
        ----------
        patent_number : str
            The patent number (e.g., "US8676427B1").
        output_dir : str
            Directory to save the PDF.

        Returns
        -------
        str
            Path to the downloaded PDF file.

        Raises
        ------
        Exception
            If the patent page or the PDF cannot be fetched, or no PDF
            link is present on the page.
        """
        # Construct the Google Patents URL for the English page.
        patent_url = f"{self.base_url}/{patent_number}/en"

        if self.verbose:
            print(f"Fetching patent page: {patent_url}")

        # Fetch the HTML content of the patent page.
        response = requests.get(patent_url, timeout=self.request_timeout)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch patent page for {patent_number}. HTTP Status: {response.status_code}")

        # Parse the HTML content and extract the PDF link.
        soup = BeautifulSoup(response.content, "html.parser")
        pdf_url = self._extract_pdf_link(soup)

        if not pdf_url:
            raise Exception(f"No PDF link found for patent {patent_number}.")

        # The fallback extractor may return a page-relative href; resolve
        # it against the page URL. Absolute URLs pass through unchanged.
        pdf_url = urljoin(patent_url, pdf_url)

        if self.verbose:
            print(f"Found PDF link: {pdf_url}")

        # Download the PDF file.
        pdf_response = requests.get(pdf_url, timeout=self.request_timeout)
        if pdf_response.status_code != 200:
            raise Exception(f"Failed to download PDF for {patent_number}. HTTP Status: {pdf_response.status_code}")

        # Save the PDF to the specified output directory.
        file_path = os.path.join(output_dir, f"{patent_number}.pdf")
        with open(file_path, "wb") as pdf_file:
            pdf_file.write(pdf_response.content)

        return file_path

    @staticmethod
    def _extract_pdf_link(soup: BeautifulSoup) -> Optional[str]:
        """
        Extract the PDF link from the page's metadata.

        Parameters
        ----------
        soup : BeautifulSoup
            Parsed HTML content of the patent page.

        Returns
        -------
        Optional[str]
            The direct PDF link if found, possibly relative to the page;
            None if the page exposes no PDF link.
        """
        # Preferred source: the 'citation_pdf_url' meta tag that Google
        # Patents emits for indexing.
        pdf_meta = soup.find("meta", {"name": "citation_pdf_url"})
        if pdf_meta and pdf_meta.get("content"):
            return pdf_meta["content"]

        # Fallback: first <a> whose href points at a PDF. Check the URL
        # path (query string stripped) so links like 'x.pdf?dl=1' match.
        for anchor in soup.find_all("a", href=True):
            href = anchor["href"]
            if href.split("?", 1)[0].endswith(".pdf"):
                return href

        return None