Spaces:
Sleeping
Sleeping
File size: 4,381 Bytes
2375a67 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
import os
import tempfile
from typing import List, Optional, Union
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
class PatentDownloader:
    """
    Automate downloading patent PDFs from Google Patents.

    The downloader fetches the patent's HTML page, extracts the direct PDF
    URL (preferring the ``citation_pdf_url`` meta tag), and streams the PDF
    to disk.
    """

    base_url = "https://patents.google.com/patent"

    # Google Patents rejects the default `python-requests` User-Agent,
    # so present a plain browser-like one instead.
    _headers = {"User-Agent": "Mozilla/5.0 (compatible; PatentDownloader/1.0)"}

    def __init__(self, verbose: bool = False, timeout: float = 30.0):
        """
        Initialize the downloader.

        Parameters
        ----------
        verbose : bool
            If True, print detailed debug information.
        timeout : float
            Per-request timeout in seconds. Without one, a stalled server
            would hang the download forever.
        """
        self.verbose = verbose
        self.timeout = timeout

    def download(self, patents: Union[str, List[str]], output_path: Optional[str] = None) -> List[str]:
        """
        Download single or multiple patent PDFs.

        Failures for individual patents are reported and skipped, so one
        bad patent number does not abort the rest of the batch.

        Parameters
        ----------
        patents : str or List[str]
            Single patent number or a list of patent numbers.
        output_path : Optional[str]
            Directory to save the PDFs. Defaults to a temporary directory.

        Returns
        -------
        List[str]
            List of paths to the successfully downloaded PDFs.
        """
        if isinstance(patents, str):
            patents = [patents]
        # Use a temporary directory if no output path is provided
        output_dir = output_path or tempfile.gettempdir()
        os.makedirs(output_dir, exist_ok=True)
        downloaded_files = []
        for i, patent in enumerate(patents):
            try:
                if self.verbose:
                    print(f"🔍 Downloading {i+1}/{len(patents)}: {patent}")
                file_path = self._download_single_pdf(patent, output_dir)
                downloaded_files.append(file_path)
                print(f"✅ Successfully downloaded: {file_path}")
            except Exception as e:
                print(f"❌ Failed to download {patent}: {e}")
        return downloaded_files

    def _download_single_pdf(self, patent_number: str, output_dir: str) -> str:
        """
        Download a single patent PDF.

        Parameters
        ----------
        patent_number : str
            The patent number (e.g., "US8676427B1").
        output_dir : str
            Directory to save the PDF.

        Returns
        -------
        str
            Path to the downloaded PDF file.

        Raises
        ------
        Exception
            If the patent page cannot be fetched, no PDF link is found,
            or the PDF itself cannot be downloaded.
        """
        # Construct the Google Patents URL
        patent_url = f"{self.base_url}/{patent_number}/en"
        if self.verbose:
            print(f"Fetching patent page: {patent_url}")
        # Fetch the HTML content of the patent page
        response = requests.get(patent_url, headers=self._headers, timeout=self.timeout)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch patent page for {patent_number}. HTTP Status: {response.status_code}")
        # Parse the HTML content and extract the PDF link
        soup = BeautifulSoup(response.content, "html.parser")
        pdf_url = self._extract_pdf_link(soup)
        if not pdf_url:
            raise Exception(f"No PDF link found for patent {patent_number}.")
        # The <a href> fallback may yield a relative URL; resolve it against
        # the page URL so requests can actually fetch it. Absolute URLs
        # (the meta-tag case) pass through urljoin unchanged.
        pdf_url = urljoin(patent_url, pdf_url)
        if self.verbose:
            print(f"Found PDF link: {pdf_url}")
        # Stream the PDF to disk in chunks instead of buffering the whole
        # payload in memory (patent PDFs can be tens of megabytes).
        pdf_response = requests.get(pdf_url, headers=self._headers, timeout=self.timeout, stream=True)
        if pdf_response.status_code != 200:
            raise Exception(f"Failed to download PDF for {patent_number}. HTTP Status: {pdf_response.status_code}")
        # Save the PDF to the specified output directory
        file_path = os.path.join(output_dir, f"{patent_number}.pdf")
        with open(file_path, "wb") as pdf_file:
            for chunk in pdf_response.iter_content(chunk_size=65536):
                pdf_file.write(chunk)
        return file_path

    @staticmethod
    def _extract_pdf_link(soup: BeautifulSoup) -> Optional[str]:
        """
        Extract the PDF link from the page's metadata.

        Parameters
        ----------
        soup : BeautifulSoup
            Parsed HTML content of the patent page.

        Returns
        -------
        Optional[str]
            The direct PDF link if found; may be relative when it comes
            from the <a href> fallback. None if nothing matched.
        """
        # Preferred source: the 'citation_pdf_url' meta tag, which Google
        # Patents emits with an absolute URL to the PDF.
        pdf_meta = soup.find("meta", {"name": "citation_pdf_url"})
        if pdf_meta and pdf_meta.get("content"):
            return pdf_meta["content"]
        # Fallback: search for any <a> tag whose href ends with '.pdf'
        pdf_links = [a['href'] for a in soup.find_all("a", href=True) if a['href'].endswith(".pdf")]
        if pdf_links:
            return pdf_links[0]  # Return the first matching PDF link
        return None
|