Spaces:

DrishtiSharma
/

quick-spin

Sleeping

App Files Files Community

DrishtiSharma committed on Dec 20, 2024

Commit

2375a67

verified ·

1 Parent(s): 2621cc6

Create patent_downloader.py

Browse files

Files changed (1) hide show

patent_downloader.py +129 -0

patent_downloader.py ADDED Viewed

	@@ -0,0 +1,129 @@

import os
import re
import tempfile
from typing import List, Union, Optional
from urllib.parse import quote, urljoin

import requests
from bs4 import BeautifulSoup


class PatentDownloader:
    """
    Download patent PDFs from Google Patents.

    For every patent number the patent's HTML page is fetched, the PDF URL is
    read from that page, and the PDF is written to disk.

    Attributes
    ----------
    base_url : str
        Prefix used to build every patent page URL.
    timeout : float
        Seconds to wait on each HTTP request before giving up.
    """

    base_url = "https://patents.google.com/patent"
    # requests waits indefinitely unless a timeout is passed, so one stalled
    # connection would otherwise block the whole batch.
    timeout = 30

    def __init__(self, verbose: bool = False, timeout: Optional[float] = None):
        """
        Initialize the downloader.

        Parameters
        ----------
        verbose : bool
            If True, print detailed debug information.
        timeout : Optional[float]
            Per-request timeout in seconds. When omitted, the class-level
            default (``PatentDownloader.timeout``) is used.
        """
        self.verbose = verbose
        if timeout is not None:
            self.timeout = timeout

    def download(self, patents: Union[str, List[str]], output_path: Optional[str] = None) -> List[str]:
        """
        Download single or multiple patent PDFs.

        Each patent is attempted independently: a failure is reported on
        stdout and the loop moves on to the next patent.

        Parameters
        ----------
        patents : str or List[str]
            Single patent number or a list of patent numbers.
        output_path : Optional[str]
            Directory to save the PDFs. Defaults to the system temporary
            directory (``tempfile.gettempdir()``). Created if missing.

        Returns
        -------
        List[str]
            Paths of the PDFs that were downloaded successfully, in input
            order. Patents that failed are absent from the list.
        """
        if isinstance(patents, str):
            patents = [patents]
        else:
            # Materialize so that len() works for any iterable, not only lists.
            patents = list(patents)

        output_dir = output_path or tempfile.gettempdir()
        os.makedirs(output_dir, exist_ok=True)

        downloaded_files = []
        total = len(patents)
        for position, patent in enumerate(patents, start=1):
            if self.verbose:
                print(f"🔍 Downloading {position}/{total}: {patent}")
            try:
                file_path = self._download_single_pdf(patent, output_dir)
            except Exception as e:
                # Deliberately broad: this is the batch boundary, and one bad
                # patent must not abort the remaining downloads.
                print(f"❌ Failed to download {patent}: {e}")
            else:
                downloaded_files.append(file_path)
                print(f"✅ Successfully downloaded: {file_path}")
        return downloaded_files

    def _download_single_pdf(self, patent_number: str, output_dir: str) -> str:
        """
        Download a single patent PDF.

        Parameters
        ----------
        patent_number : str
            The patent number (e.g., "US8676427B1"). Surrounding whitespace
            is ignored.
        output_dir : str
            Existing directory to save the PDF in.

        Returns
        -------
        str
            Path to the downloaded PDF file.

        Raises
        ------
        ValueError
            If the patent number is empty or only whitespace.
        RuntimeError
            If the patent page or the PDF does not return HTTP 200, or if no
            PDF link can be found on the patent page.
        """
        patent_number = str(patent_number).strip()
        if not patent_number:
            # Fail before any network traffic; an empty number would request
            # ".../patent//en".
            raise ValueError("Patent number must not be empty.")

        # Escape the number so characters such as '/', '?' or spaces cannot
        # alter the structure of the request URL.
        patent_url = f"{self.base_url}/{quote(patent_number, safe='')}/en"
        if self.verbose:
            print(f"Fetching patent page: {patent_url}")

        response = requests.get(patent_url, timeout=self.timeout)
        if response.status_code != 200:
            raise RuntimeError(
                f"Failed to fetch patent page for {patent_number}. HTTP Status: {response.status_code}"
            )

        soup = BeautifulSoup(response.content, "html.parser")
        pdf_url = self._extract_pdf_link(soup)
        if not pdf_url:
            raise RuntimeError(f"No PDF link found for patent {patent_number}.")

        # The <a href> fallback may yield a relative link; resolve it against
        # the page it came from. Absolute URLs pass through unchanged.
        pdf_url = urljoin(response.url, pdf_url)
        if self.verbose:
            print(f"Found PDF link: {pdf_url}")

        pdf_response = requests.get(pdf_url, timeout=self.timeout)
        if pdf_response.status_code != 200:
            raise RuntimeError(
                f"Failed to download PDF for {patent_number}. HTTP Status: {pdf_response.status_code}"
            )

        file_path = os.path.join(output_dir, self._safe_filename(patent_number))
        with open(file_path, "wb") as pdf_file:
            pdf_file.write(pdf_response.content)
        return file_path

    @staticmethod
    def _safe_filename(patent_number: str) -> str:
        """
        Build the PDF file name for a patent number.

        Every run of characters outside ``A-Z a-z 0-9 . _ -`` is replaced by
        a single underscore and leading/trailing dots and underscores are
        removed, so the result contains no path separators and cannot point
        outside the output directory.

        Parameters
        ----------
        patent_number : str
            The patent number as supplied by the caller.

        Returns
        -------
        str
            A file name ending in ".pdf"; "patent.pdf" if nothing usable is
            left after sanitizing.
        """
        stem = re.sub(r"[^A-Za-z0-9._-]+", "_", patent_number).strip("._")
        return f"{stem or 'patent'}.pdf"

    @staticmethod
    def _extract_pdf_link(soup: "BeautifulSoup") -> Optional[str]:
        """
        Extract the PDF link from a parsed patent page.

        Parameters
        ----------
        soup : BeautifulSoup
            Parsed HTML content of the patent page.

        Returns
        -------
        Optional[str]
            The ``content`` of the ``citation_pdf_url`` meta tag when present
            and non-empty; otherwise the ``href`` of the first ``<a>`` tag
            whose link ends in ".pdf" (case-insensitive); otherwise None.
            The fallback link is returned as written in the page and may be
            relative.
        """
        # Preferred source: the 'citation_pdf_url' meta tag.
        pdf_meta = soup.find("meta", {"name": "citation_pdf_url"})
        if pdf_meta and pdf_meta.get("content"):
            return pdf_meta["content"]

        # Fallback: first anchor whose href ends with '.pdf' in any letter case.
        for anchor in soup.find_all("a", href=True):
            href = anchor["href"]
            if href.lower().endswith(".pdf"):
                return href
        return None