from typing import List, Optional, Union
import os
import tempfile

import requests
from bs4 import BeautifulSoup


class PatentDownloader:
    """
    A class to automate downloading patent PDFs from Google Patents.
    """

    base_url = "https://patents.google.com/patent"

    def __init__(self, verbose: bool = False):
        """
        Initialize the downloader.

        Parameters
        ----------
        verbose : bool
            If True, print detailed debug information.
        """
        self.verbose = verbose

    def download(self, patents: Union[str, List[str]],
                 output_path: Optional[str] = None) -> List[str]:
        """
        Download single or multiple patent PDFs.

        Parameters
        ----------
        patents : str or List[str]
            Single patent number or a list of patent numbers.
        output_path : Optional[str]
            Directory to save the PDFs. Defaults to a temporary directory.

        Returns
        -------
        List[str]
            List of paths to the downloaded PDFs.
        """
        if isinstance(patents, str):
            patents = [patents]

        # Use a temporary directory if no output path is provided
        output_dir = output_path or tempfile.gettempdir()
        os.makedirs(output_dir, exist_ok=True)

        downloaded_files = []
        for i, patent in enumerate(patents):
            try:
                if self.verbose:
                    print(f"🔍 Downloading {i + 1}/{len(patents)}: {patent}")
                file_path = self._download_single_pdf(patent, output_dir)
                downloaded_files.append(file_path)
                print(f"✅ Successfully downloaded: {file_path}")
            except Exception as e:
                print(f"❌ Failed to download {patent}: {e}")

        return downloaded_files

    def _download_single_pdf(self, patent_number: str, output_dir: str) -> str:
        """
        Download a single patent PDF.

        Parameters
        ----------
        patent_number : str
            The patent number (e.g., "US8676427B1").
        output_dir : str
            Directory to save the PDF.

        Returns
        -------
        str
            Path to the downloaded PDF file.
        """
        # Construct the Google Patents URL
        patent_url = f"{self.base_url}/{patent_number}/en"
        if self.verbose:
            print(f"Fetching patent page: {patent_url}")

        # Fetch the HTML content of the patent page
        response = requests.get(patent_url, timeout=30)
        if response.status_code != 200:
            raise Exception(
                f"Failed to fetch patent page for {patent_number}. "
                f"HTTP Status: {response.status_code}"
            )

        # Parse the HTML content and extract the PDF link
        soup = BeautifulSoup(response.content, "html.parser")
        pdf_url = self._extract_pdf_link(soup)
        if not pdf_url:
            raise Exception(f"No PDF link found for patent {patent_number}.")
        if self.verbose:
            print(f"Found PDF link: {pdf_url}")

        # Download the PDF file
        pdf_response = requests.get(pdf_url, timeout=30)
        if pdf_response.status_code != 200:
            raise Exception(
                f"Failed to download PDF for {patent_number}. "
                f"HTTP Status: {pdf_response.status_code}"
            )

        # Save the PDF to the specified output directory
        file_path = os.path.join(output_dir, f"{patent_number}.pdf")
        with open(file_path, "wb") as pdf_file:
            pdf_file.write(pdf_response.content)

        return file_path

    @staticmethod
    def _extract_pdf_link(soup: BeautifulSoup) -> Optional[str]:
        """
        Extract the PDF link from the page's metadata.

        Parameters
        ----------
        soup : BeautifulSoup
            Parsed HTML content of the patent page.

        Returns
        -------
        Optional[str]
            The direct PDF link if found.
        """
        # Look for the 'citation_pdf_url' meta tag
        pdf_meta = soup.find("meta", {"name": "citation_pdf_url"})
        if pdf_meta and pdf_meta.get("content"):
            return pdf_meta["content"]

        # Fallback: take the first <a> tag whose href ends with '.pdf'
        pdf_links = [a["href"] for a in soup.find_all("a", href=True)
                     if a["href"].endswith(".pdf")]
        if pdf_links:
            return pdf_links[0]  # Return the first matching PDF link

        return None
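

# Minimal usage sketch: downloads a single patent PDF with verbose logging.
# The patent number is the example from the docstring above; the "patents"
# output directory is an illustrative choice, not required by the class
# (omitting output_path falls back to the system temp directory).
if __name__ == "__main__":
    downloader = PatentDownloader(verbose=True)
    paths = downloader.download("US8676427B1", output_path="patents")
    print(f"Downloaded {len(paths)} file(s): {paths}")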