Spaces:
Sleeping
Sleeping
File size: 4,381 Bytes
2375a67 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 |
import os
import tempfile
from typing import List, Optional, Union
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup
class PatentDownloader:
    """
    Automate downloading patent PDFs from Google Patents.

    The downloader fetches the patent's HTML page, extracts the direct PDF
    URL (preferring the ``citation_pdf_url`` meta tag), and streams the PDF
    to disk.
    """

    base_url = "https://patents.google.com/patent"

    # Google Patents rejects the default `python-requests` User-Agent,
    # so present a plain browser-like one instead.
    _headers = {"User-Agent": "Mozilla/5.0 (compatible; PatentDownloader/1.0)"}

    def __init__(self, verbose: bool = False, timeout: float = 30.0):
        """
        Initialize the downloader.

        Parameters
        ----------
        verbose : bool
            If True, print detailed debug information.
        timeout : float
            Per-request timeout in seconds. Without one, a stalled server
            would hang the download forever.
        """
        self.verbose = verbose
        self.timeout = timeout

    def download(self, patents: Union[str, List[str]], output_path: Optional[str] = None) -> List[str]:
        """
        Download single or multiple patent PDFs.

        Failures for individual patents are reported and skipped, so one
        bad patent number does not abort the rest of the batch.

        Parameters
        ----------
        patents : str or List[str]
            Single patent number or a list of patent numbers.
        output_path : Optional[str]
            Directory to save the PDFs. Defaults to a temporary directory.

        Returns
        -------
        List[str]
            List of paths to the successfully downloaded PDFs.
        """
        if isinstance(patents, str):
            patents = [patents]
        # Use a temporary directory if no output path is provided
        output_dir = output_path or tempfile.gettempdir()
        os.makedirs(output_dir, exist_ok=True)
        downloaded_files = []
        for i, patent in enumerate(patents):
            try:
                if self.verbose:
                    print(f"🔍 Downloading {i+1}/{len(patents)}: {patent}")
                file_path = self._download_single_pdf(patent, output_dir)
                downloaded_files.append(file_path)
                print(f"✅ Successfully downloaded: {file_path}")
            except Exception as e:
                print(f"❌ Failed to download {patent}: {e}")
        return downloaded_files

    def _download_single_pdf(self, patent_number: str, output_dir: str) -> str:
        """
        Download a single patent PDF.

        Parameters
        ----------
        patent_number : str
            The patent number (e.g., "US8676427B1").
        output_dir : str
            Directory to save the PDF.

        Returns
        -------
        str
            Path to the downloaded PDF file.

        Raises
        ------
        Exception
            If the patent page cannot be fetched, no PDF link is found,
            or the PDF itself cannot be downloaded.
        """
        # Construct the Google Patents URL
        patent_url = f"{self.base_url}/{patent_number}/en"
        if self.verbose:
            print(f"Fetching patent page: {patent_url}")
        # Fetch the HTML content of the patent page
        response = requests.get(patent_url, headers=self._headers, timeout=self.timeout)
        if response.status_code != 200:
            raise Exception(f"Failed to fetch patent page for {patent_number}. HTTP Status: {response.status_code}")
        # Parse the HTML content and extract the PDF link
        soup = BeautifulSoup(response.content, "html.parser")
        pdf_url = self._extract_pdf_link(soup)
        if not pdf_url:
            raise Exception(f"No PDF link found for patent {patent_number}.")
        # The <a href> fallback may yield a relative URL; resolve it against
        # the page URL so requests can actually fetch it. Absolute URLs
        # (the meta-tag case) pass through urljoin unchanged.
        pdf_url = urljoin(patent_url, pdf_url)
        if self.verbose:
            print(f"Found PDF link: {pdf_url}")
        # Stream the PDF to disk in chunks instead of buffering the whole
        # payload in memory (patent PDFs can be tens of megabytes).
        pdf_response = requests.get(pdf_url, headers=self._headers, timeout=self.timeout, stream=True)
        if pdf_response.status_code != 200:
            raise Exception(f"Failed to download PDF for {patent_number}. HTTP Status: {pdf_response.status_code}")
        # Save the PDF to the specified output directory
        file_path = os.path.join(output_dir, f"{patent_number}.pdf")
        with open(file_path, "wb") as pdf_file:
            for chunk in pdf_response.iter_content(chunk_size=65536):
                pdf_file.write(chunk)
        return file_path

    @staticmethod
    def _extract_pdf_link(soup: BeautifulSoup) -> Optional[str]:
        """
        Extract the PDF link from the page's metadata.

        Parameters
        ----------
        soup : BeautifulSoup
            Parsed HTML content of the patent page.

        Returns
        -------
        Optional[str]
            The direct PDF link if found; may be relative when it comes
            from the <a href> fallback. None if nothing matched.
        """
        # Preferred source: the 'citation_pdf_url' meta tag, which Google
        # Patents emits with an absolute URL to the PDF.
        pdf_meta = soup.find("meta", {"name": "citation_pdf_url"})
        if pdf_meta and pdf_meta.get("content"):
            return pdf_meta["content"]
        # Fallback: search for any <a> tag whose href ends with '.pdf'
        pdf_links = [a['href'] for a in soup.find_all("a", href=True) if a['href'].endswith(".pdf")]
        if pdf_links:
            return pdf_links[0]  # Return the first matching PDF link
        return None
|