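# NOTE(review): the original paste began with web-page residue
# ("Spaces: Running Running"); it is not part of the program.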
import json
import logging
import os
import re
from typing import Dict, List, Optional
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup
class DigitalCommonwealthScraper:
    """Collect image URLs (and optional item metadata) from a web page.

    The scraper fetches a page, lists every ``<img>`` tag that has a ``src``,
    and attaches a small metadata summary read from ``<page url>.json`` when
    that document is available. Images can then be saved to a local directory.
    """

    # Seconds to wait for the server before giving up on a request. Without a
    # timeout ``requests`` may block forever on an unresponsive host.
    DEFAULT_TIMEOUT = 30.0

    def __init__(self, base_url: str = "https://www.digitalcommonwealth.org",
                 timeout: float = DEFAULT_TIMEOUT):
        """
        Initialize the scraper with base URL and logging
        :param base_url: Base URL for Digital Commonwealth
        :param timeout: Per-request timeout in seconds
        """
        self.base_url = base_url
        self.timeout = timeout
        # basicConfig is a no-op when the root logger is already configured.
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)
        # Headers to mimic browser request
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def fetch_page(self, url: str) -> "Optional[requests.Response]":
        """
        Fetch webpage content with error handling
        :param url: URL to fetch
        :return: Response object, or None when the request fails or the
            server answers with an HTTP error status
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            self.logger.error(f"Error fetching {url}: {e}")
            return None
        return response

    def extract_json_metadata(self, url: str) -> Dict:
        """
        Extract JSON metadata from the page
        :param url: URL of the page; ".json" is appended to build the request
        :return: Parsed JSON document, or an empty dict when the document
            cannot be fetched or parsed
        """
        json_url = f"{url}.json"
        response = self.fetch_page(json_url)
        if response is None:
            return {}
        try:
            return response.json()
        except ValueError:
            # Every JSON decode error that response.json() can raise is a
            # ValueError subclass, whichever JSON backend requests uses.
            self.logger.error(f"Could not parse JSON from {json_url}")
            return {}

    def _summarize_metadata(self, metadata) -> Optional[Dict]:
        """
        Pick title, description and subject out of an item's JSON metadata
        :param metadata: Parsed JSON document (may be empty or malformed)
        :return: Summary dictionary, or None when there is nothing usable
        """
        if not metadata:
            return None
        try:
            attributes = metadata.get('data', {}).get('attributes', {})
            return {
                'title': attributes.get('title_info_primary_tsi'),
                'description': attributes.get('abstract_tsi'),
                'subject': attributes.get('subject_geographic_sim'),
            }
        except AttributeError as e:
            # Raised when the payload (or a nested level) is not a dict.
            self.logger.warning(f"Error extracting metadata: {e}")
            return None

    def extract_images(self, url: str) -> List[Dict]:
        """
        Extract images from the page
        :param url: URL of the page to scrape
        :return: List of image dictionaries with 'url', 'alt', 'source_page'
            and, when metadata could be read, 'metadata'
        """
        # Fetch page content
        response = self.fetch_page(url)
        if response is None:
            return []

        # Parse HTML
        soup = BeautifulSoup(response.text, 'html.parser')

        # The metadata is the same for every image on the page, so it is
        # fetched and summarized once, before the loop.
        summary = self._summarize_metadata(self.extract_json_metadata(url))

        images = []
        for img in soup.find_all('img'):
            src = img.get('src')
            if not src:
                continue

            # Resolve relative URLs against the page URL
            full_src = urljoin(url, src)

            image_info = {
                'url': full_src,
                # Fall back to the file name when the tag has no alt attribute
                'alt': img.get('alt', os.path.basename(urlparse(full_src).path)),
                'source_page': url,
            }
            if summary is not None:
                # Copy so that editing one entry does not affect the others.
                image_info['metadata'] = dict(summary)
            images.append(image_info)
        return images

    def download_images(self, images: List[Dict], output_dir: str = 'downloaded_images') -> List[str]:
        """
        Download images to local directory
        :param images: List of image dictionaries (each needs a 'url' key)
        :param output_dir: Directory to save images (created if missing)
        :return: List of downloaded file paths; failed downloads are logged
            and skipped
        """
        os.makedirs(output_dir, exist_ok=True)
        downloaded_files = []
        for i, image in enumerate(images):
            image_url = image.get('url')
            if not image_url:
                self.logger.error(f"Skipping image {i}: entry has no 'url'")
                continue
            try:
                response = requests.get(image_url, headers=self.headers, timeout=self.timeout)
                response.raise_for_status()
                # Keep the URL's extension; default to .jpg when it has none
                ext = os.path.splitext(urlparse(image_url).path)[1] or '.jpg'
                filename = os.path.join(output_dir, f'image_{i}{ext}')
                with open(filename, 'wb') as f:
                    f.write(response.content)
            except (requests.RequestException, OSError) as e:
                self.logger.error(f"Error downloading {image_url}: {e}")
                continue
            downloaded_files.append(filename)
            self.logger.info(f"Downloaded: {filename}")
        return downloaded_files
# Example usage (kept disabled; uncomment to run this module as a script):
#
# def main():
#     scraper = DigitalCommonwealthScraper()
#
#     # Example URL from input
#     url = "https://www.digitalcommonwealth.org/search/commonwealth-oai:5712qh738"
#
#     # Extract images
#     images = scraper.extract_images(url)
#
#     # Print image information
#     for img in images:
#         print(json.dumps(img, indent=2))
#
#     # Optional: Download images
#     scraper.download_images(images)
#
#
# if __name__ == "__main__":
#     main()