# File size: 5,381 Bytes
# f78d694
# 1
import json
import logging
import os
import re
from typing import Dict, List, Optional
from urllib.parse import urljoin, urlparse

import requests
from bs4 import BeautifulSoup

class DigitalCommonwealthScraper:
    """
    Scrape image URLs and item metadata from Digital Commonwealth pages.

    Typical use is ``extract_images(url)`` followed by ``download_images(...)``.
    Network failures are logged and reported through empty/``None`` results
    rather than raised, so a single bad page or image does not abort a run.
    """

    # Seconds to wait on any single HTTP request. Declared at class level so a
    # sensible default exists even if an instance is built without __init__.
    timeout: float = 30.0

    def __init__(self, base_url: str = "https://www.digitalcommonwealth.org",
                 timeout: float = 30.0):
        """
        Initialize the scraper with base URL and logging

        :param base_url: Base URL for Digital Commonwealth
        :param timeout: Seconds to wait for each HTTP request before giving up
        """
        self.base_url = base_url
        self.timeout = timeout
        # basicConfig only has an effect if the root logger is still
        # unconfigured, so an application's own logging setup is left alone.
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)

        # Headers to mimic browser request
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def fetch_page(self, url: str) -> "Optional[requests.Response]":
        """
        Fetch webpage content with error handling

        :param url: URL to fetch
        :return: Response object, or None if the request failed or returned
                 an HTTP error status (the error is logged)
        """
        try:
            # Without a timeout requests waits forever on a stalled server.
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
        except requests.RequestException as e:
            self.logger.error(f"Error fetching {url}: {e}")
            return None
        return response

    @staticmethod
    def _json_url(url: str) -> str:
        """
        Build the ``.json`` variant of a page URL.

        The suffix is appended to the URL *path* (ignoring a trailing slash),
        so any query string or fragment stays after it instead of ending up
        in front of the suffix.

        :param url: URL of the HTML page
        :return: URL of the corresponding JSON document
        """
        parsed = urlparse(url)
        return parsed._replace(path=parsed.path.rstrip('/') + '.json').geturl()

    def extract_json_metadata(self, url: str) -> Dict:
        """
        Extract JSON metadata from the page

        :param url: URL of the page
        :return: Dictionary of metadata; empty if the JSON document could not
                 be fetched, could not be parsed, or is not a JSON object
        """
        json_url = self._json_url(url)
        response = self.fetch_page(json_url)
        if response is None:
            return {}

        try:
            payload = response.json()
        except ValueError:
            # Every JSON decode error raised by requests/json/simplejson is a
            # ValueError subclass, whichever backend is installed.
            self.logger.error(f"Could not parse JSON from {json_url}")
            return {}

        if not isinstance(payload, dict):
            # Callers use dict methods on the result; honour the return type.
            self.logger.warning(f"Unexpected JSON structure from {json_url}")
            return {}
        return payload

    def _summarize_metadata(self, metadata: Dict) -> Optional[Dict]:
        """
        Pick the title/description/subject fields out of the item JSON.

        :param metadata: Result of ``extract_json_metadata``
        :return: Dictionary with 'title', 'description' and 'subject' keys
                 (values may be None), or None when there is no metadata or
                 it does not have the expected nested-dict structure
        """
        if not metadata:
            return None
        try:
            attributes = metadata.get('data', {}).get('attributes', {})
            return {
                'title': attributes.get('title_info_primary_tsi'),
                'description': attributes.get('abstract_tsi'),
                'subject': attributes.get('subject_geographic_sim'),
            }
        except AttributeError as e:
            # 'data' or 'attributes' was present but not a dictionary.
            self.logger.warning(f"Error extracting metadata: {e}")
            return None

    def extract_images(self, url: str) -> List[Dict]:
        """
        Extract images from the page

        Every ``<img>`` tag with a ``src`` attribute is reported. Each entry
        has 'url' (absolute), 'alt' and 'source_page' keys, plus a 'metadata'
        key when the page's JSON metadata could be read.

        :param url: URL of the page to scrape
        :return: List of image dictionaries (empty if the page fetch failed)
        """
        # Fetch page content
        response = self.fetch_page(url)
        if response is None:
            return []

        # Parse HTML
        soup = BeautifulSoup(response.text, 'html.parser')

        # The metadata describes the page, not an individual image, so it is
        # fetched and summarised once rather than once per <img> tag.
        metadata_summary = self._summarize_metadata(self.extract_json_metadata(url))

        images = []
        for img in soup.find_all('img'):
            src = img.get('src')
            if not src:
                continue

            # Resolve relative URLs
            full_src = urljoin(url, src)

            # Fall back to the filename when alt is missing *or* empty.
            alt = img.get('alt') or os.path.basename(urlparse(full_src).path)

            image_info = {
                'url': full_src,
                'alt': alt,
                'source_page': url
            }
            if metadata_summary is not None:
                # Copy so that callers mutating one entry do not affect others.
                image_info['metadata'] = dict(metadata_summary)

            images.append(image_info)

        return images

    def download_images(self, images: List[Dict], output_dir: str = 'downloaded_images') -> List[str]:
        """
        Download images to local directory

        Files are named ``image_<index><ext>`` where ``<index>`` is the
        position of the entry in ``images`` and ``<ext>`` comes from the URL
        path ('.jpg' if the path has no extension). A failure on one image is
        logged and the remaining images are still attempted.

        :param images: List of image dictionaries (each needs a 'url' key)
        :param output_dir: Directory to save images
        :return: List of downloaded file paths
        """
        # Create output directory
        os.makedirs(output_dir, exist_ok=True)

        downloaded_files = []

        for i, image in enumerate(images):
            image_url = image.get('url')
            if not image_url:
                self.logger.warning(f"Skipping image {i}: no 'url' entry")
                continue

            # Deliberately broad: downloading is best-effort per image, and
            # one bad entry must not stop the rest of the batch.
            try:
                response = requests.get(image_url, headers=self.headers, timeout=self.timeout)
                response.raise_for_status()

                # Generate filename
                ext = os.path.splitext(urlparse(image_url).path)[1] or '.jpg'
                filename = os.path.join(output_dir, f'image_{i}{ext}')

                with open(filename, 'wb') as f:
                    f.write(response.content)

                downloaded_files.append(filename)
                self.logger.info(f"Downloaded: {filename}")

            except Exception as e:
                self.logger.error(f"Error downloading {image_url}: {e}")

        return downloaded_files