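"""Scraper for Digital Commonwealth (https://www.digitalcommonwealth.org) item pages.

Fetches an item page, pulls the site's JSON metadata from the page URL's
`.json` endpoint, collects <img> tags into structured records, and can
optionally download the images to a local directory.
"""
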
import requests
from bs4 import BeautifulSoup
import os
import json
import re
from typing import List, Dict, Optional
import logging
from urllib.parse import urljoin, urlparse

class DigitalCommonwealthScraper:
    def __init__(self, base_url: str = "https://www.digitalcommonwealth.org"):
        """
        Initialize the scraper with base URL and logging
        
        :param base_url: Base URL for Digital Commonwealth
        """
        self.base_url = base_url
        logging.basicConfig(level=logging.INFO)
        self.logger = logging.getLogger(__name__)
        
        # Headers to mimic browser request
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
    
    def fetch_page(self, url: str) -> Optional[requests.Response]:
        """
        Fetch webpage content with error handling
        
        :param url: URL to fetch
        :return: Response object, or None if the request failed
        """
        try:
            response = requests.get(url, headers=self.headers, timeout=30)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            self.logger.error(f"Error fetching {url}: {e}")
            return None
    
    def extract_json_metadata(self, url: str) -> Dict:
        """
        Extract JSON metadata from the page
        
        :param url: URL of the page
        :return: Dictionary of metadata
        """
        json_url = f"{url}.json"
        response = self.fetch_page(json_url)
        
        if response:
            try:
                return response.json()
            except json.JSONDecodeError:
                self.logger.error(f"Could not parse JSON from {json_url}")
                return {}
        return {}
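
    # For reference, the item's `.json` endpoint is assumed to return a
    # structure roughly like the sketch below; only these fields are read by
    # this scraper. This is an assumption inferred from the metadata keys used
    # in extract_images, not a documented API contract:
    #
    #   {
    #       "data": {
    #           "attributes": {
    #               "title_info_primary_tsi": "...",
    #               "abstract_tsi": "...",
    #               "subject_geographic_sim": ["..."]
    #           }
    #       }
    #   }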
    
    def extract_images(self, url: str) -> List[Dict]:
        """
        Extract images from the page
        
        :param url: URL of the page to scrape
        :return: List of image dictionaries
        """
        # Fetch page content
        response = self.fetch_page(url)
        if not response:
            return []
        
        # Parse HTML
        soup = BeautifulSoup(response.text, 'html.parser')
        
        # Extract JSON metadata
        metadata = self.extract_json_metadata(url)
        
        # List to store images
        images = []
        
        # Strategy 1: Look for known image viewer containers and, when one is
        # present, restrict the search to it; otherwise fall back to the page
        container = (
            soup.find('div', class_='viewer-container')
            or soup.find('div', class_='image-viewer')
            or soup.find('div', id='image-container')
        )
        scope = container or soup
        
        # Strategy 2: Find all image tags within the chosen scope
        img_tags = scope.find_all('img')
        
        # Build a record for each discovered image
        for img in img_tags:
            # Get image source
            src = img.get('src')
            if not src:
                continue
            
            # Resolve relative URLs
            full_src = urljoin(url, src)
            
            # Extract alt text or use filename
            alt = img.get('alt', os.path.basename(urlparse(full_src).path))
            
            # Create image dictionary
            image_info = {
                'url': full_src,
                'alt': alt,
                'source_page': url
            }
            
            # Try to add metadata if available
            if metadata:
                try:
                    # Extract relevant metadata from JSON if possible
                    image_info['metadata'] = {
                        'title': metadata.get('data', {}).get('attributes', {}).get('title_info_primary_tsi'),
                        'description': metadata.get('data', {}).get('attributes', {}).get('abstract_tsi'),
                        'subject': metadata.get('data', {}).get('attributes', {}).get('subject_geographic_sim')
                    }
                except Exception as e:
                    self.logger.warning(f"Error extracting metadata: {e}")
            
            images.append(image_info)
        
        return images
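
    # Illustrative shape of one returned image record (example values, not
    # real data from the site):
    #
    #   {
    #       "url": "https://www.digitalcommonwealth.org/.../image.jpg",
    #       "alt": "image.jpg",
    #       "source_page": "https://www.digitalcommonwealth.org/search/...",
    #       "metadata": {"title": "...", "description": "...", "subject": [...]}
    #   }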
    
    def download_images(self, images: List[Dict], output_dir: str = 'downloaded_images') -> List[str]:
        """
        Download images to local directory
        
        :param images: List of image dictionaries
        :param output_dir: Directory to save images
        :return: List of downloaded file paths
        """
        # Create output directory
        os.makedirs(output_dir, exist_ok=True)
        
        downloaded_files = []
        
        for i, image in enumerate(images):
            try:
                response = requests.get(image['url'], headers=self.headers, timeout=30)
                response.raise_for_status()
                
                # Generate filename
                ext = os.path.splitext(urlparse(image['url']).path)[1] or '.jpg'
                filename = os.path.join(output_dir, f'image_{i}{ext}')
                
                with open(filename, 'wb') as f:
                    f.write(response.content)
                
                downloaded_files.append(filename)
                self.logger.info(f"Downloaded: {filename}")
            
            except Exception as e:
                self.logger.error(f"Error downloading {image['url']}: {e}")
        
        return downloaded_files

def main():
    # Example usage
    scraper = DigitalCommonwealthScraper()
    
    # Example URL from input
    url = "https://www.digitalcommonwealth.org/search/commonwealth-oai:5712qh738"
    
    # Extract images
    images = scraper.extract_images(url)
    
    # Print image information
    for img in images:
        print(json.dumps(img, indent=2))
    
    # Optional: download the images locally
    # scraper.download_images(images)

if __name__ == "__main__":
    main()