File size: 8,901 Bytes
d6926f4
 
 
 
 
a590766
d6926f4
a590766
babcb0b
d6926f4
 
 
 
 
 
 
 
 
a590766
d6926f4
 
 
 
 
 
 
 
a590766
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
d6926f4
 
babcb0b
d6926f4
babcb0b
d6926f4
 
 
 
babcb0b
 
d6926f4
babcb0b
d6926f4
babcb0b
d6926f4
 
babcb0b
d6926f4
 
babcb0b
d6926f4
babcb0b
a590766
d6926f4
 
babcb0b
d6926f4
babcb0b
d6926f4
babcb0b
a590766
d6926f4
babcb0b
a590766
d6926f4
 
 
babcb0b
d6926f4
babcb0b
a590766
d6926f4
babcb0b
a590766
d6926f4
 
 
babcb0b
d6926f4
 
babcb0b
d6926f4
 
babcb0b
a590766
 
 
 
 
 
 
babcb0b
d6926f4
 
babcb0b
d6926f4
 
 
babcb0b
d6926f4
 
 
 
 
 
babcb0b
d6926f4
babcb0b
d6926f4
 
 
babcb0b
d6926f4
 
 
babcb0b
d6926f4
 
 
babcb0b
d6926f4
babcb0b
d6926f4
 
 
 
 
babcb0b
d6926f4
babcb0b
 
d6926f4
babcb0b
d6926f4
 
 
 
babcb0b
d6926f4
 
babcb0b
d6926f4
 
babcb0b
d6926f4
 
 
babcb0b
d6926f4
 
babcb0b
d6926f4
 
 
 
babcb0b
d6926f4
 
 
 
babcb0b
d6926f4
 
 
a590766
 
 
 
 
d6926f4
babcb0b
d6926f4
 
babcb0b
d6926f4
 
 
babcb0b
d6926f4
 
 
babcb0b
d6926f4
 
babcb0b
 
 
d6926f4
babcb0b
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
import logging
import os
import re
import subprocess
import sys
from io import BytesIO

import requests
from PIL import Image, UnidentifiedImageError
from playwright.sync_api import sync_playwright

log_file = "app_log.txt"  # path of the persistent log file

# Shared log format: timestamp, severity, message.
log_format = '%(asctime)s - %(levelname)s - %(message)s'

file_handler = logging.FileHandler(log_file, encoding='utf-8')
# Logging configuration: DEBUG level, mirrored to stdout and to the log file.
logging.basicConfig(
    level=logging.DEBUG,  # capture everything, including DEBUG records
    format=log_format,  # reuse the shared format instead of duplicating the string
    handlers=[
        logging.StreamHandler(sys.stdout),  # echo log records to stdout
        file_handler,
    ]
)
logger = logging.getLogger(__name__)

# Run a shell command and record its result in the log.
def run_command(command):
    """Execute *command* in a shell and log its stdout/stderr.

    Args:
        command: Command line to execute. It is passed through the shell
            (shell=True), so it must come from trusted code — never build
            it from user or network input.

    Returns:
        The process return code (0 on success), or None when the command
        could not be executed at all.
    """
    # Lazy %-style arguments are the logging-module convention and defer
    # string formatting until a handler actually emits the record.
    logger.info("Running command: %s", command)
    try:
        result = subprocess.run(command, shell=True, capture_output=True, text=True)
        logger.info("Command output: %s", result.stdout)
        if result.stderr:
            logger.error("Command error output: %s", result.stderr)
        return result.returncode
    except Exception as e:
        logger.error("Failed to run command '%s': %s", command, e)
        return None

# Ensure the Playwright browser binaries are installed before any scraping.
install_command = "python3 -m playwright install"
_install_status = run_command(install_command)
if _install_status == 0:
    logger.info("Playwright installed successfully.")
else:
    # Covers both a non-zero exit code and None (command failed to run).
    logger.error("Playwright installation failed.")
    sys.exit(1)

# Build a filesystem-safe folder name from a URL.
def generate_safe_folder_name(url):
    """Return *url* with every character outside [a-zA-Z0-9_-] replaced by '_'."""
    logger.info(f"Generating a safe folder name from URL: {url}")
    sanitized = re.sub(r'[^a-zA-Z0-9_\-]', '_', url)
    logger.info(f"Generated folder name: {sanitized}")
    return sanitized

# Save an image as JPEG at 80% quality.
def save_image_as_jpg(image_url, save_folder, image_name):
    """Download *image_url* and store it in *save_folder* as *image_name*.

    The image is converted to RGB and written as JPEG at 80% quality.

    Args:
        image_url: URL of the image to download.
        save_folder: Destination folder (created if missing).
        image_name: File name for the saved JPEG.

    Returns:
        The saved file path on success, or None when the download, decode,
        or save step fails (errors are logged, never raised).
    """
    logger.info(f"Saving image from {image_url} to folder: {save_folder} with name: {image_name}")

    # exist_ok=True removes the check-then-create race of the previous
    # os.path.exists + os.makedirs pair (safe with concurrent workers).
    os.makedirs(save_folder, exist_ok=True)

    try:
        logger.info(f"Downloading image from URL: {image_url}")
        response = requests.get(image_url, timeout=10)
        response.raise_for_status()  # raise on HTTP error status
        logger.info(f"Successfully downloaded image: {image_url}")
    except requests.exceptions.RequestException as e:
        logger.error(f"Error occurred during image download: {e}")
        return None

    try:
        logger.info(f"Opening image from response content")
        image = Image.open(BytesIO(response.content))
        logger.info(f"Image successfully opened")
    except UnidentifiedImageError:
        # Not a decodable image (e.g. HTML error page) — skip, don't crash.
        logger.warning(f"Unidentified image file from URL: {image_url}. Skipping.")
        return None
    except Exception as e:
        logger.error(f"Error occurred while opening image: {e}")
        return None

    image_path = os.path.join(save_folder, image_name)
    try:
        logger.info(f"Converting image to JPEG and saving to {image_path}")
        # RGB conversion is required: JPEG cannot store alpha/palette modes.
        image.convert("RGB").save(image_path, "JPEG", quality=80)
        logger.info(f"Image saved successfully: {image_path}")
        return image_path
    except Exception as e:
        logger.error(f"Error occurred while saving image: {e}")
        return None

# Retrieve images from every page of a paginated gallery
def scrape_images_by_page(url, folder_name='scraped_images'):
    """Scrape all images from the paginated gallery at *url*.

    Launches headless Chromium, reads the total page count from the site's
    markup, then visits ``<url>/<page>`` for each page number and downloads
    every image matched by ``#image-container > a > img`` via
    save_image_as_jpg into ``folder_name/<url-derived subfolder>``.

    Args:
        url: Gallery URL; a trailing '/' is stripped before use.
        folder_name: Root output folder for downloaded images.

    Returns:
        None. Results are written to disk; all failures are logged and the
        loop continues with the next image/page.
    """
    logger.info(f"Starting image scraping for URL: {url}")
    original_url = url  # kept for reference only; scraping uses the stripped URL
    url = url.rstrip('/')
    logger.info(f"Processed URL for scraping: {url}")

    with sync_playwright() as p:
        logger.info(f"Launching Chromium browser in headless mode")
        try:
            browser = p.chromium.launch(headless=True)  # launch the browser without a visible window
            page = browser.new_page()
        except Exception as e:
            logger.error(f"Failed to launch Chromium browser: {e}")
            return

        logger.info(f"Accessing page: {url}")
        page.goto(url)
        page.wait_for_load_state('networkidle')
        logger.info(f"Page fully loaded: {url}")

        # Inject JavaScript that disables the lazy-loading attribute so every
        # image is fetched eagerly before we read its src.
        try:
            logger.info(f"Disabling lazy-loading for images on the page")
            page.evaluate("""
                document.querySelectorAll('img[loading="lazy"]').forEach(img => {
                    img.setAttribute('loading', 'eager');
                    img.src = img.src;  // 画像を強制的にリロード
                });
            """)
            logger.info(f"Lazy-loading disabled")
        except Exception as eval_error:
            logger.warning(f"Error occurred during lazy-loading disablement: {eval_error}")

        safe_folder_name = generate_safe_folder_name(url)
        folder_path = os.path.join(folder_name, safe_folder_name)
        logger.info(f"Images will be saved to: {folder_path}")

        # Determine the number of pages.
        # NOTE(review): both selectors below are site-specific — confirm they
        # still match the target site's current markup.
        try:
            logger.info(f"Attempting to retrieve number of pages from the website")
            page_count_selector = 'div.tag-container:nth-child(8) > span:nth-child(1) > a:nth-child(1) > span:nth-child(1)'
            page_count_text = page.locator(page_count_selector).text_content().strip()
            num_pages = int(re.search(r'\d+', page_count_text).group())
            logger.info(f"Number of pages found: {num_pages}")
        except Exception as e:
            logger.warning(f"Failed to retrieve number of pages from the primary selector: {e}")
            try:
                fallback_selector = 'section.reader-bar:nth-child(2) > div:nth-child(2) > button:nth-child(3) > span:nth-child(3)'
                page.wait_for_selector(fallback_selector, timeout=5000)
                num_pages_text = page.locator(fallback_selector).text_content().strip()
                num_pages = int(re.search(r'\d+', num_pages_text).group())
                logger.info(f"Number of pages found using fallback selector: {num_pages}")
            except Exception as e2:
                # Last resort: still scrape the first page.
                logger.error(f"Failed to retrieve page count: {e2}. Defaulting to 1 page.")
                num_pages = 1

        logger.info(f"Starting to scrape {num_pages} pages")

        # Visit each page and collect its images.
        for i in range(1, num_pages + 1):
            page_url = f"{url}/{i}"  # pages are addressed as <url>/<page number>
            logger.info(f"Accessing page: {page_url}")
            page.goto(page_url)
            page.wait_for_load_state('networkidle')
            logger.info(f"Page {i} fully loaded")

            try:
                logger.info(f"Attempting to locate images on page {i}")
                img_selector = '#image-container > a > img'
                img_elements = page.locator(img_selector)
                img_count = img_elements.count()
                logger.info(f"Found {img_count} images on page {i}")

                if img_count == 0:
                    logger.warning(f"No images found on page {i}")
                    continue

                for j in range(img_count):
                    try:
                        logger.info(f"Processing image {j + 1} on page {i}")
                        image_element = img_elements.nth(j)
                        image_url = image_element.get_attribute('src')
                        if not image_url:
                            # Fall back to data-src for images still in lazy mode.
                            image_url = image_element.get_attribute('data-src')
                        logger.info(f"Image URL found: {image_url}")

                        if image_url:
                            image_name = f'page_{str(i).zfill(5)}_img_{str(j + 1).zfill(5)}.jpg'
                            saved_image_path = save_image_as_jpg(image_url, folder_path, image_name)
                            if saved_image_path:
                                logger.info(f"Image saved successfully at: {saved_image_path}")
                            else:
                                logger.error(f"Failed to save image {image_name} from page {i}")
                    except Exception as e:
                        logger.error(f"Error processing image {j + 1} on page {i}: {e}")
                        continue
            except Exception as e:
                logger.error(f"Error occurred while retrieving images on page {i}: {e}")
                continue

        browser.close()
        logger.info(f"Browser closed")

if __name__ == "__main__":
    # The target URL is required as the first command-line argument.
    if len(sys.argv) < 2:
        logger.error("Usage: python scrape_images_worker.py <URL>")
        sys.exit(1)

    target_url = sys.argv[1]
    output_root = 'scraped_images'
    logger.info(f"Script started with URL: {target_url}")
    scrape_images_by_page(target_url, output_root)
    logger.info("Script completed")