"""Scrape images page-by-page from a target site using Playwright.

Usage:
    python scrape_images_worker.py <url>

For each numbered sub-page of <url> (``<url>/1``, ``<url>/2`` ...) every image
in ``#image-container`` is downloaded and re-saved as JPEG (quality 80) under
``scraped_images/<safe-folder-name>/``.  Progress and errors are logged both
to stdout and to ``app_log.txt``.
"""

import logging
import os
import re
import subprocess
import sys
from io import BytesIO

import requests
from PIL import Image, UnidentifiedImageError
from playwright.sync_api import sync_playwright

log_file = "app_log.txt"  # path of the log file
# Shared log format (used by basicConfig below).
log_format = '%(asctime)s - %(levelname)s - %(message)s'

file_handler = logging.FileHandler(log_file, encoding='utf-8')

# Logging configuration: DEBUG level, mirrored to stdout and the log file.
logging.basicConfig(
    level=logging.DEBUG,
    format=log_format,
    handlers=[
        logging.StreamHandler(sys.stdout),  # log to standard output
        file_handler,
    ]
)

logger = logging.getLogger(__name__)


def run_command(command):
    """Run *command* in a subprocess and log its output.

    Args:
        command: Either a shell command string (executed with ``shell=True``,
            as before) or an argv list (executed without a shell — preferred,
            since it avoids shell quoting/injection issues).

    Returns:
        The process return code, or ``None`` if the command failed to start.
    """
    logger.info(f"Running command: {command}")
    try:
        result = subprocess.run(
            command,
            shell=isinstance(command, str),
            capture_output=True,
            text=True,
        )
        logger.info(f"Command output: {result.stdout}")
        if result.stderr:
            logger.error(f"Command error output: {result.stderr}")
        return result.returncode
    except Exception as e:
        logger.error(f"Failed to run command '{command}': {e}")
        return None


# Install Playwright's browsers at startup.  Use the interpreter that is
# actually running this script (sys.executable) rather than a hard-coded
# "python3", so virtualenvs and Windows installs work too.
install_command = [sys.executable, "-m", "playwright", "install"]
if run_command(install_command) != 0:
    logger.error("Playwright installation failed.")
    sys.exit(1)
else:
    logger.info("Playwright installed successfully.")


def generate_safe_folder_name(url):
    """Return *url* with every filesystem-unsafe character replaced by '_'."""
    logger.info(f"Generating a safe folder name from URL: {url}")
    safe_name = re.sub(r'[^a-zA-Z0-9_\-]', '_', url)
    logger.info(f"Generated folder name: {safe_name}")
    return safe_name


def save_image_as_jpg(image_url, save_folder, image_name):
    """Download *image_url* and save it as a JPEG (quality 80).

    Args:
        image_url: URL of the image to download.
        save_folder: Destination directory (created if missing).
        image_name: File name to save under ``save_folder``.

    Returns:
        The saved file path on success, ``None`` on any failure
        (download error, unreadable image, or save error).
    """
    logger.info(f"Saving image from {image_url} to folder: {save_folder} with name: {image_name}")

    # exist_ok=True avoids the check-then-create race of the original.
    os.makedirs(save_folder, exist_ok=True)

    try:
        logger.info(f"Downloading image from URL: {image_url}")
        response = requests.get(image_url, timeout=10)
        response.raise_for_status()  # raise on HTTP error status
        logger.info(f"Successfully downloaded image: {image_url}")
    except requests.exceptions.RequestException as e:
        logger.error(f"Error occurred during image download: {e}")
        return None

    try:
        logger.info(f"Opening image from response content")
        image = Image.open(BytesIO(response.content))
        logger.info(f"Image successfully opened")
    except UnidentifiedImageError:
        # Not a decodable image (e.g. an HTML error page) — skip, don't abort.
        logger.warning(f"Unidentified image file from URL: {image_url}. Skipping.")
        return None
    except Exception as e:
        logger.error(f"Error occurred while opening image: {e}")
        return None

    image_path = os.path.join(save_folder, image_name)
    try:
        logger.info(f"Converting image to JPEG and saving to {image_path}")
        # Convert to RGB first: JPEG cannot store alpha/palette modes.
        image.convert("RGB").save(image_path, "JPEG", quality=80)
        logger.info(f"Image saved successfully: {image_path}")
        return image_path
    except Exception as e:
        logger.error(f"Error occurred while saving image: {e}")
        return None


def scrape_images_by_page(url, folder_name='scraped_images'):
    """Scrape all images from every numbered sub-page of *url*.

    Determines the page count from the site (with a fallback selector,
    defaulting to 1), then visits ``<url>/<n>`` for each page and saves
    every image found in ``#image-container``.

    Args:
        url: Gallery base URL (trailing '/' is stripped).
        folder_name: Root output directory; images go into a per-URL
            subfolder beneath it.
    """
    logger.info(f"Starting image scraping for URL: {url}")
    url = url.rstrip('/')
    logger.info(f"Processed URL for scraping: {url}")

    with sync_playwright() as p:
        logger.info(f"Launching Chromium browser in headless mode")
        try:
            browser = p.chromium.launch(headless=True)  # headless browser
            page = browser.new_page()
        except Exception as e:
            logger.error(f"Failed to launch Chromium browser: {e}")
            return

        # Guarantee browser.close() even if navigation/scraping raises —
        # the original leaked the browser on any uncaught exception.
        try:
            logger.info(f"Accessing page: {url}")
            page.goto(url)
            page.wait_for_load_state('networkidle')
            logger.info(f"Page fully loaded: {url}")

            # Inject JavaScript to disable lazy-loading so all images load.
            try:
                logger.info(f"Disabling lazy-loading for images on the page")
                page.evaluate("""
                    document.querySelectorAll('img[loading="lazy"]').forEach(img => {
                        img.setAttribute('loading', 'eager');
                        img.src = img.src; // force the image to reload
                    });
                """)
                logger.info(f"Lazy-loading disabled")
            except Exception as eval_error:
                logger.warning(f"Error occurred during lazy-loading disablement: {eval_error}")

            safe_folder_name = generate_safe_folder_name(url)
            folder_path = os.path.join(folder_name, safe_folder_name)
            logger.info(f"Images will be saved to: {folder_path}")

            # Determine how many pages the gallery has.
            try:
                logger.info(f"Attempting to retrieve number of pages from the website")
                page_count_selector = 'div.tag-container:nth-child(8) > span:nth-child(1) > a:nth-child(1) > span:nth-child(1)'
                page_count_text = page.locator(page_count_selector).text_content().strip()
                num_pages = int(re.search(r'\d+', page_count_text).group())
                logger.info(f"Number of pages found: {num_pages}")
            except Exception as e:
                logger.warning(f"Failed to retrieve number of pages from the primary selector: {e}")
                try:
                    fallback_selector = 'section.reader-bar:nth-child(2) > div:nth-child(2) > button:nth-child(3) > span:nth-child(3)'
                    page.wait_for_selector(fallback_selector, timeout=5000)
                    num_pages_text = page.locator(fallback_selector).text_content().strip()
                    num_pages = int(re.search(r'\d+', num_pages_text).group())
                    logger.info(f"Number of pages found using fallback selector: {num_pages}")
                except Exception as e2:
                    logger.error(f"Failed to retrieve page count: {e2}. Defaulting to 1 page.")
                    num_pages = 1

            logger.info(f"Starting to scrape {num_pages} pages")

            # Visit each page and collect its images.
            for i in range(1, num_pages + 1):
                page_url = f"{url}/{i}"
                logger.info(f"Accessing page: {page_url}")
                page.goto(page_url)
                page.wait_for_load_state('networkidle')
                logger.info(f"Page {i} fully loaded")

                try:
                    logger.info(f"Attempting to locate images on page {i}")
                    img_selector = '#image-container > a > img'
                    img_elements = page.locator(img_selector)
                    img_count = img_elements.count()
                    logger.info(f"Found {img_count} images on page {i}")

                    if img_count == 0:
                        logger.warning(f"No images found on page {i}")
                        continue

                    for j in range(img_count):
                        try:
                            logger.info(f"Processing image {j + 1} on page {i}")
                            image_element = img_elements.nth(j)
                            image_url = image_element.get_attribute('src')
                            if not image_url:
                                # Lazily-loaded images keep the real URL here.
                                image_url = image_element.get_attribute('data-src')
                            logger.info(f"Image URL found: {image_url}")
                            if image_url:
                                image_name = f'page_{str(i).zfill(5)}_img_{str(j + 1).zfill(5)}.jpg'
                                saved_image_path = save_image_as_jpg(image_url, folder_path, image_name)
                                if saved_image_path:
                                    logger.info(f"Image saved successfully at: {saved_image_path}")
                                else:
                                    logger.error(f"Failed to save image {image_name} from page {i}")
                        except Exception as e:
                            logger.error(f"Error processing image {j + 1} on page {i}: {e}")
                            continue
                except Exception as e:
                    logger.error(f"Error occurred while retrieving images on page {i}: {e}")
                    continue
        finally:
            browser.close()
            logger.info(f"Browser closed")


if __name__ == "__main__":
    if len(sys.argv) < 2:
        # Original message was missing the <url> placeholder.
        logger.error("Usage: python scrape_images_worker.py <url>")
        sys.exit(1)

    url = sys.argv[1]
    folder_name = 'scraped_images'
    logger.info(f"Script started with URL: {url}")
    scrape_images_by_page(url, folder_name)
    logger.info("Script completed")