# scrape_images_worker.py
import os
import re
from playwright.sync_api import sync_playwright
import requests
import sys
import subprocess
from PIL import Image, UnidentifiedImageError
from io import BytesIO
import logging

log_file = "app_log.txt"  # Path to the log file
log_format = '%(asctime)s - %(levelname)s - %(message)s'  # Log line format

file_handler = logging.FileHandler(log_file, encoding='utf-8')

# Logging configuration: DEBUG level, mirrored to stdout and the log file
logging.basicConfig(
    level=logging.DEBUG,
    format=log_format,
    handlers=[
        logging.StreamHandler(sys.stdout),  # Log to standard output
        file_handler,                       # Log to app_log.txt
    ]
)
logger = logging.getLogger(__name__)
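
# Note: configuring the root logger at DEBUG also surfaces verbose records from
# third-party libraries (e.g. urllib3, which requests uses) in both log targets.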
# Run a shell command and log its output
def run_command(command):
logger.info(f"Running command: {command}")
try:
result = subprocess.run(command, shell=True, capture_output=True, text=True)
logger.info(f"Command output: {result.stdout}")
if result.stderr:
logger.error(f"Command error output: {result.stderr}")
return result.returncode
except Exception as e:
logger.error(f"Failed to run command '{command}': {e}")
return None
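
# Note: shell=True passes the command string through the system shell, so this
# helper is only safe for the fixed, hard-coded commands used in this script.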
# Install Playwright's browser binaries
install_command = "python3 -m playwright install"
if run_command(install_command) != 0:
logger.error("Playwright installation failed.")
sys.exit(1)
else:
logger.info("Playwright installed successfully.")
# Generate a filesystem-safe folder name from a URL
def generate_safe_folder_name(url):
logger.info(f"Generating a safe folder name from URL: {url}")
safe_name = re.sub(r'[^a-zA-Z0-9_\-]', '_', url)
logger.info(f"Generated folder name: {safe_name}")
return safe_name
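
# Example (placeholder URL): generate_safe_folder_name("https://example.com/g/123")
# returns "https___example_com_g_123" — every character outside [a-zA-Z0-9_-]
# is replaced with "_".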
# Download an image and save it as a JPEG at 80% quality
def save_image_as_jpg(image_url, save_folder, image_name):
logger.info(f"Saving image from {image_url} to folder: {save_folder} with name: {image_name}")
if not os.path.exists(save_folder):
logger.info(f"Folder does not exist, creating new folder: {save_folder}")
os.makedirs(save_folder)
logger.info(f"Folder created: {save_folder}")
try:
logger.info(f"Downloading image from URL: {image_url}")
response = requests.get(image_url, timeout=10)
        response.raise_for_status()  # Raise on HTTP error status codes
logger.info(f"Successfully downloaded image: {image_url}")
except requests.exceptions.RequestException as e:
logger.error(f"Error occurred during image download: {e}")
return None
try:
        logger.info("Opening image from response content")
        image = Image.open(BytesIO(response.content))
        logger.info("Image successfully opened")
except UnidentifiedImageError:
logger.warning(f"Unidentified image file from URL: {image_url}. Skipping.")
return None
except Exception as e:
logger.error(f"Error occurred while opening image: {e}")
return None
image_path = os.path.join(save_folder, image_name)
try:
logger.info(f"Converting image to JPEG and saving to {image_path}")
image.convert("RGB").save(image_path, "JPEG", quality=80)
logger.info(f"Image saved successfully: {image_path}")
return image_path
except Exception as e:
logger.error(f"Error occurred while saving image: {e}")
return None
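
# Note: image.convert("RGB") flattens any alpha channel, so transparent PNG or
# WebP sources are converted to opaque JPEGs rather than rejected.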
# Scrape images page by page and save each one as a JPEG
def scrape_images_by_page(url, folder_name='scraped_images'):
logger.info(f"Starting image scraping for URL: {url}")
original_url = url
url = url.rstrip('/')
logger.info(f"Processed URL for scraping: {url}")
with sync_playwright() as p:
        logger.info("Launching Chromium browser in headless mode")
        try:
            browser = p.chromium.launch(headless=True)  # Launch the browser without a visible window
page = browser.new_page()
except Exception as e:
logger.error(f"Failed to launch Chromium browser: {e}")
return
logger.info(f"Accessing page: {url}")
page.goto(url)
page.wait_for_load_state('networkidle')
logger.info(f"Page fully loaded: {url}")
        # Inject JavaScript to disable lazy-loading so all images load immediately
        try:
            logger.info("Disabling lazy-loading for images on the page")
            page.evaluate("""
                document.querySelectorAll('img[loading="lazy"]').forEach(img => {
                    img.setAttribute('loading', 'eager');
                    img.src = img.src; // force the image to reload
                });
            """)
            logger.info("Lazy-loading disabled")
        except Exception as eval_error:
            logger.warning(f"Error occurred during lazy-loading disablement: {eval_error}")
safe_folder_name = generate_safe_folder_name(url)
folder_path = os.path.join(folder_name, safe_folder_name)
logger.info(f"Images will be saved to: {folder_path}")
        # Determine the number of pages in the gallery
try:
            logger.info("Attempting to retrieve number of pages from the website")
page_count_selector = 'div.tag-container:nth-child(8) > span:nth-child(1) > a:nth-child(1) > span:nth-child(1)'
page_count_text = page.locator(page_count_selector).text_content().strip()
num_pages = int(re.search(r'\d+', page_count_text).group())
logger.info(f"Number of pages found: {num_pages}")
except Exception as e:
logger.warning(f"Failed to retrieve number of pages from the primary selector: {e}")
try:
fallback_selector = 'section.reader-bar:nth-child(2) > div:nth-child(2) > button:nth-child(3) > span:nth-child(3)'
page.wait_for_selector(fallback_selector, timeout=5000)
num_pages_text = page.locator(fallback_selector).text_content().strip()
num_pages = int(re.search(r'\d+', num_pages_text).group())
logger.info(f"Number of pages found using fallback selector: {num_pages}")
except Exception as e2:
logger.error(f"Failed to retrieve page count: {e2}. Defaulting to 1 page.")
num_pages = 1
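        # Note: both page-count selectors above are tied to the target site's
        # current DOM layout; if the markup changes, the default of 1 page applies.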
logger.info(f"Starting to scrape {num_pages} pages")
        # Visit each page and download its images
for i in range(1, num_pages + 1):
page_url = f"{url}/{i}"
logger.info(f"Accessing page: {page_url}")
page.goto(page_url)
page.wait_for_load_state('networkidle')
logger.info(f"Page {i} fully loaded")
try:
logger.info(f"Attempting to locate images on page {i}")
img_selector = '#image-container > a > img'
img_elements = page.locator(img_selector)
img_count = img_elements.count()
logger.info(f"Found {img_count} images on page {i}")
if img_count == 0:
logger.warning(f"No images found on page {i}")
continue
for j in range(img_count):
try:
logger.info(f"Processing image {j + 1} on page {i}")
image_element = img_elements.nth(j)
                        # Fall back to data-src when src is empty (common with lazy-loaded images)
                        image_url = image_element.get_attribute('src') or image_element.get_attribute('data-src')
                        logger.info(f"Image URL found: {image_url}")
                        if image_url:
image_name = f'page_{str(i).zfill(5)}_img_{str(j + 1).zfill(5)}.jpg'
saved_image_path = save_image_as_jpg(image_url, folder_path, image_name)
if saved_image_path:
logger.info(f"Image saved successfully at: {saved_image_path}")
else:
logger.error(f"Failed to save image {image_name} from page {i}")
except Exception as e:
logger.error(f"Error processing image {j + 1} on page {i}: {e}")
continue
except Exception as e:
logger.error(f"Error occurred while retrieving images on page {i}: {e}")
continue
browser.close()
        logger.info("Browser closed")
if __name__ == "__main__":
if len(sys.argv) < 2:
logger.error("Usage: python scrape_images_worker.py <URL>")
sys.exit(1)
url = sys.argv[1]
folder_name = 'scraped_images'
logger.info(f"Script started with URL: {url}")
scrape_images_by_page(url, folder_name)
logger.info("Script completed")
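
# Example invocation (the URL is a placeholder, not a real gallery):
#   python3 scrape_images_worker.py "https://example.com/g/12345"
# Images are written to scraped_images/<safe_folder_name>/page_00001_img_00001.jpg, ...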