File size: 8,901 Bytes
d6926f4 a590766 d6926f4 a590766 babcb0b d6926f4 a590766 d6926f4 a590766 d6926f4 babcb0b d6926f4 babcb0b d6926f4 babcb0b d6926f4 babcb0b d6926f4 babcb0b d6926f4 babcb0b d6926f4 babcb0b d6926f4 babcb0b a590766 d6926f4 babcb0b d6926f4 babcb0b d6926f4 babcb0b a590766 d6926f4 babcb0b a590766 d6926f4 babcb0b d6926f4 babcb0b a590766 d6926f4 babcb0b a590766 d6926f4 babcb0b d6926f4 babcb0b d6926f4 babcb0b a590766 babcb0b d6926f4 babcb0b d6926f4 babcb0b d6926f4 babcb0b d6926f4 babcb0b d6926f4 babcb0b d6926f4 babcb0b d6926f4 babcb0b d6926f4 babcb0b d6926f4 babcb0b d6926f4 babcb0b d6926f4 babcb0b d6926f4 babcb0b d6926f4 babcb0b d6926f4 babcb0b d6926f4 babcb0b d6926f4 babcb0b d6926f4 babcb0b d6926f4 babcb0b d6926f4 a590766 d6926f4 babcb0b d6926f4 babcb0b d6926f4 babcb0b d6926f4 babcb0b d6926f4 babcb0b d6926f4 babcb0b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 |
"""Scrape gallery images page-by-page with Playwright and save them as JPEGs.

Logging is configured at import time: DEBUG-level records are mirrored to
stdout and to ``app_log.txt`` (UTF-8).
"""
import logging
import os
import re
import subprocess
import sys
from io import BytesIO

import requests
from PIL import Image, UnidentifiedImageError
from playwright.sync_api import sync_playwright

log_file = "app_log.txt"  # path of the log file
# Shared log record format: timestamp - level - message
log_format = '%(asctime)s - %(levelname)s - %(message)s'
file_handler = logging.FileHandler(log_file, encoding='utf-8')
# Configure logging once for the whole script; previously the format string
# was duplicated inline while ``log_format`` sat unused.
logging.basicConfig(
    level=logging.DEBUG,          # capture everything, including DEBUG
    format=log_format,
    handlers=[
        logging.StreamHandler(sys.stdout),  # echo records to stdout
        file_handler,                       # and persist them to app_log.txt
    ]
)
logger = logging.getLogger(__name__)
# Helper that runs a shell command and records its result in the log.
def run_command(command):
    """Run *command* in a shell and log its stdout/stderr.

    Args:
        command: Shell command line to execute.

    Returns:
        The process exit code, or ``None`` when the command could not be
        executed at all (the failure is logged, never raised).
    """
    # Resolve the logger locally; getLogger returns the same singleton as the
    # module-level ``logger``, and this keeps the helper self-contained.
    log = logging.getLogger(__name__)
    # Lazy %-style args avoid formatting when the level is filtered out.
    log.info("Running command: %s", command)
    try:
        # NOTE: shell=True — only ever pass trusted, internally-built strings.
        result = subprocess.run(command, shell=True, capture_output=True, text=True)
        log.info("Command output: %s", result.stdout)
        if result.stderr:
            log.error("Command error output: %s", result.stderr)
        return result.returncode
    except Exception as e:
        log.error("Failed to run command '%s': %s", command, e)
        return None
# Install Playwright's browser binaries up front; without them every later
# chromium.launch() would fail, so abort the script on any install failure
# (a None return from run_command also lands in the abort branch).
install_command = "python3 -m playwright install"
if run_command(install_command) == 0:
    logger.info("Playwright installed successfully.")
else:
    logger.error("Playwright installation failed.")
    sys.exit(1)
# Helper that derives a filesystem-safe folder name from a URL.
def generate_safe_folder_name(url):
    """Return *url* with every character outside ``[a-zA-Z0-9_-]`` replaced
    by an underscore, making it usable as a directory name."""
    log = logging.getLogger(__name__)  # same singleton as the module logger
    log.info(f"Generating a safe folder name from URL: {url}")
    safe_name = re.sub(r'[^a-zA-Z0-9_\-]', '_', url)
    log.info(f"Generated folder name: {safe_name}")
    return safe_name
# Download one image and store it as a JPEG at 80% quality.
def save_image_as_jpg(image_url, save_folder, image_name):
    """Download *image_url* and save it as ``save_folder/image_name`` (JPEG, quality 80).

    Args:
        image_url: URL of the image to fetch.
        save_folder: Destination directory (created if missing).
        image_name: File name for the saved JPEG.

    Returns:
        The saved file path, or ``None`` when the download, decode, or save
        step fails — all failures are logged, never raised.
    """
    logger.info(f"Saving image from {image_url} to folder: {save_folder} with name: {image_name}")
    # exist_ok=True removes the race between the old existence check and the
    # directory creation (two workers could both pass the check).
    os.makedirs(save_folder, exist_ok=True)
    try:
        logger.info(f"Downloading image from URL: {image_url}")
        response = requests.get(image_url, timeout=10)
        response.raise_for_status()  # raise on 4xx/5xx HTTP status
        logger.info(f"Successfully downloaded image: {image_url}")
    except requests.exceptions.RequestException as e:
        logger.error(f"Error occurred during image download: {e}")
        return None
    try:
        logger.info(f"Opening image from response content")
        image = Image.open(BytesIO(response.content))
        logger.info(f"Image successfully opened")
    except UnidentifiedImageError:
        # Server returned something that is not a decodable image.
        logger.warning(f"Unidentified image file from URL: {image_url}. Skipping.")
        return None
    except Exception as e:
        logger.error(f"Error occurred while opening image: {e}")
        return None
    image_path = os.path.join(save_folder, image_name)
    try:
        logger.info(f"Converting image to JPEG and saving to {image_path}")
        # Convert to RGB first: JPEG cannot store alpha or palette modes.
        image.convert("RGB").save(image_path, "JPEG", quality=80)
        logger.info(f"Image saved successfully: {image_path}")
        return image_path
    except Exception as e:
        logger.error(f"Error occurred while saving image: {e}")
        return None
# Page-by-page retrieval of every image in a gallery.
def scrape_images_by_page(url, folder_name='scraped_images'):
    """Scrape every image of the gallery at *url*, visiting one page at a time.

    Assumes the site serves page ``i`` at ``<url>/<i>`` — TODO confirm this
    URL scheme against the target site.  Images are saved as JPEGs under
    ``folder_name/<safe-url>/``.  Errors are logged and skipped; the function
    always returns ``None``.  (Removed the unused ``original_url`` local.)
    """
    logger.info(f"Starting image scraping for URL: {url}")
    url = url.rstrip('/')  # normalize so page URLs can be built as f"{url}/{i}"
    logger.info(f"Processed URL for scraping: {url}")
    with sync_playwright() as p:
        logger.info(f"Launching Chromium browser in headless mode")
        try:
            browser = p.chromium.launch(headless=True)  # run the browser headless
            page = browser.new_page()
        except Exception as e:
            logger.error(f"Failed to launch Chromium browser: {e}")
            return
        logger.info(f"Accessing page: {url}")
        page.goto(url)
        page.wait_for_load_state('networkidle')
        logger.info(f"Page fully loaded: {url}")
        # Inject JavaScript that disables lazy-loading so every <img> carries
        # a usable src before scraping starts.
        try:
            logger.info(f"Disabling lazy-loading for images on the page")
            page.evaluate("""
                document.querySelectorAll('img[loading="lazy"]').forEach(img => {
                    img.setAttribute('loading', 'eager');
                    img.src = img.src; // 画像を強制的にリロード
                });
            """)
            logger.info(f"Lazy-loading disabled")
        except Exception as eval_error:
            logger.warning(f"Error occurred during lazy-loading disablement: {eval_error}")
        safe_folder_name = generate_safe_folder_name(url)
        folder_path = os.path.join(folder_name, safe_folder_name)
        logger.info(f"Images will be saved to: {folder_path}")
        # Determine the page count: primary CSS selector first, then a
        # fallback selector, finally defaulting to a single page.
        try:
            logger.info(f"Attempting to retrieve number of pages from the website")
            page_count_selector = 'div.tag-container:nth-child(8) > span:nth-child(1) > a:nth-child(1) > span:nth-child(1)'
            page_count_text = page.locator(page_count_selector).text_content().strip()
            num_pages = int(re.search(r'\d+', page_count_text).group())
            logger.info(f"Number of pages found: {num_pages}")
        except Exception as e:
            logger.warning(f"Failed to retrieve number of pages from the primary selector: {e}")
            try:
                fallback_selector = 'section.reader-bar:nth-child(2) > div:nth-child(2) > button:nth-child(3) > span:nth-child(3)'
                page.wait_for_selector(fallback_selector, timeout=5000)
                num_pages_text = page.locator(fallback_selector).text_content().strip()
                num_pages = int(re.search(r'\d+', num_pages_text).group())
                logger.info(f"Number of pages found using fallback selector: {num_pages}")
            except Exception as e2:
                logger.error(f"Failed to retrieve page count: {e2}. Defaulting to 1 page.")
                num_pages = 1
        logger.info(f"Starting to scrape {num_pages} pages")
        # Visit each page and download the images found on it.
        for i in range(1, num_pages + 1):
            page_url = f"{url}/{i}"
            logger.info(f"Accessing page: {page_url}")
            page.goto(page_url)
            page.wait_for_load_state('networkidle')
            logger.info(f"Page {i} fully loaded")
            try:
                logger.info(f"Attempting to locate images on page {i}")
                img_selector = '#image-container > a > img'
                img_elements = page.locator(img_selector)
                img_count = img_elements.count()
                logger.info(f"Found {img_count} images on page {i}")
                if img_count == 0:
                    logger.warning(f"No images found on page {i}")
                    continue
                for j in range(img_count):
                    try:
                        logger.info(f"Processing image {j + 1} on page {i}")
                        image_element = img_elements.nth(j)
                        image_url = image_element.get_attribute('src')
                        if not image_url:
                            # Lazy-loaded images may keep the real URL in data-src.
                            image_url = image_element.get_attribute('data-src')
                        logger.info(f"Image URL found: {image_url}")
                        if image_url:
                            image_name = f'page_{str(i).zfill(5)}_img_{str(j + 1).zfill(5)}.jpg'
                            saved_image_path = save_image_as_jpg(image_url, folder_path, image_name)
                            if saved_image_path:
                                logger.info(f"Image saved successfully at: {saved_image_path}")
                            else:
                                logger.error(f"Failed to save image {image_name} from page {i}")
                    except Exception as e:
                        logger.error(f"Error processing image {j + 1} on page {i}: {e}")
                        continue
            except Exception as e:
                logger.error(f"Error occurred while retrieving images on page {i}: {e}")
                continue
        browser.close()
        logger.info(f"Browser closed")
if __name__ == "__main__":
    # Require exactly one positional argument: the gallery URL to scrape.
    args = sys.argv[1:]
    if not args:
        logger.error("Usage: python scrape_images_worker.py <URL>")
        sys.exit(1)
    url = args[0]
    folder_name = 'scraped_images'
    logger.info(f"Script started with URL: {url}")
    scrape_images_by_page(url, folder_name)
    logger.info("Script completed")
|