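"""Scrape images page by page from a gallery-style site with Playwright and save them as JPEG files."""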
import logging
import os
import re
import subprocess
import sys
from io import BytesIO

import requests
from PIL import Image, UnidentifiedImageError
from playwright.sync_api import sync_playwright

log_file = "app_log.txt"
log_format = '%(asctime)s - %(levelname)s - %(message)s'

# Log to stdout and to a UTF-8 file so runs can be reviewed afterwards.
file_handler = logging.FileHandler(log_file, encoding='utf-8')
logging.basicConfig(
    level=logging.DEBUG,
    format=log_format,
    handlers=[
        logging.StreamHandler(sys.stdout),
        file_handler,
    ],
)
logger = logging.getLogger(__name__)


def run_command(command):
    """Run a shell command, log its output, and return its exit code (None if it could not be launched)."""
    logger.info(f"Running command: {command}")
    try:
        result = subprocess.run(command, shell=True, capture_output=True, text=True)
        logger.info(f"Command output: {result.stdout}")
        if result.stderr:
            logger.error(f"Command error output: {result.stderr}")
        return result.returncode
    except Exception as e:
        logger.error(f"Failed to run command '{command}': {e}")
        return None
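
# Illustrative check: run_command("echo hi") logs "hi" and returns 0 on success.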


# Make sure Playwright's browser binaries are present before any scraping runs.
install_command = "python3 -m playwright install"
if run_command(install_command) != 0:
    logger.error("Playwright installation failed.")
    sys.exit(1)
else:
    logger.info("Playwright installed successfully.")


def generate_safe_folder_name(url):
    """Turn a URL into a filesystem-safe folder name by replacing unsafe characters with underscores."""
    logger.info(f"Generating a safe folder name from URL: {url}")
    safe_name = re.sub(r'[^a-zA-Z0-9_\-]', '_', url)
    logger.info(f"Generated folder name: {safe_name}")
    return safe_name
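
# For example, generate_safe_folder_name("https://example.com/g/123") returns "https___example_com_g_123".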


def save_image_as_jpg(image_url, save_folder, image_name):
    """Download an image, convert it to an RGB JPEG, and save it; return the saved path or None on failure."""
    logger.info(f"Saving image from {image_url} to folder: {save_folder} with name: {image_name}")

    if not os.path.exists(save_folder):
        logger.info(f"Folder does not exist, creating new folder: {save_folder}")
        os.makedirs(save_folder)
        logger.info(f"Folder created: {save_folder}")

    try:
        logger.info(f"Downloading image from URL: {image_url}")
        response = requests.get(image_url, timeout=10)
        response.raise_for_status()
        logger.info(f"Successfully downloaded image: {image_url}")
    except requests.exceptions.RequestException as e:
        logger.error(f"Error occurred during image download: {e}")
        return None

    try:
        logger.info("Opening image from response content")
        image = Image.open(BytesIO(response.content))
        logger.info("Image successfully opened")
    except UnidentifiedImageError:
        logger.warning(f"Unidentified image file from URL: {image_url}. Skipping.")
        return None
    except Exception as e:
        logger.error(f"Error occurred while opening image: {e}")
        return None

    image_path = os.path.join(save_folder, image_name)
    try:
        logger.info(f"Converting image to JPEG and saving to {image_path}")
        # Convert to RGB first so images with alpha channels can be written as JPEG.
        image.convert("RGB").save(image_path, "JPEG", quality=80)
        logger.info(f"Image saved successfully: {image_path}")
        return image_path
    except Exception as e:
        logger.error(f"Error occurred while saving image: {e}")
        return None


def scrape_images_by_page(url, folder_name='scraped_images'):
    """Scrape every image from each page of the given URL and save them under folder_name."""
    logger.info(f"Starting image scraping for URL: {url}")
    url = url.rstrip('/')
    logger.info(f"Processed URL for scraping: {url}")

    with sync_playwright() as p:
        logger.info("Launching Chromium browser in headless mode")
        try:
            browser = p.chromium.launch(headless=True)
            page = browser.new_page()
        except Exception as e:
            logger.error(f"Failed to launch Chromium browser: {e}")
            return

        logger.info(f"Accessing page: {url}")
        page.goto(url)
        page.wait_for_load_state('networkidle')
        logger.info(f"Page fully loaded: {url}")

        try:
            logger.info("Disabling lazy-loading for images on the page")
            page.evaluate("""
                document.querySelectorAll('img[loading="lazy"]').forEach(img => {
                    img.setAttribute('loading', 'eager');
                    img.src = img.src; // force the image to reload
                });
            """)
            logger.info("Lazy-loading disabled")
        except Exception as eval_error:
            logger.warning(f"Error occurred during lazy-loading disablement: {eval_error}")

        safe_folder_name = generate_safe_folder_name(url)
        folder_path = os.path.join(folder_name, safe_folder_name)
        logger.info(f"Images will be saved to: {folder_path}")

        try:
            logger.info("Attempting to retrieve number of pages from the website")
            # NOTE: these CSS selectors are tied to the target site's current markup
            # and will need updating if the page layout changes.
            page_count_selector = 'div.tag-container:nth-child(8) > span:nth-child(1) > a:nth-child(1) > span:nth-child(1)'
            page_count_text = page.locator(page_count_selector).text_content().strip()
            num_pages = int(re.search(r'\d+', page_count_text).group())
            logger.info(f"Number of pages found: {num_pages}")
        except Exception as e:
            logger.warning(f"Failed to retrieve number of pages from the primary selector: {e}")
            try:
                fallback_selector = 'section.reader-bar:nth-child(2) > div:nth-child(2) > button:nth-child(3) > span:nth-child(3)'
                page.wait_for_selector(fallback_selector, timeout=5000)
                num_pages_text = page.locator(fallback_selector).text_content().strip()
                num_pages = int(re.search(r'\d+', num_pages_text).group())
                logger.info(f"Number of pages found using fallback selector: {num_pages}")
            except Exception as e2:
                logger.error(f"Failed to retrieve page count: {e2}. Defaulting to 1 page.")
                num_pages = 1

        logger.info(f"Starting to scrape {num_pages} pages")

        for i in range(1, num_pages + 1):
            # Pages are assumed to be addressable as <base URL>/<page number>.
            page_url = f"{url}/{i}"
            logger.info(f"Accessing page: {page_url}")
            page.goto(page_url)
            page.wait_for_load_state('networkidle')
            logger.info(f"Page {i} fully loaded")

            try:
                logger.info(f"Attempting to locate images on page {i}")
                img_selector = '#image-container > a > img'
                img_elements = page.locator(img_selector)
                img_count = img_elements.count()
                logger.info(f"Found {img_count} images on page {i}")

                if img_count == 0:
                    logger.warning(f"No images found on page {i}")
                    continue

                for j in range(img_count):
                    try:
                        logger.info(f"Processing image {j + 1} on page {i}")
                        image_element = img_elements.nth(j)
                        # Prefer src, but fall back to data-src for images that are still lazy.
                        image_url = image_element.get_attribute('src')
                        if not image_url:
                            image_url = image_element.get_attribute('data-src')
                        logger.info(f"Image URL found: {image_url}")

                        if image_url:
                            image_name = f'page_{str(i).zfill(5)}_img_{str(j + 1).zfill(5)}.jpg'
                            saved_image_path = save_image_as_jpg(image_url, folder_path, image_name)
                            if saved_image_path:
                                logger.info(f"Image saved successfully at: {saved_image_path}")
                            else:
                                logger.error(f"Failed to save image {image_name} from page {i}")
                    except Exception as e:
                        logger.error(f"Error processing image {j + 1} on page {i}: {e}")
                        continue
            except Exception as e:
                logger.error(f"Error occurred while retrieving images on page {i}: {e}")
                continue

        browser.close()
        logger.info("Browser closed")


if __name__ == "__main__":
    if len(sys.argv) < 2:
        logger.error("Usage: python scrape_images_worker.py <URL>")
        sys.exit(1)

    url = sys.argv[1]
    folder_name = 'scraped_images'
    logger.info(f"Script started with URL: {url}")
    scrape_images_by_page(url, folder_name)
    logger.info("Script completed")
|