Update scrape_images_worker.py
Browse files
- scrape_images_worker.py +41 -13
scrape_images_worker.py
CHANGED
@@ -3,12 +3,9 @@ import re
|
|
3 |
from playwright.sync_api import sync_playwright
|
4 |
import requests
|
5 |
import sys
|
|
|
6 |
from PIL import Image, UnidentifiedImageError
|
7 |
-
from io import
|
8 |
-
|
9 |
-
# Playwrightをインストールするコマンドの実行
|
10 |
-
os.system("python3 -m playwright install")
|
11 |
-
|
12 |
|
13 |
log_file = "app_log.txt" # ログファイルのパス
|
14 |
|
@@ -19,7 +16,7 @@ import logging
|
|
19 |
file_handler = logging.FileHandler(log_file, encoding='utf-8')
|
20 |
# ログの設定
|
21 |
logging.basicConfig(
|
22 |
-
level=logging.DEBUG, # ログレベルを
|
23 |
format='%(asctime)s - %(levelname)s - %(message)s', # ログのフォーマットを指定
|
24 |
handlers=[
|
25 |
logging.StreamHandler(sys.stdout), # 標準出力にログを出力
|
@@ -28,6 +25,27 @@ logging.basicConfig(
|
|
28 |
)
|
29 |
logger = logging.getLogger(__name__)
|
30 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
31 |
# 安全なフォルダ名を生成する関数
|
32 |
def generate_safe_folder_name(url):
|
33 |
logger.info(f"Generating a safe folder name from URL: {url}")
|
@@ -51,7 +69,7 @@ def save_image_as_jpg(image_url, save_folder, image_name):
|
|
51 |
logger.info(f"Successfully downloaded image: {image_url}")
|
52 |
except requests.exceptions.RequestException as e:
|
53 |
logger.error(f"Error occurred during image download: {e}")
|
54 |
-
return
|
55 |
|
56 |
try:
|
57 |
logger.info(f"Opening image from response content")
|
@@ -59,18 +77,20 @@ def save_image_as_jpg(image_url, save_folder, image_name):
|
|
59 |
logger.info(f"Image successfully opened")
|
60 |
except UnidentifiedImageError:
|
61 |
logger.warning(f"Unidentified image file from URL: {image_url}. Skipping.")
|
62 |
-
return
|
63 |
except Exception as e:
|
64 |
logger.error(f"Error occurred while opening image: {e}")
|
65 |
-
return
|
66 |
|
67 |
image_path = os.path.join(save_folder, image_name)
|
68 |
try:
|
69 |
logger.info(f"Converting image to JPEG and saving to {image_path}")
|
70 |
image.convert("RGB").save(image_path, "JPEG", quality=80)
|
71 |
logger.info(f"Image saved successfully: {image_path}")
|
|
|
72 |
except Exception as e:
|
73 |
logger.error(f"Error occurred while saving image: {e}")
|
|
|
74 |
|
75 |
# 画像の再帰的取得
|
76 |
def scrape_images_by_page(url, folder_name='scraped_images'):
|
@@ -81,9 +101,13 @@ def scrape_images_by_page(url, folder_name='scraped_images'):
|
|
81 |
|
82 |
with sync_playwright() as p:
|
83 |
logger.info(f"Launching Chromium browser in headless mode")
|
84 |
-
|
85 |
-
|
86 |
-
|
|
|
|
|
|
|
|
|
87 |
logger.info(f"Accessing page: {url}")
|
88 |
page.goto(url)
|
89 |
page.wait_for_load_state('networkidle')
|
@@ -157,7 +181,11 @@ def scrape_images_by_page(url, folder_name='scraped_images'):
|
|
157 |
|
158 |
if image_url:
|
159 |
image_name = f'page_{str(i).zfill(5)}_img_{str(j + 1).zfill(5)}.jpg'
|
160 |
-
save_image_as_jpg(image_url, folder_path, image_name)
|
|
|
|
|
|
|
|
|
161 |
except Exception as e:
|
162 |
logger.error(f"Error processing image {j + 1} on page {i}: {e}")
|
163 |
continue
|
|
|
3 |
from playwright.sync_api import sync_playwright
|
4 |
import requests
|
5 |
import sys
|
6 |
+
import subprocess
|
7 |
from PIL import Image, UnidentifiedImageError
|
8 |
+
from io import BytesIO
|
|
|
|
|
|
|
|
|
9 |
|
10 |
log_file = "app_log.txt" # ログファイルのパス
|
11 |
|
|
|
16 |
file_handler = logging.FileHandler(log_file, encoding='utf-8')
|
17 |
# ログの設定
|
18 |
logging.basicConfig(
|
19 |
+
level=logging.DEBUG, # ログレベルをDEBUGに設定
|
20 |
format='%(asctime)s - %(levelname)s - %(message)s', # ログのフォーマットを指定
|
21 |
handlers=[
|
22 |
logging.StreamHandler(sys.stdout), # 標準出力にログを出力
|
|
|
25 |
)
|
26 |
logger = logging.getLogger(__name__)
|
27 |
|
28 |
+
# Run an external command and record its result in the log.
def run_command(command):
    """Run ``command`` and log what it printed.

    Args:
        command: Either a command string, which is executed through the
            shell, or a sequence of arguments, which is executed directly
            without a shell.

    Returns:
        The process return code, or ``None`` if running the command raised
        an exception (for example, the executable does not exist).
    """
    logger.info(f"Running command: {command}")
    try:
        result = subprocess.run(
            command,
            # Only a plain string needs the shell; an argument list is run
            # directly, which sidesteps shell quoting and injection issues.
            shell=isinstance(command, str),
            capture_output=True,
            text=True,
        )
    except Exception as e:
        logger.error(f"Failed to run command '{command}': {e}")
        return None

    logger.info(f"Command output: {result.stdout}")
    if result.stderr:
        # Many tools write progress to stderr even when they succeed, so
        # stderr is reported at ERROR level only if the command failed.
        report = logger.error if result.returncode != 0 else logger.warning
        report(f"Command error output: {result.stderr}")
    return result.returncode
|
40 |
+
|
41 |
+
# Install the Playwright browsers at start-up; abort the process if that fails.
install_command = "python3 -m playwright install"
if run_command(install_command) != 0:
    logger.error("Playwright installation failed.")
    # sys.exit raises SystemExit, so nothing below runs on failure.
    sys.exit(1)
logger.info("Playwright installed successfully.")
|
48 |
+
|
49 |
# 安全なフォルダ名を生成する関数
|
50 |
def generate_safe_folder_name(url):
|
51 |
logger.info(f"Generating a safe folder name from URL: {url}")
|
|
|
69 |
logger.info(f"Successfully downloaded image: {image_url}")
|
70 |
except requests.exceptions.RequestException as e:
|
71 |
logger.error(f"Error occurred during image download: {e}")
|
72 |
+
return None
|
73 |
|
74 |
try:
|
75 |
logger.info(f"Opening image from response content")
|
|
|
77 |
logger.info(f"Image successfully opened")
|
78 |
except UnidentifiedImageError:
|
79 |
logger.warning(f"Unidentified image file from URL: {image_url}. Skipping.")
|
80 |
+
return None
|
81 |
except Exception as e:
|
82 |
logger.error(f"Error occurred while opening image: {e}")
|
83 |
+
return None
|
84 |
|
85 |
image_path = os.path.join(save_folder, image_name)
|
86 |
try:
|
87 |
logger.info(f"Converting image to JPEG and saving to {image_path}")
|
88 |
image.convert("RGB").save(image_path, "JPEG", quality=80)
|
89 |
logger.info(f"Image saved successfully: {image_path}")
|
90 |
+
return image_path
|
91 |
except Exception as e:
|
92 |
logger.error(f"Error occurred while saving image: {e}")
|
93 |
+
return None
|
94 |
|
95 |
# 画像の再帰的取得
|
96 |
def scrape_images_by_page(url, folder_name='scraped_images'):
|
|
|
101 |
|
102 |
with sync_playwright() as p:
|
103 |
logger.info(f"Launching Chromium browser in headless mode")
|
104 |
+
try:
|
105 |
+
browser = p.chromium.launch(headless=True) # ブラウザを非表示で起動
|
106 |
+
page = browser.new_page()
|
107 |
+
except Exception as e:
|
108 |
+
logger.error(f"Failed to launch Chromium browser: {e}")
|
109 |
+
return
|
110 |
+
|
111 |
logger.info(f"Accessing page: {url}")
|
112 |
page.goto(url)
|
113 |
page.wait_for_load_state('networkidle')
|
|
|
181 |
|
182 |
if image_url:
|
183 |
image_name = f'page_{str(i).zfill(5)}_img_{str(j + 1).zfill(5)}.jpg'
|
184 |
+
saved_image_path = save_image_as_jpg(image_url, folder_path, image_name)
|
185 |
+
if saved_image_path:
|
186 |
+
logger.info(f"Image saved successfully at: {saved_image_path}")
|
187 |
+
else:
|
188 |
+
logger.error(f"Failed to save image {image_name} from page {i}")
|
189 |
except Exception as e:
|
190 |
logger.error(f"Error processing image {j + 1} on page {i}: {e}")
|
191 |
continue
|