OzoneAsai committed on
Commit
a590766
1 Parent(s): 8f02f25

Update scrape_images_worker.py

Browse files
Files changed (1) hide show
  1. scrape_images_worker.py +41 -13
scrape_images_worker.py CHANGED
@@ -3,12 +3,9 @@ import re
3
  from playwright.sync_api import sync_playwright
4
  import requests
5
  import sys
 
6
  from PIL import Image, UnidentifiedImageError
7
- from io import Bytesimport os
8
-
9
- # Playwrightをインストールするコマンドの実行
10
- os.system("python3 -m playwright install")
11
-
12
 
13
  log_file = "app_log.txt" # ログファイルのパス
14
 
@@ -19,7 +16,7 @@ import logging
19
  file_handler = logging.FileHandler(log_file, encoding='utf-8')
20
  # ログの設定
21
  logging.basicConfig(
22
- level=logging.DEBUG, # ログレベルをINFOに設定
23
  format='%(asctime)s - %(levelname)s - %(message)s', # ログのフォーマットを指定
24
  handlers=[
25
  logging.StreamHandler(sys.stdout), # 標準出力にログを出力
@@ -28,6 +25,27 @@ logging.basicConfig(
28
  )
29
  logger = logging.getLogger(__name__)
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  # 安全なフォルダ名を生成する関数
32
  def generate_safe_folder_name(url):
33
  logger.info(f"Generating a safe folder name from URL: {url}")
@@ -51,7 +69,7 @@ def save_image_as_jpg(image_url, save_folder, image_name):
51
  logger.info(f"Successfully downloaded image: {image_url}")
52
  except requests.exceptions.RequestException as e:
53
  logger.error(f"Error occurred during image download: {e}")
54
- return
55
 
56
  try:
57
  logger.info(f"Opening image from response content")
@@ -59,18 +77,20 @@ def save_image_as_jpg(image_url, save_folder, image_name):
59
  logger.info(f"Image successfully opened")
60
  except UnidentifiedImageError:
61
  logger.warning(f"Unidentified image file from URL: {image_url}. Skipping.")
62
- return
63
  except Exception as e:
64
  logger.error(f"Error occurred while opening image: {e}")
65
- return
66
 
67
  image_path = os.path.join(save_folder, image_name)
68
  try:
69
  logger.info(f"Converting image to JPEG and saving to {image_path}")
70
  image.convert("RGB").save(image_path, "JPEG", quality=80)
71
  logger.info(f"Image saved successfully: {image_path}")
 
72
  except Exception as e:
73
  logger.error(f"Error occurred while saving image: {e}")
 
74
 
75
  # 画像の再帰的取得
76
  def scrape_images_by_page(url, folder_name='scraped_images'):
@@ -81,9 +101,13 @@ def scrape_images_by_page(url, folder_name='scraped_images'):
81
 
82
  with sync_playwright() as p:
83
  logger.info(f"Launching Chromium browser in headless mode")
84
- browser = p.chromium.launch(headless=True) # ブラウザを非表示で起動
85
- page = browser.new_page()
86
-
 
 
 
 
87
  logger.info(f"Accessing page: {url}")
88
  page.goto(url)
89
  page.wait_for_load_state('networkidle')
@@ -157,7 +181,11 @@ def scrape_images_by_page(url, folder_name='scraped_images'):
157
 
158
  if image_url:
159
  image_name = f'page_{str(i).zfill(5)}_img_{str(j + 1).zfill(5)}.jpg'
160
- save_image_as_jpg(image_url, folder_path, image_name)
 
 
 
 
161
  except Exception as e:
162
  logger.error(f"Error processing image {j + 1} on page {i}: {e}")
163
  continue
 
3
  from playwright.sync_api import sync_playwright
4
  import requests
5
  import sys
6
+ import subprocess
7
  from PIL import Image, UnidentifiedImageError
8
+ from io import BytesIO
 
 
 
 
9
 
10
  log_file = "app_log.txt" # ログファイルのパス
11
 
 
16
  file_handler = logging.FileHandler(log_file, encoding='utf-8')
17
  # ログの設定
18
  logging.basicConfig(
19
+ level=logging.DEBUG, # ログレベルをDEBUGに設定
20
  format='%(asctime)s - %(levelname)s - %(message)s', # ログのフォーマットを指定
21
  handlers=[
22
  logging.StreamHandler(sys.stdout), # 標準出力にログを出力
 
25
  )
26
  logger = logging.getLogger(__name__)
27
 
28
+ # コマンド実行結果をログに記録する関数
29
+ def run_command(command):
30
+ logger.info(f"Running command: {command}")
31
+ try:
32
+ result = subprocess.run(command, shell=True, capture_output=True, text=True)
33
+ logger.info(f"Command output: {result.stdout}")
34
+ if result.stderr:
35
+ logger.error(f"Command error output: {result.stderr}")
36
+ return result.returncode
37
+ except Exception as e:
38
+ logger.error(f"Failed to run command '{command}': {e}")
39
+ return None
40
+
41
+ # Playwrightのインストールを実行
42
+ install_command = "python3 -m playwright install"
43
+ if run_command(install_command) != 0:
44
+ logger.error("Playwright installation failed.")
45
+ sys.exit(1)
46
+ else:
47
+ logger.info("Playwright installed successfully.")
48
+
49
  # 安全なフォルダ名を生成する関数
50
  def generate_safe_folder_name(url):
51
  logger.info(f"Generating a safe folder name from URL: {url}")
 
69
  logger.info(f"Successfully downloaded image: {image_url}")
70
  except requests.exceptions.RequestException as e:
71
  logger.error(f"Error occurred during image download: {e}")
72
+ return None
73
 
74
  try:
75
  logger.info(f"Opening image from response content")
 
77
  logger.info(f"Image successfully opened")
78
  except UnidentifiedImageError:
79
  logger.warning(f"Unidentified image file from URL: {image_url}. Skipping.")
80
+ return None
81
  except Exception as e:
82
  logger.error(f"Error occurred while opening image: {e}")
83
+ return None
84
 
85
  image_path = os.path.join(save_folder, image_name)
86
  try:
87
  logger.info(f"Converting image to JPEG and saving to {image_path}")
88
  image.convert("RGB").save(image_path, "JPEG", quality=80)
89
  logger.info(f"Image saved successfully: {image_path}")
90
+ return image_path
91
  except Exception as e:
92
  logger.error(f"Error occurred while saving image: {e}")
93
+ return None
94
 
95
  # 画像の再帰的取得
96
  def scrape_images_by_page(url, folder_name='scraped_images'):
 
101
 
102
  with sync_playwright() as p:
103
  logger.info(f"Launching Chromium browser in headless mode")
104
+ try:
105
+ browser = p.chromium.launch(headless=True) # ブラウザを非表示で起動
106
+ page = browser.new_page()
107
+ except Exception as e:
108
+ logger.error(f"Failed to launch Chromium browser: {e}")
109
+ return
110
+
111
  logger.info(f"Accessing page: {url}")
112
  page.goto(url)
113
  page.wait_for_load_state('networkidle')
 
181
 
182
  if image_url:
183
  image_name = f'page_{str(i).zfill(5)}_img_{str(j + 1).zfill(5)}.jpg'
184
+ saved_image_path = save_image_as_jpg(image_url, folder_path, image_name)
185
+ if saved_image_path:
186
+ logger.info(f"Image saved successfully at: {saved_image_path}")
187
+ else:
188
+ logger.error(f"Failed to save image {image_name} from page {i}")
189
  except Exception as e:
190
  logger.error(f"Error processing image {j + 1} on page {i}: {e}")
191
  continue