k4d3 committed
Commit 026c9c4 · 1 Parent(s): d86635a

update crawl


Signed-off-by: Balazs Horvath <acsipont@gmail.com>

Files changed (1)
  1. crawl/crawl +44 -15
crawl/crawl CHANGED
@@ -14,9 +14,24 @@ import os
 import re
 import platform
 from concurrent.futures import ThreadPoolExecutor, as_completed
-
+import time
+import argparse
+from urllib.parse import urljoin
 import requests
-from crawl4ai import WebCrawler
+try:
+    from crawl4ai import WebCrawler  # type: ignore
+except ImportError as exc:
+    raise ImportError(
+        "The module 'crawl4ai' could not be imported. Please ensure it is "
+        "installed and accessible."
+    ) from exc
+
+# Check if the current Conda environment is "crawl"
+conda_env = os.environ.get('CONDA_DEFAULT_ENV')
+if conda_env != 'crawl':
+    print(f"Error: The current Conda environment is '{conda_env}'. "
+          "Please activate the 'crawl' environment.")
+    sys.exit(1)
 
 
 def create_crawler():
@@ -62,8 +77,6 @@ def download_image(session, image_url, save_dir, base_url):
     try:
         # Ensure the URL has a scheme
         if image_url.startswith(".."):
-            from urllib.parse import urljoin
-
             image_url = urljoin(base_url, image_url)
         elif not re.match(r"^https?://", image_url):
             image_url = "https://" + image_url.lstrip("/")
@@ -78,10 +91,10 @@ def download_image(session, image_url, save_dir, base_url):
             for chunk in response.iter_content(chunk_size=8192):
                 image_file.write(chunk)
         print(f"Saved image: {image_path}")
-    except requests.RequestException as e:
-        print(f"Error downloading image {image_url}: {str(e)}")
-    except IOError as e:
-        print(f"Error saving image {image_url}: {str(e)}")
+    except requests.RequestException as req_err:
+        print(f"Error downloading image {image_url}: {str(req_err)}")
+    except IOError as io_err:
+        print(f"Error saving image {image_url}: {str(io_err)}")
 
 
 def save_result(target_url):
@@ -96,6 +109,8 @@ def save_result(target_url):
     """
     crawler = create_crawler()
     result = crawler.run(url=target_url)
+    if result is None:
+        raise ValueError(f"Failed to crawl {target_url}")
     title = result.metadata.get("title", "untitled")
     sanitized_title = sanitize_filename(title).replace(" ", "_")
 
@@ -123,7 +138,8 @@ def save_result(target_url):
             "Chrome/91.0.4472.124 Safari/537.36",
         "Referer": target_url,
         "Accept": (
-            "image/avif,image/webp,image/apng,image/svg+xml," "image/*,*/*;q=0.8"
+            "image/avif,image/webp,image/apng,image/svg+xml,"
+            "image/*,*/*;q=0.8"
         ),
         "Accept-Language": "en-US,en;q=0.9",
         "Sec-Fetch-Dest": "image",
@@ -151,9 +167,22 @@ def save_result(target_url):
 
 
 if __name__ == "__main__":
-    if len(sys.argv) < 2:
-        print("Usage: python crawl.py <URL1> <URL2> ... <URLn>")
-    else:
-        urls = sys.argv[1:]
-        for url in urls:
-            save_result(url)
+    parser = argparse.ArgumentParser(
+        description="Web Crawler and Content Saver"
+    )
+    parser.add_argument(
+        "urls",
+        nargs="+",
+        help="List of URLs to crawl"
+    )
+    args = parser.parse_args()
+
+    for url in args.urls:
+        while True:
+            try:
+                save_result(url)
+                break
+            except (AttributeError, ValueError) as e:
+                print(f"[ERROR] 🚫 Failed to crawl {url}, error: {str(e)}")
+                print("Retrying in 3 seconds...")
+                time.sleep(3)
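With the switch to argparse, the updated script takes one or more URLs as positional
arguments, e.g. "python crawl/crawl https://example.com https://example.org"
(invocation path assumed). The new entry point retries a failed URL every 3 seconds
with no upper bound. The sketch below shows the same loop with a capped number of
attempts; the helper name save_with_retries and the MAX_ATTEMPTS limit are
illustrative only and are not part of this commit.

import time

MAX_ATTEMPTS = 5  # hypothetical cap; the committed loop retries forever


def save_with_retries(url, attempts=MAX_ATTEMPTS, delay=3):
    # save_result is the function defined in crawl/crawl; retry on the same
    # errors the script catches, but give up after `attempts` tries.
    for attempt in range(1, attempts + 1):
        try:
            save_result(url)
            return True
        except (AttributeError, ValueError) as exc:
            print(f"[ERROR] Failed to crawl {url} "
                  f"(attempt {attempt}/{attempts}): {exc}")
            if attempt < attempts:
                print(f"Retrying in {delay} seconds...")
                time.sleep(delay)
    return False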