k4d3 committed on
Commit
a7f9c9a
•
1 Parent(s): e4d6e76

don't redefine variables from outer scope + code formatting

Browse files
Files changed (1) hide show
  1. crawl/crawl +21 -11
crawl/crawl CHANGED
@@ -148,12 +148,12 @@ def save_result(target_url):
148
  }
149
  session.headers.update(headers)
150
 
151
- with ThreadPoolExecutor(max_workers=5) as executor:
152
- futures = []
153
  for image_data in result.media["images"]:
154
  if "src" in image_data:
155
- futures.append(
156
- executor.submit(
157
  download_image,
158
  session,
159
  image_data["src"],
@@ -162,8 +162,8 @@ def save_result(target_url):
162
  )
163
  )
164
 
165
- for future in as_completed(futures):
166
- future.result()
167
 
168
 
169
  if __name__ == "__main__":
@@ -188,7 +188,9 @@ if __name__ == "__main__":
188
  if args.retry:
189
  futures.append(
190
  executor.submit(
191
- lambda u: save_result(u) if not retry_crawl(u) else None,
 
 
192
  url
193
  )
194
  )
@@ -203,13 +205,21 @@ if __name__ == "__main__":
203
  except (AttributeError, ValueError) as e:
204
  print(f"[ERROR] 🚫 Failed to crawl, error: {str(e)}")
205
 
206
- def retry_crawl(url):
 
 
 
 
 
 
 
207
  while True:
208
  try:
209
- save_result(url)
210
  return False
211
- except (AttributeError, ValueError) as e:
212
- print(f"[ERROR] 🚫 Failed to crawl {url}, error: {str(e)}")
 
213
  print("Retrying in 3 seconds...")
214
  time.sleep(3)
215
  return True
 
148
  }
149
  session.headers.update(headers)
150
 
151
+ with ThreadPoolExecutor(max_workers=5) as image_executor:
152
+ image_futures = []
153
  for image_data in result.media["images"]:
154
  if "src" in image_data:
155
+ image_futures.append(
156
+ image_executor.submit(
157
  download_image,
158
  session,
159
  image_data["src"],
 
162
  )
163
  )
164
 
165
+ for img_future in as_completed(image_futures):
166
+ img_future.result()
167
 
168
 
169
  if __name__ == "__main__":
 
188
  if args.retry:
189
  futures.append(
190
  executor.submit(
191
+ lambda u: (
192
+ save_result(u) if not retry_crawl(u) else None
193
+ ),
194
  url
195
  )
196
  )
 
205
  except (AttributeError, ValueError) as e:
206
  print(f"[ERROR] 🚫 Failed to crawl, error: {str(e)}")
207
 
208
+
209
+ def retry_crawl(inner_url):
210
+ """
211
+ Retries crawling the given URL until successful.
212
+
213
+ Args:
214
+ inner_url (str): The URL to crawl.
215
+ """
216
  while True:
217
  try:
218
+ save_result(inner_url)
219
  return False
220
+ except (AttributeError, ValueError) as inner_e:
221
+ print(f"[ERROR] 🚫 Failed to crawl {inner_url}, "
222
+ f"error: {str(inner_e)}")
223
  print("Retrying in 3 seconds...")
224
  time.sleep(3)
225
  return True