k4d3 committed
Commit e4d6e76 • Parent: c239ce8

feat: Add parallel execution for multiple URLs


- Implemented a ThreadPoolExecutor so the script processes multiple URLs concurrently instead of one at a time.
- Moved the retry loop into a separate function, `retry_crawl`, for readability and maintainability.
- Updated the main execution block to submit one task per URL to the executor and collect results with `as_completed`.

Files changed (1):
  crawl/crawl  +29 -13
crawl/crawl CHANGED
@@ -182,18 +182,34 @@ if __name__ == "__main__":
     )
     args = parser.parse_args()
 
-    for url in args.urls:
-        if args.retry:
-            while True:
-                try:
-                    save_result(url)
-                    break
-                except (AttributeError, ValueError) as e:
-                    print(f"[ERROR] 🚫 Failed to crawl {url}, error: {str(e)}")
-                    print("Retrying in 3 seconds...")
-                    time.sleep(3)
-        else:
+    with ThreadPoolExecutor(max_workers=5) as executor:
+        futures = []
+        for url in args.urls:
+            if args.retry:
+                futures.append(
+                    executor.submit(
+                        lambda u: save_result(u) if not retry_crawl(u) else None,
+                        url
+                    )
+                )
+            else:
+                futures.append(
+                    executor.submit(save_result, url)
+                )
+
+        for future in as_completed(futures):
             try:
-                save_result(url)
+                future.result()
             except (AttributeError, ValueError) as e:
-                print(f"[ERROR] 🚫 Failed to crawl {url}, error: {str(e)}")
+                print(f"[ERROR] 🚫 Failed to crawl, error: {str(e)}")
+
+def retry_crawl(url):
+    while True:
+        try:
+            save_result(url)
+            return False
+        except (AttributeError, ValueError) as e:
+            print(f"[ERROR] 🚫 Failed to crawl {url}, error: {str(e)}")
+            print("Retrying in 3 seconds...")
+            time.sleep(3)
+    return True
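
For reference, a runnable sketch of the intended end state, with two caveats in the hunk as committed: `retry_crawl` is defined below the executor block that calls it, so worker threads would hit a NameError before the definition executes, and the lambda calls `save_result(u)` a second time after `retry_crawl(u)` has already saved successfully (it only returns False on success); the trailing `return True` after `while True:` is also unreachable. The sketch hoists the helper above the entry point, lets it do its own saving, and adds the `concurrent.futures` import that the hunk does not show. The argument parser and `save_result` stub are hypothetical stand-ins for the real ones in crawl/crawl.

    import argparse
    import time
    from concurrent.futures import ThreadPoolExecutor, as_completed

    def save_result(url):
        # Stand-in for the real save_result in crawl/crawl (assumption:
        # it fetches the page and writes the result to disk).
        print(f"saved {url}")

    def retry_crawl(url):
        # Retry until save_result succeeds. It does the saving itself,
        # so callers must not call save_result again on success.
        while True:
            try:
                save_result(url)
                return
            except (AttributeError, ValueError) as e:
                print(f"[ERROR] 🚫 Failed to crawl {url}, error: {str(e)}")
                print("Retrying in 3 seconds...")
                time.sleep(3)

    if __name__ == "__main__":
        parser = argparse.ArgumentParser()
        parser.add_argument("urls", nargs="+")
        parser.add_argument("--retry", action="store_true")
        args = parser.parse_args()

        # Helpers are defined above this block, so worker threads can see them.
        with ThreadPoolExecutor(max_workers=5) as executor:
            worker = retry_crawl if args.retry else save_result
            futures = [executor.submit(worker, url) for url in args.urls]
            for future in as_completed(futures):
                try:
                    future.result()  # re-raises any exception from the worker
                except (AttributeError, ValueError) as e:
                    print(f"[ERROR] 🚫 Failed to crawl, error: {str(e)}")

Submitting the chosen worker function directly to `executor.submit` keeps each task a single call, and `future.result()` still surfaces any worker exception in the main thread.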