k4d3 committed
Commit e4d6e76 • Parent: c239ce8

feat: Add parallel execution for multiple URLs


- Implemented a ThreadPoolExecutor so the script processes multiple URLs concurrently instead of one at a time.
- Moved the retry loop into a separate function, `retry_crawl`, for readability and maintainability.
- Updated the main execution block to submit one task per URL to the executor and collect results with `as_completed`.

Files changed (1):
  crawl/crawl  +29 -13
crawl/crawl CHANGED
@@ -182,18 +182,34 @@ if __name__ == "__main__":
     )
     args = parser.parse_args()
 
-    for url in args.urls:
-        if args.retry:
-            while True:
-                try:
-                    save_result(url)
-                    break
-                except (AttributeError, ValueError) as e:
-                    print(f"[ERROR] 🚫 Failed to crawl {url}, error: {str(e)}")
-                    print("Retrying in 3 seconds...")
-                    time.sleep(3)
-        else:
+    with ThreadPoolExecutor(max_workers=5) as executor:
+        futures = []
+        for url in args.urls:
+            if args.retry:
+                futures.append(
+                    executor.submit(
+                        lambda u: save_result(u) if not retry_crawl(u) else None,
+                        url
+                    )
+                )
+            else:
+                futures.append(
+                    executor.submit(save_result, url)
+                )
+
+        for future in as_completed(futures):
             try:
-                save_result(url)
+                future.result()
             except (AttributeError, ValueError) as e:
-                print(f"[ERROR] 🚫 Failed to crawl {url}, error: {str(e)}")
+                print(f"[ERROR] 🚫 Failed to crawl, error: {str(e)}")
+
+def retry_crawl(url):
+    while True:
+        try:
+            save_result(url)
+            return False
+        except (AttributeError, ValueError) as e:
+            print(f"[ERROR] 🚫 Failed to crawl {url}, error: {str(e)}")
+            print("Retrying in 3 seconds...")
+            time.sleep(3)
+    return True
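
For reference, a runnable sketch of the intended end state, with two caveats in the hunk as committed: `retry_crawl` is defined below the executor block that calls it, so worker threads would hit a NameError before the definition executes, and the lambda calls `save_result(u)` a second time after `retry_crawl(u)` has already saved successfully (it only returns False on success); the trailing `return True` after `while True:` is also unreachable. The sketch hoists the helper above the entry point, lets it do its own saving, and adds the `concurrent.futures` import that the hunk does not show. The argument parser and `save_result` stub are hypothetical stand-ins for the real ones in crawl/crawl.

    import argparse
    import time
    from concurrent.futures import ThreadPoolExecutor, as_completed

    def save_result(url):
        # Stand-in for the real save_result in crawl/crawl (assumption:
        # it fetches the page and writes the result to disk).
        print(f"saved {url}")

    def retry_crawl(url):
        # Retry until save_result succeeds. It does the saving itself,
        # so callers must not call save_result again on success.
        while True:
            try:
                save_result(url)
                return
            except (AttributeError, ValueError) as e:
                print(f"[ERROR] 🚫 Failed to crawl {url}, error: {str(e)}")
                print("Retrying in 3 seconds...")
                time.sleep(3)

    if __name__ == "__main__":
        parser = argparse.ArgumentParser()
        parser.add_argument("urls", nargs="+")
        parser.add_argument("--retry", action="store_true")
        args = parser.parse_args()

        # Helpers are defined above this block, so worker threads can see them.
        with ThreadPoolExecutor(max_workers=5) as executor:
            worker = retry_crawl if args.retry else save_result
            futures = [executor.submit(worker, url) for url in args.urls]
            for future in as_completed(futures):
                try:
                    future.result()  # re-raises any exception from the worker
                except (AttributeError, ValueError) as e:
                    print(f"[ERROR] 🚫 Failed to crawl, error: {str(e)}")

Submitting the chosen worker function directly to `executor.submit` keeps each task a single call, and `future.result()` still surfaces any worker exception in the main thread.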