k4d3 committed
Commit 6c29921 · 1 Parent(s): fa820f9

Fix: Correct retry logic to prevent multiple restarts


- Updated the retry_crawl function to retry indefinitely until it succeeds, instead of returning prematurely.
- Simplified the main block to submit retry_crawl directly, dropping the wrapper lambda, when --retry is specified.
- Ensured the script no longer restarts the crawl for the same URL unnecessarily (illustrated in the sketch below).
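
For context, a minimal sketch of why the old code crawled a URL twice, reconstructed from the removed lines; save_result is stubbed here purely for illustration and the real error handling (prints, sleep) is omitted:

calls = []

def save_result(url):
    # Stub standing in for the real crawl-and-save routine.
    calls.append(url)

def retry_crawl_old(inner_url):
    # Old behaviour: report success as False, give up after one failure.
    while True:
        try:
            save_result(inner_url)
            return False
        except (AttributeError, ValueError):
            return True

# The removed wrapper lambda: "not False" is truthy, so the URL is crawled again.
(lambda u: save_result(u) if not retry_crawl_old(u) else None)("https://example.com")

print(calls)  # ['https://example.com', 'https://example.com'] -- same URL crawled twice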

Files changed (1): crawl/crawl (+2 -8)
crawl/crawl CHANGED
@@ -176,13 +176,12 @@ def retry_crawl(inner_url):
     while True:
         try:
             save_result(inner_url)
-            return False
+            return
         except (AttributeError, ValueError) as inner_e:
             print(f"[ERROR] 🚫 Failed to crawl {inner_url}, "
                   f"error: {str(inner_e)}")
             print("Retrying in 3 seconds...")
             time.sleep(3)
-            return True
 
 
 if __name__ == "__main__":
@@ -206,12 +205,7 @@ if __name__ == "__main__":
     for url in args.urls:
         if args.retry:
             futures.append(
-                executor.submit(
-                    lambda u: (
-                        save_result(u) if not retry_crawl(u) else None
-                    ),
-                    url
-                )
+                executor.submit(retry_crawl, url)
             )
         else:
             futures.append(
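
Put together, the patched pieces read roughly as follows. The ThreadPoolExecutor, the argument values, and the body of save_result are assumptions standing in for code outside the diff hunks:

import time
from concurrent.futures import ThreadPoolExecutor  # assumed; the diff only shows executor.submit

def save_result(inner_url):
    # Placeholder for the real crawl-and-save routine.
    ...

def retry_crawl(inner_url):
    # Keep trying until save_result succeeds; return nothing so callers
    # cannot misread the result and crawl the same URL again.
    while True:
        try:
            save_result(inner_url)
            return
        except (AttributeError, ValueError) as inner_e:
            print(f"[ERROR] 🚫 Failed to crawl {inner_url}, "
                  f"error: {str(inner_e)}")
            print("Retrying in 3 seconds...")
            time.sleep(3)

if __name__ == "__main__":
    urls = ["https://example.com"]  # stands in for args.urls
    retry = True                    # stands in for args.retry
    futures = []
    with ThreadPoolExecutor() as executor:
        for url in urls:
            if retry:
                # retry_crawl is submitted directly; no wrapper lambda.
                futures.append(executor.submit(retry_crawl, url))
            else:
                # The non-retry branch lies outside the shown hunks; a one-shot
                # submission is assumed here for illustration.
                futures.append(executor.submit(save_result, url))

Because retry_crawl now loops until save_result succeeds and returns nothing, each submitted future crawls its URL exactly once.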