Fix: Correct retry logic to prevent multiple restarts
- Updated the retry_crawl function to retry indefinitely until it succeeds, instead of returning prematurely.
- Replaced the lambda wrapper in the main block with a direct executor.submit(retry_crawl, url) call when --retry is specified.
- Ensured the script does not restart the crawl for the same URL multiple times; the sketch below the summary shows how the old wrapper triggered a second crawl.
- crawl/crawl +2 -8
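With the old wrapper, retry_crawl returned None (falsy) on success and True on the failure path, so the expression save_result(u) if not retry_crawl(u) else None re-ran the crawl after it had already succeeded. A minimal, self-contained sketch of that pre-fix interaction (save_result and retry_crawl here are simplified stand-ins for the real functions in crawl/crawl):

# Stand-ins for the real functions in crawl/crawl; this only models the
# return values that matter for the double-crawl behavior.
calls = []

def save_result(url):
    # Pretend crawl: record each attempt instead of fetching anything.
    calls.append(url)

def retry_crawl(url):
    # Pre-fix shape: returns None (falsy) on success, True on failure.
    try:
        save_result(url)
        return
    except (AttributeError, ValueError):
        return True

# Pre-fix submission wrapper, as in the removed hunk below:
wrapper = lambda u: save_result(u) if not retry_crawl(u) else None
wrapper("https://example.com")

print(calls)  # ['https://example.com', 'https://example.com'] -- crawled twice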
crawl/crawl
CHANGED
@@ -176,13 +176,12 @@ def retry_crawl(inner_url):
     while True:
         try:
             save_result(inner_url)
-            return
+            return
         except (AttributeError, ValueError) as inner_e:
             print(f"[ERROR] 🚫 Failed to crawl {inner_url}, "
                   f"error: {str(inner_e)}")
             print("Retrying in 3 seconds...")
             time.sleep(3)
-            return True


 if __name__ == "__main__":
@@ -206,12 +205,7 @@ if __name__ == "__main__":
     for url in args.urls:
         if args.retry:
             futures.append(
-                executor.submit(
-                    lambda u: (
-                        save_result(u) if not retry_crawl(u) else None
-                    ),
-                    url
-                )
+                executor.submit(retry_crawl, url)
             )
         else:
             futures.append(
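For reference, the post-fix code reads roughly as follows. This is a sketch reconstructed from the hunks above: the argument parsing, executor setup, and the body of save_result are simplified stand-ins, since the diff does not show that part of crawl/crawl.

import argparse
import time
from concurrent.futures import ThreadPoolExecutor


def save_result(url):
    # Stand-in crawler: the real save_result lives elsewhere in crawl/crawl
    # and may raise AttributeError or ValueError when a crawl fails.
    print(f"crawled {url}")


def retry_crawl(inner_url):
    # Retry until save_result succeeds, then return exactly once.
    while True:
        try:
            save_result(inner_url)
            return
        except (AttributeError, ValueError) as inner_e:
            print(f"[ERROR] 🚫 Failed to crawl {inner_url}, "
                  f"error: {str(inner_e)}")
            print("Retrying in 3 seconds...")
            time.sleep(3)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("urls", nargs="+")
    parser.add_argument("--retry", action="store_true")
    args = parser.parse_args()

    futures = []
    with ThreadPoolExecutor() as executor:
        for url in args.urls:
            if args.retry:
                futures.append(
                    executor.submit(retry_crawl, url)
                )
            else:
                # Non-retry path: the diff only shows futures.append( here,
                # so submitting save_result directly is an assumption.
                futures.append(
                    executor.submit(save_result, url)
                )

Submitting retry_crawl directly means each URL is handed to the executor exactly once; the retry loop now runs inside the worker, instead of a wrapper re-invoking save_result around it.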