feat: Add parallel execution for multiple URLs
- Implemented ThreadPoolExecutor to handle multiple URLs concurrently.
- Moved retry logic into a separate function `retry_crawl` for better readability and maintainability.
- Updated the main execution block to submit one task per URL to the executor, so multiple URLs are processed in parallel (the pattern is sketched below).
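
The overall pattern, reduced to a self-contained sketch (fetch_page and the example URL list are hypothetical stand-ins for the script's own save_result and args.urls; the actual change is in the diff further down):

from concurrent.futures import ThreadPoolExecutor, as_completed
import time

# Hypothetical stand-ins for the script's save_result and args.urls.
URLS = ["https://example.com", "https://example.org"]

def fetch_page(url):
    # Stand-in for the real crawl-and-save step; the real one may raise on failure.
    print(f"crawling {url}")

def crawl_with_retry(url, delay=3):
    # Retry the same URL until it succeeds, sleeping between attempts.
    while True:
        try:
            fetch_page(url)
            return
        except (AttributeError, ValueError) as exc:
            print(f"[ERROR] failed to crawl {url}: {exc}")
            time.sleep(delay)

with ThreadPoolExecutor(max_workers=5) as executor:
    # One task per URL; the pool runs up to five of them concurrently.
    futures = [executor.submit(crawl_with_retry, url) for url in URLS]
    for future in as_completed(futures):
        # Re-raise any exception that escaped a worker thread.
        future.result()

Using as_completed rather than walking the futures list in submission order means errors surface as soon as any task finishes.
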
- crawl/crawl +29 -13
crawl/crawl CHANGED
@@ -182,18 +182,34 @@ if __name__ == "__main__":
     )
     args = parser.parse_args()

-    [old lines 185-195: the previous sequential per-URL crawl/retry loop; contents not preserved in this extract]
-            try:
-    [old line 197: content not preserved in this extract]
-            except (AttributeError, ValueError) as e:
-                print(f"[ERROR] 🚫 Failed to crawl
+    with ThreadPoolExecutor(max_workers=5) as executor:
+        futures = []
+        for url in args.urls:
+            if args.retry:
+                futures.append(
+                    executor.submit(
+                        lambda u: save_result(u) if not retry_crawl(u) else None,
+                        url
+                    )
+                )
+            else:
+                futures.append(
+                    executor.submit(save_result, url)
+                )
+
+        for future in as_completed(futures):
+            try:
+                future.result()
+            except (AttributeError, ValueError) as e:
+                print(f"[ERROR] 🚫 Failed to crawl, error: {str(e)}")
+
+def retry_crawl(url):
+    while True:
+        try:
+            save_result(url)
+            return False
+        except (AttributeError, ValueError) as e:
+            print(f"[ERROR] 🚫 Failed to crawl {url}, error: {str(e)}")
+            print("Retrying in 3 seconds...")
+            time.sleep(3)
+            return True
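
Not visible in this hunk: the new code relies on names from concurrent.futures and on time, so the import section of crawl/crawl presumably needs something along these lines (an assumption; the file may already import time):

# Assumed imports; the hunk above does not show the top of the file.
from concurrent.futures import ThreadPoolExecutor, as_completed
import time

If the existing parser defines urls as a positional argument and a --retry flag (as args.urls and args.retry in the hunk suggest), an invocation would look like ./crawl/crawl --retry https://example.com https://example.org (an assumed command line, not one shown in the commit).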