Don't redefine variables from outer scope + code formatting
Browse files- crawl/crawl +21 -11
crawl/crawl
CHANGED
@@ -148,12 +148,12 @@ def save_result(target_url):
|
|
148 |
}
|
149 |
session.headers.update(headers)
|
150 |
|
151 |
-
with ThreadPoolExecutor(max_workers=5) as
|
152 |
-
|
153 |
for image_data in result.media["images"]:
|
154 |
if "src" in image_data:
|
155 |
-
|
156 |
-
|
157 |
download_image,
|
158 |
session,
|
159 |
image_data["src"],
|
@@ -162,8 +162,8 @@ def save_result(target_url):
|
|
162 |
)
|
163 |
)
|
164 |
|
165 |
-
for
|
166 |
-
|
167 |
|
168 |
|
169 |
if __name__ == "__main__":
|
@@ -188,7 +188,9 @@ if __name__ == "__main__":
|
|
188 |
if args.retry:
|
189 |
futures.append(
|
190 |
executor.submit(
|
191 |
-
lambda u:
|
|
|
|
|
192 |
url
|
193 |
)
|
194 |
)
|
@@ -203,13 +205,21 @@ if __name__ == "__main__":
|
|
203 |
except (AttributeError, ValueError) as e:
|
204 |
print(f"[ERROR] 🚫 Failed to crawl, error: {str(e)}")
|
205 |
|
206 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
207 |
while True:
|
208 |
try:
|
209 |
-
save_result(
|
210 |
return False
|
211 |
-
except (AttributeError, ValueError) as
|
212 |
-
print(f"[ERROR] 🚫 Failed to crawl {
|
|
|
213 |
print("Retrying in 3 seconds...")
|
214 |
time.sleep(3)
|
215 |
return True
|
|
|
148 |
}
|
149 |
session.headers.update(headers)
|
150 |
|
151 |
+
with ThreadPoolExecutor(max_workers=5) as image_executor:
|
152 |
+
image_futures = []
|
153 |
for image_data in result.media["images"]:
|
154 |
if "src" in image_data:
|
155 |
+
image_futures.append(
|
156 |
+
image_executor.submit(
|
157 |
download_image,
|
158 |
session,
|
159 |
image_data["src"],
|
|
|
162 |
)
|
163 |
)
|
164 |
|
165 |
+
for img_future in as_completed(image_futures):
|
166 |
+
img_future.result()
|
167 |
|
168 |
|
169 |
if __name__ == "__main__":
|
|
|
188 |
if args.retry:
|
189 |
futures.append(
|
190 |
executor.submit(
|
191 |
+
lambda u: (
|
192 |
+
save_result(u) if not retry_crawl(u) else None
|
193 |
+
),
|
194 |
url
|
195 |
)
|
196 |
)
|
|
|
205 |
except (AttributeError, ValueError) as e:
|
206 |
print(f"[ERROR] 🚫 Failed to crawl, error: {str(e)}")
|
207 |
|
208 |
+
|
209 |
+
def retry_crawl(inner_url):
|
210 |
+
"""
|
211 |
+
Retries crawling the given URL until successful.
|
212 |
+
|
213 |
+
Args:
|
214 |
+
inner_url (str): The URL to crawl.
|
215 |
+
"""
|
216 |
while True:
|
217 |
try:
|
218 |
+
save_result(inner_url)
|
219 |
return False
|
220 |
+
except (AttributeError, ValueError) as inner_e:
|
221 |
+
print(f"[ERROR] 🚫 Failed to crawl {inner_url}, "
|
222 |
+
f"error: {str(inner_e)}")
|
223 |
print("Retrying in 3 seconds...")
|
224 |
time.sleep(3)
|
225 |
return True
|