Spaces:
Running
Running
Update helper_functions_api.py
Browse files- helper_functions_api.py +22 -7
helper_functions_api.py
CHANGED
@@ -221,13 +221,28 @@ def process_content(data_format, url, query):
|
|
221 |
return rephrased_content, url
|
222 |
return "", url
|
223 |
|
224 |
-
def fetch_and_extract_content(
|
225 |
-
|
226 |
-
|
227 |
-
|
228 |
-
|
229 |
-
|
230 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
231 |
|
232 |
return all_text_with_urls
|
233 |
|
|
|
221 |
return rephrased_content, url
|
222 |
return "", url
|
223 |
|
224 |
+
def fetch_and_extract_content(
|
225 |
+
data_format: str, query: str, urls: List[str], num_refrences: int = 8
|
226 |
+
) -> List[Tuple[str | None, str]]:
|
227 |
+
"""
|
228 |
+
Asynchronously makeing request to urls and doing further process
|
229 |
+
"""
|
230 |
+
all_text_with_urls = []
|
231 |
+
start_url = 0
|
232 |
+
while (len(all_text_with_urls) != num_refrences) and (start_url < len(urls)):
|
233 |
+
end_url = start_url + (num_refrences - len(all_text_with_urls))
|
234 |
+
urls_subset = urls[start_url:end_url]
|
235 |
+
with ThreadPoolExecutor(max_workers=len(urls_subset)) as executor:
|
236 |
+
future_to_url = {
|
237 |
+
executor.submit(process_content, data_format, url, query): url
|
238 |
+
for url in urls_subset
|
239 |
+
}
|
240 |
+
all_text_with_urls += [
|
241 |
+
future.result()
|
242 |
+
for future in as_completed(future_to_url)
|
243 |
+
if future.result()[0] != ""
|
244 |
+
]
|
245 |
+
start_url = end_url
|
246 |
|
247 |
return all_text_with_urls
|
248 |
|