Neu256's picture
Upload 98 files
7f51819 verified
raw
history blame contribute delete
No virus
1.99 kB
import concurrent.futures
import requests
import re
from bs4 import BeautifulSoup
import extensions.superboogav2.parameters as parameters
from .data_processor import process_and_add_to_collector
from .utils import create_metadata_source
def _download_single(url):
    """Fetch *url* with an HTTP GET and return the raw response body.

    The request uses a 5 second timeout. Any exception raised by
    ``requests.get`` itself propagates unchanged, since nothing here
    catches it.

    Args:
        url: The address to fetch.

    Returns:
        The ``content`` attribute of the response.

    Raises:
        Exception: If the response status code is anything other than 200.
    """
    reply = requests.get(url, timeout=5)

    # Guard clause: only an exact 200 counts as success.
    if reply.status_code != 200:
        raise Exception("Failed to download URL")

    return reply.content
def _download_urls(urls, threads=1):
with concurrent.futures.ThreadPoolExecutor(max_workers=threads) as executor:
futures = []
for url in urls:
future = executor.submit(_download_single, url)
futures.append(future)
results = []
i = 0
for future in concurrent.futures.as_completed(futures):
try:
result = future.result()
results.append(result)
i += 1
yield f"{i}/{len(urls)}", results
except Exception:
pass
yield "Done", results
def feed_url_into_collector(urls, collector):
    """Download newline-separated URLs, extract their text and add it to *collector*.

    This is a generator. It yields status strings while it works: first a
    "Loading ..." header, then that header followed by each progress update
    from ``_download_urls``, then the header followed by
    "Processing the HTML sources...".

    Args:
        urls: A string with one URL per line. Whitespace around each line
            (including carriage returns from CRLF input) is stripped and
            blank lines are ignored.
        collector: Passed through unchanged to
            ``process_and_add_to_collector``.

    Yields:
        str: The status text described above.
    """
    # Drop blank lines and stray whitespace so they are not requested as URLs.
    url_list = [line.strip() for line in urls.split('\n') if line.strip()]
    num_threads = parameters.get_num_threads()

    cumulative = f'Loading {len(url_list)} URLs with {num_threads} threads...\n\n'
    yield cumulative

    # After the loop `contents` refers to the last yielded results list, i.e.
    # the bodies of every download that succeeded.
    contents = []
    for update, contents in _download_urls(url_list, threads=num_threads):
        yield cumulative + update

    cumulative += 'Processing the HTML sources...'
    yield cumulative

    strong_cleanup = parameters.get_is_strong_cleanup()
    page_texts = []
    for content in contents:
        soup = BeautifulSoup(content, features="lxml")

        # Remove <script> and <style> elements so their contents are not
        # picked up as page text.
        for tag in soup(["script", "style"]):
            tag.extract()

        strings = soup.stripped_strings
        if strong_cleanup:
            # Keep only strings containing an ASCII letter followed by a space.
            strings = [s for s in strings if re.search("[A-Za-z] ", s)]

        text = '\n'.join(s.strip() for s in strings)
        if text:
            page_texts.append(text)

    # Separate pages with a newline so the end of one page is not glued to
    # the start of the next.
    all_text = '\n'.join(page_texts)

    # NOTE(review): the meaning of the third positional argument (False) is
    # defined by process_and_add_to_collector, which is not visible here.
    process_and_add_to_collector(all_text, collector, False, create_metadata_source('url-download'))