Spaces:
Runtime error
Runtime error
Peter Vandenabeele
Clean up scraping to eliminate scripts and style, but keep other tags in order
a92d81b
import requests | |
from bs4 import BeautifulSoup | |
from typing import List | |
# Maximum number of characters of aggregated page text returned to the caller.
CHARACTER_CUT_OFF = 20_000
def remove_tags(soup: BeautifulSoup) -> str:
    """Return the text of ``soup`` as one space-separated string.

    Every ``<style>`` and ``<script>`` element is removed from ``soup`` in
    place, so the caller's tree is mutated. The remaining text fragments are
    stripped of surrounding whitespace and joined with single spaces.
    """
    # Passing a list to find_all matches any of the listed tag names.
    for unwanted in soup.find_all(["style", "script"]):
        unwanted.decompose()

    text_fragments = soup.stripped_strings
    return " ".join(text_fragments)
def read_webpage(url: str, timeout: float = 30.0) -> str:
    """Download ``url`` and return its text with scripts and styles removed.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before ``requests`` gives up.

    Returns:
        The page text as produced by ``remove_tags``.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    print(f"Getting the response from url : {url}")
    # Without a timeout, requests waits indefinitely on an unresponsive server.
    response = requests.get(url, timeout=timeout)
    # Pass the raw bytes so BeautifulSoup can work out the encoding itself.
    soup = BeautifulSoup(response.content, "html.parser")
    text_content = remove_tags(soup)
    print(text_content)
    return text_content
def process_webpages(urls: List[str]) -> str:
    """Fetch each distinct URL once and return the combined, truncated text.

    Args:
        urls: Page addresses to fetch, in the order their text should appear.

    Returns:
        For every distinct URL, a header line followed by the page text from
        ``read_webpage``, concatenated and cut to ``CHARACTER_CUT_OFF``
        characters.
    """
    visited_pages = set()
    sections = []
    for url in urls:
        # Skip repeats so a page is not downloaded, and counted against the
        # character budget, more than once.
        if url in visited_pages:
            continue
        visited_pages.add(url)
        sections.append(f"\nGetting the content of {url}:\n")
        sections.append(read_webpage(url))
    return "".join(sections)[:CHARACTER_CUT_OFF]
if __name__ == '__main__':
    # Script entry point: scrape the sample pages and print the combined text.
    sample_urls = [
        "https://www.example.org",
        "https://www.example.com",
    ]
    combined_text = process_webpages(urls=sample_urls)
    print(combined_text)