faq-website / scrape_website.py
Peter Vandenabeele
Clean up scraping to eliminate scripts and style, but keep other tags in order
a92d81b
raw
history blame
1.39 kB
import requests
from bs4 import BeautifulSoup
from typing import List
# Maximum number of characters of aggregated text returned by process_webpages.
CHARACTER_CUT_OFF = 20_000
def remove_tags(soup: BeautifulSoup) -> str:
    """Return the text of *soup* with style and script elements dropped.

    The ``<style>`` and ``<script>`` elements are removed from *soup* in
    place, so the caller's parse tree is modified. The remaining strings
    are stripped of surrounding whitespace and joined, in document order,
    with single spaces.
    """
    unwanted_elements = soup.find_all(['style', 'script'])
    for element in unwanted_elements:
        # decompose() detaches the element from the tree and destroys it.
        element.decompose()

    text_fragments = soup.stripped_strings
    return ' '.join(text_fragments)
def read_webpage(url: str, timeout: float = 30.0) -> str:
    """Download a web page and return its text content.

    The page is fetched with ``requests``, parsed with BeautifulSoup's
    ``html.parser`` and reduced to plain text by ``remove_tags``. The
    extracted text is also printed to stdout.

    Args:
        url: Address of the page to download.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` can block indefinitely on a server that
            never answers.

    Returns:
        The text extracted from the page by ``remove_tags``.

    Raises:
        requests.exceptions.RequestException: If the request fails or
            times out.
    """
    print(f"Getting the response from url : {url}")
    response = requests.get(url, timeout=timeout)

    # Parse the raw response bytes rather than the decoded text.
    soup = BeautifulSoup(response.content, "html.parser")

    text_content = remove_tags(soup)
    print(text_content)
    return text_content
def process_webpages(urls: List[str]) -> str:
    """Fetch each distinct URL and return the combined text, truncated.

    Every URL is downloaded at most once, in the order given. Each page's
    text is preceded by a ``Getting the content of <url>:`` header line.

    Args:
        urls: Addresses of the pages to download. Repeated entries are
            skipped after their first occurrence.

    Returns:
        The concatenated headers and page texts, cut to at most
        ``CHARACTER_CUT_OFF`` characters. An empty list yields ``""``.
    """
    # URLs already fetched, so a page listed twice is not downloaded and
    # aggregated a second time.
    visited_pages = set()
    parts = []
    for url in urls:
        if url in visited_pages:
            continue
        visited_pages.add(url)
        parts.append(f"\nGetting the content of {url}:\n")
        parts.append(read_webpage(url))
    return "".join(parts)[:CHARACTER_CUT_OFF]
if __name__ == '__main__':
    # Manual smoke run: scrape two placeholder sites and print the result.
    demo_urls = [
        "https://www.example.org",
        "https://www.example.com",
    ]
    print(process_webpages(urls=demo_urls))