import requests
from bs4 import BeautifulSoup
from typing import List

CHARACTER_CUT_OFF = 20000


def remove_tags(soup: BeautifulSoup) -> str:
    # Drop <style> and <script> tags: their contents are code, not page text
    for data in soup(["style", "script"]):
        data.decompose()
    # Join the visible text fragments that remain
    return " ".join(soup.stripped_strings)


def read_webpage(url: str) -> str:
    print(f"Getting the response from url: {url}")
    # A timeout keeps one slow site from stalling the whole run
    response = requests.get(url, timeout=30)
    html_content = response.content
    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(html_content, "html.parser")
    # Get all the text content from the relevant HTML tags
    text_content = remove_tags(soup)
    # Alternative kept from the original: collect text tag by tag instead
    # for tag in ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li", "div"]:
    #     for element in soup.find_all(tag):
    #         text_content += element.get_text() + " "
    print(text_content)
    return text_content
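

# A hedged sketch (not part of the original file): the same fetch with basic
# error handling, so one unreachable URL does not crash the whole run. The
# name read_webpage_safe and the 30-second timeout are illustrative choices.
def read_webpage_safe(url: str) -> str:
    try:
        response = requests.get(url, timeout=30)
        response.raise_for_status()  # raise on 4xx/5xx responses
    except requests.RequestException as exc:
        print(f"Failed to fetch {url}: {exc}")
        return ""
    soup = BeautifulSoup(response.content, "html.parser")
    return remove_tags(soup)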


def process_webpages(urls: List[str]) -> str:
    # A set to keep track of visited pages, so each URL is fetched only once
    visited_pages = set()
    aggregated_text = ""
    for url in urls:
        if url in visited_pages:
            continue
        visited_pages.add(url)
        aggregated_text += f"\nGetting the content of {url}:\n"
        aggregated_text += read_webpage(url)
    # Cap the combined text at CHARACTER_CUT_OFF characters
    return aggregated_text[:CHARACTER_CUT_OFF]


if __name__ == "__main__":
    print(
        process_webpages(
            urls=[
                "https://www.example.org",
                "https://www.example.com",
                "https://www.imperial.ac.uk/stories/climate-action/",
                "https://support.worldwildlife.org/site/SPageNavigator/ActionsToFightClimateChange.html",
            ]
        )
    )