File size: 1,630 Bytes
aa936e5
 
a92d81b
aa936e5
a92d81b
aa936e5
 
a92d81b
5f6deb9
a92d81b
 
aa936e5
a92d81b
5f6deb9
aa936e5
 
5f6deb9
a92d81b
aa936e5
 
 
 
 
 
 
a92d81b
 
 
 
 
aa936e5
 
 
 
5f6deb9
 
a92d81b
 
 
 
 
 
 
 
 
 
 
5f6deb9
 
 
 
 
 
34d3c87
 
5f6deb9
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
import requests
from bs4 import BeautifulSoup
from typing import List

CHARACTER_CUT_OFF = 20000


def remove_tags(soup: BeautifulSoup) -> str:
    """Return the visible text of *soup*, with <style>/<script> removed.

    The soup is modified in place: every style and script element is
    decomposed, then all remaining stripped strings are joined with a
    single space.
    """
    for element in soup(["style", "script"]):
        # Drop non-content markup entirely (in-place mutation).
        element.decompose()

    return " ".join(soup.stripped_strings)


def read_webpage(url: str) -> str:
    """Fetch *url* and return its visible text content.

    Downloads the page, parses it with BeautifulSoup, strips style/script
    tags via remove_tags, prints the extracted text, and returns it.

    Raises:
        requests.HTTPError: for non-2xx responses.
        requests.Timeout: if the server does not answer within 30 seconds.
    """
    # Bug fix: the original message had a stray ')' after the URL.
    print(f"Getting the response from url : {url}")
    # A timeout prevents the crawl from hanging forever on a dead host.
    response = requests.get(url, timeout=30)
    # Fail loudly on HTTP errors instead of extracting text from an error page.
    response.raise_for_status()

    # Parse the HTML content using BeautifulSoup
    soup = BeautifulSoup(response.content, "html.parser")

    # Visible text only: style/script elements are removed by remove_tags.
    text_content = remove_tags(soup)

    print(text_content)
    return text_content


def process_webpages(urls: List[str]) -> str:
    """Fetch each URL once and return the concatenated page text.

    The combined output is truncated to CHARACTER_CUT_OFF characters.
    Duplicate entries in *urls* are fetched only once, and fetching stops
    early once the cutoff has already been reached.
    """
    # A set to keep track of visited pages
    visited_pages = set()
    aggregated_text = ""
    for url in urls:
        # Bug fix: the original tracked visited pages but never consulted
        # the set, so duplicate URLs were downloaded again.
        if url in visited_pages:
            continue
        visited_pages.add(url)
        aggregated_text += f"\nGetting the content of {url}:\n"
        aggregated_text += read_webpage(url)
        # No point fetching more pages once the cutoff is exceeded.
        if len(aggregated_text) >= CHARACTER_CUT_OFF:
            break

    return aggregated_text[:CHARACTER_CUT_OFF]


if __name__ == "__main__":
    # Demo: scrape a handful of pages and dump their combined text.
    demo_urls = [
        "https://www.example.org",
        "https://www.example.com",
        "https://www.imperial.ac.uk/stories/climate-action/",
        "https://support.worldwildlife.org/site/SPageNavigator/ActionsToFightClimateChange.html",
    ]
    print(process_webpages(urls=demo_urls))