Spaces:
Runtime error
Runtime error
vincentclaes
committed on
Commit
•
4f7d130
1
Parent(s):
4161807
remove get child pages
Browse files
- scrape_website.py +0 -32
scrape_website.py
CHANGED
@@ -3,38 +3,6 @@ from bs4 import BeautifulSoup
|
|
3 |
|
4 |
|
5 |
def process_webpage(url: str):
|
6 |
-
# A set to keep track of visited pages
|
7 |
-
visited_pages = set()
|
8 |
-
|
9 |
-
text_list = []
|
10 |
-
|
11 |
-
# A function to recursively get all child pages
|
12 |
-
def get_child_pages(url):
|
13 |
-
# Make a GET request to the page and get the HTML content
|
14 |
-
response = requests.get(url)
|
15 |
-
html_content = response.content
|
16 |
-
|
17 |
-
# Parse the HTML content using BeautifulSoup
|
18 |
-
soup = BeautifulSoup(html_content, "html.parser")
|
19 |
-
|
20 |
-
# Get all the text content from the relevant HTML tags
|
21 |
-
text_content = ""
|
22 |
-
for tag in ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li"]:
|
23 |
-
for element in soup.find_all(tag):
|
24 |
-
text_content += element.get_text() + " "
|
25 |
-
|
26 |
-
# Add the page to the set of visited pages
|
27 |
-
text_content = f"page {url} contains: " + text_content
|
28 |
-
visited_pages.add(url)
|
29 |
-
|
30 |
-
# Find all the child links and recursively get their text content
|
31 |
-
for link in soup.find_all("a"):
|
32 |
-
href = link.get("href")
|
33 |
-
if href and href not in visited_pages and url in href:
|
34 |
-
get_child_pages(href)
|
35 |
-
|
36 |
-
text_list.append(text_content)
|
37 |
-
|
38 |
# Make a GET request to the page and get the HTML content
|
39 |
response = requests.get(url)
|
40 |
html_content = response.content
|
|
|
3 |
|
4 |
|
5 |
def process_webpage(url: str):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
6 |
# Make a GET request to the page and get the HTML content
|
7 |
response = requests.get(url)
|
8 |
html_content = response.content
|