vincentclaes committed on
Commit
4f7d130
1 Parent(s): 4161807

remove get child pages

Browse files
Files changed (1) hide show
  1. scrape_website.py +0 -32
scrape_website.py CHANGED
@@ -3,38 +3,6 @@ from bs4 import BeautifulSoup
3
 
4
 
5
  def process_webpage(url: str):
6
- # A set to keep track of visited pages
7
- visited_pages = set()
8
-
9
- text_list = []
10
-
11
- # A function to recursively get all child pages
12
- def get_child_pages(url):
13
- # Make a GET request to the page and get the HTML content
14
- response = requests.get(url)
15
- html_content = response.content
16
-
17
- # Parse the HTML content using BeautifulSoup
18
- soup = BeautifulSoup(html_content, "html.parser")
19
-
20
- # Get all the text content from the relevant HTML tags
21
- text_content = ""
22
- for tag in ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li"]:
23
- for element in soup.find_all(tag):
24
- text_content += element.get_text() + " "
25
-
26
- # Add the page to the set of visited pages
27
- text_content = f"page {url} contains: " + text_content
28
- visited_pages.add(url)
29
-
30
- # Find all the child links and recursively get their text content
31
- for link in soup.find_all("a"):
32
- href = link.get("href")
33
- if href and href not in visited_pages and url in href:
34
- get_child_pages(href)
35
-
36
- text_list.append(text_content)
37
-
38
  # Make a GET request to the page and get the HTML content
39
  response = requests.get(url)
40
  html_content = response.content
 
3
 
4
 
5
  def process_webpage(url: str):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6
  # Make a GET request to the page and get the HTML content
7
  response = requests.get(url)
8
  html_content = response.content