Spaces:

drift-ai
/

faq-website

Runtime error

vincentclaes committed on Apr 10, 2023

Commit

4f7d130

•

1 Parent(s): 4161807

remove get child pages

Files changed (1) hide show

scrape_website.py CHANGED Viewed

@@ -3,38 +3,6 @@ from bs4 import BeautifulSoup
 def process_webpage(url: str):
-    # A set to keep track of visited pages
-    visited_pages = set()
-    text_list = []
-    # A function to recursively get all child pages
-    def get_child_pages(url):
-        # Make a GET request to the page and get the HTML content
-        response = requests.get(url)
-        html_content = response.content
-        # Parse the HTML content using BeautifulSoup
-        soup = BeautifulSoup(html_content, "html.parser")
-        # Get all the text content from the relevant HTML tags
-        text_content = ""
-        for tag in ["p", "h1", "h2", "h3", "h4", "h5", "h6", "li"]:
-            for element in soup.find_all(tag):
-                text_content += element.get_text() + " "
-        # Add the page to the set of visited pages
-        text_content = f"page {url} contains: " + text_content
-        visited_pages.add(url)
-        # Find all the child links and recursively get their text content
-        for link in soup.find_all("a"):
-            href = link.get("href")
-            if href and href not in visited_pages and url in href:
-                get_child_pages(href)
-        text_list.append(text_content)
     # Make a GET request to the page and get the HTML content
     response = requests.get(url)
     html_content = response.content

 def process_webpage(url: str):
     # Make a GET request to the page and get the HTML content
     response = requests.get(url)
     html_content = response.content