import time
from typing import List

from llama_index.core.readers.base import BaseReader
from llama_index.core.schema import Document
from selenium import webdriver
from selenium.common.exceptions import WebDriverException
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait


class WholeSiteReader(BaseReader):
    """
    BFS web scraper for websites.

    This class provides functionality to scrape entire websites using a
    breadth-first search (BFS) algorithm. It navigates web pages starting
    from a given base URL, following only links that match a specified prefix.

    Attributes:
        prefix (str): URL prefix used to restrict the scraping.
        max_depth (int): Maximum depth for the BFS traversal.

    Args:
        prefix (str): URL prefix for scraping.
        max_depth (int, optional): Maximum depth for BFS. Defaults to 10.
    """

    def __init__(self, prefix: str, max_depth: int = 10) -> None:
        """Initialize the WholeSiteReader with the provided prefix and maximum depth."""
        self.prefix = prefix
        self.max_depth = max_depth
        self.driver = self.setup_driver()

    def setup_driver(self):
        """
        Sets up the Selenium WebDriver for Chrome.

        Returns:
            WebDriver: An instance of Chrome WebDriver.
        """
        try:
            import chromedriver_autoinstaller
        except ImportError:
            raise ImportError("Please install chromedriver_autoinstaller")

        opt = webdriver.ChromeOptions()
        opt.add_argument("--start-maximized")
        chromedriver_autoinstaller.install()
        return webdriver.Chrome(options=opt)

    def clean_url(self, url):
        """Strip the fragment so URLs differing only by anchor are treated as one page."""
        return url.split("#")[0]

    def restart_driver(self):
        """Quit the current WebDriver and start a fresh one."""
        self.driver.quit()
        self.driver = self.setup_driver()

    def extract_content(self):
        """Wait for the page body to load and return its visible text."""
        WebDriverWait(self.driver, 10).until(
            EC.presence_of_element_located((By.TAG_NAME, "body"))
        )
        body_element = self.driver.find_element(By.TAG_NAME, "body")
        return body_element.text.strip()

    def extract_links(self):
        """Collect the href of every anchor element on the current page."""
        js_script = """
            var links = [];
            var elements = document.getElementsByTagName('a');
            for (var i = 0; i < elements.length; i++) {
                var href = elements[i].href;
                if (href) {
                    links.push(href);
                }
            }
            return links;
        """
        return self.driver.execute_script(js_script)

    def load_data(self, base_url: str) -> List[Document]:
        """Load data from the base URL using a BFS algorithm.

        Args:
            base_url (str): Base URL to start scraping.

        Returns:
            List[Document]: List of scraped documents.
        """
        # Seed the visited set with the base URL so it is not re-queued later.
        added_urls = {base_url}
        urls_to_visit = [(base_url, 0)]
        documents = []

        while urls_to_visit:
            current_url, depth = urls_to_visit.pop(0)
            print(f"Visiting: {current_url}, {len(urls_to_visit)} left")

            if depth > self.max_depth:
                continue

            try:
                self.driver.get(current_url)
                page_content = self.extract_content()

                links = self.extract_links()
                # Clean all URLs, then keep only links not already queued.
                links = [self.clean_url(link) for link in links]
                links = [link for link in links if link not in added_urls]
                print(f"Found {len(links)} new potential links")

                for href in links:
                    try:
                        if href.startswith(self.prefix) and href not in added_urls:
                            urls_to_visit.append((href, depth + 1))
                            added_urls.add(href)
                    except Exception:
                        continue

                documents.append(
                    Document(text=page_content, extra_info={"URL": current_url})
                )
                time.sleep(1)
            except WebDriverException:
                print("WebDriverException encountered, restarting driver...")
                self.restart_driver()
            except Exception as e:
                print(f"An unexpected exception occurred: {e}, skipping URL...")
                continue

        self.driver.quit()
        return documents
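

# A minimal usage sketch, not part of the reader itself: the prefix and base URL
# below are illustrative placeholders, assuming you want to crawl documentation
# hosted under a single path prefix. load_data() returns llama_index Documents
# that can then be passed to an index or other downstream pipeline.
if __name__ == "__main__":
    scraper = WholeSiteReader(
        prefix="https://docs.example.com/",  # hypothetical site prefix
        max_depth=5,
    )
    documents = scraper.load_data(base_url="https://docs.example.com/")
    print(f"Scraped {len(documents)} pages")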