import logging
from pathlib import Path

from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.wait import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

import autogpt.summary as summary
from autogpt.config import Config

file_dir = Path(__file__).parent
cfg = Config()


def browse_website(url, question):
    """Browse a website, summarize its text to answer the question, and collect links."""
    driver, text = scrape_text_with_selenium(url)
    add_header(driver)
    summary_text = summary.summarize_text(driver, text, question)
    links = scrape_links_with_selenium(driver)

    # Limit links to 5
    if len(links) > 5:
        links = links[:5]

    close_browser(driver)
    return f"Answer gathered from website: {summary_text} \n \n Links: {links}", driver


def scrape_text_with_selenium(url):
    """Load the page in Chrome and return the driver and the page's visible text."""
    logging.getLogger("selenium").setLevel(logging.CRITICAL)

    options = Options()
    options.add_argument(
        "user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64)"
        " AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.5615.49 Safari/537.36"
    )
    # Selenium 4 removed the executable_path keyword; the driver path is now
    # passed through a Service object.
    driver = webdriver.Chrome(
        service=Service(ChromeDriverManager().install()), options=options
    )
    driver.get(url)

    # Wait until the <body> element is present before reading the DOM
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.TAG_NAME, "body"))
    )

    # Get the HTML content directly from the browser's DOM
    page_source = driver.execute_script("return document.body.outerHTML;")
    soup = BeautifulSoup(page_source, "html.parser")

    for script in soup(["script", "style"]):
        script.extract()

    text = soup.get_text()
    # Normalize whitespace: strip each line, break on double-space runs,
    # and drop empty chunks
    lines = (line.strip() for line in text.splitlines())
    chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
    text = "\n".join(chunk for chunk in chunks if chunk)
    return driver, text


def scrape_links_with_selenium(driver):
    """Extract and format the hyperlinks from the already-loaded page."""
    page_source = driver.page_source
    soup = BeautifulSoup(page_source, "html.parser")

    for script in soup(["script", "style"]):
        script.extract()

    hyperlinks = extract_hyperlinks(soup)
    return format_hyperlinks(hyperlinks)


def close_browser(driver):
    """Quit the browser and release the WebDriver session."""
    driver.quit()


def extract_hyperlinks(soup):
    """Return (text, href) pairs for every anchor tag that has an href attribute."""
    return [(link.text, link["href"]) for link in soup.find_all("a", href=True)]


def format_hyperlinks(hyperlinks):
    """Render hyperlink pairs as 'text (url)' strings."""
    return [f"{link_text} ({link_url})" for link_text, link_url in hyperlinks]


def add_header(driver):
    """Inject the overlay script into the current page."""
    # Use a context manager so the file handle is closed after reading
    with open(f"{file_dir}/js/overlay.js", "r") as overlay_file:
        driver.execute_script(overlay_file.read())
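
# --- Illustrative usage (not part of the module's public API) ---
# A minimal sketch of calling browse_website directly, assuming a configured
# autogpt environment (Config and the summary backend) is importable. The URL
# and question below are hypothetical placeholders. Note that browse_website
# quits the driver before returning, so the returned handle is already closed.
if __name__ == "__main__":
    answer, _driver = browse_website(
        "https://example.com",  # hypothetical target URL
        "What is this page about?",  # hypothetical question
    )
    print(answer)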