Spaces:
Runtime error
Runtime error
"""Selenium web scraping module.""" | |
from __future__ import annotations | |
import logging | |
from pathlib import Path | |
from sys import platform | |
from bs4 import BeautifulSoup | |
from selenium import webdriver | |
from selenium.webdriver.chrome.options import Options as ChromeOptions | |
from selenium.webdriver.common.by import By | |
from selenium.webdriver.firefox.options import Options as FirefoxOptions | |
from selenium.webdriver.remote.webdriver import WebDriver | |
from selenium.webdriver.safari.options import Options as SafariOptions | |
from selenium.webdriver.support import expected_conditions as EC | |
from selenium.webdriver.support.wait import WebDriverWait | |
from webdriver_manager.chrome import ChromeDriverManager | |
from webdriver_manager.firefox import GeckoDriverManager | |
import autogpt.processing.text as summary | |
from autogpt.config import Config | |
from autogpt.processing.html import extract_hyperlinks, format_hyperlinks | |
FILE_DIR = Path(__file__).parent.parent | |
CFG = Config() | |
def browse_website(url: str, question: str) -> tuple[str, WebDriver]: | |
"""Browse a website and return the answer and links to the user | |
Args: | |
url (str): The url of the website to browse | |
question (str): The question asked by the user | |
Returns: | |
Tuple[str, WebDriver]: The answer and links to the user and the webdriver | |
""" | |
driver, text = scrape_text_with_selenium(url) | |
add_header(driver) | |
summary_text = summary.summarize_text(url, text, question, driver) | |
links = scrape_links_with_selenium(driver, url) | |
# Limit links to 5 | |
if len(links) > 5: | |
links = links[:5] | |
close_browser(driver) | |
return f"Answer gathered from website: {summary_text} \n \n Links: {links}", driver | |
def scrape_text_with_selenium(url: str) -> tuple[WebDriver, str]: | |
"""Scrape text from a website using selenium | |
Args: | |
url (str): The url of the website to scrape | |
Returns: | |
Tuple[WebDriver, str]: The webdriver and the text scraped from the website | |
""" | |
logging.getLogger("selenium").setLevel(logging.CRITICAL) | |
options_available = { | |
"chrome": ChromeOptions, | |
"safari": SafariOptions, | |
"firefox": FirefoxOptions, | |
} | |
options = options_available[CFG.selenium_web_browser]() | |
options.add_argument( | |
"user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/112.0.5615.49 Safari/537.36" | |
) | |
if CFG.selenium_web_browser == "firefox": | |
driver = webdriver.Firefox( | |
executable_path=GeckoDriverManager().install(), options=options | |
) | |
elif CFG.selenium_web_browser == "safari": | |
# Requires a bit more setup on the users end | |
# See https://developer.apple.com/documentation/webkit/testing_with_webdriver_in_safari | |
driver = webdriver.Safari(options=options) | |
else: | |
if platform == "linux" or platform == "linux2": | |
options.add_argument("--disable-dev-shm-usage") | |
options.add_argument("--remote-debugging-port=9222") | |
options.add_argument("--no-sandbox") | |
if CFG.selenium_headless: | |
options.add_argument("--headless") | |
options.add_argument("--disable-gpu") | |
driver = webdriver.Chrome( | |
executable_path=ChromeDriverManager().install(), options=options | |
) | |
driver.get(url) | |
WebDriverWait(driver, 10).until( | |
EC.presence_of_element_located((By.TAG_NAME, "body")) | |
) | |
# Get the HTML content directly from the browser's DOM | |
page_source = driver.execute_script("return document.body.outerHTML;") | |
soup = BeautifulSoup(page_source, "html.parser") | |
for script in soup(["script", "style"]): | |
script.extract() | |
text = soup.get_text() | |
lines = (line.strip() for line in text.splitlines()) | |
chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) | |
text = "\n".join(chunk for chunk in chunks if chunk) | |
return driver, text | |
def scrape_links_with_selenium(driver: WebDriver, url: str) -> list[str]: | |
"""Scrape links from a website using selenium | |
Args: | |
driver (WebDriver): The webdriver to use to scrape the links | |
Returns: | |
List[str]: The links scraped from the website | |
""" | |
page_source = driver.page_source | |
soup = BeautifulSoup(page_source, "html.parser") | |
for script in soup(["script", "style"]): | |
script.extract() | |
hyperlinks = extract_hyperlinks(soup, url) | |
return format_hyperlinks(hyperlinks) | |
def close_browser(driver: WebDriver) -> None: | |
"""Close the browser | |
Args: | |
driver (WebDriver): The webdriver to close | |
Returns: | |
None | |
""" | |
driver.quit() | |
def add_header(driver: WebDriver) -> None: | |
"""Add a header to the website | |
Args: | |
driver (WebDriver): The webdriver to use to add the header | |
Returns: | |
None | |
""" | |
driver.execute_script(open(f"{FILE_DIR}/js/overlay.js", "r").read()) | |