"""Browse a webpage and summarize it using the LLM model""" from typing import List, Tuple, Union from urllib.parse import urljoin, urlparse import requests from requests.compat import urljoin from requests import Response from bs4 import BeautifulSoup from autogpt.config import Config from autogpt.memory import get_memory from autogpt.processing.html import extract_hyperlinks, format_hyperlinks CFG = Config() memory = get_memory(CFG) session = requests.Session() session.headers.update({"User-Agent": CFG.user_agent}) def is_valid_url(url: str) -> bool: """Check if the URL is valid Args: url (str): The URL to check Returns: bool: True if the URL is valid, False otherwise """ try: result = urlparse(url) return all([result.scheme, result.netloc]) except ValueError: return False def sanitize_url(url: str) -> str: """Sanitize the URL Args: url (str): The URL to sanitize Returns: str: The sanitized URL """ return urljoin(url, urlparse(url).path) def check_local_file_access(url: str) -> bool: """Check if the URL is a local file Args: url (str): The URL to check Returns: bool: True if the URL is a local file, False otherwise """ local_prefixes = [ "file:///", "file://localhost", "http://localhost", "https://localhost", ] return any(url.startswith(prefix) for prefix in local_prefixes) def get_response( url: str, timeout: int = 10 ) -> Union[Tuple[None, str], Tuple[Response, None]]: """Get the response from a URL Args: url (str): The URL to get the response from timeout (int): The timeout for the HTTP request Returns: Tuple[None, str] | Tuple[Response, None]: The response and error message Raises: ValueError: If the URL is invalid requests.exceptions.RequestException: If the HTTP request fails """ try: # Restrict access to local files if check_local_file_access(url): raise ValueError("Access to local files is restricted") # Most basic check if the URL is valid: if not url.startswith("http://") and not url.startswith("https://"): raise ValueError("Invalid URL format") sanitized_url = sanitize_url(url) response = session.get(sanitized_url, timeout=timeout) # Check if the response contains an HTTP error if response.status_code >= 400: return None, f"Error: HTTP {str(response.status_code)} error" return response, None except ValueError as ve: # Handle invalid URL format return None, f"Error: {str(ve)}" except requests.exceptions.RequestException as re: # Handle exceptions related to the HTTP request # (e.g., connection errors, timeouts, etc.) 
return None, f"Error: {str(re)}" def scrape_text(url: str) -> str: """Scrape text from a webpage Args: url (str): The URL to scrape text from Returns: str: The scraped text """ response, error_message = get_response(url) if error_message: return error_message if not response: return "Error: Could not get response" soup = BeautifulSoup(response.text, "html.parser") for script in soup(["script", "style"]): script.extract() text = soup.get_text() lines = (line.strip() for line in text.splitlines()) chunks = (phrase.strip() for line in lines for phrase in line.split(" ")) text = "\n".join(chunk for chunk in chunks if chunk) return text def scrape_links(url: str) -> Union[str, List[str]]: """Scrape links from a webpage Args: url (str): The URL to scrape links from Returns: Union[str, List[str]]: The scraped links """ response, error_message = get_response(url) if error_message: return error_message if not response: return "Error: Could not get response" soup = BeautifulSoup(response.text, "html.parser") for script in soup(["script", "style"]): script.extract() hyperlinks = extract_hyperlinks(soup, url) return format_hyperlinks(hyperlinks) def create_message(chunk, question): """Create a message for the user to summarize a chunk of text""" return { "role": "user", "content": f'"""{chunk}""" Using the above text, answer the following' f' question: "{question}" -- if the question cannot be answered using the' " text, summarize the text.", }