#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Web Crawler and Content Saver

This module provides functionality to crawl web pages, extract content,
and save the results including markdown text and images. It uses the
WebCrawler class from crawl4ai and implements parallel image downloading.
"""

import sys
import os
import re
import platform
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import argparse
from urllib.parse import urljoin, urlparse

import requests

try:
    from crawl4ai import WebCrawler  # type: ignore
except ImportError as exc:
    raise ImportError(
        "The module 'crawl4ai' could not be imported. Please ensure it is "
        "installed and accessible."
    ) from exc

# Check that the current Conda environment is "crawl"
conda_env = os.environ.get('CONDA_DEFAULT_ENV')
if conda_env != 'crawl':
    print(f"Error: The current Conda environment is '{conda_env}'. "
          "Please activate the 'crawl' environment.")
    sys.exit(1)


def create_crawler():
    """
    Create and initialize a WebCrawler instance.

    Returns:
        WebCrawler: An initialized WebCrawler object.
    """
    crawler = WebCrawler(verbose=True)
    crawler.warmup()
    return crawler


def sanitize_filename(filename):
    """
    Remove invalid characters from a filename to make it Windows-compatible.

    Args:
        filename (str): The original filename.

    Returns:
        str: The sanitized filename.
    """
    # Remove characters that are invalid in Windows file names
    return re.sub(r'[<>:"/\\|?*]', "", filename)


def download_image(session, image_url, save_dir, base_url):
    """
    Download an image from a given URL and save it to the specified directory.

    Args:
        session (requests.Session): The requests session to use for
            downloading.
        image_url (str): The URL of the image to download.
        save_dir (str): The directory to save the downloaded image.
        base_url (str): The base URL of the page being crawled.
    """
    try:
        # Parse the base URL to get the scheme and netloc
        parsed_base_url = urlparse(base_url)
        base_image_url = (
            f"{parsed_base_url.scheme}://{parsed_base_url.netloc}/"
        )

        # Ensure the URL has a scheme; if not, join it with the base image URL
        if not re.match(r"^https?://", image_url):
            image_url = urljoin(base_image_url, image_url.lstrip("/"))

        image_filename = os.path.basename(image_url).split("?")[0]
        sanitized_image_filename = sanitize_filename(image_filename)
        image_path = os.path.join(save_dir, sanitized_image_filename)

        response = session.get(image_url, stream=True)
        response.raise_for_status()

        with open(image_path, "wb") as image_file:
            for chunk in response.iter_content(chunk_size=8192):
                image_file.write(chunk)
        print(f"Saved image: {image_path}")
    except requests.RequestException as req_err:
        print(f"Error downloading image {image_url}: {str(req_err)}")
    except IOError as io_err:
        print(f"Error saving image {image_url}: {str(io_err)}")


def save_result(target_url):
    """
    Crawl a given URL, extract content, and save the results.

    This function crawls the specified URL, saves the markdown content,
    and downloads all associated images in parallel.

    Args:
        target_url (str): The URL to crawl and save content from.
""" crawler = create_crawler() result = crawler.run(url=target_url) if result is None: raise ValueError(f"Failed to crawl {target_url}") title = result.metadata.get("title", "untitled") sanitized_title = sanitize_filename(title).replace(" ", "_") # Choose the appropriate base path based on the operating system if platform.system() == "Windows": base_path = "E:\\datasets\\knowledgebase\\Saved Websites\\" else: base_path = "/home/kade/saved_websites/" save_dir = os.path.join(base_path, sanitized_title) os.makedirs(save_dir, exist_ok=True) # Save markdown save_path = os.path.join(save_dir, f"{sanitized_title}.md") with open(save_path, "w", encoding="utf-8") as file: file.write(result.markdown) print(f"Saved markdown to {save_path}") # Save images in parallel if "images" in result.media and isinstance(result.media["images"], list): session = requests.Session() headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)" "AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/91.0.4472.124 Safari/537.36", "Referer": target_url, "Accept": ( "image/avif,image/webp,image/apng,image/svg+xml," "image/*,*/*;q=0.8" ), "Accept-Language": "en-US,en;q=0.9", "Sec-Fetch-Dest": "image", "Sec-Fetch-Mode": "no-cors", "Sec-Fetch-Site": "cross-site", } session.headers.update(headers) with ThreadPoolExecutor(max_workers=5) as image_executor: image_futures = [] for image_data in result.media["images"]: if "src" in image_data: image_futures.append( image_executor.submit( download_image, session, image_data["src"], save_dir, target_url, ) ) for img_future in as_completed(image_futures): img_future.result() def retry_crawl(inner_url): """ Retries crawling the given URL until successful. Args: inner_url (str): The URL to crawl. """ while True: try: save_result(inner_url) return except (AttributeError, ValueError) as inner_e: print(f"[ERROR] 🚫 Failed to crawl {inner_url}, " f"error: {str(inner_e)}") print("Retrying in 3 seconds...") time.sleep(3) if __name__ == "__main__": parser = argparse.ArgumentParser( description="Web Crawler and Content Saver" ) parser.add_argument( "urls", nargs="+", help="List of URLs to crawl" ) parser.add_argument( "--retry", action="store_true", help="Retry crawling indefinitely every 3 seconds until successful" ) args = parser.parse_args() with ThreadPoolExecutor(max_workers=5) as executor: futures = [] for url in args.urls: if args.retry: futures.append( executor.submit(retry_crawl, url) ) else: futures.append( executor.submit(save_result, url) ) for future in as_completed(futures): try: future.result() except (AttributeError, ValueError) as e: print(f"[ERROR] 🚫 Failed to crawl, error: {str(e)}")