#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Web Crawler and Content Saver

This module provides functionality to crawl web pages, extract content,
and save the results including markdown text and images. It uses the
WebCrawler class from crawl4ai and implements parallel image downloading.
"""

import sys
import os
import re
import platform
from concurrent.futures import ThreadPoolExecutor, as_completed
import time
import argparse
from urllib.parse import urljoin, urlparse

import requests

try:
    from crawl4ai import WebCrawler  # type: ignore
except ImportError as exc:
    raise ImportError(
        "The module 'crawl4ai' could not be imported. Please ensure it is "
        "installed and accessible."
    ) from exc

# Check that the current Conda environment is "crawl"
conda_env = os.environ.get('CONDA_DEFAULT_ENV')
if conda_env != 'crawl':
    print(f"Error: The current Conda environment is '{conda_env}'. "
          "Please activate the 'crawl' environment.")
    sys.exit(1)


def create_crawler():
    """
    Create and initialize a WebCrawler instance.

    Returns:
        WebCrawler: An initialized WebCrawler object.
    """
    crawler = WebCrawler(verbose=True)
    crawler.warmup()
    return crawler


def sanitize_filename(filename):
    """
    Remove invalid characters from a filename to make it Windows-compatible.

    Args:
        filename (str): The original filename.

    Returns:
        str: The sanitized filename.
    """
    # Remove characters that are invalid in Windows file names
    return re.sub(r'[<>:"/\\|?*]', "", filename)


def download_image(session, image_url, save_dir, base_url):
    """
    Download an image from a given URL and save it to the specified directory.

    Args:
        session (requests.Session): The requests session to use for
            downloading.
        image_url (str): The URL of the image to download.
        save_dir (str): The directory to save the downloaded image.
        base_url (str): The base URL of the page being crawled.
    """
    try:
        # Parse the base URL to get the scheme and netloc
        parsed_base_url = urlparse(base_url)
        base_image_url = (
            f"{parsed_base_url.scheme}://{parsed_base_url.netloc}/"
        )

        # Ensure the URL has a scheme; if not, join it with the base image URL
        if not re.match(r"^https?://", image_url):
            image_url = urljoin(base_image_url, image_url.lstrip("/"))

        image_filename = os.path.basename(image_url).split("?")[0]
        sanitized_image_filename = sanitize_filename(image_filename)
        image_path = os.path.join(save_dir, sanitized_image_filename)

        response = session.get(image_url, stream=True)
        response.raise_for_status()

        with open(image_path, "wb") as image_file:
            for chunk in response.iter_content(chunk_size=8192):
                image_file.write(chunk)
        print(f"Saved image: {image_path}")
    except requests.RequestException as req_err:
        print(f"Error downloading image {image_url}: {str(req_err)}")
    except IOError as io_err:
        print(f"Error saving image {image_url}: {str(io_err)}")


def save_result(target_url):
    """
    Crawl a given URL, extract content, and save the results.

    This function crawls the specified URL, saves the markdown content,
    and downloads all associated images in parallel.

    Args:
        target_url (str): The URL to crawl and save content from.
""" crawler = create_crawler() result = crawler.run(url=target_url) if result is None: raise ValueError(f"Failed to crawl {target_url}") title = result.metadata.get("title", "untitled") sanitized_title = sanitize_filename(title).replace(" ", "_") # Choose the appropriate base path based on the operating system if platform.system() == "Windows": base_path = "E:\\datasets\\knowledgebase\\Saved Websites\\" else: base_path = "/home/kade/saved_websites/" save_dir = os.path.join(base_path, sanitized_title) os.makedirs(save_dir, exist_ok=True) # Save markdown save_path = os.path.join(save_dir, f"{sanitized_title}.md") with open(save_path, "w", encoding="utf-8") as file: file.write(result.markdown) print(f"Saved markdown to {save_path}") # Save images in parallel if "images" in result.media and isinstance(result.media["images"], list): session = requests.Session() headers = { "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64)" "AppleWebKit/537.36 (KHTML, like Gecko) " "Chrome/91.0.4472.124 Safari/537.36", "Referer": target_url, "Accept": ( "image/avif,image/webp,image/apng,image/svg+xml," "image/*,*/*;q=0.8" ), "Accept-Language": "en-US,en;q=0.9", "Sec-Fetch-Dest": "image", "Sec-Fetch-Mode": "no-cors", "Sec-Fetch-Site": "cross-site", } session.headers.update(headers) with ThreadPoolExecutor(max_workers=5) as image_executor: image_futures = [] for image_data in result.media["images"]: if "src" in image_data: image_futures.append( image_executor.submit( download_image, session, image_data["src"], save_dir, target_url, ) ) for img_future in as_completed(image_futures): img_future.result() def retry_crawl(inner_url): """ Retries crawling the given URL until successful. Args: inner_url (str): The URL to crawl. """ while True: try: save_result(inner_url) return except (AttributeError, ValueError) as inner_e: print(f"[ERROR] 🚫 Failed to crawl {inner_url}, " f"error: {str(inner_e)}") print("Retrying in 3 seconds...") time.sleep(3) if __name__ == "__main__": parser = argparse.ArgumentParser( description="Web Crawler and Content Saver" ) parser.add_argument( "urls", nargs="+", help="List of URLs to crawl" ) parser.add_argument( "--retry", action="store_true", help="Retry crawling indefinitely every 3 seconds until successful" ) args = parser.parse_args() with ThreadPoolExecutor(max_workers=5) as executor: futures = [] for url in args.urls: if args.retry: futures.append( executor.submit(retry_crawl, url) ) else: futures.append( executor.submit(save_result, url) ) for future in as_completed(futures): try: future.result() except (AttributeError, ValueError) as e: print(f"[ERROR] 🚫 Failed to crawl, error: {str(e)}")