import gradio as gr
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin, urlparse
import asyncio
from collections import defaultdict
import unicodedata
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class WebsiteCrawler:
    def __init__(self, max_depth=3, max_pages=50):
        self.max_depth = max_depth
        self.max_pages = max_pages
        self.visited_urls = set()
        self.url_metadata = defaultdict(dict)
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        }

    def clean_text(self, text, is_title=False):
        """Clean and normalize text"""
        if not text:
            return ""
        # Normalize unicode characters
        text = unicodedata.normalize("NFKD", text)
        text = re.sub(r"[^\x00-\x7F]+", "", text)
        if is_title:
            # Remove common suffixes and fragments for titles
            text = re.sub(r"\s*[\|\-#:•].*", "", text)
            text = re.sub(r"^\s*Welcome to\s+", "", text)
            text = text.replace("docusaurus_skipToContent_fallback", "")
        return " ".join(text.split()).strip()

    async def crawl_page(self, url, depth, base_domain):
        """Crawl a single page and extract information"""
        if (
            depth > self.max_depth
            or url in self.visited_urls
            or len(self.visited_urls) >= self.max_pages
        ):
            return []

        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()  # skip error pages instead of indexing them
            response.encoding = "utf-8"
            self.visited_urls.add(url)

            soup = BeautifulSoup(response.text, "html.parser")

            # Extract title with fallbacks
            title = None
            meta_title = soup.find("meta", property="og:title")
            if meta_title and meta_title.get("content"):
                title = meta_title["content"]
            if not title:
                title_tag = soup.find("title")
                if title_tag:
                    title = title_tag.text
            if not title:
                h1_tag = soup.find("h1")
                if h1_tag:
                    title = h1_tag.text
            if not title:
                title = url.split("/")[-1]

            title = self.clean_text(title, is_title=True)

            # Extract description with fallbacks
            desc = None
            meta_desc = soup.find("meta", {"name": "description"})
            if meta_desc and meta_desc.get("content"):
                desc = meta_desc["content"]
            if not desc:
                og_desc = soup.find("meta", property="og:description")
                if og_desc and og_desc.get("content"):
                    desc = og_desc["content"]
            if not desc:
                first_p = soup.find("p")
                if first_p:
                    desc = first_p.text

            desc = self.clean_text(desc) if desc else ""

            # Determine category and importance
            url_lower = url.lower()
            category = "Optional"
            importance = 0

            if "docs" in url_lower or "documentation" in url_lower:
                category = "Docs"
                importance = 5
            elif "api" in url_lower:
                category = "API"
                importance = 4
            elif "guide" in url_lower or "tutorial" in url_lower:
                category = "Guides"
                importance = 3
            elif "example" in url_lower:
                category = "Examples"
                importance = 2
            elif "blog" in url_lower:
                category = "Blog"
                importance = 1

            # Store metadata
            clean_url = re.sub(r"#.*", "", url).rstrip("/")
            if title and len(title.strip()) > 0:  # Only store if we have a valid title
                self.url_metadata[clean_url] = {
                    "title": title,
                    "description": desc,
                    "category": category,
                    "importance": importance,
                }

            # Find links
            links = []
            for a in soup.find_all("a", href=True):
                href = a["href"]
                if not any(
                    x in href.lower()
                    for x in ["javascript:", "mailto:", ".pdf", ".jpg", ".png", ".gif"]
                ):
                    next_url = urljoin(url, href)
                    if urlparse(next_url).netloc == base_domain:
                        links.append(next_url)

            return links

        except Exception as e:
            logger.error(f"Error crawling {url}: {str(e)}")
            return []

    async def crawl_website(self, start_url):
        """Crawl website starting from the given URL"""
        base_domain = urlparse(start_url).netloc
        queue = [(start_url, 0)]
        seen = {start_url}

        # Breadth-first crawl within the starting domain
        while queue and len(self.visited_urls) < self.max_pages:
            current_url, depth = queue.pop(0)

            if depth > self.max_depth:
                continue

            links = await self.crawl_page(current_url, depth, base_domain)

            for link in links:
                if link not in seen and urlparse(link).netloc == base_domain:
                    seen.add(link)
                    queue.append((link, depth + 1))

    def clean_description(self, desc):
        """Clean description text"""
        if not desc:
            return ""
        # Remove leading dashes, hyphens, or colons
        desc = re.sub(r"^[-:\s]+", "", desc)
        # Drop any strings that are just a single word ("Editors", "APIs", etc.)
        if len(desc.split()) <= 1:
            return ""
        return desc.strip()

    def generate_llms_txt(self):
        """Generate llms.txt content"""
        if not self.url_metadata:
            return "No content was found to generate llms.txt"

        # Sort URLs by importance and remove duplicate titles
        sorted_urls = []
        seen_titles = set()
        for url, metadata in sorted(
            self.url_metadata.items(),
            key=lambda x: (x[1]["importance"], x[0]),
            reverse=True,
        ):
            if metadata["title"] not in seen_titles:
                sorted_urls.append((url, metadata))
                seen_titles.add(metadata["title"])

        if not sorted_urls:
            return "No valid content was found"

        # Generate content
        content = []

        # Title for the main header (currently always "Welcome")
        main_title = "Welcome"

        # Find a good description for the blockquote
        best_description = None
        for _, metadata in sorted_urls:
            desc = self.clean_description(metadata["description"])
            if desc and len(desc) > 20 and "null" not in desc.lower():
                best_description = desc
                break

        content.append(f"# {main_title}")
        if best_description:
            content.append(f"\n> {best_description}")

        # Group by category
        categories = defaultdict(list)
        for url, metadata in sorted_urls:
            if metadata["title"] and url:
                categories[metadata["category"]].append((url, metadata))

        # Add sections
        for category in ["Docs", "API", "Guides", "Examples", "Blog", "Optional"]:
            if category in categories:
                content.append(f"\n## {category}")
                # Add links without extra newlines
                links = []
                for url, metadata in categories[category]:
                    title = metadata["title"].strip()
                    desc = self.clean_description(metadata["description"])
                    if desc:
                        links.append(f"- [{title}]({url}): {desc}")
                    else:
                        links.append(f"- [{title}]({url})")
                content.append("\n".join(links))

        return "\n".join(content)


async def process_url(url, max_depth, max_pages):
    """Process URL and generate llms.txt"""
    try:
        # Add https:// if not present
        if not url.startswith(("http://", "https://")):
            url = "https://" + url

        # Validate URL
        result = urlparse(url)
        if not all([result.scheme, result.netloc]):
            return "", "Invalid URL format. Please enter a valid URL."

        # Process website
        crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
        await crawler.crawl_website(url)
        content = crawler.generate_llms_txt()

        return content, f"Successfully crawled {len(crawler.visited_urls)} pages."

    except Exception as e:
        return "", f"Error: {str(e)}"


# Create Gradio interface
theme = gr.themes.Soft(primary_hue="blue", font="Open Sans")

with gr.Blocks(
    theme=theme,
    css="""
    @import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');

    .gradio-container { font-family: 'Open Sans', sans-serif !important; }
    .gr-button { font-family: 'Open Sans', sans-serif !important; font-weight: 600 !important; }
    .primary-btn { background-color: #2436d4 !important; color: white !important; }
    .primary-btn:hover { background-color: #1c2aa8 !important; }
    [data-testid="textbox"] { font-family: 'Open Sans', sans-serif !important; }
    .gr-padded { font-family: 'Open Sans', sans-serif !important; }
    .gr-input { font-family: 'Open Sans', sans-serif !important; }
    .gr-label { font-family: 'Open Sans', sans-serif !important; }
    """,
) as iface:
    gr.Markdown("# llms.txt Generator")
    gr.Markdown("Generate an llms.txt file from a website following the specification.")

    with gr.Row():
        url_input = gr.Textbox(
            label="Website URL",
            placeholder="Enter the website URL (e.g., example.com)",
            info="The URL will be automatically prefixed with https:// if not provided",
        )

    with gr.Row():
        with gr.Column():
            depth_input = gr.Slider(
                minimum=1, maximum=5, value=3, step=1, label="Maximum Crawl Depth"
            )
        with gr.Column():
            pages_input = gr.Slider(
                minimum=10, maximum=100, value=50, step=10, label="Maximum Pages"
            )

    generate_btn = gr.Button("Generate llms.txt", variant="primary")

    output = gr.Textbox(
        label="Generated llms.txt Content",
        lines=20,
        show_copy_button=True,
        container=True,
    )

    status = gr.Textbox(label="Status")

    generate_btn.click(
        # Wrap the async pipeline so this synchronous Gradio handler can drive it
        fn=lambda url, depth, pages: asyncio.run(process_url(url, depth, pages)),
        inputs=[url_input, depth_input, pages_input],
        outputs=[output, status],
    )

if __name__ == "__main__":
    iface.launch()