# Spaces: WordLift / create-llms-txt - Running - File size: 8,290 Bytes

import gradio as gr
import advertools as adv
import pandas as pd
import re
from secrets import token_hex
import logging
import os
from markitdown import MarkItDown
from typing import Tuple, List, Optional
import validators

# Configure process-wide logging once at import time; every record carries a
# timestamp, the logger name and the severity level.
logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Single MarkItDown converter instance shared by all file-conversion requests.
md_converter = MarkItDown()


def validate_url(url: str) -> Tuple[bool, str]:
    """Normalize and validate the format of a user-supplied URL.

    Surrounding whitespace is removed and ``https://`` is prepended when the
    value does not already start with ``http://`` or ``https://``. Only the
    format is checked (through ``validators.url``); no network request is made.

    Args:
        url: Raw URL text as typed by the user, e.g. ``"example.com"``.

    Returns:
        ``(True, normalized_url)`` when the URL is well formed, otherwise
        ``(False, error_message)``.
    """
    # Strip before the emptiness check so whitespace-only input is reported as
    # missing, and so a padded URL is not rejected as malformed.
    url = url.strip() if url else ""
    if not url:
        return False, "URL is required"

    if not url.startswith(("http://", "https://")):
        url = "https://" + url

    # validators.url returns a falsy failure object instead of raising.
    if not validators.url(url):
        return False, "Invalid URL format"

    return True, url


def safe_crawl(url: str, output_file: str) -> bool:
    """Crawl a single page with advertools and report whether it succeeded.

    Links are not followed, so only ``url`` itself is requested. Any exception
    raised by the crawl is logged and converted into a ``False`` result.

    Args:
        url: Page to crawl.
        output_file: Path handed to ``adv.crawl`` for the crawl output.

    Returns:
        ``True`` if ``adv.crawl`` returned normally, ``False`` if it raised.
    """
    # Settings forwarded to the crawler: overall and per-download time limits,
    # robots.txt compliance, one request at a time per domain, and the
    # user-agent string the bot identifies itself with.
    crawl_settings = {
        "CLOSESPIDER_TIMEOUT": 30,
        "ROBOTSTXT_OBEY": True,
        "CONCURRENT_REQUESTS_PER_DOMAIN": 1,
        "USER_AGENT": "Mozilla/5.0 (compatible; LLMContentBot/1.0)",
        "DOWNLOAD_TIMEOUT": 10,
    }

    try:
        adv.crawl(
            url,
            output_file,
            follow_links=False,
            custom_settings=crawl_settings,
        )
    except Exception as e:
        logger.error(f"Crawl error for {url}: {str(e)}")
        return False

    return True


def clean_text(text: str) -> str:
    """Normalize whitespace in ``text`` and split camelCase word boundaries.

    Args:
        text: Text to clean. Anything that is not a non-empty string
            (``None``, ``""``, or a missing-value marker such as NaN) is
            treated as empty.

    Returns:
        The text with a space inserted between a lowercase letter and a
        directly following uppercase letter, every whitespace run collapsed
        to a single space, and no leading or trailing whitespace. Returns
        ``""`` for empty or non-string input.
    """
    # NaN is truthy, so a plain ``not text`` check would let it reach re.sub
    # and raise TypeError; reject every non-string value explicitly.
    if not isinstance(text, str) or not text:
        return ""

    # Split camelCase words ("helloWorld" -> "hello World").
    text = re.sub(r"([a-z])([A-Z])", r"\1 \2", text)

    # split()/join collapses all whitespace runs (including newlines) and
    # trims both ends in a single pass.
    return " ".join(text.split())


def process_link_pair(url: str, text: str, seen_links: set) -> Optional[str]:
    """Render one link as a markdown entry, skipping blanks and duplicates.

    Args:
        url: Link target; surrounding whitespace is stripped.
        text: Anchor text; cleaned with ``clean_text``.
        seen_links: URLs already emitted. The stripped URL is added to this
            set when an entry is produced.

    Returns:
        A two-line markdown snippet (heading followed by the link), or
        ``None`` when the URL or text is empty or the URL was already seen.
    """
    if not (url and text):
        return None

    target = url.strip()
    label = clean_text(text)

    # Either value may have become empty after stripping/cleaning.
    has_content = bool(label) and bool(target)
    if not has_content or target in seen_links:
        return None

    seen_links.add(target)
    return f"## {label}\n[{label}]({target})"


def process_links(crawl_df: pd.DataFrame, link_types: List[str]) -> str:
    """Build the markdown link list for the selected link types.

    With no selection, or when ``"All links"`` is selected, every link
    reported by ``adv.crawlytics.links`` is used. Otherwise each selected type
    is mapped to ``header``, ``footer`` or ``nav`` and the first row's
    ``<section>_links_url`` / ``<section>_links_text`` columns are read as
    ``@@``-separated lists. URLs are deduplicated across all sections.

    Args:
        crawl_df: Crawl output loaded into a DataFrame.
        link_types: Link type labels chosen in the UI; may be empty or None.

    Returns:
        Markdown entries joined by blank lines. Sections whose columns are
        missing or hold no string data are skipped. If an unexpected error
        occurs it is logged and the entries collected so far are returned.
    """
    all_links: List[str] = []
    seen_links: set = set()  # Track unique URLs across every section

    try:
        # Check emptiness first so a None selection cannot raise on ``in``.
        if not link_types or "All links" in link_types:
            link_df = adv.crawlytics.links(crawl_df)
            for link, text in link_df[["link", "text"]].dropna().values:
                if md_link := process_link_pair(link, text, seen_links):
                    all_links.append(md_link)
        else:
            for link_type in link_types:
                type_match = re.search(r"header|footer|nav", link_type.lower())
                if not type_match:
                    continue

                col_prefix = type_match.group(0)
                url_col = f"{col_prefix}_links_url"
                text_col = f"{col_prefix}_links_text"

                # A page without this section has no such columns; skip it
                # instead of failing the whole request.
                if url_col not in crawl_df.columns or text_col not in crawl_df.columns:
                    continue

                urls = crawl_df[url_col].iloc[0]
                texts = crawl_df[text_col].iloc[0]

                # Missing values (NaN) are truthy but have no .split().
                if not (isinstance(urls, str) and isinstance(texts, str)):
                    continue
                if not urls or not texts:
                    continue

                for url, text in zip(urls.split("@@"), texts.split("@@")):
                    if md_link := process_link_pair(url, text, seen_links):
                        all_links.append(md_link)
    except Exception as e:
        # Best effort: keep whatever was collected before the failure.
        logger.error(f"Link processing error: {str(e)}")

    return "\n\n".join(all_links)


def _first_text_value(crawl_df: pd.DataFrame, column: str) -> str:
    """Return the cleaned first-row value of ``column``, or "" if unusable."""
    if column not in crawl_df.columns:
        return ""
    value = crawl_df[column].iloc[0]
    # Missing values come back as NaN (a float), which clean_text cannot take.
    return clean_text(value) if isinstance(value, str) else ""


def process_url(url: str, link_types: List[str]) -> Tuple[str, str]:
    """Crawl a website URL and generate markdown content from it.

    The URL is validated, crawled into a uniquely named temporary ``.jsonl``
    file in the working directory, and turned into markdown made of the page
    title, an optional block-quoted meta description and the selected links.
    The temporary file is always removed afterwards.

    Args:
        url: Website URL entered by the user.
        link_types: Link type labels forwarded to ``process_links``.

    Returns:
        ``(markdown, status_message)``. ``markdown`` is ``""`` whenever
        validation, crawling or processing fails; ``status_message`` then
        describes the problem.
    """
    valid, result = validate_url(url)
    if not valid:
        return "", result

    url = result
    # Random suffix keeps concurrent requests from sharing an output file.
    output_file = f"crawl_{token_hex(6)}.jsonl"

    try:
        if not safe_crawl(url, output_file):
            return "", "Crawl failed or timed out"

        # If the crawl finished without writing anything, pd.read_json would
        # raise and expose the temp filename; report it as "no data" instead.
        if not os.path.exists(output_file) or os.path.getsize(output_file) == 0:
            return "", "No data found for the URL"

        crawl_df = pd.read_json(output_file, lines=True)
        if crawl_df.empty:
            return "", "No data found for the URL"

        # Extract and clean title and description; fall back to "Untitled"
        # when the title is absent, empty or not a string.
        title = _first_text_value(crawl_df, "title") or "Untitled"
        meta_desc = _first_text_value(crawl_df, "meta_desc")

        # Process links
        links_content = process_links(crawl_df, link_types)

        # Generate final markdown
        content = f"# {title}\n\n"
        if meta_desc:
            content += f"> {meta_desc}\n\n"
        content += links_content

        return content, f"Successfully processed {url}"

    except Exception as e:
        logger.error(f"Error processing {url}: {str(e)}")
        return "", f"Error: {str(e)}"
    finally:
        # Runs on every exit path, including the early returns above.
        if os.path.exists(output_file):
            os.remove(output_file)


def process_file(file: "gr.File") -> Tuple[str, str]:
    """Convert an uploaded file to markdown.

    Args:
        file: The uploaded file as delivered by the upload component: either
            an object exposing the path as ``.name`` or a plain path string.
            A falsy value means nothing was uploaded.

    Returns:
        ``(markdown, status_message)``. ``markdown`` is ``""`` when no file
        was given, the extension is not supported, or conversion fails.
    """
    if not file:
        return "", "No file uploaded"

    # Accept both a wrapper object with a ``name`` attribute and a bare path.
    file_path = getattr(file, "name", file)

    supported_extensions = {".pdf", ".docx", ".pptx", ".xlsx", ".html", ".txt"}
    file_ext = os.path.splitext(file_path)[1].lower()

    if file_ext not in supported_extensions:
        return "", f"Unsupported file type: {file_ext}"

    try:
        result = md_converter.convert(file_path)
        return result.text_content, "File processed successfully"
    except Exception as e:
        logger.error(f"File processing error: {str(e)}")
        return "", f"Error processing file: {str(e)}"


# Custom CSS for styling
# Loads the Open Sans web font and applies it to the page body, and defines
# normal/hover background colors for elements carrying the ``primary-btn``
# class.
# NOTE(review): the ``.primary-btn`` rules only affect components that are
# given that class (e.g. via ``elem_classes``) - confirm the buttons set it.
css = """
@import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@300;400;600;700&display=swap');

body {
    font-family: 'Open Sans', sans-serif !important;
}

.primary-btn {
    background-color: #3452db !important;
}

.primary-btn:hover {
    background-color: #2a41af !important;
}
"""

# Create a custom theme
# Soft theme with a hand-picked blue ramp as its primary hue, running from the
# lightest shade (c50) to the darkest (c950). c600 and c700 are the same hex
# values as the ``.primary-btn`` background and hover colors in ``css``.
theme = gr.themes.Soft(
    primary_hue=gr.themes.colors.Color(
        name="blue",
        c50="#eef1ff",
        c100="#e0e5ff",
        c200="#c3cbff",
        c300="#a5b2ff",
        c400="#8798ff",
        c500="#6a7eff",
        c600="#3452db",
        c700="#2a41af",
        c800="#1f3183",
        c900="#152156",
        c950="#0a102b",
    )
)

# Create interface
# Two tabs: one crawls a website URL and lists its links as markdown, the
# other converts an uploaded document to markdown. ``head`` injects canonical
# and Open Graph metadata into the served page.
with gr.Blocks(
    # Use the custom palette built above; passing a fresh gr.themes.Soft()
    # here would leave that ``theme`` object unused.
    theme=theme,
    css=css,
    head="""
        <link rel="canonical" href="https://wordlift.io/generate-llms-txt/" />
        <meta name="description" content="Generate your LLMs.txt file - A WordLift tool to help you manage Large Language Models access to your content." />
        <meta property="og:title" content="LLMs.txt Generator by WordLift" />
        <meta property="og:description" content="Generate your LLMs.txt file - A WordLift tool to help you manage Large Language Models access to your content." />
        <meta property="og:url" content="https://wordlift.io/generate-llms-txt/" />
    """,
) as iface:
    gr.Markdown("# LLMs.txt Generator")

    with gr.Tab("Website URL"):
        url_input = gr.Textbox(label="Website URL", placeholder="example.com")
        link_types = gr.Dropdown(
            choices=["All links", "<header> links", "<nav> links", "<footer> links"],
            multiselect=True,
            value=["All links"],
            label="Link Types to Extract",
        )
        url_button = gr.Button("Process URL", variant="primary")
        url_output = gr.Textbox(
            label="Generated Content", lines=20, show_copy_button=True
        )
        url_status = gr.Textbox(label="Status")

        # process_url returns (markdown, status), matching the two outputs.
        url_button.click(
            process_url,
            inputs=[url_input, link_types],
            outputs=[url_output, url_status],
        )

    with gr.Tab("File Converter"):
        file_input = gr.File(label="Upload Document")
        file_button = gr.Button("Convert to Markdown", variant="primary")
        file_output = gr.Textbox(
            label="Converted Content", lines=20, show_copy_button=True
        )
        file_status = gr.Textbox(label="Status")

        # process_file returns (markdown, status), matching the two outputs.
        file_button.click(
            process_file, inputs=[file_input], outputs=[file_output, file_status]
        )

# Start the Gradio app only when this file is run as a script, not on import.
if __name__ == "__main__":
    iface.launch()