# Spaces: WordLift / create-llms-txt (Running)
# File size: 9,834 Bytes

import asyncio
import logging
import re
import unicodedata
from collections import defaultdict, deque
from urllib.parse import urljoin, urlparse

import gradio as gr
import requests
from bs4 import BeautifulSoup

# Configure the root logger at INFO level and create this module's named
# logger, which the crawler uses to report per-page fetch errors.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class WebsiteCrawler:
    """Breadth-first, same-site crawler that gathers page metadata for llms.txt.

    Every fetched HTML page contributes a cleaned title, a description and a
    category/importance pair derived from keywords found in its URL. The
    collected metadata is rendered to markdown by ``generate_llms_txt``.

    Attributes:
        max_depth: Deepest link level that is fetched (the start page is 0).
        max_pages: Upper bound on the number of pages fetched.
        visited_urls: URLs for which an HTTP response was received.
        url_metadata: Maps fragment-free, slash-trimmed URLs to dicts with
            ``title``, ``description``, ``category`` and ``importance`` keys.
        start_url: URL given to the most recent ``crawl_website`` call, or
            ``None`` before the first crawl.
        headers: HTTP headers sent with every request.
    """

    # (URL keywords, category, importance), checked in priority order.
    _CATEGORY_RULES = (
        (('docs', 'documentation'), 'Docs', 5),
        (('api',), 'API', 4),
        (('guide', 'tutorial'), 'Guides', 3),
        (('example',), 'Examples', 2),
        (('blog',), 'Blog', 1),
    )
    # Order in which category sections are written to the output.
    _SECTION_ORDER = ('Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional')
    # Links whose href contains any of these substrings are never followed.
    _SKIPPED_LINK_MARKERS = ('javascript:', 'mailto:', '.pdf', '.jpg', '.png', '.gif')
    # Everything from the first title separator onwards is dropped. `|`, `#`,
    # `:` and the bullet always count as separators; hyphens and dashes only
    # when preceded by whitespace, so "E-commerce" or "Sign-in" stay intact.
    _TITLE_SUFFIX_RE = re.compile(
        r'\s*(?:[|#:\u2022]|\s[-\u2013\u2014](?:\s|$)).*'
    )

    def __init__(self, max_depth=3, max_pages=50):
        self.max_depth = max_depth
        self.max_pages = max_pages
        self.visited_urls = set()
        self.url_metadata = defaultdict(dict)
        self.start_url = None
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

    @staticmethod
    def _normalize_url(url):
        """Return *url* without its fragment and trailing slashes (dedupe key)."""
        return url.split('#', 1)[0].rstrip('/')

    @staticmethod
    def _same_site(netloc, base_domain):
        """Compare two hosts case-insensitively, ignoring a leading ``www.``."""
        def bare(host):
            host = host.lower()
            return host[4:] if host.startswith('www.') else host
        return bare(netloc) == bare(base_domain)

    def clean_text(self, text, is_title=False):
        """Normalize *text* to single-spaced ASCII.

        Args:
            text: Raw text; ``None`` or an empty string yields ``""``.
            is_title: When true, also drop everything from the first title
                separator onwards, a leading "Welcome to", and the Docusaurus
                skip-link marker.

        Returns:
            The cleaned string (possibly empty).
        """
        if not text:
            return ""
        # NFKD splits accented letters into base letter + combining mark so
        # the ASCII filter below keeps the base letter ("é" -> "e").
        text = unicodedata.normalize('NFKD', text)

        if is_title:
            # Must run before the ASCII filter, otherwise the bullet and dash
            # separators are already gone and the site suffix is kept.
            text = self._TITLE_SUFFIX_RE.sub('', text)
            text = re.sub(r'^\s*Welcome to\s+', '', text)
            text = text.replace('docusaurus_skipToContent_fallback', '')

        text = re.sub(r'[^\x00-\x7F]+', '', text)
        return ' '.join(text.split())

    def _extract_title(self, soup, url):
        """Return the first non-empty cleaned title for the page.

        Sources are tried in order: og:title, <title>, first <h1>, and finally
        the last path segment of *url* (the host name for a root URL).
        """
        candidates = []
        og_title = soup.find('meta', property='og:title')
        if og_title:
            candidates.append(og_title.get('content'))
        for tag_name in ('title', 'h1'):
            tag = soup.find(tag_name)
            if tag:
                candidates.append(tag.text)
        # rstrip so ".../docs/" falls back to "docs" instead of "".
        candidates.append(url.rstrip('/').split('/')[-1])

        for raw in candidates:
            cleaned = self.clean_text(raw, is_title=True)
            if cleaned:
                return cleaned
        return ""

    def _extract_description(self, soup):
        """Return the first non-empty cleaned description for the page.

        Sources are tried in order: meta description, og:description, and the
        text of the first <p>.
        """
        candidates = []
        meta_desc = soup.find('meta', {'name': 'description'})
        if meta_desc:
            candidates.append(meta_desc.get('content'))
        og_desc = soup.find('meta', property='og:description')
        if og_desc:
            candidates.append(og_desc.get('content'))
        first_p = soup.find('p')
        if first_p:
            candidates.append(first_p.text)

        for raw in candidates:
            cleaned = self.clean_text(raw)
            if cleaned:
                return cleaned
        return ""

    @classmethod
    def _categorize(cls, url):
        """Return ``(category, importance)`` from keywords contained in *url*."""
        url_lower = url.lower()
        for keywords, category, importance in cls._CATEGORY_RULES:
            if any(keyword in url_lower for keyword in keywords):
                return category, importance
        return 'Optional', 0

    def _extract_links(self, soup, page_url, base_domain):
        """Return absolute, fragment-free same-site links found in *soup*."""
        links = []
        for anchor in soup.find_all('a', href=True):
            href = anchor['href']
            lowered = href.lower()
            if any(marker in lowered for marker in self._SKIPPED_LINK_MARKERS):
                continue
            # Drop the fragment: "#section" variants are the same document and
            # would otherwise each consume a slot of the page budget.
            target = urljoin(page_url, href).split('#', 1)[0]
            if target and self._same_site(urlparse(target).netloc, base_domain):
                links.append(target)
        return links

    async def crawl_page(self, url, depth, base_domain):
        """Fetch one page, record its metadata and return its same-site links.

        Args:
            url: Page to fetch.
            depth: Link distance of *url* from the start page.
            base_domain: Host (netloc) that followed links must belong to.

        Returns:
            A list of absolute URLs on the same site. The list is empty when
            the page is skipped (depth or page limit reached, already
            visited, HTTP error status, non-HTML content) or when fetching or
            parsing fails; failures are logged, never raised.
        """
        if (
            depth > self.max_depth
            or url in self.visited_urls
            or len(self.visited_urls) >= self.max_pages
        ):
            return []

        try:
            # requests is blocking; run it in the default executor so the
            # event loop is not stalled for the duration of the HTTP call.
            loop = asyncio.get_running_loop()
            response = await loop.run_in_executor(
                None,
                lambda: requests.get(url, headers=self.headers, timeout=10),
            )
            self.visited_urls.add(url)

            # Error pages and non-HTML payloads must not end up in llms.txt.
            if not response.ok:
                logger.warning("Skipping %s: HTTP %s", url, response.status_code)
                return []
            content_type = response.headers.get('Content-Type', '')
            if content_type and 'html' not in content_type.lower():
                return []

            response.encoding = 'utf-8'
            soup = BeautifulSoup(response.text, 'html.parser')

            title = self._extract_title(soup, url)
            if title:  # Only store pages for which a usable title exists
                category, importance = self._categorize(url)
                self.url_metadata[self._normalize_url(url)] = {
                    'title': title,
                    'description': self._extract_description(soup),
                    'category': category,
                    'importance': importance
                }

            # Resolve relative hrefs against the final URL so links stay
            # correct when the request was redirected.
            return self._extract_links(soup, response.url or url, base_domain)

        except Exception as e:
            logger.error(f"Error crawling {url}: {str(e)}")
            return []

    async def crawl_website(self, start_url):
        """Crawl breadth-first from *start_url* within its own site.

        Stops when the queue is exhausted or ``max_pages`` pages have been
        fetched. Results accumulate in ``visited_urls`` and ``url_metadata``.
        """
        self.start_url = start_url
        base_domain = urlparse(start_url).netloc
        queue = deque([(start_url, 0)])
        # Keyed by normalized URL so "/a", "/a/" and "/a#x" are queued once.
        seen = {self._normalize_url(start_url)}

        while queue and len(self.visited_urls) < self.max_pages:
            current_url, depth = queue.popleft()
            if depth > self.max_depth:
                continue

            for link in await self.crawl_page(current_url, depth, base_domain):
                key = self._normalize_url(link)
                if key in seen:
                    continue
                if not self._same_site(urlparse(link).netloc, base_domain):
                    continue
                seen.add(key)
                queue.append((link, depth + 1))

    def generate_llms_txt(self):
        """Render the collected metadata as llms.txt markdown.

        The H1 and block-quote summary come from the start page when its
        metadata was collected, otherwise from the highest-ranked page. Pages
        are ranked by importance (descending) and then URL (ascending); only
        the first page for any given title is listed.

        Returns:
            The markdown text, or a fixed notice when nothing was collected.
        """
        if not self.url_metadata:
            return "No content was found to generate llms.txt"

        sorted_urls = sorted(
            self.url_metadata.items(),
            key=lambda item: (-item[1]['importance'], item[0]),
        )

        # .get() is used because indexing a defaultdict would insert an entry.
        start_url = getattr(self, 'start_url', None)
        main_metadata = None
        if start_url:
            main_metadata = self.url_metadata.get(self._normalize_url(start_url))
        if not main_metadata:
            main_metadata = sorted_urls[0][1]

        content = [f"# {main_metadata['title']}"]
        if main_metadata['description']:
            content.append(f"\n> {main_metadata['description']}")

        # Group by category, keeping the first page seen for each title
        categories = defaultdict(list)
        seen_titles = set()
        for url, metadata in sorted_urls:
            title = metadata['title']
            if title in seen_titles:
                continue
            seen_titles.add(title)
            categories[metadata['category']].append((url, metadata))

        for category in self._SECTION_ORDER:
            if category not in categories:
                continue
            content.append(f"\n## {category}")
            for url, metadata in categories[category]:
                if metadata['description']:
                    content.append(f"\n- [{metadata['title']}]({url}): {metadata['description']}")
                else:
                    content.append(f"\n- [{metadata['title']}]({url})")

        return "\n".join(content)

async def process_url(url, max_depth, max_pages):
    """Crawl *url* and build its llms.txt content.

    Args:
        url: Website address; surrounding whitespace is ignored and
            ``https://`` is prepended when no http(s) scheme is present.
        max_depth: Maximum crawl depth (converted with ``int``).
        max_pages: Maximum number of pages to fetch (converted with ``int``).

    Returns:
        A ``(content, status)`` tuple of strings. ``content`` is empty when
        the URL is invalid or an error occurred; ``status`` then explains why.
    """
    try:
        # Tolerate None and stray whitespace from the text box.
        url = (url or "").strip()
        if not url:
            return "", "Invalid URL format. Please enter a valid URL."

        # Add https:// if not present (schemes are case-insensitive)
        if not url.lower().startswith(('http://', 'https://')):
            url = 'https://' + url

        # Validate URL
        result = urlparse(url)
        if not all([result.scheme, result.netloc]):
            return "", "Invalid URL format. Please enter a valid URL."

        # Process website
        crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
        await crawler.crawl_website(url)
        content = crawler.generate_llms_txt()

        return content, f"Successfully crawled {len(crawler.visited_urls)} pages."

    except Exception as e:
        # UI boundary: report the failure in the status box instead of raising.
        return "", f"Error: {str(e)}"

# Create Gradio interface: a Soft theme with a blue primary hue and Open Sans.
theme = gr.themes.Soft(primary_hue="blue", font="Open Sans")

# Build the Blocks UI. The inline CSS imports Open Sans from Google Fonts and
# applies it to the container, buttons, text boxes, padded areas, inputs and
# labels; it also defines the colours for the .primary-btn class.
with gr.Blocks(theme=theme, css="""
    @import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');
    
    .gradio-container {
        font-family: 'Open Sans', sans-serif !important;
    }
    
    .gr-button {
        font-family: 'Open Sans', sans-serif !important;
        font-weight: 600 !important;
    }

    .primary-btn {
        background-color: #2436d4 !important;
        color: white !important;
    }

    .primary-btn:hover {
        background-color: #1c2aa8 !important;
    }
    
    [data-testid="textbox"] {
        font-family: 'Open Sans', sans-serif !important;
    }
    
    .gr-padded {
        font-family: 'Open Sans', sans-serif !important;
    }
    
    .gr-input {
        font-family: 'Open Sans', sans-serif !important;
    }
    
    .gr-label {
        font-family: 'Open Sans', sans-serif !important;
    }
""") as iface:
    gr.Markdown("# llms.txt Generator")
    gr.Markdown("Generate an llms.txt file from a website following the specification.")
    
    # Website address; process_url prepends https:// when no scheme is given.
    with gr.Row():
        url_input = gr.Textbox(
            label="Website URL", 
            placeholder="Enter the website URL (e.g., example.com)",
            info="The URL will be automatically prefixed with https:// if not provided"
        )
    
    # Crawl limits shown side by side: depth 1-5 (default 3) and a page
    # budget of 10-100 in steps of 10 (default 50).
    with gr.Row():
        with gr.Column():
            depth_input = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Maximum Crawl Depth")
        with gr.Column():
            pages_input = gr.Slider(minimum=10, maximum=100, value=50, step=10, label="Maximum Pages")
    
    generate_btn = gr.Button("Generate llms.txt", variant="primary")
    
    # Generated markdown, shown in a 20-line box with a copy button.
    output = gr.Textbox(
        label="Generated llms.txt Content",
        lines=20,
        show_copy_button=True,
        container=True
    )
    
    # One-line result or error message returned by process_url.
    status = gr.Textbox(label="Status")
    
    # process_url is a coroutine function, so the handler drives it to
    # completion with asyncio.run; its (content, status) tuple fills the two
    # output components in the order listed in `outputs`.
    generate_btn.click(
        fn=lambda url, depth, pages: asyncio.run(process_url(url, depth, pages)),
        inputs=[url_input, depth_input, pages_input],
        outputs=[output, status]
    )

# Start the app only when this file is executed as a script.
if __name__ == "__main__":
    iface.launch()