Update app.py
app.py
CHANGED
@@ -4,119 +4,183 @@ from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin, urlparse
import markdown

-    meta_title = soup.find('meta', property='og:title')
-    if meta_title:
-        return meta_title['content']
-
-    # Try regular title tag
-    title_tag = soup.find('title')
-    if title_tag:
-        return title_tag.text.strip()
-
-    # Fallback to h1
-    h1_tag = soup.find('h1')
-    if h1_tag:
-        return h1_tag.text.strip()
-
-    return "Website Title"
-
-def get_website_description(soup):
-    """Extract website description from meta tags"""
-    # Try meta description
-    meta_desc = soup.find('meta', {'name': 'description'}) or soup.find('meta', property='og:description')
-    if meta_desc:
-        return meta_desc.get('content', '')
-
-    # Fallback to first paragraph
-    first_p = soup.find('p')
-    if first_p:
-        return first_p.text.strip()
-
-    return "Website description"
-
-def get_important_links(soup, base_url):
-    """Extract important links from the website"""
-    links = []
-    seen_urls = set()
-
-    # Look for navigation links
-    nav_elements = soup.find_all(['nav', 'header'])
-    for nav in nav_elements:
-        for a in nav.find_all('a', href=True):
-            url = urljoin(base_url, a['href'])
-            if url not in seen_urls and not url.startswith(('javascript:', 'mailto:', 'tel:')):
-                text = a.text.strip()
-                if text and len(text) > 1:  # Avoid empty or single-character links
-                    links.append({
-                        'title': text,
-                        'url': url,
-                        'section': 'Docs'
-                    })
-                    seen_urls.add(url)
-
-    # Look for footer links
-    footer = soup.find('footer')
-    if footer:
-        for a in footer.find_all('a', href=True):
-            url = urljoin(base_url, a['href'])
-            if url not in seen_urls and not url.startswith(('javascript:', 'mailto:', 'tel:')):
-                text = a.text.strip()
-                if text and len(text) > 1:
-                    links.append({
-                        'title': text,
-                        'url': url,
-                        'section': 'Optional'
-                    })
-                    seen_urls.add(url)
-
-    return links
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
-    # Add
-    content.append("
-    content.append(f"- [{link['title']}]({link['url']})\n")
-    #

def save_llms_txt(content, save_path="llms.txt"):
    """Save the generated content to a file"""
@@ -127,32 +191,62 @@ def save_llms_txt(content, save_path="llms.txt"):
    except Exception as e:
        return f"Error saving file: {str(e)}"

# Create the Gradio interface
iface = gr.Interface(
-    fn=process_url,
    inputs=[
        gr.Textbox(label="Website URL", placeholder="Enter the website URL..."),
        gr.Checkbox(label="Save to file", value=False)
    ],
    outputs=[
-        gr.Textbox(label="Generated llms.txt Content", lines=
        gr.Textbox(label="Status")
    ],
    title="llms.txt Generator",
-    description="Generate an llms.txt file from a website following the specification. The tool
    examples=[
-        ["https://example.com", False],
-        ["https://docs.python.org", True]
    ],
-    theme=gr.themes.Soft()
)

# Launch the app
import re
from urllib.parse import urljoin, urlparse
import markdown
+from concurrent.futures import ThreadPoolExecutor
+import asyncio
+from collections import defaultdict
+import time
+import logging

+# Set up logging
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)

+class WebsiteCrawler:
+    def __init__(self, max_depth=3, max_pages=50, timeout=30):
+        self.max_depth = max_depth
+        self.max_pages = max_pages
+        self.timeout = timeout
+        self.visited_urls = set()
+        self.url_content = {}
+        self.url_metadata = defaultdict(dict)
+        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
+
+    def is_valid_url(self, url, base_domain):
+        """Check if URL is valid and belongs to the same domain"""
+        try:
+            parsed = urlparse(url)
+            base_parsed = urlparse(base_domain)
+            return (parsed.netloc == base_parsed.netloc and
+                    parsed.scheme in ['http', 'https'] and
+                    not url.endswith(('.pdf', '.jpg', '.png', '.gif', '.zip')))
+        except:
+            return False
+
+    def extract_content(self, soup):
+        """Extract meaningful content from HTML"""
+        # Remove script and style elements
+        for element in soup(['script', 'style', 'nav', 'footer', 'header']):
+            element.decompose()
+
+        # Get main content
+        main_content = soup.find('main') or soup.find('article') or soup.find('div', {'class': re.compile(r'content|main', re.I)})
+        if main_content:
+            return main_content.get_text(strip=True)
+        return soup.get_text(strip=True)
+
+    def get_page_metadata(self, soup, url):
+        """Extract metadata from the page"""
+        metadata = {
+            'title': None,
+            'description': None,
+            'importance': 0,
+            'category': 'Optional'
+        }
+
+        # Title extraction
+        metadata['title'] = (
+            soup.find('meta', property='og:title')['content'] if soup.find('meta', property='og:title') else
+            soup.find('title').text if soup.find('title') else
+            soup.find('h1').text if soup.find('h1') else
+            url.split('/')[-1]
+        )
+
+        # Description extraction
+        metadata['description'] = (
+            soup.find('meta', {'name': 'description'})['content'] if soup.find('meta', {'name': 'description'}) else
+            soup.find('meta', property='og:description')['content'] if soup.find('meta', property='og:description') else
+            ""
+        )
+
+        # Calculate importance based on various factors
+        importance = 0
+        if 'docs' in url.lower() or 'documentation' in url.lower():
+            importance += 5
+            metadata['category'] = 'Docs'
+        if 'api' in url.lower():
+            importance += 4
+            metadata['category'] = 'API'
+        if 'guide' in url.lower() or 'tutorial' in url.lower():
+            importance += 3
+            metadata['category'] = 'Guides'
+        if 'example' in url.lower():
+            importance += 2
+            metadata['category'] = 'Examples'
+        if 'blog' in url.lower():
+            importance += 1
+            metadata['category'] = 'Blog'
+
+        metadata['importance'] = importance
+        return metadata
+
+    async def crawl_page(self, url, depth, base_domain):
+        """Crawl a single page and extract information"""
+        if depth > self.max_depth or url in self.visited_urls or len(self.visited_urls) >= self.max_pages:
+            return []
+
+        try:
+            response = requests.get(url, headers=self.headers, timeout=self.timeout)
+            response.raise_for_status()
+            self.visited_urls.add(url)
+
+            soup = BeautifulSoup(response.text, 'html.parser')
+            content = self.extract_content(soup)
+            metadata = self.get_page_metadata(soup, url)
+
+            self.url_content[url] = content
+            self.url_metadata[url] = metadata
+
+            # Find all links
+            links = []
+            for a in soup.find_all('a', href=True):
+                next_url = urljoin(url, a['href'])
+                if self.is_valid_url(next_url, base_domain):
+                    links.append(next_url)
+
+            return links
+
+        except Exception as e:
+            logger.error(f"Error crawling {url}: {str(e)}")
+            return []
+
+    async def crawl_website(self, start_url):
+        """Crawl website starting from the given URL"""
+        base_domain = start_url
+        queue = [(start_url, 0)]
+        seen = {start_url}
+
+        while queue and len(self.visited_urls) < self.max_pages:
+            current_url, depth = queue.pop(0)
+
+            if depth > self.max_depth:
+                continue
+
+            links = await self.crawl_page(current_url, depth, base_domain)
+
+            for link in links:
+                if link not in seen:
+                    seen.add(link)
+                    queue.append((link, depth + 1))
+
+    def generate_llms_txt(self):
+        """Generate llms.txt content from crawled data"""
+        # Sort URLs by importance
+        sorted_urls = sorted(
+            self.url_metadata.items(),
+            key=lambda x: (x[1]['importance'], x[0]),
+            reverse=True
+        )
+
+        # Group URLs by category
+        categorized_urls = defaultdict(list)
+        for url, metadata in sorted_urls:
+            categorized_urls[metadata['category']].append((url, metadata))
+
+        # Generate content
+        content = []

+        # Add main title and description
+        if sorted_urls:
+            main_metadata = sorted_urls[0][1]
+            content.append(f"# {main_metadata['title']}\n")
+            content.append(f"> {main_metadata['description']}\n")

+        # Add categorized sections
+        priority_order = ['Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional']

+        for category in priority_order:
+            if category in categorized_urls:
+                content.append(f"\n## {category}\n")
+                for url, metadata in categorized_urls[category]:
+                    title = metadata['title']
+                    desc = metadata['description']
+                    if desc:
+                        content.append(f"- [{title}]({url}): {desc[:100]}...\n")
+                    else:
+                        content.append(f"- [{title}]({url})\n")
+
+        return "\n".join(content)

def save_llms_txt(content, save_path="llms.txt"):
    """Save the generated content to a file"""

    except Exception as e:
        return f"Error saving file: {str(e)}"

+async def process_url(url, max_depth, max_pages, save_to_file=False):
+    """Process URL and generate llms.txt"""
+    try:
+        crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
+        await crawler.crawl_website(url)
+        content = crawler.generate_llms_txt()
+
+        if save_to_file:
+            save_message = save_llms_txt(content)
+            return content, f"Crawled {len(crawler.visited_urls)} pages. {save_message}"
+
+        return content, f"Crawled {len(crawler.visited_urls)} pages. File not saved (checkbox not selected)"
+
+    except Exception as e:
+        return "", f"Error: {str(e)}"
+
+# Create the Gradio interface with custom CSS for Open Sans font
+css = """
+@import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');
+
+body, .gradio-container {
+    font-family: 'Open Sans', sans-serif !important;
+}
+
+.gr-box {
+    border-radius: 8px !important;
+    border: 1px solid #e5e7eb !important;
+}
+
+.gr-button {
+    font-family: 'Open Sans', sans-serif !important;
+    font-weight: 600 !important;
+}
+"""

# Create the Gradio interface
iface = gr.Interface(
+    fn=lambda url, max_depth, max_pages, save: asyncio.run(process_url(url, max_depth, max_pages, save)),
    inputs=[
        gr.Textbox(label="Website URL", placeholder="Enter the website URL..."),
+        gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Maximum Crawl Depth"),
+        gr.Slider(minimum=10, maximum=100, value=50, step=10, label="Maximum Pages to Crawl"),
        gr.Checkbox(label="Save to file", value=False)
    ],
    outputs=[
+        gr.Textbox(label="Generated llms.txt Content", lines=20),
        gr.Textbox(label="Status")
    ],
    title="llms.txt Generator",
+    description="Generate an llms.txt file from a website following the specification. The tool crawls the website and creates a structured markdown file suitable for LLMs.",
    examples=[
+        ["https://example.com", 3, 50, False],
+        ["https://docs.python.org", 3, 50, True]
    ],
+    theme=gr.themes.Soft(),
+    css=css
)

# Launch the app
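
For reference, a minimal sketch of how the new crawler could be exercised outside the Gradio UI. It assumes the updated app.py is importable as a module named app and uses an illustrative URL and small limits; adjust these to your setup.

# Minimal usage sketch (hypothetical module name `app`; not part of the commit above).
import asyncio

from app import WebsiteCrawler, save_llms_txt

async def main():
    # Small limits keep the example quick; the UI defaults are depth=3, pages=50.
    crawler = WebsiteCrawler(max_depth=2, max_pages=10)
    await crawler.crawl_website("https://example.com")

    # generate_llms_txt() returns markdown in llms.txt shape: "# Title", "> description",
    # then "## Docs" / "## API" / ... sections with "- [Title](url): description" entries.
    content = crawler.generate_llms_txt()
    print(content)
    print(save_llms_txt(content))  # writes llms.txt and returns a status message

if __name__ == "__main__":
    asyncio.run(main())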