cyberandy committed
Commit
5e3183d
1 Parent(s): 1ce3f39

Create app.py

Files changed (1)
  1. app.py +160 -0
app.py ADDED
@@ -0,0 +1,160 @@
+import gradio as gr
+import requests
+from bs4 import BeautifulSoup
+import re
+from urllib.parse import urljoin, urlparse
+import markdown
+
+def get_website_title(soup):
+    """Extract website title from meta tags or title tag"""
+    # Try meta title first
+    meta_title = soup.find('meta', property='og:title')
+    if meta_title:
+        return meta_title['content']
+
+    # Try regular title tag
+    title_tag = soup.find('title')
+    if title_tag:
+        return title_tag.text.strip()
+
+    # Fallback to h1
+    h1_tag = soup.find('h1')
+    if h1_tag:
+        return h1_tag.text.strip()
+
+    return "Website Title"
+
+def get_website_description(soup):
+    """Extract website description from meta tags"""
+    # Try meta description
+    meta_desc = soup.find('meta', {'name': 'description'}) or soup.find('meta', property='og:description')
+    if meta_desc:
+        return meta_desc.get('content', '')
+
+    # Fallback to first paragraph
+    first_p = soup.find('p')
+    if first_p:
+        return first_p.text.strip()
+
+    return "Website description"
+
+def get_important_links(soup, base_url):
+    """Extract important links from the website"""
+    links = []
+    seen_urls = set()
+
+    # Look for navigation links
+    nav_elements = soup.find_all(['nav', 'header'])
+    for nav in nav_elements:
+        for a in nav.find_all('a', href=True):
+            url = urljoin(base_url, a['href'])
+            if url not in seen_urls and not url.startswith(('javascript:', 'mailto:', 'tel:')):
+                text = a.text.strip()
+                if text and len(text) > 1:  # Avoid empty or single-character links
+                    links.append({
+                        'title': text,
+                        'url': url,
+                        'section': 'Docs'
+                    })
+                    seen_urls.add(url)
+
+    # Look for footer links
+    footer = soup.find('footer')
+    if footer:
+        for a in footer.find_all('a', href=True):
+            url = urljoin(base_url, a['href'])
+            if url not in seen_urls and not url.startswith(('javascript:', 'mailto:', 'tel:')):
+                text = a.text.strip()
+                if text and len(text) > 1:
+                    links.append({
+                        'title': text,
+                        'url': url,
+                        'section': 'Optional'
+                    })
+                    seen_urls.add(url)
+
+    return links
+
+def generate_llms_txt(url):
+    try:
+        # Fetch the webpage
+        headers = {
+            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
+        }
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()
+
+        # Parse the HTML
+        soup = BeautifulSoup(response.text, 'html.parser')
+
+        # Get base components
+        title = get_website_title(soup)
+        description = get_website_description(soup)
+        links = get_important_links(soup, url)
+
+        # Generate llms.txt content
+        content = [
+            f"# {title}\n",
+            f"> {description}\n",
+            "## Docs\n"
+        ]
+
+        # Add documentation links
+        doc_links = [link for link in links if link['section'] == 'Docs']
+        for link in doc_links:
+            content.append(f"- [{link['title']}]({link['url']}): Documentation page\n")
+
+        # Add optional links if present
+        optional_links = [link for link in links if link['section'] == 'Optional']
+        if optional_links:
+            content.append("\n## Optional\n")
+            for link in optional_links:
+                content.append(f"- [{link['title']}]({link['url']})\n")
+
+        # Join all content
+        llms_txt_content = "\n".join(content)
+
+        return llms_txt_content
+    except Exception as e:
+        return f"Error generating llms.txt: {str(e)}"
+
+def save_llms_txt(content, save_path="llms.txt"):
+    """Save the generated content to a file"""
+    try:
+        with open(save_path, 'w', encoding='utf-8') as f:
+            f.write(content)
+        return f"Successfully saved to {save_path}"
+    except Exception as e:
+        return f"Error saving file: {str(e)}"
+
+# Gradio callback: generate the content and optionally save it to disk
+def process_url(url, save_to_file=False):
+    content = generate_llms_txt(url)
+    if save_to_file:
+        save_message = save_llms_txt(content)
+        return content, save_message
+    return content, "File not saved (checkbox not selected)"
+
+# Create the Gradio interface
+iface = gr.Interface(
+    fn=process_url,
+    inputs=[
+        gr.Textbox(label="Website URL", placeholder="Enter the website URL..."),
+        gr.Checkbox(label="Save to file", value=False)
+    ],
+    outputs=[
+        gr.Textbox(label="Generated llms.txt Content", lines=10),
+        gr.Textbox(label="Status")
+    ],
+    title="llms.txt Generator",
+    description="Generate an llms.txt file from a website following the specification. The tool extracts relevant information and creates a structured markdown file suitable for LLMs.",
+    examples=[
+        ["https://example.com", False],
+        ["https://docs.python.org", True]
+    ],
+    theme=gr.themes.Soft()
+)
+
+# Launch the app
+if __name__ == "__main__":
+    iface.launch()
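
A quick way to exercise the generator without launching the Gradio UI is to import the function directly. A minimal sketch, assuming the file above is saved as app.py in the working directory and that gradio, requests, beautifulsoup4, and markdown are installed:

    # Smoke-test the generator directly, bypassing the Gradio interface.
    # Assumes the committed file above is saved as app.py (hypothetical setup).
    from app import generate_llms_txt

    content = generate_llms_txt("https://example.com")
    print(content)  # generated llms.txt markdown, or an error string on failure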