# Captured page header (not part of the program): "Spaces: Running".
import gradio as gr | |
import requests | |
from bs4 import BeautifulSoup | |
import re | |
from urllib.parse import urljoin, urlparse | |
import markdown | |
def get_website_title(soup):
    """Return the best available title for a parsed page.

    Candidates are tried in order: the ``og:title`` meta tag, the
    ``<title>`` tag, then the first ``<h1>``. A candidate that is missing
    or blank is skipped so the next one gets a chance.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find``.

    Returns:
        The stripped title text, or ``"Website Title"`` when no candidate
        yields any text.
    """
    # An og:title tag may lack a ``content`` attribute; ``get`` avoids the
    # KeyError that subscripting would raise and lets us fall through.
    meta_title = soup.find('meta', property='og:title')
    if meta_title is not None:
        og_title = (meta_title.get('content') or '').strip()
        if og_title:
            return og_title

    # Regular <title> first, then the first <h1> as a last resort.
    for tag_name in ('title', 'h1'):
        tag = soup.find(tag_name)
        if tag is not None:
            text = tag.text.strip()
            if text:
                return text

    return "Website Title"
def get_website_description(soup):
    """Return a short description for a parsed page.

    Candidates are tried in order: ``<meta name="description">``,
    ``<meta property="og:description">``, then the first ``<p>``. A
    candidate that is missing or blank is skipped rather than returned.

    Args:
        soup: Parsed document exposing a BeautifulSoup-style ``find``.

    Returns:
        The stripped description text, or ``"Website description"`` when
        no candidate yields any text.
    """
    meta_candidates = (
        soup.find('meta', {'name': 'description'}),
        soup.find('meta', property='og:description'),
    )
    for meta in meta_candidates:
        if meta is None:
            continue
        # A meta tag can exist with an empty or absent ``content``; treat
        # that as "no description here" and keep looking.
        text = (meta.get('content') or '').strip()
        if text:
            return text

    # Fallback to the first paragraph of the page.
    first_p = soup.find('p')
    if first_p is not None:
        text = first_p.text.strip()
        if text:
            return text

    return "Website description"
def get_important_links(soup, base_url):
    """Collect navigation and footer links from a parsed page.

    Links found inside ``<nav>``/``<header>`` elements are tagged with
    section ``'Docs'``; links inside the first ``<footer>`` are tagged
    ``'Optional'``. Each absolute URL is reported once, in the order it is
    first encountered.

    Args:
        soup: Parsed document exposing BeautifulSoup-style ``find`` and
            ``find_all``.
        base_url: URL used to resolve relative ``href`` values.

    Returns:
        A list of dicts with the keys ``'title'``, ``'url'`` and
        ``'section'``.
    """
    links = []
    seen_urls = set()

    def collect(container, section):
        """Append every usable link inside *container* under *section*."""
        for a in container.find_all('a', href=True):
            href = a['href'].strip()
            # Same-page anchors ("#top", "#main-content") are not pages.
            if not href or href.startswith('#'):
                continue

            url = urljoin(base_url, href)
            if url in seen_urls:
                continue

            # urlparse lower-cases the scheme, so "JavaScript:" and
            # "MAILTO:" are rejected too. A scheme-less result (relative
            # base_url) is kept, as before.
            scheme = urlparse(url).scheme
            if scheme and scheme not in ('http', 'https'):
                continue

            # Collapse internal whitespace/newlines so the title fits on a
            # single markdown list line.
            title = ' '.join(a.text.split())
            if len(title) > 1:  # Avoid empty or single-character links
                links.append({
                    'title': title,
                    'url': url,
                    'section': section,
                })
                seen_urls.add(url)

    # Look for navigation links
    for nav in soup.find_all(['nav', 'header']):
        collect(nav, 'Docs')

    # Look for footer links
    footer = soup.find('footer')
    if footer is not None:
        collect(footer, 'Optional')

    return links
def generate_llms_txt(url):
    """Fetch *url* and render an llms.txt document describing it.

    The page is downloaded, its title, description and navigation/footer
    links are extracted, and the result is laid out as markdown: an H1
    title, a blockquote description, then a ``## Docs`` and a
    ``## Optional`` link list (each only when it has entries).

    Args:
        url: Address of the page to summarise. A value without a scheme
            (e.g. ``"example.com"``) is fetched over HTTPS.

    Returns:
        The llms.txt content, or a message starting with
        ``"Error generating llms.txt: "`` if anything goes wrong. Failures
        are reported in-band; this function does not raise.
    """
    try:
        url = (url or '').strip()
        # Users commonly paste a bare host name; default it to HTTPS
        # instead of failing on the missing scheme.
        if url and '://' not in url:
            url = f"https://{url}"

        # Fetch the webpage
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }
        response = requests.get(url, headers=headers, timeout=10)
        response.raise_for_status()

        # Parse the raw bytes rather than response.text: without a charset
        # in the Content-Type header, requests falls back to ISO-8859-1 for
        # text responses, which garbles UTF-8 pages. Only pass the header's
        # encoding on when the server actually declared one; otherwise the
        # parser detects it from the document itself.
        content_type = response.headers.get('Content-Type', '')
        declared_encoding = (
            response.encoding if 'charset=' in content_type.lower() else None
        )
        soup = BeautifulSoup(
            response.content, 'html.parser', from_encoding=declared_encoding
        )

        # Get base components. Whitespace is collapsed so the heading and
        # the blockquote each stay on a single markdown line.
        title = ' '.join(get_website_title(soup).split()) or "Website Title"
        description = ' '.join(get_website_description(soup).split())
        # Relative links must be resolved against the final URL, which
        # differs from the requested one after a redirect.
        links = get_important_links(soup, response.url)

        # Each entry in ``sections`` is one blank-line-separated block.
        sections = [f"# {title}"]
        if description:
            sections.append(f"> {description}")

        doc_lines = [
            f"- [{link['title']}]({link['url']}): Documentation page"
            for link in links
            if link['section'] == 'Docs'
        ]
        if doc_lines:
            sections.append("## Docs\n\n" + "\n".join(doc_lines))

        optional_lines = [
            f"- [{link['title']}]({link['url']})"
            for link in links
            if link['section'] == 'Optional'
        ]
        if optional_lines:
            sections.append("## Optional\n\n" + "\n".join(optional_lines))

        return "\n\n".join(sections) + "\n"
    except Exception as e:
        # Boundary for the UI: every failure (network, HTTP status,
        # parsing) is turned into a displayable message.
        return f"Error generating llms.txt: {str(e)}"
def save_llms_txt(content, save_path="llms.txt"):
    """Write *content* to *save_path* as UTF-8 text and report the outcome.

    Args:
        content: Text to write.
        save_path: Destination file; an existing file is overwritten.

    Returns:
        ``"Successfully saved to <save_path>"`` on success, otherwise
        ``"Error saving file: <reason>"``. Failures are reported in the
        returned message rather than raised.
    """
    try:
        with open(save_path, mode='w', encoding='utf-8') as handle:
            handle.write(content)
        status = f"Successfully saved to {save_path}"
    except Exception as err:
        status = f"Error saving file: {err}"
    return status
# Gradio callback: generate the llms.txt content and optionally save it
def process_url(url, save_to_file=False):
    """Generate llms.txt content for *url* and optionally write it to disk.

    Args:
        url: Address of the website to summarise.
        save_to_file: When true, the generated content is written with
            ``save_llms_txt`` (to its default path).

    Returns:
        A ``(content, status)`` tuple: the generated text (or the error
        message from ``generate_llms_txt``) and a message saying whether
        the file was saved.
    """
    content = generate_llms_txt(url)
    if not save_to_file:
        return content, "File not saved (checkbox not selected)"

    # generate_llms_txt reports failures in-band as a message with this
    # prefix. Writing that message to llms.txt and announcing success
    # would leave a bogus file behind, so skip the save.
    if content.startswith("Error generating llms.txt:"):
        return content, "File not saved (generation failed)"

    return content, save_llms_txt(content)
# Assemble the Gradio UI around process_url
def _build_interface():
    """Construct the Gradio interface: URL + save checkbox in, text + status out."""
    url_box = gr.Textbox(label="Website URL", placeholder="Enter the website URL...")
    save_box = gr.Checkbox(label="Save to file", value=False)

    content_box = gr.Textbox(label="Generated llms.txt Content", lines=10)
    status_box = gr.Textbox(label="Status")

    # Each example row is (url, save_to_file), matching the input order.
    sample_rows = [
        ["https://example.com", False],
        ["https://docs.python.org", True],
    ]

    return gr.Interface(
        fn=process_url,
        inputs=[url_box, save_box],
        outputs=[content_box, status_box],
        title="llms.txt Generator",
        description="Generate an llms.txt file from a website following the specification. The tool extracts relevant information and creates a structured markdown file suitable for LLMs.",
        examples=sample_rows,
        theme=gr.themes.Soft(),
    )


iface = _build_interface()

# Launch the app only when run as a script, not on import
if __name__ == "__main__":
    iface.launch()