import gradio as gr
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin, urlparse
import asyncio
from collections import defaultdict
import unicodedata
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class WebsiteCrawler:
    def __init__(self, max_depth=3, max_pages=50):
        self.max_depth = max_depth
        self.max_pages = max_pages
        self.visited_urls = set()
        self.url_metadata = defaultdict(dict)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
        }

    def clean_text(self, text, is_title=False):
        """Clean and normalize text"""
        if not text:
            return ""
        # Normalize unicode characters
        text = unicodedata.normalize('NFKD', text)
        text = re.sub(r'[^\x00-\x7F]+', '', text)
        if is_title:
            # Remove common suffixes and fragments for titles
            text = re.sub(r'\s*[\|\-#:•].*', '', text)
            text = re.sub(r'^\s*Welcome to\s+', '', text)
            text = text.replace('docusaurus_skipToContent_fallback', '')
        return ' '.join(text.split()).strip()
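
    # Illustrative sketch of what clean_text produces (the sample strings below are
    # hypothetical, not taken from the app):
    #
    #   crawler = WebsiteCrawler()
    #   crawler.clean_text("Quickstart | Acme Docs", is_title=True)  # -> "Quickstart"
    #   crawler.clean_text("Café   guide ")                          # -> "Cafe guide"
    #
    # NFKD normalization splits an accented character into its base letter plus a
    # combining mark, and the non-ASCII filter then strips the mark, so "Café"
    # ends up as "Cafe".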

    async def crawl_page(self, url, depth, base_domain):
        """Crawl a single page and extract information"""
        if depth > self.max_depth or url in self.visited_urls or len(self.visited_urls) >= self.max_pages:
            return []
        try:
            # requests is blocking, but pages are crawled one at a time, so this is acceptable here
            response = requests.get(url, headers=self.headers, timeout=10)
            response.raise_for_status()  # skip pages that return an error status instead of indexing them
            response.encoding = 'utf-8'
            self.visited_urls.add(url)
            soup = BeautifulSoup(response.text, 'html.parser')

            # Extract title with fallbacks
            title = None
            meta_title = soup.find('meta', property='og:title')
            if meta_title and meta_title.get('content'):
                title = meta_title['content']
            if not title:
                title_tag = soup.find('title')
                if title_tag:
                    title = title_tag.text
            if not title:
                h1_tag = soup.find('h1')
                if h1_tag:
                    title = h1_tag.text
            if not title:
                title = url.split('/')[-1]
            title = self.clean_text(title, is_title=True)

            # Extract description with fallbacks
            desc = None
            meta_desc = soup.find('meta', {'name': 'description'})
            if meta_desc and meta_desc.get('content'):
                desc = meta_desc['content']
            if not desc:
                og_desc = soup.find('meta', property='og:description')
                if og_desc and og_desc.get('content'):
                    desc = og_desc['content']
            if not desc:
                first_p = soup.find('p')
                if first_p:
                    desc = first_p.text
            desc = self.clean_text(desc) if desc else ""

            # Determine category and importance
            url_lower = url.lower()
            category = 'Optional'
            importance = 0
            if 'docs' in url_lower or 'documentation' in url_lower:
                category = 'Docs'
                importance = 5
            elif 'api' in url_lower:
                category = 'API'
                importance = 4
            elif 'guide' in url_lower or 'tutorial' in url_lower:
                category = 'Guides'
                importance = 3
            elif 'example' in url_lower:
                category = 'Examples'
                importance = 2
            elif 'blog' in url_lower:
                category = 'Blog'
                importance = 1

            # Store metadata
            clean_url = re.sub(r'#.*', '', url).rstrip('/')
            if title and len(title.strip()) > 0:  # Only store if we have a valid title
                self.url_metadata[clean_url] = {
                    'title': title,
                    'description': desc,
                    'category': category,
                    'importance': importance
                }

            # Find links
            links = []
            for a in soup.find_all('a', href=True):
                href = a['href']
                if not any(x in href.lower() for x in ['javascript:', 'mailto:', '.pdf', '.jpg', '.png', '.gif']):
                    next_url = urljoin(url, href)
                    if urlparse(next_url).netloc == base_domain:
                        links.append(next_url)
            return links
        except Exception as e:
            logger.error(f"Error crawling {url}: {str(e)}")
            return []

    async def crawl_website(self, start_url):
        """Crawl website starting from the given URL"""
        base_domain = urlparse(start_url).netloc
        queue = [(start_url, 0)]
        seen = {start_url}
        while queue and len(self.visited_urls) < self.max_pages:
            current_url, depth = queue.pop(0)
            if depth > self.max_depth:
                continue
            links = await self.crawl_page(current_url, depth, base_domain)
            for link in links:
                if link not in seen and urlparse(link).netloc == base_domain:
                    seen.add(link)
                    queue.append((link, depth + 1))

    def generate_llms_txt(self):
        """Generate llms.txt content"""
        if not self.url_metadata:
            return "No content was found to generate llms.txt"

        # Sort and filter URLs
        sorted_urls = sorted(
            self.url_metadata.items(),
            key=lambda x: (x[1]['importance'], x[0]),
            reverse=True
        )

        # Generate content
        content = []
        main_metadata = sorted_urls[0][1]
        content.append(f"# {main_metadata['title']}")
        if main_metadata['description']:
            content.append(f"\n> {main_metadata['description']}")

        # Group by category
        categories = defaultdict(list)
        seen_titles = set()
        for url, metadata in sorted_urls:
            title = metadata['title']
            if title not in seen_titles:
                categories[metadata['category']].append((url, metadata))
                seen_titles.add(title)

        # Add sections
        for category in ['Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional']:
            if category in categories:
                content.append(f"\n## {category}")
                for url, metadata in categories[category]:
                    if metadata['description']:
                        content.append(f"\n- [{metadata['title']}]({url}): {metadata['description']}")
                    else:
                        content.append(f"\n- [{metadata['title']}]({url})")
        return "\n".join(content)


async def process_url(url, max_depth, max_pages):
    """Process URL and generate llms.txt"""
    try:
        # Add https:// if not present
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url

        # Validate URL
        result = urlparse(url)
        if not all([result.scheme, result.netloc]):
            return "", "Invalid URL format. Please enter a valid URL."

        # Process website
        crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
        await crawler.crawl_website(url)
        content = crawler.generate_llms_txt()
        return content, f"Successfully crawled {len(crawler.visited_urls)} pages."
    except Exception as e:
        return "", f"Error: {str(e)}"


# Create Gradio interface
theme = gr.themes.Soft(primary_hue="blue", font="Open Sans")

with gr.Blocks(theme=theme, css="""
    @import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');

    .gradio-container {
        font-family: 'Open Sans', sans-serif !important;
    }
    .gr-button {
        font-family: 'Open Sans', sans-serif !important;
        font-weight: 600 !important;
    }
    .primary-btn {
        background-color: #2436d4 !important;
        color: white !important;
    }
    .primary-btn:hover {
        background-color: #1c2aa8 !important;
    }
    [data-testid="textbox"] {
        font-family: 'Open Sans', sans-serif !important;
    }
    .gr-padded {
        font-family: 'Open Sans', sans-serif !important;
    }
    .gr-input {
        font-family: 'Open Sans', sans-serif !important;
    }
    .gr-label {
        font-family: 'Open Sans', sans-serif !important;
    }
""") as iface:
    gr.Markdown("# llms.txt Generator")
    gr.Markdown("Generate an llms.txt file from a website following the specification.")

    with gr.Row():
        url_input = gr.Textbox(
            label="Website URL",
            placeholder="Enter the website URL (e.g., example.com)",
            info="The URL will be automatically prefixed with https:// if not provided"
        )
    with gr.Row():
        with gr.Column():
            depth_input = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Maximum Crawl Depth")
        with gr.Column():
            pages_input = gr.Slider(minimum=10, maximum=100, value=50, step=10, label="Maximum Pages")

    generate_btn = gr.Button("Generate llms.txt", variant="primary")
    output = gr.Textbox(
        label="Generated llms.txt Content",
        lines=20,
        show_copy_button=True,
        container=True
    )
    status = gr.Textbox(label="Status")

    # Gradio can call async event handlers directly, so process_url is passed
    # as-is instead of being wrapped in asyncio.run.
    generate_btn.click(
        fn=process_url,
        inputs=[url_input, depth_input, pages_input],
        outputs=[output, status]
    )

if __name__ == "__main__":
    iface.launch()