import gradio as gr
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin, urlparse
import markdown
from concurrent.futures import ThreadPoolExecutor
import asyncio
from collections import defaultdict
import time
import logging

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

class WebsiteCrawler:
    def __init__(self, max_depth=3, max_pages=50, timeout=30):
        self.max_depth = max_depth
        self.max_pages = max_pages
        self.timeout = timeout
        self.visited_urls = set()
        self.url_content = {}
        self.url_metadata = defaultdict(dict)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def is_valid_url(self, url, base_domain):
        """Check if URL is valid and belongs to the same domain"""
        try:
            parsed = urlparse(url)
            base_parsed = urlparse(base_domain)
            return (parsed.netloc == base_parsed.netloc and
                    parsed.scheme in ['http', 'https'] and
                    not url.endswith(('.pdf', '.jpg', '.png', '.gif', '.zip')))
        except Exception:
            return False

    def extract_content(self, soup):
        """Extract meaningful content from HTML"""
        # Remove script, style, and boilerplate elements
        for element in soup(['script', 'style', 'nav', 'footer', 'header']):
            element.decompose()

        # Prefer the main content container if one can be identified
        main_content = soup.find('main') or soup.find('article') or soup.find('div', {'class': re.compile(r'content|main', re.I)})
        if main_content:
            return main_content.get_text(separator=' ', strip=True)
        return soup.get_text(separator=' ', strip=True)

    def get_page_metadata(self, soup, url):
        """Extract metadata from the page"""
        metadata = {
            'title': None,
            'description': None,
            'importance': 0,
            'category': 'Optional'
        }

        # Title extraction: prefer Open Graph, then <title>, then <h1>, then the URL slug
        og_title = soup.find('meta', property='og:title')
        title_tag = soup.find('title')
        h1_tag = soup.find('h1')
        if og_title and og_title.get('content'):
            metadata['title'] = og_title['content']
        elif title_tag:
            metadata['title'] = title_tag.get_text(strip=True)
        elif h1_tag:
            metadata['title'] = h1_tag.get_text(strip=True)
        else:
            metadata['title'] = url.split('/')[-1]

        # Description extraction: prefer the meta description, then Open Graph
        meta_desc = soup.find('meta', {'name': 'description'})
        og_desc = soup.find('meta', property='og:description')
        if meta_desc and meta_desc.get('content'):
            metadata['description'] = meta_desc['content']
        elif og_desc and og_desc.get('content'):
            metadata['description'] = og_desc['content']
        else:
            metadata['description'] = ""

        # Calculate importance and category from keywords in the URL
        importance = 0
        url_lower = url.lower()
        if 'docs' in url_lower or 'documentation' in url_lower:
            importance += 5
            metadata['category'] = 'Docs'
        if 'api' in url_lower:
            importance += 4
            metadata['category'] = 'API'
        if 'guide' in url_lower or 'tutorial' in url_lower:
            importance += 3
            metadata['category'] = 'Guides'
        if 'example' in url_lower:
            importance += 2
            metadata['category'] = 'Examples'
        if 'blog' in url_lower:
            importance += 1
            metadata['category'] = 'Blog'
        metadata['importance'] = importance

        return metadata

    async def crawl_page(self, url, depth, base_domain):
        """Crawl a single page and extract information"""
        if depth > self.max_depth or url in self.visited_urls or len(self.visited_urls) >= self.max_pages:
            return []

        try:
            # Note: requests.get is a blocking call, so pages are fetched one at a time
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.raise_for_status()
            self.visited_urls.add(url)

            soup = BeautifulSoup(response.text, 'html.parser')
            content = self.extract_content(soup)
            metadata = self.get_page_metadata(soup, url)

            self.url_content[url] = content
            self.url_metadata[url] = metadata

            # Collect all same-domain links for further crawling
            links = []
            for a in soup.find_all('a', href=True):
                next_url = urljoin(url, a['href'])
                if self.is_valid_url(next_url, base_domain):
                    links.append(next_url)

            return links
        except Exception as e:
            logger.error(f"Error crawling {url}: {str(e)}")
            return []

    async def crawl_website(self, start_url):
        """Crawl website starting from the given URL"""
        base_domain = start_url
        queue = [(start_url, 0)]
        seen = {start_url}

        while queue and len(self.visited_urls) < self.max_pages:
            current_url, depth = queue.pop(0)
            if depth > self.max_depth:
                continue

            links = await self.crawl_page(current_url, depth, base_domain)
            for link in links:
                if link not in seen:
                    seen.add(link)
                    queue.append((link, depth + 1))

    def generate_llms_txt(self):
        """Generate llms.txt content from crawled data"""
        # Sort URLs by importance
        sorted_urls = sorted(
            self.url_metadata.items(),
            key=lambda x: (x[1]['importance'], x[0]),
            reverse=True
        )

        # Group URLs by category
        categorized_urls = defaultdict(list)
        for url, metadata in sorted_urls:
            categorized_urls[metadata['category']].append((url, metadata))

        # Generate content
        content = []

        # Add main title and description
        if sorted_urls:
            main_metadata = sorted_urls[0][1]
            content.append(f"# {main_metadata['title']}\n")
            content.append(f"> {main_metadata['description']}\n")

        # Add categorized sections
        priority_order = ['Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional']
        for category in priority_order:
            if category in categorized_urls:
                content.append(f"\n## {category}\n")
                for url, metadata in categorized_urls[category]:
                    title = metadata['title']
                    desc = metadata['description']
                    if desc:
                        content.append(f"- [{title}]({url}): {desc[:100]}...\n")
                    else:
                        content.append(f"- [{title}]({url})\n")

        return "\n".join(content)

def save_llms_txt(content, save_path="llms.txt"):
    """Save the generated content to a file"""
    try:
        with open(save_path, 'w', encoding='utf-8') as f:
            f.write(content)
        return f"Successfully saved to {save_path}"
    except Exception as e:
        return f"Error saving file: {str(e)}"

async def process_url(url, max_depth, max_pages, save_to_file=False):
    """Process URL and generate llms.txt"""
    try:
        crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
        await crawler.crawl_website(url)
        content = crawler.generate_llms_txt()

        if save_to_file:
            save_message = save_llms_txt(content)
            return content, f"Crawled {len(crawler.visited_urls)} pages. {save_message}"
        return content, f"Crawled {len(crawler.visited_urls)} pages. File not saved (checkbox not selected)"
    except Exception as e:
        return "", f"Error: {str(e)}"

# Custom CSS for the Open Sans font
css = """
@import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');

body, .gradio-container {
    font-family: 'Open Sans', sans-serif !important;
}

.gr-box {
    border-radius: 8px !important;
    border: 1px solid #e5e7eb !important;
}

.gr-button {
    font-family: 'Open Sans', sans-serif !important;
    font-weight: 600 !important;
}
"""

# Create the Gradio interface
iface = gr.Interface(
    # process_url is a coroutine, so drive it to completion inside the handler
    fn=lambda url, max_depth, max_pages, save: asyncio.run(process_url(url, max_depth, max_pages, save)),
    inputs=[
        gr.Textbox(label="Website URL", placeholder="Enter the website URL..."),
        gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Maximum Crawl Depth"),
        gr.Slider(minimum=10, maximum=100, value=50, step=10, label="Maximum Pages to Crawl"),
        gr.Checkbox(label="Save to file", value=False)
    ],
    outputs=[
        gr.Textbox(label="Generated llms.txt Content", lines=20),
        gr.Textbox(label="Status")
    ],
    title="llms.txt Generator",
    description="Generate an llms.txt file from a website following the specification. The tool crawls the website and creates a structured markdown file suitable for LLMs.",
    examples=[
        ["https://example.com", 3, 50, False],
        ["https://docs.python.org", 3, 50, True]
    ],
    theme=gr.themes.Soft(),
    css=css
)

# Launch the app
if __name__ == "__main__":
    iface.launch()