import gradio as gr
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin, urlparse
import markdown
from concurrent.futures import ThreadPoolExecutor
import asyncio
from collections import defaultdict
import time
import logging
import unicodedata

# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
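
# WebsiteCrawler performs a breadth-first crawl of a single domain, collecting
# page text plus URL-derived metadata (title, description, keyword-based
# importance score and category) that later determine how entries are ordered
# in the generated llms.txt.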
class WebsiteCrawler:
    def __init__(self, max_depth=3, max_pages=50, timeout=30):
        self.max_depth = max_depth
        self.max_pages = max_pages
        self.timeout = timeout
        self.visited_urls = set()
        self.url_content = {}
        self.url_metadata = defaultdict(dict)
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
        }

    def normalize_text(self, text):
        """Normalize text to handle encoding issues"""
        if not text:
            return ""
        # Normalize unicode characters
        text = unicodedata.normalize('NFKD', text)
        # Replace special quotes and dashes with standard characters
        # (left/right double quotes, right single quote, em dash)
        text = (text.replace('\u201c', '"').replace('\u201d', '"')
                    .replace('\u2019', "'").replace('\u2014', '-'))
        # Remove any remaining non-ASCII characters
        text = text.encode('ascii', 'ignore').decode('ascii')
        # Clean up extra whitespace
        text = ' '.join(text.split())
        return text

    def is_valid_url(self, url, base_domain):
        """Check if URL is valid and belongs to the same domain"""
        try:
            parsed = urlparse(url)
            base_parsed = urlparse(base_domain)
            return (parsed.netloc == base_parsed.netloc and
                    parsed.scheme in ['http', 'https'] and
                    not url.endswith(('.pdf', '.jpg', '.png', '.gif', '.zip')))
        except Exception:
            return False

    def extract_content(self, soup):
        """Extract meaningful content from HTML"""
        # Remove script and style elements
        for element in soup(['script', 'style', 'nav', 'footer', 'header']):
            element.decompose()
        # Get main content
        main_content = (soup.find('main') or soup.find('article') or
                        soup.find('div', {'class': re.compile(r'content|main', re.I)}))
        if main_content:
            return self.normalize_text(main_content.get_text(strip=True))
        return self.normalize_text(soup.get_text(strip=True))

    def get_page_metadata(self, soup, url):
        """Extract metadata from the page"""
        metadata = {
            'title': None,
            'description': None,
            'importance': 0,
            'category': 'Optional'
        }

        # Title extraction with normalization
        title = (
            soup.find('meta', property='og:title')['content'] if soup.find('meta', property='og:title') else
            soup.find('title').text if soup.find('title') else
            soup.find('h1').text if soup.find('h1') else
            url.split('/')[-1]
        )
        metadata['title'] = self.normalize_text(title)

        # Description extraction with normalization
        description = (
            soup.find('meta', {'name': 'description'})['content'] if soup.find('meta', {'name': 'description'}) else
            soup.find('meta', property='og:description')['content'] if soup.find('meta', property='og:description') else
            ""
        )
        metadata['description'] = self.normalize_text(description)

        # Calculate importance based on various factors
        importance = 0
        if 'docs' in url.lower() or 'documentation' in url.lower():
            importance += 5
            metadata['category'] = 'Docs'
        if 'api' in url.lower():
            importance += 4
            metadata['category'] = 'API'
        if 'guide' in url.lower() or 'tutorial' in url.lower():
            importance += 3
            metadata['category'] = 'Guides'
        if 'example' in url.lower():
            importance += 2
            metadata['category'] = 'Examples'
        if 'blog' in url.lower():
            importance += 1
            metadata['category'] = 'Blog'
        metadata['importance'] = importance

        return metadata
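
    # Note: crawl_page and crawl_website are declared async so they can be
    # awaited from the Gradio handler, but the underlying requests.get call
    # is blocking, so pages are fetched one at a time.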
    async def crawl_page(self, url, depth, base_domain):
        """Crawl a single page and extract information"""
        if depth > self.max_depth or url in self.visited_urls or len(self.visited_urls) >= self.max_pages:
            return []

        try:
            response = requests.get(url, headers=self.headers, timeout=self.timeout)
            response.encoding = 'utf-8'  # Explicitly set encoding
            response.raise_for_status()
            self.visited_urls.add(url)

            soup = BeautifulSoup(response.text, 'html.parser')
            content = self.extract_content(soup)
            metadata = self.get_page_metadata(soup, url)

            self.url_content[url] = content
            self.url_metadata[url] = metadata

            # Find all links
            links = []
            for a in soup.find_all('a', href=True):
                next_url = urljoin(url, a['href'])
                if self.is_valid_url(next_url, base_domain):
                    links.append(next_url)
            return links
        except Exception as e:
            logger.error(f"Error crawling {url}: {str(e)}")
            return []

    async def crawl_website(self, start_url):
        """Crawl website starting from the given URL"""
        base_domain = start_url
        queue = [(start_url, 0)]
        seen = {start_url}

        while queue and len(self.visited_urls) < self.max_pages:
            current_url, depth = queue.pop(0)
            if depth > self.max_depth:
                continue

            links = await self.crawl_page(current_url, depth, base_domain)
            for link in links:
                if link not in seen:
                    seen.add(link)
                    queue.append((link, depth + 1))
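
    # The generated llms.txt is plain markdown: an H1 with the title of the
    # highest-ranked page, a blockquote with its description, then one H2
    # section per category ('Docs', 'API', 'Guides', 'Examples', 'Blog',
    # 'Optional') listing "- [title](url): description" entries sorted by
    # importance.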
    def generate_llms_txt(self):
        """Generate llms.txt content from crawled data"""
        # Sort URLs by importance
        sorted_urls = sorted(
            self.url_metadata.items(),
            key=lambda x: (x[1]['importance'], x[0]),
            reverse=True
        )

        # Group URLs by category
        categorized_urls = defaultdict(list)
        for url, metadata in sorted_urls:
            categorized_urls[metadata['category']].append((url, metadata))

        # Generate content
        content = []

        # Add main title and description
        if sorted_urls:
            main_metadata = sorted_urls[0][1]
            content.append(f"# {main_metadata['title']}\n")
            content.append(f"> {main_metadata['description']}\n")

        # Add categorized sections
        priority_order = ['Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional']
        for category in priority_order:
            if category in categorized_urls:
                content.append(f"\n## {category}\n")
                for url, metadata in categorized_urls[category]:
                    title = metadata['title']
                    desc = metadata['description']
                    if desc:
                        content.append(f"- [{title}]({url}): {desc[:100]}...\n")
                    else:
                        content.append(f"- [{title}]({url})\n")

        return "\n".join(content)

async def process_url(url, max_depth, max_pages):
    """Process URL and generate llms.txt"""
    try:
        # Add https:// if not present
        if not url.startswith(('http://', 'https://')):
            url = 'https://' + url

        # Validate URL format
        try:
            result = urlparse(url)
            if not all([result.scheme, result.netloc]):
                return "", "Invalid URL format. Please enter a valid URL."
        except Exception:
            return "", "Invalid URL format. Please enter a valid URL."

        # Create crawler and process
        crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
        await crawler.crawl_website(url)
        content = crawler.generate_llms_txt()

        return content, f"Successfully crawled {len(crawler.visited_urls)} pages. You can now copy the generated content."
    except Exception as e:
        return "", f"Error: {str(e)}"

# Create the Gradio interface with custom CSS for Open Sans font
css = """
@import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');

body, .gradio-container {
    font-family: 'Open Sans', sans-serif !important;
}

.gr-box {
    border-radius: 8px !important;
    border: 1px solid #e5e7eb !important;
}

.gr-button {
    font-family: 'Open Sans', sans-serif !important;
    font-weight: 600 !important;
}

.gr-input {
    font-family: 'Open Sans', sans-serif !important;
}
"""

# Create the Gradio interface
iface = gr.Interface(
    fn=lambda url, max_depth, max_pages: asyncio.run(process_url(url, max_depth, max_pages)),
    inputs=[
        gr.Textbox(
            label="Website URL",
            placeholder="Enter the website URL (e.g., example.com or https://example.com)",
            info="The URL will be automatically prefixed with https:// if no protocol is specified."
        ),
        gr.Slider(
            minimum=1,
            maximum=5,
            value=3,
            step=1,
            label="Maximum Crawl Depth",
            info="Higher values will result in more thorough but slower crawling"
        ),
        gr.Slider(
            minimum=10,
            maximum=100,
            value=50,
            step=10,
            label="Maximum Pages to Crawl",
            info="Higher values will result in more comprehensive but slower results"
        )
    ],
    outputs=[
        gr.Textbox(
            label="Generated llms.txt Content",
            lines=20,
            info="Copy this content to create your llms.txt file"
        ),
        gr.Textbox(label="Status")
    ],
    title="llms.txt Generator",
    description="Generate an llms.txt file from a website following the specification. The tool crawls the website and creates a structured markdown file suitable for LLMs.",
    theme=gr.themes.Soft(),
    css=css
)

# Launch the app
if __name__ == "__main__":
    iface.launch()
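
# Illustrative only: a minimal sketch of driving the crawler directly from
# Python, without the Gradio UI (the URL below is a placeholder, not part of
# this app):
#
#     crawler = WebsiteCrawler(max_depth=2, max_pages=10)
#     asyncio.run(crawler.crawl_website("https://example.com"))
#     print(crawler.generate_llms_txt())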