# create-llms-txt / app.py
import gradio as gr
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin, urlparse
import aiohttp
from collections import defaultdict
import unicodedata
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
class WebsiteCrawler:
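    """Breadth-first website crawler that collects page metadata for llms.txt generation."""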
def __init__(self, max_depth=3, max_pages=50):
self.max_depth = max_depth
self.max_pages = max_pages
self.visited_urls = set()
self.url_metadata = defaultdict(dict)
self.homepage_metadata = None
self.headers = {
"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
}
def determine_category_importance(self, url, title, desc):
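        """Map a URL to an llms.txt section name and a relative importance score."""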
url_lower = url.lower()
path = urlparse(url).path.lower()
if path == "/" or path == "":
return "Main", 10
if any(x in url_lower for x in ["/docs", "/faq", "/help"]):
return "Documentation", 8
elif any(x in url_lower for x in ["/api", "/developer"]):
return "API", 8
elif any(x in url_lower for x in ["/about", "/company", "/contact"]):
return "About", 7
elif any(x in url_lower for x in ["/news", "/blog", "/events"]):
return "News", 5
elif any(x in url_lower for x in ["/tools", "/pricing"]):
return "Tools", 6
return "Optional", 1
def clean_text(self, text, is_title=False):
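        """Normalize text to ASCII, collapse whitespace, and strip boilerplate title prefixes."""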
if not text:
return ""
text = unicodedata.normalize("NFKD", text)
text = re.sub(r"[^\x00-\x7F]+", "", text)
text = " ".join(text.split()).strip()
if is_title:
text = re.sub(r"^\s*Welcome to\s+", "", text)
return text
async def crawl_page(self, url, depth, base_domain):
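        """Fetch a single page, record its metadata, and return same-domain links found on it."""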
if (
depth > self.max_depth
or url in self.visited_urls
or len(self.visited_urls) >= self.max_pages
):
return []
try:
async with aiohttp.ClientSession(
timeout=aiohttp.ClientTimeout(total=20)
) as session:
async with session.get(
url, headers=self.headers, allow_redirects=True
) as response:
if response.status != 200:
return []
text = await response.text()
self.visited_urls.add(url)
soup = BeautifulSoup(text, "html.parser")
title_tag = soup.find("title")
title = (
self.clean_text(title_tag.text)
if title_tag
else url.split("/")[-1]
)
desc_tag = soup.find("meta", {"name": "description"})
desc = (
self.clean_text(desc_tag["content"])
if desc_tag and desc_tag.get("content")
else ""
)
category, importance = self.determine_category_importance(
url, title, desc
)
self.url_metadata[url] = {
"title": title,
"description": desc,
"category": category,
"importance": importance,
}
links = []
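                    # Collect in-domain links to feed back into the crawl queue.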
for a in soup.find_all("a", href=True):
next_url = urljoin(url, a["href"])
if urlparse(next_url).netloc == base_domain:
links.append(next_url)
return links
except Exception as e:
logger.error(f"Error crawling {url}: {str(e)}")
return []
async def process_homepage(self, url):
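        """Extract the site name and meta description from the homepage."""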
try:
async with aiohttp.ClientSession(
timeout=aiohttp.ClientTimeout(total=20)
) as session:
async with session.get(
url, headers=self.headers, allow_redirects=True
) as response:
if response.status != 200:
return
text = await response.text()
soup = BeautifulSoup(text, "html.parser")
                    title_tag = soup.find("title")
                    site_name = (
                        title_tag.text.split("|")[0].strip()
                        if title_tag
                        else urlparse(url).netloc
                    )
description = soup.find("meta", {"name": "description"})
description = (
description["content"].strip()
if description and description.get("content")
else None
)
self.homepage_metadata = {
"site_name": self.clean_text(site_name, is_title=True),
"description": (
self.clean_text(description) if description else None
),
}
except Exception as e:
logger.error(f"Error processing homepage {url}: {str(e)}")
async def crawl_website(self, start_url):
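        """Breadth-first crawl starting at start_url, bounded by max_depth and max_pages."""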
try:
await self.process_homepage(start_url)
base_domain = urlparse(start_url).netloc
queue = [(start_url, 0)]
seen = {start_url}
while queue and len(self.visited_urls) < self.max_pages:
current_url, depth = queue.pop(0)
if depth > self.max_depth:
continue
links = await self.crawl_page(current_url, depth, base_domain)
for link in links:
if link not in seen:
seen.add(link)
queue.append((link, depth + 1))
except Exception as e:
logger.error(f"Error during crawl: {str(e)}")
raise
def generate_llms_txt(self):
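        """Render the collected metadata as llms.txt markdown, grouped by category."""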
        if not self.url_metadata:
            return "No content available."
        content = []
        # Fall back gracefully if the homepage could not be processed.
        homepage = self.homepage_metadata or {}
        homepage_title = homepage.get("site_name") or "Website"
        homepage_description = (
            homepage.get("description") or "No description available."
        )
        content.append(f"# {homepage_title}\n\n> {homepage_description}\n")
categories = defaultdict(list)
for url, metadata in self.url_metadata.items():
categories[metadata["category"]].append((url, metadata))
category_order = [
"Main",
"Documentation",
"API",
"About",
"News",
"Tools",
"Optional",
]
for category in category_order:
if category in categories:
content.append(f"## {category}")
                for url, metadata in categories[category]:
                    # Only append the description when one was actually found,
                    # avoiding entries that end in a dangling colon.
                    entry = f"- [{metadata['title']}]({url})"
                    if metadata["description"]:
                        entry += f": {metadata['description']}"
                    content.append(entry)
return "\n".join(content)
async def process_url(url, max_depth, max_pages):
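    """Validate the URL, run the crawler, and return (llms.txt content, status message)."""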
try:
if not url.startswith(("http://", "https://")):
url = "https://" + url
result = urlparse(url)
if not result.scheme or not result.netloc:
return "", "Invalid URL format. Please enter a valid URL."
crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
await crawler.crawl_website(url)
content = crawler.generate_llms_txt()
return content, f"Successfully crawled {len(crawler.visited_urls)} pages."
except Exception as e:
logger.error(f"Error processing URL {url}: {str(e)}")
return "", f"Error: {str(e)}"
# Gradio interface
theme = gr.themes.Soft(primary_hue="blue", font="Open Sans")
with gr.Blocks(theme=theme) as iface:
with gr.Row():
gr.Markdown("## Website Crawler - Generate llms.txt")
with gr.Row():
url_input = gr.Textbox(
label="Website URL",
placeholder="Enter the website URL (e.g., example.com)",
info="The URL will be automatically prefixed with https:// if not provided",
lines=1,
)
with gr.Row():
depth_input = gr.Slider(
minimum=1, maximum=5, value=3, step=1, label="Maximum Crawl Depth"
)
pages_input = gr.Slider(
minimum=10, maximum=100, value=50, step=10, label="Maximum Pages"
)
with gr.Row():
generate_btn = gr.Button("Generate llms.txt", variant="primary")
with gr.Row():
output = gr.Textbox(
label="Generated llms.txt Content",
lines=15,
show_copy_button=True,
container=True,
)
with gr.Row():
status = gr.Textbox(label="Status", interactive=False)
# Pass the asynchronous function directly
generate_btn.click(
fn=process_url,
inputs=[url_input, depth_input, pages_input],
outputs=[output, status],
)
if __name__ == "__main__":
    # Gradio runs async event handlers natively; launch() takes no asyncio_mode argument.
    iface.launch()