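# Gradio app: crawl a website breadth-first and generate llms.txt-style
# summary content from the pages it finds, grouped by category.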
import gradio as gr
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin, urlparse
import asyncio
import aiohttp
from collections import defaultdict
import unicodedata
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class WebsiteCrawler:
    def __init__(self, max_depth=3, max_pages=50):
        self.max_depth = max_depth
        self.max_pages = max_pages
        self.visited_urls = set()
        self.url_metadata = defaultdict(dict)
        self.homepage_metadata = None
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
        }
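
    # Map a URL to a coarse section label plus an importance score; anything
    # that doesn't match a known path pattern falls into "Optional".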
    def determine_category_importance(self, url, title, desc):
        url_lower = url.lower()
        path = urlparse(url).path.lower()
        if path == "/" or path == "":
            return "Main", 10
        if any(x in url_lower for x in ["/docs", "/faq", "/help"]):
            return "Documentation", 8
        elif any(x in url_lower for x in ["/api", "/developer"]):
            return "API", 8
        elif any(x in url_lower for x in ["/about", "/company", "/contact"]):
            return "About", 7
        elif any(x in url_lower for x in ["/news", "/blog", "/events"]):
            return "News", 5
        elif any(x in url_lower for x in ["/tools", "/pricing"]):
            return "Tools", 6
        return "Optional", 1
    def clean_text(self, text, is_title=False):
        if not text:
            return ""
        text = unicodedata.normalize("NFKD", text)
        text = re.sub(r"[^\x00-\x7F]+", "", text)
        text = " ".join(text.split()).strip()
        if is_title:
            text = re.sub(r"^\s*Welcome to\s+", "", text)
        return text
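
    # Fetch one page, record its title/description/category, and return the
    # same-domain links found on it so the caller can keep crawling.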
    async def crawl_page(self, url, depth, base_domain):
        if (
            depth > self.max_depth
            or url in self.visited_urls
            or len(self.visited_urls) >= self.max_pages
        ):
            return []
        try:
            async with aiohttp.ClientSession(
                timeout=aiohttp.ClientTimeout(total=20)
            ) as session:
                async with session.get(
                    url, headers=self.headers, allow_redirects=True
                ) as response:
                    if response.status != 200:
                        return []
                    text = await response.text()
                    self.visited_urls.add(url)
                    soup = BeautifulSoup(text, "html.parser")
                    title_tag = soup.find("title")
                    title = (
                        self.clean_text(title_tag.text)
                        if title_tag
                        else url.split("/")[-1]
                    )
                    desc_tag = soup.find("meta", {"name": "description"})
                    desc = (
                        self.clean_text(desc_tag["content"])
                        if desc_tag and desc_tag.get("content")
                        else ""
                    )
                    category, importance = self.determine_category_importance(
                        url, title, desc
                    )
                    self.url_metadata[url] = {
                        "title": title,
                        "description": desc,
                        "category": category,
                        "importance": importance,
                    }
                    links = []
                    for a in soup.find_all("a", href=True):
                        next_url = urljoin(url, a["href"])
                        if urlparse(next_url).netloc == base_domain:
                            links.append(next_url)
                    return links
        except Exception as e:
            logger.error(f"Error crawling {url}: {str(e)}")
            return []
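
    # Fetch the start URL separately to capture the site name and meta
    # description used for the llms.txt header.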
    async def process_homepage(self, url):
        try:
            async with aiohttp.ClientSession(
                timeout=aiohttp.ClientTimeout(total=20)
            ) as session:
                async with session.get(
                    url, headers=self.headers, allow_redirects=True
                ) as response:
                    if response.status != 200:
                        return
                    text = await response.text()
                    soup = BeautifulSoup(text, "html.parser")
                    site_name = (
                        soup.find("title").text.split("|")[0].strip()
                        if soup.find("title")
                        else urlparse(url).netloc
                    )
                    description = soup.find("meta", {"name": "description"})
                    description = (
                        description["content"].strip()
                        if description and description.get("content")
                        else None
                    )
                    self.homepage_metadata = {
                        "site_name": self.clean_text(site_name, is_title=True),
                        "description": (
                            self.clean_text(description) if description else None
                        ),
                    }
        except Exception as e:
            logger.error(f"Error processing homepage {url}: {str(e)}")
    async def crawl_website(self, start_url):
        try:
            await self.process_homepage(start_url)
            base_domain = urlparse(start_url).netloc
            queue = [(start_url, 0)]
            seen = {start_url}
            while queue and len(self.visited_urls) < self.max_pages:
                current_url, depth = queue.pop(0)
                if depth > self.max_depth:
                    continue
                links = await self.crawl_page(current_url, depth, base_domain)
                for link in links:
                    if link not in seen:
                        seen.add(link)
                        queue.append((link, depth + 1))
        except Exception as e:
            logger.error(f"Error during crawl: {str(e)}")
            raise
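
    # Render the collected metadata as llms.txt: an H1 with the site name, a
    # block-quoted description, then one "## Section" heading per category.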
    def generate_llms_txt(self):
        if not self.url_metadata:
            return "No content available."
        content = []
        # Guard against a failed homepage fetch, in which case
        # homepage_metadata is still None or its description is None.
        homepage = self.homepage_metadata or {}
        homepage_title = homepage.get("site_name") or "Website"
        homepage_description = (
            homepage.get("description") or "No description available."
        )
        content.append(f"# {homepage_title}\n\n> {homepage_description}\n")
        categories = defaultdict(list)
        for url, metadata in self.url_metadata.items():
            categories[metadata["category"]].append((url, metadata))
        category_order = [
            "Main",
            "Documentation",
            "API",
            "About",
            "News",
            "Tools",
            "Optional",
        ]
        for category in category_order:
            if category in categories:
                content.append(f"## {category}")
                for url, metadata in categories[category]:
                    content.append(
                        f"- [{metadata['title']}]({url}): {metadata['description']}"
                    )
        return "\n".join(content)
async def process_url(url, max_depth, max_pages):
    try:
        if not url.startswith(("http://", "https://")):
            url = "https://" + url
        result = urlparse(url)
        if not result.scheme or not result.netloc:
            return "", "Invalid URL format. Please enter a valid URL."
        crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
        await crawler.crawl_website(url)
        content = crawler.generate_llms_txt()
        return content, f"Successfully crawled {len(crawler.visited_urls)} pages."
    except Exception as e:
        logger.error(f"Error processing URL {url}: {str(e)}")
        return "", f"Error: {str(e)}"


# Gradio interface
theme = gr.themes.Soft(primary_hue="blue", font="Open Sans")

with gr.Blocks(theme=theme) as iface:
    with gr.Row():
        gr.Markdown("## Website Crawler - Generate llms.txt")
    with gr.Row():
        url_input = gr.Textbox(
            label="Website URL",
            placeholder="Enter the website URL (e.g., example.com)",
            info="The URL will be automatically prefixed with https:// if not provided",
            lines=1,
        )
    with gr.Row():
        depth_input = gr.Slider(
            minimum=1, maximum=5, value=3, step=1, label="Maximum Crawl Depth"
        )
        pages_input = gr.Slider(
            minimum=10, maximum=100, value=50, step=10, label="Maximum Pages"
        )
    with gr.Row():
        generate_btn = gr.Button("Generate llms.txt", variant="primary")
    with gr.Row():
        output = gr.Textbox(
            label="Generated llms.txt Content",
            lines=15,
            show_copy_button=True,
            container=True,
        )
    with gr.Row():
        status = gr.Textbox(label="Status", interactive=False)

    # Pass the asynchronous function directly; Gradio supports async event handlers.
    generate_btn.click(
        fn=process_url,
        inputs=[url_input, depth_input, pages_input],
        outputs=[output, status],
    )

if __name__ == "__main__":
    iface.launch()
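
# Example (assumed usage, not part of the app): the crawler can also be
# exercised without the Gradio UI, e.g.
#   crawler = WebsiteCrawler(max_depth=2, max_pages=20)
#   asyncio.run(crawler.crawl_website("https://example.com"))
#   print(crawler.generate_llms_txt())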