Spaces:

WordLift
/

create-llms-txt

Running

App Files Files Community

create-llms-txt / app.py

cyberandy

Create app.py

5e3183d verified 21 days ago

raw

history blame

5.32 kB

	import gradio as gr
	import requests
	from bs4 import BeautifulSoup
	import re
	from urllib.parse import urljoin, urlparse
	import markdown

	def get_website_title(soup):
	    """Return the best available title for a parsed page.

	    Candidates are tried in order: the ``og:title`` meta tag, the
	    ``<title>`` element, then the first ``<h1>``. A candidate that is
	    missing or blank is skipped so the next one gets a chance.

	    Args:
	        soup: Parsed document exposing BeautifulSoup's ``find`` API.

	    Returns:
	        The first non-empty candidate with surrounding whitespace removed,
	        or the placeholder ``"Website Title"`` when none is usable.
	    """
	    # Open Graph title first. Read the attribute with .get() instead of
	    # indexing: an og:title tag without ``content`` would raise KeyError.
	    meta_title = soup.find('meta', property='og:title')
	    if meta_title is not None:
	        og_title = (meta_title.get('content') or '').strip()
	        if og_title:
	            return og_title

	    # Then the regular <title> tag, then the first <h1>; skip blank ones
	    # so an empty <title></title> does not yield an empty heading.
	    for tag_name in ('title', 'h1'):
	        tag = soup.find(tag_name)
	        if tag is not None:
	            text = tag.text.strip()
	            if text:
	                return text

	    return "Website Title"

	def get_website_description(soup):
	    """Return a short description for a parsed page.

	    Candidates are tried in order: ``<meta name="description">``,
	    ``<meta property="og:description">``, then the text of the first
	    ``<p>``. A candidate that is missing or blank is skipped.

	    Args:
	        soup: Parsed document exposing BeautifulSoup's ``find`` API.

	    Returns:
	        The first non-empty candidate with surrounding whitespace removed,
	        or the placeholder ``"Website description"`` when none is usable.
	    """
	    # A meta tag may exist but carry no (or an empty) ``content``; keep
	    # looking in that case instead of returning an empty description.
	    for attrs in ({'name': 'description'}, {'property': 'og:description'}):
	        meta_desc = soup.find('meta', attrs=attrs)
	        if meta_desc is not None:
	            description = (meta_desc.get('content') or '').strip()
	            if description:
	                return description

	    # Fallback to first paragraph
	    first_p = soup.find('p')
	    if first_p is not None:
	        paragraph = first_p.text.strip()
	        if paragraph:
	            return paragraph

	    return "Website description"

	def get_important_links(soup, base_url):
	    """Collect navigation and footer links from a parsed page.

	    Anchors found inside ``<nav>`` and ``<header>`` elements are tagged
	    with section ``'Docs'``; anchors inside the first ``<footer>`` are
	    tagged ``'Optional'``. Each absolute URL is reported at most once, and
	    navigation links win over footer links because they are scanned first.

	    Args:
	        soup: Parsed document exposing BeautifulSoup's ``find``/``find_all``.
	        base_url: URL used to resolve relative ``href`` values.

	    Returns:
	        A list of dicts with keys ``'title'``, ``'url'`` and ``'section'``,
	        in document order within each group.
	    """
	    links = []
	    seen_urls = set()

	    # Look for navigation links
	    for container in soup.find_all(['nav', 'header']):
	        _collect_links(container, base_url, 'Docs', links, seen_urls)

	    # Look for footer links
	    footer = soup.find('footer')
	    if footer is not None:
	        _collect_links(footer, base_url, 'Optional', links, seen_urls)

	    return links


	def _collect_links(container, base_url, section, links, seen_urls):
	    """Append the usable, not-yet-seen anchors of *container* to *links*."""
	    for a in container.find_all('a', href=True):
	        url = urljoin(base_url, a['href'].strip())
	        # Scheme names are case-insensitive, so compare in lower case.
	        if url in seen_urls or url.lower().startswith(('javascript:', 'mailto:', 'tel:')):
	            continue
	        # Collapse newlines and indentation inside the anchor text so the
	        # title stays on one line when rendered as a markdown link.
	        title = ' '.join(a.text.split())
	        if len(title) > 1:  # Avoid empty or single-character links
	            links.append({
	                'title': title,
	                'url': url,
	                'section': section,
	            })
	            seen_urls.add(url)

	def generate_llms_txt(url):
	    """Fetch *url* and render an llms.txt-style markdown summary of the page.

	    The output holds an H1 with the page title, a blockquote with its
	    description, a ``## Docs`` list built from navigation/header links and,
	    when footer links exist, an ``## Optional`` list.

	    Args:
	        url: Address of the page. Surrounding whitespace is ignored and
	            ``https://`` is assumed when no scheme is given.

	    Returns:
	        The generated markdown, or a string starting with
	        ``"Error generating llms.txt: "`` if anything fails. Exceptions are
	        reported through that string rather than raised.
	    """
	    try:
	        # Tolerate pasted input: trim whitespace and assume HTTPS when the
	        # scheme is missing ("example.com" -> "https://example.com").
	        url = url.strip()
	        if url and '://' not in url:
	            url = f"https://{url}"

	        # Fetch the webpage
	        headers = {
	            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
	        }
	        response = requests.get(url, headers=headers, timeout=10)
	        response.raise_for_status()

	        # Parse the raw bytes instead of response.text: when the HTTP header
	        # names no charset, Requests' text decoding may pick a default that
	        # garbles UTF-8 pages, whereas the parser can honour the page's own
	        # <meta charset>. A charset that the header does declare is passed
	        # along as the preferred encoding.
	        content_type = response.headers.get('Content-Type', '')
	        header_charset = response.encoding if 'charset' in content_type.lower() else None
	        soup = BeautifulSoup(response.content, 'html.parser', from_encoding=header_charset)

	        # Get base components
	        title = get_website_title(soup)
	        description = get_website_description(soup)
	        # Resolve relative links against the final URL so redirects
	        # (http -> https, bare domain -> /en/, ...) do not skew them.
	        links = get_important_links(soup, response.url)

	        # Generate llms.txt content
	        content = [
	            f"# {title}\n",
	            f"> {description}\n",
	            "## Docs\n"
	        ]

	        # Add documentation links
	        doc_links = [link for link in links if link['section'] == 'Docs']
	        for link in doc_links:
	            content.append(f"- [{link['title']}]({link['url']}): Documentation page\n")

	        # Add optional links if present
	        optional_links = [link for link in links if link['section'] == 'Optional']
	        if optional_links:
	            content.append("\n## Optional\n")
	            for link in optional_links:
	                content.append(f"- [{link['title']}]({link['url']})\n")

	        # Join all content
	        llms_txt_content = "\n".join(content)

	        return llms_txt_content
	    except Exception as e:
	        # UI boundary: report the failure as text instead of raising.
	        return f"Error generating llms.txt: {str(e)}"

	def save_llms_txt(content, save_path="llms.txt"):
	    """Write *content* to *save_path* as UTF-8 and report the outcome.

	    Args:
	        content: Text to store.
	        save_path: Destination file; an existing file is overwritten.

	    Returns:
	        A status message: ``"Successfully saved to <path>"`` on success,
	        otherwise ``"Error saving file: <reason>"``. Failures are reported
	        through the message rather than raised.
	    """
	    try:
	        with open(save_path, 'w', encoding='utf-8') as out_file:
	            out_file.write(content)
	    except Exception as e:
	        return f"Error saving file: {str(e)}"
	    else:
	        return f"Successfully saved to {save_path}"

	# Callback wired into the Gradio interface
	def process_url(url, save_to_file=False):
	    """Generate llms.txt content for *url* and optionally save it to disk.

	    Args:
	        url: Address of the website to summarise.
	        save_to_file: When true, write the generated content via
	            ``save_llms_txt`` (default path).

	    Returns:
	        A ``(content, status)`` tuple: the generated text (or the error
	        message produced by ``generate_llms_txt``) and a message saying
	        whether a file was written.
	    """
	    content = generate_llms_txt(url)
	    if not save_to_file:
	        return content, "File not saved (checkbox not selected)"

	    # generate_llms_txt reports failures as text with this prefix; do not
	    # write an error message to disk and then announce a successful save.
	    if content.startswith("Error generating llms.txt:"):
	        return content, "File not saved (generation failed)"

	    return content, save_llms_txt(content)

	# Assemble the Gradio UI: two inputs (URL + save toggle) feed process_url,
	# whose (content, status) result fills the two output boxes.
	_url_input = gr.Textbox(label="Website URL", placeholder="Enter the website URL...")
	_save_toggle = gr.Checkbox(label="Save to file", value=False)
	_content_output = gr.Textbox(label="Generated llms.txt Content", lines=10)
	_status_output = gr.Textbox(label="Status")

	iface = gr.Interface(
	    fn=process_url,
	    inputs=[_url_input, _save_toggle],
	    outputs=[_content_output, _status_output],
	    title="llms.txt Generator",
	    description="Generate an llms.txt file from a website following the specification. The tool extracts relevant information and creates a structured markdown file suitable for LLMs.",
	    # Each example row is [url, save_to_file], matching the input order.
	    examples=[
	        ["https://example.com", False],
	        ["https://docs.python.org", True],
	    ],
	    theme=gr.themes.Soft(),
	)

	# Start the app only when this file is executed as a script.
	if __name__ == "__main__":
	    iface.launch()