import gradio as gr
import requests
from bs4 import BeautifulSoup
import re
from urllib.parse import urljoin, urlparse
import asyncio
from collections import defaultdict
import unicodedata
import logging

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)


class WebsiteCrawler:
    def __init__(self, max_depth=3, max_pages=50):
        self.max_depth = max_depth
        self.max_pages = max_pages
        self.visited_urls = set()
        self.url_metadata = defaultdict(dict)
        self.homepage_metadata = None
        self.headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
        }

    def determine_category_importance(self, url, title, desc):
        """Improved category detection"""
        url_lower = url.lower()
        path = urlparse(url).path.lower()

        # Homepage
        if path == "/" or path == "":
            return "Main", 10

        # Documentation and Features
        if any(x in url_lower for x in ['/docs', '/documentation', '/features', '/pricing']):
            return "Documentation", 8
        # API
        elif any(x in url_lower for x in ['/api', '/developer', 'developers']):
            return "API", 8
        # About/Company
        elif any(x in url_lower for x in ['/about', '/company', '/partners', '/stories']):
            return "About", 7
        # News and Updates
        elif any(x in url_lower for x in ['/news', '/blog', '/releases', '/academy']):
            return "News", 5
        # Tools and Features
        elif any(x in url_lower for x in ['/tools', '/features', '/website', '/keyword']):
            return "Tools", 6

        return "Optional", 1

    def clean_text(self, text, is_title=False):
        """Improved text cleaning"""
        if not text or len(text.strip()) < 2:
            return ""

        # Normalize unicode characters
        text = unicodedata.normalize("NFKD", text)
        text = re.sub(r"[^\x00-\x7F]+", "", text)

        # Remove any template variables/placeholders
        text = re.sub(r'\{\{.*?\}\}', '', text)
        text = re.sub(r'\{\%.*?\%\}', '', text)
        text = re.sub(r'\${.*?\}', '', text)

        if is_title:
            # Remove common suffixes and fragments for titles
            text = re.sub(r'^\s*Welcome to\s+', '', text)
            text = re.sub(r'\s*[\|\-#:•].*', '', text)
            text = re.sub(r'\s+Homepage$', '', text, flags=re.IGNORECASE)

            # Handle overly generic titles
            if text.lower() in ['features', 'home', 'homepage', 'welcome']:
                return ""

        # Only return if we have meaningful text
        cleaned = " ".join(text.split()).strip()
        if len(cleaned.split()) < 2 and not is_title:  # Allow single-word titles
            return ""

        return cleaned

    async def process_homepage(self, url):
        """Specifically process the homepage to extract key metadata"""
        try:
            response = requests.get(url, headers=self.headers, timeout=10)
            response.encoding = "utf-8"
            soup = BeautifulSoup(response.text, "html.parser")

            # Extract site name with more fallbacks
            site_name = None

            # Try meta tags first
            site_meta = soup.find("meta", property="og:site_name")
            if site_meta and site_meta.get("content"):
                site_name = site_meta["content"]

            # Try structured data
            if not site_name:
                schema = soup.find("script", type="application/ld+json")
                if schema:
                    try:
                        import json

                        data = json.loads(schema.string)
                        if isinstance(data, dict):
                            site_name = data.get("name") or data.get("organizationName")
                    except Exception:
                        pass

            # Try title tag
            if not site_name:
                title_tag = soup.find("title")
                if title_tag:
                    site_name = title_tag.text.split('|')[0].strip()

            # Last resort - use domain name
            if not site_name:
                site_name = urlparse(url).netloc.split('.')[0].capitalize()

            # Get homepage description
            description = self.extract_homepage_description(soup)

            self.homepage_metadata = {
                "site_name": self.clean_text(site_name, is_title=True),
                "description": description,
            }

        except Exception as e:
            logger.error(f"Error processing homepage {url}: {str(e)}")
            self.homepage_metadata = {
                "site_name": urlparse(url).netloc.split('.')[0].capitalize(),
                "description": None,
            }

    def clean_description(self, desc):
        """Clean description text"""
        if not desc:
            return ""
        # Remove leading dashes, hyphens, or colons
        desc = re.sub(r"^[-:\s]+", "", desc)
        # Remove any strings that are just "Editors", "APIs", etc.
        if len(desc.split()) <= 1:
            return ""
        return desc.strip()

    def is_duplicate_content(self, desc, title, url):
        """Improved duplicate/translation detection"""
        if not desc or not title:
            return False

        # Skip common translation paths
        translation_indicators = [
            '/welcome', '/bienvenue', '/willkommen', '/benvenuto',
            '/tervetuloa', '/bienvenido', '/velkommen', '/welkom',
        ]
        if any(indicator in url.lower() for indicator in translation_indicators):
            return True

        # Check for similar content length and patterns
        for existing_metadata in self.url_metadata.values():
            existing_desc = existing_metadata.get("description", "")
            if not existing_desc:
                continue
            # If descriptions are very similar in length, likely a translation
            if (
                abs(len(desc) - len(existing_desc)) < 20
                and len(desc) > 50  # Only check substantial descriptions
            ):
                return True

        return False

    def extract_homepage_description(self, soup):
        """Extract description from homepage with multiple fallbacks"""
        # Try meta description first
        meta_desc = soup.find("meta", {"name": "description"})
        if meta_desc and meta_desc.get("content"):
            desc = meta_desc["content"]
            if desc and len(desc.strip()) > 20:
                return self.clean_text(desc)

        # Try OpenGraph description
        og_desc = soup.find("meta", property="og:description")
        if og_desc and og_desc.get("content"):
            desc = og_desc["content"]
            if desc and len(desc.strip()) > 20:
                return self.clean_text(desc)

        # Try first significant paragraph
        for p in soup.find_all("p"):
            text = p.get_text().strip()
            if len(text) > 50 and not any(
                x in text.lower() for x in ["cookie", "accept", "privacy"]
            ):
                return self.clean_text(text)

        # Try main content area if it exists
        main = soup.find("main")
        if main:
            first_p = main.find("p")
            if first_p:
                text = first_p.get_text().strip()
                if len(text) > 50:
                    return self.clean_text(text)

        return None

    async def crawl_page(self, url, depth, base_domain):
        """Crawl a single page and extract information"""
        if (
            depth > self.max_depth
            or url in self.visited_urls
            or len(self.visited_urls) >= self.max_pages
        ):
            return []

        try:
            await asyncio.sleep(1)  # Be polite to servers
            response = requests.get(url, headers=self.headers, timeout=10)
            response.encoding = "utf-8"
            self.visited_urls.add(url)

            soup = BeautifulSoup(response.text, "html.parser")

            # Extract title with fallbacks
            title = None
            meta_title = soup.find("meta", property="og:title")
            if meta_title and meta_title.get("content"):
                title = meta_title["content"]
            if not title:
                title_tag = soup.find("title")
                if title_tag:
                    title = title_tag.text
            if not title:
                h1_tag = soup.find("h1")
                if h1_tag:
                    title = h1_tag.text
            if not title:
                title = url.split("/")[-1]

            title = self.clean_text(title, is_title=True)

            # Extract description with fallbacks
            desc = None
            meta_desc = soup.find("meta", {"name": "description"})
            if meta_desc and meta_desc.get("content"):
                desc = meta_desc["content"]
            if not desc:
                og_desc = soup.find("meta", property="og:description")
                if og_desc and og_desc.get("content"):
                    desc = og_desc["content"]
            if not desc:
                first_p = soup.find("p")
                if first_p:
                    desc = first_p.text

            desc = self.clean_text(desc) if desc else ""

            # Skip if it's duplicate content
            if self.is_duplicate_content(desc, title, url):
                return []

            # Determine category and importance
            category, importance = self.determine_category_importance(url, title, desc)

            # Store metadata
            clean_url = re.sub(r"#.*", "", url).rstrip("/")
            if title and len(title.strip()) > 0:  # Only store if we have a valid title
                self.url_metadata[clean_url] = {
                    "title": title,
                    "description": desc,
                    "category": category,
                    "importance": importance,
                }

            # Find links
            links = []
            for a in soup.find_all("a", href=True):
                href = a["href"]
                if not any(
                    x in href.lower()
                    for x in ["javascript:", "mailto:", ".pdf", ".jpg", ".png", ".gif"]
                ):
                    next_url = urljoin(url, href)
                    if urlparse(next_url).netloc == base_domain:
                        links.append(next_url)

            return links

        except Exception as e:
            logger.error(f"Error crawling {url}: {str(e)}")
            return []

    async def crawl_website(self, start_url):
        """Crawl website starting from the given URL"""
        # First process the homepage
        await self.process_homepage(start_url)

        base_domain = urlparse(start_url).netloc
        queue = [(start_url, 0)]
        seen = {start_url}

        while queue and len(self.visited_urls) < self.max_pages:
            current_url, depth = queue.pop(0)

            if depth > self.max_depth:
                continue

            links = await self.crawl_page(current_url, depth, base_domain)

            for link in links:
                if link not in seen and urlparse(link).netloc == base_domain:
                    seen.add(link)
                    queue.append((link, depth + 1))

    def generate_llms_txt(self):
        """Generate llms.txt content"""
        if not self.url_metadata:
            return "No content was found to generate llms.txt"

        # Sort URLs by importance and remove duplicates
        sorted_urls = []
        seen_titles = set()
        for url, metadata in sorted(
            self.url_metadata.items(),
            key=lambda x: (x[1]["importance"], x[0]),
            reverse=True,
        ):
            if metadata["title"] not in seen_titles:
                sorted_urls.append((url, metadata))
                seen_titles.add(metadata["title"])

        if not sorted_urls:
            return "No valid content was found"

        # Generate content
        content = []

        # Use homepage metadata for main title and description
        main_title = self.homepage_metadata.get("site_name", "Welcome")
        homepage_description = self.homepage_metadata.get("description")

        content.append(f"# {main_title}")
        if homepage_description:
            content.append(f"\n> {homepage_description}")
        elif len(sorted_urls) > 0:
            # Fall back to the first good description from crawled content
            for _, metadata in sorted_urls:
                desc = self.clean_description(metadata["description"])
                if desc and len(desc) > 20 and "null" not in desc.lower():
                    content.append(f"\n> {desc}")
                    break

        # Group by category
        categories = defaultdict(list)
        for url, metadata in sorted_urls:
            if metadata["title"] and url:
                categories[metadata["category"]].append((url, metadata))

        # Add sections in a logical order
        category_order = [
            "Main",
            "Documentation",
            "API",
            "Tools",
            "About",
            "News",
            "Optional",
        ]

        for category in category_order:
            if category in categories:
                content.append(f"\n## {category}")

                # Sort links within category by description length, then title
                category_links = sorted(
                    categories[category],
                    key=lambda x: (-len(x[1]["description"] or ""), x[1]["title"]),
                )

                links = []
                for url, metadata in category_links:
                    title = metadata["title"].strip()
                    desc = self.clean_description(metadata["description"])
                    if desc:
                        links.append(f"- [{title}]({url}): {desc}")
                    else:
                        links.append(f"- [{title}]({url})")

                content.append("\n".join(links))

        return "\n".join(content)


async def process_url(url, max_depth, max_pages):
    """Process URL and generate llms.txt"""
    try:
        # Add https:// if not present
        if not url.startswith(("http://", "https://")):
            url = "https://" + url

        # Validate URL
        result = urlparse(url)
        if not all([result.scheme, result.netloc]):
            return "", "Invalid URL format. Please enter a valid URL."

        # Process website
        crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
        await crawler.crawl_website(url)

        content = crawler.generate_llms_txt()
        return content, f"Successfully crawled {len(crawler.visited_urls)} pages."

    except Exception as e:
        return "", f"Error: {str(e)}"


# Create Gradio interface
theme = gr.themes.Soft(primary_hue="blue", font="Open Sans")

with gr.Blocks(
    theme=theme,
    css="""
    @import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');

    .gradio-container { font-family: 'Open Sans', sans-serif !important; }
    .gr-button { font-family: 'Open Sans', sans-serif !important; font-weight: 600 !important; }
    .primary-btn { background-color: #2436d4 !important; color: white !important; }
    .primary-btn:hover { background-color: #1c2aa8 !important; }
    [data-testid="textbox"] { font-family: 'Open Sans', sans-serif !important; }
    .gr-padded { font-family: 'Open Sans', sans-serif !important; }
    .gr-input { font-family: 'Open Sans', sans-serif !important; }
    .gr-label { font-family: 'Open Sans', sans-serif !important; }
    """,
) as iface:
    gr.Markdown("# llms.txt Generator")
    gr.Markdown("Generate an llms.txt file from a website following the specification.")

    with gr.Row():
        url_input = gr.Textbox(
            label="Website URL",
            placeholder="Enter the website URL (e.g., example.com)",
            info="The URL will be automatically prefixed with https:// if not provided",
        )

    with gr.Row():
        with gr.Column():
            depth_input = gr.Slider(
                minimum=1, maximum=5, value=3, step=1, label="Maximum Crawl Depth"
            )
        with gr.Column():
            pages_input = gr.Slider(
                minimum=10, maximum=100, value=50, step=10, label="Maximum Pages"
            )

    generate_btn = gr.Button("Generate llms.txt", variant="primary")

    output = gr.Textbox(
        label="Generated llms.txt Content",
        lines=20,
        show_copy_button=True,
        container=True,
    )

    status = gr.Textbox(label="Status")

    generate_btn.click(
        fn=lambda url, depth, pages: asyncio.run(process_url(url, depth, pages)),
        inputs=[url_input, depth_input, pages_input],
        outputs=[output, status],
    )

if __name__ == "__main__":
    iface.launch()
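
# --- Example: programmatic use without the Gradio UI ---
# A minimal sketch, kept as comments so it never runs on import; "example.com"
# is a placeholder URL and the depth/page limits are arbitrary:
#
#     content, status = asyncio.run(process_url("example.com", max_depth=2, max_pages=20))
#     print(status)
#     print(content)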