update
app.py
CHANGED
@@ -28,121 +28,167 @@ class WebsiteCrawler:
            "Accept-Encoding": "gzip, deflate, br",
            "DNT": "1",
            "Connection": "keep-alive",
            "Upgrade-Insecure-Requests": "1",
        }

    def determine_category_importance(self, url, title, desc):
        """Improved category detection"""
        url_lower = url.lower()
        path = urlparse(url).path.lower()

        # Homepage
        if path == "/" or path == "":
            return "Main", 10

        # Documentation and Help
        if any(
            x in url_lower
            for x in [
                "/docs",
                "/documentation",
                "/faq",
                "/help",
                "frequently-asked-questions",
            ]
        ):
            return "Documentation", 8

        # API and Developer
        elif any(x in url_lower for x in ["/api", "/developer", "developers"]):
            return "API", 8

        # About/Company pages
        elif any(
            x in url_lower
            for x in [
                "/about",
                "/company",
                "/references",
                "/work-with-us",
                "careers",
                "/team",
                "/contact",
                "/about-us",
            ]
        ):
            return "About", 7

        # News and Events
        elif any(
            x in url_lower
            for x in [
                "/news",
                "/blog",
                "/events",
                "/press",
                "research",
                "power-of",
                "latest",
            ]
        ):
            return "News", 5

        # Tools and Services
        elif any(
            x in url_lower
            for x in [
                "/tools",
                "/quote",
                "/pricing",
                "/services",
                "/translate",
                "/order",
                "/buy",
            ]
        ):
            return "Tools", 6

        # Check if URL path contains non-ASCII or percent-encoded characters
        if bool(re.search(r"[^\x00-\x7F]", path)) or bool(
            re.search(r"%[0-9A-F]{2}", path)
        ):
            return "Optional", 0

        return "Optional", 1

    def is_duplicate_content(self, desc, title, url):
        """Improved duplicate/translation detection"""
        if not desc or not title:
            return False

        # Skip non-latin character URLs or URLs with percent-encoded non-ASCII
        if bool(re.search(r"[^\x00-\x7F]", url)) or bool(
            re.search(r"%[0-9A-F]{2}", url)
        ):
            return True

        # Skip common translation paths
        translation_indicators = [
            "/welcome",
            "/bienvenue",
            "/willkommen",
            "/benvenuto",
            "/tervetuloa",
            "/bienvenido",
            "/velkommen",
            "/welkom",
            "translate.com/",
            "/translate/",
            "/translation/",
        ]
        if any(indicator in url.lower() for indicator in translation_indicators):
            url_path = urlparse(url).path.lower()
            if url_path != "/":  # Don't skip homepage
                return True

        # Check for similar content length and patterns
        for existing_metadata in self.url_metadata.values():
            existing_desc = existing_metadata.get("description", "")
            existing_title = existing_metadata.get("title", "")
            if not existing_desc or not existing_title:
                continue

            # If descriptions are very similar in length, likely a translation
            if (
                abs(len(desc) - len(existing_desc)) < 20
                and len(desc) > 50
                and desc != existing_desc
            ):  # Allow exact duplicates for main page
                return True

        return False

    def clean_text(self, text, is_title=False):
        """Improved text cleaning"""
        if not text or len(text.strip()) < 2:
            return ""

        # Normalize unicode characters
        text = unicodedata.normalize("NFKD", text)
        text = re.sub(r"[^\x00-\x7F]+", "", text)

        # Remove any template variables/placeholders
        text = re.sub(r"\{\{.*?\}\}", "", text)
        text = re.sub(r"\{\%.*?\%\}", "", text)
        text = re.sub(r"\${.*?\}", "", text)

        if is_title:
            # Remove common suffixes and fragments for titles
            text = re.sub(r"^\s*Welcome to\s+", "", text)
            text = re.sub(r"\s*[\|\-#:•].*", "", text)
            text = re.sub(r"\s+Homepage$", "", text, flags=re.IGNORECASE)

            # Handle overly generic titles
            if text.lower() in ["features", "home", "homepage", "welcome"]:
                return ""

        # Only return if we have meaningful text
        cleaned = " ".join(text.split()).strip()
        if len(cleaned.split()) < 2 and not is_title:  # Allow single-word titles
            return ""

        return cleaned

    def clean_description(self, desc):
        """Clean description text"""
        if not desc:
@@ -154,7 +200,6 @@ class WebsiteCrawler:
            return ""
        return desc.strip()

    def extract_homepage_description(self, soup):
        """Extract description from homepage with multiple fallbacks"""
        # Try meta description first
@@ -174,7 +219,9 @@ class WebsiteCrawler:
        # Try first significant paragraph
        for p in soup.find_all("p"):
            text = p.get_text().strip()
            if len(text) > 50 and not any(
                x in text.lower() for x in ["cookie", "accept", "privacy"]
            ):
                return self.clean_text(text)

        # Try main content area if exists
@@ -196,18 +243,22 @@ class WebsiteCrawler:
            or len(self.visited_urls) >= self.max_pages
        ):
            return []

        try:
            await asyncio.sleep(1)  # Be polite to servers
            async with aiohttp.ClientSession() as session:
                async with session.get(
                    url, headers=self.headers, allow_redirects=True
                ) as response:
                    if response.status == 403:
                        # Try with alternative headers
                        alt_headers = {
                            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
                            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                        }
                        async with session.get(
                            url, headers=alt_headers, allow_redirects=True
                        ) as retry_response:
                            if retry_response.status != 200:
                                return []
                            text = await retry_response.text()
@@ -215,10 +266,10 @@ class WebsiteCrawler:
                        return []
                    else:
                        text = await response.text()

                    self.visited_urls.add(url)
                    soup = BeautifulSoup(text, "html.parser")

                    # Extract title with fallbacks
                    title = None
                    meta_title = soup.find("meta", property="og:title")
@@ -234,9 +285,9 @@ class WebsiteCrawler:
                            title = h1_tag.text
                    if not title:
                        title = url.split("/")[-1]

                    title = self.clean_text(title, is_title=True)

                    # Extract description with fallbacks
                    desc = None
                    meta_desc = soup.find("meta", {"name": "description"})
@@ -250,44 +301,57 @@ class WebsiteCrawler:
                        first_p = soup.find("p")
                        if first_p:
                            desc = first_p.text

                    desc = self.clean_text(desc) if desc else ""

                    # Skip if it's duplicate content
                    if self.is_duplicate_content(desc, title, url):
                        return []

                    # Determine category and importance
                    category, importance = self.determine_category_importance(
                        url, title, desc
                    )

                    # Store metadata
                    clean_url = re.sub(r"#.*", "", url).rstrip("/")
                    if (
                        title and len(title.strip()) > 0
                    ):  # Only store if we have a valid title
                        logger.info(
                            f"Storing metadata for {clean_url}: {title[:30]}..."
                        )
                        self.url_metadata[clean_url] = {
                            "title": title,
                            "description": desc,
                            "category": category,
                            "importance": importance,
                        }

                    # Find links
                    links = []
                    for a in soup.find_all("a", href=True):
                        href = a["href"]
                        if not any(
                            x in href.lower()
                            for x in [
                                "javascript:",
                                "mailto:",
                                ".pdf",
                                ".jpg",
                                ".png",
                                ".gif",
                            ]
                        ):
                            next_url = urljoin(url, href)
                            if urlparse(next_url).netloc == base_domain:
                                links.append(next_url)
                    return links

        except Exception as e:
            logger.error(f"Error crawling {url}: {str(e)}")
            return []

    async def process_homepage(self, url):
        """Specifically process the homepage to extract key metadata"""
        try:
@@ -295,94 +359,103 @@ class WebsiteCrawler:
            ssl_context = ssl.create_default_context()
            ssl_context.check_hostname = False
            ssl_context.verify_mode = ssl.CERT_NONE

            connector = aiohttp.TCPConnector(ssl=ssl_context)
            timeout = aiohttp.ClientTimeout(total=30)

            async with aiohttp.ClientSession(
                connector=connector, timeout=timeout
            ) as session:
                async with session.get(
                    url, headers=self.headers, allow_redirects=True
                ) as response:
                    if response.status != 200:
                        raise Exception(
                            f"Failed to fetch homepage: status {response.status}"
                        )

                    try:
                        text = await response.text()
                    except UnicodeDecodeError:
                        text = await response.read()
                        text = text.decode("utf-8", errors="ignore")

                    soup = BeautifulSoup(text, "html.parser")

                    # Extract site name with more fallbacks
                    site_name = None
                    # Try meta tags first
                    site_meta = soup.find("meta", property="og:site_name")
                    if site_meta and site_meta.get("content"):
                        site_name = site_meta["content"]

                    # Try structured data
                    if not site_name:
                        schema = soup.find("script", type="application/ld+json")
                        if schema:
                            try:
                                import json

                                data = json.loads(schema.string)
                                if isinstance(data, dict):
                                    site_name = data.get("name") or data.get(
                                        "organizationName"
                                    )
                            except:
                                pass

                    # Try title tag
                    if not site_name:
                        title_tag = soup.find("title")
                        if title_tag:
                            site_name = title_tag.text.split("|")[0].strip()

                    # Last resort - use domain name
                    if not site_name:
                        site_name = urlparse(url).netloc.split(".")[0].capitalize()

                    # Get homepage description
                    description = self.extract_homepage_description(soup)

                    self.homepage_metadata = {
                        "site_name": self.clean_text(site_name, is_title=True),
                        "description": description,
                    }

        except Exception as e:
            logger.error(f"Error processing homepage {url}: {str(e)}")
            self.homepage_metadata = {
                "site_name": urlparse(url).netloc.split(".")[0].capitalize(),
                "description": None,
            }

    async def crawl_website(self, start_url):
        """Crawl website starting from the given URL"""
        try:
            # First process the homepage
            logger.info(f"Processing homepage: {start_url}")
            await self.process_homepage(start_url)

            base_domain = urlparse(start_url).netloc
            queue = [(start_url, 0)]
            seen = {start_url}

            while queue and len(self.visited_urls) < self.max_pages:
                current_url, depth = queue.pop(0)
                if depth > self.max_depth:
                    continue

                logger.info(f"Crawling page: {current_url} (depth: {depth})")
                links = await self.crawl_page(current_url, depth, base_domain)
                logger.info(f"Found {len(links)} links on {current_url}")

                for link in links:
                    if link not in seen and urlparse(link).netloc == base_domain:
                        seen.add(link)
                        queue.append((link, depth + 1))

            logger.info(f"Crawl completed. Visited {len(self.visited_urls)} pages")

        except Exception as e:
            logger.error(f"Error during crawl: {str(e)}")
            raise
@@ -390,15 +463,15 @@ class WebsiteCrawler:
    def generate_llms_txt(self):
        """Generate llms.txt content"""
        logger.info(f"Starting generate_llms_txt with {len(self.url_metadata)} URLs")

        if not self.url_metadata:
            logger.error("No URL metadata found")
            return "No content was found to generate llms.txt"

        # Sort URLs by importance and remove duplicates
        sorted_urls = []
        seen_titles = set()

        for url, metadata in sorted(
            self.url_metadata.items(),
            key=lambda x: (x[1]["importance"], x[0]),
@@ -407,23 +480,23 @@ class WebsiteCrawler:
            if metadata["title"] not in seen_titles:
                sorted_urls.append((url, metadata))
                seen_titles.add(metadata["title"])

        logger.info(f"Found {len(sorted_urls)} unique URLs after deduplication")

        if not sorted_urls:
            logger.error("No valid URLs found after sorting")
            return "No valid content was found"

        # Generate content
        content = []

        # Use homepage metadata for main title and description
        main_title = self.homepage_metadata.get("site_name", "Welcome")
        homepage_description = self.homepage_metadata.get("description")

        logger.info(f"Homepage title: {main_title}")
        logger.info(f"Homepage description: {homepage_description}")

        content.append(f"# {main_title}")
        if homepage_description:
            content.append(f"\n> {homepage_description}")
@@ -434,15 +507,15 @@ class WebsiteCrawler:
            if desc and len(desc) > 20 and "null" not in desc.lower():
                content.append(f"\n> {desc}")
                break

        # Group by category
        categories = defaultdict(list)
        for url, metadata in sorted_urls:
            if metadata["title"] and url:
                categories[metadata["category"]].append((url, metadata))

        logger.info(f"Categories found: {list(categories.keys())}")

        # Add sections in a logical order
        category_order = [
            "Main",
@@ -451,49 +524,53 @@ class WebsiteCrawler:
            "Tools",
            "About",
            "News",
            "Optional",
        ]

        # Only show Main section if it has content different from the homepage description
        if "Main" in categories:
            main_content = categories["Main"]
            if (
                len(main_content) == 1
                and main_content[0][1]["description"] == homepage_description
            ):
                logger.info("Removing duplicate Main content")
                del categories["Main"]

        for category in category_order:
            if category in categories and categories[category]:
                logger.info(
                    f"Processing category {category} with {len(categories[category])} items"
                )
                content.append(f"\n## {category}")

                # Sort links within category by importance and description length
                category_links = sorted(
                    categories[category],
                    key=lambda x: (-len(x[1]["description"] or ""), x[1]["title"]),
                )

                links = []
                seen_desc = set()  # Avoid duplicate descriptions within category
                for url, metadata in category_links:
                    title = metadata["title"].strip()
                    desc = self.clean_description(metadata["description"])

                    # Skip if description is duplicate within category
                    if desc in seen_desc:
                        continue
                    seen_desc.add(desc)

                    if desc:
                        links.append(f"- [{title}]({url}): {desc}")
                    else:
                        links.append(f"- [{title}]({url})")

                content.append("\n".join(links))

        final_content = "\n".join(content)
        logger.info(f"Generated content length: {len(final_content)}")
        return final_content


async def process_url(url, max_depth, max_pages):
@@ -509,11 +586,11 @@ async def process_url(url, max_depth, max_pages):
        return "", "Invalid URL format. Please enter a valid URL."

    logger.info(f"Starting crawl of {url}")

    # Process website
    crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
    await crawler.crawl_website(url)

    logger.info("Generating llms.txt content")
    content = crawler.generate_llms_txt()
@@ -606,4 +683,4 @@ with gr.Blocks(
    )

if __name__ == "__main__":
    iface.launch()
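For reference, a minimal sketch (not part of the commit) of driving the updated crawler directly. The WebsiteCrawler constructor keywords, determine_category_importance, crawl_website, and generate_llms_txt are the signatures visible in the hunks above; the example URLs, limits, and standalone-script framing are illustrative assumptions.

import asyncio

from app import WebsiteCrawler  # assumes app.py is importable as a module

crawler = WebsiteCrawler(max_depth=2, max_pages=30)

# Category/importance pairs produced by the reworked keyword checks (illustrative URLs):
print(crawler.determine_category_importance("https://example.com/", "", ""))            # ("Main", 10)
print(crawler.determine_category_importance("https://example.com/docs/intro", "", ""))  # ("Documentation", 8)
print(crawler.determine_category_importance("https://example.com/pricing", "", ""))     # ("Tools", 6)
print(crawler.determine_category_importance("https://example.com/misc/page", "", ""))   # ("Optional", 1)

# Full crawl plus llms.txt generation, mirroring what process_url() does behind the Gradio UI.
# The result is markdown-style text: "# <site name>", "> <description>", then "## <Category>"
# sections containing "- [title](url): description" entries.
async def main():
    await crawler.crawl_website("https://example.com")
    print(crawler.generate_llms_txt())

asyncio.run(main())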