Update app.py
app.py CHANGED
@@ -258,6 +258,7 @@ class WebsiteCrawler:
         # Store metadata
         clean_url = re.sub(r"#.*", "", url).rstrip("/")
         if title and len(title.strip()) > 0:  # Only store if we have a valid title
+            logger.info(f"Storing metadata for {clean_url}: {title[:30]}...")
             self.url_metadata[clean_url] = {
                 "title": title,
                 "description": desc,
@@ -383,13 +384,16 @@ class WebsiteCrawler:

     def generate_llms_txt(self):
         """Generate llms.txt content"""
+        logger.info(f"Starting generate_llms_txt with {len(self.url_metadata)} URLs")
+
         if not self.url_metadata:
+            logger.error("No URL metadata found")
             return "No content was found to generate llms.txt"
-
+
         # Sort URLs by importance and remove duplicates
         sorted_urls = []
         seen_titles = set()
-
+
         for url, metadata in sorted(
             self.url_metadata.items(),
             key=lambda x: (x[1]["importance"], x[0]),
@@ -398,17 +402,23 @@
             if metadata["title"] not in seen_titles:
                 sorted_urls.append((url, metadata))
                 seen_titles.add(metadata["title"])
-
+
+        logger.info(f"Found {len(sorted_urls)} unique URLs after deduplication")
+
         if not sorted_urls:
+            logger.error("No valid URLs found after sorting")
             return "No valid content was found"
-
+
         # Generate content
         content = []
-
+
         # Use homepage metadata for main title and description
         main_title = self.homepage_metadata.get("site_name", "Welcome")
         homepage_description = self.homepage_metadata.get("description")
-
+
+        logger.info(f"Homepage title: {main_title}")
+        logger.info(f"Homepage description: {homepage_description}")
+
         content.append(f"# {main_title}")
         if homepage_description:
             content.append(f"\n> {homepage_description}")
@@ -419,13 +429,15 @@
             if desc and len(desc) > 20 and "null" not in desc.lower():
                 content.append(f"\n> {desc}")
                 break
-
+
         # Group by category
         categories = defaultdict(list)
         for url, metadata in sorted_urls:
             if metadata["title"] and url:
                 categories[metadata["category"]].append((url, metadata))
-
+
+        logger.info(f"Categories found: {list(categories.keys())}")
+
         # Add sections in a logical order
         category_order = [
             "Main",
@@ -441,10 +453,12 @@
         if "Main" in categories:
             main_content = categories["Main"]
             if len(main_content) == 1 and main_content[0][1]["description"] == homepage_description:
+                logger.info("Removing duplicate Main content")
                 del categories["Main"]

         for category in category_order:
             if category in categories and categories[category]:
+                logger.info(f"Processing category {category} with {len(categories[category])} items")
                 content.append(f"\n## {category}")

                 # Sort links within category by importance and description length
@@ -470,6 +484,10 @@
                     links.append(f"- [{title}]({url})")

                 content.append("\n".join(links))
+
+        final_content = "\n".join(content)
+        logger.info(f"Generated content length: {len(final_content)}")
+        return final_content
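The logger.info and logger.error calls added above assume a module-level logger already exists in app.py; its configuration is not part of this diff. A minimal sketch of a setup that would make these calls work (the level, format, and use of basicConfig are assumptions, not taken from the commit):

import logging

# Hypothetical configuration; app.py's real logging setup is not shown in this diff.
logging.basicConfig(
    level=logging.INFO,  # INFO so the new logger.info(...) calls are emitted
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
)
logger = logging.getLogger(__name__)

With something like this in place, a crawl run traces each stage of generate_llms_txt: how many URLs survive deduplication, which categories were found, and the length of the final document, which helps diagnose runs that return "No content was found to generate llms.txt".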