Spaces:

WordLift
/

create-llms-txt

Running

App Files Community

cyberandy committed 21 days ago

Commit

f21d84e

•

1 Parent(s): e81ffaf

Update app.py

Browse files

Files changed (1) hide show

app.py +38 -16

app.py CHANGED Viewed

@@ -150,39 +150,61 @@ class WebsiteCrawler:
         if not self.url_metadata:
             return "No content was found to generate llms.txt"
-        # Sort and filter URLs
-        sorted_urls = sorted(
             self.url_metadata.items(),
             key=lambda x: (x[1]['importance'], x[0]),
             reverse=True
-        )
         # Generate content
         content = []
-        main_metadata = sorted_urls[0][1]
-        content.append(f"# {main_metadata['title']}")
-        if main_metadata['description']:
-            content.append(f"\n> {main_metadata['description']}")
         # Group by category
         categories = defaultdict(list)
-        seen_titles = set()
         for url, metadata in sorted_urls:
-            title = metadata['title']
-            if title not in seen_titles:
                 categories[metadata['category']].append((url, metadata))
-                seen_titles.add(title)
         # Add sections
         for category in ['Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional']:
             if category in categories:
-                content.append(f"\n## {category}")
                 for url, metadata in categories[category]:
-                    if metadata['description']:
-                        content.append(f"\n- [{metadata['title']}]({url}): {metadata['description']}")
                     else:
-                        content.append(f"\n- [{metadata['title']}]({url})")
         return "\n".join(content)

         if not self.url_metadata:
             return "No content was found to generate llms.txt"
+        # Sort URLs by importance and remove duplicates
+        sorted_urls = []
+        seen_titles = set()
+        for url, metadata in sorted(
             self.url_metadata.items(),
             key=lambda x: (x[1]['importance'], x[0]),
             reverse=True
+        ):
+            if metadata['title'] not in seen_titles:
+                sorted_urls.append((url, metadata))
+                seen_titles.add(metadata['title'])
+        if not sorted_urls:
+            return "No valid content was found"
         # Generate content
         content = []
+        # Find the best title for the main header
+        main_titles = [
+            metadata['title'] for _, metadata in sorted_urls
+            if 'overview' in metadata['title'].lower() or
+               'welcome' in metadata['title'].lower() or
+               'introduction' in metadata['title'].lower()
+        ]
+        main_title = main_titles[0] if main_titles else sorted_urls[0][1]['title']
+        content.append(f"# {main_title}")
+        # Find a good description for the blockquote
+        descriptions = [
+            metadata['description'] for _, metadata in sorted_urls
+            if metadata['description'] and len(metadata['description']) > 20
+        ]
+        if descriptions:
+            content.append(f"\n> {descriptions[0]}")
         # Group by category
         categories = defaultdict(list)
         for url, metadata in sorted_urls:
+            if metadata['title'] and url:  # Ensure we have both title and URL
                 categories[metadata['category']].append((url, metadata))
         # Add sections
         for category in ['Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional']:
             if category in categories:
+                content.append(f"\n## {category}\n")
                 for url, metadata in categories[category]:
+                    title = metadata['title'].strip()
+                    desc = metadata['description'].strip() if metadata['description'] else ""
+                    if desc:
+                        content.append(f"- [{title}]({url}): {desc}")
                     else:
+                        content.append(f"- [{title}]({url})")
         return "\n".join(content)