Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -150,39 +150,61 @@ class WebsiteCrawler:
|
|
150 |
if not self.url_metadata:
|
151 |
return "No content was found to generate llms.txt"
|
152 |
|
153 |
-
# Sort and
|
154 |
-
sorted_urls =
|
|
|
|
|
|
|
155 |
self.url_metadata.items(),
|
156 |
key=lambda x: (x[1]['importance'], x[0]),
|
157 |
reverse=True
|
158 |
-
)
|
|
|
|
|
|
|
|
|
|
|
|
|
159 |
|
160 |
# Generate content
|
161 |
content = []
|
162 |
-
|
163 |
-
|
164 |
-
|
165 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
166 |
|
167 |
# Group by category
|
168 |
categories = defaultdict(list)
|
169 |
-
seen_titles = set()
|
170 |
-
|
171 |
for url, metadata in sorted_urls:
|
172 |
-
|
173 |
-
if title not in seen_titles:
|
174 |
categories[metadata['category']].append((url, metadata))
|
175 |
-
seen_titles.add(title)
|
176 |
|
177 |
# Add sections
|
178 |
for category in ['Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional']:
|
179 |
if category in categories:
|
180 |
-
content.append(f"\n## {category}")
|
181 |
for url, metadata in categories[category]:
|
182 |
-
|
183 |
-
|
|
|
|
|
184 |
else:
|
185 |
-
content.append(f"
|
186 |
|
187 |
return "\n".join(content)
|
188 |
|
|
|
150 |
if not self.url_metadata:
|
151 |
return "No content was found to generate llms.txt"
|
152 |
|
153 |
+
# Sort URLs by importance and remove duplicates
|
154 |
+
sorted_urls = []
|
155 |
+
seen_titles = set()
|
156 |
+
|
157 |
+
for url, metadata in sorted(
|
158 |
self.url_metadata.items(),
|
159 |
key=lambda x: (x[1]['importance'], x[0]),
|
160 |
reverse=True
|
161 |
+
):
|
162 |
+
if metadata['title'] not in seen_titles:
|
163 |
+
sorted_urls.append((url, metadata))
|
164 |
+
seen_titles.add(metadata['title'])
|
165 |
+
|
166 |
+
if not sorted_urls:
|
167 |
+
return "No valid content was found"
|
168 |
|
169 |
# Generate content
|
170 |
content = []
|
171 |
+
|
172 |
+
# Find the best title for the main header
|
173 |
+
main_titles = [
|
174 |
+
metadata['title'] for _, metadata in sorted_urls
|
175 |
+
if 'overview' in metadata['title'].lower() or
|
176 |
+
'welcome' in metadata['title'].lower() or
|
177 |
+
'introduction' in metadata['title'].lower()
|
178 |
+
]
|
179 |
+
|
180 |
+
main_title = main_titles[0] if main_titles else sorted_urls[0][1]['title']
|
181 |
+
content.append(f"# {main_title}")
|
182 |
+
|
183 |
+
# Find a good description for the blockquote
|
184 |
+
descriptions = [
|
185 |
+
metadata['description'] for _, metadata in sorted_urls
|
186 |
+
if metadata['description'] and len(metadata['description']) > 20
|
187 |
+
]
|
188 |
+
if descriptions:
|
189 |
+
content.append(f"\n> {descriptions[0]}")
|
190 |
|
191 |
# Group by category
|
192 |
categories = defaultdict(list)
|
|
|
|
|
193 |
for url, metadata in sorted_urls:
|
194 |
+
if metadata['title'] and url: # Ensure we have both title and URL
|
|
|
195 |
categories[metadata['category']].append((url, metadata))
|
|
|
196 |
|
197 |
# Add sections
|
198 |
for category in ['Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional']:
|
199 |
if category in categories:
|
200 |
+
content.append(f"\n## {category}\n")
|
201 |
for url, metadata in categories[category]:
|
202 |
+
title = metadata['title'].strip()
|
203 |
+
desc = metadata['description'].strip() if metadata['description'] else ""
|
204 |
+
if desc:
|
205 |
+
content.append(f"- [{title}]({url}): {desc}")
|
206 |
else:
|
207 |
+
content.append(f"- [{title}]({url})")
|
208 |
|
209 |
return "\n".join(content)
|
210 |
|