Update app.py
app.py CHANGED
@@ -258,6 +258,7 @@ class WebsiteCrawler:
         # Store metadata
         clean_url = re.sub(r"#.*", "", url).rstrip("/")
         if title and len(title.strip()) > 0:  # Only store if we have a valid title
+            logger.info(f"Storing metadata for {clean_url}: {title[:30]}...")
             self.url_metadata[clean_url] = {
                 "title": title,
                 "description": desc,
@@ -383,13 +384,16 @@ class WebsiteCrawler:

     def generate_llms_txt(self):
         """Generate llms.txt content"""
+        logger.info(f"Starting generate_llms_txt with {len(self.url_metadata)} URLs")
+
         if not self.url_metadata:
+            logger.error("No URL metadata found")
             return "No content was found to generate llms.txt"
-
+
         # Sort URLs by importance and remove duplicates
         sorted_urls = []
         seen_titles = set()
-
+
         for url, metadata in sorted(
             self.url_metadata.items(),
             key=lambda x: (x[1]["importance"], x[0]),
@@ -398,17 +402,23 @@
             if metadata["title"] not in seen_titles:
                 sorted_urls.append((url, metadata))
                 seen_titles.add(metadata["title"])
-
+
+        logger.info(f"Found {len(sorted_urls)} unique URLs after deduplication")
+
         if not sorted_urls:
+            logger.error("No valid URLs found after sorting")
             return "No valid content was found"
-
+
         # Generate content
         content = []
-
+
         # Use homepage metadata for main title and description
         main_title = self.homepage_metadata.get("site_name", "Welcome")
         homepage_description = self.homepage_metadata.get("description")
-
+
+        logger.info(f"Homepage title: {main_title}")
+        logger.info(f"Homepage description: {homepage_description}")
+
         content.append(f"# {main_title}")
         if homepage_description:
             content.append(f"\n> {homepage_description}")
@@ -419,13 +429,15 @@
             if desc and len(desc) > 20 and "null" not in desc.lower():
                 content.append(f"\n> {desc}")
                 break
-
+
         # Group by category
         categories = defaultdict(list)
         for url, metadata in sorted_urls:
             if metadata["title"] and url:
                 categories[metadata["category"]].append((url, metadata))
-
+
+        logger.info(f"Categories found: {list(categories.keys())}")
+
         # Add sections in a logical order
         category_order = [
             "Main",
@@ -441,10 +453,12 @@
         if "Main" in categories:
             main_content = categories["Main"]
             if len(main_content) == 1 and main_content[0][1]["description"] == homepage_description:
+                logger.info("Removing duplicate Main content")
                 del categories["Main"]

         for category in category_order:
             if category in categories and categories[category]:
+                logger.info(f"Processing category {category} with {len(categories[category])} items")
                 content.append(f"\n## {category}")

                 # Sort links within category by importance and description length
@@ -470,6 +484,10 @@
                     links.append(f"- [{title}]({url})")

                 content.append("\n".join(links))
+
+        final_content = "\n".join(content)
+        logger.info(f"Generated content length: {len(final_content)}")
+        return final_content
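The logger.info and logger.error calls added above assume a module-level logger already exists in app.py; its configuration is not part of this diff. A minimal sketch of a setup that would make these calls work (the level, format, and use of basicConfig are assumptions, not taken from the commit):

import logging

# Hypothetical configuration; app.py's real logging setup is not shown in this diff.
logging.basicConfig(
    level=logging.INFO,  # INFO so the new logger.info(...) calls are emitted
    format="%(asctime)s %(levelname)s %(name)s: %(message)s",
)
logger = logging.getLogger(__name__)

With something like this in place, a crawl run traces each stage of generate_llms_txt: how many URLs survive deduplication, which categories were found, and the length of the final document, which helps diagnose runs that return "No content was found to generate llms.txt".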