cyberandy committed · verified
Commit eccd1e5 · Parent: fe2936f

Update app.py

Files changed (1): app.py (+26 −8)
app.py CHANGED

```diff
@@ -258,6 +258,7 @@ class WebsiteCrawler:
         # Store metadata
         clean_url = re.sub(r"#.*", "", url).rstrip("/")
         if title and len(title.strip()) > 0:  # Only store if we have a valid title
+            logger.info(f"Storing metadata for {clean_url}: {title[:30]}...")
             self.url_metadata[clean_url] = {
                 "title": title,
                 "description": desc,
@@ -383,13 +384,16 @@ class WebsiteCrawler:
 
     def generate_llms_txt(self):
         """Generate llms.txt content"""
+        logger.info(f"Starting generate_llms_txt with {len(self.url_metadata)} URLs")
+
         if not self.url_metadata:
+            logger.error("No URL metadata found")
             return "No content was found to generate llms.txt"
-
+
         # Sort URLs by importance and remove duplicates
         sorted_urls = []
         seen_titles = set()
-
+
         for url, metadata in sorted(
             self.url_metadata.items(),
             key=lambda x: (x[1]["importance"], x[0]),
@@ -398,17 +402,23 @@ class WebsiteCrawler:
             if metadata["title"] not in seen_titles:
                 sorted_urls.append((url, metadata))
                 seen_titles.add(metadata["title"])
-
+
+        logger.info(f"Found {len(sorted_urls)} unique URLs after deduplication")
+
         if not sorted_urls:
+            logger.error("No valid URLs found after sorting")
             return "No valid content was found"
-
+
         # Generate content
         content = []
-
+
         # Use homepage metadata for main title and description
         main_title = self.homepage_metadata.get("site_name", "Welcome")
         homepage_description = self.homepage_metadata.get("description")
-
+
+        logger.info(f"Homepage title: {main_title}")
+        logger.info(f"Homepage description: {homepage_description}")
+
         content.append(f"# {main_title}")
         if homepage_description:
             content.append(f"\n> {homepage_description}")
@@ -419,13 +429,15 @@ class WebsiteCrawler:
             if desc and len(desc) > 20 and "null" not in desc.lower():
                 content.append(f"\n> {desc}")
                 break
-
+
         # Group by category
         categories = defaultdict(list)
         for url, metadata in sorted_urls:
             if metadata["title"] and url:
                 categories[metadata["category"]].append((url, metadata))
-
+
+        logger.info(f"Categories found: {list(categories.keys())}")
+
         # Add sections in a logical order
         category_order = [
             "Main",
@@ -441,10 +453,12 @@ class WebsiteCrawler:
         if "Main" in categories:
             main_content = categories["Main"]
             if len(main_content) == 1 and main_content[0][1]["description"] == homepage_description:
+                logger.info("Removing duplicate Main content")
                 del categories["Main"]
 
         for category in category_order:
             if category in categories and categories[category]:
+                logger.info(f"Processing category {category} with {len(categories[category])} items")
                 content.append(f"\n## {category}")
 
                 # Sort links within category by importance and description length
@@ -470,6 +484,10 @@ class WebsiteCrawler:
                 links.append(f"- [{title}]({url})")
 
                 content.append("\n".join(links))
+
+        final_content = "\n".join(content)
+        logger.info(f"Generated content length: {len(final_content)}")
+        return final_content
 
 
```
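All of the new calls go through a module-level `logger`, which the diff assumes is already defined earlier in app.py; the commit itself adds only the `logger.info(...)` / `logger.error(...)` calls. A minimal sketch of the kind of setup those calls rely on (the logger name and format string here are assumptions, not taken from the repo):

```python
import logging

# Assumed setup near the top of app.py -- this commit only adds the
# logging calls shown in the diff, not the configuration itself.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s %(message)s",
)
logger = logging.getLogger(__name__)
```

With a setup along these lines, the added instrumentation traces each stage of generate_llms_txt (stored metadata, deduplicated URL count, detected categories, per-category processing, and final content length), which makes an empty or truncated llms.txt output much easier to diagnose.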