cyberandy committed on
Commit
fa155f9
·
verified ·
1 Parent(s): d469446

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +6 -42
app.py CHANGED
@@ -69,57 +69,21 @@ class WebsiteCrawler:
69
  url_lower = url.lower()
70
  path = urlparse(url).path.lower()
71
 
72
- # Check for case studies and success stories
73
- if any(x in url_lower for x in ['case-study', 'success-story']):
74
- return "Case Studies", 7
75
-
76
- # Check for product/service pages
77
- if any(x in title.lower() for x in ['service', 'product', 'solution']):
78
- return "Services", 6
79
-
80
- # Keep existing categories but adjust priorities
81
  if path == "/" or path == "":
82
  return "Main", 10
 
 
83
  elif any(x in url_lower for x in ['/api', '/developer']):
84
  return "API", 8
85
  elif any(x in url_lower for x in ['/about', '/contact']):
86
  return "About", 7
87
- elif any(x in url_lower for x in ['/news', '/blog', '/update']):
88
- return "News", 4
 
 
89
 
90
  return "Optional", 1
91
 
92
- def clean_text(self, text, is_title=False):
93
- """Improved text cleaning"""
94
- if not text or len(text.strip()) < 2:
95
- return ""
96
-
97
- text = super().clean_text(text, is_title)
98
-
99
- # Remove broken/malformed text
100
- if len(re.findall(r'[a-zA-Z]', text)) < 10: # If less than 10 letters
101
- return ""
102
-
103
- # Clean up title specifically
104
- if is_title:
105
- # Remove company name if it's redundant
106
- text = re.sub(r'\s*[-|]\s*.*?$', '', text)
107
- # Remove generic suffixes
108
- text = re.sub(r'\s+Homepage$', '', text, flags=re.IGNORECASE)
109
-
110
- return text
111
-
112
- # Update category order in generate_llms_txt
113
- category_order = [
114
- "Main",
115
- "Services",
116
- "API",
117
- "About",
118
- "Case Studies",
119
- "News",
120
- "Optional"
121
- ]
122
-
123
  def is_duplicate_content(self, desc, title, url):
124
  """Improved duplicate/translation detection"""
125
  if not desc or not title:
 
69
  url_lower = url.lower()
70
  path = urlparse(url).path.lower()
71
 
 
 
 
 
 
 
 
 
 
72
  if path == "/" or path == "":
73
  return "Main", 10
74
+ elif any(x in url_lower for x in ['/docs', '/documentation', '/guide', '/manual']):
75
+ return "Documentation", 8
76
  elif any(x in url_lower for x in ['/api', '/developer']):
77
  return "API", 8
78
  elif any(x in url_lower for x in ['/about', '/contact']):
79
  return "About", 7
80
+ elif any(x in path for x in ['.html', '.md', '.txt', '/']):
81
+ return "Content", 4
82
+ elif any(x in url_lower for x in ['/blog', '/news', '/article']):
83
+ return "Blog", 5
84
 
85
  return "Optional", 1
86
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
87
  def is_duplicate_content(self, desc, title, url):
88
  """Improved duplicate/translation detection"""
89
  if not desc or not title: