Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -69,57 +69,21 @@ class WebsiteCrawler:
|
|
69 |
url_lower = url.lower()
|
70 |
path = urlparse(url).path.lower()
|
71 |
|
72 |
-
# Check for case studies and success stories
|
73 |
-
if any(x in url_lower for x in ['case-study', 'success-story']):
|
74 |
-
return "Case Studies", 7
|
75 |
-
|
76 |
-
# Check for product/service pages
|
77 |
-
if any(x in title.lower() for x in ['service', 'product', 'solution']):
|
78 |
-
return "Services", 6
|
79 |
-
|
80 |
-
# Keep existing categories but adjust priorities
|
81 |
if path == "/" or path == "":
|
82 |
return "Main", 10
|
|
|
|
|
83 |
elif any(x in url_lower for x in ['/api', '/developer']):
|
84 |
return "API", 8
|
85 |
elif any(x in url_lower for x in ['/about', '/contact']):
|
86 |
return "About", 7
|
87 |
-
elif any(x in
|
88 |
-
return "
|
|
|
|
|
89 |
|
90 |
return "Optional", 1
|
91 |
|
92 |
-
def clean_text(self, text, is_title=False):
|
93 |
-
"""Improved text cleaning"""
|
94 |
-
if not text or len(text.strip()) < 2:
|
95 |
-
return ""
|
96 |
-
|
97 |
-
text = super().clean_text(text, is_title)
|
98 |
-
|
99 |
-
# Remove broken/malformed text
|
100 |
-
if len(re.findall(r'[a-zA-Z]', text)) < 10: # If less than 10 letters
|
101 |
-
return ""
|
102 |
-
|
103 |
-
# Clean up title specifically
|
104 |
-
if is_title:
|
105 |
-
# Remove company name if it's redundant
|
106 |
-
text = re.sub(r'\s*[-|]\s*.*?$', '', text)
|
107 |
-
# Remove generic suffixes
|
108 |
-
text = re.sub(r'\s+Homepage$', '', text, flags=re.IGNORECASE)
|
109 |
-
|
110 |
-
return text
|
111 |
-
|
112 |
-
# Update category order in generate_llms_txt
|
113 |
-
category_order = [
|
114 |
-
"Main",
|
115 |
-
"Services",
|
116 |
-
"API",
|
117 |
-
"About",
|
118 |
-
"Case Studies",
|
119 |
-
"News",
|
120 |
-
"Optional"
|
121 |
-
]
|
122 |
-
|
123 |
def is_duplicate_content(self, desc, title, url):
|
124 |
"""Improved duplicate/translation detection"""
|
125 |
if not desc or not title:
|
|
|
69 |
url_lower = url.lower()
|
70 |
path = urlparse(url).path.lower()
|
71 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
72 |
if path == "/" or path == "":
|
73 |
return "Main", 10
|
74 |
+
elif any(x in url_lower for x in ['/docs', '/documentation', '/guide', '/manual']):
|
75 |
+
return "Documentation", 8
|
76 |
elif any(x in url_lower for x in ['/api', '/developer']):
|
77 |
return "API", 8
|
78 |
elif any(x in url_lower for x in ['/about', '/contact']):
|
79 |
return "About", 7
|
80 |
+
elif any(x in path for x in ['.html', '.md', '.txt', '/']):
|
81 |
+
return "Content", 4
|
82 |
+
elif any(x in url_lower for x in ['/blog', '/news', '/article']):
|
83 |
+
return "Blog", 5
|
84 |
|
85 |
return "Optional", 1
|
86 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
87 |
def is_duplicate_content(self, desc, title, url):
|
88 |
"""Improved duplicate/translation detection"""
|
89 |
if not desc or not title:
|