Spaces:

WordLift
/

create-llms-txt

Running

App Files Community

cyberandy committed on Nov 25, 2024

Commit

fa155f9

verified ·

1 Parent(s): d469446

Update app.py

Browse files

Files changed (1) hide show

app.py +6 -42

app.py CHANGED Viewed

@@ -69,57 +69,21 @@ class WebsiteCrawler:
         url_lower = url.lower()
         path = urlparse(url).path.lower()
-        # Check for case studies and success stories
-        if any(x in url_lower for x in ['case-study', 'success-story']):
-            return "Case Studies", 7
-        # Check for product/service pages
-        if any(x in title.lower() for x in ['service', 'product', 'solution']):
-            return "Services", 6
-        # Keep existing categories but adjust priorities
         if path == "/" or path == "":
             return "Main", 10
         elif any(x in url_lower for x in ['/api', '/developer']):
             return "API", 8
         elif any(x in url_lower for x in ['/about', '/contact']):
             return "About", 7
-        elif any(x in url_lower for x in ['/news', '/blog', '/update']):
-            return "News", 4
         return "Optional", 1
-    def clean_text(self, text, is_title=False):
-        """Improved text cleaning"""
-        if not text or len(text.strip()) < 2:
-            return ""
-        text = super().clean_text(text, is_title)
-        # Remove broken/malformed text
-        if len(re.findall(r'[a-zA-Z]', text)) < 10:  # If less than 10 letters
-            return ""
-        # Clean up title specifically
-        if is_title:
-            # Remove company name if it's redundant
-            text = re.sub(r'\s*[-|]\s*.*?$', '', text)
-            # Remove generic suffixes
-            text = re.sub(r'\s+Homepage$', '', text, flags=re.IGNORECASE)
-        return text
-    # Update category order in generate_llms_txt
-    category_order = [
-        "Main",
-        "Services",
-        "API",
-        "About",
-        "Case Studies",
-        "News",
-        "Optional"
-    ]
     def is_duplicate_content(self, desc, title, url):
         """Improved duplicate/translation detection"""
         if not desc or not title:

         url_lower = url.lower()
         path = urlparse(url).path.lower()
         if path == "/" or path == "":
             return "Main", 10
+        elif any(x in url_lower for x in ['/docs', '/documentation', '/guide', '/manual']):
+            return "Documentation", 8
         elif any(x in url_lower for x in ['/api', '/developer']):
             return "API", 8
         elif any(x in url_lower for x in ['/about', '/contact']):
             return "About", 7
+        elif any(x in path for x in ['.html', '.md', '.txt', '/']):
+            return "Content", 4
+        elif any(x in url_lower for x in ['/blog', '/news', '/article']):
+            return "Blog", 5
         return "Optional", 1
     def is_duplicate_content(self, desc, title, url):
         """Improved duplicate/translation detection"""
         if not desc or not title: