Spaces:

WordLift
/

create-llms-txt

Running

App Files Files Community

cyberandy committed on Nov 25, 2024

Commit

f663df1

verified ·

1 Parent(s): fa155f9

Update app.py

Browse files

Files changed (1) hide show

app.py +94 -30

app.py CHANGED Viewed

@@ -23,6 +23,37 @@ class WebsiteCrawler:
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
         }
     def clean_text(self, text, is_title=False):
         """Improved text cleaning"""
         if not text or len(text.strip()) < 2:
@@ -37,21 +68,73 @@ class WebsiteCrawler:
         text = re.sub(r'\{\%.*?\%\}', '', text)
         text = re.sub(r'\${.*?\}', '', text)
-        # Remove broken/malformed text
-        if len(re.findall(r'[a-zA-Z]', text)) < 10:  # If less than 10 letters
-            return ""
-        # Clean up title specifically
         if is_title:
             # Remove common suffixes and fragments for titles
             text = re.sub(r'^\s*Welcome to\s+', '', text)
             text = re.sub(r'\s*[\|\-#:•].*', '', text)
-            # Remove company name if it's redundant
-            text = re.sub(r'\s*[-|]\s*.*?$', '', text)
-            # Remove generic suffixes
             text = re.sub(r'\s+Homepage$', '', text, flags=re.IGNORECASE)
-        return " ".join(text.split()).strip()
     def clean_description(self, desc):
         """Clean description text"""
@@ -64,25 +147,6 @@ class WebsiteCrawler:
             return ""
         return desc.strip()
-    def determine_category_importance(self, url, title, desc):
-        """Improved category detection"""
-        url_lower = url.lower()
-        path = urlparse(url).path.lower()
-        if path == "/" or path == "":
-            return "Main", 10
-        elif any(x in url_lower for x in ['/docs', '/documentation', '/guide', '/manual']):
-            return "Documentation", 8
-        elif any(x in url_lower for x in ['/api', '/developer']):
-            return "API", 8
-        elif any(x in url_lower for x in ['/about', '/contact']):
-            return "About", 7
-        elif any(x in path for x in ['.html', '.md', '.txt', '/']):
-            return "Content", 4
-        elif any(x in url_lower for x in ['/blog', '/news', '/article']):
-            return "Blog", 5
-        return "Optional", 1
     def is_duplicate_content(self, desc, title, url):
         """Improved duplicate/translation detection"""
@@ -330,9 +394,9 @@ class WebsiteCrawler:
             "Main",
             "Documentation",
             "API",
             "About",
-            "Content",
-            "Blog",
             "Optional"
         ]

             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
         }
def determine_category_importance(self, url, title, desc):
    """Classify a page URL into a ``(category, importance)`` pair.

    The homepage is recognised from the parsed URL path alone. Every
    other rule is a case-insensitive substring test against the whole
    URL string, so a fragment can also match inside the host or the
    query string. Rules are tried in order and the first hit wins.

    Args:
        url: URL of the page to classify.
        title: Not used by the classification.
        desc: Not used by the classification.

    Returns:
        A ``(category, importance)`` tuple; ``("Optional", 1)`` when no
        rule matches.
    """
    url_lower = url.lower()
    path = urlparse(url).path.lower()

    # Homepage: an empty path or a bare slash.
    if path in ("", "/"):
        return "Main", 10

    # Ordered rule table: earlier rows shadow later ones. '/features' is
    # listed only in the Documentation row; repeating it in the Tools row
    # would be dead code because Documentation is always tested first.
    rules = (
        (("/docs", "/documentation", "/features", "/pricing"), "Documentation", 8),
        (("/api", "/developer", "developers"), "API", 8),
        (("/about", "/company", "/partners", "/stories"), "About", 7),
        (("/news", "/blog", "/releases", "/academy"), "News", 5),
        (("/tools", "/website", "/keyword"), "Tools", 6),
    )
    for fragments, category, importance in rules:
        if any(fragment in url_lower for fragment in fragments):
            return category, importance

    return "Optional", 1
     def clean_text(self, text, is_title=False):
         """Improved text cleaning"""
         if not text or len(text.strip()) < 2:
         text = re.sub(r'\{\%.*?\%\}', '', text)
         text = re.sub(r'\${.*?\}', '', text)
         if is_title:
             # Remove common suffixes and fragments for titles
             text = re.sub(r'^\s*Welcome to\s+', '', text)
             text = re.sub(r'\s*[\|\-#:•].*', '', text)
             text = re.sub(r'\s+Homepage$', '', text, flags=re.IGNORECASE)
+            # Handle overly generic titles
+            if text.lower() in ['features', 'home', 'homepage', 'welcome']:
+                return ""
+        # Only return if we have meaningful text
+        cleaned = " ".join(text.split()).strip()
+        if len(cleaned.split()) < 2 and not is_title:  # Allow single-word titles
+            return ""
+        return cleaned
async def process_homepage(self, url):
    """Fetch the homepage and store its site name and description.

    The site name comes from the first source that yields one: the
    ``og:site_name`` meta tag, the first JSON-LD ``<script>`` (``name``
    or ``organizationName`` of a top-level object), the ``<title>`` text
    before the first ``|``, and finally the domain name. The result is
    written to ``self.homepage_metadata`` as a dict with ``site_name``
    and ``description`` keys; nothing is returned.

    Any exception is logged, and the metadata then falls back to the
    domain-derived name with a ``None`` description.

    Args:
        url: URL of the homepage to fetch.
    """
    import json

    def domain_name():
        # First host label, ignoring a leading "www." so that
        # "www.example.com" yields "Example" rather than "Www".
        host = urlparse(url).netloc
        if host.lower().startswith("www."):
            host = host[4:]
        return host.split('.')[0].capitalize()

    try:
        # NOTE(review): requests.get is a blocking call; inside this
        # coroutine it holds up the event loop until the response
        # arrives. Consider running it in an executor - confirm how the
        # callers schedule this method before changing it.
        response = requests.get(url, headers=self.headers, timeout=10)
        response.encoding = "utf-8"
        soup = BeautifulSoup(response.text, "html.parser")

        site_name = None

        # 1. Open Graph meta tag.
        site_meta = soup.find("meta", property="og:site_name")
        if site_meta and site_meta.get("content"):
            site_name = site_meta["content"]

        # 2. Structured data (first JSON-LD script only).
        if not site_name:
            schema = soup.find("script", type="application/ld+json")
            # Guard against a script tag without a single text child,
            # for which .string is None.
            if schema and schema.string:
                try:
                    data = json.loads(schema.string)
                except ValueError:
                    # Malformed JSON-LD: ignore it and try the next source.
                    data = None
                if isinstance(data, dict):
                    candidate = data.get("name") or data.get("organizationName")
                    # Only accept plain strings; other JSON values cannot
                    # be cleaned as text.
                    if isinstance(candidate, str):
                        site_name = candidate

        # 3. <title> tag, keeping the part before the first "|".
        if not site_name:
            title_tag = soup.find("title")
            if title_tag:
                site_name = title_tag.text.split('|')[0].strip()

        # 4. Last resort: the domain name.
        if not site_name:
            site_name = domain_name()

        description = self.extract_homepage_description(soup)

        # clean_text can return an empty string (e.g. for generic titles);
        # fall back to the domain so the site name is never blank.
        cleaned_name = self.clean_text(site_name, is_title=True)
        self.homepage_metadata = {
            "site_name": cleaned_name or domain_name(),
            "description": description
        }
    except Exception as e:
        logger.error(f"Error processing homepage {url}: {str(e)}")
        self.homepage_metadata = {
            "site_name": domain_name(),
            "description": None
        }
     def clean_description(self, desc):
         """Clean description text"""
             return ""
         return desc.strip()
     def is_duplicate_content(self, desc, title, url):
         """Improved duplicate/translation detection"""
             "Main",
             "Documentation",
             "API",
+            "Tools",
             "About",
+            "News",
             "Optional"
         ]