Update app.py
app.py CHANGED
@@ -23,6 +23,33 @@ class WebsiteCrawler:
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
         }

+    def clean_text(self, text, is_title=False):
+        """Clean and normalize text"""
+        if not text:
+            return ""
+        # Normalize unicode characters
+        text = unicodedata.normalize("NFKD", text)
+        text = re.sub(r"[^\x00-\x7F]+", "", text)
+
+        if is_title:
+            # Remove common suffixes and fragments for titles
+            text = re.sub(r"\s*[\|\-#:•].*", "", text)
+            text = re.sub(r"^\s*Welcome to\s+", "", text)
+            text = text.replace("docusaurus_skipToContent_fallback", "")
+
+        return " ".join(text.split()).strip()
+
+    def clean_description(self, desc):
+        """Clean description text"""
+        if not desc:
+            return ""
+        # Remove leading dashes, hyphens, or colons
+        desc = re.sub(r"^[-:\s]+", "", desc)
+        # Remove any strings that are just "Editors", "APIs", etc.
+        if len(desc.split()) <= 1:
+            return ""
+        return desc.strip()
+
     def extract_homepage_description(self, soup):
         """Extract description from homepage with multiple fallbacks"""
         # Try meta description first
@@ -56,6 +83,104 @@ class WebsiteCrawler:

         return None

+    async def crawl_page(self, url, depth, base_domain):
+        """Crawl a single page and extract information"""
+        if (
+            depth > self.max_depth
+            or url in self.visited_urls
+            or len(self.visited_urls) >= self.max_pages
+        ):
+            return []
+
+        try:
+            response = requests.get(url, headers=self.headers, timeout=10)
+            response.encoding = "utf-8"
+            self.visited_urls.add(url)
+
+            soup = BeautifulSoup(response.text, "html.parser")
+
+            # Extract title with fallbacks
+            title = None
+            meta_title = soup.find("meta", property="og:title")
+            if meta_title and meta_title.get("content"):
+                title = meta_title["content"]
+            if not title:
+                title_tag = soup.find("title")
+                if title_tag:
+                    title = title_tag.text
+            if not title:
+                h1_tag = soup.find("h1")
+                if h1_tag:
+                    title = h1_tag.text
+            if not title:
+                title = url.split("/")[-1]
+
+            title = self.clean_text(title, is_title=True)
+
+            # Extract description with fallbacks
+            desc = None
+            meta_desc = soup.find("meta", {"name": "description"})
+            if meta_desc and meta_desc.get("content"):
+                desc = meta_desc["content"]
+            if not desc:
+                og_desc = soup.find("meta", property="og:description")
+                if og_desc and og_desc.get("content"):
+                    desc = og_desc["content"]
+            if not desc:
+                first_p = soup.find("p")
+                if first_p:
+                    desc = first_p.text
+
+            desc = self.clean_text(desc) if desc else ""
+
+            # Determine category and importance
+            url_lower = url.lower()
+            category = "Optional"
+            importance = 0
+
+            if "docs" in url_lower or "documentation" in url_lower:
+                category = "Docs"
+                importance = 5
+            elif "api" in url_lower:
+                category = "API"
+                importance = 4
+            elif "guide" in url_lower or "tutorial" in url_lower:
+                category = "Guides"
+                importance = 3
+            elif "example" in url_lower:
+                category = "Examples"
+                importance = 2
+            elif "blog" in url_lower:
+                category = "Blog"
+                importance = 1
+
+            # Store metadata
+            clean_url = re.sub(r"#.*", "", url).rstrip("/")
+            if title and len(title.strip()) > 0:  # Only store if we have a valid title
+                self.url_metadata[clean_url] = {
+                    "title": title,
+                    "description": desc,
+                    "category": category,
+                    "importance": importance,
+                }
+
+            # Find links
+            links = []
+            for a in soup.find_all("a", href=True):
+                href = a["href"]
+                if not any(
+                    x in href.lower()
+                    for x in ["javascript:", "mailto:", ".pdf", ".jpg", ".png", ".gif"]
+                ):
+                    next_url = urljoin(url, href)
+                    if urlparse(next_url).netloc == base_domain:
+                        links.append(next_url)
+            return links
+
+        except Exception as e:
+            logger.error(f"Error crawling {url}: {str(e)}")
+            return []
+
     async def process_homepage(self, url):
         """Specifically process the homepage to extract key metadata"""
         try:
@@ -112,11 +237,11 @@ class WebsiteCrawler:
         """Generate llms.txt content"""
         if not self.url_metadata:
             return "No content was found to generate llms.txt"
-
+
         # Sort URLs by importance and remove duplicates
         sorted_urls = []
         seen_titles = set()
-
+
         for url, metadata in sorted(
             self.url_metadata.items(),
             key=lambda x: (x[1]["importance"], x[0]),
@@ -125,17 +250,17 @@ class WebsiteCrawler:
             if metadata["title"] not in seen_titles:
                 sorted_urls.append((url, metadata))
                 seen_titles.add(metadata["title"])
-
+
        if not sorted_urls:
            return "No valid content was found"
-
+
        # Generate content
        content = []
-
+
        # Use homepage metadata for main title and description
        main_title = self.homepage_metadata.get("site_name", "Welcome")
        homepage_description = self.homepage_metadata.get("description")
-
+
        content.append(f"# {main_title}")
        if homepage_description:
            content.append(f"\n> {homepage_description}")
@@ -146,18 +271,18 @@ class WebsiteCrawler:
                if desc and len(desc) > 20 and "null" not in desc.lower():
                    content.append(f"\n> {desc}")
                    break
-
+
        # Group by category
        categories = defaultdict(list)
        for url, metadata in sorted_urls:
            if metadata["title"] and url:
                categories[metadata["category"]].append((url, metadata))
-
+
        # Add sections
        for category in ["Docs", "API", "Guides", "Examples", "Blog", "Optional"]:
            if category in categories:
                content.append(f"\n## {category}")
-
+
                # Add links without extra newlines
                links = []
                for url, metadata in categories[category]:
@@ -167,135 +292,10 @@ class WebsiteCrawler:
                        links.append(f"- [{title}]({url}): {desc}")
                    else:
                        links.append(f"- [{title}]({url})")
-
+
                content.append("\n".join(links))
-
-        return "\n".join(content)

-
-        """Clean and normalize text"""
-        if not text:
-            return ""
-        # Normalize unicode characters
-        text = unicodedata.normalize("NFKD", text)
-        text = re.sub(r"[^\x00-\x7F]+", "", text)
-
-        if is_title:
-            # Remove common suffixes and fragments for titles
-            text = re.sub(r"\s*[\|\-#:•].*", "", text)
-            text = re.sub(r"^\s*Welcome to\s+", "", text)
-            text = text.replace("docusaurus_skipToContent_fallback", "")
-
-        return " ".join(text.split()).strip()
-
-    def clean_description(self, desc):
-        """Clean description text"""
-        if not desc:
-            return ""
-        # Remove leading dashes, hyphens, or colons
-        desc = re.sub(r"^[-:\s]+", "", desc)
-        # Remove any strings that are just "Editors", "APIs", etc.
-        if len(desc.split()) <= 1:
-            return ""
-        return desc.strip()
-
-    async def crawl_page(self, url, depth, base_domain):
-        """Crawl a single page and extract information"""
-        if (
-            depth > self.max_depth
-            or url in self.visited_urls
-            or len(self.visited_urls) >= self.max_pages
-        ):
-            return []
-
-        try:
-            response = requests.get(url, headers=self.headers, timeout=10)
-            response.encoding = "utf-8"
-            self.visited_urls.add(url)
-
-            soup = BeautifulSoup(response.text, "html.parser")
-
-            # Extract title with fallbacks
-            title = None
-            meta_title = soup.find("meta", property="og:title")
-            if meta_title and meta_title.get("content"):
-                title = meta_title["content"]
-            if not title:
-                title_tag = soup.find("title")
-                if title_tag:
-                    title = title_tag.text
-            if not title:
-                h1_tag = soup.find("h1")
-                if h1_tag:
-                    title = h1_tag.text
-            if not title:
-                title = url.split("/")[-1]
-
-            title = self.clean_text(title, is_title=True)
-
-            # Extract description with fallbacks
-            desc = None
-            meta_desc = soup.find("meta", {"name": "description"})
-            if meta_desc and meta_desc.get("content"):
-                desc = meta_desc["content"]
-            if not desc:
-                og_desc = soup.find("meta", property="og:description")
-                if og_desc and og_desc.get("content"):
-                    desc = og_desc["content"]
-            if not desc:
-                first_p = soup.find("p")
-                if first_p:
-                    desc = first_p.text
-
-            desc = self.clean_text(desc) if desc else ""
-
-            # Determine category and importance
-            url_lower = url.lower()
-            category = "Optional"
-            importance = 0
-
-            if "docs" in url_lower or "documentation" in url_lower:
-                category = "Docs"
-                importance = 5
-            elif "api" in url_lower:
-                category = "API"
-                importance = 4
-            elif "guide" in url_lower or "tutorial" in url_lower:
-                category = "Guides"
-                importance = 3
-            elif "example" in url_lower:
-                category = "Examples"
-                importance = 2
-            elif "blog" in url_lower:
-                category = "Blog"
-                importance = 1
-
-            # Store metadata
-            clean_url = re.sub(r"#.*", "", url).rstrip("/")
-            if title and len(title.strip()) > 0:  # Only store if we have a valid title
-                self.url_metadata[clean_url] = {
-                    "title": title,
-                    "description": desc,
-                    "category": category,
-                    "importance": importance,
-                }
-
-            # Find links
-            links = []
-            for a in soup.find_all("a", href=True):
-                href = a["href"]
-                if not any(
-                    x in href.lower()
-                    for x in ["javascript:", "mailto:", ".pdf", ".jpg", ".png", ".gif"]
-                ):
-                    next_url = urljoin(url, href)
-                    if urlparse(next_url).netloc == base_domain:
-                        links.append(next_url)
-            return links
-
-        except Exception as e:
-            logger.error(f"Error crawling {url}: {str(e)}")
-            return []
+        return "\n".join(content)
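For orientation, here is a minimal driver sketch (not part of this commit) showing how the relocated crawl_page coroutine and the other methods visible in the diff could be exercised end to end. The build_llms_txt helper name, the breadth-first queue, and the assumption that the content-assembly method is named generate_llms_txt are illustrative; process_homepage, crawl_page, visited_urls and max_pages are taken from the code above.

# Illustrative driver only -- not part of app.py in this commit.
# Assumes a WebsiteCrawler instance exposing the attributes/methods shown in
# the diff above (visited_urls, max_pages, process_homepage, crawl_page) plus
# a generate_llms_txt() method that assembles the final text (assumed name).
import asyncio
from collections import deque
from urllib.parse import urlparse


async def build_llms_txt(crawler, start_url):
    """Breadth-first crawl from start_url, then assemble llms.txt."""
    base_domain = urlparse(start_url).netloc
    await crawler.process_homepage(start_url)

    queue = deque([(start_url, 0)])
    while queue and len(crawler.visited_urls) < crawler.max_pages:
        url, depth = queue.popleft()
        # crawl_page enforces max_depth / visited / max_pages itself and
        # returns only same-domain links.
        for link in await crawler.crawl_page(url, depth, base_domain):
            queue.append((link, depth + 1))

    return crawler.generate_llms_txt()


# Example invocation (hypothetical URL):
# print(asyncio.run(build_llms_txt(crawler, "https://example.com")))

As a quick check of the new helpers, clean_text("Welcome to Foo | Docs", is_title=True) reduces to "Foo" (the suffix regex strips everything from the first separator and the leading "Welcome to " is removed), and clean_description("- Editors") returns "" because one-word descriptions are discarded.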