Update app.py
app.py CHANGED
@@ -152,6 +152,132 @@ class WebsiteCrawler:

        return "\n".join(content)

+    def clean_text(self, text, is_title=False):
+        """Clean and normalize text"""
+        if not text:
+            return ""
+        # Normalize unicode characters
+        text = unicodedata.normalize("NFKD", text)
+        text = re.sub(r"[^\x00-\x7F]+", "", text)
+
+        if is_title:
+            # Remove common suffixes and fragments for titles
+            text = re.sub(r"\s*[\|\-#:•].*", "", text)
+            text = re.sub(r"^\s*Welcome to\s+", "", text)
+            text = text.replace("docusaurus_skipToContent_fallback", "")
+
+        return " ".join(text.split()).strip()
+
+    def clean_description(self, desc):
+        """Clean description text"""
+        if not desc:
+            return ""
+        # Remove leading dashes, hyphens, or colons
+        desc = re.sub(r"^[-:\s]+", "", desc)
+        # Remove any strings that are just "Editors", "APIs", etc.
+        if len(desc.split()) <= 1:
+            return ""
+        return desc.strip()
+
+    async def crawl_page(self, url, depth, base_domain):
+        """Crawl a single page and extract information"""
+        if (
+            depth > self.max_depth
+            or url in self.visited_urls
+            or len(self.visited_urls) >= self.max_pages
+        ):
+            return []
+
+        try:
+            response = requests.get(url, headers=self.headers, timeout=10)
+            response.encoding = "utf-8"
+            self.visited_urls.add(url)
+
+            soup = BeautifulSoup(response.text, "html.parser")
+
+            # Extract title with fallbacks
+            title = None
+            meta_title = soup.find("meta", property="og:title")
+            if meta_title and meta_title.get("content"):
+                title = meta_title["content"]
+            if not title:
+                title_tag = soup.find("title")
+                if title_tag:
+                    title = title_tag.text
+            if not title:
+                h1_tag = soup.find("h1")
+                if h1_tag:
+                    title = h1_tag.text
+            if not title:
+                title = url.split("/")[-1]
+
+            title = self.clean_text(title, is_title=True)
+
+            # Extract description with fallbacks
+            desc = None
+            meta_desc = soup.find("meta", {"name": "description"})
+            if meta_desc and meta_desc.get("content"):
+                desc = meta_desc["content"]
+            if not desc:
+                og_desc = soup.find("meta", property="og:description")
+                if og_desc and og_desc.get("content"):
+                    desc = og_desc["content"]
+            if not desc:
+                first_p = soup.find("p")
+                if first_p:
+                    desc = first_p.text
+
+            desc = self.clean_text(desc) if desc else ""
+
+            # Determine category and importance
+            url_lower = url.lower()
+            category = "Optional"
+            importance = 0
+
+            if "docs" in url_lower or "documentation" in url_lower:
+                category = "Docs"
+                importance = 5
+            elif "api" in url_lower:
+                category = "API"
+                importance = 4
+            elif "guide" in url_lower or "tutorial" in url_lower:
+                category = "Guides"
+                importance = 3
+            elif "example" in url_lower:
+                category = "Examples"
+                importance = 2
+            elif "blog" in url_lower:
+                category = "Blog"
+                importance = 1
+
+            # Store metadata
+            clean_url = re.sub(r"#.*", "", url).rstrip("/")
+            if title and len(title.strip()) > 0:  # Only store if we have a valid title
+                self.url_metadata[clean_url] = {
+                    "title": title,
+                    "description": desc,
+                    "category": category,
+                    "importance": importance,
+                }
+
+            # Find links
+            links = []
+            for a in soup.find_all("a", href=True):
+                href = a["href"]
+                if not any(
+                    x in href.lower()
+                    for x in ["javascript:", "mailto:", ".pdf", ".jpg", ".png", ".gif"]
+                ):
+                    next_url = urljoin(url, href)
+                    if urlparse(next_url).netloc == base_domain:
+                        links.append(next_url)
+            return links
+
+        except Exception as e:
+            logger.error(f"Error crawling {url}: {str(e)}")
+            return []
+
+

async def process_url(url, max_depth, max_pages):
    """Process URL and generate llms.txt"""
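
Not part of the commit, just an illustration of what the new cleaning helpers do to typical scraped strings; the instance line is hypothetical, since WebsiteCrawler's constructor sits outside this hunk:

crawler = WebsiteCrawler()  # hypothetical: constructor args not shown in this diff
crawler.clean_text("Welcome to Gradio | Build & Share ML Apps", is_title=True)
# -> "Gradio"  (title regex cuts at the first |, -, #, : or • and drops "Welcome to")
crawler.clean_description("- A toolkit for building web UIs")
# -> "A toolkit for building web UIs"  (leading dashes/colons stripped)
crawler.clean_description("Editors")
# -> ""  (one-word descriptions are discarded)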
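
A rough sketch of how crawl_page might be driven, along the lines of what process_url presumably does further down in app.py: the method returns the same-domain links it found, so a breadth-first loop can keep feeding them back in until max_pages is reached. The crawl_site helper and the constructor call below are assumptions for illustration, not code from this Space.

import asyncio
from urllib.parse import urlparse

async def crawl_site(crawler, start_url):
    """Breadth-first crawl driven by WebsiteCrawler.crawl_page (illustrative helper)."""
    base_domain = urlparse(start_url).netloc
    queue = [(start_url, 0)]  # (url, depth) pairs
    while queue and len(crawler.visited_urls) < crawler.max_pages:
        url, depth = queue.pop(0)
        # crawl_page records metadata in crawler.url_metadata and returns same-domain links
        links = await crawler.crawl_page(url, depth, base_domain)
        queue.extend((link, depth + 1) for link in links)
    return crawler.url_metadata

# crawler = WebsiteCrawler(max_depth=2, max_pages=50)  # hypothetical constructor args
# metadata = asyncio.run(crawl_site(crawler, "https://example.com/docs"))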