cyberandy committed
Commit 1de7c37
1 Parent(s): dd2349f
Files changed (1)
  1. app.py +154 -131
app.py CHANGED
@@ -4,9 +4,12 @@ from bs4 import BeautifulSoup
 import re
 from urllib.parse import urljoin, urlparse
 import asyncio
+import aiohttp
 from collections import defaultdict
 import unicodedata
 import logging
+import ssl
+import brotli  # Add this import

 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -18,9 +21,54 @@ class WebsiteCrawler:
         self.max_pages = max_pages
         self.visited_urls = set()
         self.url_metadata = defaultdict(dict)
+        self.homepage_metadata = None
         self.headers = {
-            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
+            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
+            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
+            "Accept-Language": "en-US,en;q=0.5",
+            "Accept-Encoding": "gzip, deflate, br",
+            "DNT": "1",
+            "Connection": "keep-alive",
+            "Upgrade-Insecure-Requests": "1",
         }
+        self.session = None
+
+    async def get_session(self):
+        if self.session is None:
+            ssl_context = ssl.create_default_context()
+            ssl_context.check_hostname = False
+            ssl_context.verify_mode = ssl.CERT_NONE
+
+            # Configure client with brotli support
+            connector = aiohttp.TCPConnector(ssl=ssl_context)
+            self.session = aiohttp.ClientSession(
+                connector=connector, timeout=aiohttp.ClientTimeout(total=30)
+            )
+        return self.session
+
+    async def decode_response(self, response):
+        """Handle various content encodings including brotli"""
+        content_encoding = response.headers.get("Content-Encoding", "").lower()
+        content = await response.read()
+
+        if content_encoding == "br":
+            try:
+                decoded = brotli.decompress(content)
+                return decoded.decode("utf-8", errors="ignore")
+            except Exception as e:
+                logger.error(f"Error decoding brotli content: {str(e)}")
+                return content.decode("utf-8", errors="ignore")
+        elif content_encoding == "gzip":
+            import gzip
+
+            try:
+                decoded = gzip.decompress(content)
+                return decoded.decode("utf-8", errors="ignore")
+            except Exception as e:
+                logger.error(f"Error decoding gzip content: {str(e)}")
+                return content.decode("utf-8", errors="ignore")
+        else:
+            return content.decode("utf-8", errors="ignore")

     def clean_text(self, text, is_title=False):
         """Clean and normalize text"""
@@ -38,131 +86,97 @@ class WebsiteCrawler:

         return " ".join(text.split()).strip()

-    async def crawl_page(self, url, depth, base_domain):
-        """Crawl a single page and extract information"""
-        if (
-            depth > self.max_depth
-            or url in self.visited_urls
-            or len(self.visited_urls) >= self.max_pages
-        ):
-            return []
-
+    async def process_homepage(self, url):
+        """Specifically process the homepage to extract key metadata"""
         try:
-            response = requests.get(url, headers=self.headers, timeout=10)
-            response.encoding = "utf-8"
-            self.visited_urls.add(url)
-
-            soup = BeautifulSoup(response.text, "html.parser")
-
-            # Extract title with fallbacks
-            title = None
-            meta_title = soup.find("meta", property="og:title")
-            if meta_title and meta_title.get("content"):
-                title = meta_title["content"]
-            if not title:
-                title_tag = soup.find("title")
-                if title_tag:
-                    title = title_tag.text
-            if not title:
-                h1_tag = soup.find("h1")
-                if h1_tag:
-                    title = h1_tag.text
-            if not title:
-                title = url.split("/")[-1]
-
-            title = self.clean_text(title, is_title=True)
-
-            # Extract description with fallbacks
-            desc = None
-            meta_desc = soup.find("meta", {"name": "description"})
-            if meta_desc and meta_desc.get("content"):
-                desc = meta_desc["content"]
-            if not desc:
-                og_desc = soup.find("meta", property="og:description")
-                if og_desc and og_desc.get("content"):
-                    desc = og_desc["content"]
-            if not desc:
-                first_p = soup.find("p")
-                if first_p:
-                    desc = first_p.text
-
-            desc = self.clean_text(desc) if desc else ""
-
-            # Determine category and importance
-            url_lower = url.lower()
-            category = "Optional"
-            importance = 0
-
-            if "docs" in url_lower or "documentation" in url_lower:
-                category = "Docs"
-                importance = 5
-            elif "api" in url_lower:
-                category = "API"
-                importance = 4
-            elif "guide" in url_lower or "tutorial" in url_lower:
-                category = "Guides"
-                importance = 3
-            elif "example" in url_lower:
-                category = "Examples"
-                importance = 2
-            elif "blog" in url_lower:
-                category = "Blog"
-                importance = 1
-
-            # Store metadata
-            clean_url = re.sub(r"#.*", "", url).rstrip("/")
-            if title and len(title.strip()) > 0:  # Only store if we have a valid title
-                self.url_metadata[clean_url] = {
-                    "title": title,
-                    "description": desc,
-                    "category": category,
-                    "importance": importance,
+            session = await self.get_session()
+            async with session.get(
+                url, headers=self.headers, allow_redirects=True
+            ) as response:
+                if response.status != 200:
+                    raise Exception(
+                        f"Failed to fetch homepage: status {response.status}"
+                    )
+
+                text = await self.decode_response(response)
+                soup = BeautifulSoup(text, "html.parser")
+
+                # Extract site name
+                site_name = None
+                site_meta = soup.find("meta", property="og:site_name")
+                if site_meta and site_meta.get("content"):
+                    site_name = site_meta["content"]
+
+                if not site_name:
+                    title_tag = soup.find("title")
+                    if title_tag:
+                        site_name = title_tag.text.split("|")[0].strip()
+
+                if not site_name:
+                    site_name = urlparse(url).netloc.split(".")[0].capitalize()
+
+                # Get homepage description
+                description = None
+                meta_desc = soup.find("meta", {"name": "description"})
+                if meta_desc and meta_desc.get("content"):
+                    description = meta_desc["content"]
+
+                if not description:
+                    og_desc = soup.find("meta", property="og:description")
+                    if og_desc and og_desc.get("content"):
+                        description = og_desc["content"]
+
+                if not description:
+                    first_p = soup.find("p")
+                    if first_p:
+                        description = first_p.text
+
+                self.homepage_metadata = {
+                    "site_name": self.clean_text(site_name, is_title=True),
+                    "description": (
+                        self.clean_text(description) if description else None
+                    ),
                 }

-            # Find links
-            links = []
-            for a in soup.find_all("a", href=True):
-                href = a["href"]
-                if not any(
-                    x in href.lower()
-                    for x in ["javascript:", "mailto:", ".pdf", ".jpg", ".png", ".gif"]
-                ):
-                    next_url = urljoin(url, href)
-                    if urlparse(next_url).netloc == base_domain:
-                        links.append(next_url)
-            return links
-
         except Exception as e:
-            logger.error(f"Error crawling {url}: {str(e)}")
-            return []
+            logger.error(f"Error processing homepage {url}: {str(e)}")
+            self.homepage_metadata = {
+                "site_name": urlparse(url).netloc.split(".")[0].capitalize(),
+                "description": None,
+            }

     async def crawl_website(self, start_url):
         """Crawl website starting from the given URL"""
-        base_domain = urlparse(start_url).netloc
-        queue = [(start_url, 0)]
-        seen = {start_url}
-
-        while queue and len(self.visited_urls) < self.max_pages:
-            current_url, depth = queue.pop(0)
-            if depth > self.max_depth:
-                continue
-
-            links = await self.crawl_page(current_url, depth, base_domain)
-            for link in links:
-                if link not in seen and urlparse(link).netloc == base_domain:
-                    seen.add(link)
-                    queue.append((link, depth + 1))
-
-    def clean_description(self, desc):
-        """Clean description text"""
-        if not desc:
-            return ""
-        # Remove leading dashes, hyphens, or colons
-        desc = re.sub(r"^[-:\s]+", "", desc)
-        # Remove any strings that are just "Editors", "APIs", etc.
-        if len(desc.split()) <= 1:
-            return ""
-        return desc.strip()
+        try:
+            # First process the homepage
+            logger.info(f"Processing homepage: {start_url}")
+            await self.process_homepage(start_url)
+
+            base_domain = urlparse(start_url).netloc
+            queue = [(start_url, 0)]
+            seen = {start_url}
+
+            while queue and len(self.visited_urls) < self.max_pages:
+                current_url, depth = queue.pop(0)
+                if depth > self.max_depth:
+                    continue
+
+                logger.info(f"Crawling page: {current_url} (depth: {depth})")
+                links = await self.crawl_page(current_url, depth, base_domain)
+                logger.info(f"Found {len(links)} links on {current_url}")
+
+                for link in links:
+                    if link not in seen and urlparse(link).netloc == base_domain:
+                        seen.add(link)
+                        queue.append((link, depth + 1))
+
+            logger.info(f"Crawl completed. Visited {len(self.visited_urls)} pages")
+
+        except Exception as e:
+            logger.error(f"Error during crawl: {str(e)}")
+            raise
+        finally:
+            await self.cleanup()

     def generate_llms_txt(self):
         """Generate llms.txt content"""
@@ -188,20 +202,20 @@ class WebsiteCrawler:
         # Generate content
         content = []

-        # Find the best title for the main header (prefer "Welcome" or "Overview")
-        main_title = "Welcome"  # Default to Welcome
-
-        # Find a good description for the blockquote
-        best_description = None
-        for _, metadata in sorted_urls:
-            desc = self.clean_description(metadata["description"])
-            if desc and len(desc) > 20 and "null" not in desc.lower():
-                best_description = desc
-                break
+        # Use homepage metadata for main title and description
+        main_title = self.homepage_metadata.get("site_name", "Welcome")
+        homepage_description = self.homepage_metadata.get("description")

         content.append(f"# {main_title}")
-        if best_description:
-            content.append(f"\n> {best_description}")
+        if homepage_description:
+            content.append(f"\n> {homepage_description}")
+        else:
+            # Fallback to first good description from content
+            for _, metadata in sorted_urls:
+                desc = self.clean_description(metadata["description"])
+                if desc and len(desc) > 20 and "null" not in desc.lower():
+                    content.append(f"\n> {desc}")
+                    break

         # Group by category
         categories = defaultdict(list)
@@ -229,6 +243,7 @@ class WebsiteCrawler:
         return "\n".join(content)


+# Process URL function (outside the class)
 async def process_url(url, max_depth, max_pages):
     """Process URL and generate llms.txt"""
     try:
@@ -241,14 +256,22 @@ async def process_url(url, max_depth, max_pages):
         if not all([result.scheme, result.netloc]):
             return "", "Invalid URL format. Please enter a valid URL."

+        logger.info(f"Starting crawl of {url}")
+
         # Process website
         crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
         await crawler.crawl_website(url)
+
+        logger.info("Generating llms.txt content")
         content = crawler.generate_llms_txt()

+        if not content or content.strip() == "":
+            return "", "No content was generated. Check the logs for details."
+
         return content, f"Successfully crawled {len(crawler.visited_urls)} pages."

     except Exception as e:
+        logger.error(f"Error processing URL {url}: {str(e)}")
         return "", f"Error: {str(e)}"


 