Update app.py
app.py CHANGED
@@ -141,9 +141,8 @@ class WebsiteCrawler:
             return ""
 
         return cleaned
-
-
 
+
     def clean_description(self, desc):
         """Clean description text"""
         if not desc:
@@ -189,37 +188,37 @@ class WebsiteCrawler:
 
         return None
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    async def crawl_page(self, url, depth, base_domain):
+        """Crawl a single page and extract information"""
+        if (
+            depth > self.max_depth
+            or url in self.visited_urls
+            or len(self.visited_urls) >= self.max_pages
+        ):
+            return []
+
+        try:
+            await asyncio.sleep(1)  # Be polite to servers
+            async with aiohttp.ClientSession() as session:
+                async with session.get(url, headers=self.headers, allow_redirects=True) as response:
+                    if response.status == 403:
+                        # Try with alternative headers
+                        alt_headers = {
+                            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
+                            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+                        }
+                        async with session.get(url, headers=alt_headers, allow_redirects=True) as retry_response:
+                            if retry_response.status != 200:
+                                return []
+                            text = await retry_response.text()
+                    elif response.status != 200:
+                        return []
+                    else:
+                        text = await response.text()
+
+                    self.visited_urls.add(url)
+                    soup = BeautifulSoup(text, "html.parser")
+
                     # Extract title with fallbacks
                     title = None
                     meta_title = soup.find("meta", property="og:title")
@@ -283,13 +282,12 @@ class WebsiteCrawler:
                        next_url = urljoin(url, href)
                        if urlparse(next_url).netloc == base_domain:
                            links.append(next_url)
-
                    return links
 
        except Exception as e:
            logger.error(f"Error crawling {url}: {str(e)}")
            return []
-
+
    async def process_homepage(self, url):
        """Specifically process the homepage to extract key metadata"""
        try: