Update app.py
app.py CHANGED
@@ -189,39 +189,34 @@ class WebsiteCrawler:
 
         return None
 
-
-
-
-
-
-
+    async def crawl_page(self, url, depth, base_domain):
+        """Crawl a single page and extract information"""
+        if (
+            depth > self.max_depth
+            or url in self.visited_urls
+            or len(self.visited_urls) >= self.max_pages
+        ):
+            return []
+
         try:
-            await asyncio.sleep(1
+            await asyncio.sleep(1)  # Be polite to servers
             async with aiohttp.ClientSession() as session:
                 async with session.get(url, headers=self.headers, allow_redirects=True) as response:
                     if response.status == 403:
-
-
-
-                            "
-                            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
-                            "Accept-Language": "en-US,en;q=0.5",
-                            "Accept-Encoding": "gzip, deflate, br",
-                            "DNT": "1",
-                            "Connection": "keep-alive",
-                            "Upgrade-Insecure-Requests": "1"
+                        # Try with alternative headers
+                        alt_headers = {
+                            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
+                            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                         }
-                        async with session.get(url, headers=
-                            if retry_response.status
-
-
-                            continue
+                        async with session.get(url, headers=alt_headers, allow_redirects=True) as retry_response:
+                            if retry_response.status != 200:
+                                return []
+                            text = await retry_response.text()
                     elif response.status != 200:
-                        logger.error(f"Error status {response.status} for URL: {url}")
                         return []
                     else:
                         text = await response.text()
-
+
                     self.visited_urls.add(url)
                     soup = BeautifulSoup(text, "html.parser")
 
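The new 403 branch drops the old retry loop in favor of a single follow-up request that swaps in browser-like headers. A minimal sketch of that pattern in isolation, assuming the hypothetical names fetch_html, DEFAULT_HEADERS, and FALLBACK_HEADERS (none of which appear in app.py):

# Sketch only: a one-shot 403 fallback, mirroring the pattern used in crawl_page.
# fetch_html, DEFAULT_HEADERS, and FALLBACK_HEADERS are illustrative names, not part of app.py.
import asyncio

import aiohttp

DEFAULT_HEADERS = {"User-Agent": "MyCrawler/1.0"}
FALLBACK_HEADERS = {
    # Browser-like headers; some servers return 403 for unfamiliar user agents.
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}

async def fetch_html(url):
    """Return the page body, retrying once with browser-like headers on 403."""
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=DEFAULT_HEADERS, allow_redirects=True) as response:
            if response.status == 200:
                return await response.text()
            if response.status != 403:
                return None
        # Single retry with alternative headers instead of a retry loop.
        async with session.get(url, headers=FALLBACK_HEADERS, allow_redirects=True) as retry:
            return await retry.text() if retry.status == 200 else None

if __name__ == "__main__":
    html = asyncio.run(fetch_html("https://example.com"))
    print("fetched" if html else "failed")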
@@ -288,17 +283,13 @@ class WebsiteCrawler:
                         next_url = urljoin(url, href)
                         if urlparse(next_url).netloc == base_domain:
                             links.append(next_url)
-                            return links
 
+                    return links
+
         except Exception as e:
-            logger.error(f"
-
-            return []
-            continue
-
-            break  # If we get here, the request succeeded
+            logger.error(f"Error crawling {url}: {str(e)}")
+            return []
 
-
     async def process_homepage(self, url):
         """Specifically process the homepage to extract key metadata"""
         try:
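Since crawl_page is an async coroutine that returns the same-domain links it finds, a caller has to drive it from an event loop. Below is a hypothetical driver; only crawl_page's signature and return value are taken from the diff, and the WebsiteCrawler constructor arguments are a guess (the diff does not show __init__), so the instantiation is left commented out.

# Hypothetical breadth-first driver; everything beyond crawl_page's signature is an assumption.
import asyncio
from urllib.parse import urlparse

async def crawl_site(crawler, start_url):
    """Walk the site starting from start_url using crawl_page's returned links."""
    base_domain = urlparse(start_url).netloc
    queue = [(start_url, 0)]
    while queue:
        url, depth = queue.pop(0)
        # crawl_page returns [] once max_depth/max_pages is hit or the URL was already visited.
        links = await crawler.crawl_page(url, depth, base_domain)
        queue.extend((link, depth + 1) for link in links)

# Assumed constructor; adjust to the real WebsiteCrawler signature:
# crawler = WebsiteCrawler(max_depth=2, max_pages=50)
# asyncio.run(crawl_site(crawler, "https://example.com"))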