Update app.py
app.py CHANGED
@@ -8,6 +8,7 @@ import aiohttp
 from collections import defaultdict
 import unicodedata
 import logging
+import ssl
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -188,13 +189,31 @@ class WebsiteCrawler:
 
         try:
             await asyncio.sleep(1)  # Be polite to servers
-
-
-
+
+            # Configure SSL context to handle certificate issues
+            ssl_context = ssl.create_default_context()
+            ssl_context.check_hostname = False
+            ssl_context.verify_mode = ssl.CERT_NONE
+
+            connector = aiohttp.TCPConnector(ssl=ssl_context)
+            timeout = aiohttp.ClientTimeout(total=30)
+
+            async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
+                async with session.get(url, headers=self.headers, allow_redirects=True) as response:
+                    if response.status != 200:
+                        logger.error(f"Error status {response.status} for URL: {url}")
+                        return []
+
+                    try:
+                        text = await response.text()
+                    except UnicodeDecodeError:
+                        text = await response.read()
+                        text = text.decode('utf-8', errors='ignore')
+
             self.visited_urls.add(url)
 
             soup = BeautifulSoup(text, "html.parser")
-
+
             # Extract title with fallbacks
             title = None
             meta_title = soup.find("meta", property="og:title")
@@ -266,9 +285,25 @@ class WebsiteCrawler:
     async def process_homepage(self, url):
         """Specifically process the homepage to extract key metadata"""
         try:
-
-
-
+            # Configure SSL context
+            ssl_context = ssl.create_default_context()
+            ssl_context.check_hostname = False
+            ssl_context.verify_mode = ssl.CERT_NONE
+
+            connector = aiohttp.TCPConnector(ssl=ssl_context)
+            timeout = aiohttp.ClientTimeout(total=30)
+
+            async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
+                async with session.get(url, headers=self.headers, allow_redirects=True) as response:
+                    if response.status != 200:
+                        raise Exception(f"Failed to fetch homepage: status {response.status}")
+
+                    try:
+                        text = await response.text()
+                    except UnicodeDecodeError:
+                        text = await response.read()
+                        text = text.decode('utf-8', errors='ignore')
+
             soup = BeautifulSoup(text, "html.parser")
 
             # Extract site name with more fallbacks
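The pattern both hunks add is the same: build an ssl.SSLContext with verification disabled, hand it to an aiohttp.TCPConnector, fetch with a 30-second overall timeout, and fall back to a lossy UTF-8 decode when response.text() cannot determine the encoding. The only difference is failure handling: the page crawler logs the bad status and returns an empty list, while the homepage processor raises, treating an unreachable homepage as fatal. Below is a minimal standalone sketch of that fetch pattern, assuming only aiohttp and the standard library; the fetch_page name and the example URL are illustrative, not taken from the Space's code.

import asyncio
import ssl

import aiohttp


async def fetch_page(url, headers=None):
    """Fetch a page the way this commit does, ignoring TLS certificate errors."""
    # Accept self-signed / expired certificates (the commit's workaround).
    # check_hostname must be disabled before verify_mode can be set to CERT_NONE.
    ssl_context = ssl.create_default_context()
    ssl_context.check_hostname = False
    ssl_context.verify_mode = ssl.CERT_NONE

    connector = aiohttp.TCPConnector(ssl=ssl_context)
    timeout = aiohttp.ClientTimeout(total=30)

    async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
        async with session.get(url, headers=headers, allow_redirects=True) as response:
            if response.status != 200:
                return None  # the crawler logs and skips; the homepage path raises
            try:
                return await response.text()
            except UnicodeDecodeError:
                # Fall back to a lossy decode when the encoding can't be detected.
                raw = await response.read()
                return raw.decode("utf-8", errors="ignore")


if __name__ == "__main__":
    html = asyncio.run(fetch_page("https://example.com"))
    print(html[:200] if html else "fetch failed")

Disabling verification with check_hostname = False and ssl.CERT_NONE lets the crawler reach sites with self-signed or expired certificates, which is evidently the goal of this commit, but it also gives up protection against man-in-the-middle attacks; that trade-off is defensible for a read-only crawler, not for a general-purpose HTTP client.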