cyberandy committed on
Commit
fe2936f
·
verified ·
1 Parent(s): e769bef

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +42 -7
app.py CHANGED
@@ -8,6 +8,7 @@ import aiohttp
8
  from collections import defaultdict
9
  import unicodedata
10
  import logging
 
11
 
12
  logging.basicConfig(level=logging.INFO)
13
  logger = logging.getLogger(__name__)
@@ -188,13 +189,31 @@ class WebsiteCrawler:
188
 
189
  try:
190
  await asyncio.sleep(1) # Be polite to servers
191
- async with aiohttp.ClientSession() as session:
192
- async with session.get(url, headers=self.headers, timeout=10) as response:
193
- text = await response.text()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
194
  self.visited_urls.add(url)
195
 
196
  soup = BeautifulSoup(text, "html.parser")
197
-
198
  # Extract title with fallbacks
199
  title = None
200
  meta_title = soup.find("meta", property="og:title")
@@ -266,9 +285,25 @@ class WebsiteCrawler:
266
  async def process_homepage(self, url):
267
  """Specifically process the homepage to extract key metadata"""
268
  try:
269
- async with aiohttp.ClientSession() as session:
270
- async with session.get(url, headers=self.headers, timeout=10) as response:
271
- text = await response.text()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
272
  soup = BeautifulSoup(text, "html.parser")
273
 
274
  # Extract site name with more fallbacks
 
8
  from collections import defaultdict
9
  import unicodedata
10
  import logging
11
+ import ssl
12
 
13
  logging.basicConfig(level=logging.INFO)
14
  logger = logging.getLogger(__name__)
 
189
 
190
  try:
191
  await asyncio.sleep(1) # Be polite to servers
192
+
193
+ # Configure SSL context to handle certificate issues
194
+ ssl_context = ssl.create_default_context()
195
+ ssl_context.check_hostname = False
196
+ ssl_context.verify_mode = ssl.CERT_NONE
197
+
198
+ connector = aiohttp.TCPConnector(ssl=ssl_context)
199
+ timeout = aiohttp.ClientTimeout(total=30)
200
+
201
+ async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
202
+ async with session.get(url, headers=self.headers, allow_redirects=True) as response:
203
+ if response.status != 200:
204
+ logger.error(f"Error status {response.status} for URL: {url}")
205
+ return []
206
+
207
+ try:
208
+ text = await response.text()
209
+ except UnicodeDecodeError:
210
+ text = await response.read()
211
+ text = text.decode('utf-8', errors='ignore')
212
+
213
  self.visited_urls.add(url)
214
 
215
  soup = BeautifulSoup(text, "html.parser")
216
+
217
  # Extract title with fallbacks
218
  title = None
219
  meta_title = soup.find("meta", property="og:title")
 
285
  async def process_homepage(self, url):
286
  """Specifically process the homepage to extract key metadata"""
287
  try:
288
+ # Configure SSL context
289
+ ssl_context = ssl.create_default_context()
290
+ ssl_context.check_hostname = False
291
+ ssl_context.verify_mode = ssl.CERT_NONE
292
+
293
+ connector = aiohttp.TCPConnector(ssl=ssl_context)
294
+ timeout = aiohttp.ClientTimeout(total=30)
295
+
296
+ async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
297
+ async with session.get(url, headers=self.headers, allow_redirects=True) as response:
298
+ if response.status != 200:
299
+ raise Exception(f"Failed to fetch homepage: status {response.status}")
300
+
301
+ try:
302
+ text = await response.text()
303
+ except UnicodeDecodeError:
304
+ text = await response.read()
305
+ text = text.decode('utf-8', errors='ignore')
306
+
307
  soup = BeautifulSoup(text, "html.parser")
308
 
309
  # Extract site name with more fallbacks