Update app.py
app.py CHANGED
@@ -8,6 +8,7 @@ import aiohttp
 from collections import defaultdict
 import unicodedata
 import logging
+import ssl
 
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)
@@ -188,13 +189,31 @@ class WebsiteCrawler:
 
         try:
             await asyncio.sleep(1)  # Be polite to servers
-
-
-
+
+            # Configure SSL context to handle certificate issues
+            ssl_context = ssl.create_default_context()
+            ssl_context.check_hostname = False
+            ssl_context.verify_mode = ssl.CERT_NONE
+
+            connector = aiohttp.TCPConnector(ssl=ssl_context)
+            timeout = aiohttp.ClientTimeout(total=30)
+
+            async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
+                async with session.get(url, headers=self.headers, allow_redirects=True) as response:
+                    if response.status != 200:
+                        logger.error(f"Error status {response.status} for URL: {url}")
+                        return []
+
+                    try:
+                        text = await response.text()
+                    except UnicodeDecodeError:
+                        text = await response.read()
+                        text = text.decode('utf-8', errors='ignore')
+
             self.visited_urls.add(url)
 
             soup = BeautifulSoup(text, "html.parser")
-
+
             # Extract title with fallbacks
             title = None
             meta_title = soup.find("meta", property="og:title")
@@ -266,9 +285,25 @@ class WebsiteCrawler:
     async def process_homepage(self, url):
         """Specifically process the homepage to extract key metadata"""
         try:
-
-
-
+            # Configure SSL context
+            ssl_context = ssl.create_default_context()
+            ssl_context.check_hostname = False
+            ssl_context.verify_mode = ssl.CERT_NONE
+
+            connector = aiohttp.TCPConnector(ssl=ssl_context)
+            timeout = aiohttp.ClientTimeout(total=30)
+
+            async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
+                async with session.get(url, headers=self.headers, allow_redirects=True) as response:
+                    if response.status != 200:
+                        raise Exception(f"Failed to fetch homepage: status {response.status}")
+
+                    try:
+                        text = await response.text()
+                    except UnicodeDecodeError:
+                        text = await response.read()
+                        text = text.decode('utf-8', errors='ignore')
+
             soup = BeautifulSoup(text, "html.parser")
 
             # Extract site name with more fallbacks
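The pattern both hunks add is the same: build an ssl.SSLContext with verification disabled, hand it to an aiohttp.TCPConnector, fetch with a 30-second overall timeout, and fall back to a lossy UTF-8 decode when response.text() cannot determine the encoding. The only difference is failure handling: the page crawler logs the bad status and returns an empty list, while the homepage processor raises, treating an unreachable homepage as fatal. Below is a minimal standalone sketch of that fetch pattern, assuming only aiohttp and the standard library; the fetch_page name and the example URL are illustrative, not taken from the Space's code.

import asyncio
import ssl

import aiohttp


async def fetch_page(url, headers=None):
    """Fetch a page the way this commit does, ignoring TLS certificate errors."""
    # Accept self-signed / expired certificates (the commit's workaround).
    # check_hostname must be disabled before verify_mode can be set to CERT_NONE.
    ssl_context = ssl.create_default_context()
    ssl_context.check_hostname = False
    ssl_context.verify_mode = ssl.CERT_NONE

    connector = aiohttp.TCPConnector(ssl=ssl_context)
    timeout = aiohttp.ClientTimeout(total=30)

    async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
        async with session.get(url, headers=headers, allow_redirects=True) as response:
            if response.status != 200:
                return None  # the crawler logs and skips; the homepage path raises
            try:
                return await response.text()
            except UnicodeDecodeError:
                # Fall back to a lossy decode when the encoding can't be detected.
                raw = await response.read()
                return raw.decode("utf-8", errors="ignore")


if __name__ == "__main__":
    html = asyncio.run(fetch_page("https://example.com"))
    print(html[:200] if html else "fetch failed")

Disabling verification with check_hostname = False and ssl.CERT_NONE lets the crawler reach sites with self-signed or expired certificates, which is evidently the goal of this commit, but it also gives up protection against man-in-the-middle attacks; that trade-off is defensible for a read-only crawler, not for a general-purpose HTTP client.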