Update app.py
app.py CHANGED
@@ -141,9 +141,8 @@ class WebsiteCrawler:
             return ""
 
         return cleaned
-
-
 
+
     def clean_description(self, desc):
         """Clean description text"""
         if not desc:
@@ -189,37 +188,37 @@ class WebsiteCrawler:
 
         return None
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    async def crawl_page(self, url, depth, base_domain):
+        """Crawl a single page and extract information"""
+        if (
+            depth > self.max_depth
+            or url in self.visited_urls
+            or len(self.visited_urls) >= self.max_pages
+        ):
+            return []
+
+        try:
+            await asyncio.sleep(1)  # Be polite to servers
+            async with aiohttp.ClientSession() as session:
+                async with session.get(url, headers=self.headers, allow_redirects=True) as response:
+                    if response.status == 403:
+                        # Try with alternative headers
+                        alt_headers = {
+                            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
+                            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+                        }
+                        async with session.get(url, headers=alt_headers, allow_redirects=True) as retry_response:
+                            if retry_response.status != 200:
+                                return []
+                            text = await retry_response.text()
+                    elif response.status != 200:
+                        return []
+                    else:
+                        text = await response.text()
+
+                    self.visited_urls.add(url)
+                    soup = BeautifulSoup(text, "html.parser")
+
                     # Extract title with fallbacks
                     title = None
                     meta_title = soup.find("meta", property="og:title")
@@ -283,13 +282,12 @@ class WebsiteCrawler:
                        next_url = urljoin(url, href)
                        if urlparse(next_url).netloc == base_domain:
                            links.append(next_url)
-
                    return links
 
        except Exception as e:
            logger.error(f"Error crawling {url}: {str(e)}")
            return []
-
+
    async def process_homepage(self, url):
        """Specifically process the homepage to extract key metadata"""
        try: