Update app.py
app.py CHANGED
@@ -189,39 +189,34 @@ class WebsiteCrawler:
 
         return None
 
-
-
-
-
-
-
+    async def crawl_page(self, url, depth, base_domain):
+        """Crawl a single page and extract information"""
+        if (
+            depth > self.max_depth
+            or url in self.visited_urls
+            or len(self.visited_urls) >= self.max_pages
+        ):
+            return []
+
         try:
-            await asyncio.sleep(1
+            await asyncio.sleep(1)  # Be polite to servers
             async with aiohttp.ClientSession() as session:
                 async with session.get(url, headers=self.headers, allow_redirects=True) as response:
                     if response.status == 403:
-
-
-
-                            "
-                            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
-                            "Accept-Language": "en-US,en;q=0.5",
-                            "Accept-Encoding": "gzip, deflate, br",
-                            "DNT": "1",
-                            "Connection": "keep-alive",
-                            "Upgrade-Insecure-Requests": "1"
+                        # Try with alternative headers
+                        alt_headers = {
+                            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
+                            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                         }
-                        async with session.get(url, headers=
-                            if retry_response.status
-
-
-                            continue
+                        async with session.get(url, headers=alt_headers, allow_redirects=True) as retry_response:
+                            if retry_response.status != 200:
+                                return []
+                            text = await retry_response.text()
                     elif response.status != 200:
-                        logger.error(f"Error status {response.status} for URL: {url}")
                         return []
                     else:
                         text = await response.text()
-
+
                     self.visited_urls.add(url)
                     soup = BeautifulSoup(text, "html.parser")
 
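The new 403 branch drops the old retry loop in favor of a single follow-up request that swaps in browser-like headers. A minimal sketch of that pattern in isolation, assuming the hypothetical names fetch_html, DEFAULT_HEADERS, and FALLBACK_HEADERS (none of which appear in app.py):

# Sketch only: a one-shot 403 fallback, mirroring the pattern used in crawl_page.
# fetch_html, DEFAULT_HEADERS, and FALLBACK_HEADERS are illustrative names, not part of app.py.
import asyncio

import aiohttp

DEFAULT_HEADERS = {"User-Agent": "MyCrawler/1.0"}
FALLBACK_HEADERS = {
    # Browser-like headers; some servers return 403 for unfamiliar user agents.
    "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
}

async def fetch_html(url):
    """Return the page body, retrying once with browser-like headers on 403."""
    async with aiohttp.ClientSession() as session:
        async with session.get(url, headers=DEFAULT_HEADERS, allow_redirects=True) as response:
            if response.status == 200:
                return await response.text()
            if response.status != 403:
                return None
        # Single retry with alternative headers instead of a retry loop.
        async with session.get(url, headers=FALLBACK_HEADERS, allow_redirects=True) as retry:
            return await retry.text() if retry.status == 200 else None

if __name__ == "__main__":
    html = asyncio.run(fetch_html("https://example.com"))
    print("fetched" if html else "failed")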
@@ -288,17 +283,13 @@ class WebsiteCrawler:
                         next_url = urljoin(url, href)
                         if urlparse(next_url).netloc == base_domain:
                             links.append(next_url)
-                            return links
 
+                    return links
+
         except Exception as e:
-            logger.error(f"
-
-            return []
-            continue
-
-            break  # If we get here, the request succeeded
+            logger.error(f"Error crawling {url}: {str(e)}")
+            return []
 
-
     async def process_homepage(self, url):
         """Specifically process the homepage to extract key metadata"""
         try:
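Since crawl_page is an async coroutine that returns the same-domain links it finds, a caller has to drive it from an event loop. Below is a hypothetical driver; only crawl_page's signature and return value are taken from the diff, and the WebsiteCrawler constructor arguments are a guess (the diff does not show __init__), so the instantiation is left commented out.

# Hypothetical breadth-first driver; everything beyond crawl_page's signature is an assumption.
import asyncio
from urllib.parse import urlparse

async def crawl_site(crawler, start_url):
    """Walk the site starting from start_url using crawl_page's returned links."""
    base_domain = urlparse(start_url).netloc
    queue = [(start_url, 0)]
    while queue:
        url, depth = queue.pop(0)
        # crawl_page returns [] once max_depth/max_pages is hit or the URL was already visited.
        links = await crawler.crawl_page(url, depth, base_domain)
        queue.extend((link, depth + 1) for link in links)

# Assumed constructor; adjust to the real WebsiteCrawler signature:
# crawler = WebsiteCrawler(max_depth=2, max_pages=50)
# asyncio.run(crawl_site(crawler, "https://example.com"))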