cyberandy committed (verified)
Commit da08b01 · 1 Parent(s): c12eb63

Update app.py

Files changed (1)
  1. app.py +23 -32
app.py CHANGED
@@ -189,39 +189,34 @@ class WebsiteCrawler:
 
         return None
 
-    async def crawl_page(self, url, depth, base_domain):
-        if depth > self.max_depth or url in self.visited_urls or len(self.visited_urls) >= self.max_pages:
-            return []
-
-        max_retries = 3
-        for attempt in range(max_retries):
+    async def crawl_page(self, url, depth, base_domain):
+        """Crawl a single page and extract information"""
+        if (
+            depth > self.max_depth
+            or url in self.visited_urls
+            or len(self.visited_urls) >= self.max_pages
+        ):
+            return []
+
         try:
-            await asyncio.sleep(1 + attempt)  # Increasing delay between retries
+            await asyncio.sleep(1)  # Be polite to servers
             async with aiohttp.ClientSession() as session:
                 async with session.get(url, headers=self.headers, allow_redirects=True) as response:
                     if response.status == 403:
-                        logger.info(f"Access forbidden for {url}, trying with different headers")
-                        # Try with different headers
-                        headers = {
-                            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
-                            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
-                            "Accept-Language": "en-US,en;q=0.5",
-                            "Accept-Encoding": "gzip, deflate, br",
-                            "DNT": "1",
-                            "Connection": "keep-alive",
-                            "Upgrade-Insecure-Requests": "1"
+                        # Try with alternative headers
+                        alt_headers = {
+                            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
+                            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                         }
-                        async with session.get(url, headers=headers, allow_redirects=True) as retry_response:
-                            if retry_response.status == 200:
-                                text = await retry_response.text()
-                            else:
-                                continue
+                        async with session.get(url, headers=alt_headers, allow_redirects=True) as retry_response:
+                            if retry_response.status != 200:
+                                return []
+                            text = await retry_response.text()
                     elif response.status != 200:
-                        logger.error(f"Error status {response.status} for URL: {url}")
                         return []
                     else:
                         text = await response.text()
-
+
                     self.visited_urls.add(url)
                     soup = BeautifulSoup(text, "html.parser")
 
@@ -288,17 +283,13 @@ class WebsiteCrawler:
                         next_url = urljoin(url, href)
                         if urlparse(next_url).netloc == base_domain:
                             links.append(next_url)
-                    return links
 
+            return links
+
         except Exception as e:
-            logger.error(f"Attempt {attempt + 1} failed for {url}: {str(e)}")
-            if attempt == max_retries - 1:
-                return []
-            continue
-
-        break  # If we get here, the request succeeded
+            logger.error(f"Error crawling {url}: {str(e)}")
+            return []
 
-
     async def process_homepage(self, url):
         """Specifically process the homepage to extract key metadata"""
         try:
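
As a usage note, here is a minimal sketch of how the reworked crawl_page could be driven by a breadth-first loop. It is not taken from this commit: the no-argument WebsiteCrawler() constructor, the assumption that app.py can be imported without side effects, and the example.com start URL are all hypothetical.

    # Hypothetical driver for crawl_page; everything below is illustrative,
    # not code from this commit.
    import asyncio
    from urllib.parse import urlparse

    from app import WebsiteCrawler  # assumes app.py imports without side effects


    async def demo():
        crawler = WebsiteCrawler()          # assumed default constructor
        start_url = "https://example.com/"  # placeholder start URL
        base_domain = urlparse(start_url).netloc

        # Breadth-first walk: crawl_page returns same-domain links, and [] on
        # errors, repeat visits, or once max_depth / max_pages is exceeded.
        queue = [(start_url, 0)]
        while queue:
            url, depth = queue.pop(0)
            links = await crawler.crawl_page(url, depth, base_domain)
            queue.extend((link, depth + 1) for link in links or [])


    if __name__ == "__main__":
        asyncio.run(demo())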