cyberandy committed
Commit 3ab2a75 · verified · 1 Parent(s): da08b01

Update app.py

Files changed (1)
app.py +33 -35
app.py CHANGED
@@ -141,9 +141,8 @@ class WebsiteCrawler:
             return ""
 
         return cleaned
-
-
 
+
     def clean_description(self, desc):
         """Clean description text"""
         if not desc:
@@ -189,37 +188,37 @@ class WebsiteCrawler:
 
         return None
 
-    async def crawl_page(self, url, depth, base_domain):
-        """Crawl a single page and extract information"""
-        if (
-            depth > self.max_depth
-            or url in self.visited_urls
-            or len(self.visited_urls) >= self.max_pages
-        ):
-            return []
-
-        try:
-            await asyncio.sleep(1)  # Be polite to servers
-            async with aiohttp.ClientSession() as session:
-                async with session.get(url, headers=self.headers, allow_redirects=True) as response:
-                    if response.status == 403:
-                        # Try with alternative headers
-                        alt_headers = {
-                            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
-                            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
-                        }
-                        async with session.get(url, headers=alt_headers, allow_redirects=True) as retry_response:
-                            if retry_response.status != 200:
-                                return []
-                            text = await retry_response.text()
-                    elif response.status != 200:
-                        return []
-                    else:
-                        text = await response.text()
-
-            self.visited_urls.add(url)
-            soup = BeautifulSoup(text, "html.parser")
-
+    async def crawl_page(self, url, depth, base_domain):
+        """Crawl a single page and extract information"""
+        if (
+            depth > self.max_depth
+            or url in self.visited_urls
+            or len(self.visited_urls) >= self.max_pages
+        ):
+            return []
+
+        try:
+            await asyncio.sleep(1)  # Be polite to servers
+            async with aiohttp.ClientSession() as session:
+                async with session.get(url, headers=self.headers, allow_redirects=True) as response:
+                    if response.status == 403:
+                        # Try with alternative headers
+                        alt_headers = {
+                            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
+                            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
+                        }
+                        async with session.get(url, headers=alt_headers, allow_redirects=True) as retry_response:
+                            if retry_response.status != 200:
+                                return []
+                            text = await retry_response.text()
+                    elif response.status != 200:
+                        return []
+                    else:
+                        text = await response.text()
+
+            self.visited_urls.add(url)
+            soup = BeautifulSoup(text, "html.parser")
+
             # Extract title with fallbacks
             title = None
             meta_title = soup.find("meta", property="og:title")
@@ -283,13 +282,12 @@ class WebsiteCrawler:
                 next_url = urljoin(url, href)
                 if urlparse(next_url).netloc == base_domain:
                     links.append(next_url)
-
             return links
 
         except Exception as e:
             logger.error(f"Error crawling {url}: {str(e)}")
             return []
-
+
     async def process_homepage(self, url):
         """Specifically process the homepage to extract key metadata"""
         try:
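The context lines in this last hunk show how internal links are collected: each href is resolved against the current URL with urljoin, and only URLs whose netloc matches the starting domain are kept. A minimal sketch of that filter, assuming the hrefs come from anchor tags in the parsed page (collect_internal_links is a hypothetical name, not a function in app.py):

from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup

def collect_internal_links(page_url, html, base_domain):
    """Hypothetical helper mirroring the same-domain link filter shown above."""
    soup = BeautifulSoup(html, "html.parser")
    links = []
    for a in soup.find_all("a", href=True):          # assumption: links come from <a href=...> tags
        next_url = urljoin(page_url, a["href"])       # resolve relative hrefs against the page URL
        if urlparse(next_url).netloc == base_domain:  # keep same-domain URLs only
            links.append(next_url)
    return links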
 
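Taken together, crawl_page returns the same-domain links it discovered, so a caller can drive a simple breadth-first crawl. The diff does not show how app.py orchestrates this; the loop below is only a sketch of one way it could be wired up, assuming a WebsiteCrawler instance exposing the crawl_page, visited_urls, and max_pages members used above (its constructor arguments are not shown in this diff):

import asyncio
from urllib.parse import urlparse

async def crawl_site(crawler, start_url):
    """Hypothetical driver: breadth-first crawl using crawl_page's returned links."""
    base_domain = urlparse(start_url).netloc
    frontier = [(start_url, 0)]  # (url, depth) pairs still to visit
    while frontier and len(crawler.visited_urls) < crawler.max_pages:
        url, depth = frontier.pop(0)
        new_links = await crawler.crawl_page(url, depth, base_domain)
        frontier.extend((link, depth + 1) for link in new_links if link not in crawler.visited_urls)

# e.g. asyncio.run(crawl_site(WebsiteCrawler(), "https://example.com"))  # constructor args are an assumption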