cyberandy committed
Commit
970c25e
1 Parent(s): 718decc
Files changed (1)
  1. app.py +207 -130
app.py CHANGED
@@ -28,121 +28,167 @@ class WebsiteCrawler:
             "Accept-Encoding": "gzip, deflate, br",
             "DNT": "1",
             "Connection": "keep-alive",
-            "Upgrade-Insecure-Requests": "1"
+            "Upgrade-Insecure-Requests": "1",
         }

    def determine_category_importance(self, url, title, desc):
        """Improved category detection"""
        url_lower = url.lower()
        path = urlparse(url).path.lower()
-
+
        # Homepage
        if path == "/" or path == "":
            return "Main", 10
-
+
        # Documentation and Help
-        if any(x in url_lower for x in ['/docs', '/documentation', '/faq', '/help', 'frequently-asked-questions']):
+        if any(
+            x in url_lower
+            for x in [
+                "/docs",
+                "/documentation",
+                "/faq",
+                "/help",
+                "frequently-asked-questions",
+            ]
+        ):
            return "Documentation", 8
-
+
        # API and Developer
-        elif any(x in url_lower for x in ['/api', '/developer', 'developers']):
+        elif any(x in url_lower for x in ["/api", "/developer", "developers"]):
            return "API", 8
-
+
        # About/Company pages
-        elif any(x in url_lower for x in [
-            '/about', '/company', '/references', '/work-with-us',
-            'careers', '/team', '/contact', '/about-us'
-        ]):
+        elif any(
+            x in url_lower
+            for x in [
+                "/about",
+                "/company",
+                "/references",
+                "/work-with-us",
+                "careers",
+                "/team",
+                "/contact",
+                "/about-us",
+            ]
+        ):
            return "About", 7
-
-        # News and Events
-        elif any(x in url_lower for x in [
-            '/news', '/blog', '/events', '/press',
-            'research', 'power-of', 'latest'
-        ]):
+
+        # News and Events
+        elif any(
+            x in url_lower
+            for x in [
+                "/news",
+                "/blog",
+                "/events",
+                "/press",
+                "research",
+                "power-of",
+                "latest",
+            ]
+        ):
            return "News", 5
-
+
        # Tools and Services
-        elif any(x in url_lower for x in [
-            '/tools', '/quote', '/pricing', '/services',
-            '/translate', '/order', '/buy'
-        ]):
+        elif any(
+            x in url_lower
+            for x in [
+                "/tools",
+                "/quote",
+                "/pricing",
+                "/services",
+                "/translate",
+                "/order",
+                "/buy",
+            ]
+        ):
            return "Tools", 6
-
+
        # Check if URL path contains non-ASCII or percent-encoded characters
-        if bool(re.search(r'[^\x00-\x7F]', path)) or bool(re.search(r'%[0-9A-F]{2}', path)):
+        if bool(re.search(r"[^\x00-\x7F]", path)) or bool(
+            re.search(r"%[0-9A-F]{2}", path)
+        ):
            return "Optional", 0
-
+
        return "Optional", 1

    def is_duplicate_content(self, desc, title, url):
        """Improved duplicate/translation detection"""
        if not desc or not title:
            return False
-
+
        # Skip non-latin character URLs or URLs with percent-encoded non-ASCII
-        if bool(re.search(r'[^\x00-\x7F]', url)) or bool(re.search(r'%[0-9A-F]{2}', url)):
+        if bool(re.search(r"[^\x00-\x7F]", url)) or bool(
+            re.search(r"%[0-9A-F]{2}", url)
+        ):
            return True
-
-
+
        # Skip common translation paths
        translation_indicators = [
-            '/welcome', '/bienvenue', '/willkommen', '/benvenuto',
-            '/tervetuloa', '/bienvenido', '/velkommen', '/welkom',
-            'translate.com/', '/translate/', '/translation/'
+            "/welcome",
+            "/bienvenue",
+            "/willkommen",
+            "/benvenuto",
+            "/tervetuloa",
+            "/bienvenido",
+            "/velkommen",
+            "/welkom",
+            "translate.com/",
+            "/translate/",
+            "/translation/",
        ]
        if any(indicator in url.lower() for indicator in translation_indicators):
            url_path = urlparse(url).path.lower()
-            if url_path != '/':  # Don't skip homepage
+            if url_path != "/":  # Don't skip homepage
                return True
-
+
        # Check for similar content length and patterns
        for existing_metadata in self.url_metadata.values():
            existing_desc = existing_metadata.get("description", "")
            existing_title = existing_metadata.get("title", "")
            if not existing_desc or not existing_title:
                continue
-
+
            # If descriptions are very similar in length, likely a translation
-            if (abs(len(desc) - len(existing_desc)) < 20 and
-                len(desc) > 50 and
-                desc != existing_desc):  # Allow exact duplicates for main page
+            if (
+                abs(len(desc) - len(existing_desc)) < 20
+                and len(desc) > 50
+                and desc != existing_desc
+            ):  # Allow exact duplicates for main page
                return True
-
+
        return False

    def clean_text(self, text, is_title=False):
        """Improved text cleaning"""
        if not text or len(text.strip()) < 2:
            return ""
-
+
        # Normalize unicode characters
        text = unicodedata.normalize("NFKD", text)
        text = re.sub(r"[^\x00-\x7F]+", "", text)
-
+
        # Remove any template variables/placeholders
-        text = re.sub(r'\{\{.*?\}\}', '', text)
-        text = re.sub(r'\{\%.*?\%\}', '', text)
-        text = re.sub(r'\${.*?\}', '', text)
-
+        text = re.sub(r"\{\{.*?\}\}", "", text)
+        text = re.sub(r"\{\%.*?\%\}", "", text)
+        text = re.sub(r"\${.*?\}", "", text)
+
        if is_title:
            # Remove common suffixes and fragments for titles
-            text = re.sub(r'^\s*Welcome to\s+', '', text)
-            text = re.sub(r'\s*[\|\-#:•].*', '', text)
-            text = re.sub(r'\s+Homepage$', '', text, flags=re.IGNORECASE)
-
+            text = re.sub(r"^\s*Welcome to\s+", "", text)
+            text = re.sub(r"\s*[\|\-#:•].*", "", text)
+            text = re.sub(r"\s+Homepage$", "", text, flags=re.IGNORECASE)
+
            # Handle overly generic titles
-            if text.lower() in ['features', 'home', 'homepage', 'welcome']:
+            if text.lower() in ["features", "home", "homepage", "welcome"]:
                return ""
-
+
        # Only return if we have meaningful text
        cleaned = " ".join(text.split()).strip()
        if len(cleaned.split()) < 2 and not is_title:  # Allow single-word titles
            return ""
-
+
        return cleaned

-
    def clean_description(self, desc):
        """Clean description text"""
        if not desc:
@@ -154,7 +200,6 @@ class WebsiteCrawler:
            return ""
        return desc.strip()

-
    def extract_homepage_description(self, soup):
        """Extract description from homepage with multiple fallbacks"""
        # Try meta description first
@@ -174,7 +219,9 @@ class WebsiteCrawler:
        # Try first significant paragraph
        for p in soup.find_all("p"):
            text = p.get_text().strip()
-            if len(text) > 50 and not any(x in text.lower() for x in ["cookie", "accept", "privacy"]):
+            if len(text) > 50 and not any(
+                x in text.lower() for x in ["cookie", "accept", "privacy"]
+            ):
                return self.clean_text(text)

        # Try main content area if exists
@@ -196,18 +243,22 @@ class WebsiteCrawler:
            or len(self.visited_urls) >= self.max_pages
        ):
            return []
-
+
        try:
            await asyncio.sleep(1)  # Be polite to servers
            async with aiohttp.ClientSession() as session:
-                async with session.get(url, headers=self.headers, allow_redirects=True) as response:
+                async with session.get(
+                    url, headers=self.headers, allow_redirects=True
+                ) as response:
                    if response.status == 403:
                        # Try with alternative headers
                        alt_headers = {
                            "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
                            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
                        }
-                        async with session.get(url, headers=alt_headers, allow_redirects=True) as retry_response:
+                        async with session.get(
+                            url, headers=alt_headers, allow_redirects=True
+                        ) as retry_response:
                            if retry_response.status != 200:
                                return []
                            text = await retry_response.text()
@@ -215,10 +266,10 @@ class WebsiteCrawler:
                        return []
                    else:
                        text = await response.text()
-
+
                    self.visited_urls.add(url)
-                    soup = BeautifulSoup(text, "html.parser")
-
+                    soup = BeautifulSoup(text, "html.parser")
+
                    # Extract title with fallbacks
                    title = None
                    meta_title = soup.find("meta", property="og:title")
@@ -234,9 +285,9 @@ class WebsiteCrawler:
                        title = h1_tag.text
                    if not title:
                        title = url.split("/")[-1]
-
+
                    title = self.clean_text(title, is_title=True)
-
+
                    # Extract description with fallbacks
                    desc = None
                    meta_desc = soup.find("meta", {"name": "description"})
@@ -250,44 +301,57 @@ class WebsiteCrawler:
                        first_p = soup.find("p")
                        if first_p:
                            desc = first_p.text
-
+
                    desc = self.clean_text(desc) if desc else ""
-
+
                    # Skip if it's duplicate content
                    if self.is_duplicate_content(desc, title, url):
                        return []
-
+
                    # Determine category and importance
-                    category, importance = self.determine_category_importance(url, title, desc)
-
+                    category, importance = self.determine_category_importance(
+                        url, title, desc
+                    )
+
                    # Store metadata
                    clean_url = re.sub(r"#.*", "", url).rstrip("/")
-                    if title and len(title.strip()) > 0:  # Only store if we have a valid title
-                        logger.info(f"Storing metadata for {clean_url}: {title[:30]}...")
+                    if (
+                        title and len(title.strip()) > 0
+                    ):  # Only store if we have a valid title
+                        logger.info(
+                            f"Storing metadata for {clean_url}: {title[:30]}..."
+                        )
                        self.url_metadata[clean_url] = {
                            "title": title,
                            "description": desc,
                            "category": category,
                            "importance": importance,
                        }
-
+
                    # Find links
                    links = []
                    for a in soup.find_all("a", href=True):
                        href = a["href"]
                        if not any(
                            x in href.lower()
-                            for x in ["javascript:", "mailto:", ".pdf", ".jpg", ".png", ".gif"]
+                            for x in [
+                                "javascript:",
+                                "mailto:",
+                                ".pdf",
+                                ".jpg",
+                                ".png",
+                                ".gif",
+                            ]
                        ):
                            next_url = urljoin(url, href)
                            if urlparse(next_url).netloc == base_domain:
                                links.append(next_url)
                    return links
-
+
        except Exception as e:
            logger.error(f"Error crawling {url}: {str(e)}")
            return []
-
+
    async def process_homepage(self, url):
        """Specifically process the homepage to extract key metadata"""
        try:
@@ -295,94 +359,103 @@ class WebsiteCrawler:
            ssl_context = ssl.create_default_context()
            ssl_context.check_hostname = False
            ssl_context.verify_mode = ssl.CERT_NONE
-
+
            connector = aiohttp.TCPConnector(ssl=ssl_context)
            timeout = aiohttp.ClientTimeout(total=30)
-
-            async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
-                async with session.get(url, headers=self.headers, allow_redirects=True) as response:
+
+            async with aiohttp.ClientSession(
+                connector=connector, timeout=timeout
+            ) as session:
+                async with session.get(
+                    url, headers=self.headers, allow_redirects=True
+                ) as response:
                    if response.status != 200:
-                        raise Exception(f"Failed to fetch homepage: status {response.status}")
-
+                        raise Exception(
+                            f"Failed to fetch homepage: status {response.status}"
+                        )
+
                    try:
                        text = await response.text()
                    except UnicodeDecodeError:
                        text = await response.read()
-                        text = text.decode('utf-8', errors='ignore')
-
+                        text = text.decode("utf-8", errors="ignore")
+
                    soup = BeautifulSoup(text, "html.parser")
-
+
                    # Extract site name with more fallbacks
                    site_name = None
                    # Try meta tags first
                    site_meta = soup.find("meta", property="og:site_name")
                    if site_meta and site_meta.get("content"):
                        site_name = site_meta["content"]
-
+
                    # Try structured data
                    if not site_name:
                        schema = soup.find("script", type="application/ld+json")
                        if schema:
                            try:
                                import json
+
                                data = json.loads(schema.string)
                                if isinstance(data, dict):
-                                    site_name = data.get("name") or data.get("organizationName")
+                                    site_name = data.get("name") or data.get(
+                                        "organizationName"
+                                    )
                            except:
                                pass
-
+
                    # Try title tag
                    if not site_name:
                        title_tag = soup.find("title")
                        if title_tag:
-                            site_name = title_tag.text.split('|')[0].strip()
-
+                            site_name = title_tag.text.split("|")[0].strip()
+
                    # Last resort - use domain name
                    if not site_name:
-                        site_name = urlparse(url).netloc.split('.')[0].capitalize()
-
+                        site_name = urlparse(url).netloc.split(".")[0].capitalize()
+
                    # Get homepage description
                    description = self.extract_homepage_description(soup)
-
+
                    self.homepage_metadata = {
                        "site_name": self.clean_text(site_name, is_title=True),
-                        "description": description
+                        "description": description,
                    }
-
+
        except Exception as e:
            logger.error(f"Error processing homepage {url}: {str(e)}")
            self.homepage_metadata = {
-                "site_name": urlparse(url).netloc.split('.')[0].capitalize(),
-                "description": None
+                "site_name": urlparse(url).netloc.split(".")[0].capitalize(),
+                "description": None,
            }
-
+
    async def crawl_website(self, start_url):
        """Crawl website starting from the given URL"""
        try:
            # First process the homepage
            logger.info(f"Processing homepage: {start_url}")
            await self.process_homepage(start_url)
-
+
            base_domain = urlparse(start_url).netloc
            queue = [(start_url, 0)]
            seen = {start_url}
-
+
            while queue and len(self.visited_urls) < self.max_pages:
                current_url, depth = queue.pop(0)
                if depth > self.max_depth:
                    continue
-
+
                logger.info(f"Crawling page: {current_url} (depth: {depth})")
                links = await self.crawl_page(current_url, depth, base_domain)
                logger.info(f"Found {len(links)} links on {current_url}")
-
+
                for link in links:
                    if link not in seen and urlparse(link).netloc == base_domain:
                        seen.add(link)
                        queue.append((link, depth + 1))
-
+
            logger.info(f"Crawl completed. Visited {len(self.visited_urls)} pages")
-
+
        except Exception as e:
            logger.error(f"Error during crawl: {str(e)}")
            raise
@@ -390,15 +463,15 @@ class WebsiteCrawler:
    def generate_llms_txt(self):
        """Generate llms.txt content"""
        logger.info(f"Starting generate_llms_txt with {len(self.url_metadata)} URLs")
-
+
        if not self.url_metadata:
            logger.error("No URL metadata found")
            return "No content was found to generate llms.txt"
-
+
        # Sort URLs by importance and remove duplicates
        sorted_urls = []
        seen_titles = set()
-
+
        for url, metadata in sorted(
            self.url_metadata.items(),
            key=lambda x: (x[1]["importance"], x[0]),
@@ -407,23 +480,23 @@ class WebsiteCrawler:
            if metadata["title"] not in seen_titles:
                sorted_urls.append((url, metadata))
                seen_titles.add(metadata["title"])
-
+
        logger.info(f"Found {len(sorted_urls)} unique URLs after deduplication")
-
+
        if not sorted_urls:
            logger.error("No valid URLs found after sorting")
            return "No valid content was found"
-
+
        # Generate content
        content = []
-
+
        # Use homepage metadata for main title and description
        main_title = self.homepage_metadata.get("site_name", "Welcome")
        homepage_description = self.homepage_metadata.get("description")
-
+
        logger.info(f"Homepage title: {main_title}")
        logger.info(f"Homepage description: {homepage_description}")
-
+
        content.append(f"# {main_title}")
        if homepage_description:
            content.append(f"\n> {homepage_description}")
@@ -434,15 +507,15 @@ class WebsiteCrawler:
                if desc and len(desc) > 20 and "null" not in desc.lower():
                    content.append(f"\n> {desc}")
                    break
-
+
        # Group by category
        categories = defaultdict(list)
        for url, metadata in sorted_urls:
            if metadata["title"] and url:
                categories[metadata["category"]].append((url, metadata))
-
+
        logger.info(f"Categories found: {list(categories.keys())}")
-
+
        # Add sections in a logical order
        category_order = [
            "Main",
@@ -451,49 +524,53 @@ class WebsiteCrawler:
            "Tools",
            "About",
            "News",
-            "Optional"
+            "Optional",
        ]
-
+
        # Only show Main section if it has content different from the homepage description
        if "Main" in categories:
            main_content = categories["Main"]
-            if len(main_content) == 1 and main_content[0][1]["description"] == homepage_description:
+            if (
+                len(main_content) == 1
+                and main_content[0][1]["description"] == homepage_description
+            ):
                logger.info("Removing duplicate Main content")
                del categories["Main"]
-
+
        for category in category_order:
            if category in categories and categories[category]:
-                logger.info(f"Processing category {category} with {len(categories[category])} items")
+                logger.info(
+                    f"Processing category {category} with {len(categories[category])} items"
+                )
                content.append(f"\n## {category}")
-
+
                # Sort links within category by importance and description length
                category_links = sorted(
                    categories[category],
-                    key=lambda x: (-len(x[1]["description"] or ""), x[1]["title"])
+                    key=lambda x: (-len(x[1]["description"] or ""), x[1]["title"]),
                )
-
+
                links = []
                seen_desc = set()  # Avoid duplicate descriptions within category
                for url, metadata in category_links:
                    title = metadata["title"].strip()
                    desc = self.clean_description(metadata["description"])
-
+
                    # Skip if description is duplicate within category
                    if desc in seen_desc:
                        continue
                    seen_desc.add(desc)
-
+
                    if desc:
                        links.append(f"- [{title}]({url}): {desc}")
                    else:
                        links.append(f"- [{title}]({url})")
-
+
                content.append("\n".join(links))
-
+
        final_content = "\n".join(content)
        logger.info(f"Generated content length: {len(final_content)}")
        return final_content
-


async def process_url(url, max_depth, max_pages):
@@ -509,11 +586,11 @@ async def process_url(url, max_depth, max_pages):
        return "", "Invalid URL format. Please enter a valid URL."

    logger.info(f"Starting crawl of {url}")
-
+
    # Process website
    crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
    await crawler.crawl_website(url)
-
+
    logger.info("Generating llms.txt content")
    content = crawler.generate_llms_txt()

@@ -606,4 +683,4 @@ with gr.Blocks(
    )

if __name__ == "__main__":
-    iface.launch()
+    iface.launch()
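
For orientation, a minimal usage sketch (not part of commit 970c25e) of driving the crawler in app.py outside the Gradio UI. The `from app import WebsiteCrawler` import, the example URL, and the depth/page limits are assumptions for illustration; the constructor keywords and the `crawl_website` / `generate_llms_txt` calls mirror what the diff above shows.

# Usage sketch (hypothetical): run the crawler directly instead of via the Gradio interface.
# Assumes app.py is importable from the working directory; URL and limits are placeholders.
import asyncio

from app import WebsiteCrawler


async def main():
    crawler = WebsiteCrawler(max_depth=3, max_pages=50)
    # Processes the homepage first, then same-domain links up to the configured limits.
    await crawler.crawl_website("https://example.com")
    # Assembles the llms.txt text from the collected page metadata.
    print(crawler.generate_llms_txt())


if __name__ == "__main__":
    asyncio.run(main())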