cyberandy committed on
Commit
b2ecedb
·
1 Parent(s): 970c25e
Files changed (1) hide show
  1. app.py +87 -515
app.py CHANGED
@@ -8,7 +8,6 @@ import aiohttp
8
  from collections import defaultdict
9
  import unicodedata
10
  import logging
11
- import ssl
12
 
13
  logging.basicConfig(level=logging.INFO)
14
  logger = logging.getLogger(__name__)
@@ -23,220 +22,44 @@ class WebsiteCrawler:
23
  self.homepage_metadata = None
24
  self.headers = {
25
  "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
26
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8",
27
- "Accept-Language": "en-US,en;q=0.5",
28
- "Accept-Encoding": "gzip, deflate, br",
29
- "DNT": "1",
30
- "Connection": "keep-alive",
31
- "Upgrade-Insecure-Requests": "1",
32
  }
33
 
34
  def determine_category_importance(self, url, title, desc):
35
- """Improved category detection"""
36
  url_lower = url.lower()
37
  path = urlparse(url).path.lower()
38
 
39
- # Homepage
40
  if path == "/" or path == "":
41
  return "Main", 10
42
 
43
- # Documentation and Help
44
- if any(
45
- x in url_lower
46
- for x in [
47
- "/docs",
48
- "/documentation",
49
- "/faq",
50
- "/help",
51
- "frequently-asked-questions",
52
- ]
53
- ):
54
  return "Documentation", 8
55
 
56
- # API and Developer
57
- elif any(x in url_lower for x in ["/api", "/developer", "developers"]):
58
  return "API", 8
59
 
60
- # About/Company pages
61
- elif any(
62
- x in url_lower
63
- for x in [
64
- "/about",
65
- "/company",
66
- "/references",
67
- "/work-with-us",
68
- "careers",
69
- "/team",
70
- "/contact",
71
- "/about-us",
72
- ]
73
- ):
74
  return "About", 7
75
 
76
- # News and Events
77
- elif any(
78
- x in url_lower
79
- for x in [
80
- "/news",
81
- "/blog",
82
- "/events",
83
- "/press",
84
- "research",
85
- "power-of",
86
- "latest",
87
- ]
88
- ):
89
  return "News", 5
90
 
91
- # Tools and Services
92
- elif any(
93
- x in url_lower
94
- for x in [
95
- "/tools",
96
- "/quote",
97
- "/pricing",
98
- "/services",
99
- "/translate",
100
- "/order",
101
- "/buy",
102
- ]
103
- ):
104
  return "Tools", 6
105
 
106
- # Check if URL path contains non-ASCII or percent-encoded characters
107
- if bool(re.search(r"[^\x00-\x7F]", path)) or bool(
108
- re.search(r"%[0-9A-F]{2}", path)
109
- ):
110
- return "Optional", 0
111
-
112
  return "Optional", 1
113
 
114
- def is_duplicate_content(self, desc, title, url):
115
- """Improved duplicate/translation detection"""
116
- if not desc or not title:
117
- return False
118
-
119
- # Skip non-latin character URLs or URLs with percent-encoded non-ASCII
120
- if bool(re.search(r"[^\x00-\x7F]", url)) or bool(
121
- re.search(r"%[0-9A-F]{2}", url)
122
- ):
123
- return True
124
-
125
- # Skip common translation paths
126
- translation_indicators = [
127
- "/welcome",
128
- "/bienvenue",
129
- "/willkommen",
130
- "/benvenuto",
131
- "/tervetuloa",
132
- "/bienvenido",
133
- "/velkommen",
134
- "/welkom",
135
- "translate.com/",
136
- "/translate/",
137
- "/translation/",
138
- ]
139
- if any(indicator in url.lower() for indicator in translation_indicators):
140
- url_path = urlparse(url).path.lower()
141
- if url_path != "/": # Don't skip homepage
142
- return True
143
-
144
- # Check for similar content length and patterns
145
- for existing_metadata in self.url_metadata.values():
146
- existing_desc = existing_metadata.get("description", "")
147
- existing_title = existing_metadata.get("title", "")
148
- if not existing_desc or not existing_title:
149
- continue
150
-
151
- # If descriptions are very similar in length, likely a translation
152
- if (
153
- abs(len(desc) - len(existing_desc)) < 20
154
- and len(desc) > 50
155
- and desc != existing_desc
156
- ): # Allow exact duplicates for main page
157
- return True
158
-
159
- return False
160
-
161
  def clean_text(self, text, is_title=False):
162
- """Improved text cleaning"""
163
- if not text or len(text.strip()) < 2:
164
  return ""
165
-
166
- # Normalize unicode characters
167
  text = unicodedata.normalize("NFKD", text)
168
  text = re.sub(r"[^\x00-\x7F]+", "", text)
169
-
170
- # Remove any template variables/placeholders
171
- text = re.sub(r"\{\{.*?\}\}", "", text)
172
- text = re.sub(r"\{\%.*?\%\}", "", text)
173
- text = re.sub(r"\${.*?\}", "", text)
174
 
175
  if is_title:
176
- # Remove common suffixes and fragments for titles
177
  text = re.sub(r"^\s*Welcome to\s+", "", text)
178
- text = re.sub(r"\s*[\|\-#:•].*", "", text)
179
- text = re.sub(r"\s+Homepage$", "", text, flags=re.IGNORECASE)
180
-
181
- # Handle overly generic titles
182
- if text.lower() in ["features", "home", "homepage", "welcome"]:
183
- return ""
184
-
185
- # Only return if we have meaningful text
186
- cleaned = " ".join(text.split()).strip()
187
- if len(cleaned.split()) < 2 and not is_title: # Allow single-word titles
188
- return ""
189
-
190
- return cleaned
191
-
192
- def clean_description(self, desc):
193
- """Clean description text"""
194
- if not desc:
195
- return ""
196
- # Remove leading dashes, hyphens, or colons
197
- desc = re.sub(r"^[-:\s]+", "", desc)
198
- # Remove any strings that are just "Editors", "APIs", etc.
199
- if len(desc.split()) <= 1:
200
- return ""
201
- return desc.strip()
202
-
203
- def extract_homepage_description(self, soup):
204
- """Extract description from homepage with multiple fallbacks"""
205
- # Try meta description first
206
- meta_desc = soup.find("meta", {"name": "description"})
207
- if meta_desc and meta_desc.get("content"):
208
- desc = meta_desc["content"]
209
- if desc and len(desc.strip()) > 20:
210
- return self.clean_text(desc)
211
-
212
- # Try OpenGraph description
213
- og_desc = soup.find("meta", property="og:description")
214
- if og_desc and og_desc.get("content"):
215
- desc = og_desc["content"]
216
- if desc and len(desc.strip()) > 20:
217
- return self.clean_text(desc)
218
-
219
- # Try first significant paragraph
220
- for p in soup.find_all("p"):
221
- text = p.get_text().strip()
222
- if len(text) > 50 and not any(
223
- x in text.lower() for x in ["cookie", "accept", "privacy"]
224
- ):
225
- return self.clean_text(text)
226
-
227
- # Try main content area if exists
228
- main = soup.find("main")
229
- if main:
230
- first_p = main.find("p")
231
- if first_p:
232
- text = first_p.get_text().strip()
233
- if len(text) > 50:
234
- return self.clean_text(text)
235
-
236
- return None
237
 
238
  async def crawl_page(self, url, depth, base_domain):
239
- """Crawl a single page and extract information"""
240
  if (
241
  depth > self.max_depth
242
  or url in self.visited_urls
@@ -245,197 +68,91 @@ class WebsiteCrawler:
245
  return []
246
 
247
  try:
248
- await asyncio.sleep(1) # Be polite to servers
249
- async with aiohttp.ClientSession() as session:
 
250
  async with session.get(
251
  url, headers=self.headers, allow_redirects=True
252
  ) as response:
253
- if response.status == 403:
254
- # Try with alternative headers
255
- alt_headers = {
256
- "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36",
257
- "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
258
- }
259
- async with session.get(
260
- url, headers=alt_headers, allow_redirects=True
261
- ) as retry_response:
262
- if retry_response.status != 200:
263
- return []
264
- text = await retry_response.text()
265
- elif response.status != 200:
266
  return []
267
- else:
268
- text = await response.text()
269
-
270
  self.visited_urls.add(url)
 
271
  soup = BeautifulSoup(text, "html.parser")
 
 
 
 
 
 
272
 
273
- # Extract title with fallbacks
274
- title = None
275
- meta_title = soup.find("meta", property="og:title")
276
- if meta_title and meta_title.get("content"):
277
- title = meta_title["content"]
278
- if not title:
279
- title_tag = soup.find("title")
280
- if title_tag:
281
- title = title_tag.text
282
- if not title:
283
- h1_tag = soup.find("h1")
284
- if h1_tag:
285
- title = h1_tag.text
286
- if not title:
287
- title = url.split("/")[-1]
288
-
289
- title = self.clean_text(title, is_title=True)
290
-
291
- # Extract description with fallbacks
292
- desc = None
293
- meta_desc = soup.find("meta", {"name": "description"})
294
- if meta_desc and meta_desc.get("content"):
295
- desc = meta_desc["content"]
296
- if not desc:
297
- og_desc = soup.find("meta", property="og:description")
298
- if og_desc and og_desc.get("content"):
299
- desc = og_desc["content"]
300
- if not desc:
301
- first_p = soup.find("p")
302
- if first_p:
303
- desc = first_p.text
304
-
305
- desc = self.clean_text(desc) if desc else ""
306
-
307
- # Skip if it's duplicate content
308
- if self.is_duplicate_content(desc, title, url):
309
- return []
310
 
311
- # Determine category and importance
312
  category, importance = self.determine_category_importance(
313
  url, title, desc
314
  )
315
 
316
- # Store metadata
317
- clean_url = re.sub(r"#.*", "", url).rstrip("/")
318
- if (
319
- title and len(title.strip()) > 0
320
- ): # Only store if we have a valid title
321
- logger.info(
322
- f"Storing metadata for {clean_url}: {title[:30]}..."
323
- )
324
- self.url_metadata[clean_url] = {
325
- "title": title,
326
- "description": desc,
327
- "category": category,
328
- "importance": importance,
329
- }
330
-
331
- # Find links
332
  links = []
333
  for a in soup.find_all("a", href=True):
334
- href = a["href"]
335
- if not any(
336
- x in href.lower()
337
- for x in [
338
- "javascript:",
339
- "mailto:",
340
- ".pdf",
341
- ".jpg",
342
- ".png",
343
- ".gif",
344
- ]
345
- ):
346
- next_url = urljoin(url, href)
347
- if urlparse(next_url).netloc == base_domain:
348
- links.append(next_url)
349
- return links
350
 
 
351
  except Exception as e:
352
  logger.error(f"Error crawling {url}: {str(e)}")
353
  return []
354
 
355
  async def process_homepage(self, url):
356
- """Specifically process the homepage to extract key metadata"""
357
  try:
358
- # Configure SSL context
359
- ssl_context = ssl.create_default_context()
360
- ssl_context.check_hostname = False
361
- ssl_context.verify_mode = ssl.CERT_NONE
362
-
363
- connector = aiohttp.TCPConnector(ssl=ssl_context)
364
- timeout = aiohttp.ClientTimeout(total=30)
365
-
366
  async with aiohttp.ClientSession(
367
- connector=connector, timeout=timeout
368
  ) as session:
369
  async with session.get(
370
  url, headers=self.headers, allow_redirects=True
371
  ) as response:
372
  if response.status != 200:
373
- raise Exception(
374
- f"Failed to fetch homepage: status {response.status}"
375
- )
376
-
377
- try:
378
- text = await response.text()
379
- except UnicodeDecodeError:
380
- text = await response.read()
381
- text = text.decode("utf-8", errors="ignore")
382
-
383
  soup = BeautifulSoup(text, "html.parser")
384
 
385
- # Extract site name with more fallbacks
386
- site_name = None
387
- # Try meta tags first
388
- site_meta = soup.find("meta", property="og:site_name")
389
- if site_meta and site_meta.get("content"):
390
- site_name = site_meta["content"]
391
-
392
- # Try structured data
393
- if not site_name:
394
- schema = soup.find("script", type="application/ld+json")
395
- if schema:
396
- try:
397
- import json
398
-
399
- data = json.loads(schema.string)
400
- if isinstance(data, dict):
401
- site_name = data.get("name") or data.get(
402
- "organizationName"
403
- )
404
- except:
405
- pass
406
-
407
- # Try title tag
408
- if not site_name:
409
- title_tag = soup.find("title")
410
- if title_tag:
411
- site_name = title_tag.text.split("|")[0].strip()
412
-
413
- # Last resort - use domain name
414
- if not site_name:
415
- site_name = urlparse(url).netloc.split(".")[0].capitalize()
416
-
417
- # Get homepage description
418
- description = self.extract_homepage_description(soup)
419
 
420
  self.homepage_metadata = {
421
  "site_name": self.clean_text(site_name, is_title=True),
422
- "description": description,
 
 
423
  }
424
-
425
  except Exception as e:
426
  logger.error(f"Error processing homepage {url}: {str(e)}")
427
- self.homepage_metadata = {
428
- "site_name": urlparse(url).netloc.split(".")[0].capitalize(),
429
- "description": None,
430
- }
431
 
432
  async def crawl_website(self, start_url):
433
- """Crawl website starting from the given URL"""
434
  try:
435
- # First process the homepage
436
- logger.info(f"Processing homepage: {start_url}")
437
  await self.process_homepage(start_url)
438
-
439
  base_domain = urlparse(start_url).netloc
440
  queue = [(start_url, 0)]
441
  seen = {start_url}
@@ -444,240 +161,95 @@ class WebsiteCrawler:
444
  current_url, depth = queue.pop(0)
445
  if depth > self.max_depth:
446
  continue
447
-
448
- logger.info(f"Crawling page: {current_url} (depth: {depth})")
449
  links = await self.crawl_page(current_url, depth, base_domain)
450
- logger.info(f"Found {len(links)} links on {current_url}")
451
-
452
  for link in links:
453
- if link not in seen and urlparse(link).netloc == base_domain:
454
  seen.add(link)
455
  queue.append((link, depth + 1))
456
 
457
- logger.info(f"Crawl completed. Visited {len(self.visited_urls)} pages")
458
-
459
  except Exception as e:
460
  logger.error(f"Error during crawl: {str(e)}")
461
  raise
462
 
463
  def generate_llms_txt(self):
464
- """Generate llms.txt content"""
465
- logger.info(f"Starting generate_llms_txt with {len(self.url_metadata)} URLs")
466
-
467
  if not self.url_metadata:
468
- logger.error("No URL metadata found")
469
- return "No content was found to generate llms.txt"
470
-
471
- # Sort URLs by importance and remove duplicates
472
- sorted_urls = []
473
- seen_titles = set()
474
-
475
- for url, metadata in sorted(
476
- self.url_metadata.items(),
477
- key=lambda x: (x[1]["importance"], x[0]),
478
- reverse=True,
479
- ):
480
- if metadata["title"] not in seen_titles:
481
- sorted_urls.append((url, metadata))
482
- seen_titles.add(metadata["title"])
483
 
484
- logger.info(f"Found {len(sorted_urls)} unique URLs after deduplication")
485
-
486
- if not sorted_urls:
487
- logger.error("No valid URLs found after sorting")
488
- return "No valid content was found"
489
-
490
- # Generate content
491
  content = []
 
 
 
 
 
492
 
493
- # Use homepage metadata for main title and description
494
- main_title = self.homepage_metadata.get("site_name", "Welcome")
495
- homepage_description = self.homepage_metadata.get("description")
496
-
497
- logger.info(f"Homepage title: {main_title}")
498
- logger.info(f"Homepage description: {homepage_description}")
499
-
500
- content.append(f"# {main_title}")
501
- if homepage_description:
502
- content.append(f"\n> {homepage_description}")
503
- elif len(sorted_urls) > 0:
504
- # Fallback to first good description from content if no homepage description
505
- for _, metadata in sorted_urls:
506
- desc = self.clean_description(metadata["description"])
507
- if desc and len(desc) > 20 and "null" not in desc.lower():
508
- content.append(f"\n> {desc}")
509
- break
510
-
511
- # Group by category
512
  categories = defaultdict(list)
513
- for url, metadata in sorted_urls:
514
- if metadata["title"] and url:
515
- categories[metadata["category"]].append((url, metadata))
516
 
517
- logger.info(f"Categories found: {list(categories.keys())}")
518
-
519
- # Add sections in a logical order
520
  category_order = [
521
  "Main",
522
  "Documentation",
523
  "API",
524
- "Tools",
525
  "About",
526
  "News",
 
527
  "Optional",
528
  ]
529
-
530
- # Only show Main section if it has content different from the homepage description
531
- if "Main" in categories:
532
- main_content = categories["Main"]
533
- if (
534
- len(main_content) == 1
535
- and main_content[0][1]["description"] == homepage_description
536
- ):
537
- logger.info("Removing duplicate Main content")
538
- del categories["Main"]
539
-
540
  for category in category_order:
541
- if category in categories and categories[category]:
542
- logger.info(
543
- f"Processing category {category} with {len(categories[category])} items"
544
- )
545
- content.append(f"\n## {category}")
546
-
547
- # Sort links within category by importance and description length
548
- category_links = sorted(
549
- categories[category],
550
- key=lambda x: (-len(x[1]["description"] or ""), x[1]["title"]),
551
- )
552
-
553
- links = []
554
- seen_desc = set() # Avoid duplicate descriptions within category
555
- for url, metadata in category_links:
556
- title = metadata["title"].strip()
557
- desc = self.clean_description(metadata["description"])
558
-
559
- # Skip if description is duplicate within category
560
- if desc in seen_desc:
561
- continue
562
- seen_desc.add(desc)
563
-
564
- if desc:
565
- links.append(f"- [{title}]({url}): {desc}")
566
- else:
567
- links.append(f"- [{title}]({url})")
568
-
569
- content.append("\n".join(links))
570
 
571
- final_content = "\n".join(content)
572
- logger.info(f"Generated content length: {len(final_content)}")
573
- return final_content
574
 
575
 
576
  async def process_url(url, max_depth, max_pages):
577
- """Process URL and generate llms.txt"""
578
  try:
579
- # Add https:// if not present
580
  if not url.startswith(("http://", "https://")):
581
  url = "https://" + url
582
-
583
- # Validate URL
584
  result = urlparse(url)
585
- if not all([result.scheme, result.netloc]):
586
  return "", "Invalid URL format. Please enter a valid URL."
587
 
588
- logger.info(f"Starting crawl of {url}")
589
-
590
- # Process website
591
  crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
592
  await crawler.crawl_website(url)
593
-
594
- logger.info("Generating llms.txt content")
595
  content = crawler.generate_llms_txt()
596
 
597
- if not content or content.strip() == "":
598
- return "", "No content was generated. Check the logs for details."
599
-
600
  return content, f"Successfully crawled {len(crawler.visited_urls)} pages."
601
-
602
  except Exception as e:
603
  logger.error(f"Error processing URL {url}: {str(e)}")
604
  return "", f"Error: {str(e)}"
605
 
606
 
607
- # Create Gradio interface
608
  theme = gr.themes.Soft(primary_hue="blue", font="Open Sans")
609
 
610
- with gr.Blocks(
611
- theme=theme,
612
- css="""
613
- @import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');
614
-
615
- .gradio-container {
616
- font-family: 'Open Sans', sans-serif !important;
617
- }
618
-
619
- .gr-button {
620
- font-family: 'Open Sans', sans-serif !important;
621
- font-weight: 600 !important;
622
- }
623
-
624
- .primary-btn {
625
- background-color: #2436d4 !important;
626
- color: white !important;
627
- }
628
-
629
- .primary-btn:hover {
630
- background-color: #1c2aa8 !important;
631
- }
632
-
633
- [data-testid="textbox"] {
634
- font-family: 'Open Sans', sans-serif !important;
635
- }
636
-
637
- .gr-padded {
638
- font-family: 'Open Sans', sans-serif !important;
639
- }
640
-
641
- .gr-input {
642
- font-family: 'Open Sans', sans-serif !important;
643
- }
644
-
645
- .gr-label {
646
- font-family: 'Open Sans', sans-serif !important;
647
- }
648
- """,
649
- ) as iface:
650
-
651
  with gr.Row():
652
  url_input = gr.Textbox(
653
- label="Website URL",
654
- placeholder="Enter the website URL (e.g., example.com)",
655
- info="The URL will be automatically prefixed with https:// if not provided",
656
  )
657
-
658
  with gr.Row():
659
- with gr.Column():
660
- depth_input = gr.Slider(
661
- minimum=1, maximum=5, value=3, step=1, label="Maximum Crawl Depth"
662
- )
663
- with gr.Column():
664
- pages_input = gr.Slider(
665
- minimum=10, maximum=100, value=50, step=10, label="Maximum Pages"
666
- )
667
-
668
- generate_btn = gr.Button("Generate llms.txt", variant="primary")
669
-
670
  output = gr.Textbox(
671
- label="Generated llms.txt Content",
672
- lines=20,
673
- show_copy_button=True,
674
- container=True,
675
  )
676
-
677
  status = gr.Textbox(label="Status")
678
 
 
 
 
679
  generate_btn.click(
680
- fn=lambda url, depth, pages: asyncio.run(process_url(url, depth, pages)),
681
  inputs=[url_input, depth_input, pages_input],
682
  outputs=[output, status],
683
  )
 
8
  from collections import defaultdict
9
  import unicodedata
10
  import logging
 
11
 
12
  logging.basicConfig(level=logging.INFO)
13
  logger = logging.getLogger(__name__)
 
22
  self.homepage_metadata = None
23
  self.headers = {
24
  "User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
 
 
 
 
 
 
25
  }
26
 
27
  def determine_category_importance(self, url, title, desc):
 
28
  url_lower = url.lower()
29
  path = urlparse(url).path.lower()
30
 
 
31
  if path == "/" or path == "":
32
  return "Main", 10
33
 
34
+ if any(x in url_lower for x in ["/docs", "/faq", "/help"]):
 
 
 
 
 
 
 
 
 
 
35
  return "Documentation", 8
36
 
37
+ elif any(x in url_lower for x in ["/api", "/developer"]):
 
38
  return "API", 8
39
 
40
+ elif any(x in url_lower for x in ["/about", "/company", "/contact"]):
 
 
 
 
 
 
 
 
 
 
 
 
 
41
  return "About", 7
42
 
43
+ elif any(x in url_lower for x in ["/news", "/blog", "/events"]):
 
 
 
 
 
 
 
 
 
 
 
 
44
  return "News", 5
45
 
46
+ elif any(x in url_lower for x in ["/tools", "/pricing"]):
 
 
 
 
 
 
 
 
 
 
 
 
47
  return "Tools", 6
48
 
 
 
 
 
 
 
49
  return "Optional", 1
50
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
  def clean_text(self, text, is_title=False):
52
+ if not text:
 
53
  return ""
 
 
54
  text = unicodedata.normalize("NFKD", text)
55
  text = re.sub(r"[^\x00-\x7F]+", "", text)
56
+ text = " ".join(text.split()).strip()
 
 
 
 
57
 
58
  if is_title:
 
59
  text = re.sub(r"^\s*Welcome to\s+", "", text)
60
+ return text
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
  async def crawl_page(self, url, depth, base_domain):
 
63
  if (
64
  depth > self.max_depth
65
  or url in self.visited_urls
 
68
  return []
69
 
70
  try:
71
+ async with aiohttp.ClientSession(
72
+ timeout=aiohttp.ClientTimeout(total=20)
73
+ ) as session:
74
  async with session.get(
75
  url, headers=self.headers, allow_redirects=True
76
  ) as response:
77
+ if response.status != 200:
 
 
 
 
 
 
 
 
 
 
 
 
78
  return []
79
+ text = await response.text()
 
 
80
  self.visited_urls.add(url)
81
+
82
  soup = BeautifulSoup(text, "html.parser")
83
+ title_tag = soup.find("title")
84
+ title = (
85
+ self.clean_text(title_tag.text)
86
+ if title_tag
87
+ else url.split("/")[-1]
88
+ )
89
 
90
+ desc_tag = soup.find("meta", {"name": "description"})
91
+ desc = (
92
+ self.clean_text(desc_tag["content"])
93
+ if desc_tag and desc_tag.get("content")
94
+ else ""
95
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
96
 
 
97
  category, importance = self.determine_category_importance(
98
  url, title, desc
99
  )
100
 
101
+ self.url_metadata[url] = {
102
+ "title": title,
103
+ "description": desc,
104
+ "category": category,
105
+ "importance": importance,
106
+ }
107
+
 
 
 
 
 
 
 
 
 
108
  links = []
109
  for a in soup.find_all("a", href=True):
110
+ next_url = urljoin(url, a["href"])
111
+ if urlparse(next_url).netloc == base_domain:
112
+ links.append(next_url)
 
 
 
 
 
 
 
 
 
 
 
 
 
113
 
114
+ return links
115
  except Exception as e:
116
  logger.error(f"Error crawling {url}: {str(e)}")
117
  return []
118
 
119
  async def process_homepage(self, url):
 
120
  try:
 
 
 
 
 
 
 
 
121
  async with aiohttp.ClientSession(
122
+ timeout=aiohttp.ClientTimeout(total=20)
123
  ) as session:
124
  async with session.get(
125
  url, headers=self.headers, allow_redirects=True
126
  ) as response:
127
  if response.status != 200:
128
+ return
129
+ text = await response.text()
 
 
 
 
 
 
 
 
130
  soup = BeautifulSoup(text, "html.parser")
131
 
132
+ site_name = (
133
+ soup.find("title").text.split("|")[0].strip()
134
+ if soup.find("title")
135
+ else urlparse(url).netloc
136
+ )
137
+ description = soup.find("meta", {"name": "description"})
138
+ description = (
139
+ description["content"].strip()
140
+ if description and description.get("content")
141
+ else None
142
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
143
 
144
  self.homepage_metadata = {
145
  "site_name": self.clean_text(site_name, is_title=True),
146
+ "description": (
147
+ self.clean_text(description) if description else None
148
+ ),
149
  }
 
150
  except Exception as e:
151
  logger.error(f"Error processing homepage {url}: {str(e)}")
 
 
 
 
152
 
153
  async def crawl_website(self, start_url):
 
154
  try:
 
 
155
  await self.process_homepage(start_url)
 
156
  base_domain = urlparse(start_url).netloc
157
  queue = [(start_url, 0)]
158
  seen = {start_url}
 
161
  current_url, depth = queue.pop(0)
162
  if depth > self.max_depth:
163
  continue
 
 
164
  links = await self.crawl_page(current_url, depth, base_domain)
 
 
165
  for link in links:
166
+ if link not in seen:
167
  seen.add(link)
168
  queue.append((link, depth + 1))
169
 
 
 
170
  except Exception as e:
171
  logger.error(f"Error during crawl: {str(e)}")
172
  raise
173
 
174
  def generate_llms_txt(self):
 
 
 
175
  if not self.url_metadata:
176
+ return "No content available."
 
 
 
 
 
 
 
 
 
 
 
 
 
 
177
 
 
 
 
 
 
 
 
178
  content = []
179
+ homepage_title = self.homepage_metadata.get("site_name", "Website")
180
+ homepage_description = self.homepage_metadata.get(
181
+ "description", "No description available."
182
+ )
183
+ content.append(f"# {homepage_title}\n\n> {homepage_description}\n")
184
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
185
  categories = defaultdict(list)
186
+ for url, metadata in self.url_metadata.items():
187
+ categories[metadata["category"]].append((url, metadata))
 
188
 
 
 
 
189
  category_order = [
190
  "Main",
191
  "Documentation",
192
  "API",
 
193
  "About",
194
  "News",
195
+ "Tools",
196
  "Optional",
197
  ]
 
 
 
 
 
 
 
 
 
 
 
198
  for category in category_order:
199
+ if category in categories:
200
+ content.append(f"## {category}")
201
+ for url, metadata in categories[category]:
202
+ content.append(
203
+ f"- [{metadata['title']}]({url}): {metadata['description']}"
204
+ )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
205
 
206
+ return "\n".join(content)
 
 
207
 
208
 
209
  async def process_url(url, max_depth, max_pages):
 
210
  try:
 
211
  if not url.startswith(("http://", "https://")):
212
  url = "https://" + url
 
 
213
  result = urlparse(url)
214
+ if not result.scheme or not result.netloc:
215
  return "", "Invalid URL format. Please enter a valid URL."
216
 
 
 
 
217
  crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
218
  await crawler.crawl_website(url)
 
 
219
  content = crawler.generate_llms_txt()
220
 
 
 
 
221
  return content, f"Successfully crawled {len(crawler.visited_urls)} pages."
 
222
  except Exception as e:
223
  logger.error(f"Error processing URL {url}: {str(e)}")
224
  return "", f"Error: {str(e)}"
225
 
226
 
227
+ # Gradio interface
228
  theme = gr.themes.Soft(primary_hue="blue", font="Open Sans")
229
 
230
+ with gr.Blocks(theme=theme) as iface:
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
231
  with gr.Row():
232
  url_input = gr.Textbox(
233
+ label="Website URL", placeholder="Enter the website URL (e.g., example.com)"
 
 
234
  )
 
235
  with gr.Row():
236
+ depth_input = gr.Slider(
237
+ minimum=1, maximum=5, value=3, step=1, label="Maximum Crawl Depth"
238
+ )
239
+ pages_input = gr.Slider(
240
+ minimum=10, maximum=100, value=50, step=10, label="Maximum Pages"
241
+ )
242
+ generate_btn = gr.Button("Generate llms.txt")
 
 
 
 
243
  output = gr.Textbox(
244
+ label="Generated llms.txt Content", lines=20, show_copy_button=True
 
 
 
245
  )
 
246
  status = gr.Textbox(label="Status")
247
 
248
+ async def process_url_async_wrapper(url, depth, pages):
249
+ return await process_url(url, depth, pages)
250
+
251
  generate_btn.click(
252
+ fn=process_url_async_wrapper,
253
  inputs=[url_input, depth_input, pages_input],
254
  outputs=[output, status],
255
  )