cyberandy commited on
Commit
1c5e607
·
verified ·
1 Parent(s): 1a04a7a

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +91 -2189
app.py CHANGED
@@ -3,435 +3,39 @@ import requests
3
  from bs4 import BeautifulSoup
4
  import re
5
  from urllib.parse import urljoin, urlparse
6
- import markdown
7
- from concurrent.futures import ThreadPoolExecutor
8
  import asyncio
9
  from collections import defaultdict
10
- import time
11
- import logging
12
  import unicodedata
 
13
 
14
- # Set up logging
15
  logging.basicConfig(level=logging.INFO)
16
  logger = logging.getLogger(__name__)
17
 
18
  class WebsiteCrawler:
19
- def __init__(self, max_depth=3, max_pages=50, timeout=30):
20
  self.max_depth = max_depth
21
  self.max_pages = max_pages
22
- self.timeout = timeout
23
  self.visited_urls = set()
24
- self.url_content = {}
25
  self.url_metadata = defaultdict(dict)
26
  self.headers = {
27
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
28
  }
29
 
30
- def normalize_text(self, text):
31
- """Normalize text to handle encoding issues"""
32
  if not text:
33
  return ""
34
  # Normalize unicode characters
35
  text = unicodedata.normalize('NFKD', text)
36
- # Replace special quotes and dashes with standard characters
37
- text = text.replace('\u201c', '"').replace('\u201d', '"') # smart quotes
38
- text = text.replace('\u2018', "'").replace('\u2019', "'") # smart single quotes
39
- text = text.replace('\u2013', '-').replace('\u2014', '-') # en and em dashes
40
- # Remove any remaining non-ASCII characters
41
- text = text.encode('ascii', 'ignore').decode('ascii')
42
- # Clean up extra whitespace and ensure proper sentence spacing
43
- text = ' '.join(text.split())
44
- return text
45
-
46
- def clean_url(self, url):
47
- """Clean URL by removing fragments and unnecessary parameters"""
48
- # Remove fragments (everything after #)
49
- url = re.sub(r'#.*
50
-
51
- def is_valid_url(self, url, base_domain):
52
- """Check if URL is valid and belongs to the same domain"""
53
- try:
54
- parsed = urlparse(url)
55
- base_parsed = urlparse(base_domain)
56
- return (parsed.netloc == base_parsed.netloc and
57
- parsed.scheme in ['http', 'https'] and
58
- not url.endswith(('.pdf', '.jpg', '.png', '.gif', '.zip')))
59
- except:
60
- return False
61
-
62
- def extract_content(self, soup):
63
- """Extract meaningful content from HTML"""
64
- # Remove script and style elements
65
- for element in soup(['script', 'style', 'nav', 'footer', 'header']):
66
- element.decompose()
67
-
68
- # Get main content
69
- main_content = soup.find('main') or soup.find('article') or soup.find('div', {'class': re.compile(r'content|main', re.I)})
70
- if main_content:
71
- return self.normalize_text(main_content.get_text(strip=True))
72
- return self.normalize_text(soup.get_text(strip=True))
73
-
74
- def get_page_metadata(self, soup, url):
75
- """Extract metadata from the page"""
76
- metadata = {
77
- 'title': None,
78
- 'description': None,
79
- 'importance': 0,
80
- 'category': 'Optional'
81
- }
82
-
83
- # Title extraction with cleaning
84
- title = (
85
- soup.find('meta', property='og:title')['content'] if soup.find('meta', property='og:title') else
86
- soup.find('title').text if soup.find('title') else
87
- soup.find('h1').text if soup.find('h1') else
88
- url.split('/')[-1]
89
- )
90
- metadata['title'] = self.clean_title(title)
91
-
92
- # Description extraction with cleaning
93
- description = (
94
- soup.find('meta', {'name': 'description'})['content'] if soup.find('meta', {'name': 'description'}) else
95
- soup.find('meta', property='og:description')['content'] if soup.find('meta', property='og:description') else
96
- ""
97
- )
98
- metadata['description'] = self.clean_description(description)
99
-
100
- # Calculate importance and category
101
- url_lower = url.lower()
102
- if 'docs' in url_lower or 'documentation' in url_lower:
103
- metadata['importance'] = 5
104
- metadata['category'] = 'Docs'
105
- elif 'api' in url_lower:
106
- metadata['importance'] = 4
107
- metadata['category'] = 'API'
108
- elif 'guide' in url_lower or 'tutorial' in url_lower:
109
- metadata['importance'] = 3
110
- metadata['category'] = 'Guides'
111
- elif 'example' in url_lower:
112
- metadata['importance'] = 2
113
- metadata['category'] = 'Examples'
114
- elif 'blog' in url_lower:
115
- metadata['importance'] = 1
116
- metadata['category'] = 'Blog'
117
-
118
- return metadata
119
-
120
- async def crawl_page(self, url, depth, base_domain):
121
- """Crawl a single page and extract information"""
122
- if depth > self.max_depth or url in self.visited_urls or len(self.visited_urls) >= self.max_pages:
123
- return []
124
-
125
- try:
126
- response = requests.get(url, headers=self.headers, timeout=self.timeout)
127
- response.encoding = 'utf-8'
128
- response.raise_for_status()
129
- self.visited_urls.add(url)
130
-
131
- soup = BeautifulSoup(response.text, 'html.parser')
132
- content = self.extract_content(soup)
133
- metadata = self.get_page_metadata(soup, url)
134
-
135
- self.url_content[url] = content
136
- self.url_metadata[url] = metadata
137
-
138
- # Find all links
139
- links = []
140
- for a in soup.find_all('a', href=True):
141
- next_url = urljoin(url, a['href'])
142
- if self.is_valid_url(next_url, base_domain):
143
- links.append(next_url)
144
-
145
- return links
146
-
147
- except Exception as e:
148
- logger.error(f"Error crawling {url}: {str(e)}")
149
- return []
150
-
151
- async def crawl_website(self, start_url):
152
- """Crawl website starting from the given URL"""
153
- base_domain = start_url
154
- queue = [(start_url, 0)]
155
- seen = {start_url}
156
-
157
- while queue and len(self.visited_urls) < self.max_pages:
158
- current_url, depth = queue.pop(0)
159
-
160
- if depth > self.max_depth:
161
- continue
162
-
163
- links = await self.crawl_page(current_url, depth, base_domain)
164
-
165
- for link in links:
166
- if link not in seen:
167
- seen.add(link)
168
- queue.append((link, depth + 1))
169
-
170
- def generate_llms_txt(self):
171
- """Generate llms.txt content from crawled data"""
172
- # Clean and deduplicate metadata
173
- cleaned_metadata = self.remove_duplicate_content(self.url_metadata)
174
-
175
- # Sort URLs by importance
176
- sorted_urls = sorted(
177
- cleaned_metadata.items(),
178
- key=lambda x: (x[1]['importance'], x[0]),
179
- reverse=True
180
- )
181
-
182
- if not sorted_urls:
183
- return "No content was found to generate llms.txt"
184
-
185
- # Group URLs by category
186
- categorized_urls = defaultdict(list)
187
- for url, metadata in sorted_urls:
188
- categorized_urls[metadata['category']].append((url, metadata))
189
-
190
- # Generate content
191
- content = []
192
-
193
- # Add main title and description
194
- main_metadata = sorted_urls[0][1]
195
- content.append(f"# {main_metadata['title']}")
196
- if main_metadata['description']:
197
- content.append(f"\n> {main_metadata['description']}")
198
-
199
- # Add categorized sections
200
- priority_order = ['Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional']
201
-
202
- for category in priority_order:
203
- if category in categorized_urls:
204
- content.append(f"\n## {category}")
205
- for url, metadata in categorized_urls[category]:
206
- if metadata['description']:
207
- content.append(f"\n- [{metadata['title']}]({url}): {metadata['description']}")
208
- else:
209
- content.append(f"\n- [{metadata['title']}]({url})")
210
-
211
- return "\n".join(content)
212
-
213
- async def process_url(url, max_depth, max_pages):
214
- """Process URL and generate llms.txt"""
215
- try:
216
- # Add https:// if not present
217
- if not url.startswith(('http://', 'https://')):
218
- url = 'https://' + url
219
-
220
- # Validate URL format
221
- try:
222
- result = urlparse(url)
223
- if not all([result.scheme, result.netloc]):
224
- return "", "Invalid URL format. Please enter a valid URL."
225
- except:
226
- return "", "Invalid URL format. Please enter a valid URL."
227
-
228
- # Create crawler and process
229
- crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
230
- await crawler.crawl_website(url)
231
- content = crawler.generate_llms_txt()
232
-
233
- return content, f"Successfully crawled {len(crawler.visited_urls)} pages."
234
-
235
- except Exception as e:
236
- return "", f"Error: {str(e)}"
237
-
238
- # Create custom theme
239
- theme = gr.themes.Soft(
240
- primary_hue="blue",
241
- font="Open Sans"
242
- )
243
-
244
- # Create the Gradio interface
245
- with gr.Blocks(theme=theme, css="""
246
- @import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');
247
-
248
- .gradio-container {
249
- font-family: 'Open Sans', sans-serif !important;
250
- }
251
-
252
- .gr-button {
253
- font-family: 'Open Sans', sans-serif !important;
254
- font-weight: 600 !important;
255
- }
256
-
257
- /* Primary color customization */
258
- .primary-btn {
259
- background-color: #2436d4 !important;
260
- color: white !important;
261
- }
262
-
263
- .primary-btn:hover {
264
- background-color: #1c2aa8 !important;
265
- }
266
-
267
- [data-testid="textbox"] {
268
- font-family: 'Open Sans', sans-serif !important;
269
- }
270
- """) as iface:
271
- gr.Markdown("# llms.txt Generator")
272
- gr.Markdown("Generate an llms.txt file from a website following the specification. The tool crawls the website and creates a structured markdown file suitable for LLMs.")
273
-
274
- with gr.Row():
275
- url_input = gr.Textbox(
276
- label="Website URL",
277
- placeholder="Enter the website URL (e.g., example.com or https://example.com)",
278
- info="The URL will be automatically prefixed with https:// if no protocol is specified."
279
- )
280
-
281
- with gr.Row():
282
- with gr.Column():
283
- depth_input = gr.Slider(
284
- minimum=1,
285
- maximum=5,
286
- value=3,
287
- step=1,
288
- label="Maximum Crawl Depth",
289
- info="Higher values will result in more thorough but slower crawling"
290
- )
291
- with gr.Column():
292
- pages_input = gr.Slider(
293
- minimum=10,
294
- maximum=100,
295
- value=50,
296
- step=10,
297
- label="Maximum Pages to Crawl",
298
- info="Higher values will result in more comprehensive but slower results"
299
- )
300
-
301
- generate_btn = gr.Button("Generate llms.txt", variant="primary")
302
-
303
- with gr.Row():
304
- output = gr.Textbox(
305
- label="Generated llms.txt Content",
306
- lines=20,
307
- max_lines=30,
308
- show_copy_button=True,
309
- container=True,
310
- scale=2,
311
- interactive=True
312
- )
313
-
314
- status = gr.Textbox(label="Status")
315
-
316
- generate_btn.click(
317
- fn=lambda url, depth, pages: asyncio.run(process_url(url, depth, pages)),
318
- inputs=[url_input, depth_input, pages_input],
319
- outputs=[output, status]
320
- )
321
-
322
- # Launch the app
323
- if __name__ == "__main__":
324
- iface.launch()
325
- , '', url)
326
- # Remove trailing slashes
327
- url = url.rstrip('/')
328
- return url
329
-
330
- def remove_duplicate_content(self, urls_metadata):
331
- """Remove duplicate content based on similar titles and URLs"""
332
- seen_content = {}
333
- cleaned_metadata = {}
334
-
335
- for url, metadata in urls_metadata.items():
336
- clean_url = self.clean_url(url)
337
- base_url = clean_url.split('#')[0] # Remove hash fragments
338
-
339
- # Create a content signature based on title and base URL
340
- title = metadata['title'].lower()
341
-
342
- # Skip entries that are just fragments of the same page
343
- if base_url in seen_content:
344
- # Keep the one with the shortest URL (usually the main page)
345
- if len(clean_url) < len(seen_content[base_url]):
346
- cleaned_metadata[clean_url] = metadata
347
- cleaned_metadata.pop(seen_content[base_url], None)
348
- seen_content[base_url] = clean_url
349
- continue
350
-
351
- seen_content[base_url] = clean_url
352
- cleaned_metadata[clean_url] = metadata
353
-
354
- return cleaned_metadata
355
-
356
- def clean_title(self, title):
357
- """Clean and format titles"""
358
- if not title:
359
- return ""
360
 
361
- title = self.normalize_text(title)
 
 
 
 
362
 
363
- # Remove common suffixes and prefixes
364
- patterns = [
365
- r'\s*\|\s*.*
366
-
367
- def is_valid_url(self, url, base_domain):
368
- """Check if URL is valid and belongs to the same domain"""
369
- try:
370
- parsed = urlparse(url)
371
- base_parsed = urlparse(base_domain)
372
- return (parsed.netloc == base_parsed.netloc and
373
- parsed.scheme in ['http', 'https'] and
374
- not url.endswith(('.pdf', '.jpg', '.png', '.gif', '.zip')))
375
- except:
376
- return False
377
-
378
- def extract_content(self, soup):
379
- """Extract meaningful content from HTML"""
380
- # Remove script and style elements
381
- for element in soup(['script', 'style', 'nav', 'footer', 'header']):
382
- element.decompose()
383
-
384
- # Get main content
385
- main_content = soup.find('main') or soup.find('article') or soup.find('div', {'class': re.compile(r'content|main', re.I)})
386
- if main_content:
387
- return self.normalize_text(main_content.get_text(strip=True))
388
- return self.normalize_text(soup.get_text(strip=True))
389
-
390
- def get_page_metadata(self, soup, url):
391
- """Extract metadata from the page"""
392
- metadata = {
393
- 'title': None,
394
- 'description': None,
395
- 'importance': 0,
396
- 'category': 'Optional'
397
- }
398
-
399
- # Title extraction with cleaning
400
- title = (
401
- soup.find('meta', property='og:title')['content'] if soup.find('meta', property='og:title') else
402
- soup.find('title').text if soup.find('title') else
403
- soup.find('h1').text if soup.find('h1') else
404
- url.split('/')[-1]
405
- )
406
- metadata['title'] = self.clean_title(title)
407
-
408
- # Description extraction with cleaning
409
- description = (
410
- soup.find('meta', {'name': 'description'})['content'] if soup.find('meta', {'name': 'description'}) else
411
- soup.find('meta', property='og:description')['content'] if soup.find('meta', property='og:description') else
412
- ""
413
- )
414
- metadata['description'] = self.clean_description(description)
415
-
416
- # Calculate importance and category
417
- url_lower = url.lower()
418
- if 'docs' in url_lower or 'documentation' in url_lower:
419
- metadata['importance'] = 5
420
- metadata['category'] = 'Docs'
421
- elif 'api' in url_lower:
422
- metadata['importance'] = 4
423
- metadata['category'] = 'API'
424
- elif 'guide' in url_lower or 'tutorial' in url_lower:
425
- metadata['importance'] = 3
426
- metadata['category'] = 'Guides'
427
- elif 'example' in url_lower:
428
- metadata['importance'] = 2
429
- metadata['category'] = 'Examples'
430
- elif 'blog' in url_lower:
431
- metadata['importance'] = 1
432
- metadata['category'] = 'Blog'
433
-
434
- return metadata
435
 
436
  async def crawl_page(self, url, depth, base_domain):
437
  """Crawl a single page and extract information"""
@@ -439,26 +43,50 @@ if __name__ == "__main__":
439
  return []
440
 
441
  try:
442
- response = requests.get(url, headers=self.headers, timeout=self.timeout)
443
  response.encoding = 'utf-8'
444
- response.raise_for_status()
445
  self.visited_urls.add(url)
446
 
447
  soup = BeautifulSoup(response.text, 'html.parser')
448
- content = self.extract_content(soup)
449
- metadata = self.get_page_metadata(soup, url)
450
 
451
- self.url_content[url] = content
452
- self.url_metadata[url] = metadata
453
-
454
- # Find all links
455
- links = []
456
- for a in soup.find_all('a', href=True):
457
- next_url = urljoin(url, a['href'])
458
- if self.is_valid_url(next_url, base_domain):
459
- links.append(next_url)
460
-
461
- return links
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
462
 
463
  except Exception as e:
464
  logger.error(f"Error crawling {url}: {str(e)}")
@@ -466,56 +94,55 @@ if __name__ == "__main__":
466
 
467
  async def crawl_website(self, start_url):
468
  """Crawl website starting from the given URL"""
469
- base_domain = start_url
470
  queue = [(start_url, 0)]
471
  seen = {start_url}
472
 
473
  while queue and len(self.visited_urls) < self.max_pages:
474
  current_url, depth = queue.pop(0)
475
-
476
  if depth > self.max_depth:
477
  continue
478
 
479
  links = await self.crawl_page(current_url, depth, base_domain)
480
-
481
  for link in links:
482
- if link not in seen:
483
  seen.add(link)
484
  queue.append((link, depth + 1))
485
 
486
  def generate_llms_txt(self):
487
- """Generate llms.txt content from crawled data"""
488
- # Sort URLs by importance
 
 
 
489
  sorted_urls = sorted(
490
  self.url_metadata.items(),
491
  key=lambda x: (x[1]['importance'], x[0]),
492
  reverse=True
493
  )
494
 
495
- if not sorted_urls:
496
- return "No content was found to generate llms.txt"
497
-
498
- # Group URLs by category
499
- categorized_urls = defaultdict(list)
500
- for url, metadata in sorted_urls:
501
- categorized_urls[metadata['category']].append((url, metadata))
502
-
503
  # Generate content
504
  content = []
505
-
506
- # Add main title and description
507
  main_metadata = sorted_urls[0][1]
508
  content.append(f"# {main_metadata['title']}")
509
  if main_metadata['description']:
510
  content.append(f"\n> {main_metadata['description']}")
 
 
 
 
511
 
512
- # Add categorized sections
513
- priority_order = ['Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional']
514
-
515
- for category in priority_order:
516
- if category in categorized_urls:
 
 
 
 
517
  content.append(f"\n## {category}")
518
- for url, metadata in categorized_urls[category]:
519
  if metadata['description']:
520
  content.append(f"\n- [{metadata['title']}]({url}): {metadata['description']}")
521
  else:
@@ -530,15 +157,12 @@ async def process_url(url, max_depth, max_pages):
530
  if not url.startswith(('http://', 'https://')):
531
  url = 'https://' + url
532
 
533
- # Validate URL format
534
- try:
535
- result = urlparse(url)
536
- if not all([result.scheme, result.netloc]):
537
- return "", "Invalid URL format. Please enter a valid URL."
538
- except:
539
  return "", "Invalid URL format. Please enter a valid URL."
540
 
541
- # Create crawler and process
542
  crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
543
  await crawler.crawl_website(url)
544
  content = crawler.generate_llms_txt()
@@ -548,1758 +172,37 @@ async def process_url(url, max_depth, max_pages):
548
  except Exception as e:
549
  return "", f"Error: {str(e)}"
550
 
551
- # Create custom theme
552
- theme = gr.themes.Soft(
553
- primary_hue="blue",
554
- font="Open Sans"
555
- )
556
 
557
- # Create the Gradio interface
558
- with gr.Blocks(theme=theme, css="""
559
- @import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');
560
-
561
- .gradio-container {
562
- font-family: 'Open Sans', sans-serif !important;
563
- }
564
-
565
- .gr-button {
566
- font-family: 'Open Sans', sans-serif !important;
567
- font-weight: 600 !important;
568
- }
569
-
570
- /* Primary color customization */
571
- .primary-btn {
572
- background-color: #2436d4 !important;
573
- color: white !important;
574
- }
575
-
576
- .primary-btn:hover {
577
- background-color: #1c2aa8 !important;
578
- }
579
-
580
- [data-testid="textbox"] {
581
- font-family: 'Open Sans', sans-serif !important;
582
- }
583
  """) as iface:
584
  gr.Markdown("# llms.txt Generator")
585
- gr.Markdown("Generate an llms.txt file from a website following the specification. The tool crawls the website and creates a structured markdown file suitable for LLMs.")
586
 
587
  with gr.Row():
588
  url_input = gr.Textbox(
589
  label="Website URL",
590
- placeholder="Enter the website URL (e.g., example.com or https://example.com)",
591
- info="The URL will be automatically prefixed with https:// if no protocol is specified."
592
  )
593
 
594
  with gr.Row():
595
  with gr.Column():
596
- depth_input = gr.Slider(
597
- minimum=1,
598
- maximum=5,
599
- value=3,
600
- step=1,
601
- label="Maximum Crawl Depth",
602
- info="Higher values will result in more thorough but slower crawling"
603
- )
604
  with gr.Column():
605
- pages_input = gr.Slider(
606
- minimum=10,
607
- maximum=100,
608
- value=50,
609
- step=10,
610
- label="Maximum Pages to Crawl",
611
- info="Higher values will result in more comprehensive but slower results"
612
- )
613
 
614
  generate_btn = gr.Button("Generate llms.txt", variant="primary")
615
 
616
- with gr.Row():
617
- output = gr.Textbox(
618
- label="Generated llms.txt Content",
619
- lines=20,
620
- max_lines=30,
621
- show_copy_button=True,
622
- container=True,
623
- scale=2,
624
- interactive=True
625
- )
626
-
627
- status = gr.Textbox(label="Status")
628
-
629
- generate_btn.click(
630
- fn=lambda url, depth, pages: asyncio.run(process_url(url, depth, pages)),
631
- inputs=[url_input, depth_input, pages_input],
632
- outputs=[output, status]
633
  )
634
-
635
- # Launch the app
636
- if __name__ == "__main__":
637
- iface.launch()
638
- , # Remove pipe and everything after
639
- r'\s*-\s*.*
640
-
641
- def is_valid_url(self, url, base_domain):
642
- """Check if URL is valid and belongs to the same domain"""
643
- try:
644
- parsed = urlparse(url)
645
- base_parsed = urlparse(base_domain)
646
- return (parsed.netloc == base_parsed.netloc and
647
- parsed.scheme in ['http', 'https'] and
648
- not url.endswith(('.pdf', '.jpg', '.png', '.gif', '.zip')))
649
- except:
650
- return False
651
-
652
- def extract_content(self, soup):
653
- """Extract meaningful content from HTML"""
654
- # Remove script and style elements
655
- for element in soup(['script', 'style', 'nav', 'footer', 'header']):
656
- element.decompose()
657
-
658
- # Get main content
659
- main_content = soup.find('main') or soup.find('article') or soup.find('div', {'class': re.compile(r'content|main', re.I)})
660
- if main_content:
661
- return self.normalize_text(main_content.get_text(strip=True))
662
- return self.normalize_text(soup.get_text(strip=True))
663
-
664
- def get_page_metadata(self, soup, url):
665
- """Extract metadata from the page"""
666
- metadata = {
667
- 'title': None,
668
- 'description': None,
669
- 'importance': 0,
670
- 'category': 'Optional'
671
- }
672
-
673
- # Title extraction with cleaning
674
- title = (
675
- soup.find('meta', property='og:title')['content'] if soup.find('meta', property='og:title') else
676
- soup.find('title').text if soup.find('title') else
677
- soup.find('h1').text if soup.find('h1') else
678
- url.split('/')[-1]
679
- )
680
- metadata['title'] = self.clean_title(title)
681
-
682
- # Description extraction with cleaning
683
- description = (
684
- soup.find('meta', {'name': 'description'})['content'] if soup.find('meta', {'name': 'description'}) else
685
- soup.find('meta', property='og:description')['content'] if soup.find('meta', property='og:description') else
686
- ""
687
- )
688
- metadata['description'] = self.clean_description(description)
689
-
690
- # Calculate importance and category
691
- url_lower = url.lower()
692
- if 'docs' in url_lower or 'documentation' in url_lower:
693
- metadata['importance'] = 5
694
- metadata['category'] = 'Docs'
695
- elif 'api' in url_lower:
696
- metadata['importance'] = 4
697
- metadata['category'] = 'API'
698
- elif 'guide' in url_lower or 'tutorial' in url_lower:
699
- metadata['importance'] = 3
700
- metadata['category'] = 'Guides'
701
- elif 'example' in url_lower:
702
- metadata['importance'] = 2
703
- metadata['category'] = 'Examples'
704
- elif 'blog' in url_lower:
705
- metadata['importance'] = 1
706
- metadata['category'] = 'Blog'
707
-
708
- return metadata
709
-
710
- async def crawl_page(self, url, depth, base_domain):
711
- """Crawl a single page and extract information"""
712
- if depth > self.max_depth or url in self.visited_urls or len(self.visited_urls) >= self.max_pages:
713
- return []
714
-
715
- try:
716
- response = requests.get(url, headers=self.headers, timeout=self.timeout)
717
- response.encoding = 'utf-8'
718
- response.raise_for_status()
719
- self.visited_urls.add(url)
720
-
721
- soup = BeautifulSoup(response.text, 'html.parser')
722
- content = self.extract_content(soup)
723
- metadata = self.get_page_metadata(soup, url)
724
-
725
- self.url_content[url] = content
726
- self.url_metadata[url] = metadata
727
-
728
- # Find all links
729
- links = []
730
- for a in soup.find_all('a', href=True):
731
- next_url = urljoin(url, a['href'])
732
- if self.is_valid_url(next_url, base_domain):
733
- links.append(next_url)
734
-
735
- return links
736
-
737
- except Exception as e:
738
- logger.error(f"Error crawling {url}: {str(e)}")
739
- return []
740
-
741
- async def crawl_website(self, start_url):
742
- """Crawl website starting from the given URL"""
743
- base_domain = start_url
744
- queue = [(start_url, 0)]
745
- seen = {start_url}
746
-
747
- while queue and len(self.visited_urls) < self.max_pages:
748
- current_url, depth = queue.pop(0)
749
-
750
- if depth > self.max_depth:
751
- continue
752
-
753
- links = await self.crawl_page(current_url, depth, base_domain)
754
-
755
- for link in links:
756
- if link not in seen:
757
- seen.add(link)
758
- queue.append((link, depth + 1))
759
-
760
- def generate_llms_txt(self):
761
- """Generate llms.txt content from crawled data"""
762
- # Sort URLs by importance
763
- sorted_urls = sorted(
764
- self.url_metadata.items(),
765
- key=lambda x: (x[1]['importance'], x[0]),
766
- reverse=True
767
- )
768
-
769
- if not sorted_urls:
770
- return "No content was found to generate llms.txt"
771
-
772
- # Group URLs by category
773
- categorized_urls = defaultdict(list)
774
- for url, metadata in sorted_urls:
775
- categorized_urls[metadata['category']].append((url, metadata))
776
-
777
- # Generate content
778
- content = []
779
-
780
- # Add main title and description
781
- main_metadata = sorted_urls[0][1]
782
- content.append(f"# {main_metadata['title']}")
783
- if main_metadata['description']:
784
- content.append(f"\n> {main_metadata['description']}")
785
-
786
- # Add categorized sections
787
- priority_order = ['Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional']
788
-
789
- for category in priority_order:
790
- if category in categorized_urls:
791
- content.append(f"\n## {category}")
792
- for url, metadata in categorized_urls[category]:
793
- if metadata['description']:
794
- content.append(f"\n- [{metadata['title']}]({url}): {metadata['description']}")
795
- else:
796
- content.append(f"\n- [{metadata['title']}]({url})")
797
-
798
- return "\n".join(content)
799
-
800
- async def process_url(url, max_depth, max_pages):
801
- """Process URL and generate llms.txt"""
802
- try:
803
- # Add https:// if not present
804
- if not url.startswith(('http://', 'https://')):
805
- url = 'https://' + url
806
-
807
- # Validate URL format
808
- try:
809
- result = urlparse(url)
810
- if not all([result.scheme, result.netloc]):
811
- return "", "Invalid URL format. Please enter a valid URL."
812
- except:
813
- return "", "Invalid URL format. Please enter a valid URL."
814
-
815
- # Create crawler and process
816
- crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
817
- await crawler.crawl_website(url)
818
- content = crawler.generate_llms_txt()
819
-
820
- return content, f"Successfully crawled {len(crawler.visited_urls)} pages."
821
-
822
- except Exception as e:
823
- return "", f"Error: {str(e)}"
824
-
825
- # Create custom theme
826
- theme = gr.themes.Soft(
827
- primary_hue="blue",
828
- font="Open Sans"
829
- )
830
-
831
- # Create the Gradio interface
832
- with gr.Blocks(theme=theme, css="""
833
- @import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');
834
-
835
- .gradio-container {
836
- font-family: 'Open Sans', sans-serif !important;
837
- }
838
-
839
- .gr-button {
840
- font-family: 'Open Sans', sans-serif !important;
841
- font-weight: 600 !important;
842
- }
843
-
844
- /* Primary color customization */
845
- .primary-btn {
846
- background-color: #2436d4 !important;
847
- color: white !important;
848
- }
849
-
850
- .primary-btn:hover {
851
- background-color: #1c2aa8 !important;
852
- }
853
-
854
- [data-testid="textbox"] {
855
- font-family: 'Open Sans', sans-serif !important;
856
- }
857
- """) as iface:
858
- gr.Markdown("# llms.txt Generator")
859
- gr.Markdown("Generate an llms.txt file from a website following the specification. The tool crawls the website and creates a structured markdown file suitable for LLMs.")
860
-
861
- with gr.Row():
862
- url_input = gr.Textbox(
863
- label="Website URL",
864
- placeholder="Enter the website URL (e.g., example.com or https://example.com)",
865
- info="The URL will be automatically prefixed with https:// if no protocol is specified."
866
- )
867
-
868
- with gr.Row():
869
- with gr.Column():
870
- depth_input = gr.Slider(
871
- minimum=1,
872
- maximum=5,
873
- value=3,
874
- step=1,
875
- label="Maximum Crawl Depth",
876
- info="Higher values will result in more thorough but slower crawling"
877
- )
878
- with gr.Column():
879
- pages_input = gr.Slider(
880
- minimum=10,
881
- maximum=100,
882
- value=50,
883
- step=10,
884
- label="Maximum Pages to Crawl",
885
- info="Higher values will result in more comprehensive but slower results"
886
- )
887
-
888
- generate_btn = gr.Button("Generate llms.txt", variant="primary")
889
-
890
- with gr.Row():
891
- output = gr.Textbox(
892
- label="Generated llms.txt Content",
893
- lines=20,
894
- max_lines=30,
895
- show_copy_button=True,
896
- container=True,
897
- scale=2,
898
- interactive=True
899
- )
900
-
901
- status = gr.Textbox(label="Status")
902
-
903
- generate_btn.click(
904
- fn=lambda url, depth, pages: asyncio.run(process_url(url, depth, pages)),
905
- inputs=[url_input, depth_input, pages_input],
906
- outputs=[output, status]
907
- )
908
-
909
- # Launch the app
910
- if __name__ == "__main__":
911
- iface.launch()
912
- , # Remove dash and everything after
913
- r'\s*:\s*.*
914
-
915
- def is_valid_url(self, url, base_domain):
916
- """Check if URL is valid and belongs to the same domain"""
917
- try:
918
- parsed = urlparse(url)
919
- base_parsed = urlparse(base_domain)
920
- return (parsed.netloc == base_parsed.netloc and
921
- parsed.scheme in ['http', 'https'] and
922
- not url.endswith(('.pdf', '.jpg', '.png', '.gif', '.zip')))
923
- except:
924
- return False
925
-
926
- def extract_content(self, soup):
927
- """Extract meaningful content from HTML"""
928
- # Remove script and style elements
929
- for element in soup(['script', 'style', 'nav', 'footer', 'header']):
930
- element.decompose()
931
-
932
- # Get main content
933
- main_content = soup.find('main') or soup.find('article') or soup.find('div', {'class': re.compile(r'content|main', re.I)})
934
- if main_content:
935
- return self.normalize_text(main_content.get_text(strip=True))
936
- return self.normalize_text(soup.get_text(strip=True))
937
-
938
- def get_page_metadata(self, soup, url):
939
- """Extract metadata from the page"""
940
- metadata = {
941
- 'title': None,
942
- 'description': None,
943
- 'importance': 0,
944
- 'category': 'Optional'
945
- }
946
-
947
- # Title extraction with cleaning
948
- title = (
949
- soup.find('meta', property='og:title')['content'] if soup.find('meta', property='og:title') else
950
- soup.find('title').text if soup.find('title') else
951
- soup.find('h1').text if soup.find('h1') else
952
- url.split('/')[-1]
953
- )
954
- metadata['title'] = self.clean_title(title)
955
-
956
- # Description extraction with cleaning
957
- description = (
958
- soup.find('meta', {'name': 'description'})['content'] if soup.find('meta', {'name': 'description'}) else
959
- soup.find('meta', property='og:description')['content'] if soup.find('meta', property='og:description') else
960
- ""
961
- )
962
- metadata['description'] = self.clean_description(description)
963
-
964
- # Calculate importance and category
965
- url_lower = url.lower()
966
- if 'docs' in url_lower or 'documentation' in url_lower:
967
- metadata['importance'] = 5
968
- metadata['category'] = 'Docs'
969
- elif 'api' in url_lower:
970
- metadata['importance'] = 4
971
- metadata['category'] = 'API'
972
- elif 'guide' in url_lower or 'tutorial' in url_lower:
973
- metadata['importance'] = 3
974
- metadata['category'] = 'Guides'
975
- elif 'example' in url_lower:
976
- metadata['importance'] = 2
977
- metadata['category'] = 'Examples'
978
- elif 'blog' in url_lower:
979
- metadata['importance'] = 1
980
- metadata['category'] = 'Blog'
981
-
982
- return metadata
983
-
984
- async def crawl_page(self, url, depth, base_domain):
985
- """Crawl a single page and extract information"""
986
- if depth > self.max_depth or url in self.visited_urls or len(self.visited_urls) >= self.max_pages:
987
- return []
988
-
989
- try:
990
- response = requests.get(url, headers=self.headers, timeout=self.timeout)
991
- response.encoding = 'utf-8'
992
- response.raise_for_status()
993
- self.visited_urls.add(url)
994
-
995
- soup = BeautifulSoup(response.text, 'html.parser')
996
- content = self.extract_content(soup)
997
- metadata = self.get_page_metadata(soup, url)
998
-
999
- self.url_content[url] = content
1000
- self.url_metadata[url] = metadata
1001
-
1002
- # Find all links
1003
- links = []
1004
- for a in soup.find_all('a', href=True):
1005
- next_url = urljoin(url, a['href'])
1006
- if self.is_valid_url(next_url, base_domain):
1007
- links.append(next_url)
1008
-
1009
- return links
1010
-
1011
- except Exception as e:
1012
- logger.error(f"Error crawling {url}: {str(e)}")
1013
- return []
1014
-
1015
- async def crawl_website(self, start_url):
1016
- """Crawl website starting from the given URL"""
1017
- base_domain = start_url
1018
- queue = [(start_url, 0)]
1019
- seen = {start_url}
1020
-
1021
- while queue and len(self.visited_urls) < self.max_pages:
1022
- current_url, depth = queue.pop(0)
1023
-
1024
- if depth > self.max_depth:
1025
- continue
1026
-
1027
- links = await self.crawl_page(current_url, depth, base_domain)
1028
-
1029
- for link in links:
1030
- if link not in seen:
1031
- seen.add(link)
1032
- queue.append((link, depth + 1))
1033
-
1034
- def generate_llms_txt(self):
1035
- """Generate llms.txt content from crawled data"""
1036
- # Sort URLs by importance
1037
- sorted_urls = sorted(
1038
- self.url_metadata.items(),
1039
- key=lambda x: (x[1]['importance'], x[0]),
1040
- reverse=True
1041
- )
1042
-
1043
- if not sorted_urls:
1044
- return "No content was found to generate llms.txt"
1045
-
1046
- # Group URLs by category
1047
- categorized_urls = defaultdict(list)
1048
- for url, metadata in sorted_urls:
1049
- categorized_urls[metadata['category']].append((url, metadata))
1050
-
1051
- # Generate content
1052
- content = []
1053
-
1054
- # Add main title and description
1055
- main_metadata = sorted_urls[0][1]
1056
- content.append(f"# {main_metadata['title']}")
1057
- if main_metadata['description']:
1058
- content.append(f"\n> {main_metadata['description']}")
1059
-
1060
- # Add categorized sections
1061
- priority_order = ['Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional']
1062
-
1063
- for category in priority_order:
1064
- if category in categorized_urls:
1065
- content.append(f"\n## {category}")
1066
- for url, metadata in categorized_urls[category]:
1067
- if metadata['description']:
1068
- content.append(f"\n- [{metadata['title']}]({url}): {metadata['description']}")
1069
- else:
1070
- content.append(f"\n- [{metadata['title']}]({url})")
1071
-
1072
- return "\n".join(content)
1073
-
1074
- async def process_url(url, max_depth, max_pages):
1075
- """Process URL and generate llms.txt"""
1076
- try:
1077
- # Add https:// if not present
1078
- if not url.startswith(('http://', 'https://')):
1079
- url = 'https://' + url
1080
-
1081
- # Validate URL format
1082
- try:
1083
- result = urlparse(url)
1084
- if not all([result.scheme, result.netloc]):
1085
- return "", "Invalid URL format. Please enter a valid URL."
1086
- except:
1087
- return "", "Invalid URL format. Please enter a valid URL."
1088
-
1089
- # Create crawler and process
1090
- crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
1091
- await crawler.crawl_website(url)
1092
- content = crawler.generate_llms_txt()
1093
-
1094
- return content, f"Successfully crawled {len(crawler.visited_urls)} pages."
1095
-
1096
- except Exception as e:
1097
- return "", f"Error: {str(e)}"
1098
-
1099
- # Create custom theme
1100
- theme = gr.themes.Soft(
1101
- primary_hue="blue",
1102
- font="Open Sans"
1103
- )
1104
-
1105
- # Create the Gradio interface
1106
- with gr.Blocks(theme=theme, css="""
1107
- @import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');
1108
-
1109
- .gradio-container {
1110
- font-family: 'Open Sans', sans-serif !important;
1111
- }
1112
-
1113
- .gr-button {
1114
- font-family: 'Open Sans', sans-serif !important;
1115
- font-weight: 600 !important;
1116
- }
1117
-
1118
- /* Primary color customization */
1119
- .primary-btn {
1120
- background-color: #2436d4 !important;
1121
- color: white !important;
1122
- }
1123
-
1124
- .primary-btn:hover {
1125
- background-color: #1c2aa8 !important;
1126
- }
1127
-
1128
- [data-testid="textbox"] {
1129
- font-family: 'Open Sans', sans-serif !important;
1130
- }
1131
- """) as iface:
1132
- gr.Markdown("# llms.txt Generator")
1133
- gr.Markdown("Generate an llms.txt file from a website following the specification. The tool crawls the website and creates a structured markdown file suitable for LLMs.")
1134
-
1135
- with gr.Row():
1136
- url_input = gr.Textbox(
1137
- label="Website URL",
1138
- placeholder="Enter the website URL (e.g., example.com or https://example.com)",
1139
- info="The URL will be automatically prefixed with https:// if no protocol is specified."
1140
- )
1141
-
1142
- with gr.Row():
1143
- with gr.Column():
1144
- depth_input = gr.Slider(
1145
- minimum=1,
1146
- maximum=5,
1147
- value=3,
1148
- step=1,
1149
- label="Maximum Crawl Depth",
1150
- info="Higher values will result in more thorough but slower crawling"
1151
- )
1152
- with gr.Column():
1153
- pages_input = gr.Slider(
1154
- minimum=10,
1155
- maximum=100,
1156
- value=50,
1157
- step=10,
1158
- label="Maximum Pages to Crawl",
1159
- info="Higher values will result in more comprehensive but slower results"
1160
- )
1161
-
1162
- generate_btn = gr.Button("Generate llms.txt", variant="primary")
1163
-
1164
- with gr.Row():
1165
- output = gr.Textbox(
1166
- label="Generated llms.txt Content",
1167
- lines=20,
1168
- max_lines=30,
1169
- show_copy_button=True,
1170
- container=True,
1171
- scale=2,
1172
- interactive=True
1173
- )
1174
-
1175
- status = gr.Textbox(label="Status")
1176
-
1177
- generate_btn.click(
1178
- fn=lambda url, depth, pages: asyncio.run(process_url(url, depth, pages)),
1179
- inputs=[url_input, depth_input, pages_input],
1180
- outputs=[output, status]
1181
- )
1182
-
1183
- # Launch the app
1184
- if __name__ == "__main__":
1185
- iface.launch()
1186
- , # Remove colon and everything after
1187
- r'#.*
1188
-
1189
- def is_valid_url(self, url, base_domain):
1190
- """Check if URL is valid and belongs to the same domain"""
1191
- try:
1192
- parsed = urlparse(url)
1193
- base_parsed = urlparse(base_domain)
1194
- return (parsed.netloc == base_parsed.netloc and
1195
- parsed.scheme in ['http', 'https'] and
1196
- not url.endswith(('.pdf', '.jpg', '.png', '.gif', '.zip')))
1197
- except:
1198
- return False
1199
-
1200
- def extract_content(self, soup):
1201
- """Extract meaningful content from HTML"""
1202
- # Remove script and style elements
1203
- for element in soup(['script', 'style', 'nav', 'footer', 'header']):
1204
- element.decompose()
1205
-
1206
- # Get main content
1207
- main_content = soup.find('main') or soup.find('article') or soup.find('div', {'class': re.compile(r'content|main', re.I)})
1208
- if main_content:
1209
- return self.normalize_text(main_content.get_text(strip=True))
1210
- return self.normalize_text(soup.get_text(strip=True))
1211
-
1212
- def get_page_metadata(self, soup, url):
1213
- """Extract metadata from the page"""
1214
- metadata = {
1215
- 'title': None,
1216
- 'description': None,
1217
- 'importance': 0,
1218
- 'category': 'Optional'
1219
- }
1220
-
1221
- # Title extraction with cleaning
1222
- title = (
1223
- soup.find('meta', property='og:title')['content'] if soup.find('meta', property='og:title') else
1224
- soup.find('title').text if soup.find('title') else
1225
- soup.find('h1').text if soup.find('h1') else
1226
- url.split('/')[-1]
1227
- )
1228
- metadata['title'] = self.clean_title(title)
1229
-
1230
- # Description extraction with cleaning
1231
- description = (
1232
- soup.find('meta', {'name': 'description'})['content'] if soup.find('meta', {'name': 'description'}) else
1233
- soup.find('meta', property='og:description')['content'] if soup.find('meta', property='og:description') else
1234
- ""
1235
- )
1236
- metadata['description'] = self.clean_description(description)
1237
-
1238
- # Calculate importance and category
1239
- url_lower = url.lower()
1240
- if 'docs' in url_lower or 'documentation' in url_lower:
1241
- metadata['importance'] = 5
1242
- metadata['category'] = 'Docs'
1243
- elif 'api' in url_lower:
1244
- metadata['importance'] = 4
1245
- metadata['category'] = 'API'
1246
- elif 'guide' in url_lower or 'tutorial' in url_lower:
1247
- metadata['importance'] = 3
1248
- metadata['category'] = 'Guides'
1249
- elif 'example' in url_lower:
1250
- metadata['importance'] = 2
1251
- metadata['category'] = 'Examples'
1252
- elif 'blog' in url_lower:
1253
- metadata['importance'] = 1
1254
- metadata['category'] = 'Blog'
1255
-
1256
- return metadata
1257
-
1258
- async def crawl_page(self, url, depth, base_domain):
1259
- """Crawl a single page and extract information"""
1260
- if depth > self.max_depth or url in self.visited_urls or len(self.visited_urls) >= self.max_pages:
1261
- return []
1262
-
1263
- try:
1264
- response = requests.get(url, headers=self.headers, timeout=self.timeout)
1265
- response.encoding = 'utf-8'
1266
- response.raise_for_status()
1267
- self.visited_urls.add(url)
1268
-
1269
- soup = BeautifulSoup(response.text, 'html.parser')
1270
- content = self.extract_content(soup)
1271
- metadata = self.get_page_metadata(soup, url)
1272
-
1273
- self.url_content[url] = content
1274
- self.url_metadata[url] = metadata
1275
-
1276
- # Find all links
1277
- links = []
1278
- for a in soup.find_all('a', href=True):
1279
- next_url = urljoin(url, a['href'])
1280
- if self.is_valid_url(next_url, base_domain):
1281
- links.append(next_url)
1282
-
1283
- return links
1284
-
1285
- except Exception as e:
1286
- logger.error(f"Error crawling {url}: {str(e)}")
1287
- return []
1288
-
1289
- async def crawl_website(self, start_url):
1290
- """Crawl website starting from the given URL"""
1291
- base_domain = start_url
1292
- queue = [(start_url, 0)]
1293
- seen = {start_url}
1294
-
1295
- while queue and len(self.visited_urls) < self.max_pages:
1296
- current_url, depth = queue.pop(0)
1297
-
1298
- if depth > self.max_depth:
1299
- continue
1300
-
1301
- links = await self.crawl_page(current_url, depth, base_domain)
1302
-
1303
- for link in links:
1304
- if link not in seen:
1305
- seen.add(link)
1306
- queue.append((link, depth + 1))
1307
-
1308
- def generate_llms_txt(self):
1309
- """Generate llms.txt content from crawled data"""
1310
- # Sort URLs by importance
1311
- sorted_urls = sorted(
1312
- self.url_metadata.items(),
1313
- key=lambda x: (x[1]['importance'], x[0]),
1314
- reverse=True
1315
- )
1316
-
1317
- if not sorted_urls:
1318
- return "No content was found to generate llms.txt"
1319
-
1320
- # Group URLs by category
1321
- categorized_urls = defaultdict(list)
1322
- for url, metadata in sorted_urls:
1323
- categorized_urls[metadata['category']].append((url, metadata))
1324
-
1325
- # Generate content
1326
- content = []
1327
-
1328
- # Add main title and description
1329
- main_metadata = sorted_urls[0][1]
1330
- content.append(f"# {main_metadata['title']}")
1331
- if main_metadata['description']:
1332
- content.append(f"\n> {main_metadata['description']}")
1333
-
1334
- # Add categorized sections
1335
- priority_order = ['Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional']
1336
-
1337
- for category in priority_order:
1338
- if category in categorized_urls:
1339
- content.append(f"\n## {category}")
1340
- for url, metadata in categorized_urls[category]:
1341
- if metadata['description']:
1342
- content.append(f"\n- [{metadata['title']}]({url}): {metadata['description']}")
1343
- else:
1344
- content.append(f"\n- [{metadata['title']}]({url})")
1345
-
1346
- return "\n".join(content)
1347
-
1348
- async def process_url(url, max_depth, max_pages):
1349
- """Process URL and generate llms.txt"""
1350
- try:
1351
- # Add https:// if not present
1352
- if not url.startswith(('http://', 'https://')):
1353
- url = 'https://' + url
1354
-
1355
- # Validate URL format
1356
- try:
1357
- result = urlparse(url)
1358
- if not all([result.scheme, result.netloc]):
1359
- return "", "Invalid URL format. Please enter a valid URL."
1360
- except:
1361
- return "", "Invalid URL format. Please enter a valid URL."
1362
-
1363
- # Create crawler and process
1364
- crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
1365
- await crawler.crawl_website(url)
1366
- content = crawler.generate_llms_txt()
1367
-
1368
- return content, f"Successfully crawled {len(crawler.visited_urls)} pages."
1369
-
1370
- except Exception as e:
1371
- return "", f"Error: {str(e)}"
1372
-
1373
- # Create custom theme
1374
- theme = gr.themes.Soft(
1375
- primary_hue="blue",
1376
- font="Open Sans"
1377
- )
1378
-
1379
- # Create the Gradio interface
1380
- with gr.Blocks(theme=theme, css="""
1381
- @import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');
1382
-
1383
- .gradio-container {
1384
- font-family: 'Open Sans', sans-serif !important;
1385
- }
1386
-
1387
- .gr-button {
1388
- font-family: 'Open Sans', sans-serif !important;
1389
- font-weight: 600 !important;
1390
- }
1391
-
1392
- /* Primary color customization */
1393
- .primary-btn {
1394
- background-color: #2436d4 !important;
1395
- color: white !important;
1396
- }
1397
-
1398
- .primary-btn:hover {
1399
- background-color: #1c2aa8 !important;
1400
- }
1401
-
1402
- [data-testid="textbox"] {
1403
- font-family: 'Open Sans', sans-serif !important;
1404
- }
1405
- """) as iface:
1406
- gr.Markdown("# llms.txt Generator")
1407
- gr.Markdown("Generate an llms.txt file from a website following the specification. The tool crawls the website and creates a structured markdown file suitable for LLMs.")
1408
-
1409
- with gr.Row():
1410
- url_input = gr.Textbox(
1411
- label="Website URL",
1412
- placeholder="Enter the website URL (e.g., example.com or https://example.com)",
1413
- info="The URL will be automatically prefixed with https:// if no protocol is specified."
1414
- )
1415
-
1416
- with gr.Row():
1417
- with gr.Column():
1418
- depth_input = gr.Slider(
1419
- minimum=1,
1420
- maximum=5,
1421
- value=3,
1422
- step=1,
1423
- label="Maximum Crawl Depth",
1424
- info="Higher values will result in more thorough but slower crawling"
1425
- )
1426
- with gr.Column():
1427
- pages_input = gr.Slider(
1428
- minimum=10,
1429
- maximum=100,
1430
- value=50,
1431
- step=10,
1432
- label="Maximum Pages to Crawl",
1433
- info="Higher values will result in more comprehensive but slower results"
1434
- )
1435
-
1436
- generate_btn = gr.Button("Generate llms.txt", variant="primary")
1437
-
1438
- with gr.Row():
1439
- output = gr.Textbox(
1440
- label="Generated llms.txt Content",
1441
- lines=20,
1442
- max_lines=30,
1443
- show_copy_button=True,
1444
- container=True,
1445
- scale=2,
1446
- interactive=True
1447
- )
1448
-
1449
- status = gr.Textbox(label="Status")
1450
-
1451
- generate_btn.click(
1452
- fn=lambda url, depth, pages: asyncio.run(process_url(url, depth, pages)),
1453
- inputs=[url_input, depth_input, pages_input],
1454
- outputs=[output, status]
1455
- )
1456
-
1457
- # Launch the app
1458
- if __name__ == "__main__":
1459
- iface.launch()
1460
- , # Remove hash and everything after
1461
- r'\s*\|.*
1462
-
1463
- def is_valid_url(self, url, base_domain):
1464
- """Check if URL is valid and belongs to the same domain"""
1465
- try:
1466
- parsed = urlparse(url)
1467
- base_parsed = urlparse(base_domain)
1468
- return (parsed.netloc == base_parsed.netloc and
1469
- parsed.scheme in ['http', 'https'] and
1470
- not url.endswith(('.pdf', '.jpg', '.png', '.gif', '.zip')))
1471
- except:
1472
- return False
1473
-
1474
- def extract_content(self, soup):
1475
- """Extract meaningful content from HTML"""
1476
- # Remove script and style elements
1477
- for element in soup(['script', 'style', 'nav', 'footer', 'header']):
1478
- element.decompose()
1479
-
1480
- # Get main content
1481
- main_content = soup.find('main') or soup.find('article') or soup.find('div', {'class': re.compile(r'content|main', re.I)})
1482
- if main_content:
1483
- return self.normalize_text(main_content.get_text(strip=True))
1484
- return self.normalize_text(soup.get_text(strip=True))
1485
-
1486
- def get_page_metadata(self, soup, url):
1487
- """Extract metadata from the page"""
1488
- metadata = {
1489
- 'title': None,
1490
- 'description': None,
1491
- 'importance': 0,
1492
- 'category': 'Optional'
1493
- }
1494
-
1495
- # Title extraction with cleaning
1496
- title = (
1497
- soup.find('meta', property='og:title')['content'] if soup.find('meta', property='og:title') else
1498
- soup.find('title').text if soup.find('title') else
1499
- soup.find('h1').text if soup.find('h1') else
1500
- url.split('/')[-1]
1501
- )
1502
- metadata['title'] = self.clean_title(title)
1503
-
1504
- # Description extraction with cleaning
1505
- description = (
1506
- soup.find('meta', {'name': 'description'})['content'] if soup.find('meta', {'name': 'description'}) else
1507
- soup.find('meta', property='og:description')['content'] if soup.find('meta', property='og:description') else
1508
- ""
1509
- )
1510
- metadata['description'] = self.clean_description(description)
1511
-
1512
- # Calculate importance and category
1513
- url_lower = url.lower()
1514
- if 'docs' in url_lower or 'documentation' in url_lower:
1515
- metadata['importance'] = 5
1516
- metadata['category'] = 'Docs'
1517
- elif 'api' in url_lower:
1518
- metadata['importance'] = 4
1519
- metadata['category'] = 'API'
1520
- elif 'guide' in url_lower or 'tutorial' in url_lower:
1521
- metadata['importance'] = 3
1522
- metadata['category'] = 'Guides'
1523
- elif 'example' in url_lower:
1524
- metadata['importance'] = 2
1525
- metadata['category'] = 'Examples'
1526
- elif 'blog' in url_lower:
1527
- metadata['importance'] = 1
1528
- metadata['category'] = 'Blog'
1529
-
1530
- return metadata
1531
-
1532
- async def crawl_page(self, url, depth, base_domain):
1533
- """Crawl a single page and extract information"""
1534
- if depth > self.max_depth or url in self.visited_urls or len(self.visited_urls) >= self.max_pages:
1535
- return []
1536
-
1537
- try:
1538
- response = requests.get(url, headers=self.headers, timeout=self.timeout)
1539
- response.encoding = 'utf-8'
1540
- response.raise_for_status()
1541
- self.visited_urls.add(url)
1542
-
1543
- soup = BeautifulSoup(response.text, 'html.parser')
1544
- content = self.extract_content(soup)
1545
- metadata = self.get_page_metadata(soup, url)
1546
-
1547
- self.url_content[url] = content
1548
- self.url_metadata[url] = metadata
1549
-
1550
- # Find all links
1551
- links = []
1552
- for a in soup.find_all('a', href=True):
1553
- next_url = urljoin(url, a['href'])
1554
- if self.is_valid_url(next_url, base_domain):
1555
- links.append(next_url)
1556
-
1557
- return links
1558
-
1559
- except Exception as e:
1560
- logger.error(f"Error crawling {url}: {str(e)}")
1561
- return []
1562
-
1563
- async def crawl_website(self, start_url):
1564
- """Crawl website starting from the given URL"""
1565
- base_domain = start_url
1566
- queue = [(start_url, 0)]
1567
- seen = {start_url}
1568
-
1569
- while queue and len(self.visited_urls) < self.max_pages:
1570
- current_url, depth = queue.pop(0)
1571
-
1572
- if depth > self.max_depth:
1573
- continue
1574
-
1575
- links = await self.crawl_page(current_url, depth, base_domain)
1576
-
1577
- for link in links:
1578
- if link not in seen:
1579
- seen.add(link)
1580
- queue.append((link, depth + 1))
1581
-
1582
- def generate_llms_txt(self):
1583
- """Generate llms.txt content from crawled data"""
1584
- # Sort URLs by importance
1585
- sorted_urls = sorted(
1586
- self.url_metadata.items(),
1587
- key=lambda x: (x[1]['importance'], x[0]),
1588
- reverse=True
1589
- )
1590
-
1591
- if not sorted_urls:
1592
- return "No content was found to generate llms.txt"
1593
-
1594
- # Group URLs by category
1595
- categorized_urls = defaultdict(list)
1596
- for url, metadata in sorted_urls:
1597
- categorized_urls[metadata['category']].append((url, metadata))
1598
-
1599
- # Generate content
1600
- content = []
1601
-
1602
- # Add main title and description
1603
- main_metadata = sorted_urls[0][1]
1604
- content.append(f"# {main_metadata['title']}")
1605
- if main_metadata['description']:
1606
- content.append(f"\n> {main_metadata['description']}")
1607
-
1608
- # Add categorized sections
1609
- priority_order = ['Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional']
1610
-
1611
- for category in priority_order:
1612
- if category in categorized_urls:
1613
- content.append(f"\n## {category}")
1614
- for url, metadata in categorized_urls[category]:
1615
- if metadata['description']:
1616
- content.append(f"\n- [{metadata['title']}]({url}): {metadata['description']}")
1617
- else:
1618
- content.append(f"\n- [{metadata['title']}]({url})")
1619
-
1620
- return "\n".join(content)
1621
-
1622
- async def process_url(url, max_depth, max_pages):
1623
- """Process URL and generate llms.txt"""
1624
- try:
1625
- # Add https:// if not present
1626
- if not url.startswith(('http://', 'https://')):
1627
- url = 'https://' + url
1628
-
1629
- # Validate URL format
1630
- try:
1631
- result = urlparse(url)
1632
- if not all([result.scheme, result.netloc]):
1633
- return "", "Invalid URL format. Please enter a valid URL."
1634
- except:
1635
- return "", "Invalid URL format. Please enter a valid URL."
1636
-
1637
- # Create crawler and process
1638
- crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
1639
- await crawler.crawl_website(url)
1640
- content = crawler.generate_llms_txt()
1641
-
1642
- return content, f"Successfully crawled {len(crawler.visited_urls)} pages."
1643
-
1644
- except Exception as e:
1645
- return "", f"Error: {str(e)}"
1646
-
1647
- # Create custom theme
1648
- theme = gr.themes.Soft(
1649
- primary_hue="blue",
1650
- font="Open Sans"
1651
- )
1652
-
1653
- # Create the Gradio interface
1654
- with gr.Blocks(theme=theme, css="""
1655
- @import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');
1656
-
1657
- .gradio-container {
1658
- font-family: 'Open Sans', sans-serif !important;
1659
- }
1660
-
1661
- .gr-button {
1662
- font-family: 'Open Sans', sans-serif !important;
1663
- font-weight: 600 !important;
1664
- }
1665
-
1666
- /* Primary color customization */
1667
- .primary-btn {
1668
- background-color: #2436d4 !important;
1669
- color: white !important;
1670
- }
1671
-
1672
- .primary-btn:hover {
1673
- background-color: #1c2aa8 !important;
1674
- }
1675
-
1676
- [data-testid="textbox"] {
1677
- font-family: 'Open Sans', sans-serif !important;
1678
- }
1679
- """) as iface:
1680
- gr.Markdown("# llms.txt Generator")
1681
- gr.Markdown("Generate an llms.txt file from a website following the specification. The tool crawls the website and creates a structured markdown file suitable for LLMs.")
1682
-
1683
- with gr.Row():
1684
- url_input = gr.Textbox(
1685
- label="Website URL",
1686
- placeholder="Enter the website URL (e.g., example.com or https://example.com)",
1687
- info="The URL will be automatically prefixed with https:// if no protocol is specified."
1688
- )
1689
-
1690
- with gr.Row():
1691
- with gr.Column():
1692
- depth_input = gr.Slider(
1693
- minimum=1,
1694
- maximum=5,
1695
- value=3,
1696
- step=1,
1697
- label="Maximum Crawl Depth",
1698
- info="Higher values will result in more thorough but slower crawling"
1699
- )
1700
- with gr.Column():
1701
- pages_input = gr.Slider(
1702
- minimum=10,
1703
- maximum=100,
1704
- value=50,
1705
- step=10,
1706
- label="Maximum Pages to Crawl",
1707
- info="Higher values will result in more comprehensive but slower results"
1708
- )
1709
-
1710
- generate_btn = gr.Button("Generate llms.txt", variant="primary")
1711
-
1712
- with gr.Row():
1713
- output = gr.Textbox(
1714
- label="Generated llms.txt Content",
1715
- lines=20,
1716
- max_lines=30,
1717
- show_copy_button=True,
1718
- container=True,
1719
- scale=2,
1720
- interactive=True
1721
- )
1722
-
1723
- status = gr.Textbox(label="Status")
1724
-
1725
- generate_btn.click(
1726
- fn=lambda url, depth, pages: asyncio.run(process_url(url, depth, pages)),
1727
- inputs=[url_input, depth_input, pages_input],
1728
- outputs=[output, status]
1729
- )
1730
-
1731
- # Launch the app
1732
- if __name__ == "__main__":
1733
- iface.launch()
1734
- , # Remove pipe and everything after
1735
- r'\s*•.*
1736
-
1737
- def is_valid_url(self, url, base_domain):
1738
- """Check if URL is valid and belongs to the same domain"""
1739
- try:
1740
- parsed = urlparse(url)
1741
- base_parsed = urlparse(base_domain)
1742
- return (parsed.netloc == base_parsed.netloc and
1743
- parsed.scheme in ['http', 'https'] and
1744
- not url.endswith(('.pdf', '.jpg', '.png', '.gif', '.zip')))
1745
- except:
1746
- return False
1747
-
1748
- def extract_content(self, soup):
1749
- """Extract meaningful content from HTML"""
1750
- # Remove script and style elements
1751
- for element in soup(['script', 'style', 'nav', 'footer', 'header']):
1752
- element.decompose()
1753
-
1754
- # Get main content
1755
- main_content = soup.find('main') or soup.find('article') or soup.find('div', {'class': re.compile(r'content|main', re.I)})
1756
- if main_content:
1757
- return self.normalize_text(main_content.get_text(strip=True))
1758
- return self.normalize_text(soup.get_text(strip=True))
1759
-
1760
- def get_page_metadata(self, soup, url):
1761
- """Extract metadata from the page"""
1762
- metadata = {
1763
- 'title': None,
1764
- 'description': None,
1765
- 'importance': 0,
1766
- 'category': 'Optional'
1767
- }
1768
-
1769
- # Title extraction with cleaning
1770
- title = (
1771
- soup.find('meta', property='og:title')['content'] if soup.find('meta', property='og:title') else
1772
- soup.find('title').text if soup.find('title') else
1773
- soup.find('h1').text if soup.find('h1') else
1774
- url.split('/')[-1]
1775
- )
1776
- metadata['title'] = self.clean_title(title)
1777
-
1778
- # Description extraction with cleaning
1779
- description = (
1780
- soup.find('meta', {'name': 'description'})['content'] if soup.find('meta', {'name': 'description'}) else
1781
- soup.find('meta', property='og:description')['content'] if soup.find('meta', property='og:description') else
1782
- ""
1783
- )
1784
- metadata['description'] = self.clean_description(description)
1785
-
1786
- # Calculate importance and category
1787
- url_lower = url.lower()
1788
- if 'docs' in url_lower or 'documentation' in url_lower:
1789
- metadata['importance'] = 5
1790
- metadata['category'] = 'Docs'
1791
- elif 'api' in url_lower:
1792
- metadata['importance'] = 4
1793
- metadata['category'] = 'API'
1794
- elif 'guide' in url_lower or 'tutorial' in url_lower:
1795
- metadata['importance'] = 3
1796
- metadata['category'] = 'Guides'
1797
- elif 'example' in url_lower:
1798
- metadata['importance'] = 2
1799
- metadata['category'] = 'Examples'
1800
- elif 'blog' in url_lower:
1801
- metadata['importance'] = 1
1802
- metadata['category'] = 'Blog'
1803
-
1804
- return metadata
1805
-
1806
- async def crawl_page(self, url, depth, base_domain):
1807
- """Crawl a single page and extract information"""
1808
- if depth > self.max_depth or url in self.visited_urls or len(self.visited_urls) >= self.max_pages:
1809
- return []
1810
-
1811
- try:
1812
- response = requests.get(url, headers=self.headers, timeout=self.timeout)
1813
- response.encoding = 'utf-8'
1814
- response.raise_for_status()
1815
- self.visited_urls.add(url)
1816
-
1817
- soup = BeautifulSoup(response.text, 'html.parser')
1818
- content = self.extract_content(soup)
1819
- metadata = self.get_page_metadata(soup, url)
1820
-
1821
- self.url_content[url] = content
1822
- self.url_metadata[url] = metadata
1823
-
1824
- # Find all links
1825
- links = []
1826
- for a in soup.find_all('a', href=True):
1827
- next_url = urljoin(url, a['href'])
1828
- if self.is_valid_url(next_url, base_domain):
1829
- links.append(next_url)
1830
-
1831
- return links
1832
-
1833
- except Exception as e:
1834
- logger.error(f"Error crawling {url}: {str(e)}")
1835
- return []
1836
-
1837
- async def crawl_website(self, start_url):
1838
- """Crawl website starting from the given URL"""
1839
- base_domain = start_url
1840
- queue = [(start_url, 0)]
1841
- seen = {start_url}
1842
-
1843
- while queue and len(self.visited_urls) < self.max_pages:
1844
- current_url, depth = queue.pop(0)
1845
-
1846
- if depth > self.max_depth:
1847
- continue
1848
-
1849
- links = await self.crawl_page(current_url, depth, base_domain)
1850
-
1851
- for link in links:
1852
- if link not in seen:
1853
- seen.add(link)
1854
- queue.append((link, depth + 1))
1855
-
1856
- def generate_llms_txt(self):
1857
- """Generate llms.txt content from crawled data"""
1858
- # Sort URLs by importance
1859
- sorted_urls = sorted(
1860
- self.url_metadata.items(),
1861
- key=lambda x: (x[1]['importance'], x[0]),
1862
- reverse=True
1863
- )
1864
-
1865
- if not sorted_urls:
1866
- return "No content was found to generate llms.txt"
1867
-
1868
- # Group URLs by category
1869
- categorized_urls = defaultdict(list)
1870
- for url, metadata in sorted_urls:
1871
- categorized_urls[metadata['category']].append((url, metadata))
1872
-
1873
- # Generate content
1874
- content = []
1875
-
1876
- # Add main title and description
1877
- main_metadata = sorted_urls[0][1]
1878
- content.append(f"# {main_metadata['title']}")
1879
- if main_metadata['description']:
1880
- content.append(f"\n> {main_metadata['description']}")
1881
-
1882
- # Add categorized sections
1883
- priority_order = ['Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional']
1884
-
1885
- for category in priority_order:
1886
- if category in categorized_urls:
1887
- content.append(f"\n## {category}")
1888
- for url, metadata in categorized_urls[category]:
1889
- if metadata['description']:
1890
- content.append(f"\n- [{metadata['title']}]({url}): {metadata['description']}")
1891
- else:
1892
- content.append(f"\n- [{metadata['title']}]({url})")
1893
-
1894
- return "\n".join(content)
1895
-
1896
- async def process_url(url, max_depth, max_pages):
1897
- """Process URL and generate llms.txt"""
1898
- try:
1899
- # Add https:// if not present
1900
- if not url.startswith(('http://', 'https://')):
1901
- url = 'https://' + url
1902
-
1903
- # Validate URL format
1904
- try:
1905
- result = urlparse(url)
1906
- if not all([result.scheme, result.netloc]):
1907
- return "", "Invalid URL format. Please enter a valid URL."
1908
- except:
1909
- return "", "Invalid URL format. Please enter a valid URL."
1910
-
1911
- # Create crawler and process
1912
- crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
1913
- await crawler.crawl_website(url)
1914
- content = crawler.generate_llms_txt()
1915
-
1916
- return content, f"Successfully crawled {len(crawler.visited_urls)} pages."
1917
-
1918
- except Exception as e:
1919
- return "", f"Error: {str(e)}"
1920
-
1921
- # Create custom theme
1922
- theme = gr.themes.Soft(
1923
- primary_hue="blue",
1924
- font="Open Sans"
1925
- )
1926
-
1927
- # Create the Gradio interface
1928
- with gr.Blocks(theme=theme, css="""
1929
- @import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');
1930
-
1931
- .gradio-container {
1932
- font-family: 'Open Sans', sans-serif !important;
1933
- }
1934
-
1935
- .gr-button {
1936
- font-family: 'Open Sans', sans-serif !important;
1937
- font-weight: 600 !important;
1938
- }
1939
-
1940
- /* Primary color customization */
1941
- .primary-btn {
1942
- background-color: #2436d4 !important;
1943
- color: white !important;
1944
- }
1945
-
1946
- .primary-btn:hover {
1947
- background-color: #1c2aa8 !important;
1948
- }
1949
-
1950
- [data-testid="textbox"] {
1951
- font-family: 'Open Sans', sans-serif !important;
1952
- }
1953
- """) as iface:
1954
- gr.Markdown("# llms.txt Generator")
1955
- gr.Markdown("Generate an llms.txt file from a website following the specification. The tool crawls the website and creates a structured markdown file suitable for LLMs.")
1956
-
1957
- with gr.Row():
1958
- url_input = gr.Textbox(
1959
- label="Website URL",
1960
- placeholder="Enter the website URL (e.g., example.com or https://example.com)",
1961
- info="The URL will be automatically prefixed with https:// if no protocol is specified."
1962
- )
1963
-
1964
- with gr.Row():
1965
- with gr.Column():
1966
- depth_input = gr.Slider(
1967
- minimum=1,
1968
- maximum=5,
1969
- value=3,
1970
- step=1,
1971
- label="Maximum Crawl Depth",
1972
- info="Higher values will result in more thorough but slower crawling"
1973
- )
1974
- with gr.Column():
1975
- pages_input = gr.Slider(
1976
- minimum=10,
1977
- maximum=100,
1978
- value=50,
1979
- step=10,
1980
- label="Maximum Pages to Crawl",
1981
- info="Higher values will result in more comprehensive but slower results"
1982
- )
1983
-
1984
- generate_btn = gr.Button("Generate llms.txt", variant="primary")
1985
-
1986
- with gr.Row():
1987
- output = gr.Textbox(
1988
- label="Generated llms.txt Content",
1989
- lines=20,
1990
- max_lines=30,
1991
- show_copy_button=True,
1992
- container=True,
1993
- scale=2,
1994
- interactive=True
1995
- )
1996
-
1997
- status = gr.Textbox(label="Status")
1998
-
1999
- generate_btn.click(
2000
- fn=lambda url, depth, pages: asyncio.run(process_url(url, depth, pages)),
2001
- inputs=[url_input, depth_input, pages_input],
2002
- outputs=[output, status]
2003
- )
2004
-
2005
- # Launch the app
2006
- if __name__ == "__main__":
2007
- iface.launch()
2008
- , # Remove bullet and everything after
2009
- r'^\s*Welcome to\s+', # Remove "Welcome to" at start
2010
- r'docusaurus_skipToContent_fallback', # Remove docusaurus fragments
2011
- ]
2012
-
2013
- for pattern in patterns:
2014
- title = re.sub(pattern, '', title)
2015
-
2016
- # Clean up whitespace
2017
- title = ' '.join(title.split())
2018
- return title.strip()
2019
-
2020
- def clean_description(self, desc):
2021
- """Clean and format descriptions"""
2022
- if not desc:
2023
- return ""
2024
-
2025
- desc = self.normalize_text(desc)
2026
-
2027
- # Remove duplicate sentences
2028
- sentences = re.split(r'(?<=[.!?])\s+', desc)
2029
- unique_sentences = []
2030
- seen_sentences = set()
2031
-
2032
- for sentence in sentences:
2033
- sentence = sentence.strip()
2034
- sentence_lower = sentence.lower()
2035
- if sentence_lower not in seen_sentences and sentence:
2036
- if not sentence[-1] in '.!?':
2037
- sentence += '.'
2038
- unique_sentences.append(sentence)
2039
- seen_sentences.add(sentence_lower)
2040
-
2041
- cleaned_desc = ' '.join(unique_sentences)
2042
- return cleaned_desc
2043
-
2044
- def is_valid_url(self, url, base_domain):
2045
- """Check if URL is valid and belongs to the same domain"""
2046
- try:
2047
- parsed = urlparse(url)
2048
- base_parsed = urlparse(base_domain)
2049
- return (parsed.netloc == base_parsed.netloc and
2050
- parsed.scheme in ['http', 'https'] and
2051
- not url.endswith(('.pdf', '.jpg', '.png', '.gif', '.zip')))
2052
- except:
2053
- return False
2054
-
2055
- def extract_content(self, soup):
2056
- """Extract meaningful content from HTML"""
2057
- # Remove script and style elements
2058
- for element in soup(['script', 'style', 'nav', 'footer', 'header']):
2059
- element.decompose()
2060
-
2061
- # Get main content
2062
- main_content = soup.find('main') or soup.find('article') or soup.find('div', {'class': re.compile(r'content|main', re.I)})
2063
- if main_content:
2064
- return self.normalize_text(main_content.get_text(strip=True))
2065
- return self.normalize_text(soup.get_text(strip=True))
2066
-
2067
- def get_page_metadata(self, soup, url):
2068
- """Extract metadata from the page"""
2069
- metadata = {
2070
- 'title': None,
2071
- 'description': None,
2072
- 'importance': 0,
2073
- 'category': 'Optional'
2074
- }
2075
-
2076
- # Title extraction with cleaning
2077
- title = (
2078
- soup.find('meta', property='og:title')['content'] if soup.find('meta', property='og:title') else
2079
- soup.find('title').text if soup.find('title') else
2080
- soup.find('h1').text if soup.find('h1') else
2081
- url.split('/')[-1]
2082
- )
2083
- metadata['title'] = self.clean_title(title)
2084
-
2085
- # Description extraction with cleaning
2086
- description = (
2087
- soup.find('meta', {'name': 'description'})['content'] if soup.find('meta', {'name': 'description'}) else
2088
- soup.find('meta', property='og:description')['content'] if soup.find('meta', property='og:description') else
2089
- ""
2090
- )
2091
- metadata['description'] = self.clean_description(description)
2092
-
2093
- # Calculate importance and category
2094
- url_lower = url.lower()
2095
- if 'docs' in url_lower or 'documentation' in url_lower:
2096
- metadata['importance'] = 5
2097
- metadata['category'] = 'Docs'
2098
- elif 'api' in url_lower:
2099
- metadata['importance'] = 4
2100
- metadata['category'] = 'API'
2101
- elif 'guide' in url_lower or 'tutorial' in url_lower:
2102
- metadata['importance'] = 3
2103
- metadata['category'] = 'Guides'
2104
- elif 'example' in url_lower:
2105
- metadata['importance'] = 2
2106
- metadata['category'] = 'Examples'
2107
- elif 'blog' in url_lower:
2108
- metadata['importance'] = 1
2109
- metadata['category'] = 'Blog'
2110
-
2111
- return metadata
2112
-
2113
- async def crawl_page(self, url, depth, base_domain):
2114
- """Crawl a single page and extract information"""
2115
- if depth > self.max_depth or url in self.visited_urls or len(self.visited_urls) >= self.max_pages:
2116
- return []
2117
-
2118
- try:
2119
- response = requests.get(url, headers=self.headers, timeout=self.timeout)
2120
- response.encoding = 'utf-8'
2121
- response.raise_for_status()
2122
- self.visited_urls.add(url)
2123
-
2124
- soup = BeautifulSoup(response.text, 'html.parser')
2125
- content = self.extract_content(soup)
2126
- metadata = self.get_page_metadata(soup, url)
2127
-
2128
- self.url_content[url] = content
2129
- self.url_metadata[url] = metadata
2130
-
2131
- # Find all links
2132
- links = []
2133
- for a in soup.find_all('a', href=True):
2134
- next_url = urljoin(url, a['href'])
2135
- if self.is_valid_url(next_url, base_domain):
2136
- links.append(next_url)
2137
-
2138
- return links
2139
-
2140
- except Exception as e:
2141
- logger.error(f"Error crawling {url}: {str(e)}")
2142
- return []
2143
-
2144
- async def crawl_website(self, start_url):
2145
- """Crawl website starting from the given URL"""
2146
- base_domain = start_url
2147
- queue = [(start_url, 0)]
2148
- seen = {start_url}
2149
-
2150
- while queue and len(self.visited_urls) < self.max_pages:
2151
- current_url, depth = queue.pop(0)
2152
-
2153
- if depth > self.max_depth:
2154
- continue
2155
-
2156
- links = await self.crawl_page(current_url, depth, base_domain)
2157
-
2158
- for link in links:
2159
- if link not in seen:
2160
- seen.add(link)
2161
- queue.append((link, depth + 1))
2162
-
2163
- def generate_llms_txt(self):
2164
- """Generate llms.txt content from crawled data"""
2165
- # Sort URLs by importance
2166
- sorted_urls = sorted(
2167
- self.url_metadata.items(),
2168
- key=lambda x: (x[1]['importance'], x[0]),
2169
- reverse=True
2170
- )
2171
-
2172
- if not sorted_urls:
2173
- return "No content was found to generate llms.txt"
2174
-
2175
- # Group URLs by category
2176
- categorized_urls = defaultdict(list)
2177
- for url, metadata in sorted_urls:
2178
- categorized_urls[metadata['category']].append((url, metadata))
2179
-
2180
- # Generate content
2181
- content = []
2182
-
2183
- # Add main title and description
2184
- main_metadata = sorted_urls[0][1]
2185
- content.append(f"# {main_metadata['title']}")
2186
- if main_metadata['description']:
2187
- content.append(f"\n> {main_metadata['description']}")
2188
-
2189
- # Add categorized sections
2190
- priority_order = ['Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional']
2191
-
2192
- for category in priority_order:
2193
- if category in categorized_urls:
2194
- content.append(f"\n## {category}")
2195
- for url, metadata in categorized_urls[category]:
2196
- if metadata['description']:
2197
- content.append(f"\n- [{metadata['title']}]({url}): {metadata['description']}")
2198
- else:
2199
- content.append(f"\n- [{metadata['title']}]({url})")
2200
-
2201
- return "\n".join(content)
2202
-
2203
- async def process_url(url, max_depth, max_pages):
2204
- """Process URL and generate llms.txt"""
2205
- try:
2206
- # Add https:// if not present
2207
- if not url.startswith(('http://', 'https://')):
2208
- url = 'https://' + url
2209
-
2210
- # Validate URL format
2211
- try:
2212
- result = urlparse(url)
2213
- if not all([result.scheme, result.netloc]):
2214
- return "", "Invalid URL format. Please enter a valid URL."
2215
- except:
2216
- return "", "Invalid URL format. Please enter a valid URL."
2217
-
2218
- # Create crawler and process
2219
- crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
2220
- await crawler.crawl_website(url)
2221
- content = crawler.generate_llms_txt()
2222
-
2223
- return content, f"Successfully crawled {len(crawler.visited_urls)} pages."
2224
-
2225
- except Exception as e:
2226
- return "", f"Error: {str(e)}"
2227
-
2228
- # Create custom theme
2229
- theme = gr.themes.Soft(
2230
- primary_hue="blue",
2231
- font="Open Sans"
2232
- )
2233
-
2234
- # Create the Gradio interface
2235
- with gr.Blocks(theme=theme, css="""
2236
- @import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');
2237
-
2238
- .gradio-container {
2239
- font-family: 'Open Sans', sans-serif !important;
2240
- }
2241
-
2242
- .gr-button {
2243
- font-family: 'Open Sans', sans-serif !important;
2244
- font-weight: 600 !important;
2245
- }
2246
-
2247
- /* Primary color customization */
2248
- .primary-btn {
2249
- background-color: #2436d4 !important;
2250
- color: white !important;
2251
- }
2252
-
2253
- .primary-btn:hover {
2254
- background-color: #1c2aa8 !important;
2255
- }
2256
-
2257
- [data-testid="textbox"] {
2258
- font-family: 'Open Sans', sans-serif !important;
2259
- }
2260
- """) as iface:
2261
- gr.Markdown("# llms.txt Generator")
2262
- gr.Markdown("Generate an llms.txt file from a website following the specification. The tool crawls the website and creates a structured markdown file suitable for LLMs.")
2263
-
2264
- with gr.Row():
2265
- url_input = gr.Textbox(
2266
- label="Website URL",
2267
- placeholder="Enter the website URL (e.g., example.com or https://example.com)",
2268
- info="The URL will be automatically prefixed with https:// if no protocol is specified."
2269
- )
2270
-
2271
- with gr.Row():
2272
- with gr.Column():
2273
- depth_input = gr.Slider(
2274
- minimum=1,
2275
- maximum=5,
2276
- value=3,
2277
- step=1,
2278
- label="Maximum Crawl Depth",
2279
- info="Higher values will result in more thorough but slower crawling"
2280
- )
2281
- with gr.Column():
2282
- pages_input = gr.Slider(
2283
- minimum=10,
2284
- maximum=100,
2285
- value=50,
2286
- step=10,
2287
- label="Maximum Pages to Crawl",
2288
- info="Higher values will result in more comprehensive but slower results"
2289
- )
2290
-
2291
- generate_btn = gr.Button("Generate llms.txt", variant="primary")
2292
-
2293
- with gr.Row():
2294
- output = gr.Textbox(
2295
- label="Generated llms.txt Content",
2296
- lines=20,
2297
- max_lines=30,
2298
- show_copy_button=True,
2299
- container=True,
2300
- scale=2,
2301
- interactive=True
2302
- )
2303
 
2304
  status = gr.Textbox(label="Status")
2305
 
@@ -2309,6 +212,5 @@ with gr.Blocks(theme=theme, css="""
2309
  outputs=[output, status]
2310
  )
2311
 
2312
- # Launch the app
2313
  if __name__ == "__main__":
2314
  iface.launch()
 
3
  from bs4 import BeautifulSoup
4
  import re
5
  from urllib.parse import urljoin, urlparse
 
 
6
  import asyncio
7
  from collections import defaultdict
 
 
8
  import unicodedata
9
+ import logging
10
 
 
11
  logging.basicConfig(level=logging.INFO)
12
  logger = logging.getLogger(__name__)
13
 
14
  class WebsiteCrawler:
15
+ def __init__(self, max_depth=3, max_pages=50):
16
  self.max_depth = max_depth
17
  self.max_pages = max_pages
 
18
  self.visited_urls = set()
 
19
  self.url_metadata = defaultdict(dict)
20
  self.headers = {
21
+ 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
22
  }
23
 
24
+ def clean_text(self, text, is_title=False):
25
+ """Clean and normalize text"""
26
  if not text:
27
  return ""
28
  # Normalize unicode characters
29
  text = unicodedata.normalize('NFKD', text)
30
+ text = re.sub(r'[^\x00-\x7F]+', '', text)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
 
32
+ if is_title:
33
+ # Remove common suffixes and fragments for titles
34
+ text = re.sub(r'\s*[\|\-#:•].*', '', text)
35
+ text = re.sub(r'^\s*Welcome to\s+', '', text)
36
+ text = text.replace('docusaurus_skipToContent_fallback', '')
37
 
38
+ return ' '.join(text.split()).strip()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
39
 
40
  async def crawl_page(self, url, depth, base_domain):
41
  """Crawl a single page and extract information"""
 
43
  return []
44
 
45
  try:
46
+ response = requests.get(url, headers=self.headers, timeout=10)
47
  response.encoding = 'utf-8'
 
48
  self.visited_urls.add(url)
49
 
50
  soup = BeautifulSoup(response.text, 'html.parser')
 
 
51
 
52
+ # Extract metadata
53
+ title = (
54
+ soup.find('meta', property='og:title') or
55
+ soup.find('title') or
56
+ soup.find('h1')
57
+ )
58
+ title = self.clean_text(title.text if title else url.split('/')[-1], is_title=True)
59
+
60
+ desc = soup.find('meta', {'name': 'description'}) or soup.find('meta', property='og:description')
61
+ desc = self.clean_text(desc['content'] if desc else '')
62
+
63
+ # Determine category and importance
64
+ url_lower = url.lower()
65
+ category = 'Optional'
66
+ importance = 0
67
+
68
+ if 'docs' in url_lower or 'documentation' in url_lower:
69
+ category = 'Docs'
70
+ importance = 5
71
+ elif 'api' in url_lower:
72
+ category = 'API'
73
+ importance = 4
74
+
75
+ # Store metadata
76
+ clean_url = re.sub(r'#.*', '', url).rstrip('/')
77
+ self.url_metadata[clean_url] = {
78
+ 'title': title,
79
+ 'description': desc,
80
+ 'category': category,
81
+ 'importance': importance
82
+ }
83
+
84
+ # Find links
85
+ return [
86
+ urljoin(url, a['href'])
87
+ for a in soup.find_all('a', href=True)
88
+ if not any(x in a['href'].lower() for x in ['javascript:', 'mailto:', '.pdf', '.jpg', '.png', '.gif'])
89
+ ]
90
 
91
  except Exception as e:
92
  logger.error(f"Error crawling {url}: {str(e)}")
 
94
 
95
  async def crawl_website(self, start_url):
96
  """Crawl website starting from the given URL"""
97
+ base_domain = urlparse(start_url).netloc
98
  queue = [(start_url, 0)]
99
  seen = {start_url}
100
 
101
  while queue and len(self.visited_urls) < self.max_pages:
102
  current_url, depth = queue.pop(0)
 
103
  if depth > self.max_depth:
104
  continue
105
 
106
  links = await self.crawl_page(current_url, depth, base_domain)
 
107
  for link in links:
108
+ if link not in seen and urlparse(link).netloc == base_domain:
109
  seen.add(link)
110
  queue.append((link, depth + 1))
111
 
112
  def generate_llms_txt(self):
113
+ """Generate llms.txt content"""
114
+ if not self.url_metadata:
115
+ return "No content was found to generate llms.txt"
116
+
117
+ # Sort and filter URLs
118
  sorted_urls = sorted(
119
  self.url_metadata.items(),
120
  key=lambda x: (x[1]['importance'], x[0]),
121
  reverse=True
122
  )
123
 
 
 
 
 
 
 
 
 
124
  # Generate content
125
  content = []
 
 
126
  main_metadata = sorted_urls[0][1]
127
  content.append(f"# {main_metadata['title']}")
128
  if main_metadata['description']:
129
  content.append(f"\n> {main_metadata['description']}")
130
+
131
+ # Group by category
132
+ categories = defaultdict(list)
133
+ seen_titles = set()
134
 
135
+ for url, metadata in sorted_urls:
136
+ title = metadata['title']
137
+ if title not in seen_titles:
138
+ categories[metadata['category']].append((url, metadata))
139
+ seen_titles.add(title)
140
+
141
+ # Add sections
142
+ for category in ['Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional']:
143
+ if category in categories:
144
  content.append(f"\n## {category}")
145
+ for url, metadata in categories[category]:
146
  if metadata['description']:
147
  content.append(f"\n- [{metadata['title']}]({url}): {metadata['description']}")
148
  else:
 
157
  if not url.startswith(('http://', 'https://')):
158
  url = 'https://' + url
159
 
160
+ # Validate URL
161
+ result = urlparse(url)
162
+ if not all([result.scheme, result.netloc]):
 
 
 
163
  return "", "Invalid URL format. Please enter a valid URL."
164
 
165
+ # Process website
166
  crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
167
  await crawler.crawl_website(url)
168
  content = crawler.generate_llms_txt()
 
172
  except Exception as e:
173
  return "", f"Error: {str(e)}"
174
 
175
+ # Create Gradio interface
176
+ theme = gr.themes.Soft(primary_hue="blue", font="Open Sans")
 
 
 
177
 
178
+ with gr.Blocks(theme=theme, css="""
179
+ .primary-btn {background-color: #2436d4 !important;}
180
+ .primary-btn:hover {background-color: #1c2aa8 !important;}
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
181
  """) as iface:
182
  gr.Markdown("# llms.txt Generator")
183
+ gr.Markdown("Generate an llms.txt file from a website following the specification.")
184
 
185
  with gr.Row():
186
  url_input = gr.Textbox(
187
  label="Website URL",
188
+ placeholder="Enter the website URL (e.g., example.com)",
189
+ info="The URL will be automatically prefixed with https:// if not provided"
190
  )
191
 
192
  with gr.Row():
193
  with gr.Column():
194
+ depth_input = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Maximum Crawl Depth")
 
 
 
 
 
 
 
195
  with gr.Column():
196
+ pages_input = gr.Slider(minimum=10, maximum=100, value=50, step=10, label="Maximum Pages")
 
 
 
 
 
 
 
197
 
198
  generate_btn = gr.Button("Generate llms.txt", variant="primary")
199
 
200
+ output = gr.Textbox(
201
+ label="Generated llms.txt Content",
202
+ lines=20,
203
+ show_copy_button=True,
204
+ container=True
 
 
 
 
 
 
 
 
 
 
 
 
205
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
206
 
207
  status = gr.Textbox(label="Status")
208
 
 
212
  outputs=[output, status]
213
  )
214
 
 
215
  if __name__ == "__main__":
216
  iface.launch()