cyberandy committed on
Commit 1a04a7a · verified · 1 Parent(s): aaec994

Update app.py

Files changed (1)
  1. app.py +1987 -14
app.py CHANGED
@@ -43,30 +43,2003 @@ class WebsiteCrawler:
        text = ' '.join(text.split())
        return text

    def clean_title(self, title):
        """Clean and format titles"""
        title = self.normalize_text(title)
-        # Remove common suffixes
-        title = re.sub(r'\s*\|\s*.*$', '', title)  # Remove pipe and everything after
-        title = re.sub(r'\s*-\s*.*$', '', title)  # Remove dash and everything after
-        title = title.strip()
-        return title

    def clean_description(self, desc):
        """Clean and format descriptions"""
        if not desc:
            return ""
        desc = self.normalize_text(desc)
-        # Find the last complete sentence
        sentences = re.split(r'(?<=[.!?])\s+', desc)
-        if sentences:
-            # Take up to two complete sentences
-            cleaned_desc = ' '.join(sentences[:2]).strip()
-            # Ensure it ends with proper punctuation
-            if not cleaned_desc[-1] in '.!?':
-                cleaned_desc += '.'
-            return cleaned_desc
-        return desc

    def is_valid_url(self, url, base_domain):
        """Check if URL is valid and belongs to the same domain"""

+    def clean_url(self, url):
+        """Clean URL by removing fragments and unnecessary parameters"""
+        # Remove fragments (everything after #)
+        url = re.sub(r'#.*$', '', url)
+        # Remove trailing slashes
+        url = url.rstrip('/')
+        return url
+
+    def remove_duplicate_content(self, urls_metadata):
+        """Remove duplicate content based on similar titles and URLs"""
+        seen_content = {}
+        cleaned_metadata = {}
+
+        for url, metadata in urls_metadata.items():
+            clean_url = self.clean_url(url)
+            base_url = clean_url.split('#')[0]  # Remove hash fragments
+
+            # Create a content signature based on title and base URL
+            title = metadata['title'].lower()
+
+            # Skip entries that are just fragments of the same page
+            if base_url in seen_content:
+                # Keep the one with the shortest URL (usually the main page)
+                if len(clean_url) < len(seen_content[base_url]):
+                    cleaned_metadata[clean_url] = metadata
+                    cleaned_metadata.pop(seen_content[base_url], None)
+                    seen_content[base_url] = clean_url
+                continue
+
+            seen_content[base_url] = clean_url
+            cleaned_metadata[clean_url] = metadata
+
+        return cleaned_metadata
+
    def clean_title(self, title):
        """Clean and format titles"""
+        if not title:
+            return ""
+
        title = self.normalize_text(title)
+
+        # Remove common suffixes and prefixes
+        patterns = [
+            r'\s*\|\s*.*$',  # Remove pipe and everything after
+            r'\s*-\s*.*$',  # Remove dash and everything after
+            r'\s*:\s*.*$',  # Remove colon and everything after
+            r'#.*$',  # Remove hash and everything after
+            r'\s*\|.*$',
+        ]
+
366
+
367
+ def is_valid_url(self, url, base_domain):
368
+ """Check if URL is valid and belongs to the same domain"""
369
+ try:
370
+ parsed = urlparse(url)
371
+ base_parsed = urlparse(base_domain)
372
+ return (parsed.netloc == base_parsed.netloc and
373
+ parsed.scheme in ['http', 'https'] and
374
+ not url.endswith(('.pdf', '.jpg', '.png', '.gif', '.zip')))
375
+ except:
376
+ return False
377
+
378
+ def extract_content(self, soup):
379
+ """Extract meaningful content from HTML"""
380
+ # Remove script and style elements
381
+ for element in soup(['script', 'style', 'nav', 'footer', 'header']):
382
+ element.decompose()
383
+
384
+ # Get main content
385
+ main_content = soup.find('main') or soup.find('article') or soup.find('div', {'class': re.compile(r'content|main', re.I)})
386
+ if main_content:
387
+ return self.normalize_text(main_content.get_text(strip=True))
388
+ return self.normalize_text(soup.get_text(strip=True))
389
+
390
+ def get_page_metadata(self, soup, url):
391
+ """Extract metadata from the page"""
392
+ metadata = {
393
+ 'title': None,
394
+ 'description': None,
395
+ 'importance': 0,
396
+ 'category': 'Optional'
397
+ }
398
+
399
+ # Title extraction with cleaning
400
+ title = (
401
+ soup.find('meta', property='og:title')['content'] if soup.find('meta', property='og:title') else
402
+ soup.find('title').text if soup.find('title') else
403
+ soup.find('h1').text if soup.find('h1') else
404
+ url.split('/')[-1]
405
+ )
406
+ metadata['title'] = self.clean_title(title)
407
+
408
+ # Description extraction with cleaning
409
+ description = (
410
+ soup.find('meta', {'name': 'description'})['content'] if soup.find('meta', {'name': 'description'}) else
411
+ soup.find('meta', property='og:description')['content'] if soup.find('meta', property='og:description') else
412
+ ""
413
+ )
414
+ metadata['description'] = self.clean_description(description)
415
+
416
+ # Calculate importance and category
417
+ url_lower = url.lower()
418
+ if 'docs' in url_lower or 'documentation' in url_lower:
419
+ metadata['importance'] = 5
420
+ metadata['category'] = 'Docs'
421
+ elif 'api' in url_lower:
422
+ metadata['importance'] = 4
423
+ metadata['category'] = 'API'
424
+ elif 'guide' in url_lower or 'tutorial' in url_lower:
425
+ metadata['importance'] = 3
426
+ metadata['category'] = 'Guides'
427
+ elif 'example' in url_lower:
428
+ metadata['importance'] = 2
429
+ metadata['category'] = 'Examples'
430
+ elif 'blog' in url_lower:
431
+ metadata['importance'] = 1
432
+ metadata['category'] = 'Blog'
433
+
434
+ return metadata
435
+
436
+ async def crawl_page(self, url, depth, base_domain):
437
+ """Crawl a single page and extract information"""
438
+ if depth > self.max_depth or url in self.visited_urls or len(self.visited_urls) >= self.max_pages:
439
+ return []
440
+
441
+ try:
442
+ response = requests.get(url, headers=self.headers, timeout=self.timeout)
443
+ response.encoding = 'utf-8'
444
+ response.raise_for_status()
445
+ self.visited_urls.add(url)
446
+
447
+ soup = BeautifulSoup(response.text, 'html.parser')
448
+ content = self.extract_content(soup)
449
+ metadata = self.get_page_metadata(soup, url)
450
+
451
+ self.url_content[url] = content
452
+ self.url_metadata[url] = metadata
453
+
454
+ # Find all links
455
+ links = []
456
+ for a in soup.find_all('a', href=True):
457
+ next_url = urljoin(url, a['href'])
458
+ if self.is_valid_url(next_url, base_domain):
459
+ links.append(next_url)
460
+
461
+ return links
462
+
463
+ except Exception as e:
464
+ logger.error(f"Error crawling {url}: {str(e)}")
465
+ return []
466
+
467
+ async def crawl_website(self, start_url):
468
+ """Crawl website starting from the given URL"""
469
+ base_domain = start_url
470
+ queue = [(start_url, 0)]
471
+ seen = {start_url}
472
+
473
+ while queue and len(self.visited_urls) < self.max_pages:
474
+ current_url, depth = queue.pop(0)
475
+
476
+ if depth > self.max_depth:
477
+ continue
478
+
479
+ links = await self.crawl_page(current_url, depth, base_domain)
480
+
481
+ for link in links:
482
+ if link not in seen:
483
+ seen.add(link)
484
+ queue.append((link, depth + 1))
485
+
486
+ def generate_llms_txt(self):
487
+ """Generate llms.txt content from crawled data"""
488
+ # Sort URLs by importance
489
+ sorted_urls = sorted(
490
+ self.url_metadata.items(),
491
+ key=lambda x: (x[1]['importance'], x[0]),
492
+ reverse=True
493
+ )
494
+
495
+ if not sorted_urls:
496
+ return "No content was found to generate llms.txt"
497
+
498
+ # Group URLs by category
499
+ categorized_urls = defaultdict(list)
500
+ for url, metadata in sorted_urls:
501
+ categorized_urls[metadata['category']].append((url, metadata))
502
+
503
+ # Generate content
504
+ content = []
505
+
506
+ # Add main title and description
507
+ main_metadata = sorted_urls[0][1]
508
+ content.append(f"# {main_metadata['title']}")
509
+ if main_metadata['description']:
510
+ content.append(f"\n> {main_metadata['description']}")
511
+
512
+ # Add categorized sections
513
+ priority_order = ['Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional']
514
+
515
+ for category in priority_order:
516
+ if category in categorized_urls:
517
+ content.append(f"\n## {category}")
518
+ for url, metadata in categorized_urls[category]:
519
+ if metadata['description']:
520
+ content.append(f"\n- [{metadata['title']}]({url}): {metadata['description']}")
521
+ else:
522
+ content.append(f"\n- [{metadata['title']}]({url})")
523
+
524
+ return "\n".join(content)
525
+
526
+ async def process_url(url, max_depth, max_pages):
527
+ """Process URL and generate llms.txt"""
528
+ try:
529
+ # Add https:// if not present
530
+ if not url.startswith(('http://', 'https://')):
531
+ url = 'https://' + url
532
+
533
+ # Validate URL format
534
+ try:
535
+ result = urlparse(url)
536
+ if not all([result.scheme, result.netloc]):
537
+ return "", "Invalid URL format. Please enter a valid URL."
538
+ except:
539
+ return "", "Invalid URL format. Please enter a valid URL."
540
+
541
+ # Create crawler and process
542
+ crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
543
+ await crawler.crawl_website(url)
544
+ content = crawler.generate_llms_txt()
545
+
546
+ return content, f"Successfully crawled {len(crawler.visited_urls)} pages."
547
+
548
+ except Exception as e:
549
+ return "", f"Error: {str(e)}"
550
+
551
+ # Create custom theme
552
+ theme = gr.themes.Soft(
553
+ primary_hue="blue",
554
+ font="Open Sans"
555
+ )
556
+
557
+ # Create the Gradio interface
558
+ with gr.Blocks(theme=theme, css="""
559
+ @import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');
560
+
561
+ .gradio-container {
562
+ font-family: 'Open Sans', sans-serif !important;
563
+ }
564
+
565
+ .gr-button {
566
+ font-family: 'Open Sans', sans-serif !important;
567
+ font-weight: 600 !important;
568
+ }
569
+
570
+ /* Primary color customization */
571
+ .primary-btn {
572
+ background-color: #2436d4 !important;
573
+ color: white !important;
574
+ }
575
+
576
+ .primary-btn:hover {
577
+ background-color: #1c2aa8 !important;
578
+ }
579
+
580
+ [data-testid="textbox"] {
581
+ font-family: 'Open Sans', sans-serif !important;
582
+ }
583
+ """) as iface:
584
+ gr.Markdown("# llms.txt Generator")
585
+ gr.Markdown("Generate an llms.txt file from a website following the specification. The tool crawls the website and creates a structured markdown file suitable for LLMs.")
586
+
587
+ with gr.Row():
588
+ url_input = gr.Textbox(
589
+ label="Website URL",
590
+ placeholder="Enter the website URL (e.g., example.com or https://example.com)",
591
+ info="The URL will be automatically prefixed with https:// if no protocol is specified."
592
+ )
593
+
594
+ with gr.Row():
595
+ with gr.Column():
596
+ depth_input = gr.Slider(
597
+ minimum=1,
598
+ maximum=5,
599
+ value=3,
600
+ step=1,
601
+ label="Maximum Crawl Depth",
602
+ info="Higher values will result in more thorough but slower crawling"
603
+ )
604
+ with gr.Column():
605
+ pages_input = gr.Slider(
606
+ minimum=10,
607
+ maximum=100,
608
+ value=50,
609
+ step=10,
610
+ label="Maximum Pages to Crawl",
611
+ info="Higher values will result in more comprehensive but slower results"
612
+ )
613
+
614
+ generate_btn = gr.Button("Generate llms.txt", variant="primary")
615
+
616
+ with gr.Row():
617
+ output = gr.Textbox(
618
+ label="Generated llms.txt Content",
619
+ lines=20,
620
+ max_lines=30,
621
+ show_copy_button=True,
622
+ container=True,
623
+ scale=2,
624
+ interactive=True
625
+ )
626
+
627
+ status = gr.Textbox(label="Status")
628
+
629
+ generate_btn.click(
630
+ fn=lambda url, depth, pages: asyncio.run(process_url(url, depth, pages)),
631
+ inputs=[url_input, depth_input, pages_input],
632
+ outputs=[output, status]
633
+ )
634
+
635
+ # Launch the app
636
+ if __name__ == "__main__":
637
+ iface.launch()
638
+ , # Remove pipe and everything after
639
+ r'\s*-\s*.*
640
+
641
+ def is_valid_url(self, url, base_domain):
642
+ """Check if URL is valid and belongs to the same domain"""
643
+ try:
644
+ parsed = urlparse(url)
645
+ base_parsed = urlparse(base_domain)
646
+ return (parsed.netloc == base_parsed.netloc and
647
+ parsed.scheme in ['http', 'https'] and
648
+ not url.endswith(('.pdf', '.jpg', '.png', '.gif', '.zip')))
649
+ except:
650
+ return False
651
+
652
+ def extract_content(self, soup):
653
+ """Extract meaningful content from HTML"""
654
+ # Remove script and style elements
655
+ for element in soup(['script', 'style', 'nav', 'footer', 'header']):
656
+ element.decompose()
657
+
658
+ # Get main content
659
+ main_content = soup.find('main') or soup.find('article') or soup.find('div', {'class': re.compile(r'content|main', re.I)})
660
+ if main_content:
661
+ return self.normalize_text(main_content.get_text(strip=True))
662
+ return self.normalize_text(soup.get_text(strip=True))
663
+
664
+ def get_page_metadata(self, soup, url):
665
+ """Extract metadata from the page"""
666
+ metadata = {
667
+ 'title': None,
668
+ 'description': None,
669
+ 'importance': 0,
670
+ 'category': 'Optional'
671
+ }
672
+
673
+ # Title extraction with cleaning
674
+ title = (
675
+ soup.find('meta', property='og:title')['content'] if soup.find('meta', property='og:title') else
676
+ soup.find('title').text if soup.find('title') else
677
+ soup.find('h1').text if soup.find('h1') else
678
+ url.split('/')[-1]
679
+ )
680
+ metadata['title'] = self.clean_title(title)
681
+
682
+ # Description extraction with cleaning
683
+ description = (
684
+ soup.find('meta', {'name': 'description'})['content'] if soup.find('meta', {'name': 'description'}) else
685
+ soup.find('meta', property='og:description')['content'] if soup.find('meta', property='og:description') else
686
+ ""
687
+ )
688
+ metadata['description'] = self.clean_description(description)
689
+
690
+ # Calculate importance and category
691
+ url_lower = url.lower()
692
+ if 'docs' in url_lower or 'documentation' in url_lower:
693
+ metadata['importance'] = 5
694
+ metadata['category'] = 'Docs'
695
+ elif 'api' in url_lower:
696
+ metadata['importance'] = 4
697
+ metadata['category'] = 'API'
698
+ elif 'guide' in url_lower or 'tutorial' in url_lower:
699
+ metadata['importance'] = 3
700
+ metadata['category'] = 'Guides'
701
+ elif 'example' in url_lower:
702
+ metadata['importance'] = 2
703
+ metadata['category'] = 'Examples'
704
+ elif 'blog' in url_lower:
705
+ metadata['importance'] = 1
706
+ metadata['category'] = 'Blog'
707
+
708
+ return metadata
709
+
710
+ async def crawl_page(self, url, depth, base_domain):
711
+ """Crawl a single page and extract information"""
712
+ if depth > self.max_depth or url in self.visited_urls or len(self.visited_urls) >= self.max_pages:
713
+ return []
714
+
715
+ try:
716
+ response = requests.get(url, headers=self.headers, timeout=self.timeout)
717
+ response.encoding = 'utf-8'
718
+ response.raise_for_status()
719
+ self.visited_urls.add(url)
720
+
721
+ soup = BeautifulSoup(response.text, 'html.parser')
722
+ content = self.extract_content(soup)
723
+ metadata = self.get_page_metadata(soup, url)
724
+
725
+ self.url_content[url] = content
726
+ self.url_metadata[url] = metadata
727
+
728
+ # Find all links
729
+ links = []
730
+ for a in soup.find_all('a', href=True):
731
+ next_url = urljoin(url, a['href'])
732
+ if self.is_valid_url(next_url, base_domain):
733
+ links.append(next_url)
734
+
735
+ return links
736
+
737
+ except Exception as e:
738
+ logger.error(f"Error crawling {url}: {str(e)}")
739
+ return []
740
+
741
+ async def crawl_website(self, start_url):
742
+ """Crawl website starting from the given URL"""
743
+ base_domain = start_url
744
+ queue = [(start_url, 0)]
745
+ seen = {start_url}
746
+
747
+ while queue and len(self.visited_urls) < self.max_pages:
748
+ current_url, depth = queue.pop(0)
749
+
750
+ if depth > self.max_depth:
751
+ continue
752
+
753
+ links = await self.crawl_page(current_url, depth, base_domain)
754
+
755
+ for link in links:
756
+ if link not in seen:
757
+ seen.add(link)
758
+ queue.append((link, depth + 1))
759
+
760
+ def generate_llms_txt(self):
761
+ """Generate llms.txt content from crawled data"""
762
+ # Sort URLs by importance
763
+ sorted_urls = sorted(
764
+ self.url_metadata.items(),
765
+ key=lambda x: (x[1]['importance'], x[0]),
766
+ reverse=True
767
+ )
768
+
769
+ if not sorted_urls:
770
+ return "No content was found to generate llms.txt"
771
+
772
+ # Group URLs by category
773
+ categorized_urls = defaultdict(list)
774
+ for url, metadata in sorted_urls:
775
+ categorized_urls[metadata['category']].append((url, metadata))
776
+
777
+ # Generate content
778
+ content = []
779
+
780
+ # Add main title and description
781
+ main_metadata = sorted_urls[0][1]
782
+ content.append(f"# {main_metadata['title']}")
783
+ if main_metadata['description']:
784
+ content.append(f"\n> {main_metadata['description']}")
785
+
786
+ # Add categorized sections
787
+ priority_order = ['Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional']
788
+
789
+ for category in priority_order:
790
+ if category in categorized_urls:
791
+ content.append(f"\n## {category}")
792
+ for url, metadata in categorized_urls[category]:
793
+ if metadata['description']:
794
+ content.append(f"\n- [{metadata['title']}]({url}): {metadata['description']}")
795
+ else:
796
+ content.append(f"\n- [{metadata['title']}]({url})")
797
+
798
+ return "\n".join(content)
799
+
800
+ async def process_url(url, max_depth, max_pages):
801
+ """Process URL and generate llms.txt"""
802
+ try:
803
+ # Add https:// if not present
804
+ if not url.startswith(('http://', 'https://')):
805
+ url = 'https://' + url
806
+
807
+ # Validate URL format
808
+ try:
809
+ result = urlparse(url)
810
+ if not all([result.scheme, result.netloc]):
811
+ return "", "Invalid URL format. Please enter a valid URL."
812
+ except:
813
+ return "", "Invalid URL format. Please enter a valid URL."
814
+
815
+ # Create crawler and process
816
+ crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
817
+ await crawler.crawl_website(url)
818
+ content = crawler.generate_llms_txt()
819
+
820
+ return content, f"Successfully crawled {len(crawler.visited_urls)} pages."
821
+
822
+ except Exception as e:
823
+ return "", f"Error: {str(e)}"
824
+
825
+ # Create custom theme
826
+ theme = gr.themes.Soft(
827
+ primary_hue="blue",
828
+ font="Open Sans"
829
+ )
830
+
831
+ # Create the Gradio interface
832
+ with gr.Blocks(theme=theme, css="""
833
+ @import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');
834
+
835
+ .gradio-container {
836
+ font-family: 'Open Sans', sans-serif !important;
837
+ }
838
+
839
+ .gr-button {
840
+ font-family: 'Open Sans', sans-serif !important;
841
+ font-weight: 600 !important;
842
+ }
843
+
844
+ /* Primary color customization */
845
+ .primary-btn {
846
+ background-color: #2436d4 !important;
847
+ color: white !important;
848
+ }
849
+
850
+ .primary-btn:hover {
851
+ background-color: #1c2aa8 !important;
852
+ }
853
+
854
+ [data-testid="textbox"] {
855
+ font-family: 'Open Sans', sans-serif !important;
856
+ }
857
+ """) as iface:
858
+ gr.Markdown("# llms.txt Generator")
859
+ gr.Markdown("Generate an llms.txt file from a website following the specification. The tool crawls the website and creates a structured markdown file suitable for LLMs.")
860
+
861
+ with gr.Row():
862
+ url_input = gr.Textbox(
863
+ label="Website URL",
864
+ placeholder="Enter the website URL (e.g., example.com or https://example.com)",
865
+ info="The URL will be automatically prefixed with https:// if no protocol is specified."
866
+ )
867
+
868
+ with gr.Row():
869
+ with gr.Column():
870
+ depth_input = gr.Slider(
871
+ minimum=1,
872
+ maximum=5,
873
+ value=3,
874
+ step=1,
875
+ label="Maximum Crawl Depth",
876
+ info="Higher values will result in more thorough but slower crawling"
877
+ )
878
+ with gr.Column():
879
+ pages_input = gr.Slider(
880
+ minimum=10,
881
+ maximum=100,
882
+ value=50,
883
+ step=10,
884
+ label="Maximum Pages to Crawl",
885
+ info="Higher values will result in more comprehensive but slower results"
886
+ )
887
+
888
+ generate_btn = gr.Button("Generate llms.txt", variant="primary")
889
+
890
+ with gr.Row():
891
+ output = gr.Textbox(
892
+ label="Generated llms.txt Content",
893
+ lines=20,
894
+ max_lines=30,
895
+ show_copy_button=True,
896
+ container=True,
897
+ scale=2,
898
+ interactive=True
899
+ )
900
+
901
+ status = gr.Textbox(label="Status")
902
+
903
+ generate_btn.click(
904
+ fn=lambda url, depth, pages: asyncio.run(process_url(url, depth, pages)),
905
+ inputs=[url_input, depth_input, pages_input],
906
+ outputs=[output, status]
907
+ )
908
+
909
+ # Launch the app
910
+ if __name__ == "__main__":
911
+ iface.launch()
912
+ , # Remove dash and everything after
913
+ r'\s*:\s*.*
914
+
915
+ def is_valid_url(self, url, base_domain):
916
+ """Check if URL is valid and belongs to the same domain"""
917
+ try:
918
+ parsed = urlparse(url)
919
+ base_parsed = urlparse(base_domain)
920
+ return (parsed.netloc == base_parsed.netloc and
921
+ parsed.scheme in ['http', 'https'] and
922
+ not url.endswith(('.pdf', '.jpg', '.png', '.gif', '.zip')))
923
+ except:
924
+ return False
925
+
926
+ def extract_content(self, soup):
927
+ """Extract meaningful content from HTML"""
928
+ # Remove script and style elements
929
+ for element in soup(['script', 'style', 'nav', 'footer', 'header']):
930
+ element.decompose()
931
+
932
+ # Get main content
933
+ main_content = soup.find('main') or soup.find('article') or soup.find('div', {'class': re.compile(r'content|main', re.I)})
934
+ if main_content:
935
+ return self.normalize_text(main_content.get_text(strip=True))
936
+ return self.normalize_text(soup.get_text(strip=True))
937
+
938
+ def get_page_metadata(self, soup, url):
939
+ """Extract metadata from the page"""
940
+ metadata = {
941
+ 'title': None,
942
+ 'description': None,
943
+ 'importance': 0,
944
+ 'category': 'Optional'
945
+ }
946
+
947
+ # Title extraction with cleaning
948
+ title = (
949
+ soup.find('meta', property='og:title')['content'] if soup.find('meta', property='og:title') else
950
+ soup.find('title').text if soup.find('title') else
951
+ soup.find('h1').text if soup.find('h1') else
952
+ url.split('/')[-1]
953
+ )
954
+ metadata['title'] = self.clean_title(title)
955
+
956
+ # Description extraction with cleaning
957
+ description = (
958
+ soup.find('meta', {'name': 'description'})['content'] if soup.find('meta', {'name': 'description'}) else
959
+ soup.find('meta', property='og:description')['content'] if soup.find('meta', property='og:description') else
960
+ ""
961
+ )
962
+ metadata['description'] = self.clean_description(description)
963
+
964
+ # Calculate importance and category
965
+ url_lower = url.lower()
966
+ if 'docs' in url_lower or 'documentation' in url_lower:
967
+ metadata['importance'] = 5
968
+ metadata['category'] = 'Docs'
969
+ elif 'api' in url_lower:
970
+ metadata['importance'] = 4
971
+ metadata['category'] = 'API'
972
+ elif 'guide' in url_lower or 'tutorial' in url_lower:
973
+ metadata['importance'] = 3
974
+ metadata['category'] = 'Guides'
975
+ elif 'example' in url_lower:
976
+ metadata['importance'] = 2
977
+ metadata['category'] = 'Examples'
978
+ elif 'blog' in url_lower:
979
+ metadata['importance'] = 1
980
+ metadata['category'] = 'Blog'
981
+
982
+ return metadata
983
+
984
+ async def crawl_page(self, url, depth, base_domain):
985
+ """Crawl a single page and extract information"""
986
+ if depth > self.max_depth or url in self.visited_urls or len(self.visited_urls) >= self.max_pages:
987
+ return []
988
+
989
+ try:
990
+ response = requests.get(url, headers=self.headers, timeout=self.timeout)
991
+ response.encoding = 'utf-8'
992
+ response.raise_for_status()
993
+ self.visited_urls.add(url)
994
+
995
+ soup = BeautifulSoup(response.text, 'html.parser')
996
+ content = self.extract_content(soup)
997
+ metadata = self.get_page_metadata(soup, url)
998
+
999
+ self.url_content[url] = content
1000
+ self.url_metadata[url] = metadata
1001
+
1002
+ # Find all links
1003
+ links = []
1004
+ for a in soup.find_all('a', href=True):
1005
+ next_url = urljoin(url, a['href'])
1006
+ if self.is_valid_url(next_url, base_domain):
1007
+ links.append(next_url)
1008
+
1009
+ return links
1010
+
1011
+ except Exception as e:
1012
+ logger.error(f"Error crawling {url}: {str(e)}")
1013
+ return []
1014
+
1015
+ async def crawl_website(self, start_url):
1016
+ """Crawl website starting from the given URL"""
1017
+ base_domain = start_url
1018
+ queue = [(start_url, 0)]
1019
+ seen = {start_url}
1020
+
1021
+ while queue and len(self.visited_urls) < self.max_pages:
1022
+ current_url, depth = queue.pop(0)
1023
+
1024
+ if depth > self.max_depth:
1025
+ continue
1026
+
1027
+ links = await self.crawl_page(current_url, depth, base_domain)
1028
+
1029
+ for link in links:
1030
+ if link not in seen:
1031
+ seen.add(link)
1032
+ queue.append((link, depth + 1))
1033
+
1034
+ def generate_llms_txt(self):
1035
+ """Generate llms.txt content from crawled data"""
1036
+ # Sort URLs by importance
1037
+ sorted_urls = sorted(
1038
+ self.url_metadata.items(),
1039
+ key=lambda x: (x[1]['importance'], x[0]),
1040
+ reverse=True
1041
+ )
1042
+
1043
+ if not sorted_urls:
1044
+ return "No content was found to generate llms.txt"
1045
+
1046
+ # Group URLs by category
1047
+ categorized_urls = defaultdict(list)
1048
+ for url, metadata in sorted_urls:
1049
+ categorized_urls[metadata['category']].append((url, metadata))
1050
+
1051
+ # Generate content
1052
+ content = []
1053
+
1054
+ # Add main title and description
1055
+ main_metadata = sorted_urls[0][1]
1056
+ content.append(f"# {main_metadata['title']}")
1057
+ if main_metadata['description']:
1058
+ content.append(f"\n> {main_metadata['description']}")
1059
+
1060
+ # Add categorized sections
1061
+ priority_order = ['Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional']
1062
+
1063
+ for category in priority_order:
1064
+ if category in categorized_urls:
1065
+ content.append(f"\n## {category}")
1066
+ for url, metadata in categorized_urls[category]:
1067
+ if metadata['description']:
1068
+ content.append(f"\n- [{metadata['title']}]({url}): {metadata['description']}")
1069
+ else:
1070
+ content.append(f"\n- [{metadata['title']}]({url})")
1071
+
1072
+ return "\n".join(content)
1073
+
1074
+ async def process_url(url, max_depth, max_pages):
1075
+ """Process URL and generate llms.txt"""
1076
+ try:
1077
+ # Add https:// if not present
1078
+ if not url.startswith(('http://', 'https://')):
1079
+ url = 'https://' + url
1080
+
1081
+ # Validate URL format
1082
+ try:
1083
+ result = urlparse(url)
1084
+ if not all([result.scheme, result.netloc]):
1085
+ return "", "Invalid URL format. Please enter a valid URL."
1086
+ except:
1087
+ return "", "Invalid URL format. Please enter a valid URL."
1088
+
1089
+ # Create crawler and process
1090
+ crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
1091
+ await crawler.crawl_website(url)
1092
+ content = crawler.generate_llms_txt()
1093
+
1094
+ return content, f"Successfully crawled {len(crawler.visited_urls)} pages."
1095
+
1096
+ except Exception as e:
1097
+ return "", f"Error: {str(e)}"
1098
+
1099
+ # Create custom theme
1100
+ theme = gr.themes.Soft(
1101
+ primary_hue="blue",
1102
+ font="Open Sans"
1103
+ )
1104
+
1105
+ # Create the Gradio interface
1106
+ with gr.Blocks(theme=theme, css="""
1107
+ @import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');
1108
+
1109
+ .gradio-container {
1110
+ font-family: 'Open Sans', sans-serif !important;
1111
+ }
1112
+
1113
+ .gr-button {
1114
+ font-family: 'Open Sans', sans-serif !important;
1115
+ font-weight: 600 !important;
1116
+ }
1117
+
1118
+ /* Primary color customization */
1119
+ .primary-btn {
1120
+ background-color: #2436d4 !important;
1121
+ color: white !important;
1122
+ }
1123
+
1124
+ .primary-btn:hover {
1125
+ background-color: #1c2aa8 !important;
1126
+ }
1127
+
1128
+ [data-testid="textbox"] {
1129
+ font-family: 'Open Sans', sans-serif !important;
1130
+ }
1131
+ """) as iface:
1132
+ gr.Markdown("# llms.txt Generator")
1133
+ gr.Markdown("Generate an llms.txt file from a website following the specification. The tool crawls the website and creates a structured markdown file suitable for LLMs.")
1134
+
1135
+ with gr.Row():
1136
+ url_input = gr.Textbox(
1137
+ label="Website URL",
1138
+ placeholder="Enter the website URL (e.g., example.com or https://example.com)",
1139
+ info="The URL will be automatically prefixed with https:// if no protocol is specified."
1140
+ )
1141
+
1142
+ with gr.Row():
1143
+ with gr.Column():
1144
+ depth_input = gr.Slider(
1145
+ minimum=1,
1146
+ maximum=5,
1147
+ value=3,
1148
+ step=1,
1149
+ label="Maximum Crawl Depth",
1150
+ info="Higher values will result in more thorough but slower crawling"
1151
+ )
1152
+ with gr.Column():
1153
+ pages_input = gr.Slider(
1154
+ minimum=10,
1155
+ maximum=100,
1156
+ value=50,
1157
+ step=10,
1158
+ label="Maximum Pages to Crawl",
1159
+ info="Higher values will result in more comprehensive but slower results"
1160
+ )
1161
+
1162
+ generate_btn = gr.Button("Generate llms.txt", variant="primary")
1163
+
1164
+ with gr.Row():
1165
+ output = gr.Textbox(
1166
+ label="Generated llms.txt Content",
1167
+ lines=20,
1168
+ max_lines=30,
1169
+ show_copy_button=True,
1170
+ container=True,
1171
+ scale=2,
1172
+ interactive=True
1173
+ )
1174
+
1175
+ status = gr.Textbox(label="Status")
1176
+
1177
+ generate_btn.click(
1178
+ fn=lambda url, depth, pages: asyncio.run(process_url(url, depth, pages)),
1179
+ inputs=[url_input, depth_input, pages_input],
1180
+ outputs=[output, status]
1181
+ )
1182
+
1183
+ # Launch the app
1184
+ if __name__ == "__main__":
1185
+ iface.launch()
1186
+ , # Remove colon and everything after
1187
+ r'#.*
1188
+
1189
+ def is_valid_url(self, url, base_domain):
1190
+ """Check if URL is valid and belongs to the same domain"""
1191
+ try:
1192
+ parsed = urlparse(url)
1193
+ base_parsed = urlparse(base_domain)
1194
+ return (parsed.netloc == base_parsed.netloc and
1195
+ parsed.scheme in ['http', 'https'] and
1196
+ not url.endswith(('.pdf', '.jpg', '.png', '.gif', '.zip')))
1197
+ except:
1198
+ return False
1199
+
1200
+ def extract_content(self, soup):
1201
+ """Extract meaningful content from HTML"""
1202
+ # Remove script and style elements
1203
+ for element in soup(['script', 'style', 'nav', 'footer', 'header']):
1204
+ element.decompose()
1205
+
1206
+ # Get main content
1207
+ main_content = soup.find('main') or soup.find('article') or soup.find('div', {'class': re.compile(r'content|main', re.I)})
1208
+ if main_content:
1209
+ return self.normalize_text(main_content.get_text(strip=True))
1210
+ return self.normalize_text(soup.get_text(strip=True))
1211
+
1212
+ def get_page_metadata(self, soup, url):
1213
+ """Extract metadata from the page"""
1214
+ metadata = {
1215
+ 'title': None,
1216
+ 'description': None,
1217
+ 'importance': 0,
1218
+ 'category': 'Optional'
1219
+ }
1220
+
1221
+ # Title extraction with cleaning
1222
+ title = (
1223
+ soup.find('meta', property='og:title')['content'] if soup.find('meta', property='og:title') else
1224
+ soup.find('title').text if soup.find('title') else
1225
+ soup.find('h1').text if soup.find('h1') else
1226
+ url.split('/')[-1]
1227
+ )
1228
+ metadata['title'] = self.clean_title(title)
1229
+
1230
+ # Description extraction with cleaning
1231
+ description = (
1232
+ soup.find('meta', {'name': 'description'})['content'] if soup.find('meta', {'name': 'description'}) else
1233
+ soup.find('meta', property='og:description')['content'] if soup.find('meta', property='og:description') else
1234
+ ""
1235
+ )
1236
+ metadata['description'] = self.clean_description(description)
1237
+
1238
+ # Calculate importance and category
1239
+ url_lower = url.lower()
1240
+ if 'docs' in url_lower or 'documentation' in url_lower:
1241
+ metadata['importance'] = 5
1242
+ metadata['category'] = 'Docs'
1243
+ elif 'api' in url_lower:
1244
+ metadata['importance'] = 4
1245
+ metadata['category'] = 'API'
1246
+ elif 'guide' in url_lower or 'tutorial' in url_lower:
1247
+ metadata['importance'] = 3
1248
+ metadata['category'] = 'Guides'
1249
+ elif 'example' in url_lower:
1250
+ metadata['importance'] = 2
1251
+ metadata['category'] = 'Examples'
1252
+ elif 'blog' in url_lower:
1253
+ metadata['importance'] = 1
1254
+ metadata['category'] = 'Blog'
1255
+
1256
+ return metadata
1257
+
1258
+ async def crawl_page(self, url, depth, base_domain):
1259
+ """Crawl a single page and extract information"""
1260
+ if depth > self.max_depth or url in self.visited_urls or len(self.visited_urls) >= self.max_pages:
1261
+ return []
1262
+
1263
+ try:
1264
+ response = requests.get(url, headers=self.headers, timeout=self.timeout)
1265
+ response.encoding = 'utf-8'
1266
+ response.raise_for_status()
1267
+ self.visited_urls.add(url)
1268
+
1269
+ soup = BeautifulSoup(response.text, 'html.parser')
1270
+ content = self.extract_content(soup)
1271
+ metadata = self.get_page_metadata(soup, url)
1272
+
1273
+ self.url_content[url] = content
1274
+ self.url_metadata[url] = metadata
1275
+
1276
+ # Find all links
1277
+ links = []
1278
+ for a in soup.find_all('a', href=True):
1279
+ next_url = urljoin(url, a['href'])
1280
+ if self.is_valid_url(next_url, base_domain):
1281
+ links.append(next_url)
1282
+
1283
+ return links
1284
+
1285
+ except Exception as e:
1286
+ logger.error(f"Error crawling {url}: {str(e)}")
1287
+ return []
1288
+
1289
+ async def crawl_website(self, start_url):
1290
+ """Crawl website starting from the given URL"""
1291
+ base_domain = start_url
1292
+ queue = [(start_url, 0)]
1293
+ seen = {start_url}
1294
+
1295
+ while queue and len(self.visited_urls) < self.max_pages:
1296
+ current_url, depth = queue.pop(0)
1297
+
1298
+ if depth > self.max_depth:
1299
+ continue
1300
+
1301
+ links = await self.crawl_page(current_url, depth, base_domain)
1302
+
1303
+ for link in links:
1304
+ if link not in seen:
1305
+ seen.add(link)
1306
+ queue.append((link, depth + 1))
1307
+
1308
+ def generate_llms_txt(self):
1309
+ """Generate llms.txt content from crawled data"""
1310
+ # Sort URLs by importance
1311
+ sorted_urls = sorted(
1312
+ self.url_metadata.items(),
1313
+ key=lambda x: (x[1]['importance'], x[0]),
1314
+ reverse=True
1315
+ )
1316
+
1317
+ if not sorted_urls:
1318
+ return "No content was found to generate llms.txt"
1319
+
1320
+ # Group URLs by category
1321
+ categorized_urls = defaultdict(list)
1322
+ for url, metadata in sorted_urls:
1323
+ categorized_urls[metadata['category']].append((url, metadata))
1324
+
1325
+ # Generate content
1326
+ content = []
1327
+
1328
+ # Add main title and description
1329
+ main_metadata = sorted_urls[0][1]
1330
+ content.append(f"# {main_metadata['title']}")
1331
+ if main_metadata['description']:
1332
+ content.append(f"\n> {main_metadata['description']}")
1333
+
1334
+ # Add categorized sections
1335
+ priority_order = ['Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional']
1336
+
1337
+ for category in priority_order:
1338
+ if category in categorized_urls:
1339
+ content.append(f"\n## {category}")
1340
+ for url, metadata in categorized_urls[category]:
1341
+ if metadata['description']:
1342
+ content.append(f"\n- [{metadata['title']}]({url}): {metadata['description']}")
1343
+ else:
1344
+ content.append(f"\n- [{metadata['title']}]({url})")
1345
+
1346
+ return "\n".join(content)
1347
+
1348
+ async def process_url(url, max_depth, max_pages):
1349
+ """Process URL and generate llms.txt"""
1350
+ try:
1351
+ # Add https:// if not present
1352
+ if not url.startswith(('http://', 'https://')):
1353
+ url = 'https://' + url
1354
+
1355
+ # Validate URL format
1356
+ try:
1357
+ result = urlparse(url)
1358
+ if not all([result.scheme, result.netloc]):
1359
+ return "", "Invalid URL format. Please enter a valid URL."
1360
+ except:
1361
+ return "", "Invalid URL format. Please enter a valid URL."
1362
+
1363
+ # Create crawler and process
1364
+ crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
1365
+ await crawler.crawl_website(url)
1366
+ content = crawler.generate_llms_txt()
1367
+
1368
+ return content, f"Successfully crawled {len(crawler.visited_urls)} pages."
1369
+
1370
+ except Exception as e:
1371
+ return "", f"Error: {str(e)}"
1372
+
1373
+ # Create custom theme
1374
+ theme = gr.themes.Soft(
1375
+ primary_hue="blue",
1376
+ font="Open Sans"
1377
+ )
1378
+
1379
+ # Create the Gradio interface
1380
+ with gr.Blocks(theme=theme, css="""
1381
+ @import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');
1382
+
1383
+ .gradio-container {
1384
+ font-family: 'Open Sans', sans-serif !important;
1385
+ }
1386
+
1387
+ .gr-button {
1388
+ font-family: 'Open Sans', sans-serif !important;
1389
+ font-weight: 600 !important;
1390
+ }
1391
+
1392
+ /* Primary color customization */
1393
+ .primary-btn {
1394
+ background-color: #2436d4 !important;
1395
+ color: white !important;
1396
+ }
1397
+
1398
+ .primary-btn:hover {
1399
+ background-color: #1c2aa8 !important;
1400
+ }
1401
+
1402
+ [data-testid="textbox"] {
1403
+ font-family: 'Open Sans', sans-serif !important;
1404
+ }
1405
+ """) as iface:
1406
+ gr.Markdown("# llms.txt Generator")
1407
+ gr.Markdown("Generate an llms.txt file from a website following the specification. The tool crawls the website and creates a structured markdown file suitable for LLMs.")
1408
+
1409
+ with gr.Row():
1410
+ url_input = gr.Textbox(
1411
+ label="Website URL",
1412
+ placeholder="Enter the website URL (e.g., example.com or https://example.com)",
1413
+ info="The URL will be automatically prefixed with https:// if no protocol is specified."
1414
+ )
1415
+
1416
+ with gr.Row():
1417
+ with gr.Column():
1418
+ depth_input = gr.Slider(
1419
+ minimum=1,
1420
+ maximum=5,
1421
+ value=3,
1422
+ step=1,
1423
+ label="Maximum Crawl Depth",
1424
+ info="Higher values will result in more thorough but slower crawling"
1425
+ )
1426
+ with gr.Column():
1427
+ pages_input = gr.Slider(
1428
+ minimum=10,
1429
+ maximum=100,
1430
+ value=50,
1431
+ step=10,
1432
+ label="Maximum Pages to Crawl",
1433
+ info="Higher values will result in more comprehensive but slower results"
1434
+ )
1435
+
1436
+ generate_btn = gr.Button("Generate llms.txt", variant="primary")
1437
+
1438
+ with gr.Row():
1439
+ output = gr.Textbox(
1440
+ label="Generated llms.txt Content",
1441
+ lines=20,
1442
+ max_lines=30,
1443
+ show_copy_button=True,
1444
+ container=True,
1445
+ scale=2,
1446
+ interactive=True
1447
+ )
1448
+
1449
+ status = gr.Textbox(label="Status")
1450
+
1451
+ generate_btn.click(
1452
+ fn=lambda url, depth, pages: asyncio.run(process_url(url, depth, pages)),
1453
+ inputs=[url_input, depth_input, pages_input],
1454
+ outputs=[output, status]
1455
+ )
1456
+
1457
+ # Launch the app
1458
+ if __name__ == "__main__":
1459
+ iface.launch()
1460
+ , # Remove hash and everything after
1461
+ r'\s*\|.*
1462
+
1463
+ def is_valid_url(self, url, base_domain):
1464
+ """Check if URL is valid and belongs to the same domain"""
1465
+ try:
1466
+ parsed = urlparse(url)
1467
+ base_parsed = urlparse(base_domain)
1468
+ return (parsed.netloc == base_parsed.netloc and
1469
+ parsed.scheme in ['http', 'https'] and
1470
+ not url.endswith(('.pdf', '.jpg', '.png', '.gif', '.zip')))
1471
+ except:
1472
+ return False
1473
+
1474
+ def extract_content(self, soup):
1475
+ """Extract meaningful content from HTML"""
1476
+ # Remove script and style elements
1477
+ for element in soup(['script', 'style', 'nav', 'footer', 'header']):
1478
+ element.decompose()
1479
+
1480
+ # Get main content
1481
+ main_content = soup.find('main') or soup.find('article') or soup.find('div', {'class': re.compile(r'content|main', re.I)})
1482
+ if main_content:
1483
+ return self.normalize_text(main_content.get_text(strip=True))
1484
+ return self.normalize_text(soup.get_text(strip=True))
1485
+
1486
+ def get_page_metadata(self, soup, url):
1487
+ """Extract metadata from the page"""
1488
+ metadata = {
1489
+ 'title': None,
1490
+ 'description': None,
1491
+ 'importance': 0,
1492
+ 'category': 'Optional'
1493
+ }
1494
+
1495
+ # Title extraction with cleaning
1496
+ title = (
1497
+ soup.find('meta', property='og:title')['content'] if soup.find('meta', property='og:title') else
1498
+ soup.find('title').text if soup.find('title') else
1499
+ soup.find('h1').text if soup.find('h1') else
1500
+ url.split('/')[-1]
1501
+ )
1502
+ metadata['title'] = self.clean_title(title)
1503
+
1504
+ # Description extraction with cleaning
1505
+ description = (
1506
+ soup.find('meta', {'name': 'description'})['content'] if soup.find('meta', {'name': 'description'}) else
1507
+ soup.find('meta', property='og:description')['content'] if soup.find('meta', property='og:description') else
1508
+ ""
1509
+ )
1510
+ metadata['description'] = self.clean_description(description)
1511
+
1512
+ # Calculate importance and category
1513
+ url_lower = url.lower()
1514
+ if 'docs' in url_lower or 'documentation' in url_lower:
1515
+ metadata['importance'] = 5
1516
+ metadata['category'] = 'Docs'
1517
+ elif 'api' in url_lower:
1518
+ metadata['importance'] = 4
1519
+ metadata['category'] = 'API'
1520
+ elif 'guide' in url_lower or 'tutorial' in url_lower:
1521
+ metadata['importance'] = 3
1522
+ metadata['category'] = 'Guides'
1523
+ elif 'example' in url_lower:
1524
+ metadata['importance'] = 2
1525
+ metadata['category'] = 'Examples'
1526
+ elif 'blog' in url_lower:
1527
+ metadata['importance'] = 1
1528
+ metadata['category'] = 'Blog'
1529
+
1530
+ return metadata
1531
+
1532
+ async def crawl_page(self, url, depth, base_domain):
1533
+ """Crawl a single page and extract information"""
1534
+ if depth > self.max_depth or url in self.visited_urls or len(self.visited_urls) >= self.max_pages:
1535
+ return []
1536
+
1537
+ try:
1538
+ response = requests.get(url, headers=self.headers, timeout=self.timeout)
1539
+ response.encoding = 'utf-8'
1540
+ response.raise_for_status()
1541
+ self.visited_urls.add(url)
1542
+
1543
+ soup = BeautifulSoup(response.text, 'html.parser')
1544
+ content = self.extract_content(soup)
1545
+ metadata = self.get_page_metadata(soup, url)
1546
+
1547
+ self.url_content[url] = content
1548
+ self.url_metadata[url] = metadata
1549
+
1550
+ # Find all links
1551
+ links = []
1552
+ for a in soup.find_all('a', href=True):
1553
+ next_url = urljoin(url, a['href'])
1554
+ if self.is_valid_url(next_url, base_domain):
1555
+ links.append(next_url)
1556
+
1557
+ return links
1558
+
1559
+ except Exception as e:
1560
+ logger.error(f"Error crawling {url}: {str(e)}")
1561
+ return []
1562
+
1563
+ async def crawl_website(self, start_url):
1564
+ """Crawl website starting from the given URL"""
1565
+ base_domain = start_url
1566
+ queue = [(start_url, 0)]
1567
+ seen = {start_url}
1568
+
1569
+ while queue and len(self.visited_urls) < self.max_pages:
1570
+ current_url, depth = queue.pop(0)
1571
+
1572
+ if depth > self.max_depth:
1573
+ continue
1574
+
1575
+ links = await self.crawl_page(current_url, depth, base_domain)
1576
+
1577
+ for link in links:
1578
+ if link not in seen:
1579
+ seen.add(link)
1580
+ queue.append((link, depth + 1))
1581
+
1582
+    def generate_llms_txt(self):
+        """Generate llms.txt content from crawled data"""
+        # Sort URLs by importance
+        sorted_urls = sorted(
+            self.url_metadata.items(),
+            key=lambda x: (x[1]['importance'], x[0]),
+            reverse=True
+        )
+
+        if not sorted_urls:
+            return "No content was found to generate llms.txt"
+
+        # Group URLs by category
+        categorized_urls = defaultdict(list)
+        for url, metadata in sorted_urls:
+            categorized_urls[metadata['category']].append((url, metadata))
+
+        # Generate content
+        content = []
+
+        # Add main title and description
+        main_metadata = sorted_urls[0][1]
+        content.append(f"# {main_metadata['title']}")
+        if main_metadata['description']:
+            content.append(f"\n> {main_metadata['description']}")
+
+        # Add categorized sections
+        priority_order = ['Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional']
+
+        for category in priority_order:
+            if category in categorized_urls:
+                content.append(f"\n## {category}")
+                for url, metadata in categorized_urls[category]:
+                    if metadata['description']:
+                        content.append(f"\n- [{metadata['title']}]({url}): {metadata['description']}")
+                    else:
+                        content.append(f"\n- [{metadata['title']}]({url})")
+
+        return "\n".join(content)
+
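One subtlety in the sort above: `reverse=True` applies to the whole key, so URLs with equal importance come out in descending alphabetical order. A small standalone check (the sample URLs are made up):

```python
pages = {
    'https://example.com/docs/a': {'importance': 5},
    'https://example.com/docs/b': {'importance': 5},
    'https://example.com/blog/x': {'importance': 1},
}
ordered = sorted(pages.items(), key=lambda x: (x[1]['importance'], x[0]), reverse=True)
# Highest importance first; equal-importance pages tie-break on URL, reversed.
print([url for url, _ in ordered])
# ['https://example.com/docs/b', 'https://example.com/docs/a', 'https://example.com/blog/x']
```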
+async def process_url(url, max_depth, max_pages):
+    """Process URL and generate llms.txt"""
+    try:
+        # Add https:// if not present
+        if not url.startswith(('http://', 'https://')):
+            url = 'https://' + url
+
+        # Validate URL format
+        try:
+            result = urlparse(url)
+            if not all([result.scheme, result.netloc]):
+                return "", "Invalid URL format. Please enter a valid URL."
+        except:
+            return "", "Invalid URL format. Please enter a valid URL."
+
+        # Create crawler and process
+        crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
+        await crawler.crawl_website(url)
+        content = crawler.generate_llms_txt()
+
+        return content, f"Successfully crawled {len(crawler.visited_urls)} pages."
+
+    except Exception as e:
+        return "", f"Error: {str(e)}"
+
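Outside the Gradio UI, `process_url` can be exercised directly as a coroutine; a minimal usage sketch, assuming the module above is importable and using a placeholder URL:

```python
import asyncio

async def main():
    content, status = await process_url("example.com", max_depth=2, max_pages=10)
    print(status)
    print(content[:500])

if __name__ == "__main__":
    asyncio.run(main())
```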
+# Create custom theme
+theme = gr.themes.Soft(
+    primary_hue="blue",
+    font="Open Sans"
+)
+
+# Create the Gradio interface
+with gr.Blocks(theme=theme, css="""
+    @import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');
+
+    .gradio-container {
+        font-family: 'Open Sans', sans-serif !important;
+    }
+
+    .gr-button {
+        font-family: 'Open Sans', sans-serif !important;
+        font-weight: 600 !important;
+    }
+
+    /* Primary color customization */
+    .primary-btn {
+        background-color: #2436d4 !important;
+        color: white !important;
+    }
+
+    .primary-btn:hover {
+        background-color: #1c2aa8 !important;
+    }
+
+    [data-testid="textbox"] {
+        font-family: 'Open Sans', sans-serif !important;
+    }
+""") as iface:
+ gr.Markdown("# llms.txt Generator")
1681
+ gr.Markdown("Generate an llms.txt file from a website following the specification. The tool crawls the website and creates a structured markdown file suitable for LLMs.")
1682
+
1683
+ with gr.Row():
1684
+ url_input = gr.Textbox(
1685
+ label="Website URL",
1686
+ placeholder="Enter the website URL (e.g., example.com or https://example.com)",
1687
+ info="The URL will be automatically prefixed with https:// if no protocol is specified."
1688
+ )
1689
+
1690
+ with gr.Row():
1691
+ with gr.Column():
1692
+ depth_input = gr.Slider(
1693
+ minimum=1,
1694
+ maximum=5,
1695
+ value=3,
1696
+ step=1,
1697
+ label="Maximum Crawl Depth",
1698
+ info="Higher values will result in more thorough but slower crawling"
1699
+ )
1700
+ with gr.Column():
1701
+ pages_input = gr.Slider(
1702
+ minimum=10,
1703
+ maximum=100,
1704
+ value=50,
1705
+ step=10,
1706
+ label="Maximum Pages to Crawl",
1707
+ info="Higher values will result in more comprehensive but slower results"
1708
+ )
1709
+
1710
+ generate_btn = gr.Button("Generate llms.txt", variant="primary")
1711
+
1712
+ with gr.Row():
1713
+ output = gr.Textbox(
1714
+ label="Generated llms.txt Content",
1715
+ lines=20,
1716
+ max_lines=30,
1717
+ show_copy_button=True,
1718
+ container=True,
1719
+ scale=2,
1720
+ interactive=True
1721
+ )
1722
+
1723
+ status = gr.Textbox(label="Status")
1724
+
1725
+ generate_btn.click(
1726
+ fn=lambda url, depth, pages: asyncio.run(process_url(url, depth, pages)),
1727
+ inputs=[url_input, depth_input, pages_input],
1728
+ outputs=[output, status]
1729
+ )
1730
+
1731
+ # Launch the app
1732
+ if __name__ == "__main__":
1733
+ iface.launch()
1734
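Wrapping the coroutine in `asyncio.run` inside a lambda creates a fresh event loop on every click. Recent Gradio releases also accept async callables as event handlers, in which case the coroutine could be passed directly (assuming such a version is installed):

```python
# Alternative wiring, assuming a Gradio version that supports async handlers:
generate_btn.click(
    fn=process_url,  # an async def works as the event handler
    inputs=[url_input, depth_input, pages_input],
    outputs=[output, status],
)
```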
+            r'\s*\|\s*.*$',  # Remove pipe and everything after
+            r'\s*•.*$',  # Remove bullet and everything after
+            r'^\s*Welcome to\s+',  # Remove "Welcome to" at start
+            r'docusaurus_skipToContent_fallback',  # Remove docusaurus fragments
+        ]
+
+        for pattern in patterns:
+            title = re.sub(pattern, '', title)
+
+        # Clean up whitespace
+        title = ' '.join(title.split())
+        return title.strip()
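Taken together, these patterns strip site-name suffixes and boilerplate prefixes from raw titles. The `demo_clean` helper below is a hypothetical mirror of the loop above, used only to illustrate the effect:

```python
import re

patterns = [
    r'\s*\|\s*.*$',                       # pipe and everything after
    r'\s*•.*$',                           # bullet and everything after
    r'^\s*Welcome to\s+',                 # leading "Welcome to"
    r'docusaurus_skipToContent_fallback'  # Docusaurus fragment
]

def demo_clean(title: str) -> str:
    for pattern in patterns:
        title = re.sub(pattern, '', title)
    return ' '.join(title.split()).strip()

print(demo_clean('Welcome to Acme | Home'))  # -> 'Acme'
print(demo_clean('Docs • Getting Started'))  # -> 'Docs'
```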
 
     def clean_description(self, desc):
         """Clean and format descriptions"""
         if not desc:
             return ""
+
         desc = self.normalize_text(desc)
+
+        # Remove duplicate sentences
         sentences = re.split(r'(?<=[.!?])\s+', desc)
+        unique_sentences = []
+        seen_sentences = set()
+
+        for sentence in sentences:
+            sentence = sentence.strip()
+            sentence_lower = sentence.lower()
+            if sentence_lower not in seen_sentences and sentence:
+                if not sentence[-1] in '.!?':
+                    sentence += '.'
+                unique_sentences.append(sentence)
+                seen_sentences.add(sentence_lower)
+
+        cleaned_desc = ' '.join(unique_sentences)
+        return cleaned_desc
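The lookbehind in the split keeps each sentence's punctuation attached, which is what lets the loop above compare lower-cased sentences and drop exact repeats. A quick illustration of the split itself (sample text is made up):

```python
import re

text = "Build sites fast. Build sites fast. Ship with confidence"
print(re.split(r'(?<=[.!?])\s+', text))
# ['Build sites fast.', 'Build sites fast.', 'Ship with confidence']
# The loop above then drops the lower-cased duplicate and appends a missing
# terminal period, yielding: "Build sites fast. Ship with confidence."
```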
 
     def is_valid_url(self, url, base_domain):
         """Check if URL is valid and belongs to the same domain"""