cyberandy committed on
Commit
dd2349f
·
1 Parent(s): 8dd9e80
Files changed (1) hide show
  1. app.py +110 -93
app.py CHANGED
@@ -11,6 +11,7 @@ import logging
11
  logging.basicConfig(level=logging.INFO)
12
  logger = logging.getLogger(__name__)
13
 
 
14
  class WebsiteCrawler:
15
  def __init__(self, max_depth=3, max_pages=50):
16
  self.max_depth = max_depth
@@ -18,7 +19,7 @@ class WebsiteCrawler:
18
  self.visited_urls = set()
19
  self.url_metadata = defaultdict(dict)
20
  self.headers = {
21
- 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36'
22
  }
23
 
24
  def clean_text(self, text, is_title=False):
@@ -26,99 +27,106 @@ class WebsiteCrawler:
26
  if not text:
27
  return ""
28
  # Normalize unicode characters
29
- text = unicodedata.normalize('NFKD', text)
30
- text = re.sub(r'[^\x00-\x7F]+', '', text)
31
-
32
  if is_title:
33
  # Remove common suffixes and fragments for titles
34
- text = re.sub(r'\s*[\|\-#:•].*', '', text)
35
- text = re.sub(r'^\s*Welcome to\s+', '', text)
36
- text = text.replace('docusaurus_skipToContent_fallback', '')
37
-
38
- return ' '.join(text.split()).strip()
39
 
40
  async def crawl_page(self, url, depth, base_domain):
41
  """Crawl a single page and extract information"""
42
- if depth > self.max_depth or url in self.visited_urls or len(self.visited_urls) >= self.max_pages:
 
 
 
 
43
  return []
44
 
45
  try:
46
  response = requests.get(url, headers=self.headers, timeout=10)
47
- response.encoding = 'utf-8'
48
  self.visited_urls.add(url)
49
 
50
- soup = BeautifulSoup(response.text, 'html.parser')
51
-
52
  # Extract title with fallbacks
53
  title = None
54
- meta_title = soup.find('meta', property='og:title')
55
- if meta_title and meta_title.get('content'):
56
- title = meta_title['content']
57
  if not title:
58
- title_tag = soup.find('title')
59
  if title_tag:
60
  title = title_tag.text
61
  if not title:
62
- h1_tag = soup.find('h1')
63
  if h1_tag:
64
  title = h1_tag.text
65
  if not title:
66
- title = url.split('/')[-1]
67
 
68
  title = self.clean_text(title, is_title=True)
69
-
70
  # Extract description with fallbacks
71
  desc = None
72
- meta_desc = soup.find('meta', {'name': 'description'})
73
- if meta_desc and meta_desc.get('content'):
74
- desc = meta_desc['content']
75
  if not desc:
76
- og_desc = soup.find('meta', property='og:description')
77
- if og_desc and og_desc.get('content'):
78
- desc = og_desc['content']
79
  if not desc:
80
- first_p = soup.find('p')
81
  if first_p:
82
  desc = first_p.text
83
-
84
  desc = self.clean_text(desc) if desc else ""
85
 
86
  # Determine category and importance
87
  url_lower = url.lower()
88
- category = 'Optional'
89
  importance = 0
90
-
91
- if 'docs' in url_lower or 'documentation' in url_lower:
92
- category = 'Docs'
93
  importance = 5
94
- elif 'api' in url_lower:
95
- category = 'API'
96
  importance = 4
97
- elif 'guide' in url_lower or 'tutorial' in url_lower:
98
- category = 'Guides'
99
  importance = 3
100
- elif 'example' in url_lower:
101
- category = 'Examples'
102
  importance = 2
103
- elif 'blog' in url_lower:
104
- category = 'Blog'
105
  importance = 1
106
-
107
  # Store metadata
108
- clean_url = re.sub(r'#.*', '', url).rstrip('/')
109
  if title and len(title.strip()) > 0: # Only store if we have a valid title
110
  self.url_metadata[clean_url] = {
111
- 'title': title,
112
- 'description': desc,
113
- 'category': category,
114
- 'importance': importance
115
  }
116
 
117
  # Find links
118
  links = []
119
- for a in soup.find_all('a', href=True):
120
- href = a['href']
121
- if not any(x in href.lower() for x in ['javascript:', 'mailto:', '.pdf', '.jpg', '.png', '.gif']):
 
 
 
122
  next_url = urljoin(url, href)
123
  if urlparse(next_url).netloc == base_domain:
124
  links.append(next_url)
@@ -150,7 +158,7 @@ class WebsiteCrawler:
150
  if not desc:
151
  return ""
152
  # Remove leading dashes, hyphens, or colons
153
- desc = re.sub(r'^[-:\s]+', '', desc)
154
  # Remove any strings that are just "Editors", "APIs", etc.
155
  if len(desc.split()) <= 1:
156
  return ""
@@ -164,33 +172,33 @@ class WebsiteCrawler:
164
  # Sort URLs by importance and remove duplicates
165
  sorted_urls = []
166
  seen_titles = set()
167
-
168
  for url, metadata in sorted(
169
  self.url_metadata.items(),
170
- key=lambda x: (x[1]['importance'], x[0]),
171
- reverse=True
172
  ):
173
- if metadata['title'] not in seen_titles:
174
  sorted_urls.append((url, metadata))
175
- seen_titles.add(metadata['title'])
176
 
177
  if not sorted_urls:
178
  return "No valid content was found"
179
 
180
  # Generate content
181
  content = []
182
-
183
  # Find the best title for the main header (prefer "Welcome" or "Overview")
184
  main_title = "Welcome" # Default to Welcome
185
-
186
  # Find a good description for the blockquote
187
  best_description = None
188
  for _, metadata in sorted_urls:
189
- desc = self.clean_description(metadata['description'])
190
  if desc and len(desc) > 20 and "null" not in desc.lower():
191
  best_description = desc
192
  break
193
-
194
  content.append(f"# {main_title}")
195
  if best_description:
196
  content.append(f"\n> {best_description}")
@@ -198,34 +206,35 @@ class WebsiteCrawler:
198
  # Group by category
199
  categories = defaultdict(list)
200
  for url, metadata in sorted_urls:
201
- if metadata['title'] and url:
202
- categories[metadata['category']].append((url, metadata))
203
 
204
  # Add sections
205
- for category in ['Docs', 'API', 'Guides', 'Examples', 'Blog', 'Optional']:
206
  if category in categories:
207
  content.append(f"\n## {category}")
208
-
209
  # Add links without extra newlines
210
  links = []
211
  for url, metadata in categories[category]:
212
- title = metadata['title'].strip()
213
- desc = self.clean_description(metadata['description'])
214
  if desc:
215
  links.append(f"- [{title}]({url}): {desc}")
216
  else:
217
  links.append(f"- [{title}]({url})")
218
-
219
- content.append('\n'.join(links))
220
 
221
- return '\n'.join(content)
 
 
 
222
 
223
  async def process_url(url, max_depth, max_pages):
224
  """Process URL and generate llms.txt"""
225
  try:
226
  # Add https:// if not present
227
- if not url.startswith(('http://', 'https://')):
228
- url = 'https://' + url
229
 
230
  # Validate URL
231
  result = urlparse(url)
@@ -236,22 +245,25 @@ async def process_url(url, max_depth, max_pages):
236
  crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
237
  await crawler.crawl_website(url)
238
  content = crawler.generate_llms_txt()
239
-
240
  return content, f"Successfully crawled {len(crawler.visited_urls)} pages."
241
-
242
  except Exception as e:
243
  return "", f"Error: {str(e)}"
244
 
 
245
  # Create Gradio interface
246
  theme = gr.themes.Soft(primary_hue="blue", font="Open Sans")
247
 
248
- with gr.Blocks(theme=theme, css="""
 
 
249
  @import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');
250
-
251
  .gradio-container {
252
  font-family: 'Open Sans', sans-serif !important;
253
  }
254
-
255
  .gr-button {
256
  font-family: 'Open Sans', sans-serif !important;
257
  font-weight: 600 !important;
@@ -265,55 +277,60 @@ with gr.Blocks(theme=theme, css="""
265
  .primary-btn:hover {
266
  background-color: #1c2aa8 !important;
267
  }
268
-
269
  [data-testid="textbox"] {
270
  font-family: 'Open Sans', sans-serif !important;
271
  }
272
-
273
  .gr-padded {
274
  font-family: 'Open Sans', sans-serif !important;
275
  }
276
-
277
  .gr-input {
278
  font-family: 'Open Sans', sans-serif !important;
279
  }
280
-
281
  .gr-label {
282
  font-family: 'Open Sans', sans-serif !important;
283
  }
284
- """) as iface:
 
285
  gr.Markdown("# llms.txt Generator")
286
  gr.Markdown("Generate an llms.txt file from a website following the specification.")
287
-
288
  with gr.Row():
289
  url_input = gr.Textbox(
290
- label="Website URL",
291
  placeholder="Enter the website URL (e.g., example.com)",
292
- info="The URL will be automatically prefixed with https:// if not provided"
293
  )
294
-
295
  with gr.Row():
296
  with gr.Column():
297
- depth_input = gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Maximum Crawl Depth")
 
 
298
  with gr.Column():
299
- pages_input = gr.Slider(minimum=10, maximum=100, value=50, step=10, label="Maximum Pages")
300
-
 
 
301
  generate_btn = gr.Button("Generate llms.txt", variant="primary")
302
-
303
  output = gr.Textbox(
304
  label="Generated llms.txt Content",
305
  lines=20,
306
  show_copy_button=True,
307
- container=True
308
  )
309
-
310
  status = gr.Textbox(label="Status")
311
-
312
  generate_btn.click(
313
  fn=lambda url, depth, pages: asyncio.run(process_url(url, depth, pages)),
314
  inputs=[url_input, depth_input, pages_input],
315
- outputs=[output, status]
316
  )
317
 
318
  if __name__ == "__main__":
319
- iface.launch()
 
11
  logging.basicConfig(level=logging.INFO)
12
  logger = logging.getLogger(__name__)
13
 
14
+
15
  class WebsiteCrawler:
16
  def __init__(self, max_depth=3, max_pages=50):
17
  self.max_depth = max_depth
 
19
  self.visited_urls = set()
20
  self.url_metadata = defaultdict(dict)
21
  self.headers = {
22
+ "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
23
  }
24
 
25
  def clean_text(self, text, is_title=False):
 
27
  if not text:
28
  return ""
29
  # Normalize unicode characters
30
+ text = unicodedata.normalize("NFKD", text)
31
+ text = re.sub(r"[^\x00-\x7F]+", "", text)
32
+
33
  if is_title:
34
  # Remove common suffixes and fragments for titles
35
+ text = re.sub(r"\s*[\|\-#:•].*", "", text)
36
+ text = re.sub(r"^\s*Welcome to\s+", "", text)
37
+ text = text.replace("docusaurus_skipToContent_fallback", "")
38
+
39
+ return " ".join(text.split()).strip()
40
 
41
  async def crawl_page(self, url, depth, base_domain):
42
  """Crawl a single page and extract information"""
43
+ if (
44
+ depth > self.max_depth
45
+ or url in self.visited_urls
46
+ or len(self.visited_urls) >= self.max_pages
47
+ ):
48
  return []
49
 
50
  try:
51
  response = requests.get(url, headers=self.headers, timeout=10)
52
+ response.encoding = "utf-8"
53
  self.visited_urls.add(url)
54
 
55
+ soup = BeautifulSoup(response.text, "html.parser")
56
+
57
  # Extract title with fallbacks
58
  title = None
59
+ meta_title = soup.find("meta", property="og:title")
60
+ if meta_title and meta_title.get("content"):
61
+ title = meta_title["content"]
62
  if not title:
63
+ title_tag = soup.find("title")
64
  if title_tag:
65
  title = title_tag.text
66
  if not title:
67
+ h1_tag = soup.find("h1")
68
  if h1_tag:
69
  title = h1_tag.text
70
  if not title:
71
+ title = url.split("/")[-1]
72
 
73
  title = self.clean_text(title, is_title=True)
74
+
75
  # Extract description with fallbacks
76
  desc = None
77
+ meta_desc = soup.find("meta", {"name": "description"})
78
+ if meta_desc and meta_desc.get("content"):
79
+ desc = meta_desc["content"]
80
  if not desc:
81
+ og_desc = soup.find("meta", property="og:description")
82
+ if og_desc and og_desc.get("content"):
83
+ desc = og_desc["content"]
84
  if not desc:
85
+ first_p = soup.find("p")
86
  if first_p:
87
  desc = first_p.text
88
+
89
  desc = self.clean_text(desc) if desc else ""
90
 
91
  # Determine category and importance
92
  url_lower = url.lower()
93
+ category = "Optional"
94
  importance = 0
95
+
96
+ if "docs" in url_lower or "documentation" in url_lower:
97
+ category = "Docs"
98
  importance = 5
99
+ elif "api" in url_lower:
100
+ category = "API"
101
  importance = 4
102
+ elif "guide" in url_lower or "tutorial" in url_lower:
103
+ category = "Guides"
104
  importance = 3
105
+ elif "example" in url_lower:
106
+ category = "Examples"
107
  importance = 2
108
+ elif "blog" in url_lower:
109
+ category = "Blog"
110
  importance = 1
111
+
112
  # Store metadata
113
+ clean_url = re.sub(r"#.*", "", url).rstrip("/")
114
  if title and len(title.strip()) > 0: # Only store if we have a valid title
115
  self.url_metadata[clean_url] = {
116
+ "title": title,
117
+ "description": desc,
118
+ "category": category,
119
+ "importance": importance,
120
  }
121
 
122
  # Find links
123
  links = []
124
+ for a in soup.find_all("a", href=True):
125
+ href = a["href"]
126
+ if not any(
127
+ x in href.lower()
128
+ for x in ["javascript:", "mailto:", ".pdf", ".jpg", ".png", ".gif"]
129
+ ):
130
  next_url = urljoin(url, href)
131
  if urlparse(next_url).netloc == base_domain:
132
  links.append(next_url)
 
158
  if not desc:
159
  return ""
160
  # Remove leading dashes, hyphens, or colons
161
+ desc = re.sub(r"^[-:\s]+", "", desc)
162
  # Remove any strings that are just "Editors", "APIs", etc.
163
  if len(desc.split()) <= 1:
164
  return ""
 
172
  # Sort URLs by importance and remove duplicates
173
  sorted_urls = []
174
  seen_titles = set()
175
+
176
  for url, metadata in sorted(
177
  self.url_metadata.items(),
178
+ key=lambda x: (x[1]["importance"], x[0]),
179
+ reverse=True,
180
  ):
181
+ if metadata["title"] not in seen_titles:
182
  sorted_urls.append((url, metadata))
183
+ seen_titles.add(metadata["title"])
184
 
185
  if not sorted_urls:
186
  return "No valid content was found"
187
 
188
  # Generate content
189
  content = []
190
+
191
  # Find the best title for the main header (prefer "Welcome" or "Overview")
192
  main_title = "Welcome" # Default to Welcome
193
+
194
  # Find a good description for the blockquote
195
  best_description = None
196
  for _, metadata in sorted_urls:
197
+ desc = self.clean_description(metadata["description"])
198
  if desc and len(desc) > 20 and "null" not in desc.lower():
199
  best_description = desc
200
  break
201
+
202
  content.append(f"# {main_title}")
203
  if best_description:
204
  content.append(f"\n> {best_description}")
 
206
  # Group by category
207
  categories = defaultdict(list)
208
  for url, metadata in sorted_urls:
209
+ if metadata["title"] and url:
210
+ categories[metadata["category"]].append((url, metadata))
211
 
212
  # Add sections
213
+ for category in ["Docs", "API", "Guides", "Examples", "Blog", "Optional"]:
214
  if category in categories:
215
  content.append(f"\n## {category}")
216
+
217
  # Add links without extra newlines
218
  links = []
219
  for url, metadata in categories[category]:
220
+ title = metadata["title"].strip()
221
+ desc = self.clean_description(metadata["description"])
222
  if desc:
223
  links.append(f"- [{title}]({url}): {desc}")
224
  else:
225
  links.append(f"- [{title}]({url})")
 
 
226
 
227
+ content.append("\n".join(links))
228
+
229
+ return "\n".join(content)
230
+
231
 
232
  async def process_url(url, max_depth, max_pages):
233
  """Process URL and generate llms.txt"""
234
  try:
235
  # Add https:// if not present
236
+ if not url.startswith(("http://", "https://")):
237
+ url = "https://" + url
238
 
239
  # Validate URL
240
  result = urlparse(url)
 
245
  crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
246
  await crawler.crawl_website(url)
247
  content = crawler.generate_llms_txt()
248
+
249
  return content, f"Successfully crawled {len(crawler.visited_urls)} pages."
250
+
251
  except Exception as e:
252
  return "", f"Error: {str(e)}"
253
 
254
+
255
  # Create Gradio interface
256
  theme = gr.themes.Soft(primary_hue="blue", font="Open Sans")
257
 
258
+ with gr.Blocks(
259
+ theme=theme,
260
+ css="""
261
  @import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');
262
+
263
  .gradio-container {
264
  font-family: 'Open Sans', sans-serif !important;
265
  }
266
+
267
  .gr-button {
268
  font-family: 'Open Sans', sans-serif !important;
269
  font-weight: 600 !important;
 
277
  .primary-btn:hover {
278
  background-color: #1c2aa8 !important;
279
  }
280
+
281
  [data-testid="textbox"] {
282
  font-family: 'Open Sans', sans-serif !important;
283
  }
284
+
285
  .gr-padded {
286
  font-family: 'Open Sans', sans-serif !important;
287
  }
288
+
289
  .gr-input {
290
  font-family: 'Open Sans', sans-serif !important;
291
  }
292
+
293
  .gr-label {
294
  font-family: 'Open Sans', sans-serif !important;
295
  }
296
+ """,
297
+ ) as iface:
298
  gr.Markdown("# llms.txt Generator")
299
  gr.Markdown("Generate an llms.txt file from a website following the specification.")
300
+
301
  with gr.Row():
302
  url_input = gr.Textbox(
303
+ label="Website URL",
304
  placeholder="Enter the website URL (e.g., example.com)",
305
+ info="The URL will be automatically prefixed with https:// if not provided",
306
  )
307
+
308
  with gr.Row():
309
  with gr.Column():
310
+ depth_input = gr.Slider(
311
+ minimum=1, maximum=5, value=3, step=1, label="Maximum Crawl Depth"
312
+ )
313
  with gr.Column():
314
+ pages_input = gr.Slider(
315
+ minimum=10, maximum=100, value=50, step=10, label="Maximum Pages"
316
+ )
317
+
318
  generate_btn = gr.Button("Generate llms.txt", variant="primary")
319
+
320
  output = gr.Textbox(
321
  label="Generated llms.txt Content",
322
  lines=20,
323
  show_copy_button=True,
324
+ container=True,
325
  )
326
+
327
  status = gr.Textbox(label="Status")
328
+
329
  generate_btn.click(
330
  fn=lambda url, depth, pages: asyncio.run(process_url(url, depth, pages)),
331
  inputs=[url_input, depth_input, pages_input],
332
+ outputs=[output, status],
333
  )
334
 
335
  if __name__ == "__main__":
336
+ iface.launch()