cyberandy committed
Commit
4f7928b
1 Parent(s): ab2a9d9

Update app.py

Files changed (1)
  1. app.py +70 -31
app.py CHANGED
@@ -9,6 +9,7 @@ import asyncio
 from collections import defaultdict
 import time
 import logging
+ import unicodedata

 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -26,6 +27,20 @@ class WebsiteCrawler:
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
         }

+     def normalize_text(self, text):
+         """Normalize text to handle encoding issues"""
+         if not text:
+             return ""
+         # Normalize unicode characters
+         text = unicodedata.normalize('NFKD', text)
+         # Replace special quotes and dashes with standard characters
+         text = text.replace('“', '"').replace('”', '"').replace('’', "'").replace('—', '-')
+         # Remove any remaining non-ASCII characters
+         text = text.encode('ascii', 'ignore').decode('ascii')
+         # Clean up extra whitespace
+         text = ' '.join(text.split())
+         return text
+
     def is_valid_url(self, url, base_domain):
         """Check if URL is valid and belongs to the same domain"""
         try:
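
The new normalize_text helper is easy to sanity-check on its own. A minimal sketch (illustrative, not part of the commit; the constructor arguments are arbitrary):

```python
# Hypothetical standalone check of normalize_text.
crawler = WebsiteCrawler(max_depth=1, max_pages=1)
print(crawler.normalize_text('“Smart” quotes — and\u00a0odd   spacing'))
# -> "Smart" quotes - and odd spacing
```

Worth noting: the final ASCII round-trip drops any non-Latin characters outright, so pages in non-Latin scripts will lose their text entirely.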
@@ -46,8 +61,8 @@ class WebsiteCrawler:
         # Get main content
         main_content = soup.find('main') or soup.find('article') or soup.find('div', {'class': re.compile(r'content|main', re.I)})
         if main_content:
-             return main_content.get_text(strip=True)
-         return soup.get_text(strip=True)
+             return self.normalize_text(main_content.get_text(strip=True))
+         return self.normalize_text(soup.get_text(strip=True))

     def get_page_metadata(self, soup, url):
         """Extract metadata from the page"""
@@ -58,20 +73,22 @@ class WebsiteCrawler:
             'category': 'Optional'
         }

-         # Title extraction
-         metadata['title'] = (
+         # Title extraction with normalization
+         title = (
             soup.find('meta', property='og:title')['content'] if soup.find('meta', property='og:title') else
             soup.find('title').text if soup.find('title') else
             soup.find('h1').text if soup.find('h1') else
             url.split('/')[-1]
         )
+         metadata['title'] = self.normalize_text(title)

-         # Description extraction
-         metadata['description'] = (
+         # Description extraction with normalization
+         description = (
             soup.find('meta', {'name': 'description'})['content'] if soup.find('meta', {'name': 'description'}) else
             soup.find('meta', property='og:description')['content'] if soup.find('meta', property='og:description') else
             ""
         )
+         metadata['description'] = self.normalize_text(description)

         # Calculate importance based on various factors
         importance = 0
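
Title and description now resolve a raw value first and normalize it second. A quick trace of the title path, using hypothetical markup and only the `<title>` branch of the chain:

```python
from bs4 import BeautifulSoup

# Hypothetical page; only the <title> branch of the extraction chain fires here.
soup = BeautifulSoup('<html><title>“Docs” — Home</title></html>', 'html.parser')
title = soup.find('title').text
print(WebsiteCrawler(max_depth=1, max_pages=1).normalize_text(title))
# -> "Docs" - Home
```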
@@ -101,6 +118,7 @@ class WebsiteCrawler:

         try:
             response = requests.get(url, headers=self.headers, timeout=self.timeout)
+             response.encoding = 'utf-8'  # Explicitly set encoding
             response.raise_for_status()
             self.visited_urls.add(url)

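Rationale for the encoding override: when a text/* response omits a charset parameter, requests falls back to ISO-8859-1, so UTF-8 punctuation arrives as mojibake before normalize_text ever sees it. A standalone illustration (not from the commit):

```python
import requests

resp = requests.get('https://example.com', timeout=30)
print(resp.encoding)     # 'ISO-8859-1' when the server sends no charset
resp.encoding = 'utf-8'  # mirrors the fix above; set before reading resp.text
```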
@@ -182,27 +200,27 @@ class WebsiteCrawler:

         return "\n".join(content)

- def save_llms_txt(content, save_path="llms.txt"):
-     """Save the generated content to a file"""
-     try:
-         with open(save_path, 'w', encoding='utf-8') as f:
-             f.write(content)
-         return f"Successfully saved to {save_path}"
-     except Exception as e:
-         return f"Error saving file: {str(e)}"
-
- async def process_url(url, max_depth, max_pages, save_to_file=False):
+ async def process_url(url, max_depth, max_pages):
     """Process URL and generate llms.txt"""
     try:
+         # Add https:// if not present
+         if not url.startswith(('http://', 'https://')):
+             url = 'https://' + url
+
+         # Validate URL format
+         try:
+             result = urlparse(url)
+             if not all([result.scheme, result.netloc]):
+                 return "", "Invalid URL format. Please enter a valid URL."
+         except:
+             return "", "Invalid URL format. Please enter a valid URL."
+
+         # Create crawler and process
         crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
         await crawler.crawl_website(url)
         content = crawler.generate_llms_txt()

-         if save_to_file:
-             save_message = save_llms_txt(content)
-             return content, f"Crawled {len(crawler.visited_urls)} pages. {save_message}"
-
-         return content, f"Crawled {len(crawler.visited_urls)} pages. File not saved (checkbox not selected)"
+         return content, f"Successfully crawled {len(crawler.visited_urls)} pages. You can now copy the generated content."

     except Exception as e:
         return "", f"Error: {str(e)}"
@@ -224,27 +242,48 @@ body, .gradio-container {
     font-family: 'Open Sans', sans-serif !important;
     font-weight: 600 !important;
 }
+
+ .gr-input {
+     font-family: 'Open Sans', sans-serif !important;
+ }
 """

 # Create the Gradio interface
 iface = gr.Interface(
-     fn=lambda url, max_depth, max_pages, save: asyncio.run(process_url(url, max_depth, max_pages, save)),
+     fn=lambda url, max_depth, max_pages: asyncio.run(process_url(url, max_depth, max_pages)),
     inputs=[
-         gr.Textbox(label="Website URL", placeholder="Enter the website URL..."),
-         gr.Slider(minimum=1, maximum=5, value=3, step=1, label="Maximum Crawl Depth"),
-         gr.Slider(minimum=10, maximum=100, value=50, step=10, label="Maximum Pages to Crawl"),
-         gr.Checkbox(label="Save to file", value=False)
+         gr.Textbox(
+             label="Website URL",
+             placeholder="Enter the website URL (e.g., example.com or https://example.com)",
+             info="The URL will be automatically prefixed with https:// if no protocol is specified."
+         ),
+         gr.Slider(
+             minimum=1,
+             maximum=5,
+             value=3,
+             step=1,
+             label="Maximum Crawl Depth",
+             info="Higher values will result in more thorough but slower crawling"
+         ),
+         gr.Slider(
+             minimum=10,
+             maximum=100,
+             value=50,
+             step=10,
+             label="Maximum Pages to Crawl",
+             info="Higher values will result in more comprehensive but slower results"
+         )
     ],
     outputs=[
-         gr.Textbox(label="Generated llms.txt Content", lines=20),
+         gr.Textbox(
+             label="Generated llms.txt Content",
+             lines=20,
+             info="Copy this content to create your llms.txt file"
+         ),
         gr.Textbox(label="Status")
     ],
     title="llms.txt Generator",
     description="Generate an llms.txt file from a website following the specification. The tool crawls the website and creates a structured markdown file suitable for LLMs.",
-     examples=[
-         ["https://example.com", 3, 50, False],
-         ["https://docs.python.org", 3, 50, True]
-     ],
     theme=gr.themes.Soft(),
     css=css
 )
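
With the save-to-file path gone, the callback arity drops from four arguments to three, keeping the lambda, the inputs list, and process_url in sync. Dropping the examples also avoids a mismatch: both old examples carried a fourth save flag the new signature no longer accepts. A hypothetical direct call, the way the interface will invoke it (this would perform a real crawl if run):

```python
# Hypothetical invocation matching the new 3-argument signature.
content, status = asyncio.run(process_url('example.com', 3, 50))
print(status)  # e.g. 'Successfully crawled N pages. ...'
```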
 