Update app.py
app.py CHANGED
@@ -9,6 +9,7 @@ import asyncio
 from collections import defaultdict
 import time
 import logging
+import unicodedata
 
 # Set up logging
 logging.basicConfig(level=logging.INFO)
@@ -26,6 +27,20 @@ class WebsiteCrawler:
             'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36'
         }
 
+    def normalize_text(self, text):
+        """Normalize text to handle encoding issues"""
+        if not text:
+            return ""
+        # Normalize unicode characters
+        text = unicodedata.normalize('NFKD', text)
+        # Replace special quotes and dashes with standard characters
+        text = text.replace('“', '"').replace('”', '"').replace('’', "'").replace('—', '-')
+        # Remove any remaining non-ASCII characters
+        text = text.encode('ascii', 'ignore').decode('ascii')
+        # Clean up extra whitespace
+        text = ' '.join(text.split())
+        return text
+
     def is_valid_url(self, url, base_domain):
         """Check if URL is valid and belongs to the same domain"""
         try:
@@ -46,8 +61,8 @@ class WebsiteCrawler:
         # Get main content
         main_content = soup.find('main') or soup.find('article') or soup.find('div', {'class': re.compile(r'content|main', re.I)})
         if main_content:
-            return main_content.get_text(strip=True)
-        return soup.get_text(strip=True)
+            return self.normalize_text(main_content.get_text(strip=True))
+        return self.normalize_text(soup.get_text(strip=True))
 
     def get_page_metadata(self, soup, url):
         """Extract metadata from the page"""
@@ -58,20 +73,22 @@
             'category': 'Optional'
         }
 
-        # Title extraction
-        metadata['title'] = (
+        # Title extraction with normalization
+        title = (
             soup.find('meta', property='og:title')['content'] if soup.find('meta', property='og:title') else
             soup.find('title').text if soup.find('title') else
             soup.find('h1').text if soup.find('h1') else
             url.split('/')[-1]
         )
+        metadata['title'] = self.normalize_text(title)
 
-        # Description extraction
-        metadata['description'] = (
+        # Description extraction with normalization
+        description = (
             soup.find('meta', {'name': 'description'})['content'] if soup.find('meta', {'name': 'description'}) else
             soup.find('meta', property='og:description')['content'] if soup.find('meta', property='og:description') else
             ""
         )
+        metadata['description'] = self.normalize_text(description)
 
         # Calculate importance based on various factors
         importance = 0
@@ -101,6 +118,7 @@ class WebsiteCrawler:
 
         try:
             response = requests.get(url, headers=self.headers, timeout=self.timeout)
+            response.encoding = 'utf-8'  # Explicitly set encoding
             response.raise_for_status()
             self.visited_urls.add(url)
 
@@ -182,27 +200,27 @@ class WebsiteCrawler:
 
         return "\n".join(content)
 
-def save_llms_txt(content, save_path="llms.txt"):
-    """Save the generated content to a file"""
-    try:
-        with open(save_path, 'w', encoding='utf-8') as f:
-            f.write(content)
-        return f"Successfully saved to {save_path}"
-    except Exception as e:
-        return f"Error saving file: {str(e)}"
-
-async def process_url(url, max_depth, max_pages, save_to_file=False):
+async def process_url(url, max_depth, max_pages):
     """Process URL and generate llms.txt"""
     try:
+        # Add https:// if not present
+        if not url.startswith(('http://', 'https://')):
+            url = 'https://' + url
+
+        # Validate URL format
+        try:
+            result = urlparse(url)
+            if not all([result.scheme, result.netloc]):
+                return "", "Invalid URL format. Please enter a valid URL."
+        except:
+            return "", "Invalid URL format. Please enter a valid URL."
+
+        # Create crawler and process
         crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
         await crawler.crawl_website(url)
         content = crawler.generate_llms_txt()
 
-        if save_to_file:
-            save_message = save_llms_txt(content)
-            return content, f"Crawled {len(crawler.visited_urls)} pages. {save_message}"
-        else:
-            return content, f"Crawled {len(crawler.visited_urls)} pages. File not saved (checkbox not selected)"
+        return content, f"Successfully crawled {len(crawler.visited_urls)} pages. You can now copy the generated content."
 
     except Exception as e:
         return "", f"Error: {str(e)}"
@@ -224,27 +242,48 @@ body, .gradio-container {
     font-family: 'Open Sans', sans-serif !important;
     font-weight: 600 !important;
 }
+
+.gr-input {
+    font-family: 'Open Sans', sans-serif !important;
+}
 """
 
 # Create the Gradio interface
 iface = gr.Interface(
-    fn=lambda url, max_depth, max_pages, save_to_file: asyncio.run(process_url(url, max_depth, max_pages, save_to_file)),
+    fn=lambda url, max_depth, max_pages: asyncio.run(process_url(url, max_depth, max_pages)),
     inputs=[
-        gr.Textbox(
-
-
-
+        gr.Textbox(
+            label="Website URL",
+            placeholder="Enter the website URL (e.g., example.com or https://example.com)",
+            info="The URL will be automatically prefixed with https:// if no protocol is specified."
+        ),
+        gr.Slider(
+            minimum=1,
+            maximum=5,
+            value=3,
+            step=1,
+            label="Maximum Crawl Depth",
+            info="Higher values will result in more thorough but slower crawling"
+        ),
+        gr.Slider(
+            minimum=10,
+            maximum=100,
+            value=50,
+            step=10,
+            label="Maximum Pages to Crawl",
+            info="Higher values will result in more comprehensive but slower results"
+        )
     ],
     outputs=[
-        gr.Textbox(
+        gr.Textbox(
+            label="Generated llms.txt Content",
+            lines=20,
+            info="Copy this content to create your llms.txt file"
+        ),
         gr.Textbox(label="Status")
     ],
     title="llms.txt Generator",
     description="Generate an llms.txt file from a website following the specification. The tool crawls the website and creates a structured markdown file suitable for LLMs.",
-    examples=[
-        ["https://example.com", 3, 50, False],
-        ["https://docs.python.org", 3, 50, True]
-    ],
    theme=gr.themes.Soft(),
    css=css
)
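
For reference, a minimal standalone sketch of what the new normalize_text method does to scraped text. The free function below mirrors the method added in the diff; the sample string is illustrative and not part of the commit:

import unicodedata

def normalize_text(text):
    """Fold smart quotes and dashes to ASCII, mirroring the method added above."""
    if not text:
        return ""
    # Decompose accented characters so the ASCII filter keeps their base letters
    text = unicodedata.normalize('NFKD', text)
    # Map typographic quotes and em dashes to plain ASCII equivalents
    text = text.replace('“', '"').replace('”', '"').replace('’', "'").replace('—', '-')
    # Drop anything still outside ASCII, then collapse whitespace
    text = text.encode('ascii', 'ignore').decode('ascii')
    return ' '.join(text.split())

print(normalize_text('“Hello” — it’s café time'))
# prints: "Hello" - it's cafe time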
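
The new response.encoding = 'utf-8' line makes requests decode the body as UTF-8 before .text is handed to BeautifulSoup, instead of relying on the charset the server advertises. A small sketch of that behaviour, with a placeholder URL:

import requests

response = requests.get('https://example.com', timeout=10)
# requests guesses the charset from the Content-Type header; text/html with no
# declared charset falls back to ISO-8859-1, which can garble UTF-8 pages.
response.encoding = 'utf-8'  # decode the body as UTF-8, as the commit now does
html = response.text
print(html[:80])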
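
The reworked process_url prefixes bare domains with https:// and validates the result with urlparse before crawling, and the Interface fn stays a plain lambda that drives the coroutine with asyncio.run. A trimmed sketch of that pattern, with a dummy coroutine standing in for the real crawler (coerce_url and fake_crawl are illustrative names, not from the commit):

import asyncio
from urllib.parse import urlparse

def coerce_url(url):
    # Prefix https:// when no scheme is given, as the updated process_url does
    if not url.startswith(('http://', 'https://')):
        url = 'https://' + url
    # Reject anything that still lacks a scheme or host
    parts = urlparse(url)
    if not all([parts.scheme, parts.netloc]):
        raise ValueError("Invalid URL format. Please enter a valid URL.")
    return url

async def fake_crawl(url, max_depth, max_pages):
    # Stand-in for WebsiteCrawler.crawl_website; just echoes its arguments
    return f"would crawl {coerce_url(url)} (depth={max_depth}, pages={max_pages})"

# Same wiring as the Interface fn in the diff: a sync lambda running the coroutine
fn = lambda url, max_depth, max_pages: asyncio.run(fake_crawl(url, max_depth, max_pages))
print(fn("example.com", 3, 50))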