update
app.py CHANGED
@@ -11,6 +11,7 @@ import logging
 logging.basicConfig(level=logging.INFO)
 logger = logging.getLogger(__name__)

+
 class WebsiteCrawler:
     def __init__(self, max_depth=3, max_pages=50):
         self.max_depth = max_depth
@@ -18,7 +19,7 @@ class WebsiteCrawler:
         self.visited_urls = set()
         self.url_metadata = defaultdict(dict)
         self.headers = {
+            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
         }

     def clean_text(self, text, is_title=False):
@@ -26,99 +27,106 @@ class WebsiteCrawler:
         if not text:
             return ""
         # Normalize unicode characters
+        text = unicodedata.normalize("NFKD", text)
+        text = re.sub(r"[^\x00-\x7F]+", "", text)
+
         if is_title:
             # Remove common suffixes and fragments for titles
+            text = re.sub(r"\s*[\|\-#:•].*", "", text)
+            text = re.sub(r"^\s*Welcome to\s+", "", text)
+            text = text.replace("docusaurus_skipToContent_fallback", "")
+
+        return " ".join(text.split()).strip()

     async def crawl_page(self, url, depth, base_domain):
         """Crawl a single page and extract information"""
+        if (
+            depth > self.max_depth
+            or url in self.visited_urls
+            or len(self.visited_urls) >= self.max_pages
+        ):
             return []

         try:
             response = requests.get(url, headers=self.headers, timeout=10)
+            response.encoding = "utf-8"
             self.visited_urls.add(url)

+            soup = BeautifulSoup(response.text, "html.parser")
+
             # Extract title with fallbacks
             title = None
+            meta_title = soup.find("meta", property="og:title")
+            if meta_title and meta_title.get("content"):
+                title = meta_title["content"]
             if not title:
+                title_tag = soup.find("title")
                 if title_tag:
                     title = title_tag.text
             if not title:
+                h1_tag = soup.find("h1")
                 if h1_tag:
                     title = h1_tag.text
             if not title:
+                title = url.split("/")[-1]

             title = self.clean_text(title, is_title=True)
+
             # Extract description with fallbacks
             desc = None
+            meta_desc = soup.find("meta", {"name": "description"})
+            if meta_desc and meta_desc.get("content"):
+                desc = meta_desc["content"]
             if not desc:
+                og_desc = soup.find("meta", property="og:description")
+                if og_desc and og_desc.get("content"):
+                    desc = og_desc["content"]
             if not desc:
+                first_p = soup.find("p")
                 if first_p:
                     desc = first_p.text
+
             desc = self.clean_text(desc) if desc else ""

             # Determine category and importance
             url_lower = url.lower()
+            category = "Optional"
             importance = 0
+
+            if "docs" in url_lower or "documentation" in url_lower:
+                category = "Docs"
                 importance = 5
+            elif "api" in url_lower:
+                category = "API"
                 importance = 4
+            elif "guide" in url_lower or "tutorial" in url_lower:
+                category = "Guides"
                 importance = 3
+            elif "example" in url_lower:
+                category = "Examples"
                 importance = 2
+            elif "blog" in url_lower:
+                category = "Blog"
                 importance = 1
+
             # Store metadata
+            clean_url = re.sub(r"#.*", "", url).rstrip("/")
             if title and len(title.strip()) > 0:  # Only store if we have a valid title
                 self.url_metadata[clean_url] = {
+                    "title": title,
+                    "description": desc,
+                    "category": category,
+                    "importance": importance,
                 }

             # Find links
             links = []
+            for a in soup.find_all("a", href=True):
+                href = a["href"]
+                if not any(
+                    x in href.lower()
+                    for x in ["javascript:", "mailto:", ".pdf", ".jpg", ".png", ".gif"]
+                ):
                     next_url = urljoin(url, href)
                     if urlparse(next_url).netloc == base_domain:
                         links.append(next_url)
@@ -150,7 +158,7 @@ class WebsiteCrawler:
         if not desc:
             return ""
         # Remove leading dashes, hyphens, or colons
+        desc = re.sub(r"^[-:\s]+", "", desc)
         # Remove any strings that are just "Editors", "APIs", etc.
         if len(desc.split()) <= 1:
             return ""
@@ -164,33 +172,33 @@ class WebsiteCrawler:
         # Sort URLs by importance and remove duplicates
         sorted_urls = []
         seen_titles = set()
+
         for url, metadata in sorted(
             self.url_metadata.items(),
+            key=lambda x: (x[1]["importance"], x[0]),
+            reverse=True,
         ):
+            if metadata["title"] not in seen_titles:
                 sorted_urls.append((url, metadata))
+                seen_titles.add(metadata["title"])

         if not sorted_urls:
             return "No valid content was found"

         # Generate content
         content = []
+
         # Find the best title for the main header (prefer "Welcome" or "Overview")
         main_title = "Welcome"  # Default to Welcome
+
         # Find a good description for the blockquote
         best_description = None
         for _, metadata in sorted_urls:
+            desc = self.clean_description(metadata["description"])
             if desc and len(desc) > 20 and "null" not in desc.lower():
                 best_description = desc
                 break
+
         content.append(f"# {main_title}")
         if best_description:
             content.append(f"\n> {best_description}")
@@ -198,34 +206,35 @@ class WebsiteCrawler:
         # Group by category
         categories = defaultdict(list)
         for url, metadata in sorted_urls:
+            if metadata["title"] and url:
+                categories[metadata["category"]].append((url, metadata))

         # Add sections
+        for category in ["Docs", "API", "Guides", "Examples", "Blog", "Optional"]:
             if category in categories:
                 content.append(f"\n## {category}")
+
                 # Add links without extra newlines
                 links = []
                 for url, metadata in categories[category]:
+                    title = metadata["title"].strip()
+                    desc = self.clean_description(metadata["description"])
                     if desc:
                         links.append(f"- [{title}]({url}): {desc}")
                     else:
                         links.append(f"- [{title}]({url})")

+                content.append("\n".join(links))
+
+        return "\n".join(content)
+

 async def process_url(url, max_depth, max_pages):
     """Process URL and generate llms.txt"""
     try:
         # Add https:// if not present
+        if not url.startswith(("http://", "https://")):
+            url = "https://" + url

         # Validate URL
         result = urlparse(url)
@@ -236,22 +245,25 @@ async def process_url(url, max_depth, max_pages):
         crawler = WebsiteCrawler(max_depth=int(max_depth), max_pages=int(max_pages))
         await crawler.crawl_website(url)
         content = crawler.generate_llms_txt()
+
         return content, f"Successfully crawled {len(crawler.visited_urls)} pages."
+
     except Exception as e:
         return "", f"Error: {str(e)}"

+
 # Create Gradio interface
 theme = gr.themes.Soft(primary_hue="blue", font="Open Sans")

+with gr.Blocks(
+    theme=theme,
+    css="""
     @import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');
+
     .gradio-container {
         font-family: 'Open Sans', sans-serif !important;
     }
+
     .gr-button {
         font-family: 'Open Sans', sans-serif !important;
         font-weight: 600 !important;
@@ -265,55 +277,60 @@ with gr.Blocks(theme=theme, css="""
     .primary-btn:hover {
         background-color: #1c2aa8 !important;
     }
+
     [data-testid="textbox"] {
         font-family: 'Open Sans', sans-serif !important;
     }
+
     .gr-padded {
         font-family: 'Open Sans', sans-serif !important;
     }
+
     .gr-input {
         font-family: 'Open Sans', sans-serif !important;
     }
+
     .gr-label {
         font-family: 'Open Sans', sans-serif !important;
     }
+    """,
+) as iface:
     gr.Markdown("# llms.txt Generator")
     gr.Markdown("Generate an llms.txt file from a website following the specification.")
+
     with gr.Row():
         url_input = gr.Textbox(
+            label="Website URL",
             placeholder="Enter the website URL (e.g., example.com)",
+            info="The URL will be automatically prefixed with https:// if not provided",
         )
+
     with gr.Row():
         with gr.Column():
+            depth_input = gr.Slider(
+                minimum=1, maximum=5, value=3, step=1, label="Maximum Crawl Depth"
+            )
        with gr.Column():
+            pages_input = gr.Slider(
+                minimum=10, maximum=100, value=50, step=10, label="Maximum Pages"
+            )
+
     generate_btn = gr.Button("Generate llms.txt", variant="primary")
+
     output = gr.Textbox(
         label="Generated llms.txt Content",
         lines=20,
         show_copy_button=True,
+        container=True,
     )
+
     status = gr.Textbox(label="Status")
+
     generate_btn.click(
         fn=lambda url, depth, pages: asyncio.run(process_url(url, depth, pages)),
         inputs=[url_input, depth_input, pages_input],
+        outputs=[output, status],
     )

 if __name__ == "__main__":
+    iface.launch()
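
For reference, the process_url coroutine touched by this commit can also be driven directly, without the Gradio UI. The snippet below is a minimal sketch and not part of the commit; it assumes this file is importable as app and that the unchanged crawl_website method performs the actual crawl.

    import asyncio

    from app import process_url  # assumption: the Space's app.py is on the import path

    # process_url(url, max_depth, max_pages) returns (content, status),
    # exactly as the Gradio click handler in the diff expects.
    content, status = asyncio.run(process_url("example.com", max_depth=2, max_pages=10))
    print(status)   # "Successfully crawled N pages." or "Error: ..."
    print(content)  # the generated llms.txt text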