Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -49,16 +49,39 @@ class WebsiteCrawler:
|
|
49 |
|
50 |
soup = BeautifulSoup(response.text, 'html.parser')
|
51 |
|
52 |
-
# Extract
|
53 |
-
title =
|
54 |
-
|
55 |
-
|
56 |
-
|
57 |
-
|
58 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
59 |
|
60 |
-
desc =
|
61 |
-
desc = self.clean_text(desc['content'] if desc else '')
|
62 |
|
63 |
# Determine category and importance
|
64 |
url_lower = url.lower()
|
@@ -71,22 +94,35 @@ class WebsiteCrawler:
|
|
71 |
elif 'api' in url_lower:
|
72 |
category = 'API'
|
73 |
importance = 4
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
74 |
|
75 |
# Store metadata
|
76 |
clean_url = re.sub(r'#.*', '', url).rstrip('/')
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
82 |
-
|
|
|
83 |
|
84 |
# Find links
|
85 |
-
|
86 |
-
|
87 |
-
|
88 |
-
if not any(x in
|
89 |
-
|
|
|
|
|
|
|
90 |
|
91 |
except Exception as e:
|
92 |
logger.error(f"Error crawling {url}: {str(e)}")
|
@@ -176,8 +212,41 @@ async def process_url(url, max_depth, max_pages):
|
|
176 |
theme = gr.themes.Soft(primary_hue="blue", font="Open Sans")
|
177 |
|
178 |
with gr.Blocks(theme=theme, css="""
|
179 |
-
.
|
180 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
181 |
""") as iface:
|
182 |
gr.Markdown("# llms.txt Generator")
|
183 |
gr.Markdown("Generate an llms.txt file from a website following the specification.")
|
|
|
49 |
|
50 |
soup = BeautifulSoup(response.text, 'html.parser')
|
51 |
|
52 |
+
# Extract title with fallbacks
|
53 |
+
title = None
|
54 |
+
meta_title = soup.find('meta', property='og:title')
|
55 |
+
if meta_title and meta_title.get('content'):
|
56 |
+
title = meta_title['content']
|
57 |
+
if not title:
|
58 |
+
title_tag = soup.find('title')
|
59 |
+
if title_tag:
|
60 |
+
title = title_tag.text
|
61 |
+
if not title:
|
62 |
+
h1_tag = soup.find('h1')
|
63 |
+
if h1_tag:
|
64 |
+
title = h1_tag.text
|
65 |
+
if not title:
|
66 |
+
title = url.split('/')[-1]
|
67 |
+
|
68 |
+
title = self.clean_text(title, is_title=True)
|
69 |
+
|
70 |
+
# Extract description with fallbacks
|
71 |
+
desc = None
|
72 |
+
meta_desc = soup.find('meta', {'name': 'description'})
|
73 |
+
if meta_desc and meta_desc.get('content'):
|
74 |
+
desc = meta_desc['content']
|
75 |
+
if not desc:
|
76 |
+
og_desc = soup.find('meta', property='og:description')
|
77 |
+
if og_desc and og_desc.get('content'):
|
78 |
+
desc = og_desc['content']
|
79 |
+
if not desc:
|
80 |
+
first_p = soup.find('p')
|
81 |
+
if first_p:
|
82 |
+
desc = first_p.text
|
83 |
|
84 |
+
desc = self.clean_text(desc) if desc else ""
|
|
|
85 |
|
86 |
# Determine category and importance
|
87 |
url_lower = url.lower()
|
|
|
94 |
elif 'api' in url_lower:
|
95 |
category = 'API'
|
96 |
importance = 4
|
97 |
+
elif 'guide' in url_lower or 'tutorial' in url_lower:
|
98 |
+
category = 'Guides'
|
99 |
+
importance = 3
|
100 |
+
elif 'example' in url_lower:
|
101 |
+
category = 'Examples'
|
102 |
+
importance = 2
|
103 |
+
elif 'blog' in url_lower:
|
104 |
+
category = 'Blog'
|
105 |
+
importance = 1
|
106 |
|
107 |
# Store metadata
|
108 |
clean_url = re.sub(r'#.*', '', url).rstrip('/')
|
109 |
+
if title and len(title.strip()) > 0: # Only store if we have a valid title
|
110 |
+
self.url_metadata[clean_url] = {
|
111 |
+
'title': title,
|
112 |
+
'description': desc,
|
113 |
+
'category': category,
|
114 |
+
'importance': importance
|
115 |
+
}
|
116 |
|
117 |
# Find links
|
118 |
+
links = []
|
119 |
+
for a in soup.find_all('a', href=True):
|
120 |
+
href = a['href']
|
121 |
+
if not any(x in href.lower() for x in ['javascript:', 'mailto:', '.pdf', '.jpg', '.png', '.gif']):
|
122 |
+
next_url = urljoin(url, href)
|
123 |
+
if urlparse(next_url).netloc == base_domain:
|
124 |
+
links.append(next_url)
|
125 |
+
return links
|
126 |
|
127 |
except Exception as e:
|
128 |
logger.error(f"Error crawling {url}: {str(e)}")
|
|
|
212 |
theme = gr.themes.Soft(primary_hue="blue", font="Open Sans")
|
213 |
|
214 |
with gr.Blocks(theme=theme, css="""
|
215 |
+
@import url('https://fonts.googleapis.com/css2?family=Open+Sans:wght@400;600&display=swap');
|
216 |
+
|
217 |
+
.gradio-container {
|
218 |
+
font-family: 'Open Sans', sans-serif !important;
|
219 |
+
}
|
220 |
+
|
221 |
+
.gr-button {
|
222 |
+
font-family: 'Open Sans', sans-serif !important;
|
223 |
+
font-weight: 600 !important;
|
224 |
+
}
|
225 |
+
|
226 |
+
.primary-btn {
|
227 |
+
background-color: #2436d4 !important;
|
228 |
+
color: white !important;
|
229 |
+
}
|
230 |
+
|
231 |
+
.primary-btn:hover {
|
232 |
+
background-color: #1c2aa8 !important;
|
233 |
+
}
|
234 |
+
|
235 |
+
[data-testid="textbox"] {
|
236 |
+
font-family: 'Open Sans', sans-serif !important;
|
237 |
+
}
|
238 |
+
|
239 |
+
.gr-padded {
|
240 |
+
font-family: 'Open Sans', sans-serif !important;
|
241 |
+
}
|
242 |
+
|
243 |
+
.gr-input {
|
244 |
+
font-family: 'Open Sans', sans-serif !important;
|
245 |
+
}
|
246 |
+
|
247 |
+
.gr-label {
|
248 |
+
font-family: 'Open Sans', sans-serif !important;
|
249 |
+
}
|
250 |
""") as iface:
|
251 |
gr.Markdown("# llms.txt Generator")
|
252 |
gr.Markdown("Generate an llms.txt file from a website following the specification.")
|