Update app.py
app.py CHANGED
@@ -152,6 +152,132 @@ class WebsiteCrawler:

        return "\n".join(content)

+    def clean_text(self, text, is_title=False):
+        """Clean and normalize text"""
+        if not text:
+            return ""
+        # Normalize unicode characters
+        text = unicodedata.normalize("NFKD", text)
+        text = re.sub(r"[^\x00-\x7F]+", "", text)
+
+        if is_title:
+            # Remove common suffixes and fragments for titles
+            text = re.sub(r"\s*[\|\-#:•].*", "", text)
+            text = re.sub(r"^\s*Welcome to\s+", "", text)
+            text = text.replace("docusaurus_skipToContent_fallback", "")
+
+        return " ".join(text.split()).strip()
+
+    def clean_description(self, desc):
+        """Clean description text"""
+        if not desc:
+            return ""
+        # Remove leading dashes, hyphens, or colons
+        desc = re.sub(r"^[-:\s]+", "", desc)
+        # Remove any strings that are just "Editors", "APIs", etc.
+        if len(desc.split()) <= 1:
+            return ""
+        return desc.strip()
+
+    async def crawl_page(self, url, depth, base_domain):
+        """Crawl a single page and extract information"""
+        if (
+            depth > self.max_depth
+            or url in self.visited_urls
+            or len(self.visited_urls) >= self.max_pages
+        ):
+            return []
+
+        try:
+            response = requests.get(url, headers=self.headers, timeout=10)
+            response.encoding = "utf-8"
+            self.visited_urls.add(url)
+
+            soup = BeautifulSoup(response.text, "html.parser")
+
+            # Extract title with fallbacks
+            title = None
+            meta_title = soup.find("meta", property="og:title")
+            if meta_title and meta_title.get("content"):
+                title = meta_title["content"]
+            if not title:
+                title_tag = soup.find("title")
+                if title_tag:
+                    title = title_tag.text
+            if not title:
+                h1_tag = soup.find("h1")
+                if h1_tag:
+                    title = h1_tag.text
+            if not title:
+                title = url.split("/")[-1]
+
+            title = self.clean_text(title, is_title=True)
+
+            # Extract description with fallbacks
+            desc = None
+            meta_desc = soup.find("meta", {"name": "description"})
+            if meta_desc and meta_desc.get("content"):
+                desc = meta_desc["content"]
+            if not desc:
+                og_desc = soup.find("meta", property="og:description")
+                if og_desc and og_desc.get("content"):
+                    desc = og_desc["content"]
+            if not desc:
+                first_p = soup.find("p")
+                if first_p:
+                    desc = first_p.text
+
+            desc = self.clean_text(desc) if desc else ""
+
+            # Determine category and importance
+            url_lower = url.lower()
+            category = "Optional"
+            importance = 0
+
+            if "docs" in url_lower or "documentation" in url_lower:
+                category = "Docs"
+                importance = 5
+            elif "api" in url_lower:
+                category = "API"
+                importance = 4
+            elif "guide" in url_lower or "tutorial" in url_lower:
+                category = "Guides"
+                importance = 3
+            elif "example" in url_lower:
+                category = "Examples"
+                importance = 2
+            elif "blog" in url_lower:
+                category = "Blog"
+                importance = 1
+
+            # Store metadata
+            clean_url = re.sub(r"#.*", "", url).rstrip("/")
+            if title and len(title.strip()) > 0:  # Only store if we have a valid title
+                self.url_metadata[clean_url] = {
+                    "title": title,
+                    "description": desc,
+                    "category": category,
+                    "importance": importance,
+                }
+
+            # Find links
+            links = []
+            for a in soup.find_all("a", href=True):
+                href = a["href"]
+                if not any(
+                    x in href.lower()
+                    for x in ["javascript:", "mailto:", ".pdf", ".jpg", ".png", ".gif"]
+                ):
+                    next_url = urljoin(url, href)
+                    if urlparse(next_url).netloc == base_domain:
+                        links.append(next_url)
+            return links
+
+        except Exception as e:
+            logger.error(f"Error crawling {url}: {str(e)}")
+            return []
+
+

async def process_url(url, max_depth, max_pages):
    """Process URL and generate llms.txt"""
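
Not part of the commit, just an illustration of what the new cleaning helpers do to typical scraped strings; the instance line is hypothetical, since WebsiteCrawler's constructor sits outside this hunk:

crawler = WebsiteCrawler()  # hypothetical: constructor args not shown in this diff
crawler.clean_text("Welcome to Gradio | Build & Share ML Apps", is_title=True)
# -> "Gradio"  (title regex cuts at the first |, -, #, : or • and drops "Welcome to")
crawler.clean_description("- A toolkit for building web UIs")
# -> "A toolkit for building web UIs"  (leading dashes/colons stripped)
crawler.clean_description("Editors")
# -> ""  (one-word descriptions are discarded)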
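
A rough sketch of how crawl_page might be driven, along the lines of what process_url presumably does further down in app.py: the method returns the same-domain links it found, so a breadth-first loop can keep feeding them back in until max_pages is reached. The crawl_site helper and the constructor call below are assumptions for illustration, not code from this Space.

import asyncio
from urllib.parse import urlparse

async def crawl_site(crawler, start_url):
    """Breadth-first crawl driven by WebsiteCrawler.crawl_page (illustrative helper)."""
    base_domain = urlparse(start_url).netloc
    queue = [(start_url, 0)]  # (url, depth) pairs
    while queue and len(crawler.visited_urls) < crawler.max_pages:
        url, depth = queue.pop(0)
        # crawl_page records metadata in crawler.url_metadata and returns same-domain links
        links = await crawler.crawl_page(url, depth, base_domain)
        queue.extend((link, depth + 1) for link in links)
    return crawler.url_metadata

# crawler = WebsiteCrawler(max_depth=2, max_pages=50)  # hypothetical constructor args
# metadata = asyncio.run(crawl_site(crawler, "https://example.com/docs"))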