cyberandy committed
Commit e9f1fb9 · verified · 1 Parent(s): 03dc650

Update app.py

Files changed (1): app.py (+126, -0)
app.py CHANGED
@@ -152,6 +152,132 @@ class WebsiteCrawler:
 
         return "\n".join(content)
 
+    def clean_text(self, text, is_title=False):
+        """Clean and normalize text"""
+        if not text:
+            return ""
+        # Normalize unicode characters
+        text = unicodedata.normalize("NFKD", text)
+        text = re.sub(r"[^\x00-\x7F]+", "", text)
+
+        if is_title:
+            # Remove common suffixes and fragments for titles
+            text = re.sub(r"\s*[\|\-#:•].*", "", text)
+            text = re.sub(r"^\s*Welcome to\s+", "", text)
+            text = text.replace("docusaurus_skipToContent_fallback", "")
+
+        return " ".join(text.split()).strip()
+
+    def clean_description(self, desc):
+        """Clean description text"""
+        if not desc:
+            return ""
+        # Remove leading dashes, hyphens, or colons
+        desc = re.sub(r"^[-:\s]+", "", desc)
+        # Remove any strings that are just "Editors", "APIs", etc.
+        if len(desc.split()) <= 1:
+            return ""
+        return desc.strip()
+
+    async def crawl_page(self, url, depth, base_domain):
+        """Crawl a single page and extract information"""
+        if (
+            depth > self.max_depth
+            or url in self.visited_urls
+            or len(self.visited_urls) >= self.max_pages
+        ):
+            return []
+
+        try:
+            response = requests.get(url, headers=self.headers, timeout=10)
+            response.encoding = "utf-8"
+            self.visited_urls.add(url)
+
+            soup = BeautifulSoup(response.text, "html.parser")
+
+            # Extract title with fallbacks
+            title = None
+            meta_title = soup.find("meta", property="og:title")
+            if meta_title and meta_title.get("content"):
+                title = meta_title["content"]
+            if not title:
+                title_tag = soup.find("title")
+                if title_tag:
+                    title = title_tag.text
+            if not title:
+                h1_tag = soup.find("h1")
+                if h1_tag:
+                    title = h1_tag.text
+            if not title:
+                title = url.split("/")[-1]
+
+            title = self.clean_text(title, is_title=True)
+
+            # Extract description with fallbacks
+            desc = None
+            meta_desc = soup.find("meta", {"name": "description"})
+            if meta_desc and meta_desc.get("content"):
+                desc = meta_desc["content"]
+            if not desc:
+                og_desc = soup.find("meta", property="og:description")
+                if og_desc and og_desc.get("content"):
+                    desc = og_desc["content"]
+            if not desc:
+                first_p = soup.find("p")
+                if first_p:
+                    desc = first_p.text
+
+            desc = self.clean_text(desc) if desc else ""
+
+            # Determine category and importance
+            url_lower = url.lower()
+            category = "Optional"
+            importance = 0
+
+            if "docs" in url_lower or "documentation" in url_lower:
+                category = "Docs"
+                importance = 5
+            elif "api" in url_lower:
+                category = "API"
+                importance = 4
+            elif "guide" in url_lower or "tutorial" in url_lower:
+                category = "Guides"
+                importance = 3
+            elif "example" in url_lower:
+                category = "Examples"
+                importance = 2
+            elif "blog" in url_lower:
+                category = "Blog"
+                importance = 1
+
+            # Store metadata
+            clean_url = re.sub(r"#.*", "", url).rstrip("/")
+            if title and len(title.strip()) > 0:  # Only store if we have a valid title
+                self.url_metadata[clean_url] = {
+                    "title": title,
+                    "description": desc,
+                    "category": category,
+                    "importance": importance,
+                }
+
+            # Find links
+            links = []
+            for a in soup.find_all("a", href=True):
+                href = a["href"]
+                if not any(
+                    x in href.lower()
+                    for x in ["javascript:", "mailto:", ".pdf", ".jpg", ".png", ".gif"]
+                ):
+                    next_url = urljoin(url, href)
+                    if urlparse(next_url).netloc == base_domain:
+                        links.append(next_url)
+            return links
+
+        except Exception as e:
+            logger.error(f"Error crawling {url}: {str(e)}")
+            return []
+
+
 
 async def process_url(url, max_depth, max_pages):
     """Process URL and generate llms.txt"""