cyberandy committed on
Commit 4206e10 · verified · 1 Parent(s): 66fe9ad

Update app.py

Files changed (1):
  1. app.py (+136 -136)
app.py CHANGED
@@ -23,6 +23,33 @@ class WebsiteCrawler:
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
         }
 
+    def clean_text(self, text, is_title=False):
+        """Clean and normalize text"""
+        if not text:
+            return ""
+        # Normalize unicode characters
+        text = unicodedata.normalize("NFKD", text)
+        text = re.sub(r"[^\x00-\x7F]+", "", text)
+
+        if is_title:
+            # Remove common suffixes and fragments for titles
+            text = re.sub(r"\s*[\|\-#:•].*", "", text)
+            text = re.sub(r"^\s*Welcome to\s+", "", text)
+            text = text.replace("docusaurus_skipToContent_fallback", "")
+
+        return " ".join(text.split()).strip()
+
+    def clean_description(self, desc):
+        """Clean description text"""
+        if not desc:
+            return ""
+        # Remove leading dashes, hyphens, or colons
+        desc = re.sub(r"^[-:\s]+", "", desc)
+        # Remove any strings that are just "Editors", "APIs", etc.
+        if len(desc.split()) <= 1:
+            return ""
+        return desc.strip()
+
     def extract_homepage_description(self, soup):
         """Extract description from homepage with multiple fallbacks"""
         # Try meta description first
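The two helpers added above are pure string utilities, so their behavior is easy to check in isolation. A minimal sketch using standalone copies of the helpers (the sample inputs and expected outputs are illustrative, not taken from the app's tests):

```python
import re
import unicodedata

# Standalone copies of the two helpers added in this hunk, for illustration only.
def clean_text(text, is_title=False):
    if not text:
        return ""
    text = unicodedata.normalize("NFKD", text)
    text = re.sub(r"[^\x00-\x7F]+", "", text)        # drop non-ASCII characters
    if is_title:
        text = re.sub(r"\s*[\|\-#:•].*", "", text)   # cut trailing "| Site Name"-style suffixes
        text = re.sub(r"^\s*Welcome to\s+", "", text)
        text = text.replace("docusaurus_skipToContent_fallback", "")
    return " ".join(text.split()).strip()

def clean_description(desc):
    if not desc:
        return ""
    desc = re.sub(r"^[-:\s]+", "", desc)             # strip leading dashes, hyphens, colons
    if len(desc.split()) <= 1:                       # drop one-word fragments like "APIs"
        return ""
    return desc.strip()

print(clean_text("Welcome to Acme Docs | Getting Started", is_title=True))
# -> "Acme Docs"
print(clean_description("- A crawler that generates llms.txt files"))
# -> "A crawler that generates llms.txt files"
print(clean_description("APIs"))
# -> ""
```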
@@ -56,6 +83,104 @@ class WebsiteCrawler:
 
         return None
 
+    async def crawl_page(self, url, depth, base_domain):
+        """Crawl a single page and extract information"""
+        if (
+            depth > self.max_depth
+            or url in self.visited_urls
+            or len(self.visited_urls) >= self.max_pages
+        ):
+            return []
+
+        try:
+            response = requests.get(url, headers=self.headers, timeout=10)
+            response.encoding = "utf-8"
+            self.visited_urls.add(url)
+
+            soup = BeautifulSoup(response.text, "html.parser")
+
+            # Extract title with fallbacks
+            title = None
+            meta_title = soup.find("meta", property="og:title")
+            if meta_title and meta_title.get("content"):
+                title = meta_title["content"]
+            if not title:
+                title_tag = soup.find("title")
+                if title_tag:
+                    title = title_tag.text
+            if not title:
+                h1_tag = soup.find("h1")
+                if h1_tag:
+                    title = h1_tag.text
+            if not title:
+                title = url.split("/")[-1]
+
+            title = self.clean_text(title, is_title=True)
+
+            # Extract description with fallbacks
+            desc = None
+            meta_desc = soup.find("meta", {"name": "description"})
+            if meta_desc and meta_desc.get("content"):
+                desc = meta_desc["content"]
+            if not desc:
+                og_desc = soup.find("meta", property="og:description")
+                if og_desc and og_desc.get("content"):
+                    desc = og_desc["content"]
+            if not desc:
+                first_p = soup.find("p")
+                if first_p:
+                    desc = first_p.text
+
+            desc = self.clean_text(desc) if desc else ""
+
+            # Determine category and importance
+            url_lower = url.lower()
+            category = "Optional"
+            importance = 0
+
+            if "docs" in url_lower or "documentation" in url_lower:
+                category = "Docs"
+                importance = 5
+            elif "api" in url_lower:
+                category = "API"
+                importance = 4
+            elif "guide" in url_lower or "tutorial" in url_lower:
+                category = "Guides"
+                importance = 3
+            elif "example" in url_lower:
+                category = "Examples"
+                importance = 2
+            elif "blog" in url_lower:
+                category = "Blog"
+                importance = 1
+
+            # Store metadata
+            clean_url = re.sub(r"#.*", "", url).rstrip("/")
+            if title and len(title.strip()) > 0:  # Only store if we have a valid title
+                self.url_metadata[clean_url] = {
+                    "title": title,
+                    "description": desc,
+                    "category": category,
+                    "importance": importance,
+                }
+
+            # Find links
+            links = []
+            for a in soup.find_all("a", href=True):
+                href = a["href"]
+                if not any(
+                    x in href.lower()
+                    for x in ["javascript:", "mailto:", ".pdf", ".jpg", ".png", ".gif"]
+                ):
+                    next_url = urljoin(url, href)
+                    if urlparse(next_url).netloc == base_domain:
+                        links.append(next_url)
+            return links
+
+        except Exception as e:
+            logger.error(f"Error crawling {url}: {str(e)}")
+            return []
+
     async def process_homepage(self, url):
         """Specifically process the homepage to extract key metadata"""
         try:
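crawl_page returns the same-domain links it discovers, and it enforces max_depth, max_pages, and the visited-URL set itself, so a caller only needs to keep feeding the returned links back in. A minimal driver sketch; the WebsiteCrawler constructor call is hypothetical, since __init__ is not part of this diff:

```python
import asyncio
from urllib.parse import urlparse

async def crawl_site(crawler, start_url):
    """Breadth-first walk driven by the links crawl_page returns (illustrative sketch)."""
    base_domain = urlparse(start_url).netloc
    queue = [(start_url, 0)]  # (url, depth)
    while queue:
        url, depth = queue.pop(0)
        # crawl_page skips visited URLs and stops adding pages once max_pages is hit.
        for link in await crawler.crawl_page(url, depth, base_domain):
            queue.append((link, depth + 1))

# Hypothetical usage; the WebsiteCrawler constructor is not shown in this diff:
# crawler = WebsiteCrawler(...)
# asyncio.run(crawl_site(crawler, "https://example.com"))
```

Note that crawl_page fetches pages with the synchronous requests.get, so this loop crawls one page at a time even though the method is declared async.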
@@ -112,11 +237,11 @@ class WebsiteCrawler:
         """Generate llms.txt content"""
         if not self.url_metadata:
             return "No content was found to generate llms.txt"
-
+
         # Sort URLs by importance and remove duplicates
         sorted_urls = []
         seen_titles = set()
-
+
         for url, metadata in sorted(
             self.url_metadata.items(),
             key=lambda x: (x[1]["importance"], x[0]),
@@ -125,17 +250,17 @@ class WebsiteCrawler:
             if metadata["title"] not in seen_titles:
                 sorted_urls.append((url, metadata))
                 seen_titles.add(metadata["title"])
-
+
         if not sorted_urls:
             return "No valid content was found"
-
+
         # Generate content
         content = []
-
+
         # Use homepage metadata for main title and description
         main_title = self.homepage_metadata.get("site_name", "Welcome")
         homepage_description = self.homepage_metadata.get("description")
-
+
         content.append(f"# {main_title}")
         if homepage_description:
             content.append(f"\n> {homepage_description}")
@@ -146,18 +271,18 @@ class WebsiteCrawler:
             if desc and len(desc) > 20 and "null" not in desc.lower():
                 content.append(f"\n> {desc}")
                 break
-
+
         # Group by category
         categories = defaultdict(list)
         for url, metadata in sorted_urls:
             if metadata["title"] and url:
                 categories[metadata["category"]].append((url, metadata))
-
+
         # Add sections
         for category in ["Docs", "API", "Guides", "Examples", "Blog", "Optional"]:
             if category in categories:
                 content.append(f"\n## {category}")
-
+
                 # Add links without extra newlines
                 links = []
                 for url, metadata in categories[category]:
@@ -167,135 +292,10 @@ class WebsiteCrawler:
                         links.append(f"- [{title}]({url}): {desc}")
                     else:
                         links.append(f"- [{title}]({url})")
-
+
                 content.append("\n".join(links))
-
-        return "\n".join(content)
 
-    def clean_text(self, text, is_title=False):
-        """Clean and normalize text"""
-        if not text:
-            return ""
-        # Normalize unicode characters
-        text = unicodedata.normalize("NFKD", text)
-        text = re.sub(r"[^\x00-\x7F]+", "", text)
-
-        if is_title:
-            # Remove common suffixes and fragments for titles
-            text = re.sub(r"\s*[\|\-#:•].*", "", text)
-            text = re.sub(r"^\s*Welcome to\s+", "", text)
-            text = text.replace("docusaurus_skipToContent_fallback", "")
-
-        return " ".join(text.split()).strip()
-
-    def clean_description(self, desc):
-        """Clean description text"""
-        if not desc:
-            return ""
-        # Remove leading dashes, hyphens, or colons
-        desc = re.sub(r"^[-:\s]+", "", desc)
-        # Remove any strings that are just "Editors", "APIs", etc.
-        if len(desc.split()) <= 1:
-            return ""
-        return desc.strip()
-
-    async def crawl_page(self, url, depth, base_domain):
-        """Crawl a single page and extract information"""
-        if (
-            depth > self.max_depth
-            or url in self.visited_urls
-            or len(self.visited_urls) >= self.max_pages
-        ):
-            return []
-
-        try:
-            response = requests.get(url, headers=self.headers, timeout=10)
-            response.encoding = "utf-8"
-            self.visited_urls.add(url)
-
-            soup = BeautifulSoup(response.text, "html.parser")
-
-            # Extract title with fallbacks
-            title = None
-            meta_title = soup.find("meta", property="og:title")
-            if meta_title and meta_title.get("content"):
-                title = meta_title["content"]
-            if not title:
-                title_tag = soup.find("title")
-                if title_tag:
-                    title = title_tag.text
-            if not title:
-                h1_tag = soup.find("h1")
-                if h1_tag:
-                    title = h1_tag.text
-            if not title:
-                title = url.split("/")[-1]
-
-            title = self.clean_text(title, is_title=True)
-
-            # Extract description with fallbacks
-            desc = None
-            meta_desc = soup.find("meta", {"name": "description"})
-            if meta_desc and meta_desc.get("content"):
-                desc = meta_desc["content"]
-            if not desc:
-                og_desc = soup.find("meta", property="og:description")
-                if og_desc and og_desc.get("content"):
-                    desc = og_desc["content"]
-            if not desc:
-                first_p = soup.find("p")
-                if first_p:
-                    desc = first_p.text
-
-            desc = self.clean_text(desc) if desc else ""
-
-            # Determine category and importance
-            url_lower = url.lower()
-            category = "Optional"
-            importance = 0
-
-            if "docs" in url_lower or "documentation" in url_lower:
-                category = "Docs"
-                importance = 5
-            elif "api" in url_lower:
-                category = "API"
-                importance = 4
-            elif "guide" in url_lower or "tutorial" in url_lower:
-                category = "Guides"
-                importance = 3
-            elif "example" in url_lower:
-                category = "Examples"
-                importance = 2
-            elif "blog" in url_lower:
-                category = "Blog"
-                importance = 1
-
-            # Store metadata
-            clean_url = re.sub(r"#.*", "", url).rstrip("/")
-            if title and len(title.strip()) > 0:  # Only store if we have a valid title
-                self.url_metadata[clean_url] = {
-                    "title": title,
-                    "description": desc,
-                    "category": category,
-                    "importance": importance,
-                }
-
-            # Find links
-            links = []
-            for a in soup.find_all("a", href=True):
-                href = a["href"]
-                if not any(
-                    x in href.lower()
-                    for x in ["javascript:", "mailto:", ".pdf", ".jpg", ".png", ".gif"]
-                ):
-                    next_url = urljoin(url, href)
-                    if urlparse(next_url).netloc == base_domain:
-                        links.append(next_url)
-            return links
-
-        except Exception as e:
-            logger.error(f"Error crawling {url}: {str(e)}")
-            return []
+        return "\n".join(content)
 
 
 
 
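The later hunks tidy whitespace in generate_llms_txt, reposition the method's final return, and drop the old copies of the relocated methods. generate_llms_txt itself assembles the llms.txt output as plain Markdown: an H1 taken from the homepage site name, an optional block-quoted description, then one H2 section per category with bullet links. An illustrative sketch of that shape, with an invented site name and URLs:

```python
# Illustrative llms.txt shape assembled by generate_llms_txt; the site name,
# description, and URLs below are invented for the example.
example_llms_txt = """# Acme

> Acme is a made-up product used here only to show the layout.

## Docs
- [Getting Started](https://acme.example/docs/getting-started): Install and configure Acme
- [Configuration](https://acme.example/docs/configuration)

## API
- [REST Reference](https://acme.example/api): Endpoints and authentication
"""
print(example_llms_txt)
```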