cyberandy commited on
Commit
03dc650
·
verified ·
1 Parent(s): dd2349f

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +76 -152
app.py CHANGED
@@ -18,126 +18,81 @@ class WebsiteCrawler:
18
  self.max_pages = max_pages
19
  self.visited_urls = set()
20
  self.url_metadata = defaultdict(dict)
 
21
  self.headers = {
22
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
23
  }
24
 
25
- def clean_text(self, text, is_title=False):
26
- """Clean and normalize text"""
27
- if not text:
28
- return ""
29
- # Normalize unicode characters
30
- text = unicodedata.normalize("NFKD", text)
31
- text = re.sub(r"[^\x00-\x7F]+", "", text)
32
-
33
- if is_title:
34
- # Remove common suffixes and fragments for titles
35
- text = re.sub(r"\s*[\|\-#:•].*", "", text)
36
- text = re.sub(r"^\s*Welcome to\s+", "", text)
37
- text = text.replace("docusaurus_skipToContent_fallback", "")
38
-
39
- return " ".join(text.split()).strip()
40
-
41
- async def crawl_page(self, url, depth, base_domain):
42
- """Crawl a single page and extract information"""
43
- if (
44
- depth > self.max_depth
45
- or url in self.visited_urls
46
- or len(self.visited_urls) >= self.max_pages
47
- ):
48
- return []
49
-
 
 
 
 
 
 
 
 
 
 
50
  try:
51
  response = requests.get(url, headers=self.headers, timeout=10)
52
  response.encoding = "utf-8"
53
- self.visited_urls.add(url)
54
-
55
  soup = BeautifulSoup(response.text, "html.parser")
56
 
57
- # Extract title with fallbacks
58
- title = None
59
- meta_title = soup.find("meta", property="og:title")
60
- if meta_title and meta_title.get("content"):
61
- title = meta_title["content"]
62
- if not title:
63
- title_tag = soup.find("title")
64
- if title_tag:
65
- title = title_tag.text
66
- if not title:
67
- h1_tag = soup.find("h1")
68
- if h1_tag:
69
- title = h1_tag.text
70
- if not title:
71
- title = url.split("/")[-1]
72
-
73
- title = self.clean_text(title, is_title=True)
74
-
75
- # Extract description with fallbacks
76
- desc = None
77
- meta_desc = soup.find("meta", {"name": "description"})
78
- if meta_desc and meta_desc.get("content"):
79
- desc = meta_desc["content"]
80
- if not desc:
81
- og_desc = soup.find("meta", property="og:description")
82
- if og_desc and og_desc.get("content"):
83
- desc = og_desc["content"]
84
- if not desc:
85
- first_p = soup.find("p")
86
- if first_p:
87
- desc = first_p.text
88
-
89
- desc = self.clean_text(desc) if desc else ""
90
-
91
- # Determine category and importance
92
- url_lower = url.lower()
93
- category = "Optional"
94
- importance = 0
95
-
96
- if "docs" in url_lower or "documentation" in url_lower:
97
- category = "Docs"
98
- importance = 5
99
- elif "api" in url_lower:
100
- category = "API"
101
- importance = 4
102
- elif "guide" in url_lower or "tutorial" in url_lower:
103
- category = "Guides"
104
- importance = 3
105
- elif "example" in url_lower:
106
- category = "Examples"
107
- importance = 2
108
- elif "blog" in url_lower:
109
- category = "Blog"
110
- importance = 1
111
-
112
- # Store metadata
113
- clean_url = re.sub(r"#.*", "", url).rstrip("/")
114
- if title and len(title.strip()) > 0: # Only store if we have a valid title
115
- self.url_metadata[clean_url] = {
116
- "title": title,
117
- "description": desc,
118
- "category": category,
119
- "importance": importance,
120
- }
121
-
122
- # Find links
123
- links = []
124
- for a in soup.find_all("a", href=True):
125
- href = a["href"]
126
- if not any(
127
- x in href.lower()
128
- for x in ["javascript:", "mailto:", ".pdf", ".jpg", ".png", ".gif"]
129
- ):
130
- next_url = urljoin(url, href)
131
- if urlparse(next_url).netloc == base_domain:
132
- links.append(next_url)
133
- return links
134
 
135
  except Exception as e:
136
- logger.error(f"Error crawling {url}: {str(e)}")
137
- return []
 
 
 
138
 
139
  async def crawl_website(self, start_url):
140
  """Crawl website starting from the given URL"""
 
 
 
141
  base_domain = urlparse(start_url).netloc
142
  queue = [(start_url, 0)]
143
  seen = {start_url}
@@ -153,17 +108,6 @@ class WebsiteCrawler:
153
  seen.add(link)
154
  queue.append((link, depth + 1))
155
 
156
- def clean_description(self, desc):
157
- """Clean description text"""
158
- if not desc:
159
- return ""
160
- # Remove leading dashes, hyphens, or colons
161
- desc = re.sub(r"^[-:\s]+", "", desc)
162
- # Remove any strings that are just "Editors", "APIs", etc.
163
- if len(desc.split()) <= 1:
164
- return ""
165
- return desc.strip()
166
-
167
  def generate_llms_txt(self):
168
  """Generate llms.txt content"""
169
  if not self.url_metadata:
@@ -188,43 +132,23 @@ class WebsiteCrawler:
188
  # Generate content
189
  content = []
190
 
191
- # Find the best title for the main header (prefer "Welcome" or "Overview")
192
- main_title = "Welcome" # Default to Welcome
193
-
194
- # Find a good description for the blockquote
195
- best_description = None
196
- for _, metadata in sorted_urls:
197
- desc = self.clean_description(metadata["description"])
198
- if desc and len(desc) > 20 and "null" not in desc.lower():
199
- best_description = desc
200
- break
201
 
202
  content.append(f"# {main_title}")
203
- if best_description:
204
- content.append(f"\n> {best_description}")
205
-
206
- # Group by category
207
- categories = defaultdict(list)
208
- for url, metadata in sorted_urls:
209
- if metadata["title"] and url:
210
- categories[metadata["category"]].append((url, metadata))
211
-
212
- # Add sections
213
- for category in ["Docs", "API", "Guides", "Examples", "Blog", "Optional"]:
214
- if category in categories:
215
- content.append(f"\n## {category}")
216
-
217
- # Add links without extra newlines
218
- links = []
219
- for url, metadata in categories[category]:
220
- title = metadata["title"].strip()
221
- desc = self.clean_description(metadata["description"])
222
- if desc:
223
- links.append(f"- [{title}]({url}): {desc}")
224
- else:
225
- links.append(f"- [{title}]({url})")
226
-
227
- content.append("\n".join(links))
228
 
229
  return "\n".join(content)
230
 
 
18
  self.max_pages = max_pages
19
  self.visited_urls = set()
20
  self.url_metadata = defaultdict(dict)
21
+ self.homepage_metadata = None # New field for homepage specific metadata
22
  self.headers = {
23
  "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
24
  }
25
 
26
+ def extract_homepage_description(self, soup):
27
+ """Extract description from homepage with multiple fallbacks"""
28
+ # Try meta description first
29
+ meta_desc = soup.find("meta", {"name": "description"})
30
+ if meta_desc and meta_desc.get("content"):
31
+ desc = meta_desc["content"]
32
+ if desc and len(desc.strip()) > 20:
33
+ return self.clean_text(desc)
34
+
35
+ # Try OpenGraph description
36
+ og_desc = soup.find("meta", property="og:description")
37
+ if og_desc and og_desc.get("content"):
38
+ desc = og_desc["content"]
39
+ if desc and len(desc.strip()) > 20:
40
+ return self.clean_text(desc)
41
+
42
+ # Try first significant paragraph
43
+ for p in soup.find_all("p"):
44
+ text = p.get_text().strip()
45
+ if len(text) > 50 and not any(x in text.lower() for x in ["cookie", "accept", "privacy"]):
46
+ return self.clean_text(text)
47
+
48
+ # Try main content area if exists
49
+ main = soup.find("main")
50
+ if main:
51
+ first_p = main.find("p")
52
+ if first_p:
53
+ text = first_p.get_text().strip()
54
+ if len(text) > 50:
55
+ return self.clean_text(text)
56
+
57
+ return None
58
+
59
+ async def process_homepage(self, url):
60
+ """Specifically process the homepage to extract key metadata"""
61
  try:
62
  response = requests.get(url, headers=self.headers, timeout=10)
63
  response.encoding = "utf-8"
 
 
64
  soup = BeautifulSoup(response.text, "html.parser")
65
 
66
+ # Extract site name with fallbacks
67
+ site_name = None
68
+ site_meta = soup.find("meta", property="og:site_name")
69
+ if site_meta and site_meta.get("content"):
70
+ site_name = site_meta["content"]
71
+ if not site_name:
72
+ site_name = soup.find("title").text if soup.find("title") else None
73
+ if not site_name:
74
+ site_name = urlparse(url).netloc.split('.')[0].capitalize()
75
+
76
+ # Get homepage description
77
+ description = self.extract_homepage_description(soup)
78
+
79
+ self.homepage_metadata = {
80
+ "site_name": self.clean_text(site_name, is_title=True),
81
+ "description": description
82
+ }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
83
 
84
  except Exception as e:
85
+ logger.error(f"Error processing homepage {url}: {str(e)}")
86
+ self.homepage_metadata = {
87
+ "site_name": urlparse(url).netloc.split('.')[0].capitalize(),
88
+ "description": None
89
+ }
90
 
91
  async def crawl_website(self, start_url):
92
  """Crawl website starting from the given URL"""
93
+ # First process the homepage
94
+ await self.process_homepage(start_url)
95
+
96
  base_domain = urlparse(start_url).netloc
97
  queue = [(start_url, 0)]
98
  seen = {start_url}
 
108
  seen.add(link)
109
  queue.append((link, depth + 1))
110
 
 
 
 
 
 
 
 
 
 
 
 
111
  def generate_llms_txt(self):
112
  """Generate llms.txt content"""
113
  if not self.url_metadata:
 
132
  # Generate content
133
  content = []
134
 
135
+ # Use homepage metadata for main title and description
136
+ main_title = self.homepage_metadata.get("site_name", "Welcome")
137
+ homepage_description = self.homepage_metadata.get("description")
 
 
 
 
 
 
 
138
 
139
  content.append(f"# {main_title}")
140
+ if homepage_description:
141
+ content.append(f"\n> {homepage_description}")
142
+ elif len(sorted_urls) > 0:
143
+ # Fallback to first good description from content if no homepage description
144
+ for _, metadata in sorted_urls:
145
+ desc = self.clean_description(metadata["description"])
146
+ if desc and len(desc) > 20 and "null" not in desc.lower():
147
+ content.append(f"\n> {desc}")
148
+ break
149
+
150
+ # Rest of the generation remains the same...
151
+ # [Previous category grouping and link generation code]
 
 
 
 
 
 
 
 
 
 
 
 
 
152
 
153
  return "\n".join(content)
154