Update app.py
app.py CHANGED
@@ -18,126 +18,81 @@ class WebsiteCrawler:
         self.max_pages = max_pages
         self.visited_urls = set()
         self.url_metadata = defaultdict(dict)
+        self.homepage_metadata = None  # New field for homepage specific metadata
         self.headers = {
             "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
         }

-    def …
-        """…
-        …
+    def extract_homepage_description(self, soup):
+        """Extract description from homepage with multiple fallbacks"""
+        # Try meta description first
+        meta_desc = soup.find("meta", {"name": "description"})
+        if meta_desc and meta_desc.get("content"):
+            desc = meta_desc["content"]
+            if desc and len(desc.strip()) > 20:
+                return self.clean_text(desc)
+
+        # Try OpenGraph description
+        og_desc = soup.find("meta", property="og:description")
+        if og_desc and og_desc.get("content"):
+            desc = og_desc["content"]
+            if desc and len(desc.strip()) > 20:
+                return self.clean_text(desc)
+
+        # Try first significant paragraph
+        for p in soup.find_all("p"):
+            text = p.get_text().strip()
+            if len(text) > 50 and not any(x in text.lower() for x in ["cookie", "accept", "privacy"]):
+                return self.clean_text(text)
+
+        # Try main content area if exists
+        main = soup.find("main")
+        if main:
+            first_p = main.find("p")
+            if first_p:
+                text = first_p.get_text().strip()
+                if len(text) > 50:
+                    return self.clean_text(text)
+
+        return None
+
+    async def process_homepage(self, url):
+        """Specifically process the homepage to extract key metadata"""
         try:
             response = requests.get(url, headers=self.headers, timeout=10)
             response.encoding = "utf-8"
-            self.visited_urls.add(url)
-
             soup = BeautifulSoup(response.text, "html.parser")

-            # Extract …
-            if …
-            if not …
-            …
-            # Extract description with fallbacks
-            desc = None
-            meta_desc = soup.find("meta", {"name": "description"})
-            if meta_desc and meta_desc.get("content"):
-                desc = meta_desc["content"]
-            if not desc:
-                og_desc = soup.find("meta", property="og:description")
-                if og_desc and og_desc.get("content"):
-                    desc = og_desc["content"]
-            if not desc:
-                first_p = soup.find("p")
-                if first_p:
-                    desc = first_p.text
-
-            desc = self.clean_text(desc) if desc else ""
-
-            # Determine category and importance
-            url_lower = url.lower()
-            category = "Optional"
-            importance = 0
-
-            if "docs" in url_lower or "documentation" in url_lower:
-                category = "Docs"
-                importance = 5
-            elif "api" in url_lower:
-                category = "API"
-                importance = 4
-            elif "guide" in url_lower or "tutorial" in url_lower:
-                category = "Guides"
-                importance = 3
-            elif "example" in url_lower:
-                category = "Examples"
-                importance = 2
-            elif "blog" in url_lower:
-                category = "Blog"
-                importance = 1
-
-            # Store metadata
-            clean_url = re.sub(r"#.*", "", url).rstrip("/")
-            if title and len(title.strip()) > 0:  # Only store if we have a valid title
-                self.url_metadata[clean_url] = {
-                    "title": title,
-                    "description": desc,
-                    "category": category,
-                    "importance": importance,
-                }
-
-            # Find links
-            links = []
-            for a in soup.find_all("a", href=True):
-                href = a["href"]
-                if not any(
-                    x in href.lower()
-                    for x in ["javascript:", "mailto:", ".pdf", ".jpg", ".png", ".gif"]
-                ):
-                    next_url = urljoin(url, href)
-                    if urlparse(next_url).netloc == base_domain:
-                        links.append(next_url)
-            return links
+            # Extract site name with fallbacks
+            site_name = None
+            site_meta = soup.find("meta", property="og:site_name")
+            if site_meta and site_meta.get("content"):
+                site_name = site_meta["content"]
+            if not site_name:
+                site_name = soup.find("title").text if soup.find("title") else None
+            if not site_name:
+                site_name = urlparse(url).netloc.split('.')[0].capitalize()
+
+            # Get homepage description
+            description = self.extract_homepage_description(soup)
+
+            self.homepage_metadata = {
+                "site_name": self.clean_text(site_name, is_title=True),
+                "description": description
+            }

         except Exception as e:
-            logger.error(f"Error …
-            …
+            logger.error(f"Error processing homepage {url}: {str(e)}")
+            self.homepage_metadata = {
+                "site_name": urlparse(url).netloc.split('.')[0].capitalize(),
+                "description": None
+            }

     async def crawl_website(self, start_url):
         """Crawl website starting from the given URL"""
+        # First process the homepage
+        await self.process_homepage(start_url)
+
         base_domain = urlparse(start_url).netloc
         queue = [(start_url, 0)]
         seen = {start_url}
@@ -153,17 +108,6 @@ class WebsiteCrawler:
                     seen.add(link)
                     queue.append((link, depth + 1))

-    def clean_description(self, desc):
-        """Clean description text"""
-        if not desc:
-            return ""
-        # Remove leading dashes, hyphens, or colons
-        desc = re.sub(r"^[-:\s]+", "", desc)
-        # Remove any strings that are just "Editors", "APIs", etc.
-        if len(desc.split()) <= 1:
-            return ""
-        return desc.strip()
-
     def generate_llms_txt(self):
         """Generate llms.txt content"""
         if not self.url_metadata:
@@ -188,43 +132,23 @@ class WebsiteCrawler:
         # Generate content
         content = []

-        # …
-        main_title = "…
-
-        # Find a good description for the blockquote
-        best_description = None
-        for _, metadata in sorted_urls:
-            desc = self.clean_description(metadata["description"])
-            if desc and len(desc) > 20 and "null" not in desc.lower():
-                best_description = desc
-                break
+        # Use homepage metadata for main title and description
+        main_title = self.homepage_metadata.get("site_name", "Welcome")
+        homepage_description = self.homepage_metadata.get("description")

         content.append(f"# {main_title}")
-        if …
-            content.append(f"\n> {…
-        …
-            content.append(f"\n## {category}")
-
-            # Add links without extra newlines
-            links = []
-            for url, metadata in categories[category]:
-                title = metadata["title"].strip()
-                desc = self.clean_description(metadata["description"])
-                if desc:
-                    links.append(f"- [{title}]({url}): {desc}")
-                else:
-                    links.append(f"- [{title}]({url})")
-
-            content.append("\n".join(links))
+        if homepage_description:
+            content.append(f"\n> {homepage_description}")
+        elif len(sorted_urls) > 0:
+            # Fallback to first good description from content if no homepage description
+            for _, metadata in sorted_urls:
+                desc = self.clean_description(metadata["description"])
+                if desc and len(desc) > 20 and "null" not in desc.lower():
+                    content.append(f"\n> {desc}")
+                    break
+
+        # Rest of the generation remains the same...
+        # [Previous category grouping and link generation code]

         return "\n".join(content)
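For reference, the fallback order added in extract_homepage_description (meta description, then og:description, then the first substantial paragraph, then the first paragraph inside <main>) can be exercised on its own. The sketch below is a simplified stand-alone version, not the Space's code: clean_text here is a hypothetical whitespace-collapsing stand-in for the class's own helper.

# Stand-alone sketch of the same fallback chain (assumption: clean_text is a
# simplified whitespace normalizer, not the Space's actual clean_text helper).
import re

import requests
from bs4 import BeautifulSoup


def clean_text(text):
    # Collapse runs of whitespace; the real helper also handles titles specially.
    return re.sub(r"\s+", " ", text).strip()


def describe_homepage(url):
    soup = BeautifulSoup(requests.get(url, timeout=10).text, "html.parser")

    # 1. <meta name="description">
    meta = soup.find("meta", {"name": "description"})
    if meta and meta.get("content", "").strip():
        return clean_text(meta["content"])

    # 2. <meta property="og:description">
    og = soup.find("meta", property="og:description")
    if og and og.get("content", "").strip():
        return clean_text(og["content"])

    # 3. First substantial paragraph that is not cookie/privacy boilerplate
    for p in soup.find_all("p"):
        text = p.get_text().strip()
        if len(text) > 50 and not any(w in text.lower() for w in ("cookie", "accept", "privacy")):
            return clean_text(text)

    # 4. First paragraph inside <main>, if present
    main = soup.find("main")
    if main and main.find("p"):
        text = main.find("p").get_text().strip()
        if len(text) > 50:
            return clean_text(text)

    return None


if __name__ == "__main__":
    print(describe_homepage("https://example.com"))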
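Taken together, the change means crawl_website now records homepage metadata up front, and generate_llms_txt prefers that metadata for the title and blockquote, falling back to the best crawled-page description only when the homepage yields nothing. A rough driver sketch, assuming this app.py is importable as a module and that WebsiteCrawler takes max_pages as a constructor argument (as the top of the diff suggests):

# Hypothetical driver; the actual Space wires WebsiteCrawler into a Gradio UI,
# which is omitted here.
import asyncio

from app import WebsiteCrawler  # assumption: app.py is on the import path


async def build_llms_txt(start_url: str) -> str:
    crawler = WebsiteCrawler(max_pages=50)  # max_pages stored in __init__ per the diff
    await crawler.crawl_website(start_url)  # now calls process_homepage(start_url) first
    return crawler.generate_llms_txt()      # "# <site name>", "> <description>", then sections


if __name__ == "__main__":
    print(asyncio.run(build_llms_txt("https://example.com")))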