Update app.py
Browse files
app.py
CHANGED
@@ -177,13 +177,7 @@ async def chat(
|
|
177 |
|
178 |
def extract_text_from_webpage(html_content):
|
179 |
"""Extracts visible text from HTML content using BeautifulSoup."""
|
180 |
-
|
181 |
-
# Remove unwanted tags
|
182 |
-
for tag in soup(["script", "style", "header", "footer", "nav"]):
|
183 |
-
tag.extract()
|
184 |
-
# Get the remaining visible text
|
185 |
-
visible_text = soup.get_text(strip=True)
|
186 |
-
return visible_text
|
187 |
|
188 |
async def fetch_and_extract(url, max_chars, proxy: Optional[str] = None):
|
189 |
"""Fetches a URL and extracts text asynchronously."""
|
@@ -248,19 +242,12 @@ async def web_search_and_extract(
|
|
248 |
|
249 |
def extract_text_from_webpage2(html_content):
|
250 |
"""Extracts visible text from HTML content using BeautifulSoup."""
|
251 |
-
|
252 |
-
|
253 |
-
|
254 |
-
tag.extract()
|
255 |
-
# Get the remaining visible text
|
256 |
-
visible_text = soup.get_text(strip=True)
|
257 |
-
return visible_text
|
258 |
-
|
259 |
-
def fetch_and_extract2(url, max_chars, proxy: Optional[str] = None):
|
260 |
"""Fetches a URL and extracts text using threading."""
|
261 |
-
proxies = {'http': proxy, 'https': proxy} if proxy else None
|
262 |
try:
|
263 |
-
response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"}
|
264 |
response.raise_for_status()
|
265 |
html_content = response.text
|
266 |
visible_text = extract_text_from_webpage2(html_content)
|
@@ -274,14 +261,13 @@ def fetch_and_extract2(url, max_chars, proxy: Optional[str] = None):
|
|
274 |
@app.get("/api/websearch-and-extract-threading")
|
275 |
def web_search_and_extract_threading(
|
276 |
q: str,
|
277 |
-
max_results: int =
|
278 |
timelimit: Optional[str] = None,
|
279 |
safesearch: str = "moderate",
|
280 |
region: str = "wt-wt",
|
281 |
backend: str = "html",
|
282 |
-
max_chars: int =
|
283 |
-
extract_only: bool = True
|
284 |
-
proxy: Optional[str] = None
|
285 |
):
|
286 |
"""
|
287 |
Searches using WEBS, extracts text from the top results using threading, and returns both.
|
@@ -297,7 +283,7 @@ def web_search_and_extract_threading(
|
|
297 |
threads = []
|
298 |
for result in search_results:
|
299 |
if 'href' in result:
|
300 |
-
thread = threading.Thread(target=lambda: extracted_results.append(fetch_and_extract2(result['href'], max_chars
|
301 |
threads.append(thread)
|
302 |
thread.start()
|
303 |
|
|
|
def extract_text_from_webpage(html_content):
    """Extract the visible text from an HTML document.

    Args:
        html_content: Raw HTML markup as a string.

    Returns:
        str: All document text concatenated, with leading/trailing
        whitespace stripped from each segment.
    """
    # Name the parser explicitly: calling BeautifulSoup without a
    # `features` argument emits GuessedAtParserWarning and the chosen
    # parser (lxml vs. html.parser) varies by installation, which can
    # change the extracted text between environments.
    return BeautifulSoup(html_content, "html.parser").get_text(strip=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
181 |
|
182 |
async def fetch_and_extract(url, max_chars, proxy: Optional[str] = None):
|
183 |
"""Fetches a URL and extracts text asynchronously."""
|
|
|
def extract_text_from_webpage2(html_content):
    """Extract the visible text from an HTML document.

    Args:
        html_content: Raw HTML markup as a string.

    Returns:
        str: All document text concatenated, with leading/trailing
        whitespace stripped from each segment.
    """
    # Pin the stdlib parser so behavior is deterministic: without a
    # `features` argument bs4 warns (GuessedAtParserWarning) and picks
    # whichever parser is installed, which can alter the output text.
    return BeautifulSoup(html_content, "html.parser").get_text(strip=True)
|
246 |
+
|
247 |
+
def fetch_and_extract2(url, max_chars):
|
|
|
|
|
|
|
|
|
|
|
|
|
248 |
"""Fetches a URL and extracts text using threading."""
|
|
|
249 |
try:
|
250 |
+
response = requests.get(url, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"})
|
251 |
response.raise_for_status()
|
252 |
html_content = response.text
|
253 |
visible_text = extract_text_from_webpage2(html_content)
|
|
|
261 |
@app.get("/api/websearch-and-extract-threading")
|
262 |
def web_search_and_extract_threading(
|
263 |
q: str,
|
264 |
+
max_results: int = 10,
|
265 |
timelimit: Optional[str] = None,
|
266 |
safesearch: str = "moderate",
|
267 |
region: str = "wt-wt",
|
268 |
backend: str = "html",
|
269 |
+
max_chars: int = 10000,
|
270 |
+
extract_only: bool = True
|
|
|
271 |
):
|
272 |
"""
|
273 |
Searches using WEBS, extracts text from the top results using threading, and returns both.
|
|
|
283 |
threads = []
|
284 |
for result in search_results:
|
285 |
if 'href' in result:
|
286 |
+
thread = threading.Thread(target=lambda: extracted_results.append(fetch_and_extract2(result['href'], max_chars)))
|
287 |
threads.append(thread)
|
288 |
thread.start()
|
289 |
|