Spaces:
Running
Running
Update app.py
Browse files
app.py
CHANGED
@@ -8,6 +8,7 @@ import requests
|
|
8 |
import urllib.parse
|
9 |
import asyncio
|
10 |
import aiohttp
|
|
|
11 |
from typing import List
|
12 |
|
13 |
app = FastAPI()
|
@@ -213,6 +214,71 @@ async def web_search_and_extract(
|
|
213 |
except Exception as e:
|
214 |
raise HTTPException(status_code=500, detail=f"Error during search and extraction: {e}")
|
215 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
216 |
@app.get("/api/adv_web_search")
|
217 |
async def adv_web_search(
|
218 |
q: str,
|
@@ -285,7 +351,7 @@ async def website_summarizer(url: str):
|
|
285 |
raise HTTPException(status_code=500, detail=f"Error fetching or processing URL: {e}")
|
286 |
except Exception as e:
|
287 |
raise HTTPException(status_code=500, detail=f"Error during summarization: {e}")
|
288 |
-
|
289 |
@app.get("/api/ask_website")
|
290 |
async def ask_website(url: str, question: str, model: str = "llama-3-70b"):
|
291 |
"""
|
|
|
8 |
import urllib.parse
|
9 |
import asyncio
|
10 |
import aiohttp
|
11 |
+
import threading
|
12 |
from typing import List
|
13 |
|
14 |
app = FastAPI()
|
|
|
214 |
except Exception as e:
|
215 |
raise HTTPException(status_code=500, detail=f"Error during search and extraction: {e}")
|
216 |
|
217 |
+
def extract_text_from_webpage2(html_content):
    """Return the human-visible text of an HTML document.

    Parses *html_content* with BeautifulSoup, removes non-content
    elements (scripts, styles, page header/footer, navigation), and
    returns the remaining text with surrounding whitespace stripped.
    """
    soup = BeautifulSoup(html_content, "html.parser")
    # Drop boilerplate/non-content elements before extracting text.
    noise_tags = ["script", "style", "header", "footer", "nav"]
    for element in soup(noise_tags):
        element.extract()
    return soup.get_text(strip=True)
|
226 |
+
|
227 |
+
def fetch_and_extract2(url, max_chars):
|
228 |
+
"""Fetches a URL and extracts text using threading."""
|
229 |
+
try:
|
230 |
+
response = requests.get(url, headers={"User-Agent": "Mozilla/5.0"})
|
231 |
+
response.raise_for_status()
|
232 |
+
html_content = response.text
|
233 |
+
visible_text = extract_text_from_webpage2(html_content)
|
234 |
+
if len(visible_text) > max_chars:
|
235 |
+
visible_text = visible_text[:max_chars] + "..."
|
236 |
+
return {"link": url, "text": visible_text}
|
237 |
+
except (requests.exceptions.RequestException) as e:
|
238 |
+
print(f"Error fetching or processing {url}: {e}")
|
239 |
+
return {"link": url, "text": None}
|
240 |
+
|
241 |
+
@app.get("/api/websearch-and-extract-threading")
def web_search_and_extract_threading(
    q: str,
    max_results: int = 3,
    timelimit: Optional[str] = None,
    safesearch: str = "moderate",
    region: str = "wt-wt",
    backend: str = "html",
    max_chars: int = 6000,
    extract_only: bool = True
):
    """
    Searches using WEBS, extracts text from the top results using threading, and returns both.

    Parameters mirror the WEBS text-search API; `max_chars` caps each page's
    extracted text and `extract_only` drops the raw search results from the
    response. Raises HTTPException(500) on any failure.
    """
    try:
        with WEBS() as webs:
            # Perform WEBS search
            search_results = webs.text(keywords=q, region=region, safesearch=safesearch,
                                       timelimit=timelimit, backend=backend, max_results=max_results)

        # Extract text from each result's link using one thread per link.
        extracted_results = []

        def _worker(link):
            # list.append is atomic under the GIL, so no lock is needed here.
            extracted_results.append(fetch_and_extract2(link, max_chars))

        threads = []
        for result in search_results:
            if 'href' in result:
                # BUG FIX: the original used a zero-arg lambda that closed
                # over the loop variable `result`. Closures bind late, so
                # every thread read the *final* value of `result` and they
                # all fetched the same (last) URL. Passing the link through
                # `args` binds its value at thread-creation time.
                thread = threading.Thread(target=_worker, args=(result['href'],))
                threads.append(thread)
                thread.start()

        # Wait for all threads to finish
        for thread in threads:
            thread.join()

        if extract_only:
            return JSONResponse(content=jsonable_encoder(extracted_results))
        else:
            return JSONResponse(content=jsonable_encoder({"search_results": search_results, "extracted_results": extracted_results}))
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Error during search and extraction: {e}")
|
280 |
+
|
281 |
+
|
282 |
@app.get("/api/adv_web_search")
|
283 |
async def adv_web_search(
|
284 |
q: str,
|
|
|
351 |
raise HTTPException(status_code=500, detail=f"Error fetching or processing URL: {e}")
|
352 |
except Exception as e:
|
353 |
raise HTTPException(status_code=500, detail=f"Error during summarization: {e}")
|
354 |
+
|
355 |
@app.get("/api/ask_website")
|
356 |
async def ask_website(url: str, question: str, model: str = "llama-3-70b"):
|
357 |
"""
|