KingNish committed on
Commit 775d6d5
1 Parent(s): d4b4fb1

Update app.py

Files changed (1)
  1. app.py +32 -40
app.py CHANGED
@@ -1,11 +1,14 @@
- from fastapi import FastAPI, HTTPException, Query # Make sure Query is imported
+ from fastapi import FastAPI, HTTPException, Query
  from fastapi.responses import JSONResponse
  from webscout import WEBS, transcriber, LLM
- from typing import Optional, List, Dict, Union # Import List, Dict, Union
+ from typing import Optional, List, Dict, Union
  from fastapi.encoders import jsonable_encoder
  from bs4 import BeautifulSoup
  import requests
  import urllib.parse
+ import asyncio
+ import aiohttp
+ from typing import List
 
  app = FastAPI()
 
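Two details in this import hunk: `asyncio` and `aiohttp` come in because the blocking `requests.get` calls below are replaced with async fetches, and the added `from typing import List` is redundant, since `List` is already imported from `typing` three lines earlier.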
@@ -152,6 +155,21 @@ def extract_text_from_webpage(html_content):
      visible_text = soup.get_text(strip=True)
      return visible_text
 
+ async def fetch_and_extract(url, max_chars):
+     """Fetches a URL and extracts text asynchronously."""
+     async with aiohttp.ClientSession() as session:
+         try:
+             async with session.get(url, headers={"User-Agent": "Mozilla/5.0"}) as response:
+                 response.raise_for_status()
+                 html_content = await response.text()
+                 visible_text = extract_text_from_webpage(html_content)
+                 if len(visible_text) > max_chars:
+                     visible_text = visible_text[:max_chars] + "..."
+                 return {"link": url, "text": visible_text}
+         except (aiohttp.ClientError, requests.exceptions.RequestException) as e:
+             print(f"Error fetching or processing {url}: {e}")
+             return {"link": url, "text": None}
+ 
  @app.get("/api/web_extract")
  async def web_extract(
      url: str,
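The new helper opens a fresh `aiohttp.ClientSession` for every URL; aiohttp's documentation recommends sharing one session across many requests. A minimal standalone sketch of that variant, with a hypothetical `fetch_one`/`main` driver, example URLs, and plain truncation standing in for `extract_text_from_webpage`:

```python
import asyncio
import aiohttp

async def fetch_one(session, url, max_chars):
    """Fetch a URL through a shared session and truncate the body."""
    try:
        async with session.get(url, headers={"User-Agent": "Mozilla/5.0"}) as response:
            response.raise_for_status()
            html = await response.text()
            # Plain truncation stands in for extract_text_from_webpage() here.
            text = html[:max_chars] + "..." if len(html) > max_chars else html
            return {"link": url, "text": text}
    except aiohttp.ClientError as e:
        print(f"Error fetching {url}: {e}")
        return {"link": url, "text": None}

async def main():
    urls = ["https://example.com", "https://example.org"]
    # One session serves every fetch; gather() runs them concurrently.
    async with aiohttp.ClientSession() as session:
        results = await asyncio.gather(*(fetch_one(session, u, 200) for u in urls))
    print(results)

asyncio.run(main())
```

Keeping one session lets aiohttp pool connections across the fan-out instead of paying connection setup per URL.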
@@ -159,12 +177,8 @@ async def web_extract(
  ):
      """Extracts text from a given URL."""
      try:
-         response = requests.get(url, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"})
-         response.raise_for_status()
-         visible_text = extract_text_from_webpage(response.text)
-         if len(visible_text) > max_chars:
-             visible_text = visible_text[:max_chars] + "..."
-         return {"url": url, "text": visible_text}
+         result = await fetch_and_extract(url, max_chars)
+         return {"url": url, "text": result["text"]}
      except requests.exceptions.RequestException as e:
          raise HTTPException(status_code=500, detail=f"Error fetching or processing URL: {e}")
 
@@ -188,23 +202,10 @@ async def web_search_and_extract(
          search_results = webs.text(keywords=q, region=region, safesearch=safesearch,
                                     timelimit=timelimit, backend=backend, max_results=max_results)
 
-         # Extract text from each result's link
-         extracted_results = []
-         for result in search_results:
-             if 'href' in result:
-                 link = result['href']
-                 try:
-                     response = requests.get(link, headers={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:109.0) Gecko/20100101 Firefox/111.0"})
-                     response.raise_for_status()
-                     visible_text = extract_text_from_webpage(response.text)
-                     if len(visible_text) > max_chars:
-                         visible_text = visible_text[:max_chars] + "..."
-                     extracted_results.append({"link": link, "text": visible_text})
-                 except requests.exceptions.RequestException as e:
-                     print(f"Error fetching or processing {link}: {e}")
-                     extracted_results.append({"link": link, "text": None})
-             else:
-                 extracted_results.append({"link": None, "text": None})
+         # Extract text from each result's link asynchronously
+         tasks = [fetch_and_extract(result['href'], max_chars) for result in search_results if 'href' in result]
+         extracted_results = await asyncio.gather(*tasks)
+ 
          if extract_only:
              return JSONResponse(content=jsonable_encoder({extracted_results}))
          else:
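One unchanged context line above deserves a flag: `jsonable_encoder({extracted_results})` wraps the results list in a set literal, and since lists are unhashable this raises `TypeError: unhashable type: 'list'` as soon as `extract_only` is true. A sketch of the two likely intents (assuming either a bare JSON array or a keyed wrapper object was meant):

```python
# Return the list itself as a JSON array...
return JSONResponse(content=jsonable_encoder(extracted_results))
# ...or, if a wrapper object was intended, key it explicitly:
return JSONResponse(content=jsonable_encoder({"results": extracted_results}))
```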
@@ -235,22 +236,13 @@ async def adv_web_search(
                                     timelimit=timelimit, backend=backend,
                                     max_results=max_results)
 
-         # 2. Extract text from top search result URLs
+         # 2. Extract text from top search result URLs asynchronously
          extracted_text = ""
-         for result in search_results:
-             if 'href' in result:
-                 link = result['href']
-                 try:
-                     response = requests.get(link, headers={"User-Agent": "Mozilla/5.0"})
-                     response.raise_for_status()
-                     visible_text = extract_text_from_webpage(response.text)
-                     if len(visible_text) > max_chars:
-                         visible_text = visible_text[:max_chars] + "..."
-                     extracted_text += f"## Content from: {link}\n\n{visible_text}\n\n"
-                 except requests.exceptions.RequestException as e:
-                     print(f"Error fetching or processing {link}: {e}")
-             else:
-                 pass
+         tasks = [fetch_and_extract(result['href'], max_chars) for result in search_results if 'href' in result]
+         extracted_results = await asyncio.gather(*tasks)
+         for result in extracted_results:
+             if result['text']:
+                 extracted_text += f"## Content from: {result['link']}\n\n{result['text']}\n\n"
 
          # 3. Construct the prompt for the LLM
          llm_prompt = f"Query by user: {q} , Answer the query asked by user in detail. Now, You are provided with Google Search Results, To increase your accuracy and providing real time data. SEarch Result: {extracted_text}"
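Both endpoints now block on `asyncio.gather` over arbitrary external URLs, so a single slow host can stall a whole request. A hedged sketch of a per-fetch ceiling (the `ClientTimeout` value and `fetch_text` helper are assumptions, not part of this commit):

```python
import asyncio
from typing import Optional

import aiohttp

# An arbitrary 10-second ceiling per fetch (an assumption, not from the commit).
TIMEOUT = aiohttp.ClientTimeout(total=10)

async def fetch_text(url: str) -> Optional[str]:
    """Fetch a page body, giving up after TIMEOUT instead of hanging."""
    async with aiohttp.ClientSession(timeout=TIMEOUT) as session:
        try:
            async with session.get(url, headers={"User-Agent": "Mozilla/5.0"}) as response:
                response.raise_for_status()
                return await response.text()
        except (aiohttp.ClientError, asyncio.TimeoutError) as e:
            print(f"Error fetching {url}: {e}")
            return None
```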
 