ebook-gen / search_client.py
pragneshbarik's picture
initial commit
b3f1ffe
raw
history blame
2.54 kB
# Utility:
# A search client for web retrieval.
# It extracts the text content of the top-ranked search result pages,
# which is useful for connecting Mixtral to the web.
# Use a reranker on the results if required.
import requests
from bs4 import BeautifulSoup
import re
class SearchClient:
    """Run a web search and scrape the visible text of the result pages.

    Two vendors are supported: ``'google'`` (Custom Search JSON API) and
    ``'bing'`` (Web Search API v7). Any other vendor value is accepted by
    the constructor, but :meth:`search` then returns the string
    ``"Invalid vendor"``.

    Attributes:
        vendor: The vendor name given to the constructor.
        endpoint: Search API URL for the vendor, or ``None`` when the
            vendor is not recognised.
        headers: HTTP headers sent with the search request (only Bing
            needs any; empty otherwise).
        timeout: Seconds to wait on each HTTP request.
    """

    # Without a timeout ``requests`` waits forever on a stalled server,
    # which would hang the whole crawl on a single bad host.
    DEFAULT_TIMEOUT = 10

    def __init__(self, vendor, engine_id=None, api_key=None,
                 timeout=DEFAULT_TIMEOUT):
        """Configure the client for one search vendor.

        Args:
            vendor: ``'google'`` or ``'bing'``.
            engine_id: Custom search engine id (``cx``); used by Google only.
            api_key: API key for the chosen vendor.
            timeout: Seconds to wait on each HTTP request.
        """
        self.vendor = vendor
        self.timeout = timeout
        # Always define these so that attribute access never fails,
        # whichever vendor was requested.
        self.endpoint = None
        self.headers = {}
        if vendor == 'google':
            self.endpoint = f"https://www.googleapis.com/customsearch/v1?key={api_key}&cx={engine_id}"
        elif vendor == 'bing':
            self.endpoint = "https://api.bing.microsoft.com/v7.0/search"
            self.headers = {
                'Ocp-Apim-Subscription-Key': api_key,
            }

    def search(self, query, n_crawl):
        """Search for ``query`` and return the text of up to ``n_crawl`` pages.

        Args:
            query: Search query string.
            n_crawl: Maximum number of result pages to return.

        Returns:
            A list of ``{"text": ..., "link": ...}`` dicts, or the string
            ``"Invalid vendor"`` when the configured vendor is unsupported.
        """
        if self.vendor == 'google':
            return self._google_search(query, n_crawl)
        if self.vendor == 'bing':
            return self._bing_search(query, n_crawl)
        # Kept as a sentinel string rather than an exception so that
        # existing callers checking for it keep working.
        return "Invalid vendor"

    @staticmethod
    def _extract_text_from_link(link, timeout=DEFAULT_TIMEOUT):
        """Download ``link`` and return its text with whitespace collapsed.

        Args:
            link: URL of the page to fetch.
            timeout: Seconds to wait for the page.

        Returns:
            The page text as a single-spaced string, or ``None`` when the
            page cannot be fetched, does not answer with HTTP 200, or
            contains no text.
        """
        try:
            page = requests.get(link, timeout=timeout)
        except requests.RequestException:
            # One unreachable or slow site must not abort the whole crawl.
            return None
        if page.status_code != 200:
            return None
        soup = BeautifulSoup(page.content, 'html.parser')
        cleaned_text = re.sub(r'\s+', ' ', soup.get_text()).strip()
        # An empty string becomes None so callers see one "nothing here" value.
        return cleaned_text or None

    def _crawl_links(self, links, n_crawl):
        """Scrape ``links`` in order until ``n_crawl`` pages yielded text."""
        results = []
        for link in links:
            if len(results) >= n_crawl:
                break
            if not link:
                # Result item without a URL; nothing to fetch.
                continue
            cleaned_text = self._extract_text_from_link(link, timeout=self.timeout)
            if cleaned_text:
                results.append({"text": cleaned_text, "link": link})
        return results

    def _google_search(self, query, n_crawl):
        """Query the Google endpoint and scrape the ranked result pages.

        Returns:
            A list of ``{"text": ..., "link": ...}`` dicts with at most
            ``n_crawl`` entries; empty when the response has no ``items``.

        Raises:
            requests.RequestException: If the search API call itself fails.
        """
        if n_crawl <= 0:
            # Nothing to crawl, so do not spend an API request on it.
            return []
        response = requests.get(
            self.endpoint, params={'q': query}, timeout=self.timeout
        )
        search_results = response.json()
        links = (item.get('link') for item in search_results.get('items', []))
        return self._crawl_links(links, n_crawl)

    def _bing_search(self, query, n_crawl):
        """Query the Bing endpoint and scrape the ranked result pages.

        Returns:
            A list of ``{"text": ..., "link": ...}`` dicts with at most
            ``n_crawl`` entries; empty when the response has no web pages.

        Raises:
            requests.RequestException: If the search API call itself fails.
        """
        if n_crawl <= 0:
            # Nothing to crawl, so do not spend an API request on it.
            return []
        params = {
            'q': query,
            'count': n_crawl,  # You might need to adjust this based on Bing API requirements
            'mkt': 'en-US',
        }
        response = requests.get(
            self.endpoint, headers=self.headers, params=params,
            timeout=self.timeout,
        )
        search_results = response.json()
        web_pages = search_results.get('webPages', {}).get('value', [])
        links = (item.get('url') for item in web_pages)
        # Cap locally as well, so the limit does not depend on the API
        # honouring the requested count.
        return self._crawl_links(links, n_crawl)