# Utility: search engine client for web retrieval.
# Extracts text content from the top-ranked search result pages,
# which is useful for connecting Mixtral to the web.
# Apply a reranker to the returned passages if required.
import re

import requests
from bs4 import BeautifulSoup
class SearchClient:
    def __init__(self, vendor, engine_id=None, api_key=None):
        self.vendor = vendor
        if vendor == 'google':
            # Google Programmable Search: the API key and engine id are passed in the URL.
            self.endpoint = f"https://www.googleapis.com/customsearch/v1?key={api_key}&cx={engine_id}"
        elif vendor == 'bing':
            # Bing Web Search: the API key is sent as a request header.
            self.endpoint = "https://api.bing.microsoft.com/v7.0/search"
            self.headers = {
                'Ocp-Apim-Subscription-Key': api_key,
            }
    def search(self, query, n_crawl):
        # Dispatch to the vendor-specific search implementation.
        if self.vendor == 'google':
            return self._google_search(query, n_crawl)
        elif self.vendor == 'bing':
            return self._bing_search(query, n_crawl)
        else:
            raise ValueError(f"Invalid vendor: {self.vendor}")
    def _extract_text_from_link(self, link):
        # Fetch the page and strip the HTML down to whitespace-normalized text.
        page = requests.get(link, timeout=10)
        if page.status_code == 200:
            soup = BeautifulSoup(page.content, 'html.parser')
            text = soup.get_text()
            cleaned_text = re.sub(r'\s+', ' ', text)
            return cleaned_text
        return None
    def _google_search(self, query, n_crawl):
        response = requests.get(self.endpoint, params={'q': query})
        search_results = response.json()

        # Crawl at most n_crawl of the returned links and keep those that yield text.
        results = []
        count = 0
        for item in search_results.get('items', []):
            if count >= n_crawl:
                break
            link = item['link']
            cleaned_text = self._extract_text_from_link(link)
            if cleaned_text:
                results.append({"text": cleaned_text, "link": link})
            count += 1
        return results
    def _bing_search(self, query, n_crawl):
        params = {
            'q': query,
            'count': n_crawl,  # Number of results requested from the Bing API.
            'mkt': 'en-US'
        }
        response = requests.get(self.endpoint, headers=self.headers, params=params)
        search_results = response.json()

        # Crawl every returned link and keep those that yield text.
        results = []
        for item in search_results.get('webPages', {}).get('value', []):
            link = item['url']
            cleaned_text = self._extract_text_from_link(link)
            if cleaned_text:
                results.append({"text": cleaned_text, "link": link})
        return results
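
# Example usage: a minimal sketch assuming you supply your own Google API key and
# Programmable Search engine id (the values below are placeholders, not real credentials).
if __name__ == "__main__":
    client = SearchClient('google', engine_id='YOUR_ENGINE_ID', api_key='YOUR_API_KEY')
    docs = client.search("mixtral mixture of experts", n_crawl=3)
    for doc in docs:
        # Each result carries the crawled page text and its source link; the text can be
        # chunked and reranked before being passed to the model as retrieval context.
        print(doc["link"], doc["text"][:200])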