import requests
from bs4 import BeautifulSoup
import re
import concurrent.futures
from GoogleNews import GoogleNews

class SearchClient:
    def __init__(self, vendor, engine_id=None, api_key=None):
        self.vendor = vendor
        # if vendor == "google":
        #     self.endpoint = f"https://www.googleapis.com/customsearch/v1?key={api_key}&cx={engine_id}"
        if vendor == "bing":
            self.endpoint = "https://api.bing.microsoft.com/v7.0/search"
            self.headers = {
                "Ocp-Apim-Subscription-Key": api_key,
            }
    def _extract_text_from_link(self, link):
        # Fetch the page and reduce its markup to whitespace-normalized text.
        page = requests.get(link, timeout=10)  # timeout keeps worker threads from hanging
        if page.status_code == 200:
            soup = BeautifulSoup(page.content, "html.parser")
            text = soup.get_text()
            cleaned_text = re.sub(r"\s+", " ", text)
            return cleaned_text
        return None
    def _fetch_text_from_links(self, links):
        results = []
        with concurrent.futures.ThreadPoolExecutor() as executor:
            future_to_link = {
                executor.submit(self._extract_text_from_link, link): link
                for link in links
            }
            for future in concurrent.futures.as_completed(future_to_link):
                link = future_to_link[future]
                try:
                    cleaned_text = future.result()
                    if cleaned_text:
                        results.append({"text": cleaned_text, "link": link})
                except Exception as e:
                    print(f"Error fetching data from {link}: {e}")
        return results
    def _google_search(self, start_date, end_date, query, n_crawl):
        # The query is a comma-separated list of keywords; gather up to
        # n_crawl Google News links for each keyword.
        keywords_lst = query.split(',')
        all_links = []
        for keyword in keywords_lst:
            # Pass the date range, language, and region in a single constructor;
            # re-instantiating would discard the start/end dates.
            googlenews = GoogleNews(lang='en', region='IN', start=start_date, end=end_date)
            googlenews.get_news(keyword)
            links = googlenews.get_links()  # List of news links
            all_links.extend(links[:n_crawl])  # Slicing caps results even when fewer are returned
        # GoogleNews returns scheme-less URLs, so prepend https:// before fetching
        corrected_urls = ["https://" + url for url in all_links]
        text_results = self._fetch_text_from_links(corrected_urls)
        return text_results
    def _bing_search(self, query, n_crawl):
        params = {
            "q": query,
            "count": n_crawl,  # You might need to adjust this based on Bing API requirements
            "mkt": "en-US",
        }
        response = requests.get(self.endpoint, headers=self.headers, params=params)
        search_results = response.json()
        results = []
        for item in search_results.get("webPages", {}).get("value", []):
            link = item["url"]
            results.append(link)
        text_results = self._fetch_text_from_links(results)
        return text_results
    def search(self, query, n_crawl):
        if self.vendor == "bing":
            return self._bing_search(query, n_crawl)
        else:
            return "Invalid vendor"

    def search_google(self, start_date, end_date, query, n_crawl):
        if self.vendor == "google":
            return self._google_search(start_date, end_date, query, n_crawl)
        else:
            return "Invalid vendor"