# NCTC_OSINT_AGENT/middlewares/search_client.py
import concurrent.futures
import re

import requests
from bs4 import BeautifulSoup
from GoogleNews import GoogleNews


class SearchClient:
    def __init__(self, vendor, engine_id=None, api_key=None):
        self.vendor = vendor
        # Google queries go through the GoogleNews library rather than the
        # Custom Search API, so no endpoint is configured for that vendor.
        # self.endpoint = f"https://www.googleapis.com/customsearch/v1?key={api_key}&cx={engine_id}"
        if vendor == "bing":
            self.endpoint = "https://api.bing.microsoft.com/v7.0/search"
            self.headers = {
                "Ocp-Apim-Subscription-Key": api_key,
            }

    @staticmethod
    def _extract_text_from_link(link):
        # Download the page and strip its markup; the timeout keeps one slow
        # host from stalling the whole thread pool.
        page = requests.get(link, timeout=10)
        if page.status_code == 200:
            soup = BeautifulSoup(page.content, "html.parser")
            text = soup.get_text()
            # Collapse the runs of whitespace left behind by removed markup.
            cleaned_text = re.sub(r"\s+", " ", text)
            return cleaned_text
        return None

    def _fetch_text_from_links(self, links):
        # Fetch all pages concurrently; each future is mapped back to its
        # source link so failures can be reported per URL.
        results = []
        with concurrent.futures.ThreadPoolExecutor() as executor:
            future_to_link = {
                executor.submit(self._extract_text_from_link, link): link
                for link in links
            }
            for future in concurrent.futures.as_completed(future_to_link):
                link = future_to_link[future]
                try:
                    cleaned_text = future.result()
                    if cleaned_text:
                        results.append({"text": cleaned_text, "link": link})
                except Exception as e:
                    print(f"Error fetching data from {link}: {e}")
        return results
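
    # Illustrative shape of the value returned by _fetch_text_from_links
    # (hypothetical URLs, assuming both fetches succeed):
    # [
    #     {"text": "First article text ...", "link": "https://example.com/a"},
    #     {"text": "Second article text ...", "link": "https://example.com/b"},
    # ]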

    def _google_search(self, start_date, end_date, query, n_crawl):
        # The query is a comma-separated list of keywords; collect up to
        # n_crawl links per keyword across the whole list. (The original
        # code overwrote its results each iteration and kept only the last
        # keyword's links.)
        keywords_lst = query.split(',')
        all_links = []
        for keyword in keywords_lst:
            # Date range, language, and region must be set on a single
            # instance; creating two instances discards the date filter.
            googlenews = GoogleNews(lang='en', region='IN',
                                    start=start_date, end=end_date)
            googlenews.get_news(keyword)
            links = googlenews.get_links()  # list of news links
            all_links.extend(links[:n_crawl])
        # GoogleNews returns links without a scheme; add one where missing.
        corrected_urls = [
            url if url.startswith("http") else "https://" + url
            for url in all_links
        ]
        text_results = self._fetch_text_from_links(corrected_urls)
        return text_results

    def _bing_search(self, query, n_crawl):
        params = {
            "q": query,
            "count": n_crawl,  # Bing caps results per request; adjust if needed
            "mkt": "en-US",
        }
        response = requests.get(
            self.endpoint, headers=self.headers, params=params, timeout=10
        )
        # Fail loudly on HTTP errors instead of parsing an error body as JSON.
        response.raise_for_status()
        search_results = response.json()
        # Pull the result URLs out of the web-pages section of the response.
        results = []
        for item in search_results.get("webPages", {}).get("value", []):
            results.append(item["url"])
        text_results = self._fetch_text_from_links(results)
        return text_results

    def search(self, query, n_crawl):
        if self.vendor == "bing":
            return self._bing_search(query, n_crawl)
        # Raising is preferable to returning the string "Invalid vendor",
        # which callers could mistake for a result list.
        raise ValueError(f"Invalid vendor for search(): {self.vendor!r}")

    def search_google(self, start_date, end_date, query, n_crawl):
        if self.vendor == "google":
            return self._google_search(start_date, end_date, query, n_crawl)
        raise ValueError(f"Invalid vendor for search_google(): {self.vendor!r}")