import gradio as gr
import requests
import time
import random
import os
import logging
from bs4 import BeautifulSoup
import trafilatura
from huggingface_hub import InferenceClient

# Set up logging
logging.basicConfig(level=logging.DEBUG, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
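
# A small pool of common browser user agents; one is picked at random per
# request so repeated fetches look less like a single automated client.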
USER_AGENTS = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:89.0) Gecko/20100101 Firefox/89.0',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.1 Safari/605.1.15',
    'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
    'Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.0 Mobile/15E148 Safari/604.1'
]

def get_random_user_agent():
    return random.choice(USER_AGENTS)
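
# Fallback extractor: concatenate the text of all <p> tags on the page.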
def extract_content_bs4(url, max_chars):
    try:
        response = requests.get(url, headers={'User-Agent': get_random_user_agent()}, timeout=10)
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        paragraphs = soup.find_all('p')
        content = ' '.join([p.text for p in paragraphs])
        return content[:max_chars] + "..." if len(content) > max_chars else content
    except Exception as e:
        return f"Error extracting content: {str(e)}"
def extract_content_trafilatura(url, max_chars):
    try:
        # trafilatura fetches the page itself; fetch_url does not accept custom headers
        downloaded = trafilatura.fetch_url(url)
        content = trafilatura.extract(downloaded, include_comments=False, include_tables=False)
        if not content:
            return "No extractable content found."
        return content[:max_chars] + "..." if len(content) > max_chars else content
    except Exception as e:
        return f"Error extracting content: {str(e)}"
def search_searx(query, instance_url='https://searx.org', categories='general', max_retries=3, num_results=10,
                 use_trafilatura=False, time_range='', language='en', safesearch=0, search_engines='all',
                 sort_by='relevance', max_chars=1000):
    """
    Perform a search using the SearXNG API with advanced options.

    Note: the target instance must have JSON output enabled in its settings,
    otherwise the format=json request will be rejected.
    """
    # The UI passes a list from the multiselect dropdown, but accept a plain
    # string too; joining a bare string would otherwise split it per character.
    if isinstance(search_engines, str):
        search_engines = [search_engines]
    search_endpoint = f"{instance_url}/search"
    params = {
        'q': query,
        'format': 'json',
        'categories': categories,
        'pageno': 1,
        'time_range': time_range,
        'language': language,
        'safesearch': safesearch,
        'results': str(num_results),
        'engines': ','.join(search_engines) if 'all' not in search_engines else 'all',
        'sort': sort_by
    }
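    # Browser-like headers; some instances filter requests that look automated.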
    headers = {
        'User-Agent': get_random_user_agent(),
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'en-US,en;q=0.5',
        'Referer': instance_url,
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'
    }
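    # Retry with exponential backoff when the instance rate-limits us (HTTP 429).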
    for attempt in range(max_retries):
        try:
            response = requests.get(search_endpoint, params=params, headers=headers, timeout=10)
            response.raise_for_status()
            data = response.json()
            if 'results' not in data or not data['results']:
                return "No results found."
            formatted_results = ""
            for idx, result in enumerate(data['results'][:num_results], start=1):
                title = result.get('title', 'No Title')
                url = result.get('url', 'No URL')
                if use_trafilatura:
                    content = extract_content_trafilatura(url, max_chars)
                else:
                    content = extract_content_bs4(url, max_chars)
                formatted_results += f"**{idx}. {title}**\n[{url}]({url})\n{content}\n\n"
            return formatted_results
        except requests.exceptions.RequestException as e:
            # e.response is None for timeouts and connection errors, so guard
            # before reading the status code.
            if e.response is not None and e.response.status_code == 429:
                wait_time = 2 ** attempt + random.uniform(0, 1)
                time.sleep(wait_time)
            else:
                return f"An error occurred while searching: {e}"
    return "Max retries reached. Please try again later."
def summarize_with_llm(query, search_results):
    logger.debug(f"Attempting to summarize results for query: {query}")
    try:
        api_key = os.getenv("HUGGINGFACE_API_KEY")
        if not api_key:
            logger.error("HUGGINGFACE_API_KEY environment variable is not set")
            return "Error: Hugging Face API key is not set. Please set the HUGGINGFACE_API_KEY environment variable."
        logger.debug("Initializing InferenceClient")
        client = InferenceClient(
            "mistralai/Mistral-Nemo-Instruct-2407",
            token=api_key,
        )
        system_prompt = """You are an AI assistant tasked with summarizing search results. Your goal is to provide a concise, informative summary of the search results in relation to the user's query. Focus on the most relevant information and present it in a clear, organized manner."""
        user_prompt = f"""Query: {query}

Search Results:
{search_results}

Please provide a summary of the search results in relation to the query. Highlight the most relevant information, identify any common themes or contradictions, and present the information in a clear and concise manner. If there are any gaps in the information or areas that require further research, please mention them as well."""
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt}
        ]
logger.debug("Sending request to Hugging Face API") | |
summary = "" | |
for message in client.chat_completion( | |
messages=messages, | |
max_tokens=500, | |
stream=True, | |
): | |
summary += message.choices[0].delta.content | |
logger.debug("Successfully generated summary") | |
return summary | |
except Exception as e: | |
logger.exception(f"Error in summarize_with_llm: {str(e)}") | |
return f"Error generating summary: {str(e)}" | |
def create_gradio_interface():
    with gr.Blocks() as demo:
        gr.Markdown("# 🕵️‍♂️ Advanced SearXNG Search with LLM Summary")
        gr.Markdown(
            "This application allows you to perform private searches using SearXNG with advanced options and get an AI-generated summary of the results."
        )
        with gr.Row():
            with gr.Column():
                query = gr.Textbox(
                    label="Search Query",
                    placeholder="Enter your search query here...",
                    lines=1
                )
                instance_url = gr.Textbox(
                    label="SearXNG Instance URL",
                    value="https://searx.org",
                    placeholder="https://searx.instance.url",
                    lines=1
                )
                categories = gr.Textbox(
                    label="Categories",
                    value="general",
                    placeholder="e.g., general, news, science",
                    lines=1
                )
                num_results = gr.Slider(
                    minimum=1,
                    maximum=20,
                    value=10,
                    step=1,
                    label="Number of Results"
                )
                use_trafilatura = gr.Checkbox(label="Use Trafilatura for extraction (instead of BeautifulSoup)")
                time_range = gr.Dropdown(
                    choices=["", "day", "week", "month", "year"],
                    value="",
                    label="Time Range"
                )
                language = gr.Textbox(
                    label="Language",
                    value="en",
                    placeholder="e.g., en, fr, de",
                    lines=1
                )
                safesearch = gr.Slider(
                    minimum=0,
                    maximum=2,
                    value=0,
                    step=1,
                    label="SafeSearch (0: Off, 1: Moderate, 2: Strict)"
                )
                search_engines = gr.Dropdown(
                    choices=["all", "google", "bing", "duckduckgo", "wikipedia"],
                    value=["all"],
                    label="Search Engines",
                    multiselect=True
                )
                sort_by = gr.Dropdown(
                    choices=["relevance", "date"],
                    value="relevance",
                    label="Sort Results By"
                )
                max_chars = gr.Slider(
                    minimum=100,
                    maximum=10000,
                    value=1000,
                    step=100,
                    label="Max Characters to Extract"
                )
                search_button = gr.Button("Search and Summarize")
            with gr.Column():
                results = gr.Markdown("### Search Results and Summary will appear here...")
        def perform_search_and_summarize(q, url, cats, num, use_traf, t_range, lang, safe, engines, sort, chars):
            logger.debug(f"Performing search for query: {q}")
            try:
                search_results = search_searx(q, instance_url=url, categories=cats, num_results=int(num),
                                              use_trafilatura=use_traf, time_range=t_range, language=lang,
                                              safesearch=int(safe), search_engines=engines, sort_by=sort,
                                              max_chars=chars)
                logger.debug("Search completed, attempting to summarize")
                summary = summarize_with_llm(q, search_results)
                return f"## AI-Generated Summary\n\n{summary}\n\n## Original Search Results\n\n{search_results}"
            except Exception as e:
                logger.exception(f"Error in perform_search_and_summarize: {str(e)}")
                return f"An error occurred: {str(e)}"
        search_button.click(
            perform_search_and_summarize,
            inputs=[query, instance_url, categories, num_results, use_trafilatura, time_range, language, safesearch,
                    search_engines, sort_by, max_chars],
            outputs=results
        )
        gr.Markdown(
            """
            ---
            **Note:** This application uses SearXNG to fetch results from multiple sources while preserving your privacy.
            It then attempts to extract content from the original sources, which may be subject to the terms of service of those websites.
            The AI-generated summary is provided by a Mistral Nemo LLM and should be reviewed for accuracy.
            """
        )
    return demo

iface = create_gradio_interface()

if __name__ == "__main__":
    logger.info("Starting the application")
    iface.launch()
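
# To run locally (a minimal sketch, assuming the imports above are the full
# dependency list and you have a Hugging Face access token):
#
#   pip install gradio requests beautifulsoup4 trafilatura huggingface_hub
#   export HUGGINGFACE_API_KEY=hf_...
#   python app.py
#
# launch() serves the UI on http://127.0.0.1:7860 by default.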