# Private Searx search app (originally hosted as a Hugging Face Space).
import gradio as gr | |
import requests | |
import time | |
import random | |
from bs4 import BeautifulSoup | |
import trafilatura | |
def extract_content_bs4(url):
    """Fetch *url* and return the concatenated text of its <p> tags.

    The result is truncated to 1000 characters (with a trailing
    ellipsis) so individual search results stay readable in the UI.
    Returns an error string instead of raising on any failure.
    """
    try:
        response = requests.get(url, timeout=10)
        # Bug fix: fail fast on HTTP errors (404, 500, ...) instead of
        # silently parsing an error page as if it were real content.
        response.raise_for_status()
        soup = BeautifulSoup(response.content, 'html.parser')
        # This is a simple extraction and might need to be adjusted based
        # on the structure of the websites you're scraping.
        content = ' '.join(p.text for p in soup.find_all('p'))
        return content[:1000] + "..." if len(content) > 1000 else content
    except Exception as e:
        return f"Error extracting content: {str(e)}"
def extract_content_trafilatura(url):
    """Fetch *url* and extract its main text with trafilatura.

    Comments and tables are excluded; the output is truncated to 1000
    characters with a trailing ellipsis. Always returns a string (never
    None) so callers can embed the result directly in Markdown.
    """
    try:
        downloaded = trafilatura.fetch_url(url)
        content = trafilatura.extract(downloaded, include_comments=False, include_tables=False)
        # Bug fix: trafilatura returns None when it cannot extract
        # anything; the original propagated that None, which the caller
        # then rendered as the literal string "None".
        if not content:
            return "No content extracted."
        return content[:1000] + "..." if len(content) > 1000 else content
    except Exception as e:
        return f"Error extracting content: {str(e)}"
def search_searx(query, instance_url='https://searx.org', categories='general', max_retries=3, num_results=10, use_trafilatura=False):
    """
    Perform a search using the Searx API with error handling, retry logic, limited results, and content extraction.

    Args:
        query: Search terms to submit to the Searx instance.
        instance_url: Base URL of the Searx instance to query.
        categories: Searx result categories (e.g. 'general', 'images').
        max_retries: Attempts before giving up on rate-limit (429) errors.
        num_results: Maximum number of results to fetch and format.
        use_trafilatura: If True, extract page content with trafilatura;
            otherwise use the BeautifulSoup extractor.

    Returns:
        A Markdown-formatted string of numbered results, or a
        human-readable error message (this function never raises).
    """
    search_endpoint = f"{instance_url}/search"
    params = {
        'q': query,
        'format': 'json',
        'categories': categories,
        'pageno': 1,
        'time_range': '',
        'engines': '',
        'safesearch': '0',
        'results': str(num_results)
    }
    # Browser-like headers: some Searx instances reject requests that
    # look like bots or that lack a Referer.
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'application/json, text/javascript, */*; q=0.01',
        'Accept-Language': 'en-US,en;q=0.5',
        'Referer': instance_url,
        'DNT': '1',
        'Connection': 'keep-alive',
        'Upgrade-Insecure-Requests': '1'
    }
    for attempt in range(max_retries):
        try:
            response = requests.get(search_endpoint, params=params, headers=headers, timeout=10)
            response.raise_for_status()
            data = response.json()
            if 'results' not in data or not data['results']:
                return "No results found."
            formatted_results = ""
            for idx, result in enumerate(data['results'][:num_results], start=1):
                title = result.get('title', 'No Title')
                url = result.get('url', 'No URL')
                # Extract content using the selected method
                if use_trafilatura:
                    content = extract_content_trafilatura(url)
                else:
                    content = extract_content_bs4(url)
                formatted_results += f"**{idx}. {title}**\n[{url}]({url})\n{content}\n\n"
            return formatted_results
        except requests.exceptions.RequestException as e:
            # Bug fix: the original read `response.status_code` here, which
            # raises UnboundLocalError when requests.get() itself fails
            # (timeout, DNS, connection refused) before `response` is
            # assigned. RequestException carries the HTTP response (or
            # None) on `e.response`, which is safe to inspect.
            status = e.response.status_code if e.response is not None else None
            if status == 429:
                # Rate limited: exponential backoff with jitter, then retry.
                wait_time = 2 ** attempt + random.uniform(0, 1)
                time.sleep(wait_time)
            else:
                return f"An error occurred while searching: {e}"
    return "Max retries reached. Please try again later."
def create_gradio_interface():
    """
    Creates and returns the Gradio interface.

    Builds a Blocks layout with the search controls (query, instance
    URL, categories, result count, extractor choice) in a left column
    and a Markdown results pane in a right column, wired to
    search_searx() via the Search button.
    """
    with gr.Blocks() as demo:
        # Fix: the original header emoji was mojibake ("π΅οΈββοΈ",
        # i.e. the detective emoji mis-decoded); restored here.
        gr.Markdown("# 🕵️‍♂️ Private Search with Searx and Content Extraction")
        gr.Markdown(
            "This application allows you to perform private searches using the [Searx](https://searx.org/) metasearch engine and extract content from the results."
        )
        with gr.Row():
            with gr.Column():
                query = gr.Textbox(
                    label="Search Query",
                    placeholder="Enter your search query here...",
                    lines=1
                )
                instance_url = gr.Textbox(
                    label="Searx Instance URL",
                    value="https://searx.org",
                    placeholder="https://searx.instance.url",
                    lines=1
                )
                categories = gr.Textbox(
                    label="Categories",
                    value="general",
                    placeholder="e.g., general, images, videos",
                    lines=1
                )
                num_results = gr.Slider(
                    minimum=1,
                    maximum=20,
                    value=10,
                    step=1,
                    label="Number of Results"
                )
                use_trafilatura = gr.Checkbox(label="Use Trafilatura for extraction (instead of BeautifulSoup)")
                search_button = gr.Button("Search")
            with gr.Column():
                results = gr.Markdown("### Search Results will appear here...")

        def perform_search(q, url, cats, num, use_traf):
            # Thin adapter between Gradio's positional inputs and the
            # keyword-based search backend; the slider yields a float,
            # so coerce to int before passing it on.
            return search_searx(q, instance_url=url, categories=cats, num_results=int(num), use_trafilatura=use_traf)

        search_button.click(
            perform_search,
            inputs=[query, instance_url, categories, num_results, use_trafilatura],
            outputs=results
        )
        gr.Markdown(
            """
            ---
            **Note:** This application uses the Searx metasearch engine to fetch results from multiple sources while preserving your privacy.
            It then attempts to extract content from the original sources, which may be subject to the terms of service of those websites.
            """
        )
    return demo
# Build the UI at import time so hosting platforms (e.g. HF Spaces) can
# pick up the `iface` object; launch the server only when run directly.
iface = create_gradio_interface()

if __name__ == "__main__":
    iface.launch()