Spaces:
Runtime error
Runtime error
File size: 4,667 Bytes
ed13a16 f5f0488 fe74f2e 75a77d2 c5c1684 15fc415 75a77d2 493c56b f5f0488 5962cca 94ef62e 75a77d2 32abacb 75a77d2 57c73d3 75a77d2 15fc415 f9b9e87 f5f0488 f9b9e87 f5f0488 fe74f2e 6fa45ec 991f3f5 6fa45ec 72da323 15fc415 f9b9e87 f5f0488 5962cca 2d47e8d fe74f2e 2d47e8d 5962cca 57c73d3 f9b9e87 5962cca 9a4cf94 5962cca f9b9e87 5962cca f9b9e87 57c73d3 f9b9e87 15fc415 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 |
//! The `searx` module handles the scraping of results from the searx search engine by
//! querying the upstream searx instance with the user-provided query and, if provided,
//! a page number.
use reqwest::header::HeaderMap;
use reqwest::Client;
use scraper::Html;
use std::collections::HashMap;
use super::search_result_parser::SearchResultParser;
use crate::models::aggregation_models::SearchResult;
use crate::models::engine_models::{EngineError, SearchEngine};
use error_stack::{Report, Result, ResultExt};
/// The searx search engine.
///
/// It is a dedicated type so that it can implement the `SearchEngine` trait, which
/// reduces code duplication and makes it easy to build a vector of different search
/// engines.
pub struct Searx {
    /// Parser used to interpret the search result page.
    parser: SearchResultParser,
}
impl Searx {
/// creates a Searx parser
pub fn new() -> Result<Searx, EngineError> {
Ok(Self {
parser: SearchResultParser::new(
"#urls>.dialog-error>p",
".result",
"h3>a",
"h3>a",
".content",
)?,
})
}
}
#[async_trait::async_trait]
impl SearchEngine for Searx {
    /// Fetches one page of results for `query` from the upstream searx instance and
    /// scrapes them out of the returned HTML.
    ///
    /// # Arguments
    ///
    /// * `query` - The search query; it is interpolated into the request URL as-is.
    /// * `page` - The page index; the upstream `pageno` parameter is sent as `page + 1`.
    /// * `user_agent` - The value sent in the `User-Agent` request header.
    /// * `client` - The `reqwest` client handed to `fetch_html_from_upstream`.
    /// * `safe_search` - The requested safe search level; values above 2 are sent as 2.
    ///
    /// # Errors
    ///
    /// Returns `EngineError::UnexpectedError` if the request headers cannot be built and
    /// `EngineError::EmptyResultSet` if the upstream page carries the "no results"
    /// message. Errors from `fetch_html_from_upstream` and from the parser are
    /// propagated unchanged.
    async fn results(
        &self,
        query: &str,
        page: u32,
        user_agent: &str,
        client: &Client,
        safe_search: u8,
    ) -> Result<Vec<(String, SearchResult)>, EngineError> {
        // Preference cookie sent with every request.
        // NOTE(review): it hard-codes `safesearch=2` regardless of the `safe_search`
        // argument; only the URL parameter below follows the argument.
        const COOKIE: &str = "categories=general; language=auto; locale=en; autocomplete=duckduckgo; image_proxy=1; method=POST; safesearch=2; theme=simple; results_on_new_tab=1; doi_resolver=oadoi.org; simple_style=auto; center_alignment=1; query_in_title=1; infinite_scroll=0; disabled_engines=; enabled_engines=\"archive is__general\\054yep__general\\054curlie__general\\054currency__general\\054ddg definitions__general\\054wikidata__general\\054duckduckgo__general\\054tineye__general\\054lingva__general\\054startpage__general\\054yahoo__general\\054wiby__general\\054marginalia__general\\054alexandria__general\\054wikibooks__general\\054wikiquote__general\\054wikisource__general\\054wikiversity__general\\054wikivoyage__general\\054dictzone__general\\054seznam__general\\054mojeek__general\\054naver__general\\054wikimini__general\\054brave__general\\054petalsearch__general\\054goo__general\"; disabled_plugins=; enabled_plugins=\"searx.plugins.hostname_replace\\054searx.plugins.oa_doi_rewrite\\054searx.plugins.vim_hotkeys\"; tokens=; maintab=on; enginetab=on";

        // The upstream `safesearch` parameter takes the levels 0, 1 and 2 (see the
        // SearXNG search API documentation), so any higher requested level is clamped
        // to 2 while the levels 0..=2 are passed through unchanged.
        let safe_search = safe_search.min(2);

        // `pageno` is `page + 1`; saturate so that `u32::MAX` cannot overflow.
        let page_number = page.saturating_add(1);

        // NOTE(review): `query` is not percent-encoded here, so characters such as `&`
        // or `#` would alter the query string — confirm whether the caller encodes it.
        let url: String = format!(
            "https://searx.be/search?q={query}&pageno={page_number}&safesearch={safe_search}"
        );

        // Initializing headers and adding appropriate headers. The names must be the
        // real HTTP field names (hyphenated): an underscore is a legal header-name
        // character, so `USER_AGENT` would be accepted but sent as an unrelated
        // `user_agent` header instead of `User-Agent`.
        let header_map = HeaderMap::try_from(&HashMap::from([
            ("User-Agent".to_string(), user_agent.to_string()),
            ("Referer".to_string(), "https://google.com/".to_string()),
            (
                "Content-Type".to_string(),
                "application/x-www-form-urlencoded".to_string(),
            ),
            ("Cookie".to_string(), COOKIE.to_string()),
        ]))
        .change_context(EngineError::UnexpectedError)?;

        let document: Html = Html::parse_document(
            &Searx::fetch_html_from_upstream(self, &url, header_map, client).await?,
        );

        // The second element matched by the "no results" selector holds the message
        // that is compared against below.
        if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).nth(1) {
            if no_result_msg.inner_html()
                == "we didn't find any results. Please use another query or search in more categories"
            {
                return Err(Report::new(EngineError::EmptyResultSet));
            }
        }

        // Scrape all the results from the html. The closure yields `None` for entries
        // whose link element has no `href` attribute.
        self.parser
            .parse_for_results(&document, |title, url, desc| {
                url.value().attr("href").map(|url| {
                    SearchResult::new(
                        title.inner_html().trim(),
                        url,
                        desc.inner_html().trim(),
                        &["searx"],
                    )
                })
            })
    }
}
|