File size: 4,667 Bytes
ed13a16
 
 
 
f5f0488
fe74f2e
75a77d2
c5c1684
15fc415
75a77d2
493c56b
 
f5f0488
5962cca
94ef62e
 
75a77d2
32abacb
75a77d2
 
 
 
57c73d3
75a77d2
 
 
 
 
 
 
 
 
 
 
 
15fc415
f9b9e87
 
 
 
f5f0488
f9b9e87
f5f0488
fe74f2e
6fa45ec
991f3f5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
6fa45ec
72da323
 
 
 
15fc415
f9b9e87
f5f0488
 
 
 
 
 
 
5962cca
2d47e8d
fe74f2e
2d47e8d
5962cca
57c73d3
f9b9e87
5962cca
 
9a4cf94
5962cca
f9b9e87
5962cca
f9b9e87
57c73d3
 
 
 
 
 
 
 
 
 
f9b9e87
 
15fc415
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
//! The `searx` module scrapes search results from an upstream Searx instance by querying it
//! with the user-provided query and, when given, a page number.

use reqwest::header::HeaderMap;
use reqwest::Client;
use scraper::Html;
use std::collections::HashMap;

use super::search_result_parser::SearchResultParser;
use crate::models::aggregation_models::SearchResult;
use crate::models::engine_models::{EngineError, SearchEngine};
use error_stack::{Report, Result, ResultExt};

/// The Searx search engine.
///
/// A dedicated type is defined so that it can implement the `SearchEngine` trait, which reduces
/// code duplication across engines and makes it easy to build a collection of different search
/// engines.
pub struct Searx {
    /// The parser used to interpret the HTML of the upstream search result page.
    parser: SearchResultParser,
}

impl Searx {
    /// creates a Searx parser
    pub fn new() -> Result<Searx, EngineError> {
        Ok(Self {
            parser: SearchResultParser::new(
                "#urls>.dialog-error>p",
                ".result",
                "h3>a",
                "h3>a",
                ".content",
            )?,
        })
    }
}

#[async_trait::async_trait]
impl SearchEngine for Searx {
    /// Queries the upstream Searx instance and scrapes the returned HTML into search results.
    ///
    /// # Arguments
    ///
    /// * `query` - The user provided search query.
    /// * `page` - The page number requested by the caller; `page + 1` is sent upstream.
    /// * `user_agent` - The value sent in the `User-Agent` request header.
    /// * `client` - The `reqwest` client used to perform the request.
    /// * `safe_search` - The requested safe search level; values above `2` are clamped to `2`.
    ///
    /// # Errors
    ///
    /// * `EngineError::UnexpectedError` if the request headers cannot be constructed (for
    ///   example when `user_agent` is not a valid header value).
    /// * `EngineError::EmptyResultSet` if the upstream page reports that nothing was found.
    /// * Any error propagated from `fetch_html_from_upstream` or from the result parser.
    async fn results(
        &self,
        query: &str,
        page: u32,
        user_agent: &str,
        client: &Client,
        mut safe_search: u8,
    ) -> Result<Vec<(String, SearchResult)>, EngineError> {
        // The upstream `safesearch` parameter only goes up to 2, so any higher level requested
        // by the caller is clamped to 2 while the levels 0, 1 and 2 are passed through
        // unchanged. `min` keeps this branchless.
        safe_search = safe_search.min(2);

        // NOTE(review): `query` is interpolated as-is. If callers do not percent-encode it,
        // characters such as `&`, `#` or `+` will alter the query string -- confirm the
        // caller's contract before changing this.
        let url: String = format!(
            "https://searx.be/search?q={query}&pageno={}&safesearch={safe_search}",
            page + 1
        );

        // Build the request headers. The keys must be the real HTTP header names (with hyphens):
        // names written with underscores are accepted by the header parser but are sent as
        // different, non-standard headers which the upstream server ignores.
        let header_map = HeaderMap::try_from(&HashMap::from([
            ("User-Agent".to_string(), user_agent.to_string()),
            ("Referer".to_string(), "https://google.com/".to_string()),
            ("Content-Type".to_string(), "application/x-www-form-urlencoded".to_string()),
            ("Cookie".to_string(), "categories=general; language=auto; locale=en; autocomplete=duckduckgo; image_proxy=1; method=POST; safesearch=2; theme=simple; results_on_new_tab=1; doi_resolver=oadoi.org; simple_style=auto; center_alignment=1; query_in_title=1; infinite_scroll=0; disabled_engines=; enabled_engines=\"archive is__general\\054yep__general\\054curlie__general\\054currency__general\\054ddg definitions__general\\054wikidata__general\\054duckduckgo__general\\054tineye__general\\054lingva__general\\054startpage__general\\054yahoo__general\\054wiby__general\\054marginalia__general\\054alexandria__general\\054wikibooks__general\\054wikiquote__general\\054wikisource__general\\054wikiversity__general\\054wikivoyage__general\\054dictzone__general\\054seznam__general\\054mojeek__general\\054naver__general\\054wikimini__general\\054brave__general\\054petalsearch__general\\054goo__general\"; disabled_plugins=; enabled_plugins=\"searx.plugins.hostname_replace\\054searx.plugins.oa_doi_rewrite\\054searx.plugins.vim_hotkeys\"; tokens=; maintab=on; enginetab=on".to_string())
        ]))
        .change_context(EngineError::UnexpectedError)?;

        let document: Html = Html::parse_document(
            &Searx::fetch_html_from_upstream(self, &url, header_map, client).await?,
        );

        // The second element matched by the "no results" selector carries the message that the
        // upstream page shows when the query produced nothing.
        if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).nth(1) {
            if no_result_msg.inner_html()
                == "we didn't find any results. Please use another query or search in more categories"
            {
                return Err(Report::new(EngineError::EmptyResultSet));
            }
        }

        // Scrape all the results from the html. Entries whose link element has no `href`
        // attribute yield `None` from the closure.
        self.parser
            .parse_for_results(&document, |title, url, desc| {
                url.value().attr("href").map(|url| {
                    SearchResult::new(
                        title.inner_html().trim(),
                        url,
                        desc.inner_html().trim(),
                        &["searx"],
                    )
                })
            })
    }
}