File size: 5,798 Bytes
ed13a16
 
 
 
019b332
15fc415
0502a8f
15fc415
 
4402168
15fc415
9a4cf94
 
 
5962cca
ed13a16
 
 
 
 
 
 
 
c170de8
ed13a16
 
 
 
5962cca
 
 
 
15fc415
 
c170de8
15fc415
9a4cf94
15fc415
dbe5b53
15fc415
c170de8
 
 
 
 
 
 
 
 
 
15fc415
 
 
c170de8
 
9a4cf94
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
c170de8
15fc415
 
 
9a4cf94
dbe5b53
15fc415
9a4cf94
 
 
15fc415
9a4cf94
 
 
15fc415
 
5962cca
9a4cf94
 
 
5962cca
4460730
9a4cf94
5962cca
 
9a4cf94
 
 
 
 
 
 
 
 
 
 
 
15fc415
 
f94ac50
 
 
 
15fc415
f94ac50
15fc415
 
 
 
f94ac50
 
 
 
 
 
 
 
 
 
15fc415
f94ac50
15fc415
 
 
 
f94ac50
 
 
 
 
 
15fc415
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
//! The `duckduckgo` module handles the scraping of results from the duckduckgo search engine
//! by querying the upstream duckduckgo search engine with the user-provided query and the
//! requested page number.

use std::{collections::HashMap, time::Duration};

use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT};
use scraper::{Html, Selector};

use crate::results::aggregation_models::RawSearchResult;

use super::engine_models::EngineError;

use error_stack::{IntoReport, Report, Result, ResultExt};

/// This function scrapes results from the upstream engine duckduckgo and puts all the scraped
/// results like title, visiting_url (href in html),engine (from which engine it was fetched from)
/// and description in a RawSearchResult and then adds that to HashMap whose keys are url and
/// values are RawSearchResult struct and then returns it within a Result enum.
///
/// # Arguments
///
/// * `query` - Takes the user provided query to query to the upstream search engine with.
/// * `page` - Takes an u32 as an argument.
/// * `user_agent` - Takes a random user agent string as an argument.
///
/// # Errors
///
/// Returns an `EngineErrorKind` if the user is not connected to the internet or if their is failure to
/// reach the above `upstream search engine` page or if the `upstream search engine` is unable to
/// provide results for the requested search query and also returns error if the scraping selector
/// or HeaderMap fails to initialize.
pub async fn results(
    query: &str,
    page: u32,
    user_agent: &str,
) -> Result<HashMap<String, RawSearchResult>, EngineError> {
    // Page number can be missing or empty string and so appropriate handling is required
    // so that upstream server receives valid page number.
    let url: String = match page {
        1 => {
            format!("https://html.duckduckgo.com/html/?q={query}&s=&dc=&v=1&o=json&api=/d.js")
        }
        _ => {
            format!(
                "https://duckduckgo.com/html/?q={}&s={}&dc={}&v=1&o=json&api=/d.js",
                query,
                (page / 2 + (page % 2)) * 30,
                (page / 2 + (page % 2)) * 30 + 1
            )
        }
    };

    // initializing HeaderMap and adding appropriate headers.
    let mut header_map = HeaderMap::new();
    header_map.insert(
        USER_AGENT,
        user_agent
            .parse()
            .into_report()
            .change_context(EngineError::UnexpectedError)?,
    );
    header_map.insert(
        REFERER,
        "https://google.com/"
            .parse()
            .into_report()
            .change_context(EngineError::UnexpectedError)?,
    );
    header_map.insert(
        CONTENT_TYPE,
        "application/x-www-form-urlencoded"
            .parse()
            .into_report()
            .change_context(EngineError::UnexpectedError)?,
    );
    header_map.insert(
        COOKIE,
        "kl=wt-wt"
            .parse()
            .into_report()
            .change_context(EngineError::UnexpectedError)?,
    );

    // fetch the html from upstream duckduckgo engine
    let results: String = reqwest::Client::new()
        .get(url)
        .timeout(Duration::from_secs(5))
        .headers(header_map) // add spoofed headers to emulate human behavior
        .send()
        .await
        .into_report()
        .change_context(EngineError::RequestError)?
        .text()
        .await
        .into_report()
        .change_context(EngineError::RequestError)?;

    let document: Html = Html::parse_document(&results);

    let no_result: Selector = Selector::parse(".no-results")
        .map_err(|_| Report::new(EngineError::UnexpectedError))
        .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".no-results"))?;

    if document.select(&no_result).next().is_some() {
        return Err(Report::new(EngineError::EmptyResultSet));
    }

    let results: Selector = Selector::parse(".result")
        .map_err(|_| Report::new(EngineError::UnexpectedError))
        .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?;
    let result_title: Selector = Selector::parse(".result__a")
        .map_err(|_| Report::new(EngineError::UnexpectedError))
        .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__a"))?;
    let result_url: Selector = Selector::parse(".result__url")
        .map_err(|_| Report::new(EngineError::UnexpectedError))
        .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__url"))?;
    let result_desc: Selector = Selector::parse(".result__snippet")
        .map_err(|_| Report::new(EngineError::UnexpectedError))
        .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__snippet"))?;

    // scrape all the results from the html
    Ok(document
        .select(&results)
        .map(|result| {
            RawSearchResult::new(
                result
                    .select(&result_title)
                    .next()
                    .unwrap()
                    .inner_html()
                    .trim()
                    .to_string(),
                format!(
                    "https://{}",
                    result
                        .select(&result_url)
                        .next()
                        .unwrap()
                        .inner_html()
                        .trim()
                ),
                result
                    .select(&result_desc)
                    .next()
                    .unwrap()
                    .inner_html()
                    .trim()
                    .to_string(),
                vec!["duckduckgo".to_string()],
            )
        })
        .map(|search_result| (search_result.visiting_url.clone(), search_result))
        .collect())
}