neon_arch committed on
Commit
f9b9e87
·
1 Parent(s): b72af01

✨ feat: rewrite code by implementing common engine trait `SearchEngine`

Browse files
Files changed (2) hide show
  1. src/engines/duckduckgo.rs +106 -111
  2. src/engines/searx.rs +93 -96
src/engines/duckduckgo.rs CHANGED
@@ -9,7 +9,7 @@ use scraper::{Html, Selector};
9
 
10
  use crate::search_results_handler::aggregation_models::RawSearchResult;
11
 
12
- use super::engine_models::EngineError;
13
 
14
  use error_stack::{IntoReport, Report, Result, ResultExt};
15
 
@@ -30,126 +30,121 @@ use error_stack::{IntoReport, Report, Result, ResultExt};
30
  /// reach the above `upstream search engine` page or if the `upstream search engine` is unable to
31
  /// provide results for the requested search query and also returns error if the scraping selector
32
  /// or HeaderMap fails to initialize.
33
- pub async fn results(
34
- query: &str,
35
- page: u32,
36
- user_agent: &str,
37
- ) -> Result<HashMap<String, RawSearchResult>, EngineError> {
38
- // Page number can be missing or empty string and so appropriate handling is required
39
- // so that upstream server receives valid page number.
40
- let url: String = match page {
41
- 1 => {
42
- format!("https://html.duckduckgo.com/html/?q={query}&s=&dc=&v=1&o=json&api=/d.js")
43
- }
44
- _ => {
45
- format!(
46
- "https://duckduckgo.com/html/?q={}&s={}&dc={}&v=1&o=json&api=/d.js",
47
- query,
48
- (page / 2 + (page % 2)) * 30,
49
- (page / 2 + (page % 2)) * 30 + 1
50
- )
51
- }
52
- };
53
 
54
- // initializing HeaderMap and adding appropriate headers.
55
- let mut header_map = HeaderMap::new();
56
- header_map.insert(
57
- USER_AGENT,
58
- user_agent
59
- .parse()
60
- .into_report()
61
- .change_context(EngineError::UnexpectedError)?,
62
- );
63
- header_map.insert(
64
- REFERER,
65
- "https://google.com/"
66
- .parse()
67
- .into_report()
68
- .change_context(EngineError::UnexpectedError)?,
69
- );
70
- header_map.insert(
71
- CONTENT_TYPE,
72
- "application/x-www-form-urlencoded"
73
- .parse()
74
- .into_report()
75
- .change_context(EngineError::UnexpectedError)?,
76
- );
77
- header_map.insert(
78
- COOKIE,
79
- "kl=wt-wt"
80
- .parse()
81
- .into_report()
82
- .change_context(EngineError::UnexpectedError)?,
83
- );
84
 
85
- // fetch the html from upstream duckduckgo engine
86
- let results: String = reqwest::Client::new()
87
- .get(url)
88
- .timeout(Duration::from_secs(5))
89
- .headers(header_map) // add spoofed headers to emulate human behaviour
90
- .send()
91
- .await
92
- .into_report()
93
- .change_context(EngineError::RequestError)?
94
- .text()
95
- .await
96
- .into_report()
97
- .change_context(EngineError::RequestError)?;
 
 
 
 
 
 
 
 
 
 
98
 
99
- let document: Html = Html::parse_document(&results);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
100
 
101
- let no_result: Selector = Selector::parse(".no-results")
102
- .map_err(|_| Report::new(EngineError::UnexpectedError))
103
- .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".no-results"))?;
104
 
105
- if document.select(&no_result).next().is_some() {
106
- return Err(Report::new(EngineError::EmptyResultSet));
107
- }
108
 
109
- let results: Selector = Selector::parse(".result")
110
- .map_err(|_| Report::new(EngineError::UnexpectedError))
111
- .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?;
112
- let result_title: Selector = Selector::parse(".result__a")
113
- .map_err(|_| Report::new(EngineError::UnexpectedError))
114
- .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__a"))?;
115
- let result_url: Selector = Selector::parse(".result__url")
116
- .map_err(|_| Report::new(EngineError::UnexpectedError))
117
- .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__url"))?;
118
- let result_desc: Selector = Selector::parse(".result__snippet")
119
- .map_err(|_| Report::new(EngineError::UnexpectedError))
120
- .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__snippet"))?;
121
 
122
- // scrape all the results from the html
123
- Ok(document
124
- .select(&results)
125
- .map(|result| {
126
- RawSearchResult::new(
127
- result
128
- .select(&result_title)
129
- .next()
130
- .unwrap()
131
- .inner_html()
132
- .trim()
133
- .to_string(),
134
- format!(
135
- "https://{}",
 
 
 
 
136
  result
137
- .select(&result_url)
138
  .next()
139
  .unwrap()
140
  .inner_html()
141
  .trim()
142
- ),
143
- result
144
- .select(&result_desc)
145
- .next()
146
- .unwrap()
147
- .inner_html()
148
- .trim()
149
- .to_string(),
150
- vec!["duckduckgo".to_string()],
151
- )
152
- })
153
- .map(|search_result| (search_result.visiting_url.clone(), search_result))
154
- .collect())
 
 
 
 
 
 
 
 
 
 
155
  }
 
9
 
10
  use crate::search_results_handler::aggregation_models::RawSearchResult;
11
 
12
+ use super::engine_models::{EngineError, SearchEngine};
13
 
14
  use error_stack::{IntoReport, Report, Result, ResultExt};
15
 
 
30
  /// reach the above `upstream search engine` page or if the `upstream search engine` is unable to
31
  /// provide results for the requested search query and also returns error if the scraping selector
32
  /// or HeaderMap fails to initialize.
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
+ pub struct DuckDuckGo;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
35
 
36
+ #[async_trait::async_trait]
37
+ impl SearchEngine for DuckDuckGo {
38
+ async fn results(
39
+ &self,
40
+ query: String,
41
+ page: u32,
42
+ user_agent: String,
43
+ ) -> Result<HashMap<String, RawSearchResult>, EngineError> {
44
+ // Page number can be missing or empty string and so appropriate handling is required
45
+ // so that upstream server receives valid page number.
46
+ let url: String = match page {
47
+ 1 => {
48
+ format!("https://html.duckduckgo.com/html/?q={query}&s=&dc=&v=1&o=json&api=/d.js")
49
+ }
50
+ _ => {
51
+ format!(
52
+ "https://duckduckgo.com/html/?q={}&s={}&dc={}&v=1&o=json&api=/d.js",
53
+ query,
54
+ (page / 2 + (page % 2)) * 30,
55
+ (page / 2 + (page % 2)) * 30 + 1
56
+ )
57
+ }
58
+ };
59
 
60
+ // initializing HeaderMap and adding appropriate headers.
61
+ let mut header_map = HeaderMap::new();
62
+ header_map.insert(
63
+ USER_AGENT,
64
+ user_agent
65
+ .parse()
66
+ .into_report()
67
+ .change_context(EngineError::UnexpectedError)?,
68
+ );
69
+ header_map.insert(
70
+ REFERER,
71
+ "https://google.com/"
72
+ .parse()
73
+ .into_report()
74
+ .change_context(EngineError::UnexpectedError)?,
75
+ );
76
+ header_map.insert(
77
+ CONTENT_TYPE,
78
+ "application/x-www-form-urlencoded"
79
+ .parse()
80
+ .into_report()
81
+ .change_context(EngineError::UnexpectedError)?,
82
+ );
83
+ header_map.insert(
84
+ COOKIE,
85
+ "kl=wt-wt"
86
+ .parse()
87
+ .into_report()
88
+ .change_context(EngineError::UnexpectedError)?,
89
+ );
90
 
91
+ let document: Html = Html::parse_document(
92
+ &DuckDuckGo::fetch_html_from_upstream(&self, url, header_map).await?,
93
+ );
94
 
95
+ let no_result: Selector = Selector::parse(".no-results")
96
+ .map_err(|_| Report::new(EngineError::UnexpectedError))
97
+ .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".no-results"))?;
98
 
99
+ if document.select(&no_result).next().is_some() {
100
+ return Err(Report::new(EngineError::EmptyResultSet));
101
+ }
 
 
 
 
 
 
 
 
 
102
 
103
+ let results: Selector = Selector::parse(".result")
104
+ .map_err(|_| Report::new(EngineError::UnexpectedError))
105
+ .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?;
106
+ let result_title: Selector = Selector::parse(".result__a")
107
+ .map_err(|_| Report::new(EngineError::UnexpectedError))
108
+ .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__a"))?;
109
+ let result_url: Selector = Selector::parse(".result__url")
110
+ .map_err(|_| Report::new(EngineError::UnexpectedError))
111
+ .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__url"))?;
112
+ let result_desc: Selector = Selector::parse(".result__snippet")
113
+ .map_err(|_| Report::new(EngineError::UnexpectedError))
114
+ .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__snippet"))?;
115
+
116
+ // scrape all the results from the html
117
+ Ok(document
118
+ .select(&results)
119
+ .map(|result| {
120
+ RawSearchResult::new(
121
  result
122
+ .select(&result_title)
123
  .next()
124
  .unwrap()
125
  .inner_html()
126
  .trim()
127
+ .to_string(),
128
+ format!(
129
+ "https://{}",
130
+ result
131
+ .select(&result_url)
132
+ .next()
133
+ .unwrap()
134
+ .inner_html()
135
+ .trim()
136
+ ),
137
+ result
138
+ .select(&result_desc)
139
+ .next()
140
+ .unwrap()
141
+ .inner_html()
142
+ .trim()
143
+ .to_string(),
144
+ vec!["duckduckgo".to_string()],
145
+ )
146
+ })
147
+ .map(|search_result| (search_result.visiting_url.clone(), search_result))
148
+ .collect())
149
+ }
150
  }
src/engines/searx.rs CHANGED
@@ -8,7 +8,7 @@ use std::collections::HashMap;
8
 
9
  use crate::search_results_handler::aggregation_models::RawSearchResult;
10
 
11
- use super::engine_models::EngineError;
12
  use error_stack::{IntoReport, Report, Result, ResultExt};
13
 
14
  /// This function scrapes results from the upstream engine searx and puts all the scraped
@@ -28,111 +28,108 @@ use error_stack::{IntoReport, Report, Result, ResultExt};
28
  /// reach the above `upstream search engine` page or if the `upstream search engine` is unable to
29
  /// provide results for the requested search query and also returns error if the scraping selector
30
  /// or HeaderMap fails to initialize.
31
- pub async fn results(
32
- query: &str,
33
- page: u32,
34
- user_agent: &str,
35
- ) -> Result<HashMap<String, RawSearchResult>, EngineError> {
36
- // Page number can be missing or empty string and so appropriate handling is required
37
- // so that upstream server receives valid page number.
38
- let url: String = format!("https://searx.work/search?q={query}&pageno={page}");
39
 
40
- // initializing headers and adding appropriate headers.
41
- let mut header_map = HeaderMap::new();
42
- header_map.insert(
43
- USER_AGENT,
44
- user_agent
45
- .parse()
46
- .into_report()
47
- .change_context(EngineError::UnexpectedError)?,
48
- );
49
- header_map.insert(
50
- REFERER,
51
- "https://google.com/"
52
- .parse()
53
- .into_report()
54
- .change_context(EngineError::UnexpectedError)?,
55
- );
56
- header_map.insert(
57
- CONTENT_TYPE,
58
- "application/x-www-form-urlencoded"
59
- .parse()
60
- .into_report()
61
- .change_context(EngineError::UnexpectedError)?,
62
- );
63
- header_map.insert(COOKIE, "categories=general; language=auto; locale=en; autocomplete=duckduckgo; image_proxy=1; method=POST; safesearch=2; theme=simple; results_on_new_tab=1; doi_resolver=oadoi.org; simple_style=auto; center_alignment=1; query_in_title=1; infinite_scroll=0; disabled_engines=; enabled_engines=\"archive is__general\\054yep__general\\054curlie__general\\054currency__general\\054ddg definitions__general\\054wikidata__general\\054duckduckgo__general\\054tineye__general\\054lingva__general\\054startpage__general\\054yahoo__general\\054wiby__general\\054marginalia__general\\054alexandria__general\\054wikibooks__general\\054wikiquote__general\\054wikisource__general\\054wikiversity__general\\054wikivoyage__general\\054dictzone__general\\054seznam__general\\054mojeek__general\\054naver__general\\054wikimini__general\\054brave__general\\054petalsearch__general\\054goo__general\"; disabled_plugins=; enabled_plugins=\"searx.plugins.hostname_replace\\054searx.plugins.oa_doi_rewrite\\054searx.plugins.vim_hotkeys\"; tokens=; maintab=on; enginetab=on".parse().into_report().change_context(EngineError::UnexpectedError)?);
64
 
65
- // fetch the html from upstream searx instance engine
66
- let results: String = reqwest::Client::new()
67
- .get(url)
68
- .headers(header_map) // add spoofed headers to emulate human behaviours.
69
- .send()
70
- .await
71
- .into_report()
72
- .change_context(EngineError::RequestError)?
73
- .text()
74
- .await
75
- .into_report()
76
- .change_context(EngineError::RequestError)?;
77
 
78
- let document: Html = Html::parse_document(&results);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
79
 
80
- let no_result: Selector = Selector::parse("#urls>.dialog-error>p")
81
- .map_err(|_| Report::new(EngineError::UnexpectedError))
82
- .attach_printable_lazy(|| format!("invalid CSS selector: {}", "#urls>.dialog-error>p"))?;
83
 
84
- if let Some(no_result_msg) = document.select(&no_result).nth(1) {
85
- if no_result_msg.inner_html()
 
 
 
 
 
 
86
  == "we didn't find any results. Please use another query or search in more categories"
87
  {
88
  return Err(Report::new(EngineError::EmptyResultSet));
89
  }
90
- }
91
 
92
- let results: Selector = Selector::parse(".result")
93
- .map_err(|_| Report::new(EngineError::UnexpectedError))
94
- .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?;
95
- let result_title: Selector = Selector::parse("h3>a")
96
- .map_err(|_| Report::new(EngineError::UnexpectedError))
97
- .attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?;
98
- let result_url: Selector = Selector::parse("h3>a")
99
- .map_err(|_| Report::new(EngineError::UnexpectedError))
100
- .attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?;
101
 
102
- let result_desc: Selector = Selector::parse(".content")
103
- .map_err(|_| Report::new(EngineError::UnexpectedError))
104
- .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".content"))?;
105
 
106
- // scrape all the results from the html
107
- Ok(document
108
- .select(&results)
109
- .map(|result| {
110
- RawSearchResult::new(
111
- result
112
- .select(&result_title)
113
- .next()
114
- .unwrap()
115
- .inner_html()
116
- .trim()
117
- .to_string(),
118
- result
119
- .select(&result_url)
120
- .next()
121
- .unwrap()
122
- .value()
123
- .attr("href")
124
- .unwrap()
125
- .to_string(),
126
- result
127
- .select(&result_desc)
128
- .next()
129
- .unwrap()
130
- .inner_html()
131
- .trim()
132
- .to_string(),
133
- vec!["searx".to_string()],
134
- )
135
- })
136
- .map(|search_result| (search_result.visiting_url.clone(), search_result))
137
- .collect())
 
138
  }
 
8
 
9
  use crate::search_results_handler::aggregation_models::RawSearchResult;
10
 
11
+ use super::engine_models::{EngineError, SearchEngine};
12
  use error_stack::{IntoReport, Report, Result, ResultExt};
13
 
14
  /// This function scrapes results from the upstream engine searx and puts all the scraped
 
28
  /// reach the above `upstream search engine` page or if the `upstream search engine` is unable to
29
  /// provide results for the requested search query and also returns error if the scraping selector
30
  /// or HeaderMap fails to initialize.
 
 
 
 
 
 
 
 
31
 
32
+ pub struct Searx;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
+ #[async_trait::async_trait]
35
+ impl SearchEngine for Searx {
36
+ async fn results(
37
+ &self,
38
+ query: String,
39
+ page: u32,
40
+ user_agent: String,
41
+ ) -> Result<HashMap<String, RawSearchResult>, EngineError> {
42
+ // Page number can be missing or empty string and so appropriate handling is required
43
+ // so that upstream server receives valid page number.
44
+ let url: String = format!("https://searx.work/search?q={query}&pageno={page}");
 
45
 
46
+ // initializing headers and adding appropriate headers.
47
+ let mut header_map = HeaderMap::new();
48
+ header_map.insert(
49
+ USER_AGENT,
50
+ user_agent
51
+ .parse()
52
+ .into_report()
53
+ .change_context(EngineError::UnexpectedError)?,
54
+ );
55
+ header_map.insert(
56
+ REFERER,
57
+ "https://google.com/"
58
+ .parse()
59
+ .into_report()
60
+ .change_context(EngineError::UnexpectedError)?,
61
+ );
62
+ header_map.insert(
63
+ CONTENT_TYPE,
64
+ "application/x-www-form-urlencoded"
65
+ .parse()
66
+ .into_report()
67
+ .change_context(EngineError::UnexpectedError)?,
68
+ );
69
+ header_map.insert(COOKIE, "categories=general; language=auto; locale=en; autocomplete=duckduckgo; image_proxy=1; method=POST; safesearch=2; theme=simple; results_on_new_tab=1; doi_resolver=oadoi.org; simple_style=auto; center_alignment=1; query_in_title=1; infinite_scroll=0; disabled_engines=; enabled_engines=\"archive is__general\\054yep__general\\054curlie__general\\054currency__general\\054ddg definitions__general\\054wikidata__general\\054duckduckgo__general\\054tineye__general\\054lingva__general\\054startpage__general\\054yahoo__general\\054wiby__general\\054marginalia__general\\054alexandria__general\\054wikibooks__general\\054wikiquote__general\\054wikisource__general\\054wikiversity__general\\054wikivoyage__general\\054dictzone__general\\054seznam__general\\054mojeek__general\\054naver__general\\054wikimini__general\\054brave__general\\054petalsearch__general\\054goo__general\"; disabled_plugins=; enabled_plugins=\"searx.plugins.hostname_replace\\054searx.plugins.oa_doi_rewrite\\054searx.plugins.vim_hotkeys\"; tokens=; maintab=on; enginetab=on".parse().into_report().change_context(EngineError::UnexpectedError)?);
70
 
71
+ let document: Html =
72
+ Html::parse_document(&Searx::fetch_html_from_upstream(&self, url, header_map).await?);
 
73
 
74
+ let no_result: Selector = Selector::parse("#urls>.dialog-error>p")
75
+ .map_err(|_| Report::new(EngineError::UnexpectedError))
76
+ .attach_printable_lazy(|| {
77
+ format!("invalid CSS selector: {}", "#urls>.dialog-error>p")
78
+ })?;
79
+
80
+ if let Some(no_result_msg) = document.select(&no_result).nth(1) {
81
+ if no_result_msg.inner_html()
82
  == "we didn't find any results. Please use another query or search in more categories"
83
  {
84
  return Err(Report::new(EngineError::EmptyResultSet));
85
  }
86
+ }
87
 
88
+ let results: Selector = Selector::parse(".result")
89
+ .map_err(|_| Report::new(EngineError::UnexpectedError))
90
+ .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?;
91
+ let result_title: Selector = Selector::parse("h3>a")
92
+ .map_err(|_| Report::new(EngineError::UnexpectedError))
93
+ .attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?;
94
+ let result_url: Selector = Selector::parse("h3>a")
95
+ .map_err(|_| Report::new(EngineError::UnexpectedError))
96
+ .attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?;
97
 
98
+ let result_desc: Selector = Selector::parse(".content")
99
+ .map_err(|_| Report::new(EngineError::UnexpectedError))
100
+ .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".content"))?;
101
 
102
+ // scrape all the results from the html
103
+ Ok(document
104
+ .select(&results)
105
+ .map(|result| {
106
+ RawSearchResult::new(
107
+ result
108
+ .select(&result_title)
109
+ .next()
110
+ .unwrap()
111
+ .inner_html()
112
+ .trim()
113
+ .to_string(),
114
+ result
115
+ .select(&result_url)
116
+ .next()
117
+ .unwrap()
118
+ .value()
119
+ .attr("href")
120
+ .unwrap()
121
+ .to_string(),
122
+ result
123
+ .select(&result_desc)
124
+ .next()
125
+ .unwrap()
126
+ .inner_html()
127
+ .trim()
128
+ .to_string(),
129
+ vec!["searx".to_string()],
130
+ )
131
+ })
132
+ .map(|search_result| (search_result.visiting_url.clone(), search_result))
133
+ .collect())
134
+ }
135
  }