neon_arch committed on
Commit
f5f0488
1 Parent(s): 4ccd048

⚙️ refactor: add several optimizations to the engine code (#180) (#178)

Browse files
Files changed (2) hide show
  1. src/engines/duckduckgo.rs +20 -40
  2. src/engines/searx.rs +16 -35
src/engines/duckduckgo.rs CHANGED
@@ -4,14 +4,14 @@
4
 
5
  use std::collections::HashMap;
6
 
7
- use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT};
8
  use scraper::{Html, Selector};
9
 
10
  use crate::results::aggregation_models::SearchResult;
11
 
12
  use super::engine_models::{EngineError, SearchEngine};
13
 
14
- use error_stack::{IntoReport, Report, Result, ResultExt};
15
 
16
  /// A new DuckDuckGo engine type defined in-order to implement the `SearchEngine` trait which allows to
17
  /// reduce code duplication as well as allows to create vector of different search engines easily.
@@ -39,9 +39,9 @@ impl SearchEngine for DuckDuckGo {
39
  /// or HeaderMap fails to initialize.
40
  async fn results(
41
  &self,
42
- query: String,
43
  page: u32,
44
- user_agent: String,
45
  request_timeout: u8,
46
  ) -> Result<HashMap<String, SearchResult>, EngineError> {
47
  // Page number can be missing or empty string and so appropriate handling is required
@@ -61,38 +61,19 @@ impl SearchEngine for DuckDuckGo {
61
  };
62
 
63
  // initializing HeaderMap and adding appropriate headers.
64
- let mut header_map = HeaderMap::new();
65
- header_map.insert(
66
- USER_AGENT,
67
- user_agent
68
- .parse()
69
- .into_report()
70
- .change_context(EngineError::UnexpectedError)?,
71
- );
72
- header_map.insert(
73
- REFERER,
74
- "https://google.com/"
75
- .parse()
76
- .into_report()
77
- .change_context(EngineError::UnexpectedError)?,
78
- );
79
- header_map.insert(
80
- CONTENT_TYPE,
81
- "application/x-www-form-urlencoded"
82
- .parse()
83
- .into_report()
84
- .change_context(EngineError::UnexpectedError)?,
85
- );
86
- header_map.insert(
87
- COOKIE,
88
- "kl=wt-wt"
89
- .parse()
90
- .into_report()
91
- .change_context(EngineError::UnexpectedError)?,
92
- );
93
 
94
  let document: Html = Html::parse_document(
95
- &DuckDuckGo::fetch_html_from_upstream(self, url, header_map, request_timeout).await?,
96
  );
97
 
98
  let no_result: Selector = Selector::parse(".no-results")
@@ -126,8 +107,7 @@ impl SearchEngine for DuckDuckGo {
126
  .next()
127
  .unwrap()
128
  .inner_html()
129
- .trim()
130
- .to_string(),
131
  format!(
132
  "https://{}",
133
  result
@@ -136,15 +116,15 @@ impl SearchEngine for DuckDuckGo {
136
  .unwrap()
137
  .inner_html()
138
  .trim()
139
- ),
 
140
  result
141
  .select(&result_desc)
142
  .next()
143
  .unwrap()
144
  .inner_html()
145
- .trim()
146
- .to_string(),
147
- vec!["duckduckgo".to_string()],
148
  )
149
  })
150
  .map(|search_result| (search_result.url.clone(), search_result))
 
4
 
5
  use std::collections::HashMap;
6
 
7
+ use reqwest::header::HeaderMap;
8
  use scraper::{Html, Selector};
9
 
10
  use crate::results::aggregation_models::SearchResult;
11
 
12
  use super::engine_models::{EngineError, SearchEngine};
13
 
14
+ use error_stack::{Report, Result, ResultExt};
15
 
16
  /// A new DuckDuckGo engine type defined in-order to implement the `SearchEngine` trait which allows to
17
  /// reduce code duplication as well as allows to create vector of different search engines easily.
 
39
  /// or HeaderMap fails to initialize.
40
  async fn results(
41
  &self,
42
+ query: &str,
43
  page: u32,
44
+ user_agent: &str,
45
  request_timeout: u8,
46
  ) -> Result<HashMap<String, SearchResult>, EngineError> {
47
  // Page number can be missing or empty string and so appropriate handling is required
 
61
  };
62
 
63
  // initializing HeaderMap and adding appropriate headers.
64
+ let header_map = HeaderMap::try_from(&HashMap::from([
65
+ ("USER_AGENT".to_string(), user_agent.to_string()),
66
+ ("REFERER".to_string(), "https://google.com/".to_string()),
67
+ (
68
+ "CONTENT_TYPE".to_string(),
69
+ "application/x-www-form-urlencoded".to_string(),
70
+ ),
71
+ ("COOKIE".to_string(), "kl=wt-wt".to_string()),
72
+ ]))
73
+ .change_context(EngineError::UnexpectedError)?;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
74
 
75
  let document: Html = Html::parse_document(
76
+ &DuckDuckGo::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
77
  );
78
 
79
  let no_result: Selector = Selector::parse(".no-results")
 
107
  .next()
108
  .unwrap()
109
  .inner_html()
110
+ .trim(),
 
111
  format!(
112
  "https://{}",
113
  result
 
116
  .unwrap()
117
  .inner_html()
118
  .trim()
119
+ )
120
+ .as_str(),
121
  result
122
  .select(&result_desc)
123
  .next()
124
  .unwrap()
125
  .inner_html()
126
+ .trim(),
127
+ &["duckduckgo"],
 
128
  )
129
  })
130
  .map(|search_result| (search_result.url.clone(), search_result))
src/engines/searx.rs CHANGED
@@ -2,14 +2,14 @@
2
  //! by querying the upstream searx search engine instance with user provided query and with a page
3
  //! number if provided.
4
 
5
- use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT};
6
  use scraper::{Html, Selector};
7
  use std::collections::HashMap;
8
 
9
  use crate::results::aggregation_models::SearchResult;
10
 
11
  use super::engine_models::{EngineError, SearchEngine};
12
- use error_stack::{IntoReport, Report, Result, ResultExt};
13
 
14
  /// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to
15
  /// reduce code duplication as well as allows to create vector of different search engines easily.
@@ -38,9 +38,9 @@ impl SearchEngine for Searx {
38
 
39
  async fn results(
40
  &self,
41
- query: String,
42
  page: u32,
43
- user_agent: String,
44
  request_timeout: u8,
45
  ) -> Result<HashMap<String, SearchResult>, EngineError> {
46
  // Page number can be missing or empty string and so appropriate handling is required
@@ -51,32 +51,16 @@ impl SearchEngine for Searx {
51
  };
52
 
53
  // initializing headers and adding appropriate headers.
54
- let mut header_map = HeaderMap::new();
55
- header_map.insert(
56
- USER_AGENT,
57
- user_agent
58
- .parse()
59
- .into_report()
60
- .change_context(EngineError::UnexpectedError)?,
61
- );
62
- header_map.insert(
63
- REFERER,
64
- "https://google.com/"
65
- .parse()
66
- .into_report()
67
- .change_context(EngineError::UnexpectedError)?,
68
- );
69
- header_map.insert(
70
- CONTENT_TYPE,
71
- "application/x-www-form-urlencoded"
72
- .parse()
73
- .into_report()
74
- .change_context(EngineError::UnexpectedError)?,
75
- );
76
- header_map.insert(COOKIE, "categories=general; language=auto; locale=en; autocomplete=duckduckgo; image_proxy=1; method=POST; safesearch=2; theme=simple; results_on_new_tab=1; doi_resolver=oadoi.org; simple_style=auto; center_alignment=1; query_in_title=1; infinite_scroll=0; disabled_engines=; enabled_engines=\"archive is__general\\054yep__general\\054curlie__general\\054currency__general\\054ddg definitions__general\\054wikidata__general\\054duckduckgo__general\\054tineye__general\\054lingva__general\\054startpage__general\\054yahoo__general\\054wiby__general\\054marginalia__general\\054alexandria__general\\054wikibooks__general\\054wikiquote__general\\054wikisource__general\\054wikiversity__general\\054wikivoyage__general\\054dictzone__general\\054seznam__general\\054mojeek__general\\054naver__general\\054wikimini__general\\054brave__general\\054petalsearch__general\\054goo__general\"; disabled_plugins=; enabled_plugins=\"searx.plugins.hostname_replace\\054searx.plugins.oa_doi_rewrite\\054searx.plugins.vim_hotkeys\"; tokens=; maintab=on; enginetab=on".parse().into_report().change_context(EngineError::UnexpectedError)?);
77
 
78
  let document: Html = Html::parse_document(
79
- &Searx::fetch_html_from_upstream(self, url, header_map, request_timeout).await?,
80
  );
81
 
82
  let no_result: Selector = Selector::parse("#urls>.dialog-error>p")
@@ -117,24 +101,21 @@ impl SearchEngine for Searx {
117
  .next()
118
  .unwrap()
119
  .inner_html()
120
- .trim()
121
- .to_string(),
122
  result
123
  .select(&result_url)
124
  .next()
125
  .unwrap()
126
  .value()
127
  .attr("href")
128
- .unwrap()
129
- .to_string(),
130
  result
131
  .select(&result_desc)
132
  .next()
133
  .unwrap()
134
  .inner_html()
135
- .trim()
136
- .to_string(),
137
- vec!["searx".to_string()],
138
  )
139
  })
140
  .map(|search_result| (search_result.url.clone(), search_result))
 
2
  //! by querying the upstream searx search engine instance with user provided query and with a page
3
  //! number if provided.
4
 
5
+ use reqwest::header::HeaderMap;
6
  use scraper::{Html, Selector};
7
  use std::collections::HashMap;
8
 
9
  use crate::results::aggregation_models::SearchResult;
10
 
11
  use super::engine_models::{EngineError, SearchEngine};
12
+ use error_stack::{Report, Result, ResultExt};
13
 
14
  /// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to
15
  /// reduce code duplication as well as allows to create vector of different search engines easily.
 
38
 
39
  async fn results(
40
  &self,
41
+ query: &str,
42
  page: u32,
43
+ user_agent: &str,
44
  request_timeout: u8,
45
  ) -> Result<HashMap<String, SearchResult>, EngineError> {
46
  // Page number can be missing or empty string and so appropriate handling is required
 
51
  };
52
 
53
  // initializing headers and adding appropriate headers.
54
+ let header_map = HeaderMap::try_from(&HashMap::from([
55
+ ("USER_AGENT".to_string(), user_agent.to_string()),
56
+ ("REFERER".to_string(), "https://google.com/".to_string()),
57
+ ("CONTENT_TYPE".to_string(), "application/x-www-form-urlencoded".to_string()),
58
+ ("COOKIE".to_string(), "categories=general; language=auto; locale=en; autocomplete=duckduckgo; image_proxy=1; method=POST; safesearch=2; theme=simple; results_on_new_tab=1; doi_resolver=oadoi.org; simple_style=auto; center_alignment=1; query_in_title=1; infinite_scroll=0; disabled_engines=; enabled_engines=\"archive is__general\\054yep__general\\054curlie__general\\054currency__general\\054ddg definitions__general\\054wikidata__general\\054duckduckgo__general\\054tineye__general\\054lingva__general\\054startpage__general\\054yahoo__general\\054wiby__general\\054marginalia__general\\054alexandria__general\\054wikibooks__general\\054wikiquote__general\\054wikisource__general\\054wikiversity__general\\054wikivoyage__general\\054dictzone__general\\054seznam__general\\054mojeek__general\\054naver__general\\054wikimini__general\\054brave__general\\054petalsearch__general\\054goo__general\"; disabled_plugins=; enabled_plugins=\"searx.plugins.hostname_replace\\054searx.plugins.oa_doi_rewrite\\054searx.plugins.vim_hotkeys\"; tokens=; maintab=on; enginetab=on".to_string())
59
+ ]))
60
+ .change_context(EngineError::UnexpectedError)?;
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
61
 
62
  let document: Html = Html::parse_document(
63
+ &Searx::fetch_html_from_upstream(self, &url, header_map, request_timeout).await?,
64
  );
65
 
66
  let no_result: Selector = Selector::parse("#urls>.dialog-error>p")
 
101
  .next()
102
  .unwrap()
103
  .inner_html()
104
+ .trim(),
 
105
  result
106
  .select(&result_url)
107
  .next()
108
  .unwrap()
109
  .value()
110
  .attr("href")
111
+ .unwrap(),
 
112
  result
113
  .select(&result_desc)
114
  .next()
115
  .unwrap()
116
  .inner_html()
117
+ .trim(),
118
+ &["searx"],
 
119
  )
120
  })
121
  .map(|search_result| (search_result.url.clone(), search_result))