Milim committed
Commit 5aca5c0
1 Parent(s): 15dfda6

Improve aggregation


Adds the EngineHandler struct.
Removes a vulnerability where an attacker could send request cookies with fake engine names and crash the server.
Merges RawSearchResult and SearchResult, as they were functionally identical.

src/config/parser.rs CHANGED
@@ -34,7 +34,7 @@ pub struct Config {
     pub aggregator: AggregatorConfig,
     pub logging: bool,
     pub debug: bool,
-    pub upstream_search_engines: Vec<String>,
+    pub upstream_search_engines: Vec<crate::engines::engine_models::EngineHandler>,
     pub request_timeout: u8,
     pub threads: u8,
 }
@@ -107,6 +107,7 @@ impl Config {
                 .get::<_, HashMap<String, bool>>("upstream_search_engines")?
                 .into_iter()
                 .filter_map(|(key, value)| value.then_some(key))
+                .filter_map(|engine| crate::engines::engine_models::EngineHandler::new(&engine))
                 .collect(),
             request_timeout: globals.get::<_, u8>("request_timeout")?,
             threads,
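
The two-stage filter is worth spelling out, since the second stage is where unknown engine names now silently disappear. A minimal standalone sketch of the first stage (toggle values illustrative, not from a real config):

use std::collections::HashMap;

fn main() {
    // Stand-in for the `upstream_search_engines` table from the config.
    let toggles: HashMap<String, bool> = HashMap::from([
        ("duckduckgo".to_string(), true),
        ("searx".to_string(), false),
    ]);

    // Stage 1: `value.then_some(key)` yields `Some(key)` only for
    // enabled engines. (Stage 2 in the parser then feeds each name
    // through `EngineHandler::new`, dropping unknown engines.)
    let enabled: Vec<String> = toggles
        .into_iter()
        .filter_map(|(key, value)| value.then_some(key))
        .collect();

    assert_eq!(enabled, vec!["duckduckgo".to_string()]);
}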
src/engines/duckduckgo.rs CHANGED
@@ -7,7 +7,7 @@ use std::collections::HashMap;
 use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT};
 use scraper::{Html, Selector};
 
-use crate::results::aggregation_models::RawSearchResult;
+use crate::results::aggregation_models::SearchResult;
 
 use super::engine_models::{EngineError, SearchEngine};
 
@@ -43,7 +43,7 @@ impl SearchEngine for DuckDuckGo {
         page: u32,
         user_agent: String,
         request_timeout: u8,
-    ) -> Result<HashMap<String, RawSearchResult>, EngineError> {
+    ) -> Result<HashMap<String, SearchResult>, EngineError> {
         // Page number can be missing or empty string and so appropriate handling is required
         // so that upstream server recieves valid page number.
         let url: String = match page {
@@ -120,7 +120,7 @@ impl SearchEngine for DuckDuckGo {
         Ok(document
             .select(&results)
             .map(|result| {
-                RawSearchResult::new(
+                SearchResult::new(
                     result
                         .select(&result_title)
                         .next()
src/engines/engine_models.rs CHANGED
@@ -1,7 +1,7 @@
 //! This module provides the error enum to handle different errors associated while requesting data from
 //! the upstream search engines with the search query provided by the user.
 
-use crate::results::aggregation_models::RawSearchResult;
+use crate::results::aggregation_models::SearchResult;
 use error_stack::{IntoReport, Result, ResultExt};
 use std::{collections::HashMap, fmt, time::Duration};
 
@@ -45,7 +45,7 @@ impl error_stack::Context for EngineError {}
 
 /// A trait to define common behavior for all search engines.
 #[async_trait::async_trait]
-pub trait SearchEngine {
+pub trait SearchEngine: Sync + Send {
     async fn fetch_html_from_upstream(
         &self,
         url: String,
@@ -73,5 +73,37 @@ pub trait SearchEngine {
         page: u32,
         user_agent: String,
         request_timeout: u8,
-    ) -> Result<HashMap<String, RawSearchResult>, EngineError>;
+    ) -> Result<HashMap<String, SearchResult>, EngineError>;
+}
+
+pub struct EngineHandler {
+    engine: Box<dyn SearchEngine>,
+    name: &'static str,
+}
+
+impl Clone for EngineHandler {
+    fn clone(&self) -> Self {
+        Self::new(self.name).unwrap()
+    }
+}
+
+impl EngineHandler {
+    /// parses an engine name into an engine handler, returns none if the engine is unknown
+    pub fn new(engine_name: &str) -> Option<Self> {
+        let engine: (&'static str, Box<dyn SearchEngine>) =
+            match engine_name.to_lowercase().as_str() {
+                "duckduckgo" => ("duckduckgo", Box::new(super::duckduckgo::DuckDuckGo)),
+                "searx" => ("searx", Box::new(super::searx::Searx)),
+                _ => return None,
+            };
+
+        Some(Self {
+            engine: engine.1,
+            name: engine.0,
+        })
+    }
+
+    pub fn into_name_engine(self) -> (&'static str, Box<dyn SearchEngine>) {
+        (self.name, self.engine)
+    }
 }
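
For illustration, a hypothetical unit test for the new struct (not part of the commit; it would sit next to the definitions above). It also shows why the manual `Clone` impl works: a `Box<dyn SearchEngine>` cannot itself be cloned, but every handler is reconstructible from its validated name, so the `unwrap` in `clone` cannot fire:

#[test]
fn engine_handler_roundtrip() {
    // Matching is case-insensitive, so "DuckDuckGo" parses too.
    let handler = EngineHandler::new("DuckDuckGo").expect("known engine");

    // `Clone` re-runs `new` on the stored canonical name.
    let _backup = handler.clone();

    // The aggregator consumes a handler to get at both pieces:
    let (name, _engine) = handler.into_name_engine();
    assert_eq!(name, "duckduckgo");

    // Unknown names are rejected with `None` instead of a panic:
    assert!(EngineHandler::new("not_a_real_engine").is_none());
}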
src/engines/searx.rs CHANGED
@@ -6,7 +6,7 @@ use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT};
 use scraper::{Html, Selector};
 use std::collections::HashMap;
 
-use crate::results::aggregation_models::RawSearchResult;
+use crate::results::aggregation_models::SearchResult;
 
 use super::engine_models::{EngineError, SearchEngine};
 use error_stack::{IntoReport, Report, Result, ResultExt};
@@ -42,7 +42,7 @@ impl SearchEngine for Searx {
         page: u32,
         user_agent: String,
         request_timeout: u8,
-    ) -> Result<HashMap<String, RawSearchResult>, EngineError> {
+    ) -> Result<HashMap<String, SearchResult>, EngineError> {
         // Page number can be missing or empty string and so appropriate handling is required
         // so that upstream server recieves valid page number.
         let url: String = match page {
@@ -111,7 +111,7 @@ impl SearchEngine for Searx {
         Ok(document
             .select(&results)
             .map(|result| {
-                RawSearchResult::new(
+                SearchResult::new(
                     result
                         .select(&result_title)
                         .next()
src/results/aggregation_models.rs CHANGED
@@ -5,54 +5,6 @@ use serde::{Deserialize, Serialize};
 
 use crate::{config::parser_models::Style, engines::engine_models::EngineError};
 
-/// A named struct to store, serialize and deserializes the individual search result from all the
-/// scraped and aggregated search results from the upstream search engines.
-///
-/// # Fields
-///
-/// * `title` - The title of the search result.
-/// * `url` - The url to be displayed below the search result title in html.
-/// * `description` - The description of the search result.
-/// * `engine` - The names of the upstream engines from which this results were provided.
-#[derive(Serialize, Deserialize)]
-#[serde(rename_all = "camelCase")]
-pub struct SearchResult {
-    pub title: String,
-    pub url: String,
-    pub description: String,
-    pub engine: Vec<String>,
-}
-
-impl SearchResult {
-    /// Constructs a new `SearchResult` with the given arguments needed for the struct.
-    ///
-    /// # Arguments
-    ///
-    /// * `title` - The title of the search result.
-    /// * `visiting_url` - The url which is accessed when clicked on it
-    /// (href url in html in simple words).
-    /// * `url` - The url to be displayed below the search result title in html.
-    /// * `description` - The description of the search result.
-    /// * `engine` - The names of the upstream engines from which this results were provided.
-    pub fn new(title: String, url: String, description: String, engine: Vec<String>) -> Self {
-        SearchResult {
-            title,
-            url,
-            description,
-            engine,
-        }
-    }
-
-    pub fn from_raw(raw: RawSearchResult) -> Self {
-        SearchResult {
-            title: raw.title,
-            url: raw.url,
-            description: raw.description,
-            engine: raw.engine,
-        }
-    }
-}
-
 /// A named struct to store the raw scraped search results scraped search results from the
 /// upstream search engines before aggregating it.It derives the Clone trait which is needed
 /// to write idiomatic rust using `Iterators`.
@@ -64,15 +16,16 @@ impl SearchResult {
 /// (href url in html in simple words).
 /// * `description` - The description of the search result.
 /// * `engine` - The names of the upstream engines from which this results were provided.
-#[derive(Clone)]
-pub struct RawSearchResult {
+#[derive(Clone, Serialize, Deserialize)]
+#[serde(rename_all = "camelCase")]
+pub struct SearchResult {
     pub title: String,
     pub url: String,
     pub description: String,
    pub engine: Vec<String>,
 }
 
-impl RawSearchResult {
+impl SearchResult {
     /// Constructs a new `RawSearchResult` with the given arguments needed for the struct.
     ///
     /// # Arguments
@@ -83,7 +36,7 @@ impl RawSearchResult {
     /// * `description` - The description of the search result.
     /// * `engine` - The names of the upstream engines from which this results were provided.
     pub fn new(title: String, url: String, description: String, engine: Vec<String>) -> Self {
-        RawSearchResult {
+        SearchResult {
            title,
            url,
            description,
src/results/aggregator.rs CHANGED
@@ -8,18 +8,14 @@ use rand::Rng;
 use tokio::task::JoinHandle;
 
 use super::{
-    aggregation_models::{EngineErrorInfo, RawSearchResult, SearchResult, SearchResults},
+    aggregation_models::{EngineErrorInfo, SearchResult, SearchResults},
     user_agent::random_user_agent,
 };
 
-use crate::engines::{
-    duckduckgo,
-    engine_models::{EngineError, SearchEngine},
-    searx,
-};
+use crate::engines::engine_models::{EngineError, EngineHandler};
 
 /// Aliases for long type annotations
-type FutureVec = Vec<JoinHandle<Result<HashMap<String, RawSearchResult>, Report<EngineError>>>>;
+type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<EngineError>>>>;
 
 /// The function aggregates the scraped results from the user-selected upstream search engines.
 /// These engines can be chosen either from the user interface (UI) or from the configuration file.
@@ -64,7 +60,7 @@ pub async fn aggregate(
     page: u32,
     random_delay: bool,
     debug: bool,
-    mut upstream_search_engines: Vec<String>,
+    upstream_search_engines: Vec<EngineHandler>,
     request_timeout: u8,
 ) -> Result<SearchResults, Box<dyn std::error::Error>> {
     let user_agent: String = random_user_agent();
@@ -76,24 +72,22 @@ pub async fn aggregate(
         tokio::time::sleep(Duration::from_secs(delay_secs)).await;
     }
 
+    let mut names: Vec<&str> = vec![];
+
     // create tasks for upstream result fetching
-    let tasks: FutureVec = upstream_search_engines
-        .iter()
-        .map(|engine| match engine.to_lowercase().as_str() {
-            "duckduckgo" => Box::new(duckduckgo::DuckDuckGo) as Box<dyn SearchEngine + Send + Sync>,
-            "searx" => Box::new(searx::Searx) as Box<dyn SearchEngine + Send + Sync>,
-            &_ => panic!("Config Error: Incorrect config file option provided"),
-        })
-        .map(|search_engine| {
-            let query: String = query.clone();
-            let user_agent: String = user_agent.clone();
-            tokio::spawn(async move {
-                search_engine
-                    .results(query, page, user_agent.clone(), request_timeout)
-                    .await
-            })
-        })
-        .collect();
+    let mut tasks: FutureVec = FutureVec::new();
+
+    for engine_handler in upstream_search_engines {
+        let (name, search_engine) = engine_handler.into_name_engine();
+        names.push(name);
+        let query: String = query.clone();
+        let user_agent: String = user_agent.clone();
+        tasks.push(tokio::spawn(async move {
+            search_engine
+                .results(query, page, user_agent.clone(), request_timeout)
+                .await
+        }));
+    }
 
     // get upstream responses
     let mut responses = Vec::with_capacity(tasks.len());
@@ -105,20 +99,20 @@ pub async fn aggregate(
     }
 
     // aggregate search results, removing duplicates and handling errors the upstream engines returned
-    let mut result_map: HashMap<String, RawSearchResult> = HashMap::new();
+    let mut result_map: HashMap<String, SearchResult> = HashMap::new();
     let mut engine_errors_info: Vec<EngineErrorInfo> = Vec::new();
 
     let mut handle_error = |error: Report<EngineError>, engine_name: String| {
         log::error!("Engine Error: {:?}", error);
         engine_errors_info.push(EngineErrorInfo::new(
             error.downcast_ref::<EngineError>().unwrap(),
-            engine_name,
+            engine_name.to_string(),
         ));
     };
 
     for _ in 0..responses.len() {
         let response = responses.pop().unwrap();
-        let engine_name = upstream_search_engines.pop().unwrap();
+        let engine = names.pop().unwrap().to_string();
 
         if result_map.is_empty() {
             match response {
@@ -126,7 +120,7 @@
                     result_map = results.clone();
                 }
                 Err(error) => {
-                    handle_error(error, engine_name.clone());
+                    handle_error(error, engine);
                 }
             }
             continue;
@@ -138,21 +132,18 @@
                 result_map
                     .entry(key)
                     .and_modify(|result| {
-                        result.add_engines(engine_name.clone());
+                        result.add_engines(engine.clone());
                     })
-                    .or_insert_with(|| -> RawSearchResult { value });
+                    .or_insert_with(|| -> SearchResult { value });
                 });
             }
             Err(error) => {
-                handle_error(error, engine_name.clone());
+                handle_error(error, engine);
            }
        }
    }
 
-    let mut results = Vec::with_capacity(result_map.len());
-    for (_, result) in result_map {
-        results.push(SearchResult::from_raw(result))
-    }
+    let results = result_map.into_values().collect();
 
     Ok(SearchResults::new(
         results,
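
The merge logic here is unchanged apart from the retyping: responses and names are popped in lockstep, duplicate result keys accumulate engine names through the `entry` API, and fresh keys are inserted untouched. A self-contained sketch of that pattern, with `Vec<String>` standing in for `SearchResult` (URLs and names illustrative):

use std::collections::HashMap;

fn main() {
    let mut result_map: HashMap<String, Vec<String>> = HashMap::new();

    // Two engines returning the same URL.
    for (url, engine) in [
        ("https://example.org/", "duckduckgo"),
        ("https://example.org/", "searx"),
    ] {
        result_map
            .entry(url.to_string())
            // Key already present: record the extra engine name.
            .and_modify(|engines| engines.push(engine.to_string()))
            // Key absent: insert a fresh entry.
            .or_insert_with(|| vec![engine.to_string()]);
    }

    // The duplicate was merged, not duplicated.
    assert_eq!(result_map["https://example.org/"], ["duckduckgo", "searx"]);
}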
src/server/routes.rs CHANGED
@@ -7,6 +7,7 @@ use std::fs::read_to_string;
 use crate::{
     cache::cacher::RedisCache,
     config::parser::Config,
+    engines::engine_models::EngineHandler,
     handler::public_paths::public_path,
     results::{aggregation_models::SearchResults, aggregator::aggregate},
 };
@@ -175,12 +176,19 @@ async fn results(
         {
             Some(cookie_value) => {
                 let cookie_value: Cookie = serde_json::from_str(cookie_value.name_value().1)?;
+
+                let engines = cookie_value
+                    .engines
+                    .iter()
+                    .filter_map(|name| EngineHandler::new(name))
+                    .collect();
+
                 aggregate(
                     query,
                     page,
                     config.aggregator.random_delay,
                     config.debug,
-                    cookie_value.engines,
+                    engines,
                     config.request_timeout,
                 )
                 .await?
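
This last hunk is the actual fix for the vulnerability named in the commit message: engine names arriving in the cookie are untrusted, and previously flowed straight into the `match` in aggregator.rs whose fallback arm was `panic!`. Routing them through `EngineHandler::new` means a forged name yields `None` and is dropped before `aggregate` runs. A crate-internal sketch of the guard (the fake name is illustrative):

use crate::engines::engine_models::EngineHandler;

// Engine names as they might arrive in a tampered cookie:
let cookie_engines = vec!["searx".to_string(), "no_such_engine".to_string()];

// Same guard as in `results()`: a forged name parses to `None` and is
// filtered out here instead of reaching the old `panic!`.
let engines: Vec<EngineHandler> = cookie_engines
    .iter()
    .filter_map(|name| EngineHandler::new(name))
    .collect();

assert_eq!(engines.len(), 1); // only "searx" survives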