Milim committed on
Commit
15dfda6
1 Parent(s): af3b1cb

Improve Aggregation function & config parser

Browse files

Refactor aggregation function
Rename visiting_url to url, as they are always the same (see upstream engine scraping).
Refactor parsing function to be more readable.

.cspell.json CHANGED
@@ -11,6 +11,10 @@
11
  ],
12
  "language": "en",
13
  "noConfigSearch": true,
14
- "words": ["megalinter", "oxsecurity"],
 
 
 
 
15
  "version": "0.2"
16
  }
 
11
  ],
12
  "language": "en",
13
  "noConfigSearch": true,
14
+ "words": [
15
+ "megalinter",
16
+ "oxsecurity",
17
+ "websurfx"
18
+ ],
19
  "version": "0.2"
20
  }
src/config/parser.rs CHANGED
@@ -57,7 +57,7 @@ impl Config {
57
  /// # Arguments
58
  ///
59
  /// * `logging_initialized` - It takes a boolean which ensures that the logging doesn't get
60
- /// initialized twice.
61
  ///
62
  /// # Error
63
  ///
@@ -76,23 +76,9 @@ impl Config {
76
 
77
  let debug: bool = globals.get::<_, bool>("debug")?;
78
  let logging:bool= globals.get::<_, bool>("logging")?;
79
-
80
- // Check whether logging has not been initialized before.
81
- if logging_initialized {
82
- if let Ok(pkg_env_var) = std::env::var("PKG_ENV"){
83
- if pkg_env_var.to_lowercase() == "dev" {
84
- env_logger::Builder::new().filter(None, LevelFilter::Trace).init();
85
- }
86
- } else {
87
- // Initializing logging middleware with level set to default or info.
88
- let mut log_level: LevelFilter = LevelFilter::Error;
89
- if logging && debug == false {
90
- log_level = LevelFilter::Info;
91
- } else if debug {
92
- log_level = LevelFilter::Debug;
93
- };
94
- env_logger::Builder::new().filter(None, log_level).init();
95
- }
96
  }
97
 
98
  let threads: u8 = if parsed_threads == 0 {
@@ -127,6 +113,7 @@ impl Config {
127
  })
128
  })
129
  }
 
130
  /// A helper function which returns an appropriate config file path checking if the config
131
  /// file exists on that path.
132
  ///
@@ -173,3 +160,26 @@ impl Config {
173
  Err("Config file not found!!".to_string().into())
174
  }
175
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
57
  /// # Arguments
58
  ///
59
  /// * `logging_initialized` - It takes a boolean which ensures that the logging doesn't get
60
+ /// initialized twice. Pass false if the logger has not yet been initialized.
61
  ///
62
  /// # Error
63
  ///
 
76
 
77
  let debug: bool = globals.get::<_, bool>("debug")?;
78
  let logging:bool= globals.get::<_, bool>("logging")?;
79
+
80
+ if !logging_initialized {
81
+ set_logging_level(debug, logging);
 
 
 
 
 
 
 
 
 
 
 
 
 
 
82
  }
83
 
84
  let threads: u8 = if parsed_threads == 0 {
 
113
  })
114
  })
115
  }
116
+
117
  /// A helper function which returns an appropriate config file path checking if the config
118
  /// file exists on that path.
119
  ///
 
160
  Err("Config file not found!!".to_string().into())
161
  }
162
  }
163
+
164
+ /// A helper function that sets the proper logging level.
165
+ fn set_logging_level(debug: bool, logging: bool) {
166
+
167
+ if let Ok(pkg_env_var) = std::env::var("PKG_ENV") {
168
+ if pkg_env_var.to_lowercase() == "dev" {
169
+ env_logger::Builder::new()
170
+ .filter(None, LevelFilter::Trace)
171
+ .init();
172
+ return;
173
+ }
174
+ }
175
+
176
+ // Initializing logging middleware with level set to default or info.
177
+ let log_level = match (debug, logging) {
178
+ (true, true) => LevelFilter::Error,
179
+ (true, false) => LevelFilter::Debug,
180
+ (false, true) => LevelFilter::Info,
181
+ (false, false) => LevelFilter::Error,
182
+ };
183
+
184
+ env_logger::Builder::new().filter(None, log_level).init();
185
+ }
src/engines/duckduckgo.rs CHANGED
@@ -147,7 +147,7 @@ impl SearchEngine for DuckDuckGo {
147
  vec!["duckduckgo".to_string()],
148
  )
149
  })
150
- .map(|search_result| (search_result.visiting_url.clone(), search_result))
151
  .collect())
152
  }
153
  }
 
147
  vec!["duckduckgo".to_string()],
148
  )
149
  })
150
+ .map(|search_result| (search_result.url.clone(), search_result))
151
  .collect())
152
  }
153
  }
src/engines/engine_models.rs CHANGED
@@ -43,7 +43,7 @@ impl fmt::Display for EngineError {
43
 
44
  impl error_stack::Context for EngineError {}
45
 
46
- /// A trait to define common behaviour for all search engines.
47
  #[async_trait::async_trait]
48
  pub trait SearchEngine {
49
  async fn fetch_html_from_upstream(
@@ -56,7 +56,7 @@ pub trait SearchEngine {
56
  Ok(reqwest::Client::new()
57
  .get(url)
58
  .timeout(Duration::from_secs(request_timeout as u64)) // Add timeout to request to avoid DDOSing the server
59
- .headers(header_map) // add spoofed headers to emulate human behaviour
60
  .send()
61
  .await
62
  .into_report()
 
43
 
44
  impl error_stack::Context for EngineError {}
45
 
46
+ /// A trait to define common behavior for all search engines.
47
  #[async_trait::async_trait]
48
  pub trait SearchEngine {
49
  async fn fetch_html_from_upstream(
 
56
  Ok(reqwest::Client::new()
57
  .get(url)
58
  .timeout(Duration::from_secs(request_timeout as u64)) // Add timeout to request to avoid DDOSing the server
59
+ .headers(header_map) // add spoofed headers to emulate human behavior
60
  .send()
61
  .await
62
  .into_report()
src/engines/searx.rs CHANGED
@@ -137,7 +137,7 @@ impl SearchEngine for Searx {
137
  vec!["searx".to_string()],
138
  )
139
  })
140
- .map(|search_result| (search_result.visiting_url.clone(), search_result))
141
  .collect())
142
  }
143
  }
 
137
  vec!["searx".to_string()],
138
  )
139
  })
140
+ .map(|search_result| (search_result.url.clone(), search_result))
141
  .collect())
142
  }
143
  }
src/results/aggregation_models.rs CHANGED
@@ -11,8 +11,6 @@ use crate::{config::parser_models::Style, engines::engine_models::EngineError};
11
  /// # Fields
12
  ///
13
  /// * `title` - The title of the search result.
14
- /// * `visiting_url` - The url which is accessed when clicked on it (href url in html in simple
15
- /// words).
16
  /// * `url` - The url to be displayed below the search result title in html.
17
  /// * `description` - The description of the search result.
18
  /// * `engine` - The names of the upstream engines from which this results were provided.
@@ -20,7 +18,6 @@ use crate::{config::parser_models::Style, engines::engine_models::EngineError};
20
  #[serde(rename_all = "camelCase")]
21
  pub struct SearchResult {
22
  pub title: String,
23
- pub visiting_url: String,
24
  pub url: String,
25
  pub description: String,
26
  pub engine: Vec<String>,
@@ -37,21 +34,23 @@ impl SearchResult {
37
  /// * `url` - The url to be displayed below the search result title in html.
38
  /// * `description` - The description of the search result.
39
  /// * `engine` - The names of the upstream engines from which this results were provided.
40
- pub fn new(
41
- title: String,
42
- visiting_url: String,
43
- url: String,
44
- description: String,
45
- engine: Vec<String>,
46
- ) -> Self {
47
  SearchResult {
48
  title,
49
- visiting_url,
50
  url,
51
  description,
52
  engine,
53
  }
54
  }
 
 
 
 
 
 
 
 
 
55
  }
56
 
57
 /// A named struct to store the raw search results scraped from the
@@ -61,14 +60,14 @@ impl SearchResult {
61
  /// # Fields
62
  ///
63
  /// * `title` - The title of the search result.
64
- /// * `visiting_url` - The url which is accessed when clicked on it
65
  /// (href url in html in simple words).
66
  /// * `description` - The description of the search result.
67
  /// * `engine` - The names of the upstream engines from which this results were provided.
68
  #[derive(Clone)]
69
  pub struct RawSearchResult {
70
  pub title: String,
71
- pub visiting_url: String,
72
  pub description: String,
73
  pub engine: Vec<String>,
74
  }
@@ -79,19 +78,14 @@ impl RawSearchResult {
79
  /// # Arguments
80
  ///
81
  /// * `title` - The title of the search result.
82
- /// * `visiting_url` - The url which is accessed when clicked on it
83
  /// (href url in html in simple words).
84
  /// * `description` - The description of the search result.
85
  /// * `engine` - The names of the upstream engines from which this results were provided.
86
- pub fn new(
87
- title: String,
88
- visiting_url: String,
89
- description: String,
90
- engine: Vec<String>,
91
- ) -> Self {
92
  RawSearchResult {
93
  title,
94
- visiting_url,
95
  description,
96
  engine,
97
  }
 
11
  /// # Fields
12
  ///
13
  /// * `title` - The title of the search result.
 
 
14
  /// * `url` - The url to be displayed below the search result title in html.
15
  /// * `description` - The description of the search result.
16
  /// * `engine` - The names of the upstream engines from which this results were provided.
 
18
  #[serde(rename_all = "camelCase")]
19
  pub struct SearchResult {
20
  pub title: String,
 
21
  pub url: String,
22
  pub description: String,
23
  pub engine: Vec<String>,
 
34
  /// * `url` - The url to be displayed below the search result title in html.
35
  /// * `description` - The description of the search result.
36
  /// * `engine` - The names of the upstream engines from which this results were provided.
37
+ pub fn new(title: String, url: String, description: String, engine: Vec<String>) -> Self {
 
 
 
 
 
 
38
  SearchResult {
39
  title,
 
40
  url,
41
  description,
42
  engine,
43
  }
44
  }
45
+
46
+ pub fn from_raw(raw: RawSearchResult) -> Self {
47
+ SearchResult {
48
+ title: raw.title,
49
+ url: raw.url,
50
+ description: raw.description,
51
+ engine: raw.engine,
52
+ }
53
+ }
54
  }
55
 
56
  /// A named struct to store the raw scraped search results scraped search results from the
 
60
  /// # Fields
61
  ///
62
  /// * `title` - The title of the search result.
63
+ /// * `url` - The url which is accessed when clicked on it
64
  /// (href url in html in simple words).
65
  /// * `description` - The description of the search result.
66
  /// * `engine` - The names of the upstream engines from which this results were provided.
67
  #[derive(Clone)]
68
  pub struct RawSearchResult {
69
  pub title: String,
70
+ pub url: String,
71
  pub description: String,
72
  pub engine: Vec<String>,
73
  }
 
78
  /// # Arguments
79
  ///
80
  /// * `title` - The title of the search result.
81
+ /// * `url` - The url which is accessed when clicked on it
82
  /// (href url in html in simple words).
83
  /// * `description` - The description of the search result.
84
  /// * `engine` - The names of the upstream engines from which this results were provided.
85
+ pub fn new(title: String, url: String, description: String, engine: Vec<String>) -> Self {
 
 
 
 
 
86
  RawSearchResult {
87
  title,
88
+ url,
89
  description,
90
  engine,
91
  }
src/results/aggregator.rs CHANGED
@@ -64,11 +64,10 @@ pub async fn aggregate(
64
  page: u32,
65
  random_delay: bool,
66
  debug: bool,
67
- upstream_search_engines: Vec<String>,
68
  request_timeout: u8,
69
  ) -> Result<SearchResults, Box<dyn std::error::Error>> {
70
  let user_agent: String = random_user_agent();
71
- let mut result_map: HashMap<String, RawSearchResult> = HashMap::new();
72
 
73
  // Add a random delay before making the request.
74
  if random_delay || !debug {
@@ -77,20 +76,14 @@ pub async fn aggregate(
77
  tokio::time::sleep(Duration::from_secs(delay_secs)).await;
78
  }
79
 
80
- // fetch results from upstream search engines simultaneously/concurrently.
81
- let search_engines: Vec<Box<dyn SearchEngine + Send + Sync>> = upstream_search_engines
82
  .iter()
83
  .map(|engine| match engine.to_lowercase().as_str() {
84
  "duckduckgo" => Box::new(duckduckgo::DuckDuckGo) as Box<dyn SearchEngine + Send + Sync>,
85
  "searx" => Box::new(searx::Searx) as Box<dyn SearchEngine + Send + Sync>,
86
  &_ => panic!("Config Error: Incorrect config file option provided"),
87
  })
88
- .collect();
89
-
90
- let task_capacity: usize = search_engines.len();
91
-
92
- let tasks: FutureVec = search_engines
93
- .into_iter()
94
  .map(|search_engine| {
95
  let query: String = query.clone();
96
  let user_agent: String = user_agent.clone();
@@ -102,101 +95,67 @@ pub async fn aggregate(
102
  })
103
  .collect();
104
 
105
- let mut outputs = Vec::with_capacity(task_capacity);
 
106
 
107
  for task in tasks {
108
  if let Ok(result) = task.await {
109
- outputs.push(result)
110
  }
111
  }
112
 
 
 
113
  let mut engine_errors_info: Vec<EngineErrorInfo> = Vec::new();
114
 
115
- // The code block `outputs.iter()` determines whether it is the first time the code is being run.
116
- // It does this by checking the initial flag. If it is the first time, the code selects the first
117
- // engine from which results are fetched and adds or extends them into the `result_map`. If the
118
- // initially selected engine fails, the code automatically selects another engine to map or extend
119
- // into the `result_map`. On the other hand, if an engine selected for the first time successfully
120
- // fetches results and maps them into the `result_map`, the initial flag is set to false. Subsequently,
121
- // the code iterates through the remaining engines one by one. It compares the fetched results from each
122
- // engine with the results already present in the `result_map` to identify any duplicates. If duplicate
123
- // results are found, the code groups them together with the name of the engine from which they were
124
- // fetched, and automatically removes the duplicate results from the newly fetched data.
125
- //
126
- // Additionally, the code handles errors returned by the engines. It keeps track of which engines
127
- // encountered errors and stores this information in a vector of structures called `EngineErrorInfo`.
128
- // Each structure in this vector contains the name of the engine and the type of error it returned.
129
- // These structures will later be added to the final `SearchResults` structure. The `SearchResults`
130
- // structure is used to display an error box in the UI containing the relevant information from
131
- // the `EngineErrorInfo` structure.
132
- //
133
- // In summary, this code block manages the selection of engines, handling of duplicate results, and tracking
134
- // of errors in order to populate the `result_map` and provide informative feedback to the user through the
135
- // `SearchResults` structure.
136
- let mut initial: bool = true;
137
- let mut counter: usize = 0;
138
- outputs.iter().for_each(|results| {
139
- if initial {
140
- match results {
141
- Ok(result) => {
142
- result_map.extend(result.clone());
143
- counter += 1;
144
- initial = false
145
  }
146
- Err(error_type) => {
147
- log::error!("Engine Error: {:?}", error_type);
148
- engine_errors_info.push(EngineErrorInfo::new(
149
- error_type.downcast_ref::<EngineError>().unwrap(),
150
- upstream_search_engines[counter].clone(),
151
- ));
152
- counter += 1
153
  }
154
  }
155
- } else {
156
- match results {
157
- Ok(result) => {
158
- result.clone().into_iter().for_each(|(key, value)| {
159
- result_map
160
- .entry(key)
161
- .and_modify(|result| {
162
- result.add_engines(value.clone().engine());
163
- })
164
- .or_insert_with(|| -> RawSearchResult {
165
- RawSearchResult::new(
166
- value.title.clone(),
167
- value.visiting_url.clone(),
168
- value.description.clone(),
169
- value.engine.clone(),
170
- )
171
- });
172
- });
173
- counter += 1
174
- }
175
- Err(error_type) => {
176
- log::error!("Engine Error: {:?}", error_type);
177
- engine_errors_info.push(EngineErrorInfo::new(
178
- error_type.downcast_ref::<EngineError>().unwrap(),
179
- upstream_search_engines[counter].clone(),
180
- ));
181
- counter += 1
182
- }
183
  }
184
  }
185
- });
 
 
 
 
 
186
 
187
  Ok(SearchResults::new(
188
- result_map
189
- .into_iter()
190
- .map(|(key, value)| {
191
- SearchResult::new(
192
- value.title,
193
- value.visiting_url,
194
- key,
195
- value.description,
196
- value.engine,
197
- )
198
- })
199
- .collect(),
200
  query.to_string(),
201
  engine_errors_info,
202
  ))
 
64
  page: u32,
65
  random_delay: bool,
66
  debug: bool,
67
+ mut upstream_search_engines: Vec<String>,
68
  request_timeout: u8,
69
  ) -> Result<SearchResults, Box<dyn std::error::Error>> {
70
  let user_agent: String = random_user_agent();
 
71
 
72
  // Add a random delay before making the request.
73
  if random_delay || !debug {
 
76
  tokio::time::sleep(Duration::from_secs(delay_secs)).await;
77
  }
78
 
79
+ // create tasks for upstream result fetching
80
+ let tasks: FutureVec = upstream_search_engines
81
  .iter()
82
  .map(|engine| match engine.to_lowercase().as_str() {
83
  "duckduckgo" => Box::new(duckduckgo::DuckDuckGo) as Box<dyn SearchEngine + Send + Sync>,
84
  "searx" => Box::new(searx::Searx) as Box<dyn SearchEngine + Send + Sync>,
85
  &_ => panic!("Config Error: Incorrect config file option provided"),
86
  })
 
 
 
 
 
 
87
  .map(|search_engine| {
88
  let query: String = query.clone();
89
  let user_agent: String = user_agent.clone();
 
95
  })
96
  .collect();
97
 
98
+ // get upstream responses
99
+ let mut responses = Vec::with_capacity(tasks.len());
100
 
101
  for task in tasks {
102
  if let Ok(result) = task.await {
103
+ responses.push(result)
104
  }
105
  }
106
 
107
+ // aggregate search results, removing duplicates and handling errors the upstream engines returned
108
+ let mut result_map: HashMap<String, RawSearchResult> = HashMap::new();
109
  let mut engine_errors_info: Vec<EngineErrorInfo> = Vec::new();
110
 
111
+ let mut handle_error = |error: Report<EngineError>, engine_name: String| {
112
+ log::error!("Engine Error: {:?}", error);
113
+ engine_errors_info.push(EngineErrorInfo::new(
114
+ error.downcast_ref::<EngineError>().unwrap(),
115
+ engine_name,
116
+ ));
117
+ };
118
+
119
+ for _ in 0..responses.len() {
120
+ let response = responses.pop().unwrap();
121
+ let engine_name = upstream_search_engines.pop().unwrap();
122
+
123
+ if result_map.is_empty() {
124
+ match response {
125
+ Ok(results) => {
126
+ result_map = results.clone();
 
 
 
 
 
 
 
 
 
 
 
 
 
 
127
  }
128
+ Err(error) => {
129
+ handle_error(error, engine_name.clone());
 
 
 
 
 
130
  }
131
  }
132
+ continue;
133
+ }
134
+
135
+ match response {
136
+ Ok(result) => {
137
+ result.into_iter().for_each(|(key, value)| {
138
+ result_map
139
+ .entry(key)
140
+ .and_modify(|result| {
141
+ result.add_engines(engine_name.clone());
142
+ })
143
+ .or_insert_with(|| -> RawSearchResult { value });
144
+ });
145
+ }
146
+ Err(error) => {
147
+ handle_error(error, engine_name.clone());
 
 
 
 
 
 
 
 
 
 
 
 
148
  }
149
  }
150
+ }
151
+
152
+ let mut results = Vec::with_capacity(result_map.len());
153
+ for (_, result) in result_map {
154
+ results.push(SearchResult::from_raw(result))
155
+ }
156
 
157
  Ok(SearchResults::new(
158
+ results,
 
 
 
 
 
 
 
 
 
 
 
159
  query.to_string(),
160
  engine_errors_info,
161
  ))
tests/index.rs CHANGED
@@ -8,7 +8,7 @@ fn spawn_app() -> String {
8
  // Binding to port 0 will trigger the OS to assign a port for us.
9
  let listener = TcpListener::bind("127.0.0.1:0").expect("Failed to bind random port");
10
  let port = listener.local_addr().unwrap().port();
11
- let config = Config::parse(true).unwrap();
12
  let server = run(listener, config).expect("Failed to bind address");
13
 
14
  tokio::spawn(server);
@@ -36,7 +36,7 @@ async fn test_index() {
36
  assert_eq!(res.status(), 200);
37
 
38
  let handlebars = handlebars();
39
- let config = Config::parse(false).unwrap();
40
  let template = handlebars.render("index", &config.style).unwrap();
41
  assert_eq!(res.text().await.unwrap(), template);
42
  }
 
8
  // Binding to port 0 will trigger the OS to assign a port for us.
9
  let listener = TcpListener::bind("127.0.0.1:0").expect("Failed to bind random port");
10
  let port = listener.local_addr().unwrap().port();
11
+ let config = Config::parse(false).unwrap();
12
  let server = run(listener, config).expect("Failed to bind address");
13
 
14
  tokio::spawn(server);
 
36
  assert_eq!(res.status(), 200);
37
 
38
  let handlebars = handlebars();
39
+ let config = Config::parse(true).unwrap();
40
  let template = handlebars.render("index", &config.style).unwrap();
41
  assert_eq!(res.text().await.unwrap(), template);
42
  }