Milim committed
Commit 5aca5c0
1 Parent(s): 15dfda6

Improve aggregation


Adds the EngineHandler struct.
Removes a vulnerability where an attacker could send request cookies with fake engine names and crash the server.
Merges RawSearchResult and SearchResult, as they were functionally identical.

src/config/parser.rs CHANGED
@@ -34,7 +34,7 @@ pub struct Config {
     pub aggregator: AggregatorConfig,
     pub logging: bool,
     pub debug: bool,
-    pub upstream_search_engines: Vec<String>,
+    pub upstream_search_engines: Vec<crate::engines::engine_models::EngineHandler>,
     pub request_timeout: u8,
     pub threads: u8,
 }
@@ -107,6 +107,7 @@ impl Config {
                 .get::<_, HashMap<String, bool>>("upstream_search_engines")?
                 .into_iter()
                 .filter_map(|(key, value)| value.then_some(key))
+                .filter_map(|engine| crate::engines::engine_models::EngineHandler::new(&engine))
                 .collect(),
             request_timeout: globals.get::<_, u8>("request_timeout")?,
             threads,
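
The two-stage filter is worth spelling out, since the second stage is where unknown engine names now silently disappear. A minimal standalone sketch of the first stage (toggle values illustrative, not from a real config):

use std::collections::HashMap;

fn main() {
    // Stand-in for the `upstream_search_engines` table from the config.
    let toggles: HashMap<String, bool> = HashMap::from([
        ("duckduckgo".to_string(), true),
        ("searx".to_string(), false),
    ]);

    // Stage 1: `value.then_some(key)` yields `Some(key)` only for
    // enabled engines. (Stage 2 in the parser then feeds each name
    // through `EngineHandler::new`, dropping unknown engines.)
    let enabled: Vec<String> = toggles
        .into_iter()
        .filter_map(|(key, value)| value.then_some(key))
        .collect();

    assert_eq!(enabled, vec!["duckduckgo".to_string()]);
}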
src/engines/duckduckgo.rs CHANGED
@@ -7,7 +7,7 @@ use std::collections::HashMap;
 use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT};
 use scraper::{Html, Selector};
 
-use crate::results::aggregation_models::RawSearchResult;
+use crate::results::aggregation_models::SearchResult;
 
 use super::engine_models::{EngineError, SearchEngine};
 
@@ -43,7 +43,7 @@ impl SearchEngine for DuckDuckGo {
         page: u32,
         user_agent: String,
         request_timeout: u8,
-    ) -> Result<HashMap<String, RawSearchResult>, EngineError> {
+    ) -> Result<HashMap<String, SearchResult>, EngineError> {
         // Page number can be missing or empty string and so appropriate handling is required
         // so that upstream server recieves valid page number.
         let url: String = match page {
@@ -120,7 +120,7 @@ impl SearchEngine for DuckDuckGo {
         Ok(document
             .select(&results)
             .map(|result| {
-                RawSearchResult::new(
+                SearchResult::new(
                     result
                         .select(&result_title)
                         .next()
src/engines/engine_models.rs CHANGED
@@ -1,7 +1,7 @@
 //! This module provides the error enum to handle different errors associated while requesting data from
 //! the upstream search engines with the search query provided by the user.
 
-use crate::results::aggregation_models::RawSearchResult;
+use crate::results::aggregation_models::SearchResult;
 use error_stack::{IntoReport, Result, ResultExt};
 use std::{collections::HashMap, fmt, time::Duration};
 
@@ -45,7 +45,7 @@ impl error_stack::Context for EngineError {}
 
 /// A trait to define common behavior for all search engines.
 #[async_trait::async_trait]
-pub trait SearchEngine {
+pub trait SearchEngine: Sync + Send {
     async fn fetch_html_from_upstream(
         &self,
         url: String,
@@ -73,5 +73,37 @@ pub trait SearchEngine {
         page: u32,
         user_agent: String,
         request_timeout: u8,
-    ) -> Result<HashMap<String, RawSearchResult>, EngineError>;
+    ) -> Result<HashMap<String, SearchResult>, EngineError>;
+}
+
+pub struct EngineHandler {
+    engine: Box<dyn SearchEngine>,
+    name: &'static str,
+}
+
+impl Clone for EngineHandler {
+    fn clone(&self) -> Self {
+        Self::new(self.name).unwrap()
+    }
+}
+
+impl EngineHandler {
+    /// parses an engine name into an engine handler, returns none if the engine is unknown
+    pub fn new(engine_name: &str) -> Option<Self> {
+        let engine: (&'static str, Box<dyn SearchEngine>) =
+            match engine_name.to_lowercase().as_str() {
+                "duckduckgo" => ("duckduckgo", Box::new(super::duckduckgo::DuckDuckGo)),
+                "searx" => ("searx", Box::new(super::searx::Searx)),
+                _ => return None,
+            };
+
+        Some(Self {
+            engine: engine.1,
+            name: engine.0,
+        })
+    }
+
+    pub fn into_name_engine(self) -> (&'static str, Box<dyn SearchEngine>) {
+        (self.name, self.engine)
+    }
 }
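
For illustration, a hypothetical unit test for the new struct (not part of the commit; it would sit next to the definitions above). It also shows why the manual `Clone` impl works: a `Box<dyn SearchEngine>` cannot itself be cloned, but every handler is reconstructible from its validated name, so the `unwrap` in `clone` cannot fire:

#[test]
fn engine_handler_roundtrip() {
    // Matching is case-insensitive, so "DuckDuckGo" parses too.
    let handler = EngineHandler::new("DuckDuckGo").expect("known engine");

    // `Clone` re-runs `new` on the stored canonical name.
    let _backup = handler.clone();

    // The aggregator consumes a handler to get at both pieces:
    let (name, _engine) = handler.into_name_engine();
    assert_eq!(name, "duckduckgo");

    // Unknown names are rejected with `None` instead of a panic:
    assert!(EngineHandler::new("not_a_real_engine").is_none());
}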
src/engines/searx.rs CHANGED
@@ -6,7 +6,7 @@ use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT};
 use scraper::{Html, Selector};
 use std::collections::HashMap;
 
-use crate::results::aggregation_models::RawSearchResult;
+use crate::results::aggregation_models::SearchResult;
 
 use super::engine_models::{EngineError, SearchEngine};
 use error_stack::{IntoReport, Report, Result, ResultExt};
@@ -42,7 +42,7 @@ impl SearchEngine for Searx {
         page: u32,
         user_agent: String,
         request_timeout: u8,
-    ) -> Result<HashMap<String, RawSearchResult>, EngineError> {
+    ) -> Result<HashMap<String, SearchResult>, EngineError> {
         // Page number can be missing or empty string and so appropriate handling is required
         // so that upstream server recieves valid page number.
         let url: String = match page {
@@ -111,7 +111,7 @@ impl SearchEngine for Searx {
         Ok(document
             .select(&results)
             .map(|result| {
-                RawSearchResult::new(
+                SearchResult::new(
                     result
                         .select(&result_title)
                         .next()
src/results/aggregation_models.rs CHANGED
@@ -5,54 +5,6 @@ use serde::{Deserialize, Serialize};
 
 use crate::{config::parser_models::Style, engines::engine_models::EngineError};
 
-/// A named struct to store, serialize and deserializes the individual search result from all the
-/// scraped and aggregated search results from the upstream search engines.
-///
-/// # Fields
-///
-/// * `title` - The title of the search result.
-/// * `url` - The url to be displayed below the search result title in html.
-/// * `description` - The description of the search result.
-/// * `engine` - The names of the upstream engines from which this results were provided.
-#[derive(Serialize, Deserialize)]
-#[serde(rename_all = "camelCase")]
-pub struct SearchResult {
-    pub title: String,
-    pub url: String,
-    pub description: String,
-    pub engine: Vec<String>,
-}
-
-impl SearchResult {
-    /// Constructs a new `SearchResult` with the given arguments needed for the struct.
-    ///
-    /// # Arguments
-    ///
-    /// * `title` - The title of the search result.
-    /// * `visiting_url` - The url which is accessed when clicked on it
-    /// (href url in html in simple words).
-    /// * `url` - The url to be displayed below the search result title in html.
-    /// * `description` - The description of the search result.
-    /// * `engine` - The names of the upstream engines from which this results were provided.
-    pub fn new(title: String, url: String, description: String, engine: Vec<String>) -> Self {
-        SearchResult {
-            title,
-            url,
-            description,
-            engine,
-        }
-    }
-
-    pub fn from_raw(raw: RawSearchResult) -> Self {
-        SearchResult {
-            title: raw.title,
-            url: raw.url,
-            description: raw.description,
-            engine: raw.engine,
-        }
-    }
-}
-
 /// A named struct to store the raw scraped search results scraped search results from the
 /// upstream search engines before aggregating it.It derives the Clone trait which is needed
 /// to write idiomatic rust using `Iterators`.
@@ -64,15 +16,16 @@ impl SearchResult {
 /// (href url in html in simple words).
 /// * `description` - The description of the search result.
 /// * `engine` - The names of the upstream engines from which this results were provided.
-#[derive(Clone)]
-pub struct RawSearchResult {
+#[derive(Clone, Serialize, Deserialize)]
+#[serde(rename_all = "camelCase")]
+pub struct SearchResult {
     pub title: String,
     pub url: String,
     pub description: String,
    pub engine: Vec<String>,
 }
 
-impl RawSearchResult {
+impl SearchResult {
     /// Constructs a new `RawSearchResult` with the given arguments needed for the struct.
     ///
     /// # Arguments
@@ -83,7 +36,7 @@ impl RawSearchResult {
     /// * `description` - The description of the search result.
     /// * `engine` - The names of the upstream engines from which this results were provided.
     pub fn new(title: String, url: String, description: String, engine: Vec<String>) -> Self {
-        RawSearchResult {
+        SearchResult {
            title,
            url,
            description,
src/results/aggregator.rs CHANGED
@@ -8,18 +8,14 @@ use rand::Rng;
 use tokio::task::JoinHandle;
 
 use super::{
-    aggregation_models::{EngineErrorInfo, RawSearchResult, SearchResult, SearchResults},
+    aggregation_models::{EngineErrorInfo, SearchResult, SearchResults},
     user_agent::random_user_agent,
 };
 
-use crate::engines::{
-    duckduckgo,
-    engine_models::{EngineError, SearchEngine},
-    searx,
-};
+use crate::engines::engine_models::{EngineError, EngineHandler};
 
 /// Aliases for long type annotations
-type FutureVec = Vec<JoinHandle<Result<HashMap<String, RawSearchResult>, Report<EngineError>>>>;
+type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<EngineError>>>>;
 
 /// The function aggregates the scraped results from the user-selected upstream search engines.
 /// These engines can be chosen either from the user interface (UI) or from the configuration file.
@@ -64,7 +60,7 @@ pub async fn aggregate(
     page: u32,
     random_delay: bool,
     debug: bool,
-    mut upstream_search_engines: Vec<String>,
+    upstream_search_engines: Vec<EngineHandler>,
     request_timeout: u8,
 ) -> Result<SearchResults, Box<dyn std::error::Error>> {
     let user_agent: String = random_user_agent();
@@ -76,24 +72,22 @@ pub async fn aggregate(
         tokio::time::sleep(Duration::from_secs(delay_secs)).await;
     }
 
+    let mut names: Vec<&str> = vec![];
+
     // create tasks for upstream result fetching
-    let tasks: FutureVec = upstream_search_engines
-        .iter()
-        .map(|engine| match engine.to_lowercase().as_str() {
-            "duckduckgo" => Box::new(duckduckgo::DuckDuckGo) as Box<dyn SearchEngine + Send + Sync>,
-            "searx" => Box::new(searx::Searx) as Box<dyn SearchEngine + Send + Sync>,
-            &_ => panic!("Config Error: Incorrect config file option provided"),
-        })
-        .map(|search_engine| {
-            let query: String = query.clone();
-            let user_agent: String = user_agent.clone();
-            tokio::spawn(async move {
-                search_engine
-                    .results(query, page, user_agent.clone(), request_timeout)
-                    .await
-            })
-        })
-        .collect();
+    let mut tasks: FutureVec = FutureVec::new();
+
+    for engine_handler in upstream_search_engines {
+        let (name, search_engine) = engine_handler.into_name_engine();
+        names.push(name);
+        let query: String = query.clone();
+        let user_agent: String = user_agent.clone();
+        tasks.push(tokio::spawn(async move {
+            search_engine
+                .results(query, page, user_agent.clone(), request_timeout)
+                .await
+        }));
+    }
 
     // get upstream responses
     let mut responses = Vec::with_capacity(tasks.len());
@@ -105,20 +99,20 @@ pub async fn aggregate(
     }
 
     // aggregate search results, removing duplicates and handling errors the upstream engines returned
-    let mut result_map: HashMap<String, RawSearchResult> = HashMap::new();
+    let mut result_map: HashMap<String, SearchResult> = HashMap::new();
     let mut engine_errors_info: Vec<EngineErrorInfo> = Vec::new();
 
     let mut handle_error = |error: Report<EngineError>, engine_name: String| {
         log::error!("Engine Error: {:?}", error);
         engine_errors_info.push(EngineErrorInfo::new(
             error.downcast_ref::<EngineError>().unwrap(),
-            engine_name,
+            engine_name.to_string(),
         ));
     };
 
     for _ in 0..responses.len() {
         let response = responses.pop().unwrap();
-        let engine_name = upstream_search_engines.pop().unwrap();
+        let engine = names.pop().unwrap().to_string();
 
         if result_map.is_empty() {
             match response {
@@ -126,7 +120,7 @@
                     result_map = results.clone();
                 }
                 Err(error) => {
-                    handle_error(error, engine_name.clone());
+                    handle_error(error, engine);
                 }
             }
             continue;
@@ -138,21 +132,18 @@
                 result_map
                     .entry(key)
                     .and_modify(|result| {
-                        result.add_engines(engine_name.clone());
+                        result.add_engines(engine.clone());
                     })
-                    .or_insert_with(|| -> RawSearchResult { value });
+                    .or_insert_with(|| -> SearchResult { value });
                 });
             }
             Err(error) => {
-                handle_error(error, engine_name.clone());
+                handle_error(error, engine);
            }
        }
    }
 
-    let mut results = Vec::with_capacity(result_map.len());
-    for (_, result) in result_map {
-        results.push(SearchResult::from_raw(result))
-    }
+    let results = result_map.into_values().collect();
 
     Ok(SearchResults::new(
         results,
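
The merge logic here is unchanged apart from the retyping: responses and names are popped in lockstep, duplicate result keys accumulate engine names through the `entry` API, and fresh keys are inserted untouched. A self-contained sketch of that pattern, with `Vec<String>` standing in for `SearchResult` (URLs and names illustrative):

use std::collections::HashMap;

fn main() {
    let mut result_map: HashMap<String, Vec<String>> = HashMap::new();

    // Two engines returning the same URL.
    for (url, engine) in [
        ("https://example.org/", "duckduckgo"),
        ("https://example.org/", "searx"),
    ] {
        result_map
            .entry(url.to_string())
            // Key already present: record the extra engine name.
            .and_modify(|engines| engines.push(engine.to_string()))
            // Key absent: insert a fresh entry.
            .or_insert_with(|| vec![engine.to_string()]);
    }

    // The duplicate was merged, not duplicated.
    assert_eq!(result_map["https://example.org/"], ["duckduckgo", "searx"]);
}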
src/server/routes.rs CHANGED
@@ -7,6 +7,7 @@ use std::fs::read_to_string;
 use crate::{
     cache::cacher::RedisCache,
     config::parser::Config,
+    engines::engine_models::EngineHandler,
     handler::public_paths::public_path,
     results::{aggregation_models::SearchResults, aggregator::aggregate},
 };
@@ -175,12 +176,19 @@ async fn results(
         {
             Some(cookie_value) => {
                 let cookie_value: Cookie = serde_json::from_str(cookie_value.name_value().1)?;
+
+                let engines = cookie_value
+                    .engines
+                    .iter()
+                    .filter_map(|name| EngineHandler::new(name))
+                    .collect();
+
                 aggregate(
                     query,
                     page,
                     config.aggregator.random_delay,
                     config.debug,
-                    cookie_value.engines,
+                    engines,
                     config.request_timeout,
                 )
                 .await?
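
This last hunk is the actual fix for the vulnerability named in the commit message: engine names arriving in the cookie are untrusted, and previously flowed straight into the `match` in aggregator.rs whose fallback arm was `panic!`. Routing them through `EngineHandler::new` means a forged name yields `None` and is dropped before `aggregate` runs. A crate-internal sketch of the guard (the fake name is illustrative):

use crate::engines::engine_models::EngineHandler;

// Engine names as they might arrive in a tampered cookie:
let cookie_engines = vec!["searx".to_string(), "no_such_engine".to_string()];

// Same guard as in `results()`: a forged name parses to `None` and is
// filtered out here instead of reaching the old `panic!`.
let engines: Vec<EngineHandler> = cookie_engines
    .iter()
    .filter_map(|name| EngineHandler::new(name))
    .collect();

assert_eq!(engines.len(), 1); // only "searx" survives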