Merge pull request #146 from neon-mmd/improve-async-multithreading
Changed files:

- Cargo.lock (+39 -27)
- Cargo.toml (+3 -2)
- src/config/parser.rs (+16 -2)
- src/engines/duckduckgo.rs (+125 -129)
- src/engines/engine_models.rs (+35 -3)
- src/engines/searx.rs (+112 -113)
- src/results/aggregation_models.rs (+36 -3)
- src/results/aggregator.rs (+129 -40)
- src/server/routes.rs (+53 -11)
- websurfx/config.lua (+4 -1)
Cargo.lock
CHANGED

```diff
@@ -292,6 +292,17 @@ version = "0.10.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "619743e34b5ba4e9703bba34deac3427c72507c7159f5fd030aea8cac0cfe341"
 
+[[package]]
+name = "async-trait"
+version = "0.1.71"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a564d521dd56509c4c47480d00b80ee55f7e385ae48db5744c67ad50c92d2ebf"
+dependencies = [
+ "proc-macro2 1.0.64",
+ "quote 1.0.29",
+ "syn 2.0.26",
+]
+
 [[package]]
 name = "autocfg"
 version = "0.1.8"
@@ -506,18 +517,18 @@ dependencies = [
 
 [[package]]
 name = "clap"
-version = "4.3.
+version = "4.3.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "
+checksum = "3eab9e8ceb9afdade1ab3f0fd8dbce5b1b2f468ad653baf10e771781b2b67b73"
 dependencies = [
  "clap_builder",
 ]
 
 [[package]]
 name = "clap_builder"
-version = "4.3.
+version = "4.3.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "
+checksum = "9f2763db829349bf00cfc06251268865ed4363b93a943174f638daf3ecdba2cd"
 dependencies = [
  "anstyle",
  "clap_lex",
@@ -784,7 +795,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331"
 dependencies = [
  "quote 1.0.29",
- "syn 2.0.
+ "syn 2.0.26",
 ]
 
 [[package]]
@@ -1457,7 +1468,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b"
 dependencies = [
  "hermit-abi",
- "rustix 0.38.
+ "rustix 0.38.4",
  "windows-sys",
 ]
 
@@ -1834,7 +1845,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
 dependencies = [
  "proc-macro2 1.0.64",
  "quote 1.0.29",
- "syn 2.0.
+ "syn 2.0.26",
 ]
 
 [[package]]
@@ -1952,7 +1963,7 @@ dependencies = [
  "pest_meta",
  "proc-macro2 1.0.64",
  "quote 1.0.29",
- "syn 2.0.
+ "syn 2.0.26",
 ]
 
 [[package]]
@@ -2054,7 +2065,7 @@ dependencies = [
  "phf_shared 0.11.2",
  "proc-macro2 1.0.64",
  "quote 1.0.29",
- "syn 2.0.
+ "syn 2.0.26",
 ]
 
 [[package]]
@@ -2398,9 +2409,9 @@ dependencies = [
 
 [[package]]
 name = "regex-automata"
-version = "0.3.
+version = "0.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "
+checksum = "39354c10dd07468c2e73926b23bb9c2caca74c5501e38a35da70406f1d923310"
 dependencies = [
  "aho-corasick",
  "memchr",
@@ -2409,9 +2420,9 @@ dependencies = [
 
 [[package]]
 name = "regex-syntax"
-version = "0.7.
+version = "0.7.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "
+checksum = "e5ea92a5b6195c6ef2a0295ea818b312502c6fc94dde986c5553242e18fd4ce2"
 
 [[package]]
 name = "reqwest"
@@ -2548,9 +2559,9 @@ dependencies = [
 
 [[package]]
 name = "rustix"
-version = "0.38.
+version = "0.38.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "
+checksum = "0a962918ea88d644592894bc6dc55acc6c0956488adcebbfb6e273506b7fd6e5"
 dependencies = [
  "bitflags 2.3.3",
  "errno",
@@ -2708,14 +2719,14 @@ checksum = "389894603bd18c46fa56231694f8d827779c0951a667087194cf9de94ed24682"
 dependencies = [
  "proc-macro2 1.0.64",
  "quote 1.0.29",
- "syn 2.0.
+ "syn 2.0.26",
 ]
 
 [[package]]
 name = "serde_json"
-version = "1.0.
+version = "1.0.102"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "
+checksum = "b5062a995d481b2308b6064e9af76011f2921c35f97b0468811ed9f6cd91dfed"
 dependencies = [
  "itoa 1.0.8",
  "ryu",
@@ -2937,9 +2948,9 @@ dependencies = [
 
 [[package]]
 name = "syn"
-version = "2.0.
+version = "2.0.26"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "
+checksum = "45c3457aacde3c65315de5031ec191ce46604304d2446e803d71ade03308d970"
 dependencies = [
  "proc-macro2 1.0.64",
  "quote 1.0.29",
@@ -3009,7 +3020,7 @@ checksum = "463fe12d7993d3b327787537ce8dd4dfa058de32fc2b195ef3cde03dc4771e8f"
 dependencies = [
  "proc-macro2 1.0.64",
  "quote 1.0.29",
- "syn 2.0.
+ "syn 2.0.26",
 ]
 
 [[package]]
@@ -3164,7 +3175,7 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e"
 dependencies = [
  "proc-macro2 1.0.64",
  "quote 1.0.29",
- "syn 2.0.
+ "syn 2.0.26",
 ]
 
 [[package]]
@@ -3343,9 +3354,9 @@ checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460"
 
 [[package]]
 name = "unicode-ident"
-version = "1.0.
+version = "1.0.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "
+checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c"
 
 [[package]]
 name = "unicode-normalization"
@@ -3486,7 +3497,7 @@ dependencies = [
  "once_cell",
  "proc-macro2 1.0.64",
  "quote 1.0.29",
- "syn 2.0.
+ "syn 2.0.26",
  "wasm-bindgen-shared",
 ]
 
@@ -3520,7 +3531,7 @@ checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b"
 dependencies = [
  "proc-macro2 1.0.64",
  "quote 1.0.29",
- "syn 2.0.
+ "syn 2.0.26",
  "wasm-bindgen-backend",
  "wasm-bindgen-shared",
 ]
@@ -3543,10 +3554,11 @@ dependencies = [
 
 [[package]]
 name = "websurfx"
-version = "0.
+version = "0.14.0"
 dependencies = [
  "actix-files",
  "actix-web",
+ "async-trait",
  "criterion",
  "env_logger",
  "error-stack",
```
Cargo.toml
CHANGED

```diff
@@ -1,6 +1,6 @@
 [package]
 name = "websurfx"
-version = "0.
+version = "0.14.0"
 edition = "2021"
 description = "An open-source alternative to Searx that provides clean, ad-free, and organic results with incredible speed while keeping privacy and security in mind."
 repository = "https://github.com/neon-mmd/websurfx"
@@ -12,7 +12,7 @@ tokio = {version="*",features=["full"]}
 serde = {version="*",features=["derive"]}
 handlebars = { version = "4.3.6", features = ["dir_source"] }
 scraper = {version="*"}
-actix-web = {version="4.3.1"}
+actix-web = {version="4.3.1", features = ["cookies"]}
 actix-files = {version="0.6.2"}
 serde_json = {version="*"}
 fake-useragent = {version="*"}
@@ -24,6 +24,7 @@ md5 = {version="*"}
 rand={version="*"}
 once_cell = {version="*"}
 error-stack = {version="0.3.1"}
+async-trait = {version="*"}
 
 [dev-dependencies]
 rusty-hook = "^0.11.2"
```
src/config/parser.rs
CHANGED

```diff
@@ -3,7 +3,7 @@
 
 use super::parser_models::Style;
 use rlua::Lua;
-use std::{format, fs, path::Path};
+use std::{collections::HashMap, format, fs, path::Path};
 
 // ------- Constants --------
 static COMMON_DIRECTORY_NAME: &str = "websurfx";
@@ -18,6 +18,10 @@ static CONFIG_FILE_NAME: &str = "config.lua";
 /// * `style` - It stores the theming options for the website.
 /// * `redis_url` - It stores the redis connection url address on which the redis
 /// client should connect.
+/// * `aggregator` - It stores the option to whether enable or disable production use.
+/// * `logging` - It stores the option to whether enable or disable logs.
+/// * `debug` - It stores the option to whether enable or disable debug mode.
+/// * `upstream_search_engines` - It stores all the engine names that were enabled by the user.
 #[derive(Clone)]
 pub struct Config {
     pub port: u16,
@@ -27,12 +31,17 @@ pub struct Config {
     pub aggregator: AggregatorConfig,
     pub logging: bool,
     pub debug: bool,
+    pub upstream_search_engines: Vec<String>,
 }
 
 /// Configuration options for the aggregator.
+///
+/// # Fields
+///
+/// * `random_delay` - It stores the option to whether enable or disable random delays between
+/// requests.
 #[derive(Clone)]
 pub struct AggregatorConfig {
-    /// Whether to introduce a random delay before sending the request to the search engine.
     pub random_delay: bool,
 }
 
@@ -66,6 +75,11 @@ impl Config {
             },
             logging: globals.get::<_, bool>("logging")?,
             debug: globals.get::<_, bool>("debug")?,
+            upstream_search_engines: globals
+                .get::<_, HashMap<String, bool>>("upstream_search_engines")?
+                .into_iter()
+                .filter_map(|(key, value)| value.then_some(key))
+                .collect(),
         })
     })
 }
```
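For illustration, here is a minimal, self-contained sketch of the `filter_map` step above, using a plain `HashMap` in place of the table rlua returns; it shows how a config like `upstream_search_engines = { DuckDuckGo = true, Searx = false }` collapses to the list of enabled engine names:

```rust
use std::collections::HashMap;

fn main() {
    // Stand-in for the table parsed out of config.lua.
    let engines: HashMap<String, bool> = HashMap::from([
        ("DuckDuckGo".to_string(), true),
        ("Searx".to_string(), false),
    ]);

    // `value.then_some(key)` yields Some(key) only when value is true,
    // so engines disabled in the config are dropped here.
    let enabled: Vec<String> = engines
        .into_iter()
        .filter_map(|(key, value)| value.then_some(key))
        .collect();

    assert_eq!(enabled, vec!["DuckDuckGo".to_string()]);
}
```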
src/engines/duckduckgo.rs
CHANGED

The standalone `results` function is replaced by a `DuckDuckGo` unit struct implementing the new `SearchEngine` trait; the scraping logic itself moves into the trait's `results` method.

```diff
@@ -2,154 +2,150 @@
 //! by querying the upstream duckduckgo search engine with user provided query and with a page
 //! number if provided.
 
-use std::
+use std::collections::HashMap;
 
 use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT};
 use scraper::{Html, Selector};
 
 use crate::results::aggregation_models::RawSearchResult;
 
-use super::engine_models::EngineError;
+use super::engine_models::{EngineError, SearchEngine};
 
 use error_stack::{IntoReport, Report, Result, ResultExt};
 
-pub async fn results(
-    query: &str,
-    page: u32,
-    user_agent: &str,
-) -> Result<HashMap<String, RawSearchResult>, EngineError> {
+/// A new DuckDuckGo engine type defined in-order to implement the `SearchEngine` trait which allows to
+/// reduce code duplication as well as allows to create vector of different search engines easily.
+pub struct DuckDuckGo;
+
+#[async_trait::async_trait]
+impl SearchEngine for DuckDuckGo {
+    /// This function scrapes results from the upstream engine duckduckgo and puts all the scraped
+    /// results like title, visiting_url (href in html), engine (from which engine it was fetched from)
+    /// and description in a RawSearchResult and then adds that to HashMap whose keys are url and
+    /// values are RawSearchResult struct and then returns it within a Result enum.
+    ///
+    /// # Arguments
+    ///
+    /// * `query` - Takes the user provided query to query to the upstream search engine with.
+    /// * `page` - Takes an u32 as an argument.
+    /// * `user_agent` - Takes a random user agent string as an argument.
+    ///
+    /// # Errors
+    ///
+    /// Returns an `EngineErrorKind` if the user is not connected to the internet or if there is a failure to
+    /// reach the above `upstream search engine` page or if the `upstream search engine` is unable to
+    /// provide results for the requested search query and also returns error if the scraping selector
+    /// or HeaderMap fails to initialize.
+    async fn results(
+        &self,
+        query: String,
+        page: u32,
+        user_agent: String,
+    ) -> Result<HashMap<String, RawSearchResult>, EngineError> {
+        // Page number can be missing or empty string and so appropriate handling is required
+        // so that upstream server receives valid page number.
+        let url: String = match page {
+            1 => {
+                format!("https://html.duckduckgo.com/html/?q={query}&s=&dc=&v=1&o=json&api=/d.js")
+            }
+            _ => {
+                format!(
+                    "https://duckduckgo.com/html/?q={}&s={}&dc={}&v=1&o=json&api=/d.js",
+                    query,
+                    (page / 2 + (page % 2)) * 30,
+                    (page / 2 + (page % 2)) * 30 + 1
+                )
+            }
+        };
 
+        // initializing HeaderMap and adding appropriate headers.
+        let mut header_map = HeaderMap::new();
+        header_map.insert(
+            USER_AGENT,
+            user_agent
+                .parse()
+                .into_report()
+                .change_context(EngineError::UnexpectedError)?,
+        );
+        header_map.insert(
+            REFERER,
+            "https://google.com/"
+                .parse()
+                .into_report()
+                .change_context(EngineError::UnexpectedError)?,
+        );
+        header_map.insert(
+            CONTENT_TYPE,
+            "application/x-www-form-urlencoded"
+                .parse()
+                .into_report()
+                .change_context(EngineError::UnexpectedError)?,
+        );
+        header_map.insert(
+            COOKIE,
+            "kl=wt-wt"
+                .parse()
+                .into_report()
+                .change_context(EngineError::UnexpectedError)?,
+        );
 
+        let document: Html = Html::parse_document(
+            &DuckDuckGo::fetch_html_from_upstream(self, url, header_map).await?,
+        );
 
+        let no_result: Selector = Selector::parse(".no-results")
+            .map_err(|_| Report::new(EngineError::UnexpectedError))
+            .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".no-results"))?;
 
+        if document.select(&no_result).next().is_some() {
+            return Err(Report::new(EngineError::EmptyResultSet));
+        }
 
+        let results: Selector = Selector::parse(".result")
+            .map_err(|_| Report::new(EngineError::UnexpectedError))
+            .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?;
+        let result_title: Selector = Selector::parse(".result__a")
+            .map_err(|_| Report::new(EngineError::UnexpectedError))
+            .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__a"))?;
+        let result_url: Selector = Selector::parse(".result__url")
+            .map_err(|_| Report::new(EngineError::UnexpectedError))
+            .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__url"))?;
+        let result_desc: Selector = Selector::parse(".result__snippet")
+            .map_err(|_| Report::new(EngineError::UnexpectedError))
+            .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__snippet"))?;
 
+        // scrape all the results from the html
+        Ok(document
+            .select(&results)
+            .map(|result| {
+                RawSearchResult::new(
+                    result
+                        .select(&result_title)
+                        .next()
+                        .unwrap()
+                        .inner_html()
+                        .trim()
+                        .to_string(),
+                    format!(
+                        "https://{}",
+                        result
+                            .select(&result_url)
+                            .next()
+                            .unwrap()
+                            .inner_html()
+                            .trim()
+                    ),
+                    result
+                        .select(&result_desc)
+                        .next()
+                        .unwrap()
+                        .inner_html()
+                        .trim()
+                        .to_string(),
+                    vec!["duckduckgo".to_string()],
+                )
+            })
+            .map(|search_result| (search_result.visiting_url.clone(), search_result))
+            .collect())
+    }
 }
```
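A rough sketch of how the new trait method is called from inside the crate (the user-agent string below is a placeholder; in the crate it comes from `random_user_agent()`):

```rust
use crate::engines::{duckduckgo::DuckDuckGo, engine_models::SearchEngine};

// Assumed to run inside the crate, in an async (tokio) context.
async fn demo() {
    let engine = DuckDuckGo;
    // Placeholder user agent for illustration only.
    match engine
        .results("rust language".to_string(), 1, "Mozilla/5.0".to_string())
        .await
    {
        // Keys are the visiting URLs, values the scraped RawSearchResult structs.
        Ok(results) => println!("{} results scraped", results.len()),
        Err(report) => eprintln!("duckduckgo failed: {report:?}"),
    }
}
```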
src/engines/engine_models.rs
CHANGED

```diff
@@ -1,8 +1,9 @@
 //! This module provides the error enum to handle different errors associated while requesting data from
 //! the upstream search engines with the search query provided by the user.
 
-use
-use
+use crate::results::aggregation_models::RawSearchResult;
+use error_stack::{IntoReport, Result, ResultExt};
+use std::{collections::HashMap, fmt, time::Duration};
 
 /// A custom error type used for handle engine associated errors.
 ///
@@ -40,4 +41,35 @@ impl fmt::Display for EngineError {
     }
 }
 
-impl Context for EngineError {}
+impl error_stack::Context for EngineError {}
+
+/// A trait to define common behaviour for all search engines.
+#[async_trait::async_trait]
+pub trait SearchEngine {
+    async fn fetch_html_from_upstream(
+        &self,
+        url: String,
+        header_map: reqwest::header::HeaderMap,
+    ) -> Result<String, EngineError> {
+        // fetch the html from upstream search engine
+        Ok(reqwest::Client::new()
+            .get(url)
+            .timeout(Duration::from_secs(30)) // Add timeout to request to avoid DDOSing the server
+            .headers(header_map) // add spoofed headers to emulate human behaviour
+            .send()
+            .await
+            .into_report()
+            .change_context(EngineError::RequestError)?
+            .text()
+            .await
+            .into_report()
+            .change_context(EngineError::RequestError)?)
+    }
+
+    async fn results(
+        &self,
+        query: String,
+        page: u32,
+        user_agent: String,
+    ) -> Result<HashMap<String, RawSearchResult>, EngineError>;
+}
```
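Because `fetch_html_from_upstream` has a default body, adding a third engine only requires implementing `results`. A sketch with a hypothetical `Mojeek` engine (the name and the empty body are made up purely to show the shape of the contract):

```rust
use std::collections::HashMap;

use error_stack::Result;

use super::engine_models::{EngineError, SearchEngine};
use crate::results::aggregation_models::RawSearchResult;

/// Hypothetical engine type, for illustration only.
pub struct Mojeek;

#[async_trait::async_trait]
impl SearchEngine for Mojeek {
    async fn results(
        &self,
        query: String,
        page: u32,
        user_agent: String,
    ) -> Result<HashMap<String, RawSearchResult>, EngineError> {
        // A real implementation would build the engine's URL and headers,
        // call the default `fetch_html_from_upstream`, and scrape the HTML,
        // mirroring duckduckgo.rs and searx.rs.
        let _ = (query, page, user_agent);
        Ok(HashMap::new())
    }
}
```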
src/engines/searx.rs
CHANGED

As in duckduckgo.rs, the standalone `results` function is replaced by a `Searx` unit struct implementing the `SearchEngine` trait.

```diff
@@ -8,131 +8,130 @@ use std::collections::HashMap;
 
 use crate::results::aggregation_models::RawSearchResult;
 
-use super::engine_models::EngineError;
+use super::engine_models::{EngineError, SearchEngine};
 use error_stack::{IntoReport, Report, Result, ResultExt};
 
-pub async fn results(
-    query: &str,
-    page: u32,
-    user_agent: &str,
-) -> Result<HashMap<String, RawSearchResult>, EngineError> {
+/// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to
+/// reduce code duplication as well as allows to create vector of different search engines easily.
+pub struct Searx;
+
+#[async_trait::async_trait]
+impl SearchEngine for Searx {
+    /// This function scrapes results from the upstream engine searx and puts all the scraped
+    /// results like title, visiting_url (href in html), engine (from which engine it was fetched from)
+    /// and description in a RawSearchResult and then adds that to HashMap whose keys are url and
+    /// values are RawSearchResult struct and then returns it within a Result enum.
+    ///
+    /// # Arguments
+    ///
+    /// * `query` - Takes the user provided query to query to the upstream search engine with.
+    /// * `page` - Takes an u32 as an argument.
+    /// * `user_agent` - Takes a random user agent string as an argument.
+    ///
+    /// # Errors
+    ///
+    /// Returns an `EngineErrorKind` if the user is not connected to the internet or if there is a failure to
+    /// reach the above `upstream search engine` page or if the `upstream search engine` is unable to
+    /// provide results for the requested search query and also returns error if the scraping selector
+    /// or HeaderMap fails to initialize.
+    async fn results(
+        &self,
+        query: String,
+        page: u32,
+        user_agent: String,
+    ) -> Result<HashMap<String, RawSearchResult>, EngineError> {
+        // Page number can be missing or empty string and so appropriate handling is required
+        // so that upstream server receives valid page number.
+        let url: String = format!("https://searx.work/search?q={query}&pageno={page}");
 
+        // initializing headers and adding appropriate headers.
+        let mut header_map = HeaderMap::new();
+        header_map.insert(
+            USER_AGENT,
+            user_agent
+                .parse()
+                .into_report()
+                .change_context(EngineError::UnexpectedError)?,
+        );
+        header_map.insert(
+            REFERER,
+            "https://google.com/"
+                .parse()
+                .into_report()
+                .change_context(EngineError::UnexpectedError)?,
+        );
+        header_map.insert(
+            CONTENT_TYPE,
+            "application/x-www-form-urlencoded"
+                .parse()
+                .into_report()
+                .change_context(EngineError::UnexpectedError)?,
+        );
+        header_map.insert(COOKIE, "categories=general; language=auto; locale=en; autocomplete=duckduckgo; image_proxy=1; method=POST; safesearch=2; theme=simple; results_on_new_tab=1; doi_resolver=oadoi.org; simple_style=auto; center_alignment=1; query_in_title=1; infinite_scroll=0; disabled_engines=; enabled_engines=\"archive is__general\\054yep__general\\054curlie__general\\054currency__general\\054ddg definitions__general\\054wikidata__general\\054duckduckgo__general\\054tineye__general\\054lingva__general\\054startpage__general\\054yahoo__general\\054wiby__general\\054marginalia__general\\054alexandria__general\\054wikibooks__general\\054wikiquote__general\\054wikisource__general\\054wikiversity__general\\054wikivoyage__general\\054dictzone__general\\054seznam__general\\054mojeek__general\\054naver__general\\054wikimini__general\\054brave__general\\054petalsearch__general\\054goo__general\"; disabled_plugins=; enabled_plugins=\"searx.plugins.hostname_replace\\054searx.plugins.oa_doi_rewrite\\054searx.plugins.vim_hotkeys\"; tokens=; maintab=on; enginetab=on".parse().into_report().change_context(EngineError::UnexpectedError)?);
 
+        let document: Html =
+            Html::parse_document(&Searx::fetch_html_from_upstream(self, url, header_map).await?);
 
+        let no_result: Selector = Selector::parse("#urls>.dialog-error>p")
+            .map_err(|_| Report::new(EngineError::UnexpectedError))
+            .attach_printable_lazy(|| {
+                format!("invalid CSS selector: {}", "#urls>.dialog-error>p")
+            })?;
+
+        if let Some(no_result_msg) = document.select(&no_result).nth(1) {
+            if no_result_msg.inner_html()
                 == "we didn't find any results. Please use another query or search in more categories"
             {
                 return Err(Report::new(EngineError::EmptyResultSet));
             }
+        }
 
+        let results: Selector = Selector::parse(".result")
+            .map_err(|_| Report::new(EngineError::UnexpectedError))
+            .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?;
+        let result_title: Selector = Selector::parse("h3>a")
+            .map_err(|_| Report::new(EngineError::UnexpectedError))
+            .attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?;
+        let result_url: Selector = Selector::parse("h3>a")
+            .map_err(|_| Report::new(EngineError::UnexpectedError))
+            .attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?;
 
+        let result_desc: Selector = Selector::parse(".content")
+            .map_err(|_| Report::new(EngineError::UnexpectedError))
+            .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".content"))?;
 
+        // scrape all the results from the html
+        Ok(document
+            .select(&results)
+            .map(|result| {
+                RawSearchResult::new(
+                    result
+                        .select(&result_title)
+                        .next()
+                        .unwrap()
+                        .inner_html()
+                        .trim()
+                        .to_string(),
+                    result
+                        .select(&result_url)
+                        .next()
+                        .unwrap()
+                        .value()
+                        .attr("href")
+                        .unwrap()
+                        .to_string(),
+                    result
+                        .select(&result_desc)
+                        .next()
+                        .unwrap()
+                        .inner_html()
+                        .trim()
+                        .to_string(),
+                    vec!["searx".to_string()],
+                )
+            })
+            .map(|search_result| (search_result.visiting_url.clone(), search_result))
+            .collect())
+    }
 }
```
src/results/aggregation_models.rs
CHANGED

```diff
@@ -3,7 +3,7 @@
 
 use serde::{Deserialize, Serialize};
 
-use crate::config::parser_models::Style;
+use crate::{config::parser_models::Style, engines::engine_models::EngineError};
 
 /// A named struct to store, serialize and deserializes the individual search result from all the
 /// scraped and aggregated search results from the upstream search engines.
@@ -16,7 +16,7 @@ use crate::config::parser_models::Style;
 /// * `url` - The url to be displayed below the search result title in html.
 /// * `description` - The description of the search result.
 /// * `engine` - The names of the upstream engines from which this results were provided.
-#[derive(
+#[derive(Serialize, Deserialize)]
 #[serde(rename_all = "camelCase")]
 pub struct SearchResult {
     pub title: String,
@@ -116,6 +116,25 @@ impl RawSearchResult {
     }
 }
 
+#[derive(Serialize, Deserialize)]
+pub struct EngineErrorInfo {
+    pub error: String,
+    pub engine: String,
+}
+
+impl EngineErrorInfo {
+    pub fn new(error: &EngineError, engine: String) -> Self {
+        Self {
+            error: match error {
+                EngineError::RequestError => String::from("RequestError"),
+                EngineError::EmptyResultSet => String::from("EmptyResultSet"),
+                EngineError::UnexpectedError => String::from("UnexpectedError"),
+            },
+            engine,
+        }
+    }
+}
+
 /// A named struct to store, serialize, deserialize the all the search results scraped and
 /// aggregated from the upstream search engines.
 ///
@@ -124,12 +143,18 @@ impl RawSearchResult {
 /// * `results` - Stores the individual serializable `SearchResult` struct into a vector of
 /// `SearchResult` structs.
 /// * `page_query` - Stores the current pages search query `q` provided in the search url.
+/// * `style` - Stores the theming options for the website.
+/// * `engine_errors_info` - Stores the information on which engines failed with their engine name
+/// and the type of error that caused it.
+/// * `empty_result_set` - Stores a boolean which indicates that no engines gave a result for the
+/// given search query.
 #[derive(Serialize, Deserialize)]
 #[serde(rename_all = "camelCase")]
 pub struct SearchResults {
     pub results: Vec<SearchResult>,
     pub page_query: String,
     pub style: Style,
+    pub engine_errors_info: Vec<EngineErrorInfo>,
 }
 
 impl SearchResults {
@@ -141,14 +166,22 @@ impl SearchResults {
     /// and stores it into a vector of `SearchResult` structs.
     /// * `page_query` - Takes an argument of current page`s search query `q` provided in
     /// the search url.
+    /// * `empty_result_set` - Takes a boolean which indicates that no engines gave a result for the
+    /// given search query.
+    pub fn new(
+        results: Vec<SearchResult>,
+        page_query: String,
+        engine_errors_info: Vec<EngineErrorInfo>,
+    ) -> Self {
         SearchResults {
             results,
             page_query,
             style: Style::new("".to_string(), "".to_string()),
+            engine_errors_info,
         }
     }
 
+    /// A setter function to add website style to the return search results.
     pub fn add_style(&mut self, style: Style) {
         self.style = style;
     }
```
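As a sketch, this is roughly how `EngineErrorInfo` turns an engine failure into the serializable form the UI consumes (assuming the crate's modules are in scope):

```rust
use crate::engines::engine_models::EngineError;
use crate::results::aggregation_models::EngineErrorInfo;

fn demo() -> serde_json::Result<String> {
    // A request failure from the duckduckgo engine becomes a plain string pair
    // that serializes as {"error":"RequestError","engine":"duckduckgo"}.
    let info = EngineErrorInfo::new(&EngineError::RequestError, "duckduckgo".to_string());
    serde_json::to_string(&info)
}
```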
src/results/aggregator.rs
CHANGED

```diff
@@ -3,22 +3,41 @@
 
 use std::{collections::HashMap, time::Duration};
 
+use error_stack::Report;
 use rand::Rng;
-use tokio::
+use tokio::task::JoinHandle;
 
 use super::{
-    aggregation_models::{RawSearchResult, SearchResult, SearchResults},
+    aggregation_models::{EngineErrorInfo, RawSearchResult, SearchResult, SearchResults},
     user_agent::random_user_agent,
 };
 
-use crate::engines::{duckduckgo, searx};
+use crate::engines::{
+    duckduckgo,
+    engine_models::{EngineError, SearchEngine},
+    searx,
+};
+
+/// Aliases for long type annotations
+type FutureVec = Vec<JoinHandle<Result<HashMap<String, RawSearchResult>, Report<EngineError>>>>;
 
+/// The function aggregates the scraped results from the user-selected upstream search engines.
+/// These engines can be chosen either from the user interface (UI) or from the configuration file.
+/// The code handles this process by matching the selected search engines and adding them to a vector.
+/// This vector is then used to create an asynchronous task vector using `tokio::spawn`, which returns
+/// a future. This future is awaited in another loop. Once the results are collected, they are filtered
+/// to remove any errors and ensure only proper results are included. If an error is encountered, it is
+/// sent to the UI along with the name of the engine and the type of error. This information is finally
+/// placed in the returned `SearchResults` struct.
+///
+/// Additionally, the function eliminates duplicate results. If two results are identified as coming from
+/// multiple engines, their names are combined to indicate that the results were fetched from these upstream
+/// engines. After this, all the data in the `HashMap` is removed and placed into a struct that contains all
+/// the aggregated results in a vector. Furthermore, the query used is also added to the struct. This step is
+/// necessary to ensure that the search bar in the search remains populated even when searched from the query URL.
+///
+/// Overall, this function serves to aggregate scraped results from user-selected search engines, handling errors,
+/// removing duplicates, and organizing the data for display in the UI.
 ///
 /// # Example:
 ///
@@ -30,6 +49,9 @@ use crate::engines::{duckduckgo, searx};
 /// * `query` - Accepts a string to query with the above upstream search engines.
 /// * `page` - Accepts an u32 page number.
 /// * `random_delay` - Accepts a boolean value to add a random delay before making the request.
+/// * `debug` - Accepts a boolean value to enable or disable debug mode option.
+/// * `upstream_search_engines` - Accepts a vector of search engine names which was selected by the
+/// user through the UI or the config file.
 ///
 /// # Error
 ///
@@ -37,10 +59,11 @@ use crate::engines::{duckduckgo, searx};
 /// function in either `searx` or `duckduckgo` or both otherwise returns a `SearchResults struct`
 /// containing appropriate values.
 pub async fn aggregate(
-    query:
+    query: String,
     page: u32,
     random_delay: bool,
     debug: bool,
+    upstream_search_engines: Vec<String>,
 ) -> Result<SearchResults, Box<dyn std::error::Error>> {
     let user_agent: String = random_user_agent();
     let mut result_map: HashMap<String, RawSearchResult> = HashMap::new();
@@ -53,41 +76,106 @@ pub async fn aggregate(
     }
 
     // fetch results from upstream search engines simultaneously/concurrently.
+    let search_engines: Vec<Box<dyn SearchEngine + Send + Sync>> = upstream_search_engines
+        .iter()
+        .map(|engine| match engine.to_lowercase().as_str() {
+            "duckduckgo" => Box::new(duckduckgo::DuckDuckGo) as Box<dyn SearchEngine + Send + Sync>,
+            "searx" => Box::new(searx::Searx) as Box<dyn SearchEngine + Send + Sync>,
+            &_ => panic!("Config Error: Incorrect config file option provided"),
+        })
+        .collect();
 
+    let task_capacity: usize = search_engines.len();
+
+    let tasks: FutureVec = search_engines
+        .into_iter()
+        .map(|search_engine| {
+            let query: String = query.clone();
+            let user_agent: String = user_agent.clone();
+            tokio::spawn(
+                async move { search_engine.results(query, page, user_agent.clone()).await },
+            )
+        })
+        .collect();
 
+    let mut outputs = Vec::with_capacity(task_capacity);
+
+    for task in tasks {
+        if let Ok(result) = task.await {
+            outputs.push(result)
+        }
+    }
 
+    let mut engine_errors_info: Vec<EngineErrorInfo> = Vec::new();
+
+    // The code block `outputs.iter()` determines whether it is the first time the code is being run.
+    // It does this by checking the initial flag. If it is the first time, the code selects the first
+    // engine from which results are fetched and adds or extends them into the `result_map`. If the
+    // initially selected engine fails, the code automatically selects another engine to map or extend
+    // into the `result_map`. On the other hand, if an engine selected for the first time successfully
+    // fetches results and maps them into the `result_map`, the initial flag is set to false. Subsequently,
+    // the code iterates through the remaining engines one by one. It compares the fetched results from each
+    // engine with the results already present in the `result_map` to identify any duplicates. If duplicate
+    // results are found, the code groups them together with the name of the engine from which they were
+    // fetched, and automatically removes the duplicate results from the newly fetched data.
+    //
+    // Additionally, the code handles errors returned by the engines. It keeps track of which engines
+    // encountered errors and stores this information in a vector of structures called `EngineErrorInfo`.
+    // Each structure in this vector contains the name of the engine and the type of error it returned.
+    // These structures will later be added to the final `SearchResults` structure. The `SearchResults`
+    // structure is used to display an error box in the UI containing the relevant information from
+    // the `EngineErrorInfo` structure.
+    //
+    // In summary, this code block manages the selection of engines, handling of duplicate results, and tracking
+    // of errors in order to populate the `result_map` and provide informative feedback to the user through the
+    // `SearchResults` structure.
+    let mut initial: bool = true;
+    let mut counter: usize = 0;
+    outputs.iter().for_each(|results| {
+        if initial {
+            match results {
+                Ok(result) => {
+                    result_map.extend(result.clone());
+                    counter += 1;
+                    initial = false
+                }
+                Err(error_type) => {
+                    engine_errors_info.push(EngineErrorInfo::new(
+                        error_type.downcast_ref::<EngineError>().unwrap(),
+                        upstream_search_engines[counter].clone(),
+                    ));
+                    counter += 1
+                }
+            }
+        } else {
+            match results {
+                Ok(result) => {
+                    result.clone().into_iter().for_each(|(key, value)| {
+                        result_map
+                            .entry(key)
+                            .and_modify(|result| {
+                                result.add_engines(value.clone().engine());
+                            })
+                            .or_insert_with(|| -> RawSearchResult {
+                                RawSearchResult::new(
+                                    value.title.clone(),
+                                    value.visiting_url.clone(),
+                                    value.description.clone(),
+                                    value.engine.clone(),
+                                )
+                            });
+                    });
+                    counter += 1
+                }
+                Err(error_type) => {
+                    engine_errors_info.push(EngineErrorInfo::new(
+                        error_type.downcast_ref::<EngineError>().unwrap(),
+                        upstream_search_engines[counter].clone(),
+                    ));
+                    counter += 1
+                }
+            }
+        }
     });
 
     Ok(SearchResults::new(
@@ -104,5 +192,6 @@ pub async fn aggregate(
         })
         .collect(),
         query.to_string(),
+        engine_errors_info,
     ))
 }
```
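The `entry(...).and_modify(...).or_insert_with(...)` pattern above is what merges duplicate URLs across engines. A minimal standalone sketch of the same idea, tracking only engine names per URL:

```rust
use std::collections::HashMap;

fn main() {
    // The first engine (duckduckgo) already produced a result for this URL.
    let mut engines_by_url: HashMap<String, Vec<String>> = HashMap::from([(
        "https://www.rust-lang.org/".to_string(),
        vec!["duckduckgo".to_string()],
    )]);

    // A later engine (searx) returns the same URL: instead of inserting a
    // duplicate entry, its name is appended to the existing engine list.
    engines_by_url
        .entry("https://www.rust-lang.org/".to_string())
        .and_modify(|engines| engines.push("searx".to_string()))
        .or_insert_with(|| vec!["searx".to_string()]);

    assert_eq!(
        engines_by_url["https://www.rust-lang.org/"],
        vec!["duckduckgo".to_string(), "searx".to_string()]
    );
}
```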
src/server/routes.rs
CHANGED

```diff
@@ -22,7 +22,7 @@ use serde::Deserialize;
 /// of the search url.
 /// * `page` - It stores the search parameter `page` (or pageno in simple words)
 /// of the search url.
-#[derive(
+#[derive(Deserialize)]
 struct SearchParams {
     q: Option<String>,
     page: Option<u32>,
@@ -51,6 +51,21 @@ pub async fn not_found(
         .body(page_content))
 }
 
+/// A named struct which is used to deserialize the cookies fetched from the client side.
+///
+/// # Fields
+///
+/// * `theme` - It stores the theme name used in the website.
+/// * `colorscheme` - It stores the colorscheme name used for the website theme.
+/// * `engines` - It stores the user selected upstream search engines selected from the UI.
+#[allow(dead_code)]
+#[derive(Deserialize)]
+struct Cookie {
+    theme: String,
+    colorscheme: String,
+    engines: Vec<String>,
+}
+
 /// Handles the route of search page of the `websurfx` meta search engine website and it takes
 /// two search url parameters `q` and `page` where `page` parameter is optional.
 ///
@@ -72,7 +87,6 @@ pub async fn search(
     config: web::Data<Config>,
 ) -> Result<HttpResponse, Box<dyn std::error::Error>> {
     let params = web::Query::<SearchParams>::from_query(req.query_string())?;
-
     match &params.q {
         Some(query) => {
             if query.trim().is_empty() {
@@ -89,7 +103,7 @@ pub async fn search(
                 "http://{}:{}/search?q={}&page={}",
                 config.binding_ip, config.port, query, page
             );
-            let results_json = results(url, &config, query, page).await?;
+            let results_json = results(url, &config, query.to_string(), page, req).await?;
             let page_content: String = hbs.render("search", &results_json)?;
             Ok(HttpResponse::Ok().body(page_content))
         }
@@ -104,23 +118,51 @@ pub async fn search(
 async fn results(
     url: String,
     config: &Config,
-    query:
+    query: String,
     page: u32,
+    req: HttpRequest,
 ) -> Result<SearchResults, Box<dyn std::error::Error>> {
     //Initialize redis cache connection struct
     let mut redis_cache = RedisCache::new(config.redis_url.clone())?;
     // fetch the cached results json.
     let cached_results_json = redis_cache.cached_json(&url);
-    // check if fetched results was indeed fetched or it was an error and if so
+    // check if fetched cache results was indeed fetched or it was an error and if so
     // handle the data accordingly.
     match cached_results_json {
-        Ok(
+        Ok(results) => Ok(serde_json::from_str::<SearchResults>(&results).unwrap()),
         Err(_) => {
+            // check if the cookie value is empty or not if it is empty then use the
+            // default selected upstream search engines from the config file otherwise
+            // parse the non-empty cookie and grab the user selected engines from the
+            // UI and use that.
+            let mut results: crate::results::aggregation_models::SearchResults = match req
+                .cookie("appCookie")
+            {
+                Some(cookie_value) => {
+                    let cookie_value: Cookie = serde_json::from_str(cookie_value.name_value().1)?;
+                    aggregate(
+                        query,
+                        page,
+                        config.aggregator.random_delay,
+                        config.debug,
+                        cookie_value.engines,
+                    )
+                    .await?
+                }
+                None => {
+                    aggregate(
+                        query,
+                        page,
+                        config.aggregator.random_delay,
+                        config.debug,
+                        config.upstream_search_engines.clone(),
+                    )
+                    .await?
+                }
+            };
+            results.add_style(config.style.clone());
+            redis_cache.cache_results(serde_json::to_string(&results)?, &url)?;
+            Ok(results)
         }
     }
 }
```
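The `appCookie` value is expected to be JSON that deserializes into the `Cookie` struct above. A sketch of a value that would parse (the theme and colorscheme names are placeholders; the engine names must match the keys used in config.lua):

```rust
use serde::Deserialize;

// Mirrors the private Cookie struct from routes.rs for a standalone check.
#[derive(Deserialize)]
struct Cookie {
    theme: String,
    colorscheme: String,
    engines: Vec<String>,
}

fn main() -> Result<(), serde_json::Error> {
    let raw = r#"{"theme":"simple","colorscheme":"catppuccin-mocha","engines":["DuckDuckGo","Searx"]}"#;
    let cookie: Cookie = serde_json::from_str(raw)?;
    // `engines` feeds straight into aggregate() as upstream_search_engines.
    assert_eq!(cookie.engines, vec!["DuckDuckGo", "Searx"]);
    Ok(())
}
```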
websurfx/config.lua
CHANGED

```diff
@@ -5,7 +5,7 @@ debug = false -- an option to enable or disable debug mode.
 -- ### Server ###
 port = "8080" -- port on which server should be launched
 binding_ip = "127.0.0.1" --ip address on the which server should be launched.
-production_use = false -- whether to use production mode or not (in other words this option should be used if it is to be used to host it on the server to provide a service to a large number of users)
+production_use = false -- whether to use production mode or not (in other words this option should be used if it is to be used to host it on the server to provide a service to a large number of users (more than one))
 -- if production_use is set to true
 -- There will be a random delay before sending the request to the search engines, this is to prevent DDoSing the upstream search engines from a large number of simultaneous requests.
 
@@ -26,3 +26,6 @@ theme = "simple" -- the theme name which should be used for the website
 
 -- ### Caching ###
 redis_url = "redis://127.0.0.1:8082" -- redis connection url address on which the client should connect on.
+
+-- ### Search Engines ###
+upstream_search_engines = { DuckDuckGo = true, Searx = false } -- select the upstream search engines from which the results should be fetched.
```
|