Merge pull request #146 from neon-mmd/improve-async-multithreading
Changed files:

- Cargo.lock (+39 -27)
- Cargo.toml (+3 -2)
- src/config/parser.rs (+16 -2)
- src/engines/duckduckgo.rs (+125 -129)
- src/engines/engine_models.rs (+35 -3)
- src/engines/searx.rs (+112 -113)
- src/results/aggregation_models.rs (+36 -3)
- src/results/aggregator.rs (+129 -40)
- src/server/routes.rs (+53 -11)
- websurfx/config.lua (+4 -1)
Cargo.lock
CHANGED

```diff
@@ -292,6 +292,17 @@ version = "0.10.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "619743e34b5ba4e9703bba34deac3427c72507c7159f5fd030aea8cac0cfe341"
 
+[[package]]
+name = "async-trait"
+version = "0.1.71"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "a564d521dd56509c4c47480d00b80ee55f7e385ae48db5744c67ad50c92d2ebf"
+dependencies = [
+ "proc-macro2 1.0.64",
+ "quote 1.0.29",
+ "syn 2.0.26",
+]
+
 [[package]]
 name = "autocfg"
 version = "0.1.8"
@@ -506,18 +517,18 @@ dependencies = [
 
 [[package]]
 name = "clap"
-version = "4.3.
+version = "4.3.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "
+checksum = "3eab9e8ceb9afdade1ab3f0fd8dbce5b1b2f468ad653baf10e771781b2b67b73"
 dependencies = [
  "clap_builder",
 ]
 
 [[package]]
 name = "clap_builder"
-version = "4.3.
+version = "4.3.12"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "
+checksum = "9f2763db829349bf00cfc06251268865ed4363b93a943174f638daf3ecdba2cd"
 dependencies = [
  "anstyle",
  "clap_lex",
@@ -784,7 +795,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "13b588ba4ac1a99f7f2964d24b3d896ddc6bf847ee3855dbd4366f058cfcd331"
 dependencies = [
  "quote 1.0.29",
- "syn 2.0.
+ "syn 2.0.26",
 ]
 
 [[package]]
@@ -1457,7 +1468,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "cb0889898416213fab133e1d33a0e5858a48177452750691bde3666d0fdbaf8b"
 dependencies = [
  "hermit-abi",
- "rustix 0.38.
+ "rustix 0.38.4",
  "windows-sys",
 ]
 
@@ -1834,7 +1845,7 @@ checksum = "a948666b637a0f465e8564c73e89d4dde00d72d4d473cc972f390fc3dcee7d9c"
 dependencies = [
  "proc-macro2 1.0.64",
  "quote 1.0.29",
- "syn 2.0.
+ "syn 2.0.26",
 ]
 
 [[package]]
@@ -1952,7 +1963,7 @@ dependencies = [
  "pest_meta",
  "proc-macro2 1.0.64",
  "quote 1.0.29",
- "syn 2.0.
+ "syn 2.0.26",
 ]
 
 [[package]]
@@ -2054,7 +2065,7 @@ dependencies = [
  "phf_shared 0.11.2",
  "proc-macro2 1.0.64",
  "quote 1.0.29",
- "syn 2.0.
+ "syn 2.0.26",
 ]
 
 [[package]]
@@ -2398,9 +2409,9 @@ dependencies = [
 
 [[package]]
 name = "regex-automata"
-version = "0.3.
+version = "0.3.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "
+checksum = "39354c10dd07468c2e73926b23bb9c2caca74c5501e38a35da70406f1d923310"
 dependencies = [
  "aho-corasick",
  "memchr",
@@ -2409,9 +2420,9 @@ dependencies = [
 
 [[package]]
 name = "regex-syntax"
-version = "0.7.
+version = "0.7.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "
+checksum = "e5ea92a5b6195c6ef2a0295ea818b312502c6fc94dde986c5553242e18fd4ce2"
 
 [[package]]
 name = "reqwest"
@@ -2548,9 +2559,9 @@ dependencies = [
 
 [[package]]
 name = "rustix"
-version = "0.38.
+version = "0.38.4"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "
+checksum = "0a962918ea88d644592894bc6dc55acc6c0956488adcebbfb6e273506b7fd6e5"
 dependencies = [
  "bitflags 2.3.3",
  "errno",
@@ -2708,14 +2719,14 @@ checksum = "389894603bd18c46fa56231694f8d827779c0951a667087194cf9de94ed24682"
 dependencies = [
  "proc-macro2 1.0.64",
  "quote 1.0.29",
- "syn 2.0.
+ "syn 2.0.26",
 ]
 
 [[package]]
 name = "serde_json"
-version = "1.0.
+version = "1.0.102"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "
+checksum = "b5062a995d481b2308b6064e9af76011f2921c35f97b0468811ed9f6cd91dfed"
 dependencies = [
  "itoa 1.0.8",
  "ryu",
@@ -2937,9 +2948,9 @@ dependencies = [
 
 [[package]]
 name = "syn"
-version = "2.0.
+version = "2.0.26"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "
+checksum = "45c3457aacde3c65315de5031ec191ce46604304d2446e803d71ade03308d970"
 dependencies = [
  "proc-macro2 1.0.64",
  "quote 1.0.29",
@@ -3009,7 +3020,7 @@ checksum = "463fe12d7993d3b327787537ce8dd4dfa058de32fc2b195ef3cde03dc4771e8f"
 dependencies = [
  "proc-macro2 1.0.64",
  "quote 1.0.29",
- "syn 2.0.
+ "syn 2.0.26",
 ]
 
 [[package]]
@@ -3164,7 +3175,7 @@ checksum = "630bdcf245f78637c13ec01ffae6187cca34625e8c63150d424b59e55af2675e"
 dependencies = [
  "proc-macro2 1.0.64",
  "quote 1.0.29",
- "syn 2.0.
+ "syn 2.0.26",
 ]
 
 [[package]]
@@ -3343,9 +3354,9 @@ checksum = "92888ba5573ff080736b3648696b70cafad7d250551175acbaa4e0385b3e1460"
 
 [[package]]
 name = "unicode-ident"
-version = "1.0.
+version = "1.0.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "
+checksum = "301abaae475aa91687eb82514b328ab47a211a533026cb25fc3e519b86adfc3c"
 
 [[package]]
 name = "unicode-normalization"
@@ -3486,7 +3497,7 @@ dependencies = [
  "once_cell",
  "proc-macro2 1.0.64",
  "quote 1.0.29",
- "syn 2.0.
+ "syn 2.0.26",
  "wasm-bindgen-shared",
 ]
 
@@ -3520,7 +3531,7 @@ checksum = "54681b18a46765f095758388f2d0cf16eb8d4169b639ab575a8f5693af210c7b"
 dependencies = [
  "proc-macro2 1.0.64",
  "quote 1.0.29",
- "syn 2.0.
+ "syn 2.0.26",
  "wasm-bindgen-backend",
  "wasm-bindgen-shared",
 ]
@@ -3543,10 +3554,11 @@ dependencies = [
 
 [[package]]
 name = "websurfx"
-version = "0.
+version = "0.14.0"
 dependencies = [
  "actix-files",
  "actix-web",
+ "async-trait",
  "criterion",
  "env_logger",
  "error-stack",
```
Cargo.toml
CHANGED

```diff
@@ -1,6 +1,6 @@
 [package]
 name = "websurfx"
-version = "0.
+version = "0.14.0"
 edition = "2021"
 description = "An open-source alternative to Searx that provides clean, ad-free, and organic results with incredible speed while keeping privacy and security in mind."
 repository = "https://github.com/neon-mmd/websurfx"
@@ -12,7 +12,7 @@ tokio = {version="*",features=["full"]}
 serde = {version="*",features=["derive"]}
 handlebars = { version = "4.3.6", features = ["dir_source"] }
 scraper = {version="*"}
-actix-web = {version="4.3.1"}
+actix-web = {version="4.3.1", features = ["cookies"]}
 actix-files = {version="0.6.2"}
 serde_json = {version="*"}
 fake-useragent = {version="*"}
@@ -24,6 +24,7 @@ md5 = {version="*"}
 rand={version="*"}
 once_cell = {version="*"}
 error-stack = {version="0.3.1"}
+async-trait = {version="*"}
 
 [dev-dependencies]
 rusty-hook = "^0.11.2"
```
src/config/parser.rs
CHANGED

```diff
@@ -3,7 +3,7 @@
 
 use super::parser_models::Style;
 use rlua::Lua;
-use std::{format, fs, path::Path};
+use std::{collections::HashMap, format, fs, path::Path};
 
 // ------- Constants --------
 static COMMON_DIRECTORY_NAME: &str = "websurfx";
@@ -18,6 +18,10 @@ static CONFIG_FILE_NAME: &str = "config.lua";
 /// * `style` - It stores the theming options for the website.
 /// * `redis_url` - It stores the redis connection url address on which the redis
 /// client should connect.
+/// * `aggregator` - It stores the option to whether enable or disable production use.
+/// * `logging` - It stores the option to whether enable or disable logs.
+/// * `debug` - It stores the option to whether enable or disable debug mode.
+/// * `upstream_search_engines` - It stores all the engine names that were enabled by the user.
 #[derive(Clone)]
 pub struct Config {
     pub port: u16,
@@ -27,12 +31,17 @@ pub struct Config {
     pub aggregator: AggregatorConfig,
     pub logging: bool,
     pub debug: bool,
+    pub upstream_search_engines: Vec<String>,
 }
 
 /// Configuration options for the aggregator.
+///
+/// # Fields
+///
+/// * `random_delay` - It stores the option to whether enable or disable random delays between
+/// requests.
 #[derive(Clone)]
 pub struct AggregatorConfig {
-    /// Whether to introduce a random delay before sending the request to the search engine.
     pub random_delay: bool,
 }
 
@@ -66,6 +75,11 @@ impl Config {
             },
             logging: globals.get::<_, bool>("logging")?,
             debug: globals.get::<_, bool>("debug")?,
+            upstream_search_engines: globals
+                .get::<_, HashMap<String, bool>>("upstream_search_engines")?
+                .into_iter()
+                .filter_map(|(key, value)| value.then_some(key))
+                .collect(),
         })
     })
 }
```
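For illustration, here is a minimal, self-contained sketch of the `filter_map` step above, using a plain `HashMap` in place of the table rlua returns; it shows how a config like `upstream_search_engines = { DuckDuckGo = true, Searx = false }` collapses to the list of enabled engine names:

```rust
use std::collections::HashMap;

fn main() {
    // Stand-in for the table parsed out of config.lua.
    let engines: HashMap<String, bool> = HashMap::from([
        ("DuckDuckGo".to_string(), true),
        ("Searx".to_string(), false),
    ]);

    // `value.then_some(key)` yields Some(key) only when value is true,
    // so engines disabled in the config are dropped here.
    let enabled: Vec<String> = engines
        .into_iter()
        .filter_map(|(key, value)| value.then_some(key))
        .collect();

    assert_eq!(enabled, vec!["DuckDuckGo".to_string()]);
}
```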
src/engines/duckduckgo.rs
CHANGED

The standalone `results` function is replaced by a `DuckDuckGo` unit struct implementing the new `SearchEngine` trait; the scraping logic itself moves into the trait's `results` method.

```diff
@@ -2,154 +2,150 @@
 //! by querying the upstream duckduckgo search engine with user provided query and with a page
 //! number if provided.
 
-use std::
+use std::collections::HashMap;
 
 use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT};
 use scraper::{Html, Selector};
 
 use crate::results::aggregation_models::RawSearchResult;
 
-use super::engine_models::EngineError;
+use super::engine_models::{EngineError, SearchEngine};
 
 use error_stack::{IntoReport, Report, Result, ResultExt};
 
-pub async fn results(
-    query: &str,
-    page: u32,
-    user_agent: &str,
-) -> Result<HashMap<String, RawSearchResult>, EngineError> {
+/// A new DuckDuckGo engine type defined in-order to implement the `SearchEngine` trait which allows to
+/// reduce code duplication as well as allows to create vector of different search engines easily.
+pub struct DuckDuckGo;
+
+#[async_trait::async_trait]
+impl SearchEngine for DuckDuckGo {
+    /// This function scrapes results from the upstream engine duckduckgo and puts all the scraped
+    /// results like title, visiting_url (href in html), engine (from which engine it was fetched from)
+    /// and description in a RawSearchResult and then adds that to HashMap whose keys are url and
+    /// values are RawSearchResult struct and then returns it within a Result enum.
+    ///
+    /// # Arguments
+    ///
+    /// * `query` - Takes the user provided query to query to the upstream search engine with.
+    /// * `page` - Takes an u32 as an argument.
+    /// * `user_agent` - Takes a random user agent string as an argument.
+    ///
+    /// # Errors
+    ///
+    /// Returns an `EngineErrorKind` if the user is not connected to the internet or if there is a failure to
+    /// reach the above `upstream search engine` page or if the `upstream search engine` is unable to
+    /// provide results for the requested search query and also returns error if the scraping selector
+    /// or HeaderMap fails to initialize.
+    async fn results(
+        &self,
+        query: String,
+        page: u32,
+        user_agent: String,
+    ) -> Result<HashMap<String, RawSearchResult>, EngineError> {
+        // Page number can be missing or empty string and so appropriate handling is required
+        // so that upstream server receives valid page number.
+        let url: String = match page {
+            1 => {
+                format!("https://html.duckduckgo.com/html/?q={query}&s=&dc=&v=1&o=json&api=/d.js")
+            }
+            _ => {
+                format!(
+                    "https://duckduckgo.com/html/?q={}&s={}&dc={}&v=1&o=json&api=/d.js",
+                    query,
+                    (page / 2 + (page % 2)) * 30,
+                    (page / 2 + (page % 2)) * 30 + 1
+                )
+            }
+        };
 
+        // initializing HeaderMap and adding appropriate headers.
+        let mut header_map = HeaderMap::new();
+        header_map.insert(
+            USER_AGENT,
+            user_agent
+                .parse()
+                .into_report()
+                .change_context(EngineError::UnexpectedError)?,
+        );
+        header_map.insert(
+            REFERER,
+            "https://google.com/"
+                .parse()
+                .into_report()
+                .change_context(EngineError::UnexpectedError)?,
+        );
+        header_map.insert(
+            CONTENT_TYPE,
+            "application/x-www-form-urlencoded"
+                .parse()
+                .into_report()
+                .change_context(EngineError::UnexpectedError)?,
+        );
+        header_map.insert(
+            COOKIE,
+            "kl=wt-wt"
+                .parse()
+                .into_report()
+                .change_context(EngineError::UnexpectedError)?,
+        );
 
+        let document: Html = Html::parse_document(
+            &DuckDuckGo::fetch_html_from_upstream(self, url, header_map).await?,
+        );
 
+        let no_result: Selector = Selector::parse(".no-results")
+            .map_err(|_| Report::new(EngineError::UnexpectedError))
+            .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".no-results"))?;
 
+        if document.select(&no_result).next().is_some() {
+            return Err(Report::new(EngineError::EmptyResultSet));
+        }
 
+        let results: Selector = Selector::parse(".result")
+            .map_err(|_| Report::new(EngineError::UnexpectedError))
+            .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?;
+        let result_title: Selector = Selector::parse(".result__a")
+            .map_err(|_| Report::new(EngineError::UnexpectedError))
+            .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__a"))?;
+        let result_url: Selector = Selector::parse(".result__url")
+            .map_err(|_| Report::new(EngineError::UnexpectedError))
+            .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__url"))?;
+        let result_desc: Selector = Selector::parse(".result__snippet")
+            .map_err(|_| Report::new(EngineError::UnexpectedError))
+            .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__snippet"))?;
 
+        // scrape all the results from the html
+        Ok(document
+            .select(&results)
+            .map(|result| {
+                RawSearchResult::new(
+                    result
+                        .select(&result_title)
+                        .next()
+                        .unwrap()
+                        .inner_html()
+                        .trim()
+                        .to_string(),
+                    format!(
+                        "https://{}",
+                        result
+                            .select(&result_url)
+                            .next()
+                            .unwrap()
+                            .inner_html()
+                            .trim()
+                    ),
+                    result
+                        .select(&result_desc)
+                        .next()
+                        .unwrap()
+                        .inner_html()
+                        .trim()
+                        .to_string(),
+                    vec!["duckduckgo".to_string()],
+                )
+            })
+            .map(|search_result| (search_result.visiting_url.clone(), search_result))
+            .collect())
+    }
 }
```
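A rough sketch of how the new trait method is called from inside the crate (the user-agent string below is a placeholder; in the crate it comes from `random_user_agent()`):

```rust
use crate::engines::{duckduckgo::DuckDuckGo, engine_models::SearchEngine};

// Assumed to run inside the crate, in an async (tokio) context.
async fn demo() {
    let engine = DuckDuckGo;
    // Placeholder user agent for illustration only.
    match engine
        .results("rust language".to_string(), 1, "Mozilla/5.0".to_string())
        .await
    {
        // Keys are the visiting URLs, values the scraped RawSearchResult structs.
        Ok(results) => println!("{} results scraped", results.len()),
        Err(report) => eprintln!("duckduckgo failed: {report:?}"),
    }
}
```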
src/engines/engine_models.rs
CHANGED

```diff
@@ -1,8 +1,9 @@
 //! This module provides the error enum to handle different errors associated while requesting data from
 //! the upstream search engines with the search query provided by the user.
 
-use
-use
+use crate::results::aggregation_models::RawSearchResult;
+use error_stack::{IntoReport, Result, ResultExt};
+use std::{collections::HashMap, fmt, time::Duration};
 
 /// A custom error type used for handle engine associated errors.
 ///
@@ -40,4 +41,35 @@ impl fmt::Display for EngineError {
     }
 }
 
-impl Context for EngineError {}
+impl error_stack::Context for EngineError {}
+
+/// A trait to define common behaviour for all search engines.
+#[async_trait::async_trait]
+pub trait SearchEngine {
+    async fn fetch_html_from_upstream(
+        &self,
+        url: String,
+        header_map: reqwest::header::HeaderMap,
+    ) -> Result<String, EngineError> {
+        // fetch the html from upstream search engine
+        Ok(reqwest::Client::new()
+            .get(url)
+            .timeout(Duration::from_secs(30)) // Add timeout to request to avoid DDOSing the server
+            .headers(header_map) // add spoofed headers to emulate human behaviour
+            .send()
+            .await
+            .into_report()
+            .change_context(EngineError::RequestError)?
+            .text()
+            .await
+            .into_report()
+            .change_context(EngineError::RequestError)?)
+    }
+
+    async fn results(
+        &self,
+        query: String,
+        page: u32,
+        user_agent: String,
+    ) -> Result<HashMap<String, RawSearchResult>, EngineError>;
+}
```
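Because `fetch_html_from_upstream` has a default body, adding a third engine only requires implementing `results`. A sketch with a hypothetical `Mojeek` engine (the name and the empty body are made up purely to show the shape of the contract):

```rust
use std::collections::HashMap;

use error_stack::Result;

use super::engine_models::{EngineError, SearchEngine};
use crate::results::aggregation_models::RawSearchResult;

/// Hypothetical engine type, for illustration only.
pub struct Mojeek;

#[async_trait::async_trait]
impl SearchEngine for Mojeek {
    async fn results(
        &self,
        query: String,
        page: u32,
        user_agent: String,
    ) -> Result<HashMap<String, RawSearchResult>, EngineError> {
        // A real implementation would build the engine's URL and headers,
        // call the default `fetch_html_from_upstream`, and scrape the HTML,
        // mirroring duckduckgo.rs and searx.rs.
        let _ = (query, page, user_agent);
        Ok(HashMap::new())
    }
}
```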
src/engines/searx.rs
CHANGED

As in duckduckgo.rs, the standalone `results` function is replaced by a `Searx` unit struct implementing the `SearchEngine` trait.

```diff
@@ -8,131 +8,130 @@ use std::collections::HashMap;
 
 use crate::results::aggregation_models::RawSearchResult;
 
-use super::engine_models::EngineError;
+use super::engine_models::{EngineError, SearchEngine};
 use error_stack::{IntoReport, Report, Result, ResultExt};
 
-pub async fn results(
-    query: &str,
-    page: u32,
-    user_agent: &str,
-) -> Result<HashMap<String, RawSearchResult>, EngineError> {
+/// A new Searx engine type defined in-order to implement the `SearchEngine` trait which allows to
+/// reduce code duplication as well as allows to create vector of different search engines easily.
+pub struct Searx;
+
+#[async_trait::async_trait]
+impl SearchEngine for Searx {
+    /// This function scrapes results from the upstream engine searx and puts all the scraped
+    /// results like title, visiting_url (href in html), engine (from which engine it was fetched from)
+    /// and description in a RawSearchResult and then adds that to HashMap whose keys are url and
+    /// values are RawSearchResult struct and then returns it within a Result enum.
+    ///
+    /// # Arguments
+    ///
+    /// * `query` - Takes the user provided query to query to the upstream search engine with.
+    /// * `page` - Takes an u32 as an argument.
+    /// * `user_agent` - Takes a random user agent string as an argument.
+    ///
+    /// # Errors
+    ///
+    /// Returns an `EngineErrorKind` if the user is not connected to the internet or if there is a failure to
+    /// reach the above `upstream search engine` page or if the `upstream search engine` is unable to
+    /// provide results for the requested search query and also returns error if the scraping selector
+    /// or HeaderMap fails to initialize.
+    async fn results(
+        &self,
+        query: String,
+        page: u32,
+        user_agent: String,
+    ) -> Result<HashMap<String, RawSearchResult>, EngineError> {
+        // Page number can be missing or empty string and so appropriate handling is required
+        // so that upstream server receives valid page number.
+        let url: String = format!("https://searx.work/search?q={query}&pageno={page}");
 
+        // initializing headers and adding appropriate headers.
+        let mut header_map = HeaderMap::new();
+        header_map.insert(
+            USER_AGENT,
+            user_agent
+                .parse()
+                .into_report()
+                .change_context(EngineError::UnexpectedError)?,
+        );
+        header_map.insert(
+            REFERER,
+            "https://google.com/"
+                .parse()
+                .into_report()
+                .change_context(EngineError::UnexpectedError)?,
+        );
+        header_map.insert(
+            CONTENT_TYPE,
+            "application/x-www-form-urlencoded"
+                .parse()
+                .into_report()
+                .change_context(EngineError::UnexpectedError)?,
+        );
+        header_map.insert(COOKIE, "categories=general; language=auto; locale=en; autocomplete=duckduckgo; image_proxy=1; method=POST; safesearch=2; theme=simple; results_on_new_tab=1; doi_resolver=oadoi.org; simple_style=auto; center_alignment=1; query_in_title=1; infinite_scroll=0; disabled_engines=; enabled_engines=\"archive is__general\\054yep__general\\054curlie__general\\054currency__general\\054ddg definitions__general\\054wikidata__general\\054duckduckgo__general\\054tineye__general\\054lingva__general\\054startpage__general\\054yahoo__general\\054wiby__general\\054marginalia__general\\054alexandria__general\\054wikibooks__general\\054wikiquote__general\\054wikisource__general\\054wikiversity__general\\054wikivoyage__general\\054dictzone__general\\054seznam__general\\054mojeek__general\\054naver__general\\054wikimini__general\\054brave__general\\054petalsearch__general\\054goo__general\"; disabled_plugins=; enabled_plugins=\"searx.plugins.hostname_replace\\054searx.plugins.oa_doi_rewrite\\054searx.plugins.vim_hotkeys\"; tokens=; maintab=on; enginetab=on".parse().into_report().change_context(EngineError::UnexpectedError)?);
 
+        let document: Html =
+            Html::parse_document(&Searx::fetch_html_from_upstream(self, url, header_map).await?);
 
+        let no_result: Selector = Selector::parse("#urls>.dialog-error>p")
+            .map_err(|_| Report::new(EngineError::UnexpectedError))
+            .attach_printable_lazy(|| {
+                format!("invalid CSS selector: {}", "#urls>.dialog-error>p")
+            })?;
+
+        if let Some(no_result_msg) = document.select(&no_result).nth(1) {
+            if no_result_msg.inner_html()
                 == "we didn't find any results. Please use another query or search in more categories"
             {
                 return Err(Report::new(EngineError::EmptyResultSet));
             }
+        }
 
+        let results: Selector = Selector::parse(".result")
+            .map_err(|_| Report::new(EngineError::UnexpectedError))
+            .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?;
+        let result_title: Selector = Selector::parse("h3>a")
+            .map_err(|_| Report::new(EngineError::UnexpectedError))
+            .attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?;
+        let result_url: Selector = Selector::parse("h3>a")
+            .map_err(|_| Report::new(EngineError::UnexpectedError))
+            .attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?;
 
+        let result_desc: Selector = Selector::parse(".content")
+            .map_err(|_| Report::new(EngineError::UnexpectedError))
+            .attach_printable_lazy(|| format!("invalid CSS selector: {}", ".content"))?;
 
+        // scrape all the results from the html
+        Ok(document
+            .select(&results)
+            .map(|result| {
+                RawSearchResult::new(
+                    result
+                        .select(&result_title)
+                        .next()
+                        .unwrap()
+                        .inner_html()
+                        .trim()
+                        .to_string(),
+                    result
+                        .select(&result_url)
+                        .next()
+                        .unwrap()
+                        .value()
+                        .attr("href")
+                        .unwrap()
+                        .to_string(),
+                    result
+                        .select(&result_desc)
+                        .next()
+                        .unwrap()
+                        .inner_html()
+                        .trim()
+                        .to_string(),
+                    vec!["searx".to_string()],
+                )
+            })
+            .map(|search_result| (search_result.visiting_url.clone(), search_result))
+            .collect())
+    }
 }
```
src/results/aggregation_models.rs
CHANGED

```diff
@@ -3,7 +3,7 @@
 
 use serde::{Deserialize, Serialize};
 
-use crate::config::parser_models::Style;
+use crate::{config::parser_models::Style, engines::engine_models::EngineError};
 
 /// A named struct to store, serialize and deserializes the individual search result from all the
 /// scraped and aggregated search results from the upstream search engines.
@@ -16,7 +16,7 @@ use crate::config::parser_models::Style;
 /// * `url` - The url to be displayed below the search result title in html.
 /// * `description` - The description of the search result.
 /// * `engine` - The names of the upstream engines from which this results were provided.
-#[derive(
+#[derive(Serialize, Deserialize)]
 #[serde(rename_all = "camelCase")]
 pub struct SearchResult {
     pub title: String,
@@ -116,6 +116,25 @@ impl RawSearchResult {
     }
 }
 
+#[derive(Serialize, Deserialize)]
+pub struct EngineErrorInfo {
+    pub error: String,
+    pub engine: String,
+}
+
+impl EngineErrorInfo {
+    pub fn new(error: &EngineError, engine: String) -> Self {
+        Self {
+            error: match error {
+                EngineError::RequestError => String::from("RequestError"),
+                EngineError::EmptyResultSet => String::from("EmptyResultSet"),
+                EngineError::UnexpectedError => String::from("UnexpectedError"),
+            },
+            engine,
+        }
+    }
+}
+
 /// A named struct to store, serialize, deserialize the all the search results scraped and
 /// aggregated from the upstream search engines.
 ///
@@ -124,12 +143,18 @@ impl RawSearchResult {
 /// * `results` - Stores the individual serializable `SearchResult` struct into a vector of
 /// `SearchResult` structs.
 /// * `page_query` - Stores the current pages search query `q` provided in the search url.
+/// * `style` - Stores the theming options for the website.
+/// * `engine_errors_info` - Stores the information on which engines failed with their engine name
+/// and the type of error that caused it.
+/// * `empty_result_set` - Stores a boolean which indicates that no engines gave a result for the
+/// given search query.
 #[derive(Serialize, Deserialize)]
 #[serde(rename_all = "camelCase")]
 pub struct SearchResults {
     pub results: Vec<SearchResult>,
     pub page_query: String,
     pub style: Style,
+    pub engine_errors_info: Vec<EngineErrorInfo>,
 }
 
 impl SearchResults {
@@ -141,14 +166,22 @@ impl SearchResults {
     /// and stores it into a vector of `SearchResult` structs.
     /// * `page_query` - Takes an argument of current page`s search query `q` provided in
     /// the search url.
+    /// * `empty_result_set` - Takes a boolean which indicates that no engines gave a result for the
+    /// given search query.
+    pub fn new(
+        results: Vec<SearchResult>,
+        page_query: String,
+        engine_errors_info: Vec<EngineErrorInfo>,
+    ) -> Self {
         SearchResults {
             results,
             page_query,
             style: Style::new("".to_string(), "".to_string()),
+            engine_errors_info,
         }
     }
 
+    /// A setter function to add website style to the return search results.
     pub fn add_style(&mut self, style: Style) {
         self.style = style;
     }
```
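As a sketch, this is roughly how `EngineErrorInfo` turns an engine failure into the serializable form the UI consumes (assuming the crate's modules are in scope):

```rust
use crate::engines::engine_models::EngineError;
use crate::results::aggregation_models::EngineErrorInfo;

fn demo() -> serde_json::Result<String> {
    // A request failure from the duckduckgo engine becomes a plain string pair
    // that serializes as {"error":"RequestError","engine":"duckduckgo"}.
    let info = EngineErrorInfo::new(&EngineError::RequestError, "duckduckgo".to_string());
    serde_json::to_string(&info)
}
```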
src/results/aggregator.rs
CHANGED

```diff
@@ -3,22 +3,41 @@
 
 use std::{collections::HashMap, time::Duration};
 
+use error_stack::Report;
 use rand::Rng;
-use tokio::
+use tokio::task::JoinHandle;
 
 use super::{
-    aggregation_models::{RawSearchResult, SearchResult, SearchResults},
+    aggregation_models::{EngineErrorInfo, RawSearchResult, SearchResult, SearchResults},
     user_agent::random_user_agent,
 };
 
-use crate::engines::{duckduckgo, searx};
+use crate::engines::{
+    duckduckgo,
+    engine_models::{EngineError, SearchEngine},
+    searx,
+};
+
+/// Aliases for long type annotations
+type FutureVec = Vec<JoinHandle<Result<HashMap<String, RawSearchResult>, Report<EngineError>>>>;
 
+/// The function aggregates the scraped results from the user-selected upstream search engines.
+/// These engines can be chosen either from the user interface (UI) or from the configuration file.
+/// The code handles this process by matching the selected search engines and adding them to a vector.
+/// This vector is then used to create an asynchronous task vector using `tokio::spawn`, which returns
+/// a future. This future is awaited in another loop. Once the results are collected, they are filtered
+/// to remove any errors and ensure only proper results are included. If an error is encountered, it is
+/// sent to the UI along with the name of the engine and the type of error. This information is finally
+/// placed in the returned `SearchResults` struct.
+///
+/// Additionally, the function eliminates duplicate results. If two results are identified as coming from
+/// multiple engines, their names are combined to indicate that the results were fetched from these upstream
+/// engines. After this, all the data in the `HashMap` is removed and placed into a struct that contains all
+/// the aggregated results in a vector. Furthermore, the query used is also added to the struct. This step is
+/// necessary to ensure that the search bar in the search remains populated even when searched from the query URL.
+///
+/// Overall, this function serves to aggregate scraped results from user-selected search engines, handling errors,
+/// removing duplicates, and organizing the data for display in the UI.
 ///
 /// # Example:
 ///
@@ -30,6 +49,9 @@ use crate::engines::{duckduckgo, searx};
 /// * `query` - Accepts a string to query with the above upstream search engines.
 /// * `page` - Accepts an u32 page number.
 /// * `random_delay` - Accepts a boolean value to add a random delay before making the request.
+/// * `debug` - Accepts a boolean value to enable or disable debug mode option.
+/// * `upstream_search_engines` - Accepts a vector of search engine names which was selected by the
+/// user through the UI or the config file.
 ///
 /// # Error
 ///
@@ -37,10 +59,11 @@ use crate::engines::{duckduckgo, searx};
 /// function in either `searx` or `duckduckgo` or both otherwise returns a `SearchResults struct`
 /// containing appropriate values.
 pub async fn aggregate(
-    query:
+    query: String,
     page: u32,
     random_delay: bool,
     debug: bool,
+    upstream_search_engines: Vec<String>,
 ) -> Result<SearchResults, Box<dyn std::error::Error>> {
     let user_agent: String = random_user_agent();
     let mut result_map: HashMap<String, RawSearchResult> = HashMap::new();
@@ -53,41 +76,106 @@ pub async fn aggregate(
     }
 
     // fetch results from upstream search engines simultaneously/concurrently.
+    let search_engines: Vec<Box<dyn SearchEngine + Send + Sync>> = upstream_search_engines
+        .iter()
+        .map(|engine| match engine.to_lowercase().as_str() {
+            "duckduckgo" => Box::new(duckduckgo::DuckDuckGo) as Box<dyn SearchEngine + Send + Sync>,
+            "searx" => Box::new(searx::Searx) as Box<dyn SearchEngine + Send + Sync>,
+            &_ => panic!("Config Error: Incorrect config file option provided"),
+        })
+        .collect();
 
+    let task_capacity: usize = search_engines.len();
+
+    let tasks: FutureVec = search_engines
+        .into_iter()
+        .map(|search_engine| {
+            let query: String = query.clone();
+            let user_agent: String = user_agent.clone();
+            tokio::spawn(
+                async move { search_engine.results(query, page, user_agent.clone()).await },
+            )
+        })
+        .collect();
 
+    let mut outputs = Vec::with_capacity(task_capacity);
+
+    for task in tasks {
+        if let Ok(result) = task.await {
+            outputs.push(result)
+        }
+    }
 
+    let mut engine_errors_info: Vec<EngineErrorInfo> = Vec::new();
+
+    // The code block `outputs.iter()` determines whether it is the first time the code is being run.
+    // It does this by checking the initial flag. If it is the first time, the code selects the first
+    // engine from which results are fetched and adds or extends them into the `result_map`. If the
+    // initially selected engine fails, the code automatically selects another engine to map or extend
+    // into the `result_map`. On the other hand, if an engine selected for the first time successfully
+    // fetches results and maps them into the `result_map`, the initial flag is set to false. Subsequently,
+    // the code iterates through the remaining engines one by one. It compares the fetched results from each
+    // engine with the results already present in the `result_map` to identify any duplicates. If duplicate
+    // results are found, the code groups them together with the name of the engine from which they were
+    // fetched, and automatically removes the duplicate results from the newly fetched data.
+    //
+    // Additionally, the code handles errors returned by the engines. It keeps track of which engines
+    // encountered errors and stores this information in a vector of structures called `EngineErrorInfo`.
+    // Each structure in this vector contains the name of the engine and the type of error it returned.
+    // These structures will later be added to the final `SearchResults` structure. The `SearchResults`
+    // structure is used to display an error box in the UI containing the relevant information from
+    // the `EngineErrorInfo` structure.
+    //
+    // In summary, this code block manages the selection of engines, handling of duplicate results, and tracking
+    // of errors in order to populate the `result_map` and provide informative feedback to the user through the
+    // `SearchResults` structure.
+    let mut initial: bool = true;
+    let mut counter: usize = 0;
+    outputs.iter().for_each(|results| {
+        if initial {
+            match results {
+                Ok(result) => {
+                    result_map.extend(result.clone());
+                    counter += 1;
+                    initial = false
+                }
+                Err(error_type) => {
+                    engine_errors_info.push(EngineErrorInfo::new(
+                        error_type.downcast_ref::<EngineError>().unwrap(),
+                        upstream_search_engines[counter].clone(),
+                    ));
+                    counter += 1
+                }
+            }
+        } else {
+            match results {
+                Ok(result) => {
+                    result.clone().into_iter().for_each(|(key, value)| {
+                        result_map
+                            .entry(key)
+                            .and_modify(|result| {
+                                result.add_engines(value.clone().engine());
+                            })
+                            .or_insert_with(|| -> RawSearchResult {
+                                RawSearchResult::new(
+                                    value.title.clone(),
+                                    value.visiting_url.clone(),
+                                    value.description.clone(),
+                                    value.engine.clone(),
+                                )
+                            });
+                    });
+                    counter += 1
+                }
+                Err(error_type) => {
+                    engine_errors_info.push(EngineErrorInfo::new(
+                        error_type.downcast_ref::<EngineError>().unwrap(),
+                        upstream_search_engines[counter].clone(),
+                    ));
+                    counter += 1
+                }
+            }
+        }
     });
 
     Ok(SearchResults::new(
@@ -104,5 +192,6 @@ pub async fn aggregate(
         })
         .collect(),
         query.to_string(),
+        engine_errors_info,
     ))
 }
```
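The `entry(...).and_modify(...).or_insert_with(...)` pattern above is what merges duplicate URLs across engines. A minimal standalone sketch of the same idea, tracking only engine names per URL:

```rust
use std::collections::HashMap;

fn main() {
    // The first engine (duckduckgo) already produced a result for this URL.
    let mut engines_by_url: HashMap<String, Vec<String>> = HashMap::from([(
        "https://www.rust-lang.org/".to_string(),
        vec!["duckduckgo".to_string()],
    )]);

    // A later engine (searx) returns the same URL: instead of inserting a
    // duplicate entry, its name is appended to the existing engine list.
    engines_by_url
        .entry("https://www.rust-lang.org/".to_string())
        .and_modify(|engines| engines.push("searx".to_string()))
        .or_insert_with(|| vec!["searx".to_string()]);

    assert_eq!(
        engines_by_url["https://www.rust-lang.org/"],
        vec!["duckduckgo".to_string(), "searx".to_string()]
    );
}
```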
src/server/routes.rs
CHANGED

```diff
@@ -22,7 +22,7 @@ use serde::Deserialize;
 /// of the search url.
 /// * `page` - It stores the search parameter `page` (or pageno in simple words)
 /// of the search url.
-#[derive(
+#[derive(Deserialize)]
 struct SearchParams {
     q: Option<String>,
     page: Option<u32>,
@@ -51,6 +51,21 @@ pub async fn not_found(
         .body(page_content))
 }
 
+/// A named struct which is used to deserialize the cookies fetched from the client side.
+///
+/// # Fields
+///
+/// * `theme` - It stores the theme name used in the website.
+/// * `colorscheme` - It stores the colorscheme name used for the website theme.
+/// * `engines` - It stores the user selected upstream search engines selected from the UI.
+#[allow(dead_code)]
+#[derive(Deserialize)]
+struct Cookie {
+    theme: String,
+    colorscheme: String,
+    engines: Vec<String>,
+}
+
 /// Handles the route of search page of the `websurfx` meta search engine website and it takes
 /// two search url parameters `q` and `page` where `page` parameter is optional.
 ///
@@ -72,7 +87,6 @@ pub async fn search(
     config: web::Data<Config>,
 ) -> Result<HttpResponse, Box<dyn std::error::Error>> {
     let params = web::Query::<SearchParams>::from_query(req.query_string())?;
-
     match &params.q {
         Some(query) => {
             if query.trim().is_empty() {
@@ -89,7 +103,7 @@ pub async fn search(
                 "http://{}:{}/search?q={}&page={}",
                 config.binding_ip, config.port, query, page
             );
-            let results_json = results(url, &config, query, page).await?;
+            let results_json = results(url, &config, query.to_string(), page, req).await?;
             let page_content: String = hbs.render("search", &results_json)?;
             Ok(HttpResponse::Ok().body(page_content))
         }
@@ -104,23 +118,51 @@ pub async fn search(
 async fn results(
     url: String,
     config: &Config,
-    query:
+    query: String,
     page: u32,
+    req: HttpRequest,
 ) -> Result<SearchResults, Box<dyn std::error::Error>> {
     //Initialize redis cache connection struct
     let mut redis_cache = RedisCache::new(config.redis_url.clone())?;
     // fetch the cached results json.
     let cached_results_json = redis_cache.cached_json(&url);
-    // check if fetched results was indeed fetched or it was an error and if so
+    // check if fetched cache results was indeed fetched or it was an error and if so
     // handle the data accordingly.
     match cached_results_json {
-        Ok(
+        Ok(results) => Ok(serde_json::from_str::<SearchResults>(&results).unwrap()),
         Err(_) => {
+            // check if the cookie value is empty or not if it is empty then use the
+            // default selected upstream search engines from the config file otherwise
+            // parse the non-empty cookie and grab the user selected engines from the
+            // UI and use that.
+            let mut results: crate::results::aggregation_models::SearchResults = match req
+                .cookie("appCookie")
+            {
+                Some(cookie_value) => {
+                    let cookie_value: Cookie = serde_json::from_str(cookie_value.name_value().1)?;
+                    aggregate(
+                        query,
+                        page,
+                        config.aggregator.random_delay,
+                        config.debug,
+                        cookie_value.engines,
+                    )
+                    .await?
+                }
+                None => {
+                    aggregate(
+                        query,
+                        page,
+                        config.aggregator.random_delay,
+                        config.debug,
+                        config.upstream_search_engines.clone(),
+                    )
+                    .await?
+                }
+            };
+            results.add_style(config.style.clone());
+            redis_cache.cache_results(serde_json::to_string(&results)?, &url)?;
+            Ok(results)
         }
     }
 }
```
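The `appCookie` value is expected to be JSON that deserializes into the `Cookie` struct above. A sketch of a value that would parse (the theme and colorscheme names are placeholders; the engine names must match the keys used in config.lua):

```rust
use serde::Deserialize;

// Mirrors the private Cookie struct from routes.rs for a standalone check.
#[derive(Deserialize)]
struct Cookie {
    theme: String,
    colorscheme: String,
    engines: Vec<String>,
}

fn main() -> Result<(), serde_json::Error> {
    let raw = r#"{"theme":"simple","colorscheme":"catppuccin-mocha","engines":["DuckDuckGo","Searx"]}"#;
    let cookie: Cookie = serde_json::from_str(raw)?;
    // `engines` feeds straight into aggregate() as upstream_search_engines.
    assert_eq!(cookie.engines, vec!["DuckDuckGo", "Searx"]);
    Ok(())
}
```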
websurfx/config.lua
CHANGED

```diff
@@ -5,7 +5,7 @@ debug = false -- an option to enable or disable debug mode.
 -- ### Server ###
 port = "8080" -- port on which server should be launched
 binding_ip = "127.0.0.1" --ip address on the which server should be launched.
-production_use = false -- whether to use production mode or not (in other words this option should be used if it is to be used to host it on the server to provide a service to a large number of users)
+production_use = false -- whether to use production mode or not (in other words this option should be used if it is to be used to host it on the server to provide a service to a large number of users (more than one))
 -- if production_use is set to true
 -- There will be a random delay before sending the request to the search engines, this is to prevent DDoSing the upstream search engines from a large number of simultaneous requests.
 
@@ -26,3 +26,6 @@ theme = "simple" -- the theme name which should be used for the website
 
 -- ### Caching ###
 redis_url = "redis://127.0.0.1:8082" -- redis connection url address on which the client should connect on.
+
+-- ### Search Engines ###
+upstream_search_engines = { DuckDuckGo = true, Searx = false } -- select the upstream search engines from which the results should be fetched.
```
|