neon_arch
committed on
Commit
•
c170de8
1
Parent(s):
f8c3c8d
add code to evade ip blocking, improve pagination code and fix documentation
Browse files- Cargo.lock +39 -0
- Cargo.toml +5 -2
- src/cache/cacher.rs +78 -0
- src/cache/mod.rs +1 -0
- src/config_parser/parser.rs +5 -0
- src/config_parser/parser_models.rs +11 -8
- src/engines/duckduckgo.rs +27 -17
- src/engines/searx.rs +18 -16
- src/lib.rs +1 -0
- src/search_results_handler/aggregation_models.rs +7 -7
- src/search_results_handler/aggregator.rs +2 -2
- src/server/routes.rs +65 -9
- tests/index.rs +2 -0
- websurfx/config.lua +3 -0
Cargo.lock
CHANGED
@@ -447,6 +447,16 @@ dependencies = [
|
|
447 |
"bitflags",
|
448 |
]
|
449 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
450 |
[[package]]
|
451 |
name = "convert_case"
|
452 |
version = "0.4.0"
|
@@ -1427,6 +1437,12 @@ version = "2.0.0"
|
|
1427 |
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1428 |
checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00"
|
1429 |
|
|
|
|
|
|
|
|
|
|
|
|
|
1430 |
[[package]]
|
1431 |
name = "memchr"
|
1432 |
version = "2.5.0"
|
@@ -2157,6 +2173,20 @@ dependencies = [
|
|
2157 |
"rand_core 0.3.1",
|
2158 |
]
|
2159 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2160 |
[[package]]
|
2161 |
name = "redox_syscall"
|
2162 |
version = "0.1.57"
|
@@ -2526,6 +2556,12 @@ dependencies = [
|
|
2526 |
"digest",
|
2527 |
]
|
2528 |
|
|
|
|
|
|
|
|
|
|
|
|
|
2529 |
[[package]]
|
2530 |
name = "sha2"
|
2531 |
version = "0.10.6"
|
@@ -3291,6 +3327,9 @@ dependencies = [
|
|
3291 |
"fake-useragent",
|
3292 |
"handlebars",
|
3293 |
"log",
|
|
|
|
|
|
|
3294 |
"reqwest 0.11.17",
|
3295 |
"rlua",
|
3296 |
"scraper",
|
|
|
447 |
"bitflags",
|
448 |
]
|
449 |
|
450 |
+
[[package]]
|
451 |
+
name = "combine"
|
452 |
+
version = "4.6.6"
|
453 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
454 |
+
checksum = "35ed6e9d84f0b51a7f52daf1c7d71dd136fd7a3f41a8462b8cdb8c78d920fad4"
|
455 |
+
dependencies = [
|
456 |
+
"bytes 1.4.0",
|
457 |
+
"memchr",
|
458 |
+
]
|
459 |
+
|
460 |
[[package]]
|
461 |
name = "convert_case"
|
462 |
version = "0.4.0"
|
|
|
1437 |
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1438 |
checksum = "60302e4db3a61da70c0cb7991976248362f30319e88850c487b9b95bbf059e00"
|
1439 |
|
1440 |
+
[[package]]
|
1441 |
+
name = "md5"
|
1442 |
+
version = "0.7.0"
|
1443 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
1444 |
+
checksum = "490cc448043f947bae3cbee9c203358d62dbee0db12107a74be5c30ccfd09771"
|
1445 |
+
|
1446 |
[[package]]
|
1447 |
name = "memchr"
|
1448 |
version = "2.5.0"
|
|
|
2173 |
"rand_core 0.3.1",
|
2174 |
]
|
2175 |
|
2176 |
+
[[package]]
|
2177 |
+
name = "redis"
|
2178 |
+
version = "0.23.0"
|
2179 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
2180 |
+
checksum = "3ea8c51b5dc1d8e5fd3350ec8167f464ec0995e79f2e90a075b63371500d557f"
|
2181 |
+
dependencies = [
|
2182 |
+
"combine",
|
2183 |
+
"itoa 1.0.6",
|
2184 |
+
"percent-encoding 2.2.0",
|
2185 |
+
"ryu",
|
2186 |
+
"sha1_smol",
|
2187 |
+
"url 2.3.1",
|
2188 |
+
]
|
2189 |
+
|
2190 |
[[package]]
|
2191 |
name = "redox_syscall"
|
2192 |
version = "0.1.57"
|
|
|
2556 |
"digest",
|
2557 |
]
|
2558 |
|
2559 |
+
[[package]]
|
2560 |
+
name = "sha1_smol"
|
2561 |
+
version = "1.0.0"
|
2562 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
2563 |
+
checksum = "ae1a47186c03a32177042e55dbc5fd5aee900b8e0069a8d70fba96a9375cd012"
|
2564 |
+
|
2565 |
[[package]]
|
2566 |
name = "sha2"
|
2567 |
version = "0.10.6"
|
|
|
3327 |
"fake-useragent",
|
3328 |
"handlebars",
|
3329 |
"log",
|
3330 |
+
"md5",
|
3331 |
+
"rand 0.6.5",
|
3332 |
+
"redis",
|
3333 |
"reqwest 0.11.17",
|
3334 |
"rlua",
|
3335 |
"scraper",
|
Cargo.toml
CHANGED
@@ -15,6 +15,9 @@ actix-web = {version="4.3.1"}
|
|
15 |
actix-files = {version="0.6.2"}
|
16 |
serde_json = {version="*"}
|
17 |
fake-useragent = {version="*"}
|
18 |
-
env_logger = "0.10.0"
|
19 |
-
log = "0.4.17"
|
20 |
rlua = {version="*"}
|
|
|
|
|
|
|
|
15 |
actix-files = {version="0.6.2"}
|
16 |
serde_json = {version="*"}
|
17 |
fake-useragent = {version="*"}
|
18 |
+
env_logger = {version="0.10.0"}
|
19 |
+
log = {version="0.4.17"}
|
20 |
rlua = {version="*"}
|
21 |
+
redis = {version="*"}
|
22 |
+
md5 = {version="*"}
|
23 |
+
rand={version="*"}
|
src/cache/cacher.rs
ADDED
@@ -0,0 +1,78 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//! This module provides the functionality to cache the aggregated results fetched and aggregated
|
2 |
+
//! from the upstream search engines in a json format.
|
3 |
+
|
4 |
+
use md5::compute;
|
5 |
+
use redis::{Client, Commands, Connection};
|
6 |
+
|
7 |
+
/// A named struct which stores the redis Connection url address to which the client will
|
8 |
+
/// connect to.
|
9 |
+
///
|
10 |
+
/// # Fields
|
11 |
+
///
|
12 |
+
/// * `redis_connection_url` - It stores the redis Connection url address.
|
13 |
+
#[derive(Clone)]
|
14 |
+
pub struct RedisCache {
|
15 |
+
redis_connection_url: String,
|
16 |
+
}
|
17 |
+
|
18 |
+
impl RedisCache {
|
19 |
+
/// Constructs a new `SearchResult` with the given arguments needed for the struct.
|
20 |
+
///
|
21 |
+
/// # Arguments
|
22 |
+
///
|
23 |
+
/// * `redis_connection_url` - It stores the redis Connection url address.
|
24 |
+
pub fn new(redis_connection_url: String) -> Self {
|
25 |
+
RedisCache {
|
26 |
+
redis_connection_url,
|
27 |
+
}
|
28 |
+
}
|
29 |
+
|
30 |
+
/// A helper function which computes the hash of the url and formats and returns it as string.
|
31 |
+
///
|
32 |
+
/// # Arguments
|
33 |
+
///
|
34 |
+
/// * `url` - It takes an url as string.
|
35 |
+
fn compute_url_hash(self, url: &str) -> String {
|
36 |
+
format!("{:?}", compute(url))
|
37 |
+
}
|
38 |
+
|
39 |
+
/// A function which fetches the cached json results as json string from the redis server.
|
40 |
+
///
|
41 |
+
/// # Arguments
|
42 |
+
///
|
43 |
+
/// * `url` - It takes an url as a string.
|
44 |
+
pub fn cached_results_json(self, url: String) -> Result<String, Box<dyn std::error::Error>> {
|
45 |
+
let hashed_url_string = self.clone().compute_url_hash(&url);
|
46 |
+
let mut redis_connection: Connection =
|
47 |
+
Client::open(self.redis_connection_url)?.get_connection()?;
|
48 |
+
Ok(redis_connection.get(hashed_url_string)?)
|
49 |
+
}
|
50 |
+
|
51 |
+
/// A function which caches the results by using the hashed `url` as the key and
|
52 |
+
/// `json results` as the value and stores it in redis server with ttl(time to live)
|
53 |
+
/// set to 60 seconds.
|
54 |
+
///
|
55 |
+
/// # Arguments
|
56 |
+
///
|
57 |
+
/// * `json_results` - It takes the json results string as an argument.
|
58 |
+
/// * `url` - It takes the url as a String.
|
59 |
+
pub fn cache_results(
|
60 |
+
self,
|
61 |
+
json_results: String,
|
62 |
+
url: String,
|
63 |
+
) -> Result<(), Box<dyn std::error::Error>> {
|
64 |
+
let hashed_url_string = self.clone().compute_url_hash(&url);
|
65 |
+
let mut redis_connection: Connection =
|
66 |
+
Client::open(self.redis_connection_url)?.get_connection()?;
|
67 |
+
|
68 |
+
// put results_json into cache
|
69 |
+
redis_connection.set(hashed_url_string.clone(), json_results)?;
|
70 |
+
|
71 |
+
// Set the TTL for the key to 60 seconds
|
72 |
+
redis_connection
|
73 |
+
.expire::<String, u32>(hashed_url_string.clone(), 60)
|
74 |
+
.unwrap();
|
75 |
+
|
76 |
+
Ok(())
|
77 |
+
}
|
78 |
+
}
|
src/cache/mod.rs
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
pub mod cacher;
|
src/config_parser/parser.rs
CHANGED
@@ -11,11 +11,15 @@ use std::fs;
|
|
11 |
//
|
12 |
/// * `port` - It stores the parsed port number option on which the server should launch.
|
13 |
/// * `binding_ip_addr` - It stores the parsed ip address option on which the server should launch
|
|
|
|
|
|
|
14 |
#[derive(Clone)]
|
15 |
pub struct Config {
|
16 |
pub port: u16,
|
17 |
pub binding_ip_addr: String,
|
18 |
pub style: Style,
|
|
|
19 |
}
|
20 |
|
21 |
impl Config {
|
@@ -44,6 +48,7 @@ impl Config {
|
|
44 |
globals.get::<_, String>("theme")?,
|
45 |
globals.get::<_, String>("colorscheme")?,
|
46 |
),
|
|
|
47 |
})
|
48 |
})
|
49 |
}
|
|
|
11 |
//
|
12 |
/// * `port` - It stores the parsed port number option on which the server should launch.
|
13 |
/// * `binding_ip_addr` - It stores the parsed ip address option on which the server should launch
|
14 |
+
/// * `style` - It stores the theming options for the website.
|
15 |
+
/// * `redis_connection_url` - It stores the redis connection url address on which the redis
|
16 |
+
/// client should connect.
|
17 |
#[derive(Clone)]
|
18 |
pub struct Config {
|
19 |
pub port: u16,
|
20 |
pub binding_ip_addr: String,
|
21 |
pub style: Style,
|
22 |
+
pub redis_connection_url: String,
|
23 |
}
|
24 |
|
25 |
impl Config {
|
|
|
48 |
globals.get::<_, String>("theme")?,
|
49 |
globals.get::<_, String>("colorscheme")?,
|
50 |
),
|
51 |
+
redis_connection_url: globals.get::<_, String>("redis_connection_url")?,
|
52 |
})
|
53 |
})
|
54 |
}
|
src/config_parser/parser_models.rs
CHANGED
@@ -1,21 +1,24 @@
|
|
1 |
//! This module provides public models for handling, storing and serializing parsed config file
|
2 |
//! options from config.lua by grouping them togather.
|
3 |
|
4 |
-
use serde::Serialize;
|
5 |
|
6 |
-
/// A named struct which stores, serializes and groups the parsed config file options
|
7 |
-
/// colorscheme names into the Style struct which derives the `Clone
|
8 |
-
/// where the `Clone` trait is derived for allowing the struct to be
|
9 |
-
/// server as a shared data between all routes except `/robots.txt` and
|
10 |
-
/// has been derived for allowing the object to be serialized so that it
|
11 |
-
/// handlebars template files
|
|
|
|
|
|
|
12 |
///
|
13 |
/// # Fields
|
14 |
//
|
15 |
/// * `theme` - It stores the parsed theme option used to set a theme for the website.
|
16 |
/// * `colorscheme` - It stores the parsed colorscheme option used to set a colorscheme for the
|
17 |
/// theme being used.
|
18 |
-
#[derive(Serialize, Clone)]
|
19 |
pub struct Style {
|
20 |
pub theme: String,
|
21 |
pub colorscheme: String,
|
|
|
1 |
//! This module provides public models for handling, storing and serializing parsed config file
|
2 |
//! options from config.lua by grouping them together.
|
3 |
|
4 |
+
use serde::{Deserialize, Serialize};
|
5 |
|
6 |
+
/// A named struct which stores, deserializes, serializes and groups the parsed config file options
|
7 |
+
/// of theme and colorscheme names into the Style struct which derives the `Clone`, `Serialize`
|
8 |
+
/// and Deserialize traits where the `Clone` trait is derived for allowing the struct to be
|
9 |
+
/// cloned and passed to the server as a shared data between all routes except `/robots.txt` and
|
10 |
+
/// the `Serialize` trait has been derived for allowing the object to be serialized so that it
|
11 |
+
/// can be passed to handlebars template files and the `Deserialize` trait has been derived in
|
12 |
+
/// order to allow deserializing the json back to a struct in the aggregate function in
|
13 |
+
/// aggregator.rs and create a new struct out of it and then serialize it back to json and pass
|
14 |
+
/// it to the template files.
|
15 |
///
|
16 |
/// # Fields
|
17 |
//
|
18 |
/// * `theme` - It stores the parsed theme option used to set a theme for the website.
|
19 |
/// * `colorscheme` - It stores the parsed colorscheme option used to set a colorscheme for the
|
20 |
/// theme being used.
|
21 |
+
#[derive(Serialize, Deserialize, Clone)]
|
22 |
pub struct Style {
|
23 |
pub theme: String,
|
24 |
pub colorscheme: String,
|
src/engines/duckduckgo.rs
CHANGED
@@ -2,9 +2,10 @@
|
|
2 |
//! by querying the upstream duckduckgo search engine with user provided query and with a page
|
3 |
//! number if provided.
|
4 |
|
5 |
-
use std::collections::HashMap;
|
6 |
|
7 |
-
use
|
|
|
8 |
use scraper::{Html, Selector};
|
9 |
|
10 |
use crate::search_results_handler::aggregation_models::RawSearchResult;
|
@@ -17,7 +18,7 @@ use crate::search_results_handler::aggregation_models::RawSearchResult;
|
|
17 |
/// # Arguments
|
18 |
///
|
19 |
/// * `query` - Takes the user provided query to query to the upstream search engine with.
|
20 |
-
/// * `page` - Takes an
|
21 |
/// * `user_agent` - Takes a random user agent string as an argument.
|
22 |
///
|
23 |
/// # Errors
|
@@ -27,32 +28,41 @@ use crate::search_results_handler::aggregation_models::RawSearchResult;
|
|
27 |
/// selector fails to initialize"
|
28 |
pub async fn results(
|
29 |
query: &str,
|
30 |
-
page:
|
31 |
user_agent: &str,
|
32 |
) -> Result<HashMap<String, RawSearchResult>, Box<dyn std::error::Error>> {
|
33 |
// Page number can be missing or empty string and so appropriate handling is required
|
34 |
// so that upstream server recieves valid page number.
|
35 |
let url: String = match page {
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
45 |
-
|
46 |
-
}
|
47 |
}
|
48 |
-
None => format!("https://html.duckduckgo.com/html/?q={query}&s=&dc=&v=1&o=json&api=/d.js"),
|
49 |
};
|
50 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
51 |
// fetch the html from upstream duckduckgo engine
|
52 |
// TODO: Write better error handling code to handle no results case.
|
53 |
let results: String = reqwest::Client::new()
|
54 |
.get(url)
|
55 |
-
.
|
56 |
.send()
|
57 |
.await?
|
58 |
.text()
|
|
|
2 |
//! by querying the upstream duckduckgo search engine with user provided query and with a page
|
3 |
//! number if provided.
|
4 |
|
5 |
+
use std::{collections::HashMap, time::Duration};
|
6 |
|
7 |
+
use rand::Rng;
|
8 |
+
use reqwest::header::{HeaderMap, CONTENT_TYPE, REFERER, USER_AGENT};
|
9 |
use scraper::{Html, Selector};
|
10 |
|
11 |
use crate::search_results_handler::aggregation_models::RawSearchResult;
|
|
|
18 |
/// # Arguments
|
19 |
///
|
20 |
/// * `query` - Takes the user provided query to query to the upstream search engine with.
|
21 |
+
/// * `page` - Takes a u32 as an argument.
|
22 |
/// * `user_agent` - Takes a random user agent string as an argument.
|
23 |
///
|
24 |
/// # Errors
|
|
|
28 |
/// selector fails to initialize"
|
29 |
pub async fn results(
|
30 |
query: &str,
|
31 |
+
page: u32,
|
32 |
user_agent: &str,
|
33 |
) -> Result<HashMap<String, RawSearchResult>, Box<dyn std::error::Error>> {
|
34 |
// Page number can be missing or empty string and so appropriate handling is required
|
35 |
// so that upstream server receives valid page number.
|
36 |
let url: String = match page {
|
37 |
+
1 => {
|
38 |
+
format!("https://html.duckduckgo.com/html/?q={query}&s=&dc=&v=1&o=json&api=/d.js")
|
39 |
+
}
|
40 |
+
_ => {
|
41 |
+
format!(
|
42 |
+
"https://duckduckgo.com/html/?q={}&s={}&dc={}&v=1&o=json&api=/d.js",
|
43 |
+
query,
|
44 |
+
(page / 2 + (page % 2)) * 30,
|
45 |
+
(page / 2 + (page % 2)) * 30 + 1
|
46 |
+
)
|
|
|
47 |
}
|
|
|
48 |
};
|
49 |
|
50 |
+
// Add a random delay before making the request.
|
51 |
+
let mut rng = rand::thread_rng();
|
52 |
+
let delay_secs = rng.gen_range(1, 10);
|
53 |
+
std::thread::sleep(Duration::from_secs(delay_secs));
|
54 |
+
|
55 |
+
// initializing HeaderMap and adding appropriate headers.
|
56 |
+
let mut header_map = HeaderMap::new();
|
57 |
+
header_map.insert(USER_AGENT, user_agent.parse()?);
|
58 |
+
header_map.insert(REFERER, "https://google.com/".parse()?);
|
59 |
+
header_map.insert(CONTENT_TYPE, "text/html; charset=UTF-8".parse()?);
|
60 |
+
|
61 |
// fetch the html from upstream duckduckgo engine
|
62 |
// TODO: Write better error handling code to handle no results case.
|
63 |
let results: String = reqwest::Client::new()
|
64 |
.get(url)
|
65 |
+
.headers(header_map) // add spoofed headers to emulate human behaviour
|
66 |
.send()
|
67 |
.await?
|
68 |
.text()
|
src/engines/searx.rs
CHANGED
@@ -2,10 +2,10 @@
|
|
2 |
//! by querying the upstream searx search engine instance with user provided query and with a page
|
3 |
//! number if provided.
|
4 |
|
5 |
-
use
|
6 |
-
|
7 |
-
use reqwest::header::USER_AGENT;
|
8 |
use scraper::{Html, Selector};
|
|
|
9 |
|
10 |
use crate::search_results_handler::aggregation_models::RawSearchResult;
|
11 |
|
@@ -17,7 +17,7 @@ use crate::search_results_handler::aggregation_models::RawSearchResult;
|
|
17 |
/// # Arguments
|
18 |
///
|
19 |
/// * `query` - Takes the user provided query to query to the upstream search engine with.
|
20 |
-
/// * `page` - Takes an
|
21 |
/// * `user_agent` - Takes a random user agent string as an argument.
|
22 |
///
|
23 |
/// # Errors
|
@@ -27,27 +27,29 @@ use crate::search_results_handler::aggregation_models::RawSearchResult;
|
|
27 |
/// selector fails to initialize"
|
28 |
pub async fn results(
|
29 |
query: &str,
|
30 |
-
page:
|
31 |
user_agent: &str,
|
32 |
) -> Result<HashMap<String, RawSearchResult>, Box<dyn std::error::Error>> {
|
33 |
// Page number can be missing or empty string and so appropriate handling is required
|
34 |
// so that upstream server recieves valid page number.
|
35 |
-
let url: String =
|
36 |
-
|
37 |
-
|
38 |
-
|
39 |
-
|
40 |
-
|
41 |
-
|
42 |
-
|
43 |
-
|
44 |
-
|
|
|
|
|
45 |
|
46 |
// fetch the html from upstream searx instance engine
|
47 |
// TODO: Write better error handling code to handle no results case.
|
48 |
let results: String = reqwest::Client::new()
|
49 |
.get(url)
|
50 |
-
.
|
51 |
.send()
|
52 |
.await?
|
53 |
.text()
|
|
|
2 |
//! by querying the upstream searx search engine instance with user provided query and with a page
|
3 |
//! number if provided.
|
4 |
|
5 |
+
use rand::Rng;
|
6 |
+
use reqwest::header::{HeaderMap, CONTENT_TYPE, REFERER, USER_AGENT};
|
|
|
7 |
use scraper::{Html, Selector};
|
8 |
+
use std::{collections::HashMap, time::Duration};
|
9 |
|
10 |
use crate::search_results_handler::aggregation_models::RawSearchResult;
|
11 |
|
|
|
17 |
/// # Arguments
|
18 |
///
|
19 |
/// * `query` - Takes the user provided query to query to the upstream search engine with.
|
20 |
+
/// * `page` - Takes a u32 as an argument.
|
21 |
/// * `user_agent` - Takes a random user agent string as an argument.
|
22 |
///
|
23 |
/// # Errors
|
|
|
27 |
/// selector fails to initialize"
|
28 |
pub async fn results(
|
29 |
query: &str,
|
30 |
+
page: u32,
|
31 |
user_agent: &str,
|
32 |
) -> Result<HashMap<String, RawSearchResult>, Box<dyn std::error::Error>> {
|
33 |
// Page number can be missing or empty string and so appropriate handling is required
|
34 |
// so that upstream server receives valid page number.
|
35 |
+
let url: String = format!("https://searx.work/search?q={query}&pageno={page}");
|
36 |
+
|
37 |
+
// Add random delay before making the request.
|
38 |
+
let mut rng = rand::thread_rng();
|
39 |
+
let delay_secs = rng.gen_range(1, 10);
|
40 |
+
std::thread::sleep(Duration::from_secs(delay_secs));
|
41 |
+
|
42 |
+
// initializing headers and adding appropriate headers.
|
43 |
+
let mut header_map = HeaderMap::new();
|
44 |
+
header_map.insert(USER_AGENT, user_agent.parse()?);
|
45 |
+
header_map.insert(REFERER, "https://google.com/".parse()?);
|
46 |
+
header_map.insert(CONTENT_TYPE, "application/x-www-form-urlencoded".parse()?);
|
47 |
|
48 |
// fetch the html from upstream searx instance engine
|
49 |
// TODO: Write better error handling code to handle no results case.
|
50 |
let results: String = reqwest::Client::new()
|
51 |
.get(url)
|
52 |
+
.headers(header_map) // add spoofed headers to emulate human behaviours.
|
53 |
.send()
|
54 |
.await?
|
55 |
.text()
|
src/lib.rs
CHANGED
@@ -1,6 +1,7 @@
|
|
1 |
//! This main library module provides the functionality to provide and handle the Tcp server
|
2 |
//! and register all the routes for the `websurfx` meta search engine website.
|
3 |
|
|
|
4 |
pub mod config_parser;
|
5 |
pub mod engines;
|
6 |
pub mod search_results_handler;
|
|
|
1 |
//! This main library module provides the functionality to provide and handle the Tcp server
|
2 |
//! and register all the routes for the `websurfx` meta search engine website.
|
3 |
|
4 |
+
pub mod cache;
|
5 |
pub mod config_parser;
|
6 |
pub mod engines;
|
7 |
pub mod search_results_handler;
|
src/search_results_handler/aggregation_models.rs
CHANGED
@@ -1,12 +1,12 @@
|
|
1 |
//! This module provides public models for handling, storing and serializing of search results
|
2 |
//! data scraped from the upstream search engines.
|
3 |
|
4 |
-
use serde::Serialize;
|
5 |
|
6 |
use crate::config_parser::parser_models::Style;
|
7 |
|
8 |
-
/// A named struct to store and
|
9 |
-
/// and aggregated search results from the upstream search engines.
|
10 |
///
|
11 |
/// # Fields
|
12 |
///
|
@@ -16,7 +16,7 @@ use crate::config_parser::parser_models::Style;
|
|
16 |
/// * `url` - The url to be displayed below the search result title in html.
|
17 |
/// * `description` - The description of the search result.
|
18 |
/// * `engine` - The names of the upstream engines from which this results were provided.
|
19 |
-
#[derive(Debug, Serialize)]
|
20 |
#[serde(rename_all = "camelCase")]
|
21 |
pub struct SearchResult {
|
22 |
pub title: String,
|
@@ -116,15 +116,15 @@ impl RawSearchResult {
|
|
116 |
}
|
117 |
}
|
118 |
|
119 |
-
/// A named struct to store
|
120 |
-
/// from the upstream search engines.
|
121 |
///
|
122 |
/// # Fields
|
123 |
///
|
124 |
/// * `results` - Stores the individual serializable `SearchResult` struct into a vector of
|
125 |
/// `SearchResult` structs.
|
126 |
/// * `page_query` - Stores the current pages search query `q` provided in the search url.
|
127 |
-
#[derive(Serialize)]
|
128 |
#[serde(rename_all = "camelCase")]
|
129 |
pub struct SearchResults {
|
130 |
pub results: Vec<SearchResult>,
|
|
|
1 |
//! This module provides public models for handling, storing and serializing of search results
|
2 |
//! data scraped from the upstream search engines.
|
3 |
|
4 |
+
use serde::{Deserialize, Serialize};
|
5 |
|
6 |
use crate::config_parser::parser_models::Style;
|
7 |
|
8 |
+
/// A named struct to store, serialize and deserialize the individual search result from all the
|
9 |
+
/// scraped and aggregated search results from the upstream search engines.
|
10 |
///
|
11 |
/// # Fields
|
12 |
///
|
|
|
16 |
/// * `url` - The url to be displayed below the search result title in html.
|
17 |
/// * `description` - The description of the search result.
|
18 |
/// * `engine` - The names of the upstream engines from which this results were provided.
|
19 |
+
#[derive(Debug, Serialize, Deserialize)]
|
20 |
#[serde(rename_all = "camelCase")]
|
21 |
pub struct SearchResult {
|
22 |
pub title: String,
|
|
|
116 |
}
|
117 |
}
|
118 |
|
119 |
+
/// A named struct to store, serialize and deserialize all the search results scraped and
|
120 |
+
/// aggregated from the upstream search engines.
|
121 |
///
|
122 |
/// # Fields
|
123 |
///
|
124 |
/// * `results` - Stores the individual serializable `SearchResult` struct into a vector of
|
125 |
/// `SearchResult` structs.
|
126 |
/// * `page_query` - Stores the current pages search query `q` provided in the search url.
|
127 |
+
#[derive(Serialize, Deserialize)]
|
128 |
#[serde(rename_all = "camelCase")]
|
129 |
pub struct SearchResults {
|
130 |
pub results: Vec<SearchResult>,
|
src/search_results_handler/aggregator.rs
CHANGED
@@ -25,7 +25,7 @@ use crate::engines::{duckduckgo, searx};
|
|
25 |
/// # Arguments
|
26 |
///
|
27 |
/// * `query` - Accepts a string to query with the above upstream search engines.
|
28 |
-
/// * `page` - Accepts an
|
29 |
///
|
30 |
/// # Error
|
31 |
///
|
@@ -34,7 +34,7 @@ use crate::engines::{duckduckgo, searx};
|
|
34 |
/// containing appropriate values.
|
35 |
pub async fn aggregate(
|
36 |
query: &str,
|
37 |
-
page:
|
38 |
) -> Result<SearchResults, Box<dyn std::error::Error>> {
|
39 |
let user_agent: String = random_user_agent();
|
40 |
let mut result_map: HashMap<String, RawSearchResult> = HashMap::new();
|
|
|
25 |
/// # Arguments
|
26 |
///
|
27 |
/// * `query` - Accepts a string to query with the above upstream search engines.
|
28 |
+
/// * `page` - Accepts a u32 page number.
|
29 |
///
|
30 |
/// # Error
|
31 |
///
|
|
|
34 |
/// containing appropriate values.
|
35 |
pub async fn aggregate(
|
36 |
query: &str,
|
37 |
+
page: u32,
|
38 |
) -> Result<SearchResults, Box<dyn std::error::Error>> {
|
39 |
let user_agent: String = random_user_agent();
|
40 |
let mut result_map: HashMap<String, RawSearchResult> = HashMap::new();
|
src/server/routes.rs
CHANGED
@@ -4,7 +4,11 @@
|
|
4 |
|
5 |
use std::fs::read_to_string;
|
6 |
|
7 |
-
use crate::{
|
|
|
|
|
|
|
|
|
8 |
use actix_web::{get, web, HttpRequest, HttpResponse};
|
9 |
use handlebars::Handlebars;
|
10 |
use serde::Deserialize;
|
@@ -67,6 +71,9 @@ pub async fn search(
|
|
67 |
config: web::Data<Config>,
|
68 |
) -> Result<HttpResponse, Box<dyn std::error::Error>> {
|
69 |
let params = web::Query::<SearchParams>::from_query(req.query_string())?;
|
|
|
|
|
|
|
70 |
match ¶ms.q {
|
71 |
Some(query) => {
|
72 |
if query.trim().is_empty() {
|
@@ -74,11 +81,63 @@ pub async fn search(
|
|
74 |
.insert_header(("location", "/"))
|
75 |
.finish())
|
76 |
} else {
|
77 |
-
|
78 |
-
|
79 |
-
|
80 |
-
|
81 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
82 |
}
|
83 |
}
|
84 |
None => Ok(HttpResponse::Found()
|
@@ -115,6 +174,3 @@ pub async fn settings(
|
|
115 |
let page_content: String = hbs.render("settings", &config.style)?;
|
116 |
Ok(HttpResponse::Ok().body(page_content))
|
117 |
}
|
118 |
-
|
119 |
-
// TODO: Write tests for tesing parameters for search function that if provided with something
|
120 |
-
// other than u32 like alphabets and special characters than it should panic
|
|
|
4 |
|
5 |
use std::fs::read_to_string;
|
6 |
|
7 |
+
use crate::{
|
8 |
+
cache::cacher::RedisCache,
|
9 |
+
config_parser::parser::Config,
|
10 |
+
search_results_handler::{aggregation_models::SearchResults, aggregator::aggregate},
|
11 |
+
};
|
12 |
use actix_web::{get, web, HttpRequest, HttpResponse};
|
13 |
use handlebars::Handlebars;
|
14 |
use serde::Deserialize;
|
|
|
71 |
config: web::Data<Config>,
|
72 |
) -> Result<HttpResponse, Box<dyn std::error::Error>> {
|
73 |
let params = web::Query::<SearchParams>::from_query(req.query_string())?;
|
74 |
+
|
75 |
+
//Initialize redis cache connection struct
|
76 |
+
let redis_cache = RedisCache::new(config.redis_connection_url.clone());
|
77 |
match ¶ms.q {
|
78 |
Some(query) => {
|
79 |
if query.trim().is_empty() {
|
|
|
81 |
.insert_header(("location", "/"))
|
82 |
.finish())
|
83 |
} else {
|
84 |
+
// Initialize the page url as an empty string
|
85 |
+
let mut page_url = String::new();
|
86 |
+
|
87 |
+
// Find whether the page is valid page number if not then return
|
88 |
+
// the first page number and also construct the page_url accordingly
|
89 |
+
let page = match params.page {
|
90 |
+
Some(page_number) => {
|
91 |
+
if page_number <= 1 {
|
92 |
+
page_url = format!(
|
93 |
+
"http://{}:{}/search?q={}&page={}",
|
94 |
+
config.binding_ip_addr, config.port, query, 1
|
95 |
+
);
|
96 |
+
1
|
97 |
+
} else {
|
98 |
+
page_url = format!(
|
99 |
+
"http://{}:{}/search?q={}&page={}",
|
100 |
+
config.binding_ip_addr, config.port, query, page_number
|
101 |
+
);
|
102 |
+
|
103 |
+
page_number
|
104 |
+
}
|
105 |
+
}
|
106 |
+
None => {
|
107 |
+
page_url = format!(
|
108 |
+
"http://{}:{}{}&page={}",
|
109 |
+
config.binding_ip_addr,
|
110 |
+
config.port,
|
111 |
+
req.uri(),
|
112 |
+
1
|
113 |
+
);
|
114 |
+
|
115 |
+
1
|
116 |
+
}
|
117 |
+
};
|
118 |
+
|
119 |
+
// fetch the cached results json.
|
120 |
+
let cached_results_json = redis_cache.clone().cached_results_json(page_url.clone());
|
121 |
+
// check if fetched results was indeed fetched or it was an error and if so
|
122 |
+
// handle the data accordingly.
|
123 |
+
match cached_results_json {
|
124 |
+
Ok(results_json) => {
|
125 |
+
let new_results_json: SearchResults = serde_json::from_str(&results_json)?;
|
126 |
+
let page_content: String = hbs.render("search", &new_results_json)?;
|
127 |
+
Ok(HttpResponse::Ok().body(page_content))
|
128 |
+
}
|
129 |
+
Err(_) => {
|
130 |
+
let mut results_json: crate::search_results_handler::aggregation_models::SearchResults =
|
131 |
+
aggregate(query, page).await?;
|
132 |
+
results_json.add_style(config.style.clone());
|
133 |
+
redis_cache.clone().cache_results(
|
134 |
+
serde_json::to_string(&results_json)?,
|
135 |
+
page_url.clone(),
|
136 |
+
)?;
|
137 |
+
let page_content: String = hbs.render("search", &results_json)?;
|
138 |
+
Ok(HttpResponse::Ok().body(page_content))
|
139 |
+
}
|
140 |
+
}
|
141 |
}
|
142 |
}
|
143 |
None => Ok(HttpResponse::Found()
|
|
|
174 |
let page_content: String = hbs.render("settings", &config.style)?;
|
175 |
Ok(HttpResponse::Ok().body(page_content))
|
176 |
}
|
|
|
|
|
|
tests/index.rs
CHANGED
@@ -41,3 +41,5 @@ async fn test_index() {
|
|
41 |
assert_eq!(res.text().await.unwrap(), template);
|
42 |
}
|
43 |
|
|
|
|
|
|
41 |
assert_eq!(res.text().await.unwrap(), template);
|
42 |
}
|
43 |
|
44 |
+
// TODO: Write tests for testing parameters for search function that if provided with something
|
45 |
+
// other than u32 like alphabets and special characters then it should panic
|
websurfx/config.lua
CHANGED
@@ -16,3 +16,6 @@ binding_ip_addr = "127.0.0.1" --ip address on the which server should be launche
|
|
16 |
-- }}
|
17 |
colorscheme = "catppuccin-mocha" -- the colorscheme name which should be used for the website theme
|
18 |
theme = "simple" -- the theme name which should be used for the website
|
|
|
|
|
|
|
|
16 |
-- }}
|
17 |
colorscheme = "catppuccin-mocha" -- the colorscheme name which should be used for the website theme
|
18 |
theme = "simple" -- the theme name which should be used for the website
|
19 |
+
|
20 |
+
-- Caching
|
21 |
+
redis_connection_url = "redis://127.0.0.1:8082" -- redis connection url address to which the client should connect.
|