Merge branch 'rolling' into improve-and-fix-settings-page
Browse files- Cargo.lock +17 -0
- Cargo.toml +29 -2
- README.md +14 -14
- public/static/index.js +22 -7
- public/static/pagination.js +27 -14
- src/config_parser/parser.rs +1 -1
- src/engines/duckduckgo.rs +66 -16
- src/engines/engine_models.rs +43 -0
- src/engines/mod.rs +1 -0
- src/engines/searx.rs +62 -15
- src/search_results_handler/aggregator.rs +13 -2
Cargo.lock
CHANGED
@@ -268,6 +268,12 @@ dependencies = [
|
|
268 |
"alloc-no-stdlib",
|
269 |
]
|
270 |
|
|
|
|
|
|
|
|
|
|
|
|
|
271 |
[[package]]
|
272 |
name = "askama_escape"
|
273 |
version = "0.10.3"
|
@@ -739,6 +745,16 @@ dependencies = [
|
|
739 |
"libc",
|
740 |
]
|
741 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
742 |
[[package]]
|
743 |
name = "failure"
|
744 |
version = "0.1.8"
|
@@ -3370,6 +3386,7 @@ dependencies = [
|
|
3370 |
"actix-files",
|
3371 |
"actix-web",
|
3372 |
"env_logger",
|
|
|
3373 |
"fake-useragent",
|
3374 |
"handlebars",
|
3375 |
"log",
|
|
|
268 |
"alloc-no-stdlib",
|
269 |
]
|
270 |
|
271 |
+
[[package]]
|
272 |
+
name = "anyhow"
|
273 |
+
version = "1.0.71"
|
274 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
275 |
+
checksum = "9c7d0618f0e0b7e8ff11427422b64564d5fb0be1940354bfe2e0529b18a9d9b8"
|
276 |
+
|
277 |
[[package]]
|
278 |
name = "askama_escape"
|
279 |
version = "0.10.3"
|
|
|
745 |
"libc",
|
746 |
]
|
747 |
|
748 |
+
[[package]]
|
749 |
+
name = "error-stack"
|
750 |
+
version = "0.3.1"
|
751 |
+
source = "registry+https://github.com/rust-lang/crates.io-index"
|
752 |
+
checksum = "5f00447f331c7f726db5b8532ebc9163519eed03c6d7c8b73c90b3ff5646ac85"
|
753 |
+
dependencies = [
|
754 |
+
"anyhow",
|
755 |
+
"rustc_version 0.4.0",
|
756 |
+
]
|
757 |
+
|
758 |
[[package]]
|
759 |
name = "failure"
|
760 |
version = "0.1.8"
|
|
|
3386 |
"actix-files",
|
3387 |
"actix-web",
|
3388 |
"env_logger",
|
3389 |
+
"error-stack",
|
3390 |
"fake-useragent",
|
3391 |
"handlebars",
|
3392 |
"log",
|
Cargo.toml
CHANGED
@@ -2,8 +2,9 @@
|
|
2 |
name = "websurfx"
|
3 |
version = "0.13.0"
|
4 |
edition = "2021"
|
5 |
-
|
6 |
-
|
|
|
7 |
|
8 |
[dependencies]
|
9 |
reqwest = {version="*",features=["json"]}
|
@@ -22,6 +23,32 @@ redis = {version="*"}
|
|
22 |
md5 = {version="*"}
|
23 |
rand={version="*"}
|
24 |
once_cell = {version="*"}
|
|
|
25 |
|
26 |
[dev-dependencies]
|
27 |
rusty-hook = "^0.11.2"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
2 |
name = "websurfx"
|
3 |
version = "0.13.0"
|
4 |
edition = "2021"
|
5 |
+
description = "An open-source alternative to Searx that provides clean, ad-free, and organic results with incredible speed while keeping privacy and security in mind."
|
6 |
+
repository = "https://github.com/neon-mmd/websurfx"
|
7 |
+
license = "AGPL-3.0"
|
8 |
|
9 |
[dependencies]
|
10 |
reqwest = {version="*",features=["json"]}
|
|
|
23 |
md5 = {version="*"}
|
24 |
rand={version="*"}
|
25 |
once_cell = {version="*"}
|
26 |
+
error-stack = {version="0.3.1"}
|
27 |
|
28 |
[dev-dependencies]
|
29 |
rusty-hook = "^0.11.2"
|
30 |
+
|
31 |
+
[profile.dev]
|
32 |
+
opt-level = 0
|
33 |
+
debug = true
|
34 |
+
split-debuginfo = '...'
|
35 |
+
debug-assertions = true
|
36 |
+
overflow-checks = true
|
37 |
+
lto = false
|
38 |
+
panic = 'unwind'
|
39 |
+
incremental = true
|
40 |
+
codegen-units = 256
|
41 |
+
rpath = false
|
42 |
+
|
43 |
+
[profile.release]
|
44 |
+
opt-level = 3
|
45 |
+
debug = false
|
46 |
+
split-debuginfo = '...'
|
47 |
+
debug-assertions = false
|
48 |
+
overflow-checks = false
|
49 |
+
lto = 'thin'
|
50 |
+
panic = 'unwind'
|
51 |
+
incremental = false
|
52 |
+
codegen-units = 16
|
53 |
+
rpath = false
|
54 |
+
strip = "debuginfo"
|
README.md
CHANGED
@@ -1,4 +1,4 @@
|
|
1 |
-
<h1 align="center">
|
2 |
<img src="./images/websurfx_logo.png" alt="websurfx logo" align="center" />
|
3 |
</h1>
|
4 |
<p align="center">
|
@@ -39,7 +39,7 @@
|
|
39 |
>meta search engine</a
|
40 |
>
|
41 |
(pronounced as websurface or web-surface /wɛbˈsɜːrfəs/.) written in Rust. It
|
42 |
-
provides a quick and secure search experience while
|
43 |
privacy.</i
|
44 |
>
|
45 |
</p>
|
@@ -72,7 +72,7 @@
|
|
72 |
|
73 |
# Preview 🔭
|
74 |
|
75 |
-
##
|
76 |
|
77 |
<img align="center" src="./images/main_page.png" />
|
78 |
|
@@ -88,7 +88,7 @@
|
|
88 |
|
89 |
# Features 🚀
|
90 |
|
91 |
-
- 🎨
|
92 |
- 🔐 Fast, private, and secure
|
93 |
- 🆓 100% free and open source
|
94 |
- 💨 Ad-free and clean results
|
@@ -116,7 +116,7 @@ redis-server --port 8082 &
|
|
116 |
Once you have started the server, open your preferred web browser and navigate to <http://127.0.0.1:8080> to start using Websurfx.
|
117 |
|
118 |
> **Warning**
|
119 |
-
>
|
120 |
|
121 |
**[⬆️ Back to Top](#--)**
|
122 |
|
@@ -132,14 +132,14 @@ Websurfx is configured through the config.lua file, located at `websurfx/config.
|
|
132 |
|
133 |
> For full theming and customization instructions, see: [**Theming**](./docs/theming.md)
|
134 |
|
135 |
-
Websurfx comes with several themes and color schemes
|
136 |
|
137 |
**[⬆️ Back to Top](#--)**
|
138 |
|
139 |
# Multi-Language Support 🌍
|
140 |
|
141 |
> **Note**
|
142 |
-
> Currently, we do not support other languages
|
143 |
|
144 |
**[⬆️ Back to Top](#--)**
|
145 |
|
@@ -153,15 +153,15 @@ At present, we only support x86_64 architecture systems, but we would love to ha
|
|
153 |
|
154 |
## Why Websurfx?
|
155 |
|
156 |
-
The primary purpose of the Websurfx project is to create a fast, secure, and privacy-focused meta-search engine.
|
157 |
|
158 |
## Why AGPLv3?
|
159 |
|
160 |
-
Websurfx is distributed under the **AGPLv3** license to keep the source code open and transparent. This helps
|
161 |
|
162 |
## Why Rust?
|
163 |
|
164 |
-
|
165 |
|
166 |
**[⬆️ Back to Top](#--)**
|
167 |
|
@@ -175,14 +175,14 @@ We are looking for more willing contributors to help grow this project. For more
|
|
175 |
|
176 |
> For full details and other ways you can help out, see: [**Contributing**]()
|
177 |
|
178 |
-
If you use Websurfx and would like to contribute to its development,
|
179 |
|
180 |
Several areas that we need a bit of help with at the moment are:
|
181 |
- **Better and more color schemes**: Help fix color schemes and add other famous color schemes.
|
182 |
- **Improve evasion code for bot detection** - Help improve code related to evading IP blocking and emulating human behaviors located in everyone's engine file.
|
183 |
- **Logo** - Help create a logo for the project and website.
|
184 |
- **Docker Support** - Help write a Docker Compose file for the project.
|
185 |
-
- Submit a PR to add a new feature, fix a bug, update the docs, add a theme, widget, or
|
186 |
- Star Websurfx on GitHub.
|
187 |
|
188 |
**[⬆️ Back to Top](#--)**
|
@@ -196,13 +196,13 @@ Several areas that we need a bit of help with at the moment are:
|
|
196 |
|
197 |
# Roadmap 🛣️
|
198 |
|
199 |
-
> Coming soon
|
200 |
|
201 |
**[⬆️ Back to Top](#--)**
|
202 |
|
203 |
# Contributing 🙋
|
204 |
|
205 |
-
Contributions are welcome from anyone. It doesn
|
206 |
|
207 |
## Not a developer but still want to contribute?
|
208 |
|
|
|
1 |
+
<h1 align="center">
|
2 |
<img src="./images/websurfx_logo.png" alt="websurfx logo" align="center" />
|
3 |
</h1>
|
4 |
<p align="center">
|
|
|
39 |
>meta search engine</a
|
40 |
>
|
41 |
(pronounced as websurface or web-surface /wɛbˈsɜːrfəs/.) written in Rust. It
|
42 |
+
provides a quick and secure search experience while completely respecting user
|
43 |
privacy.</i
|
44 |
>
|
45 |
</p>
|
|
|
72 |
|
73 |
# Preview 🔭
|
74 |
|
75 |
+
## Home Page
|
76 |
|
77 |
<img align="center" src="./images/main_page.png" />
|
78 |
|
|
|
88 |
|
89 |
# Features 🚀
|
90 |
|
91 |
+
- 🎨 Make Websurfx uniquely yours with nine color schemes provided by default. It also supports creation of custom themes and color schemes in a quick and easy way, so unleash your creativity!
|
92 |
- 🔐 Fast, private, and secure
|
93 |
- 🆓 100% free and open source
|
94 |
- 💨 Ad-free and clean results
|
|
|
116 |
Once you have started the server, open your preferred web browser and navigate to <http://127.0.0.1:8080> to start using Websurfx.
|
117 |
|
118 |
> **Warning**
|
119 |
+
> This project is still in the testing phase and is **not** ready for production use.
|
120 |
|
121 |
**[⬆️ Back to Top](#--)**
|
122 |
|
|
|
132 |
|
133 |
> For full theming and customization instructions, see: [**Theming**](./docs/theming.md)
|
134 |
|
135 |
+
Websurfx comes loaded with several themes and color schemes, which you can apply and edit through the config file. It also supports custom themes and color schemes using CSS, allowing you to make it truly yours.
|
136 |
|
137 |
**[⬆️ Back to Top](#--)**
|
138 |
|
139 |
# Multi-Language Support 🌍
|
140 |
|
141 |
> **Note**
|
142 |
+
> Currently, we do not support other languages but we will start accepting contributions regarding language support in the future. We believe language should never be a barrier to entry.
|
143 |
|
144 |
**[⬆️ Back to Top](#--)**
|
145 |
|
|
|
153 |
|
154 |
## Why Websurfx?
|
155 |
|
156 |
+
The primary purpose of the Websurfx project is to create a fast, secure, and privacy-focused meta-search engine. There are numerous meta-search engines available, but not all guarantee the security of their search engine, which is critical for maintaining privacy. Memory flaws, for example, can expose private or sensitive information, which is understandably bad. There is also the added problem of spam, ads, and inorganic results which most engines don't have a fool-proof answer to. Until now. With Websurfx I finally put a full stop to this problem. Websurfx is based on Rust, which ensures memory safety and removes such issues. Many meta-search engines also lack important features like advanced picture search, required by graphic designers, content providers, and others. Websurfx improves the user experience by providing these and other features, such as proper NSFW blocking and Micro-apps or Quick Results (providing a calculator, currency exchanges, etc in the search results).
|
157 |
|
158 |
## Why AGPLv3?
|
159 |
|
160 |
+
Websurfx is distributed under the **AGPLv3** license to keep the source code open and transparent. This helps keep malware, telemetry, and other dangers out of the project. **AGPLv3** is a strong copyleft license that ensures the software's source code, including any modifications or improvements made to the code, remains open and available to everyone.
|
161 |
|
162 |
## Why Rust?
|
163 |
|
164 |
+
Websurfx is based on Rust due to its memory safety features, which prevent vulnerabilities and make the codebase more secure. Rust is also faster than C++, contributing to Websurfx's speed and responsiveness. Finally, the Rust ownership and borrowing system enables secure concurrency and thread safety in the program.
|
165 |
|
166 |
**[⬆️ Back to Top](#--)**
|
167 |
|
|
|
175 |
|
176 |
> For full details and other ways you can help out, see: [**Contributing**]()
|
177 |
|
178 |
+
If you use Websurfx and would like to contribute to its development, we're glad to have you on board! Contributions of any size or type are always welcome, and we will always acknowledge your efforts.
|
179 |
|
180 |
Several areas that we need a bit of help with at the moment are:
|
181 |
- **Better and more color schemes**: Help fix color schemes and add other famous color schemes.
|
182 |
- **Improve evasion code for bot detection** - Help improve code related to evading IP blocking and emulating human behaviors located in everyone's engine file.
|
183 |
- **Logo** - Help create a logo for the project and website.
|
184 |
- **Docker Support** - Help write a Docker Compose file for the project.
|
185 |
+
- Submit a PR to add a new feature, fix a bug, update the docs, add a theme, widget, or anything else.
|
186 |
- Star Websurfx on GitHub.
|
187 |
|
188 |
**[⬆️ Back to Top](#--)**
|
|
|
196 |
|
197 |
# Roadmap 🛣️
|
198 |
|
199 |
+
> Coming soon! 🙂.
|
200 |
|
201 |
**[⬆️ Back to Top](#--)**
|
202 |
|
203 |
# Contributing 🙋
|
204 |
|
205 |
+
Contributions are welcome from anyone. It doesn't matter who you are; you can still contribute to the project in your own way.
|
206 |
|
207 |
## Not a developer but still want to contribute?
|
208 |
|
public/static/index.js
CHANGED
@@ -1,10 +1,25 @@
|
|
1 |
-
|
2 |
-
|
3 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
4 |
}
|
5 |
|
6 |
-
|
7 |
-
|
8 |
-
|
|
|
|
|
|
|
|
|
9 |
}
|
10 |
-
})
|
|
|
1 |
+
/**
|
2 |
+
* Selects the input element for the search box
|
3 |
+
* @type {HTMLInputElement}
|
4 |
+
*/
|
5 |
+
const searchBox = document.querySelector('input');
|
6 |
+
|
7 |
+
/**
|
8 |
+
* Redirects the user to the search results page with the query parameter
|
9 |
+
*/
|
10 |
+
function searchWeb() {
|
11 |
+
const query = searchBox.value.trim();
|
12 |
+
if (query) {
|
13 |
+
window.location.href = `search?q=${encodeURIComponent(query)}`;
|
14 |
+
}
|
15 |
}
|
16 |
|
17 |
+
/**
|
18 |
+
* Listens for the 'Enter' key press event on the search box and calls the searchWeb function
|
19 |
+
* @param {KeyboardEvent} e - The keyboard event object
|
20 |
+
*/
|
21 |
+
searchBox.addEventListener('keyup', (e) => {
|
22 |
+
if (e.key === 'Enter') {
|
23 |
+
searchWeb();
|
24 |
}
|
25 |
+
});
|
public/static/pagination.js
CHANGED
@@ -1,26 +1,39 @@
|
|
|
|
|
|
|
|
|
|
1 |
function navigate_forward() {
|
2 |
-
const url = new URL(window.location)
|
3 |
-
const searchParams = url.searchParams
|
4 |
|
5 |
-
let q = searchParams.get('q')
|
6 |
-
let page = searchParams.get('page')
|
7 |
|
8 |
-
if (page
|
9 |
-
page =
|
10 |
-
window.location = `${url.origin}${url.pathname}?q=${q}&page=${page}`
|
11 |
} else {
|
12 |
-
|
13 |
}
|
|
|
|
|
14 |
}
|
15 |
|
|
|
|
|
|
|
|
|
16 |
function navigate_backward() {
|
17 |
-
const url = new URL(window.location)
|
18 |
-
const searchParams = url.searchParams
|
19 |
|
20 |
-
let q = searchParams.get('q')
|
21 |
-
let page = searchParams.get('page')
|
22 |
|
23 |
-
if (page
|
24 |
-
|
|
|
|
|
25 |
}
|
|
|
|
|
26 |
}
|
|
|
1 |
+
/**
|
2 |
+
* Navigates to the next page by incrementing the current page number in the URL query parameters.
|
3 |
+
* @returns {void}
|
4 |
+
*/
|
5 |
function navigate_forward() {
|
6 |
+
const url = new URL(window.location);
|
7 |
+
const searchParams = url.searchParams;
|
8 |
|
9 |
+
let q = searchParams.get('q');
|
10 |
+
let page = parseInt(searchParams.get('page'));
|
11 |
|
12 |
+
if (isNaN(page)) {
|
13 |
+
page = 1;
|
|
|
14 |
} else {
|
15 |
+
page++;
|
16 |
}
|
17 |
+
|
18 |
+
window.location.href = `${url.origin}${url.pathname}?q=${encodeURIComponent(q)}&page=${page}`;
|
19 |
}
|
20 |
|
21 |
+
/**
|
22 |
+
* Navigates to the previous page by decrementing the current page number in the URL query parameters.
|
23 |
+
* @returns {void}
|
24 |
+
*/
|
25 |
function navigate_backward() {
|
26 |
+
const url = new URL(window.location);
|
27 |
+
const searchParams = url.searchParams;
|
28 |
|
29 |
+
let q = searchParams.get('q');
|
30 |
+
let page = parseInt(searchParams.get('page'));
|
31 |
|
32 |
+
if (isNaN(page)) {
|
33 |
+
page = 1;
|
34 |
+
} else if (page > 1) {
|
35 |
+
page--;
|
36 |
}
|
37 |
+
|
38 |
+
window.location.href = `${url.origin}${url.pathname}?q=${encodeURIComponent(q)}&page=${page}`;
|
39 |
}
|
src/config_parser/parser.rs
CHANGED
@@ -118,7 +118,7 @@ impl Config {
|
|
118 |
{
|
119 |
Ok("./websurfx/config.lua".to_string())
|
120 |
} else {
|
121 |
-
Err(
|
122 |
}
|
123 |
}
|
124 |
}
|
|
|
118 |
{
|
119 |
Ok("./websurfx/config.lua".to_string())
|
120 |
} else {
|
121 |
+
Err("Config file not found!!".to_string().into())
|
122 |
}
|
123 |
}
|
124 |
}
|
src/engines/duckduckgo.rs
CHANGED
@@ -2,13 +2,17 @@
|
|
2 |
//! by querying the upstream duckduckgo search engine with user provided query and with a page
|
3 |
//! number if provided.
|
4 |
|
5 |
-
use std::collections::HashMap;
|
6 |
|
7 |
use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT};
|
8 |
use scraper::{Html, Selector};
|
9 |
|
10 |
use crate::search_results_handler::aggregation_models::RawSearchResult;
|
11 |
|
|
|
|
|
|
|
|
|
12 |
/// This function scrapes results from the upstream engine duckduckgo and puts all the scraped
|
13 |
/// results like title, visiting_url (href in html),engine (from which engine it was fetched from)
|
14 |
/// and description in a RawSearchResult and then adds that to HashMap whose keys are url and
|
@@ -22,14 +26,15 @@ use crate::search_results_handler::aggregation_models::RawSearchResult;
|
|
22 |
///
|
23 |
/// # Errors
|
24 |
///
|
25 |
-
/// Returns
|
26 |
-
/// reach the above `upstream search engine` page
|
27 |
-
///
|
|
|
28 |
pub async fn results(
|
29 |
query: &str,
|
30 |
page: u32,
|
31 |
user_agent: &str,
|
32 |
-
) -> Result<HashMap<String, RawSearchResult>,
|
33 |
// Page number can be missing or empty string and so appropriate handling is required
|
34 |
// so that upstream server receives valid page number.
|
35 |
let url: String = match page {
|
@@ -48,26 +53,71 @@ pub async fn results(
|
|
48 |
|
49 |
// initializing HeaderMap and adding appropriate headers.
|
50 |
let mut header_map = HeaderMap::new();
|
51 |
-
header_map.insert(
|
52 |
-
|
53 |
-
|
54 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
55 |
|
56 |
// fetch the html from upstream duckduckgo engine
|
57 |
-
// TODO: Write better error handling code to handle no results case.
|
58 |
let results: String = reqwest::Client::new()
|
59 |
.get(url)
|
|
|
60 |
.headers(header_map) // add spoofed headers to emulate human behaviour
|
61 |
.send()
|
62 |
-
.await
|
|
|
|
|
63 |
.text()
|
64 |
-
.await
|
|
|
|
|
65 |
|
66 |
let document: Html = Html::parse_document(&results);
|
67 |
-
|
68 |
-
let
|
69 |
-
|
70 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
71 |
|
72 |
// scrape all the results from the html
|
73 |
Ok(document
|
|
|
2 |
//! by querying the upstream duckduckgo search engine with user provided query and with a page
|
3 |
//! number if provided.
|
4 |
|
5 |
+
use std::{collections::HashMap, time::Duration};
|
6 |
|
7 |
use reqwest::header::{HeaderMap, CONTENT_TYPE, COOKIE, REFERER, USER_AGENT};
|
8 |
use scraper::{Html, Selector};
|
9 |
|
10 |
use crate::search_results_handler::aggregation_models::RawSearchResult;
|
11 |
|
12 |
+
use super::engine_models::EngineError;
|
13 |
+
|
14 |
+
use error_stack::{IntoReport, Report, Result, ResultExt};
|
15 |
+
|
16 |
/// This function scrapes results from the upstream engine duckduckgo and puts all the scraped
|
17 |
/// results like title, visiting_url (href in html),engine (from which engine it was fetched from)
|
18 |
/// and description in a RawSearchResult and then adds that to HashMap whose keys are url and
|
|
|
26 |
///
|
27 |
/// # Errors
|
28 |
///
|
29 |
+
/// Returns an `EngineError` if the user is not connected to the internet or if there is a failure to
|
30 |
+
/// reach the above `upstream search engine` page or if the `upstream search engine` is unable to
|
31 |
+
/// provide results for the requested search query and also returns error if the scraping selector
|
32 |
+
/// or HeaderMap fails to initialize.
|
33 |
pub async fn results(
|
34 |
query: &str,
|
35 |
page: u32,
|
36 |
user_agent: &str,
|
37 |
+
) -> Result<HashMap<String, RawSearchResult>, EngineError> {
|
38 |
// Page number can be missing or empty string and so appropriate handling is required
|
39 |
// so that upstream server receives valid page number.
|
40 |
let url: String = match page {
|
|
|
53 |
|
54 |
// initializing HeaderMap and adding appropriate headers.
|
55 |
let mut header_map = HeaderMap::new();
|
56 |
+
header_map.insert(
|
57 |
+
USER_AGENT,
|
58 |
+
user_agent
|
59 |
+
.parse()
|
60 |
+
.into_report()
|
61 |
+
.change_context(EngineError::UnexpectedError)?,
|
62 |
+
);
|
63 |
+
header_map.insert(
|
64 |
+
REFERER,
|
65 |
+
"https://google.com/"
|
66 |
+
.parse()
|
67 |
+
.into_report()
|
68 |
+
.change_context(EngineError::UnexpectedError)?,
|
69 |
+
);
|
70 |
+
header_map.insert(
|
71 |
+
CONTENT_TYPE,
|
72 |
+
"application/x-www-form-urlencoded"
|
73 |
+
.parse()
|
74 |
+
.into_report()
|
75 |
+
.change_context(EngineError::UnexpectedError)?,
|
76 |
+
);
|
77 |
+
header_map.insert(
|
78 |
+
COOKIE,
|
79 |
+
"kl=wt-wt"
|
80 |
+
.parse()
|
81 |
+
.into_report()
|
82 |
+
.change_context(EngineError::UnexpectedError)?,
|
83 |
+
);
|
84 |
|
85 |
// fetch the html from upstream duckduckgo engine
|
|
|
86 |
let results: String = reqwest::Client::new()
|
87 |
.get(url)
|
88 |
+
.timeout(Duration::from_secs(5))
|
89 |
.headers(header_map) // add spoofed headers to emulate human behaviour
|
90 |
.send()
|
91 |
+
.await
|
92 |
+
.into_report()
|
93 |
+
.change_context(EngineError::RequestError)?
|
94 |
.text()
|
95 |
+
.await
|
96 |
+
.into_report()
|
97 |
+
.change_context(EngineError::RequestError)?;
|
98 |
|
99 |
let document: Html = Html::parse_document(&results);
|
100 |
+
|
101 |
+
let no_result: Selector = Selector::parse(".no-results")
|
102 |
+
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
103 |
+
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".no-results"))?;
|
104 |
+
|
105 |
+
if document.select(&no_result).next().is_some() {
|
106 |
+
return Err(Report::new(EngineError::EmptyResultSet));
|
107 |
+
}
|
108 |
+
|
109 |
+
let results: Selector = Selector::parse(".result")
|
110 |
+
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
111 |
+
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?;
|
112 |
+
let result_title: Selector = Selector::parse(".result__a")
|
113 |
+
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
114 |
+
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__a"))?;
|
115 |
+
let result_url: Selector = Selector::parse(".result__url")
|
116 |
+
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
117 |
+
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__url"))?;
|
118 |
+
let result_desc: Selector = Selector::parse(".result__snippet")
|
119 |
+
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
120 |
+
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result__snippet"))?;
|
121 |
|
122 |
// scrape all the results from the html
|
123 |
Ok(document
|
src/engines/engine_models.rs
ADDED
@@ -0,0 +1,43 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
//! This module provides the error enum used to handle the different errors that can occur while requesting data from
|
2 |
+
//! the upstream search engines with the search query provided by the user.
|
3 |
+
|
4 |
+
use error_stack::Context;
|
5 |
+
use std::fmt;
|
6 |
+
|
7 |
+
/// A custom error type used to handle engine-associated errors.
|
8 |
+
///
|
9 |
+
/// This enum provides variants for three different categories of errors:
|
10 |
+
/// * `RequestError` - This variant handles all request related errors like forbidden, not found,
|
11 |
+
/// etc.
|
12 |
+
/// * `EmptyResultSet` - This variant handles the no-results-found error provided by the upstream
|
13 |
+
/// search engines.
|
14 |
+
/// * `UnexpectedError` - This variant handles all the errors which are unexpected or occur rarely
|
15 |
+
/// and are errors mostly related to failure in initialization of HeaderMap, Selector errors and
|
16 |
+
/// all other errors occurring within the code handling the `upstream search engines`.
|
17 |
+
#[derive(Debug)]
|
18 |
+
pub enum EngineError {
|
19 |
+
EmptyResultSet,
|
20 |
+
RequestError,
|
21 |
+
UnexpectedError,
|
22 |
+
}
|
23 |
+
|
24 |
+
impl fmt::Display for EngineError {
|
25 |
+
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
|
26 |
+
match self {
|
27 |
+
EngineError::EmptyResultSet => {
|
28 |
+
write!(f, "The upstream search engine returned an empty result set")
|
29 |
+
}
|
30 |
+
EngineError::RequestError => {
|
31 |
+
write!(
|
32 |
+
f,
|
33 |
+
"Error occurred while requesting data from upstream search engine"
|
34 |
+
)
|
35 |
+
}
|
36 |
+
EngineError::UnexpectedError => {
|
37 |
+
write!(f, "An unexpected error occurred while processing the data")
|
38 |
+
}
|
39 |
+
}
|
40 |
+
}
|
41 |
+
}
|
42 |
+
|
43 |
+
impl Context for EngineError {}
|
src/engines/mod.rs
CHANGED
@@ -1,2 +1,3 @@
|
|
1 |
pub mod duckduckgo;
|
|
|
2 |
pub mod searx;
|
|
|
1 |
pub mod duckduckgo;
|
2 |
+
pub mod engine_models;
|
3 |
pub mod searx;
|
src/engines/searx.rs
CHANGED
@@ -8,6 +8,9 @@ use std::collections::HashMap;
|
|
8 |
|
9 |
use crate::search_results_handler::aggregation_models::RawSearchResult;
|
10 |
|
|
|
|
|
|
|
11 |
/// This function scrapes results from the upstream engine searx and puts all the scraped
|
12 |
/// results like title, visiting_url (href in html),engine (from which engine it was fetched from)
|
13 |
/// and description in a RawSearchResult and then adds that to HashMap whose keys are url and
|
@@ -21,40 +24,84 @@ use crate::search_results_handler::aggregation_models::RawSearchResult;
|
|
21 |
///
|
22 |
/// # Errors
|
23 |
///
|
24 |
-
/// Returns
|
25 |
-
/// reach the above `upstream search engine` page
|
26 |
-
///
|
|
|
27 |
pub async fn results(
|
28 |
query: &str,
|
29 |
page: u32,
|
30 |
user_agent: &str,
|
31 |
-
) -> Result<HashMap<String, RawSearchResult>,
|
32 |
// Page number can be missing or empty string and so appropriate handling is required
|
33 |
// so that upstream server receives valid page number.
|
34 |
let url: String = format!("https://searx.work/search?q={query}&pageno={page}");
|
35 |
|
36 |
// initializing headers and adding appropriate headers.
|
37 |
let mut header_map = HeaderMap::new();
|
38 |
-
header_map.insert(
|
39 |
-
|
40 |
-
|
41 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
42 |
|
43 |
// fetch the html from upstream searx instance engine
|
44 |
-
// TODO: Write better error handling code to handle no results case.
|
45 |
let results: String = reqwest::Client::new()
|
46 |
.get(url)
|
47 |
.headers(header_map) // add spoofed headers to emulate human behaviours.
|
48 |
.send()
|
49 |
-
.await
|
|
|
|
|
50 |
.text()
|
51 |
-
.await
|
|
|
|
|
52 |
|
53 |
let document: Html = Html::parse_document(&results);
|
54 |
-
|
55 |
-
let
|
56 |
-
|
57 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
58 |
|
59 |
// scrape all the results from the html
|
60 |
Ok(document
|
|
|
8 |
|
9 |
use crate::search_results_handler::aggregation_models::RawSearchResult;
|
10 |
|
11 |
+
use super::engine_models::EngineError;
|
12 |
+
use error_stack::{IntoReport, Report, Result, ResultExt};
|
13 |
+
|
14 |
/// This function scrapes results from the upstream engine searx and puts all the scraped
|
15 |
/// results like title, visiting_url (href in html),engine (from which engine it was fetched from)
|
16 |
/// and description in a RawSearchResult and then adds that to HashMap whose keys are url and
|
|
|
24 |
///
|
25 |
/// # Errors
|
26 |
///
|
27 |
+
/// Returns an `EngineError` if the user is not connected to the internet or if there is a failure to
|
28 |
+
/// reach the above `upstream search engine` page or if the `upstream search engine` is unable to
|
29 |
+
/// provide results for the requested search query and also returns error if the scraping selector
|
30 |
+
/// or HeaderMap fails to initialize.
|
31 |
pub async fn results(
|
32 |
query: &str,
|
33 |
page: u32,
|
34 |
user_agent: &str,
|
35 |
+
) -> Result<HashMap<String, RawSearchResult>, EngineError> {
|
36 |
// Page number can be missing or empty string and so appropriate handling is required
|
37 |
// so that upstream server receives valid page number.
|
38 |
let url: String = format!("https://searx.work/search?q={query}&pageno={page}");
|
39 |
|
40 |
// initializing headers and adding appropriate headers.
|
41 |
let mut header_map = HeaderMap::new();
|
42 |
+
header_map.insert(
|
43 |
+
USER_AGENT,
|
44 |
+
user_agent
|
45 |
+
.parse()
|
46 |
+
.into_report()
|
47 |
+
.change_context(EngineError::UnexpectedError)?,
|
48 |
+
);
|
49 |
+
header_map.insert(
|
50 |
+
REFERER,
|
51 |
+
"https://google.com/"
|
52 |
+
.parse()
|
53 |
+
.into_report()
|
54 |
+
.change_context(EngineError::UnexpectedError)?,
|
55 |
+
);
|
56 |
+
header_map.insert(
|
57 |
+
CONTENT_TYPE,
|
58 |
+
"application/x-www-form-urlencoded"
|
59 |
+
.parse()
|
60 |
+
.into_report()
|
61 |
+
.change_context(EngineError::UnexpectedError)?,
|
62 |
+
);
|
63 |
+
header_map.insert(COOKIE, "categories=general; language=auto; locale=en; autocomplete=duckduckgo; image_proxy=1; method=POST; safesearch=2; theme=simple; results_on_new_tab=1; doi_resolver=oadoi.org; simple_style=auto; center_alignment=1; query_in_title=1; infinite_scroll=0; disabled_engines=; enabled_engines=\"archive is__general\\054yep__general\\054curlie__general\\054currency__general\\054ddg definitions__general\\054wikidata__general\\054duckduckgo__general\\054tineye__general\\054lingva__general\\054startpage__general\\054yahoo__general\\054wiby__general\\054marginalia__general\\054alexandria__general\\054wikibooks__general\\054wikiquote__general\\054wikisource__general\\054wikiversity__general\\054wikivoyage__general\\054dictzone__general\\054seznam__general\\054mojeek__general\\054naver__general\\054wikimini__general\\054brave__general\\054petalsearch__general\\054goo__general\"; disabled_plugins=; enabled_plugins=\"searx.plugins.hostname_replace\\054searx.plugins.oa_doi_rewrite\\054searx.plugins.vim_hotkeys\"; tokens=; maintab=on; enginetab=on".parse().into_report().change_context(EngineError::UnexpectedError)?);
|
64 |
|
65 |
// fetch the html from upstream searx instance engine
|
|
|
66 |
let results: String = reqwest::Client::new()
|
67 |
.get(url)
|
68 |
.headers(header_map) // add spoofed headers to emulate human behaviours.
|
69 |
.send()
|
70 |
+
.await
|
71 |
+
.into_report()
|
72 |
+
.change_context(EngineError::RequestError)?
|
73 |
.text()
|
74 |
+
.await
|
75 |
+
.into_report()
|
76 |
+
.change_context(EngineError::RequestError)?;
|
77 |
|
78 |
let document: Html = Html::parse_document(&results);
|
79 |
+
|
80 |
+
let no_result: Selector = Selector::parse("#urls>.dialog-error>p")
|
81 |
+
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
82 |
+
.attach_printable_lazy(|| format!("invalid CSS selector: {}", "#urls>.dialog-error>p"))?;
|
83 |
+
|
84 |
+
if let Some(no_result_msg) = document.select(&no_result).nth(1) {
|
85 |
+
if no_result_msg.inner_html()
|
86 |
+
== "we didn't find any results. Please use another query or search in more categories"
|
87 |
+
{
|
88 |
+
return Err(Report::new(EngineError::EmptyResultSet));
|
89 |
+
}
|
90 |
+
}
|
91 |
+
|
92 |
+
let results: Selector = Selector::parse(".result")
|
93 |
+
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
94 |
+
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".result"))?;
|
95 |
+
let result_title: Selector = Selector::parse("h3>a")
|
96 |
+
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
97 |
+
.attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?;
|
98 |
+
let result_url: Selector = Selector::parse("h3>a")
|
99 |
+
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
100 |
+
.attach_printable_lazy(|| format!("invalid CSS selector: {}", "h3>a"))?;
|
101 |
+
|
102 |
+
let result_desc: Selector = Selector::parse(".content")
|
103 |
+
.map_err(|_| Report::new(EngineError::UnexpectedError))
|
104 |
+
.attach_printable_lazy(|| format!("invalid CSS selector: {}", ".content"))?;
|
105 |
|
106 |
// scrape all the results from the html
|
107 |
Ok(document
|
src/search_results_handler/aggregator.rs
CHANGED
@@ -58,8 +58,19 @@ pub async fn aggregate(
|
|
58 |
searx::results(query, page, &user_agent)
|
59 |
);
|
60 |
|
61 |
-
let ddg_map_results
|
62 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
63 |
|
64 |
result_map.extend(ddg_map_results);
|
65 |
|
|
|
58 |
searx::results(query, page, &user_agent)
|
59 |
);
|
60 |
|
61 |
+
let ddg_map_results = ddg_map_results.unwrap_or_else(|e| {
|
62 |
+
if debug {
|
63 |
+
log::error!("Error fetching results from DuckDuckGo: {:?}", e);
|
64 |
+
}
|
65 |
+
HashMap::new()
|
66 |
+
});
|
67 |
+
|
68 |
+
let searx_map_results = searx_map_results.unwrap_or_else(|e| {
|
69 |
+
if debug {
|
70 |
+
log::error!("Error fetching results from Searx: {:?}", e);
|
71 |
+
}
|
72 |
+
HashMap::new()
|
73 |
+
});
|
74 |
|
75 |
result_map.extend(ddg_map_results);
|
76 |
|