Jann Marc Villablanca mergify[bot] committed on
Commit
9f23a1c
1 Parent(s): 9f5213c

✨ feat(engine): `mojeek` for the search engine (#464)

Browse files

* 🧑‍💻 fix: add closing curly brace in the nix build step

* ✨ feat: add code to provide search results from `mojeek` engine

---------

Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com>

flake.nix CHANGED
@@ -60,4 +60,4 @@
60
  # calls the build function
61
  packages.websurfx = packages.default;
62
  });
63
-
 
60
  # calls the build function
61
  packages.websurfx = packages.default;
62
  });
63
+ }
src/engines/mod.rs CHANGED
@@ -6,6 +6,7 @@
6
  pub mod brave;
7
  pub mod duckduckgo;
8
  pub mod librex;
 
9
  pub mod search_result_parser;
10
  pub mod searx;
11
  pub mod startpage;
 
6
  pub mod brave;
7
  pub mod duckduckgo;
8
  pub mod librex;
9
+ pub mod mojeek;
10
  pub mod search_result_parser;
11
  pub mod searx;
12
  pub mod startpage;
src/engines/mojeek.rs ADDED
@@ -0,0 +1,151 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //! The `mojeek` module handles the scraping of results from the mojeek search engine
2
+ //! by querying the upstream mojeek search engine with the user-provided query and with a page
3
+ //! number, if provided.
4
+
5
+ use std::collections::HashMap;
6
+
7
+ use reqwest::header::HeaderMap;
8
+ use reqwest::Client;
9
+ use scraper::Html;
10
+
11
+ use crate::models::aggregation_models::SearchResult;
12
+
13
+ use crate::models::engine_models::{EngineError, SearchEngine};
14
+
15
+ use error_stack::{Report, Result, ResultExt};
16
+
17
+ use super::search_result_parser::SearchResultParser;
18
+
19
+ /// A new Mojeek engine type defined in order to implement the `SearchEngine` trait, which helps to
20
+ /// reduce code duplication and makes it easy to create a vector of different search engines.
21
+ pub struct Mojeek {
22
+ /// The parser, used to interpret the search result.
23
+ parser: SearchResultParser,
24
+ }
25
+
26
+ impl Mojeek {
27
+ /// Creates a new Mojeek engine along with its search result parser.
28
+ pub fn new() -> Result<Self, EngineError> {
29
+ Ok(Self {
30
+ parser: SearchResultParser::new(
31
+ ".result-col",
32
+ ".results-standard li",
33
+ "a span.url",
34
+ "h2 a.title",
35
+ "p.s",
36
+ )?,
37
+ })
38
+ }
39
+ }
40
+
41
+ #[async_trait::async_trait]
42
+ impl SearchEngine for Mojeek {
43
+ async fn results(
44
+ &self,
45
+ query: &str,
46
+ page: u32,
47
+ user_agent: &str,
48
+ client: &Client,
49
+ safe_search: u8,
50
+ ) -> Result<HashMap<String, SearchResult>, EngineError> {
51
+ // Mojeek uses a `start results from this number` convention.
52
+ // So, for 10 results per page, page 0 starts at 1, page 1
53
+ // starts at 11, and so on.
54
+ let results_per_page = 10;
55
+ let start_result = results_per_page * page + 1;
56
+
57
+ let results_per_page = results_per_page.to_string();
58
+ let start_result = start_result.to_string();
59
+
60
+ let search_engines = vec![
61
+ "Bing",
62
+ "Brave",
63
+ "DuckDuckGo",
64
+ "Ecosia",
65
+ "Google",
66
+ "Lilo",
67
+ "Metager",
68
+ "Qwant",
69
+ "Startpage",
70
+ "Swisscows",
71
+ "Yandex",
72
+ "Yep",
73
+ "You",
74
+ ];
75
+ let qss = search_engines.join("%2C");
76
+ let safe = if safe_search == 0 { "0" } else { "1" };
77
+
78
+ // Mojeek detects automated requests; these are preferences that are
79
+ // able to circumvent the countermeasure. Some of these are
80
+ // not documented in their Search API.
81
+ let query_params: Vec<(&str, &str)> = vec![
82
+ ("t", results_per_page.as_str()),
83
+ ("theme", "dark"),
84
+ ("arc", "none"),
85
+ ("date", "1"),
86
+ ("cdate", "1"),
87
+ ("tlen", "100"),
88
+ ("ref", "1"),
89
+ ("hp", "minimal"),
90
+ ("lb", "en"),
91
+ ("qss", &qss),
92
+ ("safe", safe),
93
+ ];
94
+
95
+ let mut query_params_string = String::new();
96
+ for (k, v) in &query_params {
97
+ query_params_string.push_str(&format!("&{k}={v}"));
98
+ }
99
+
100
+ let url: String = match page {
101
+ 0 => {
102
+ format!("https://www.mojeek.com/search?q={query}{query_params_string}")
103
+ }
104
+ _ => {
105
+ format!(
106
+ "https://www.mojeek.com/search?q={query}&s={start_result}{query_params_string}"
107
+ )
108
+ }
109
+ };
110
+
111
+ let mut cookie_string = String::new();
112
+ for (k, v) in &query_params {
113
+ cookie_string.push_str(&format!("{k}={v}; "));
114
+ }
115
+
116
+ let header_map = HeaderMap::try_from(&HashMap::from([
117
+ ("USER_AGENT".to_string(), user_agent.to_string()),
118
+ ("REFERER".to_string(), "https://google.com/".to_string()),
119
+ (
120
+ "CONTENT_TYPE".to_string(),
121
+ "application/x-www-form-urlencoded".to_string(),
122
+ ),
123
+ ("COOKIE".to_string(), cookie_string),
124
+ ]))
125
+ .change_context(EngineError::UnexpectedError)?;
126
+
127
+ let document: Html = Html::parse_document(
128
+ &Mojeek::fetch_html_from_upstream(self, &url, header_map, client).await?,
129
+ );
130
+
131
+ if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).nth(0) {
132
+ if no_result_msg
133
+ .inner_html()
134
+ .contains("No pages found matching:")
135
+ {
136
+ return Err(Report::new(EngineError::EmptyResultSet));
137
+ }
138
+ }
139
+
140
+ // scrape all the results from the html
141
+ self.parser
142
+ .parse_for_results(&document, |title, url, desc| {
143
+ Some(SearchResult::new(
144
+ title.inner_html().trim(),
145
+ url.inner_html().trim(),
146
+ desc.inner_html().trim(),
147
+ &["mojeek"],
148
+ ))
149
+ })
150
+ }
151
+ }
src/models/engine_models.rs CHANGED
@@ -162,6 +162,10 @@ impl EngineHandler {
162
  let engine = crate::engines::librex::LibreX::new()?;
163
  ("librex", Box::new(engine))
164
  }
 
 
 
 
165
  _ => {
166
  return Err(Report::from(EngineError::NoSuchEngineFound(
167
  engine_name.to_string(),
 
162
  let engine = crate::engines::librex::LibreX::new()?;
163
  ("librex", Box::new(engine))
164
  }
165
+ "mojeek" => {
166
+ let engine = crate::engines::mojeek::Mojeek::new()?;
167
+ ("mojeek", Box::new(engine))
168
+ }
169
  _ => {
170
  return Err(Report::from(EngineError::NoSuchEngineFound(
171
  engine_name.to_string(),
websurfx/config.lua CHANGED
@@ -64,4 +64,5 @@ upstream_search_engines = {
64
  Brave = false,
65
  Startpage = false,
66
  LibreX = false,
 
67
  } -- select the upstream search engines from which the results should be fetched.
 
64
  Brave = false,
65
  Startpage = false,
66
  LibreX = false,
67
+ Mojeek = false,
68
  } -- select the upstream search engines from which the results should be fetched.