Jann Marc Villablanca neon-mmd committed on
Commit
50aa52c
1 Parent(s): 5020f36

✨ `Bing` for the search engine (#473) (#473)

Browse files

fix: add closing curly brace

- Accidentally removed in the previous PR. I hope I did not break any
build.

* feat: implement mojeek engine

* Merge branch 'rolling' into FEAT_316_bing_search_engine

* ✨ feat: implement bing engine

* Fix: include cookie string to header_map

* Fix: include <strong> tags from upstream search

* Merge branch 'rolling' into FEAT_316_bing_search_engine

Co-authored-by: neon-mmd <132049916+neon-mmd@users.noreply.github.com>

src/engines/bing.rs ADDED
@@ -0,0 +1,124 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ //! The `bing` module handles the scraping of results from the bing search engine
2
+ //! by querying the upstream bing search engine with user provided query and with a page
3
+ //! number if provided.
4
+
5
+ use std::collections::HashMap;
6
+
7
+ use regex::Regex;
8
+ use reqwest::header::HeaderMap;
9
+ use reqwest::Client;
10
+ use scraper::Html;
11
+
12
+ use crate::models::aggregation_models::SearchResult;
13
+
14
+ use crate::models::engine_models::{EngineError, SearchEngine};
15
+
16
+ use error_stack::{Report, Result, ResultExt};
17
+
18
+ use super::search_result_parser::SearchResultParser;
19
+
20
+ /// A new Bing engine type defined in-order to implement the `SearchEngine` trait which allows to
21
+ /// reduce code duplication as well as allows to create vector of different search engines easily.
22
+ pub struct Bing {
23
+ /// The parser, used to interpret the search result.
24
+ parser: SearchResultParser,
25
+ }
26
+
27
+ impl Bing {
28
+ /// Creates the Bing parser.
29
+ pub fn new() -> Result<Self, EngineError> {
30
+ Ok(Self {
31
+ parser: SearchResultParser::new(
32
+ ".b_results",
33
+ ".b_algo",
34
+ "h2 a",
35
+ ".tpcn a.tilk",
36
+ ".b_caption p",
37
+ )?,
38
+ })
39
+ }
40
+ }
41
+
42
+ #[async_trait::async_trait]
43
+ impl SearchEngine for Bing {
44
+ async fn results(
45
+ &self,
46
+ query: &str,
47
+ page: u32,
48
+ user_agent: &str,
49
+ client: &Client,
50
+ _safe_search: u8,
51
+ ) -> Result<HashMap<String, SearchResult>, EngineError> {
52
+ // Bing uses `start results from this number` convention
53
+ // So, for 10 results per page, page 0 starts at 1, page 1
54
+ // starts at 11, and so on.
55
+ let results_per_page = 10;
56
+ let start_result = results_per_page * page + 1;
57
+
58
+ let url: String = match page {
59
+ 0 => {
60
+ format!("https://www.bing.com/search?q={query}")
61
+ }
62
+ _ => {
63
+ format!("https://www.bing.com/search?q={query}&first={start_result}")
64
+ }
65
+ };
66
+
67
+ let query_params: Vec<(&str, &str)> = vec![
68
+ ("_EDGE_V", "1"),
69
+ ("SRCHD=AF", "NOFORM"),
70
+ ("_Rwho=u", "d"),
71
+ ("bngps=s", "0"),
72
+ ("_UR=QS=0&TQS", "0"),
73
+ ("_UR=QS=0&TQS", "0"),
74
+ ];
75
+
76
+ let mut cookie_string = String::new();
77
+ for (k, v) in &query_params {
78
+ cookie_string.push_str(&format!("{k}={v}; "));
79
+ }
80
+
81
+ let header_map = HeaderMap::try_from(&HashMap::from([
82
+ ("USER_AGENT".to_string(), user_agent.to_string()),
83
+ ("REFERER".to_string(), "https://google.com/".to_string()),
84
+ (
85
+ "CONTENT_TYPE".to_string(),
86
+ "application/x-www-form-urlencoded".to_string(),
87
+ ),
88
+ ("COOKIE".to_string(), cookie_string),
89
+ ]))
90
+ .change_context(EngineError::UnexpectedError)?;
91
+
92
+ let document: Html = Html::parse_document(
93
+ &Bing::fetch_html_from_upstream(self, &url, header_map, client).await?,
94
+ );
95
+
96
+ // Bing is very aggressive in finding matches
97
+ // even with the most absurd of queries. ".b_algo" is the
98
+ // class for the list item of results
99
+ if let Some(no_result_msg) = self.parser.parse_for_no_results(&document).nth(0) {
100
+ if no_result_msg
101
+ .value()
102
+ .attr("class")
103
+ .map(|classes| classes.contains("b_algo"))
104
+ .unwrap_or(false)
105
+ {
106
+ return Err(Report::new(EngineError::EmptyResultSet));
107
+ }
108
+ }
109
+
110
+ let re_span = Regex::new(r#"<span.*?>.*?(?:</span>&nbsp;·|</span>)"#).unwrap();
111
+ let re_strong = Regex::new(r#"(<strong>|</strong>)"#).unwrap();
112
+
113
+ // scrape all the results from the html
114
+ self.parser
115
+ .parse_for_results(&document, |title, url, desc| {
116
+ Some(SearchResult::new(
117
+ &re_strong.replace_all(title.inner_html().trim(), ""),
118
+ url.value().attr("href").unwrap(),
119
+ &re_span.replace_all(desc.inner_html().trim(), ""),
120
+ &["bing"],
121
+ ))
122
+ })
123
+ }
124
+ }
src/engines/mod.rs CHANGED
@@ -3,6 +3,7 @@
3
  //! provide a standard functions to be implemented for all the upstream search engine handling
4
  //! code. Moreover, it also provides a custom error for the upstream search engine handling code.
5
 
 
6
  pub mod brave;
7
  pub mod duckduckgo;
8
  pub mod librex;
 
3
  //! provide a standard functions to be implemented for all the upstream search engine handling
4
  //! code. Moreover, it also provides a custom error for the upstream search engine handling code.
5
 
6
+ pub mod bing;
7
  pub mod brave;
8
  pub mod duckduckgo;
9
  pub mod librex;
src/models/engine_models.rs CHANGED
@@ -166,6 +166,10 @@ impl EngineHandler {
166
  let engine = crate::engines::mojeek::Mojeek::new()?;
167
  ("mojeek", Box::new(engine))
168
  }
 
 
 
 
169
  _ => {
170
  return Err(Report::from(EngineError::NoSuchEngineFound(
171
  engine_name.to_string(),
 
166
  let engine = crate::engines::mojeek::Mojeek::new()?;
167
  ("mojeek", Box::new(engine))
168
  }
169
+ "bing" => {
170
+ let engine = crate::engines::bing::Bing::new()?;
171
+ ("bing", Box::new(engine))
172
+ }
173
  _ => {
174
  return Err(Report::from(EngineError::NoSuchEngineFound(
175
  engine_name.to_string(),
websurfx/config.lua CHANGED
@@ -65,4 +65,5 @@ upstream_search_engines = {
65
  Startpage = false,
66
  LibreX = false,
67
  Mojeek = false,
 
68
  } -- select the upstream search engines from which the results should be fetched.
 
65
  Startpage = false,
66
  LibreX = false,
67
  Mojeek = false,
68
+ Bing = false,
69
  } -- select the upstream search engines from which the results should be fetched.