|
|
|
|
|
|
|
use std::{ |
|
collections::HashMap, |
|
io::{BufReader, Read}, |
|
time::Duration, |
|
}; |
|
|
|
use super::{ |
|
aggregation_models::{EngineErrorInfo, SearchResult, SearchResults}, |
|
user_agent::random_user_agent, |
|
}; |
|
use error_stack::Report; |
|
use rand::Rng; |
|
use regex::Regex; |
|
use std::{fs::File, io::BufRead}; |
|
use tokio::task::JoinHandle; |
|
|
|
use crate::{ |
|
engines::engine_models::{EngineError, EngineHandler}, |
|
handler::paths::{file_path, FileType}, |
|
}; |
|
|
|
|
|
type FutureVec = Vec<JoinHandle<Result<HashMap<String, SearchResult>, Report<EngineError>>>>; |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pub async fn aggregate( |
|
query: String, |
|
page: u32, |
|
random_delay: bool, |
|
debug: bool, |
|
upstream_search_engines: Vec<EngineHandler>, |
|
request_timeout: u8, |
|
) -> Result<SearchResults, Box<dyn std::error::Error>> { |
|
let user_agent: String = random_user_agent(); |
|
|
|
|
|
if random_delay || !debug { |
|
let mut rng = rand::thread_rng(); |
|
let delay_secs = rng.gen_range(1..10); |
|
tokio::time::sleep(Duration::from_secs(delay_secs)).await; |
|
} |
|
|
|
let mut names: Vec<&str> = vec![]; |
|
|
|
|
|
let mut tasks: FutureVec = FutureVec::new(); |
|
|
|
for engine_handler in upstream_search_engines { |
|
let (name, search_engine) = engine_handler.into_name_engine(); |
|
names.push(name); |
|
let query: String = query.clone(); |
|
let user_agent: String = user_agent.clone(); |
|
tasks.push(tokio::spawn(async move { |
|
search_engine |
|
.results(query, page, user_agent.clone(), request_timeout) |
|
.await |
|
})); |
|
} |
|
|
|
|
|
let mut responses = Vec::with_capacity(tasks.len()); |
|
|
|
for task in tasks { |
|
if let Ok(result) = task.await { |
|
responses.push(result) |
|
} |
|
} |
|
|
|
|
|
let mut result_map: HashMap<String, SearchResult> = HashMap::new(); |
|
let mut engine_errors_info: Vec<EngineErrorInfo> = Vec::new(); |
|
|
|
let mut handle_error = |error: Report<EngineError>, engine_name: String| { |
|
log::error!("Engine Error: {:?}", error); |
|
engine_errors_info.push(EngineErrorInfo::new( |
|
error.downcast_ref::<EngineError>().unwrap(), |
|
engine_name, |
|
)); |
|
}; |
|
|
|
for _ in 0..responses.len() { |
|
let response = responses.pop().unwrap(); |
|
let engine = names.pop().unwrap().to_string(); |
|
|
|
if result_map.is_empty() { |
|
match response { |
|
Ok(results) => { |
|
result_map = results.clone(); |
|
} |
|
Err(error) => { |
|
handle_error(error, engine); |
|
} |
|
} |
|
continue; |
|
} |
|
|
|
match response { |
|
Ok(result) => { |
|
result.into_iter().for_each(|(key, value)| { |
|
result_map |
|
.entry(key) |
|
.and_modify(|result| { |
|
result.add_engines(engine.clone()); |
|
}) |
|
.or_insert_with(|| -> SearchResult { value }); |
|
}); |
|
} |
|
Err(error) => { |
|
handle_error(error, engine); |
|
} |
|
} |
|
} |
|
|
|
let mut blacklist_map: HashMap<String, SearchResult> = HashMap::new(); |
|
filter_with_lists( |
|
&mut result_map, |
|
&mut blacklist_map, |
|
&file_path(FileType::BlockList)?, |
|
)?; |
|
|
|
filter_with_lists( |
|
&mut blacklist_map, |
|
&mut result_map, |
|
&file_path(FileType::AllowList)?, |
|
)?; |
|
|
|
drop(blacklist_map); |
|
|
|
let results: Vec<SearchResult> = result_map.into_values().collect(); |
|
|
|
Ok(SearchResults::new( |
|
results, |
|
query.to_string(), |
|
engine_errors_info, |
|
)) |
|
} |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
pub fn filter_with_lists( |
|
map_to_be_filtered: &mut HashMap<String, SearchResult>, |
|
resultant_map: &mut HashMap<String, SearchResult>, |
|
file_path: &str, |
|
) -> Result<(), Box<dyn std::error::Error>> { |
|
let mut reader = BufReader::new(File::open(file_path)?); |
|
|
|
for line in reader.by_ref().lines() { |
|
let re = Regex::new(&line?)?; |
|
|
|
|
|
for (url, search_result) in map_to_be_filtered.clone().into_iter() { |
|
if re.is_match(&url.to_lowercase()) |
|
|| re.is_match(&search_result.title.to_lowercase()) |
|
|| re.is_match(&search_result.description.to_lowercase()) |
|
{ |
|
|
|
resultant_map.insert(url.clone(), map_to_be_filtered.remove(&url).unwrap()); |
|
} |
|
} |
|
} |
|
|
|
Ok(()) |
|
} |
|
|
|
#[cfg(test)]
mod tests {
    use super::*;
    use std::collections::HashMap;
    use std::io::Write;
    use tempfile::NamedTempFile;

    /// Builds a `(key, SearchResult)` pair whose key is the result's URL.
    fn entry(
        url: &str,
        title: &str,
        description: &str,
        engines: &[&str],
    ) -> (String, SearchResult) {
        (
            url.to_string(),
            SearchResult {
                title: title.to_string(),
                url: url.to_string(),
                description: description.to_string(),
                engine: engines.iter().map(|name| name.to_string()).collect(),
            },
        )
    }

    /// Fixture entry for `https://www.example.com`.
    fn example_entry() -> (String, SearchResult) {
        entry(
            "https://www.example.com",
            "Example Domain",
            "This domain is for use in illustrative examples in documents.",
            &["Google", "Bing"],
        )
    }

    /// Fixture entry for `https://www.rust-lang.org/`.
    fn rust_entry() -> (String, SearchResult) {
        entry(
            "https://www.rust-lang.org/",
            "Rust Programming Language",
            "A systems programming language that runs blazingly fast, prevents segfaults, and guarantees thread safety.",
            &["Google", "DuckDuckGo"],
        )
    }

    /// Writes one pattern per line into a fresh temporary list file and flushes it.
    fn list_file(patterns: &[&str]) -> std::io::Result<NamedTempFile> {
        let mut file = NamedTempFile::new()?;
        for pattern in patterns {
            writeln!(file, "{}", pattern)?;
        }
        file.flush()?;
        Ok(file)
    }

    /// Plain-word patterns move every matching result into the resultant map.
    #[test]
    fn test_filter_with_lists() -> Result<(), Box<dyn std::error::Error>> {
        let mut map_to_be_filtered: HashMap<String, SearchResult> =
            vec![example_entry(), rust_entry()].into_iter().collect();
        let mut resultant_map = HashMap::new();

        let file = list_file(&["example", "rust"])?;

        filter_with_lists(
            &mut map_to_be_filtered,
            &mut resultant_map,
            file.path().to_str().unwrap(),
        )?;

        assert_eq!(resultant_map.len(), 2);
        assert!(resultant_map.contains_key("https://www.example.com"));
        assert!(resultant_map.contains_key("https://www.rust-lang.org/"));
        assert_eq!(map_to_be_filtered.len(), 0);

        Ok(())
    }

    /// A wildcard pattern only moves the results it actually matches.
    #[test]
    fn test_filter_with_lists_wildcard() -> Result<(), Box<dyn std::error::Error>> {
        let mut map_to_be_filtered: HashMap<String, SearchResult> =
            vec![example_entry(), rust_entry()].into_iter().collect();
        let mut resultant_map = HashMap::new();

        let file = list_file(&["ex.*le"])?;

        filter_with_lists(
            &mut map_to_be_filtered,
            &mut resultant_map,
            file.path().to_str().unwrap(),
        )?;

        assert_eq!(resultant_map.len(), 1);
        assert!(resultant_map.contains_key("https://www.example.com"));
        assert_eq!(map_to_be_filtered.len(), 1);
        assert!(map_to_be_filtered.contains_key("https://www.rust-lang.org/"));

        Ok(())
    }

    /// A list file that does not exist is reported as an error.
    #[test]
    fn test_filter_with_lists_file_not_found() {
        let mut map_to_be_filtered = HashMap::new();
        let mut resultant_map = HashMap::new();

        let result = filter_with_lists(
            &mut map_to_be_filtered,
            &mut resultant_map,
            "non-existent-file.txt",
        );

        assert!(result.is_err());
    }

    /// A line that is not a valid regular expression is reported as an error.
    #[test]
    fn test_filter_with_lists_invalid_regex() {
        let mut map_to_be_filtered: HashMap<String, SearchResult> =
            vec![example_entry()].into_iter().collect();
        let mut resultant_map = HashMap::new();

        let file = list_file(&["example("]).unwrap();

        let result = filter_with_lists(
            &mut map_to_be_filtered,
            &mut resultant_map,
            file.path().to_str().unwrap(),
        );

        assert!(result.is_err());
    }
}
|
|