Hansimov committed on
Commit ef3de03
1 Parent(s): d6015f4

:gem: [Feature] New SearchResultsExtractor: title, site, link, abstract

documents/__init__.py ADDED
File without changes
documents/search_results_extractor.py ADDED
@@ -0,0 +1,49 @@
+ from bs4 import BeautifulSoup
+ from pathlib import Path
+
+
+ class SearchResultsExtractor:
+     def __init__(self) -> None:
+         pass
+
+     def load_html(self, html_path):
+         with open(html_path, "r", encoding="utf-8") as f:
+             html = f.read()
+         self.soup = BeautifulSoup(html, "html.parser")
+
+     def extract_search_results(self):
+         search_result_elements = self.soup.find_all("div", class_="g")
+
+         for result in search_result_elements:
+             site = result.find("cite").find_previous("span").text
+             link = result.find("a")["href"]
+             title = result.find("h3").text
+
+             abstract_element = result.find("div", {"data-sncf": "1"})
+             if abstract_element is None:
+                 abstract_element = result.find("div", class_="ITZIwc")
+             abstract = abstract_element.text.strip()
+
+             print(
+                 f"{title}\n" f" - {site}\n" f" - {link}\n" f" - {abstract}\n" f"\n"
+             )
+
+     def extract_related_questions(self):
+         related_questions = self.soup.find_all("div", class_="related-question-pair")
+         for question in related_questions:
+             print(question)
+             # print(question.find("a")["href"])
+             # print(question.find("a").text)
+
+     def extract(self, html_path):
+         self.load_html(html_path)
+         self.extract_search_results()
+
+
+ if __name__ == "__main__":
+     html_path_root = Path(__file__).parents[1] / "files"
+     # html_filename = "python教程"
+     html_filename = "python_tutorials"
+     html_path = html_path_root / f"{html_filename}.html"
+     extractor = SearchResultsExtractor()
+     extractor.extract(html_path)
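
For reference, a minimal usage sketch of the new extractor, assuming a search results page has already been saved under files/ as in the __main__ block above; the import path documents.search_results_extractor follows the file added in this commit, and the filename is the same hypothetical one used there:

    from pathlib import Path
    from documents.search_results_extractor import SearchResultsExtractor

    # Assumes <repo_root>/files/python_tutorials.html exists, matching the __main__ block above.
    html_path = Path("files") / "python_tutorials.html"

    extractor = SearchResultsExtractor()
    extractor.extract(html_path)  # prints title, site, link, and abstract for each result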