Kajise Org committed on
Commit
fa1be5f
1 Parent(s): 8365f82

Create SearchResult.py

Browse files
Files changed (1) hide show
  1. SearchResult.py +42 -0
SearchResult.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ import requests
3
+ from bs4 import BeautifulSoup
4
+
5
class SearchResult:
    """Hold a list of search-result links and scrape readable text from them.

    Attributes:
        results: Result URLs exactly as supplied by the caller.
        user_agent: Value sent as the ``User-Agent`` header when fetching pages.
        suggestion_query: The ``did_you_mean`` value passed to the constructor.
        tailored_query: The ``tailored_query`` value passed to the constructor.
    """

    # Compiled once for the class instead of once per link. Used with
    # ``search``, so a bare "twitter" matches the same URLs as ".*twitter.*".
    _TWITTER_PATTERN = re.compile(r"twitter", re.IGNORECASE)

    # Tags whose text is collected, in the order their text is emitted.
    _RELEVANT_TAGS = ("p", "li", "h1", "h2", "h3", "h4", "h5", "h6")

    # Elements carrying any of these CSS classes are treated as page chrome.
    _EXCLUDED_CLASSES = frozenset({"ads", "header", "footer"})

    def __init__(
        self,
        results: list[str],
        user_agent: str,
        did_you_mean: str = "",
        tailored_query: str = "",
    ):
        """Store the result links and the query metadata.

        Args:
            results: Result URLs; ``parse_results`` skips the first entry.
            user_agent: ``User-Agent`` header value used for page requests.
            did_you_mean: Stored as ``suggestion_query``.
            tailored_query: Stored as ``tailored_query``.
        """
        self.results = results
        self.user_agent = user_agent
        self.suggestion_query = did_you_mean
        self.tailored_query = tailored_query

    @classmethod
    def _has_excluded_class(cls, element) -> bool:
        """Return True when *element* carries any of the excluded CSS classes."""
        classes = element.get("class") or []
        if isinstance(classes, str):
            # Parsers configured without multi-valued attributes yield a string.
            classes = classes.split()
        return not cls._EXCLUDED_CLASSES.isdisjoint(classes)

    def parse_results(self, timeout: float = 10.0) -> list[dict[str, str]]:
        """Fetch every non-Twitter result link and extract its title and text.

        The first entry of ``self.results`` is skipped.
        NOTE(review): presumably it is not a real result link -- confirm
        against the code that builds ``results``.

        Args:
            timeout: Seconds to wait for each HTTP request before giving up,
                so a single stalled server cannot block the call forever.

        Returns:
            One dict per fetched page with the keys ``"page_title"`` and
            ``"text_content"``. The text holds the stripped content of every
            paragraph, list item and heading, one per line, grouped by tag
            type in the order of ``_RELEVANT_TAGS`` (not document order).

        Raises:
            requests.RequestException: If a page cannot be fetched (including
                a timeout); the error propagates to the caller.
        """
        headers = {"User-Agent": self.user_agent}
        stripped_pages: list[dict[str, str]] = []

        for link_entry in self.results[1:]:
            if self._TWITTER_PATTERN.search(link_entry):
                continue

            response = requests.get(link_entry, headers=headers, timeout=timeout)
            soup = BeautifulSoup(response.text, "html.parser")

            # ``soup.title`` is None for pages without a <title> tag, and
            # ``.string`` is None for an empty or nested title.
            title_tag = soup.title
            raw_title = title_tag.string if title_tag is not None else None
            title = str(raw_title) if raw_title else "No title provided"

            lines: list[str] = []
            for tag in self._RELEVANT_TAGS:
                for element in soup.find_all(tag):
                    # Filter on the full class list: a ``class_=`` callable is
                    # tested against each class separately, so it would let
                    # ``class="ads banner"`` through.
                    if self._has_excluded_class(element):
                        continue
                    text = element.text.strip()
                    if text:
                        lines.append(text)

            stripped_pages.append({
                "page_title": title,
                "text_content": "".join(f"{line}\n" for line in lines),
            })

        return stripped_pages