Spaces:
Runtime error
Runtime error
Kajise Org
committed on
Commit
•
fa1be5f
1
Parent(s):
8365f82
Create SearchResult.py
Browse files- SearchResult.py +42 -0
SearchResult.py
ADDED
@@ -0,0 +1,42 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import re
|
2 |
+
import requests
|
3 |
+
from bs4 import BeautifulSoup
|
4 |
+
|
5 |
+
class SearchResult:
    """Container for search-result links plus a scraper for their page text.

    Attributes:
        results: Result URLs. ``parse_results`` skips the first entry.
        user_agent: Value sent as the ``User-Agent`` header on every fetch.
        suggestion_query: The "did you mean" suggestion, or ``""``.
        tailored_query: The tailored query string, or ``""``.
    """

    # Seconds to wait on a single page before requests raises a Timeout;
    # without it one unresponsive host would block the whole parse forever.
    REQUEST_TIMEOUT = 30

    # Compiled once instead of on every loop iteration.
    _TWITTER_PATTERN = re.compile(r".*twitter.*", re.IGNORECASE)

    # Tags whose text is collected, in the order their text is emitted.
    _RELEVANT_TAGS = ("p", "li", "h1", "h2", "h3", "h4", "h5", "h6")

    # Elements carrying any of these CSS classes are left out of the text.
    _EXCLUDED_CLASSES = frozenset({"ads", "header", "footer"})

    def __init__(self, results: list[str], user_agent: str, did_you_mean: str = "", tailored_query: str = ""):
        """Store the raw result links and the query metadata."""
        self.results = results
        self.user_agent = user_agent
        self.suggestion_query = did_you_mean
        self.tailored_query = tailored_query

    def parse_results(self) -> list[dict[str, str]]:
        """Fetch every non-Twitter result link and extract its readable text.

        The first entry of ``self.results`` is skipped, as are links whose
        URL contains "twitter" (case-insensitive).

        Returns:
            One ``{"page_title": ..., "text_content": ...}`` dict per fetched
            page, in link order. ``page_title`` falls back to
            ``"No title provided"`` when the page has no usable ``<title>``.

        Raises:
            requests.RequestException: If a page cannot be fetched (including
                exceeding ``REQUEST_TIMEOUT``).
        """
        headers = {"User-Agent": self.user_agent}
        stripped_pages: list[dict[str, str]] = []

        for link_entry in self.results[1:]:
            if self._TWITTER_PATTERN.search(link_entry):
                continue

            response = requests.get(
                link_entry, headers=headers, timeout=self.REQUEST_TIMEOUT
            )
            soup = BeautifulSoup(response.text, "html.parser")

            # soup.title is None when the page has no <title>; guard it so a
            # title-less page yields the fallback instead of AttributeError.
            title_tag = soup.title
            title = (
                title_tag.string if title_tag is not None else None
            ) or "No title provided"

            stripped_pages.append({
                "page_title": title,
                "text_content": self._extract_text(soup),
            })

        return stripped_pages

    @classmethod
    def _extract_text(cls, soup) -> str:
        """Return the newline-terminated text of all relevant, non-excluded tags."""
        lines = []
        # Iterate tag by tag (not in document order) so the output keeps the
        # grouping callers already receive: all <p>, then all <li>, ...
        for tag in cls._RELEVANT_TAGS:
            for element in soup.find_all(tag):
                if cls._has_excluded_class(element):
                    continue
                text = element.text.strip()
                if text:
                    lines.append(text + "\n")
        return "".join(lines)

    @classmethod
    def _has_excluded_class(cls, element) -> bool:
        """Tell whether *element* carries any of the excluded CSS classes."""
        classes = element.get("class") or []
        if isinstance(classes, str):
            # Some parsers expose ``class`` as one space-separated string.
            classes = classes.split()
        # Check the full class list: a per-class predicate passed to
        # find_all(class_=...) would accept "ads big" because "big" passes.
        return not cls._EXCLUDED_CLASSES.isdisjoint(classes)
|