Hansimov committed on
Commit
e773696
1 Parent(s): 7d44e75

:gem: [Feature] New WebpageContentExtractor: extract webpage content as clean markdown

Browse files
documents/webpage_content_extractor.py ADDED
@@ -0,0 +1,115 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import re
2
+ from pathlib import Path
3
+ from pprint import pprint
4
+ from bs4 import BeautifulSoup, Comment, NavigableString, Tag
5
+ from tiktoken import get_encoding as tiktoken_get_encoding
6
+ from utils.logger import logger
7
+ from markdownify import markdownify
8
+
9
+ # from trafilatura import extract as extract_text_from_html
10
+ # from inscriptis import get_text as extract_text_from_html
11
+ # from html_text import extract_text as extract_text_from_html
12
+ # from readabilipy import simple_json_from_html_string as extract_text_from_html
13
+
14
+
15
class WebpageContentExtractor:
    """Extract the readable content of a saved HTML page as markdown.

    The page is first pruned of elements that are empty, use an ignored tag,
    or whose ``class``/``id`` contains an ignored keyword; the remaining HTML
    is then converted with ``markdownify``.
    """

    # Tags removed regardless of their class or id.
    IGNORE_TAGS = ("script", "style", "button")

    # Keywords that mark an element for removal when they occur anywhere
    # (case-insensitive substring match) in its class names or its id.
    IGNORE_CLASSES = (
        "sidebar",
        "footer",
        "related",
        "comment",
        "topbar",
        "menu",
        "offcanvas",
        "navbar",
    )

    # Compiled once instead of being rebuilt and re-looked-up per element.
    _IGNORE_PATTERN = re.compile("|".join(IGNORE_CLASSES), flags=re.IGNORECASE)

    def __init__(self):
        self.tokenizer = tiktoken_get_encoding("cl100k_base")

    def count_tokens(self, text):
        """Return the number of tokens ``self.tokenizer`` produces for ``text``."""
        # Web pages may contain literal special-token text such as
        # "<|endoftext|>"; with tiktoken's default settings encode() raises
        # ValueError on it. An empty ``disallowed_special`` makes such text
        # be encoded like any other text.
        tokens = self.tokenizer.encode(text, disallowed_special=())
        return len(tokens)

    @staticmethod
    def _class_and_id(element):
        """Return ``(class_str, id_str)`` for ``element``; empty strings on failure."""
        try:
            class_attr = element.get("class") or []
            id_attr = element.get("id") or ""
            if isinstance(class_attr, str):
                class_str = class_attr
            else:
                class_str = " ".join(class_attr)
            return class_str, str(id_attr)
        except Exception:
            # Best effort, as in the original code: an element whose
            # attributes cannot be read (e.g. one that was already destroyed
            # together with a removed ancestor) is treated as having no
            # class and no id rather than aborting the whole filtering pass.
            return "", ""

    def filter_html_str(self, html_str):
        """Remove unwanted elements from ``html_str`` and return the new HTML.

        An element is removed when any of the following holds:

        * its text is empty after stripping whitespace,
        * its tag name is in ``IGNORE_TAGS``,
        * its class names or its id match ``IGNORE_CLASSES``.

        NOTE(review): the class/id test is a substring search, so a container
        such as ``<body class="has-sidebar">`` is removed together with all of
        its content.
        """
        soup = BeautifulSoup(html_str, "html.parser")

        removed_element_count = 0
        for element in soup.find_all():
            # find_all() returns a snapshot, so descendants of an element we
            # already decomposed are still visited. Skip them so they are not
            # decomposed and counted a second time.
            if getattr(element, "decomposed", False):
                continue

            class_str, id_str = self._class_and_id(element)

            if (
                not element.text.strip()
                or element.name in self.IGNORE_TAGS
                or self._IGNORE_PATTERN.search(class_str)
                or self._IGNORE_PATTERN.search(id_str)
            ):
                element.decompose()
                removed_element_count += 1

        logger.note(
            f"Elements Removed/Remained: {removed_element_count}/{len(soup.find_all())}"
        )

        return str(soup)

    def extract(self, html_path):
        """Read the HTML file at ``html_path`` and return its content as markdown.

        The result is also stored on ``self.main_content``. The markdown and
        its token count are written to the logger.
        """
        logger.note(f"Extracing content from:{html_path}")
        with open(html_path, "r", encoding="utf-8") as f:
            html_str = f.read()

        html_str = self.filter_html_str(html_str)

        # Other extractors (e.g. readabilipy via ``extract_text_from_html``)
        # were tried at this step; converting the filtered HTML directly with
        # markdownify is the current approach.
        # ``strip`` takes a list of tag names; <a> is stripped so links are
        # not rendered as markdown link syntax.
        markdown = markdownify(html_str, strip=["a"])
        # Collapse runs of three or more newlines into a single blank line.
        self.main_content = re.sub(r"\n{3,}", "\n\n", markdown)
        logger.line(self.main_content)

        token_count = self.count_tokens(self.main_content)
        logger.note(f"Token Count: {token_count}")
        return self.main_content
103
+
104
+
105
if __name__ == "__main__":
    # Manual smoke run against one saved page. Other saved samples that can
    # be substituted for ``sample_name``:
    #   "stackoverflow.com_questions_295135_turn-a-string-into-a-valid-filename.html"
    #   "docs.python.org_zh-cn_3_tutorial_interpreter.html"
    sample_name = "www.liaoxuefeng.com_wiki_1016959663602400_1017495723838528.html"

    # Samples are looked up in "files/urls" under the directory one level
    # above the folder containing this file.
    urls_dir = Path(__file__).parents[1] / "files" / "urls"
    html_path = urls_dir / sample_name

    extractor = WebpageContentExtractor()
    main_content = extractor.extract(html_path)