Spaces:
Paused
Paused
File size: 2,875 Bytes
99d3f35 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 |
from dataclasses import dataclass
import re
from typing import Iterator, List
from langchain.docstore.document import Document
from langchain.document_loaders.base import BaseLoader
from bs4 import BeautifulSoup, Tag, ResultSet
import requests
# Section headers of interest in the user guide: <h2> and <h3> tag names.
RE_HEADERS = re.compile(r"h[23]")
@dataclass
class Content:
    """One section of the user guide, extracted from an h2/h3 header."""
    # Anchor name of the nearest <a name=...> preceding the header;
    # used to build a fragment URL ("" when no anchor was found).
    name: str
    # Header text of the section.
    title: str
    # Concatenated text of all elements in ``body``.
    text: str
    # The parse-tree elements that make up the section (header included,
    # trailing anchor of the next section removed).
    body: list[Tag]
def _get_anchor_name(header: Tag) -> str:
    """Return the ``name`` attribute of the nearest <a> element that was
    parsed before *header*, or "" when no such anchor exists (or the
    anchor carries no ``name`` attribute)."""
    anchor = next(
        (element for element in header.previous_elements if element.name == "a"),
        None,
    )
    if anchor is None:
        return ""
    return anchor.attrs.get("name", "")
def _reversed_remove_last_anchor(body: list[Tag]) -> Iterator[Tag]:
has_anchor = False
for tag in reversed(body):
if not has_anchor:
if tag.name == "a":
has_anchor = True
continue
else:
yield tag
def _remove_last_anchor(body: list[Tag]) -> Iterator[Tag]:
    """Return *body*'s elements in document order with the trailing
    anchor (and anything after it) removed."""
    kept = list(_reversed_remove_last_anchor(body))
    kept.reverse()
    return iter(kept)
def _get_bodys_text(body: list[Tag]) -> str:
text = ""
for tag in body:
text += tag.get_text()
return text
def _get_child_content(header: Tag) -> Content:
    """Build a Content for the section that starts at *header*.

    Collects every parse-tree element after the header up to (but not
    including) the next h2/h3, strips the trailing anchor that belongs
    to the following section, then concatenates the remaining text.
    """
    title = header.get_text()
    name = _get_anchor_name(header)
    body = [header]
    for i, child in enumerate(header.next_elements):
        # i == 0 is the first element parsed after the header — presumably
        # the header's own title string, whose text is already covered by
        # ``header`` itself; skipping it avoids doubling the title.
        # TODO(review): confirm against bs4's next_elements traversal.
        if i == 0:
            continue
        # Stop at the next section header.
        if child.name == "h2" or child.name == "h3":
            break
        body.append(child)
    # NOTE(review): ``body`` collects both Tags and their nested string
    # children from next_elements, so _get_bodys_text may count nested
    # text more than once — verify the extracted text against the page.
    removed_next_anchor_body = list(_remove_last_anchor(body))
    text = _get_bodys_text(removed_next_anchor_body)
    return Content(name,
                   title,
                   text,
                   removed_next_anchor_body
                   )
def get_contents(headers: ResultSet[Tag]) -> Iterator[Content]:
    """Yield one Content per h2/h3 header, in document order."""
    yield from map(_get_child_content, headers)
class NVDAUserGuideLoader(BaseLoader):
    """Load the NVDA user-guide HTML page as one Document per h2/h3 section.

    Each section's anchor name becomes the ``source`` metadata and the
    fragment of the ``url`` metadata; ``category`` is stored verbatim on
    every document.
    """

    def __init__(self, url: str, category: str, timeout: float = 30.0) -> None:
        """
        Args:
            url: URL of the user-guide HTML page.
            category: Value stored in each document's ``category`` metadata.
            timeout: Seconds before the HTTP request is abandoned.
                (The original call had no timeout and could hang forever.)
        """
        self.url = url
        self.category = category
        self.timeout = timeout

    def fetch(self) -> BeautifulSoup:
        """Download the page and parse it with the lxml parser."""
        res = requests.get(self.url, timeout=self.timeout)
        return BeautifulSoup(res.content, 'lxml')

    def lazy_load(self) -> Iterator[Document]:
        """Yield one Document per h2/h3 section, lazily."""
        soup = self.fetch()
        headers = soup.find_all(RE_HEADERS)
        for content in get_contents(headers):
            name = content.name
            metadata = {
                "category": self.category,
                "source": name,
                "url": f"{self.url}#{name}",
                "title": content.title,
            }
            yield Document(page_content=content.text, metadata=metadata)

    def load(self) -> List[Document]:
        """Eagerly load every section into a list."""
        return list(self.lazy_load())
if __name__ == "__main__":
    # Smoke test: download the English user guide and dump the documents.
    guide_url = "https://www.nvaccess.org/files/nvda/documentation/userGuide.html"
    loader = NVDAUserGuideLoader(guide_url, "en-nvda-user-guide")
    print(loader.load())
|