Spaces:
Paused
Paused
from dataclasses import dataclass | |
import re | |
from typing import Iterator, List | |
from langchain.docstore.document import Document | |
from langchain.document_loaders.base import BaseLoader | |
from bs4 import BeautifulSoup, Tag, ResultSet | |
import requests | |
# Tag-name filter for section headers: matches exactly "h2" and "h3".
# BeautifulSoup tests a compiled pattern against tag names with ``search()``,
# so the pattern is anchored; otherwise any tag whose name merely contains
# "h2" or "h3" would be treated as a header, unlike the exact comparison used
# when a section's body is collected.
RE_HEADERS = re.compile(r"^h[23]$")
@dataclass
class Content:
    """One section of a page: a header plus the elements collected after it.

    Instances are created positionally as ``Content(name, title, text, body)``;
    the ``@dataclass`` decorator supplies the ``__init__`` that makes this
    possible (a plain class with bare annotations accepts no arguments).
    """

    # ``name`` attribute of the anchor found before the header ("" if none).
    name: str
    # Text of the header element itself.
    title: str
    # Concatenated text of the elements in ``body``.
    text: str
    # The header followed by the elements gathered for this section.
    body: list[Tag]
def _get_anchor_name(header: Tag) -> str:
    """Return the ``name`` attribute of the closest ``<a>`` before ``header``.

    Walks ``header.previous_elements`` and stops at the first element whose
    tag name is ``"a"``.

    Returns:
        That anchor's ``name`` attribute, or ``""`` when the anchor has no
        such attribute or when no ``<a>`` element precedes the header.
    """
    nearest_anchor = next(
        (node for node in header.previous_elements if node.name == "a"),
        None,
    )
    if nearest_anchor is None:
        return ""
    return nearest_anchor.attrs.get("name", "")
def _reversed_remove_last_anchor(body: list[Tag]) -> Iterator[Tag]:
    """Yield ``body`` back to front, minus its last ``<a>`` and what follows it.

    The last element whose tag name is ``"a"`` and every element after it are
    dropped; the remaining elements are yielded in reverse order.
    (Presumably the trailing anchor belongs to the next section - confirm
    against the page markup.)

    When ``body`` contains no ``<a>`` element there is nothing to remove, so
    every element is yielded. Previously such a body produced no elements at
    all, silently discarding the whole section.

    Args:
        body: Elements of one section, in document order.

    Yields:
        The retained elements, last to first.
    """
    # Index of the first element to discard; len(body) means "discard nothing".
    cut = len(body)
    for index in range(len(body) - 1, -1, -1):
        if body[index].name == "a":
            cut = index
            break
    yield from reversed(body[:cut])
def _remove_last_anchor(body: list[Tag]) -> Iterator[Tag]:
    """Return the output of ``_reversed_remove_last_anchor`` in forward order.

    The helper yields its elements back to front; they are materialised into
    a list so the sequence can be reversed again, restoring the order the
    elements had in ``body``.
    """
    back_to_front = list(_reversed_remove_last_anchor(body))
    return reversed(back_to_front)
def _get_bodys_text(body: list[Tag]) -> str:
    """Return the concatenated text of ``body`` without repeating nested text.

    ``body`` may hold both a container element and that container's own
    descendants (it is gathered by walking ``next_elements``). Calling
    ``get_text()`` on a container already returns the text of everything
    inside it, so calling it again on each descendant would repeat the same
    text. Elements that have an ancestor in ``body`` are therefore skipped.

    Args:
        body: Elements of one section, in document order.

    Returns:
        The text of every element in ``body`` that is not nested inside
        another element of ``body``, joined without a separator; ``""`` for
        an empty ``body``.
    """
    # Membership is decided by identity, not equality, so two distinct
    # elements with identical markup are never mistaken for one another.
    collected = {id(tag) for tag in body}
    return "".join(
        tag.get_text()
        for tag in body
        if not any(id(parent) in collected for parent in tag.parents)
    )
def _get_child_content(header: Tag) -> Content:
    """Build the ``Content`` for the section that ``header`` introduces.

    The section consists of ``header`` followed by the elements from
    ``header.next_elements`` up to, but not including, the next ``h2`` or
    ``h3``. The very first following element is always skipped. The gathered
    elements are then passed through ``_remove_last_anchor`` before their
    text is extracted.

    Returns:
        A ``Content`` holding the anchor name found for the header, the
        header's text, the section text and the section's elements.
    """
    heading_text = header.get_text()
    anchor_name = _get_anchor_name(header)

    following = iter(header.next_elements)
    # Skip the first following element (presumably the header's own text
    # node, which the header already accounts for).
    next(following, None)

    section = [header]
    for element in following:
        if element.name in ("h2", "h3"):
            break
        section.append(element)

    trimmed_section = list(_remove_last_anchor(section))
    section_text = _get_bodys_text(trimmed_section)
    return Content(anchor_name, heading_text, section_text, trimmed_section)
def get_contents(headers: ResultSet[Tag]) -> Iterator[Content]:
    """Lazily yield one ``Content`` per header, in the order given.

    Each header is converted with ``_get_child_content`` only when the
    corresponding item is requested from the returned generator.
    """
    yield from (_get_child_content(heading) for heading in headers)
class NVDAUserGuideLoader(BaseLoader):
    """Load one HTML page and split it into a ``Document`` per h2/h3 section.

    The page at ``url`` is downloaded and parsed; every element matched by
    ``RE_HEADERS`` starts a section, and each section becomes one
    ``Document`` whose metadata records the category, the section's anchor
    name, a URL pointing at that anchor, and the header text.
    """

    def __init__(self, url: str, category: str, timeout: float = 30.0) -> None:
        """Store the loader configuration.

        Args:
            url: Address of the HTML page to load.
            category: Value stored under ``"category"`` in every document's
                metadata.
            timeout: Seconds to wait for the server before giving up.
        """
        self.url = url
        self.category = category
        self.timeout = timeout

    def fetch(self) -> BeautifulSoup:
        """Download ``self.url`` and return it parsed with the lxml parser.

        Raises:
            requests.RequestException: If the request fails, times out, or
                the server answers with an HTTP error status.
        """
        # Without a timeout requests may wait forever on an unresponsive host.
        res = requests.get(self.url, timeout=self.timeout)
        # Fail loudly rather than parsing an error page as if it were the guide.
        res.raise_for_status()
        return BeautifulSoup(res.content, 'lxml')

    def lazy_load(self) -> Iterator[Document]:
        """Yield one ``Document`` per header-delimited section of the page."""
        soup = self.fetch()
        headers = soup.find_all(RE_HEADERS)
        for content in get_contents(headers):
            name = content.name
            metadata = {
                "category": self.category,
                "source": name,
                "url": f"{self.url}#{name}",
                "title": content.title,
            }
            yield Document(page_content=content.text, metadata=metadata)

    def load(self) -> List[Document]:
        """Return every document produced by ``lazy_load`` as a list."""
        return list(self.lazy_load())
if __name__ == "__main__":
    # Manual smoke test: load the English NVDA user guide and print the
    # resulting documents.
    user_guide_url = "https://www.nvaccess.org/files/nvda/documentation/userGuide.html"
    documents = NVDAUserGuideLoader(user_guide_url, "en-nvda-user-guide").load()
    print(documents)