Spaces:
Paused
Paused
from langchain.document_loaders import ReadTheDocsLoader | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain.embeddings import OpenAIEmbeddings | |
from langchain.vectorstores import Qdrant | |
# from qdrant_client import QdrantClient | |
from nvda_ug_loader import NVDAUserGuideLoader | |
from config import DB_CONFIG | |
# Chunk size handed to RecursiveCharacterTextSplitter in get_text_chunk.
CHUNK_SIZE = 500
def _remove_prefix_path(p: str): | |
prefix = "data/rtdocs/nvdajp-book.readthedocs.io/" | |
return p.removeprefix(prefix) | |
def get_documents(path: str):
    """Yield the documents loaded from *path* with extra metadata attached.

    Each document returned by ``ReadTheDocsLoader`` keeps its original
    metadata and additionally gets a ``category``, a ``source`` with the
    download-directory prefix stripped by ``_remove_prefix_path``, and a
    ``url`` formed by appending that source to the site's base URL.
    """
    site_root = "https://nvdajp-book.readthedocs.io/"
    label = "ja-book"
    for page in ReadTheDocsLoader(path, encoding="utf-8").load():
        relative = _remove_prefix_path(page.metadata["source"])
        # Later keys win, so "source" is overwritten with the stripped path.
        page.metadata = {
            **page.metadata,
            "category": label,
            "source": relative,
            "url": f"{site_root}{relative}",
        }
        yield page
def get_text_chunk(docs):
    """Split *docs* into chunks and return the result.

    Uses ``RecursiveCharacterTextSplitter`` with ``chunk_size=CHUNK_SIZE`` and
    ``chunk_overlap=0``.
    """
    return RecursiveCharacterTextSplitter(
        chunk_size=CHUNK_SIZE,
        chunk_overlap=0,
    ).split_documents(docs)
def store(texts):
    """Embed *texts* with ``OpenAIEmbeddings`` and pass them to Qdrant.

    Connection details come from ``DB_CONFIG``, which is unpacked as
    ``(url, api_key, collection_name)`` and forwarded to
    ``Qdrant.from_documents``.
    """
    embedder = OpenAIEmbeddings()
    url, api_key, collection_name = DB_CONFIG
    Qdrant.from_documents(
        texts,
        embedder,
        url=url,
        api_key=api_key,
        collection_name=collection_name,
    )
def rtd_main(path: str):
    """Load the documents under *path*, chunk them, and store the chunks."""
    store(get_text_chunk(get_documents(path)))
def nul_main(url: str):
    """Load the user guide at *url*, chunk it, and store the chunks.

    The category passed to ``NVDAUserGuideLoader`` is ``"ja-nvda-user-guide"``
    when *url* contains ``www.nvda.jp`` and ``"en-nvda-user-guide"`` otherwise.
    """
    is_nvda_jp = "www.nvda.jp" in url
    category = "ja-nvda-user-guide" if is_nvda_jp else "en-nvda-user-guide"
    pages = NVDAUserGuideLoader(url, category).load()
    store(get_text_chunk(pages))
if __name__ == "__main__":
    # Usage:
    #   $ python store.py rtd "data/rtdocs/nvdajp-book.readthedocs.io/ja/latest"
    #   $ python store.py nul "https://www.nvaccess.org/files/nvda/documentation/userGuide.html"
    #   $ python store.py nul "https://www.nvda.jp/nvda2023.1jp/ja/userGuide.html"
    import sys

    # Map each supported store type to the function that ingests it.
    handlers = {"rtd": rtd_main, "nul": nul_main}

    if len(sys.argv) != 3:
        # sys.exit with a string writes it to stderr and exits with status 1,
        # so shell callers can detect the failed invocation.
        sys.exit("No args, you need two args for type, html_path")

    type_, path = sys.argv[1:]
    if type_ not in handlers:
        sys.exit("No type for store")
    handlers[type_](path)