medivocate / src /vector_store /document_loader.py
alexneakameni's picture
Medivocate : An AI-powered platform exploring African history, culture, and traditional medicine, fostering understanding and appreciation of the continent's rich heritage.
15aea1e verified
import json
import os
from concurrent.futures import ThreadPoolExecutor
from glob import glob
from typing import List
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import DirectoryLoader, TextLoader
from langchain_core.documents import Document
from tqdm import tqdm
def sanitize_metadata(metadata: dict) -> dict:
    """Flatten metadata values into plain scalar types.

    Lists are collapsed into a single comma-separated string; strings, ints,
    floats and bools are passed through unchanged.

    Args:
        metadata: Mapping of metadata keys to raw values.

    Returns:
        A new dict with the same keys whose values are all ``str``, ``int``,
        ``float`` or ``bool``. The input mapping is not modified.

    Raises:
        ValueError: If a value is neither a list nor one of the supported
            scalar types (for example ``None`` or a nested dict).
    """
    sanitized = {}
    for key, value in metadata.items():
        if isinstance(value, list):
            # Stringify each item first: str.join raises TypeError on
            # non-string elements such as page numbers.
            sanitized[key] = ", ".join(map(str, value))
        elif isinstance(value, (str, int, float, bool)):
            sanitized[key] = value
        else:
            raise ValueError(
                f"Unsupported metadata type for key '{key}': {type(value)}"
            )
    return sanitized
class DocumentLoader:
    """
    Handles loading and splitting documents from directories.

    Args:
        docs_dir: Directory that holds the source documents.
        chunk_size: ``chunk_size`` handed to the text splitter when plain-text
            documents are split.
        chunk_overlap: ``chunk_overlap`` handed to the text splitter when
            plain-text documents are split.
    """

    def __init__(
        self, docs_dir: str, chunk_size: int = 1000, chunk_overlap: int = 200
    ):
        self.docs_dir = docs_dir
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap

    def load_text_documents(self) -> List[Document]:
        """Loads and splits text documents.

        Every ``*.txt`` file under ``docs_dir`` (searched recursively) is
        loaded and then split into chunks.

        Returns:
            The chunks produced by the splitter.
        """
        loader = DirectoryLoader(self.docs_dir, glob="**/*.txt", loader_cls=TextLoader)
        documents = loader.load()
        splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size, chunk_overlap=self.chunk_overlap
        )
        return splitter.split_documents(documents)

    @staticmethod
    def _load_json_file(file_path: str) -> Document:
        """Builds one ``Document`` from the ``"kwargs"`` entry of a JSON file.

        Raises:
            KeyError: If the file has no top-level ``"kwargs"`` entry.
            ValueError: If the metadata holds a value that
                ``sanitize_metadata`` does not support.
        """
        # JSON text is UTF-8 by specification; do not depend on the
        # platform's locale encoding.
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)["kwargs"]
        # A document serialized without metadata gets an empty dict instead
        # of failing with KeyError.
        metadata = sanitize_metadata(data.get("metadata") or {})
        return Document.model_validate({**data, "metadata": metadata})

    def load_json_documents(self) -> List[Document]:
        """Loads and processes JSON documents.

        Only ``*.json`` files directly inside ``docs_dir`` are read (the
        search is not recursive). Files are read concurrently in a thread
        pool and the documents are NOT split into chunks.

        Returns:
            One ``Document`` per JSON file, ordered by file path.
        """
        # Sort so the document order is reproducible; glob order depends on
        # the filesystem. executor.map yields results in input order.
        files = sorted(glob(os.path.join(self.docs_dir, "*.json")))

        with ThreadPoolExecutor() as executor:
            documents = list(
                tqdm(
                    executor.map(self._load_json_file, files),
                    total=len(files),
                    desc="Loading JSON documents",
                )
            )
        return documents

    def load_documents(self) -> List[Document]:
        """Determines and loads documents based on file type.

        JSON takes precedence: if ``docs_dir`` directly contains at least one
        ``*.json`` file, only the JSON documents are loaded; otherwise the
        text documents are loaded and split.
        """
        if glob(os.path.join(self.docs_dir, "*.json")):
            return self.load_json_documents()
        return self.load_text_documents()