Spaces:
Sleeping
Sleeping

Medivocate: An AI-powered platform exploring African history, culture, and traditional medicine, fostering understanding and appreciation of the continent's rich heritage.
15aea1e
verified
import json | |
import os | |
from concurrent.futures import ThreadPoolExecutor | |
from glob import glob | |
from typing import List | |
from langchain.text_splitter import RecursiveCharacterTextSplitter | |
from langchain_community.document_loaders import DirectoryLoader, TextLoader | |
from langchain_core.documents import Document | |
from tqdm import tqdm | |
def sanitize_metadata(metadata: dict) -> dict:
    """Flatten metadata values into scalar types.

    List values are collapsed into a single ``", "``-separated string;
    ``str``, ``int``, ``float`` and ``bool`` values are kept unchanged.

    Args:
        metadata: Mapping of metadata keys to raw values.

    Returns:
        A new dict with the same keys; the input mapping is not modified.

    Raises:
        ValueError: If a value is neither a list nor one of the scalar
            types listed above (for example ``None`` or a nested dict).
    """
    sanitized = {}
    for key, value in metadata.items():
        if isinstance(value, list):
            # str.join() only accepts strings, so stringify every item first;
            # otherwise a list such as [1, 2] raises TypeError.
            sanitized[key] = ", ".join(str(item) for item in value)
        elif isinstance(value, (str, int, float, bool)):
            sanitized[key] = value
        else:
            raise ValueError(
                f"Unsupported metadata type for key '{key}': {type(value)}"
            )
    return sanitized
class DocumentLoader:
    """
    Handles loading and splitting documents from directories.

    Two on-disk layouts are supported:

    * ``*.json`` files located directly in ``docs_dir``, each holding a
      serialized document under a top-level ``"kwargs"`` key;
    * ``*.txt`` files anywhere below ``docs_dir``, which are loaded and then
      split into overlapping chunks.
    """

    def __init__(self, docs_dir: str):
        """Remember the directory that documents are read from.

        Args:
            docs_dir: Path of the directory containing the source documents.
        """
        self.docs_dir = docs_dir

    def _json_files(self) -> List[str]:
        """Return the ``*.json`` paths directly inside ``docs_dir``, sorted."""
        # glob() yields paths in arbitrary order; sorting keeps the order of
        # the loaded documents reproducible between runs.
        return sorted(glob(os.path.join(self.docs_dir, "*.json")))

    @staticmethod
    def _load_json_file(file_path: str) -> Document:
        """Build one Document from the ``"kwargs"`` entry of a JSON file."""
        # Decode explicitly as UTF-8 instead of relying on the platform's
        # locale encoding, which can fail on non-ASCII text.
        with open(file_path, "r", encoding="utf-8") as f:
            data = json.load(f)["kwargs"]
        # Metadata is flattened with sanitize_metadata() before validation.
        return Document.model_validate(
            {**data, "metadata": sanitize_metadata(data["metadata"])}
        )

    def load_text_documents(self) -> List[Document]:
        """Loads and splits text documents.

        Every ``*.txt`` file below ``docs_dir`` (searched recursively) is
        loaded, then split into chunks of up to 1000 characters with an
        overlap of 200 characters.

        Returns:
            The list of chunked documents.
        """
        loader = DirectoryLoader(self.docs_dir, glob="**/*.txt", loader_cls=TextLoader)
        documents = loader.load()
        splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
        return splitter.split_documents(documents)

    def load_json_documents(self) -> List[Document]:
        """Loads and processes JSON documents.

        Each ``*.json`` file directly inside ``docs_dir`` must contain an
        object with a ``"kwargs"`` entry that itself has a ``"metadata"``
        mapping. Files are read concurrently in a thread pool while a
        progress bar is displayed.

        Returns:
            One document per JSON file, in sorted file-path order.
        """
        files = self._json_files()
        with ThreadPoolExecutor() as executor:
            # executor.map() preserves input order, so results line up with
            # the sorted file list.
            documents = list(
                tqdm(
                    executor.map(self._load_json_file, files),
                    total=len(files),
                    desc="Loading JSON documents",
                )
            )
        return documents

    def load_documents(self) -> List[Document]:
        """Determines and loads documents based on file type.

        JSON takes precedence: if at least one ``*.json`` file sits directly
        in ``docs_dir`` the JSON loader is used, otherwise the text loader.

        Returns:
            The loaded documents.
        """
        if self._json_files():
            return self.load_json_documents()
        return self.load_text_documents()