from enum import Enum

from langchain_community.document_loaders import PyPDFLoader, TextLoader
from langchain.text_splitter import (
    RecursiveCharacterTextSplitter,
    NLTKTextSplitter,
    SpacyTextSplitter,
)

# Separator candidates handed to RecursiveCharacterTextSplitter below.
separators = [
    "\n\n",
    "\n",
    " ",
    ".",
    ",",
    "\u200b",  # Zero-width space
    "\uff0c",  # Fullwidth comma
    "\u3001",  # Ideographic comma
    "\uff0e",  # Fullwidth full stop
    "\u3002",  # Ideographic full stop
    "",
]


class ChunkingStrategy(Enum):
    """Chunking strategies supported by TextLoaderAndSplitterWrapper."""

    RECURSIVE_CHARACTER_CHAR_SPLITTER = "recursive_character_char_splitter"
    NLTK_TEXT_SPLITTER = "nltk_text_splitter"
    SPACY_TEXT_SPLITTER = "spacy_text_splitter"


class TextLoaderAndSplitterWrapper:
    """Load a ``.pdf`` or ``.txt`` file and chunk it with a chosen splitter.

    The splitter is selected once, at construction time, from a
    ``ChunkingStrategy`` member; the file itself is only read when
    ``load_documents`` is called.
    """

    def __init__(self, strategy: ChunkingStrategy, file_path: str):
        """Select the splitter for ``strategy`` and remember ``file_path``.

        Args:
            strategy: Which text splitter to instantiate.
            file_path: Path of the document to load later.

        Raises:
            ValueError: If ``strategy`` is not a known ``ChunkingStrategy``.
        """
        # Defaults
        self.splitter = None
        self.documents = []

        # Determine which splitter strategy to use from the parameter
        if strategy == ChunkingStrategy.RECURSIVE_CHARACTER_CHAR_SPLITTER:
            self.splitter = RecursiveCharacterTextSplitter(separators=separators)
        elif strategy == ChunkingStrategy.NLTK_TEXT_SPLITTER:
            self.splitter = NLTKTextSplitter()
        elif strategy == ChunkingStrategy.SPACY_TEXT_SPLITTER:
            self.splitter = SpacyTextSplitter()
        else:
            raise ValueError(f"Unknown strategy: {strategy}")

        # Only the path is stored here; loading happens in load_documents().
        self.file_path = file_path

    def load_documents(self):
        """Load the file at ``self.file_path`` and split it into chunks.

        The loader is picked from the file extension (matched
        case-insensitively, so ``.PDF`` and ``.pdf`` are treated alike).
        The result is stored on ``self.documents`` and also returned.

        Returns:
            The chunks produced by the loader's ``load_and_split``.

        Raises:
            ValueError: If the extension is neither ``.pdf`` nor ``.txt``.
        """
        lowered_path = self.file_path.lower()
        if lowered_path.endswith(".pdf"):
            # Use PDF loader
            loader = PyPDFLoader(self.file_path)
        elif lowered_path.endswith(".txt"):
            # Use Text loader
            loader = TextLoader(self.file_path)
        else:
            raise ValueError(f"Unknown file type: {self.file_path}")

        self.documents = loader.load_and_split(text_splitter=self.splitter)
        return self.documents

    def split(self, text: str):
        """Split ``text`` into chunks with the configured splitter.

        Args:
            text: Raw text to chunk.

        Returns:
            The chunks returned by the splitter's ``split_text``.
        """
        # LangChain text splitters expose ``split_text``; they have no
        # ``split`` method, so calling ``split`` raised AttributeError.
        return self.splitter.split_text(text)

    def join(self, chunks: list, separator: str = "\n\n"):
        """Concatenate ``chunks`` back into a single string.

        Args:
            chunks: Text chunks (strings) to combine.
            separator: String inserted between consecutive chunks.
                Defaults to a paragraph break.

        Returns:
            The joined text.
        """
        # LangChain text splitters have no public ``join`` method, so the
        # chunks are joined directly instead of delegating to the splitter.
        return separator.join(chunks)

    def __str__(self):
        """Return a short description naming the active splitter."""
        return f"TextLoaderAndSplitterWrapper(splitter={self.splitter})"

    def __repr__(self):
        """Return the same text as ``__str__``."""
        return str(self)