File size: 2,432 Bytes
4a0c158
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
from enum import Enum
from langchain_community.document_loaders import PyPDFLoader,TextLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter,NLTKTextSplitter,SpacyTextSplitter
# Separator candidates passed to RecursiveCharacterTextSplitter below.
# Besides the usual ASCII whitespace and punctuation, the list carries the
# zero-width space and full-width / ideographic punctuation marks, and ends
# with the empty string.
separators = [
    "\n\n",
    "\n",
    " ",
    ".",
    ",",
    "\u200b",  # zero-width space
    "\uff0c",  # full-width comma
    "\u3001",  # ideographic comma
    "\uff0e",  # full-width full stop
    "\u3002",  # ideographic full stop
    "",
]

class ChunkingStrategy(Enum):
    """Names the text-splitting strategies that TextLoaderAndSplitterWrapper accepts."""

    # Selects RecursiveCharacterTextSplitter (built with the module-level `separators`).
    RECURSIVE_CHARACTER_CHAR_SPLITTER = "recursive_character_char_splitter"
    # Selects NLTKTextSplitter with its default arguments.
    NLTK_TEXT_SPLITTER = "nltk_text_splitter"
    # Selects SpacyTextSplitter with its default arguments.
    SPACY_TEXT_SPLITTER = "spacy_text_splitter"

class TextLoaderAndSplitterWrapper:
    """Pair a LangChain text splitter with a document loader for one file.

    The splitter is chosen from a ``ChunkingStrategy`` member at construction
    time. Nothing is read from disk until ``load_documents`` is called.
    """

    def __init__(self, strategy: "ChunkingStrategy", file_path: str):
        """Instantiate the splitter for *strategy* and remember *file_path*.

        Args:
            strategy: Member of ``ChunkingStrategy`` selecting the splitter.
            file_path: Path of the ``.pdf`` or ``.txt`` file to load later.

        Raises:
            ValueError: If *strategy* is not one of the known strategies.
        """
        # Defaults
        self.splitter = None
        self.documents = []  # filled by load_documents()

        # Determine which splitter strategy to use from the parameter
        if strategy == ChunkingStrategy.RECURSIVE_CHARACTER_CHAR_SPLITTER:
            self.splitter = RecursiveCharacterTextSplitter(separators=separators)
        elif strategy == ChunkingStrategy.NLTK_TEXT_SPLITTER:
            self.splitter = NLTKTextSplitter()
        elif strategy == ChunkingStrategy.SPACY_TEXT_SPLITTER:
            self.splitter = SpacyTextSplitter()
        else:
            raise ValueError(f"Unknown strategy: {strategy}")

        # Only the path is stored here; loading happens in load_documents().
        self.file_path = file_path

    def load_documents(self):
        """Load the file and split it into chunks with the selected splitter.

        The loader is picked from the file extension, compared
        case-insensitively so ``report.PDF`` is handled like ``report.pdf``.

        Returns:
            The chunks returned by the loader's ``load_and_split``; the same
            list is also stored on ``self.documents``.

        Raises:
            ValueError: If the path ends in neither ``.pdf`` nor ``.txt``.
        """
        lowered_path = str(self.file_path).lower()
        if lowered_path.endswith(".pdf"):
            loader = PyPDFLoader(self.file_path)
        elif lowered_path.endswith(".txt"):
            loader = TextLoader(self.file_path)
        else:
            raise ValueError(f"Unknown file type: {self.file_path}")

        # The wrapper's own splitter is always passed explicitly, so the
        # loader's built-in default splitter is never used.
        self.documents = loader.load_and_split(text_splitter=self.splitter)
        return self.documents

    def split(self, text: str):
        """Split *text* into chunks with the selected splitter.

        Uses the splitter's ``split_text`` method; LangChain text splitters do
        not provide a ``split`` method.

        Args:
            text: The raw text to chunk.

        Returns:
            Whatever the splitter's ``split_text`` returns for *text*.
        """
        return self.splitter.split_text(text)

    def join(self, chunks: list, separator: str = "\n\n"):
        """Concatenate *chunks* into one string.

        LangChain text splitters have no public ``join`` method, so the
        chunks are joined here directly.

        Args:
            chunks: Strings, or objects exposing a ``page_content`` string
                attribute (such as the items returned by ``load_documents``).
            separator: Text inserted between consecutive chunks.

        Returns:
            The joined text; an empty string when *chunks* is empty.
        """
        return separator.join(
            getattr(chunk, "page_content", chunk) for chunk in chunks
        )

    def __str__(self):
        """Return a short description naming the active splitter."""
        return f"TextLoaderAndSplitterWrapper(splitter={self.splitter})"

    def __repr__(self):
        """Return the same text as ``__str__``."""
        return str(self)