File size: 10,534 Bytes
4a8d0fb
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
from haystack.nodes.base import BaseComponent
from haystack.schema import Document
from haystack.nodes import PDFToTextOCRConverter, PDFToTextConverter
from haystack.nodes import TextConverter, DocxToTextConverter, PreProcessor
from typing import Callable, Dict, List, Optional, Text, Tuple, Union
from typing_extensions import Literal
import pandas as pd
import logging
import re
import string
from haystack.pipelines import Pipeline

def useOCR(file_path: str)-> Text:
    """
    Extract text from an image-only (scanned) PDF through OCR, using the
    Farm-haystack[OCR] converter.

    Params
    ----------
    file_path: path of the uploaded file, as returned by the add_upload
    function in uploadAndExample.py

    Returns the content of the first converted document as a string.
    """
    # OCR converter set up for English text, with numeric tables removed.
    ocr_converter = PDFToTextOCRConverter(
        remove_numeric_tables=True,
        valid_languages=["eng"],
    )
    converted = ocr_converter.convert(file_path=file_path, meta=None)
    first_document = converted[0]
    return first_document.content




class FileConverter(BaseComponent):
    """
    Wrapper class to convert an uploaded document into text by calling the
    appropriate Converter class; internally falls back to the haystack
    PDFToTextOCR converter in case of an image-only pdf. The FileClassifier
    from haystack cannot be used as it does not have any label/output class
    for images.
    1. https://haystack.deepset.ai/pipeline_nodes/custom-nodes
    2. https://docs.haystack.deepset.ai/docs/file_converters
    3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/file_converter
    4. https://docs.haystack.deepset.ai/reference/file-converters-api
    """

    outgoing_edges = 1

    def run(self, file_name: str , file_path: str, encoding: Optional[str]=None,
            id_hash_keys: Optional[List[str]] = None,
            ) -> Tuple[dict,str]:
        """ this is the required method to invoke the component in
            the pipeline implementation.

        Params
        ----------
        file_name: name of the file; its extension (.pdf, .txt or .docx,
                    matched case-insensitively) selects the converter
        file_path: path of the uploaded file, returned by the add_upload
                    function in uploadAndExample.py
        encoding: forwarded unchanged to the converter's convert() call
        id_hash_keys: forwarded to the converter and to the returned Document

        See the links provided in Class docstring/description to see other params

        Return
        ---------
        output: dictionary, with key as identifier and value could be anything
                we need to return. In this case it is {'documents': [Document]}
                holding a single Haystack Document with the extracted text.

        output_1: As there is only one outgoing edge, we pass 'output_1' string

        Raises
        ---------
        ValueError: if file_name does not end with .pdf, .txt or .docx
        Exception: any error raised while constructing the converter is
                   logged and re-raised
        """
        lowered_name = file_name.lower()

        # Fail early with a clear message instead of hitting an unbound
        # 'converter' variable further down.
        if not lowered_name.endswith(('.pdf', '.txt', '.docx')):
            raise ValueError(
                "Unsupported file type for '{}': expected .pdf, .txt or .docx"
                .format(file_name))

        is_pdf = lowered_name.endswith('.pdf')

        try:
            if is_pdf:
                converter = PDFToTextConverter(remove_numeric_tables=True)
            elif lowered_name.endswith('.txt'):
                converter = TextConverter(remove_numeric_tables=True)
            else:
                converter = DocxToTextConverter()
        except Exception as e:
            # Log and propagate: returning None here would break the declared
            # (output, edge) return contract of the node.
            logging.error(e)
            raise

        # encoding is None unless the caller supplies one; it probably should
        # be utf-8, but it is forwarded as given.
        document = converter.convert(
                      file_path=file_path, meta=None,
                      encoding=encoding, id_hash_keys=id_hash_keys
                      )[0]

        text = document.content

        # in case of a scanned/images-only PDF the content might contain only
        # the page separator (\f or \x0c). We check if that is so and use
        # the OCR to get the text. The OCR converter is PDF specific, so the
        # fallback is not attempted for .txt/.docx files.
        if is_pdf and re.sub(r'\x0c', '', text) == "":
            logging.info("Using OCR")
            text = useOCR(file_path)

        documents = [Document(content=text,
                              meta={"name": file_name},
                              id_hash_keys=id_hash_keys)]

        logging.info('file conversion succesful')
        output = {'documents': documents}
        return output, 'output_1'

    def run_batch(self, *args, **kwargs):
        """
        we dont have a requirement to process multiple files in one go,
        therefore nothing is done here; however to use the custom node we
        need to have this method for the class. Accepts (and ignores) any
        arguments so that it can be called on an instance.
        """

        return


def basic(s:str, remove_punc:bool = False):

    """
    Light-weight text clean-up.
    Params
    ----------
    s: string to be processed
    remove_punc: whether to strip all punctuation (including ',' and '.')

    Returns: the cleaned string with leading/trailing whitespace stripped;
    see the inline comments for the individual steps.
    """

    # Step 1: URLs. A line that starts with an http(s) URL is replaced as a
    # whole (together with its trailing line breaks) by one space; any other
    # token starting with 'http' is replaced by one space.
    url_line_pattern = re.compile(r'^https?:\/\/.*[\r\n]*', flags=re.MULTILINE)
    s = url_line_pattern.sub(' ', s)
    s = re.sub(r"http\S+", " ", s)

    # Step 2: line feeds become spaces.
    s = s.replace('\n', ' ')

    # Step 3: optional punctuation removal. The explicit comparison is kept
    # so that only True (or a value equal to it) switches the step on.
    if remove_punc == True:
        punctuation_table = str.maketrans(' ', ' ', string.punctuation)
        s = s.translate(punctuation_table)

    # Step 4: single quotes turn into spaces, pairs of dots are dropped.
    s = s.replace("\'", " ").replace("..", "")

    return s.strip()

def paraLengthCheck(paraList, max_len = 100):
    """
    There are cases where the preprocessor cannot respect the word limit when
    using the respect-sentence-boundary flag, due to missing sentence
    boundaries. Therefore we run one more round of splitting here for those
    paragraphs.

    Params
    ---------------
    paraList : list of paragraphs; each item must expose `.content` (text)
               and `.meta['page']`
    max_len : max number of whitespace-separated words allowed per paragraph;
              must be a positive integer

    Returns
    ---------------
    list of (text, page) tuples. Paragraphs within the limit are passed
    through with their content unchanged; longer ones are cut into
    consecutive chunks of at most `max_len` words, each re-joined with single
    spaces and carrying the page of the paragraph it came from.

    Raises
    ---------------
    ValueError: if max_len is not positive
    """
    if max_len <= 0:
        raise ValueError("max_len must be a positive integer")

    new_para_list = []
    for passage in paraList:
        page = passage.meta['page']
        # split once and reuse the word list for every chunk
        words = passage.content.split()
        if len(words) > max_len:
            # e.g. 512 words with max_len=100 gives 5 full chunks plus a
            # remainder of 12; stepping through the word list never yields an
            # empty trailing chunk when the length is an exact multiple.
            for start in range(0, len(words), max_len):
                chunk = " ".join(words[start:start + max_len])
                new_para_list.append((chunk, page))
        else:
            # paragraphs which dont need any splitting
            new_para_list.append((passage.content, page))

    logging.info("New paragraphs length {}".format(len(new_para_list)))
    return new_para_list

class UdfPreProcessor(BaseComponent):
    """
    class to preprocess the documents returned by FileConverter. It checks
    the splitting strategy, splits each document by word or sentence and then
    synthetically creates the paragraphs.
    1. https://docs.haystack.deepset.ai/docs/preprocessor
    2. https://docs.haystack.deepset.ai/reference/preprocessor-api
    3. https://github.com/deepset-ai/haystack/tree/main/haystack/nodes/preprocessor
    """
    outgoing_edges = 1

    def run(self, documents:List[Document], remove_punc:bool=False,
            split_by: Literal["sentence", "word"] = 'sentence',
            split_length:int = 2, split_respect_sentence_boundary:bool = False,
            split_overlap:int = 0):

        """ this is the required method to invoke the component in
        the pipeline implementation.

        Params
        ----------
        documents: documents from the output dictionary returned by Fileconverter
        remove_punc: to remove all Punctuation including ',' and '.' or not
        split_by: document splitting strategy either as word or sentence
        split_length: when synthetically creating the paragraphs from document,
                      it defines the length of paragraph.
        split_respect_sentence_boundary: Used when using 'word' strategy for
        splitting of text; it is forced to False for the 'sentence' strategy.
        split_overlap: Number of words or sentences that overlap when creating
        the paragraphs. This is done as one sentence or 'some words' make sense
        when read together with others. Therefore the overlap is used.

        Return
        ---------
        output: dictionary, with key as identifier and value could be anything
                we need to return. In this case the output contains 4 objects:
                'documents' (list of processed Haystack documents, collected
                over all input documents), 'dataframe' (DataFrame built from
                those documents), 'text' (all paragraph texts joined by a
                space) and 'paraList' (the paragraph texts as a list).

        output_1: As there is only one outgoing edge, we pass 'output_1' string

        """

        # the flag is only meant for the 'word' strategy, so switch it off
        # when splitting by sentence; otherwise the caller's value is used.
        if split_by == 'sentence':
            split_respect_sentence_boundary = False

        preprocessor = PreProcessor(
            clean_empty_lines=True,
            clean_whitespace=True,
            clean_header_footer=True,
            split_by=split_by,
            split_length=split_length,
            split_respect_sentence_boundary= split_respect_sentence_boundary,
            split_overlap=split_overlap,

            # will add page number only in case of PDF not for text/docx file.
            add_page_number=True
            )

        # Collect the paragraphs of every input document; re-assigning the
        # result inside the loop would keep only the last document's output
        # and leave the name undefined for an empty input list.
        docs_processed = []
        for document in documents:
            for item in preprocessor.process([document]):
                item.content = basic(item.content, remove_punc= remove_punc)
                docs_processed.append(item)

        df = pd.DataFrame(docs_processed)
        # Read the texts from the documents themselves: an empty DataFrame
        # has no 'content' column to read them from.
        para_list = [item.content for item in docs_processed]
        all_text = " ".join(para_list)
        logging.info('document split into {} paragraphs'.format(len(para_list)))
        output = {'documents': docs_processed,
                  'dataframe': df,
                  'text': all_text,
                  'paraList': para_list
                 }
        return output, "output_1"

    def run_batch(self, *args, **kwargs):
        """
            we dont have a requirement to process multiple files in one go,
            therefore nothing is done here; however to use the custom node we
            need to have this method for the class. Accepts (and ignores) any
            arguments so that it can be called on an instance.
        """
        return

def processingpipeline():
    """
    Build and return the preprocessing pipeline: a FileConverter node fed by
    the "File" input, followed by a UdfPreProcessor node fed by the
    FileConverter node (both from utils.preprocessing).
    """
    pipeline = Pipeline()

    # (component, node name, inputs) in the order the nodes are chained
    node_specs = [
        (FileConverter(), "FileConverter", ["File"]),
        (UdfPreProcessor(), 'UdfPreProcessor', ["FileConverter"]),
    ]
    for component, node_name, node_inputs in node_specs:
        pipeline.add_node(component=component, name=node_name,
                          inputs=node_inputs)

    return pipeline