File size: 1,533 Bytes
32a1b64
 
b5c1366
32a1b64
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
b5c1366
32a1b64
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
import os

from utils.preprocessing import processingpipeline

def get_paragraphs(file_path_input):
    """Run the preprocessing pipeline on a document and return its paragraphs.

    Converts the file at ``file_path_input`` and splits the cleaned text into
    word-based chunks via the project's preprocessing pipeline.

    Args:
        file_path_input: Path to the input document to convert and split.

    Returns:
        The pipeline's ``'paraList'`` output — the list of paragraph chunks.
    """
    # Split unit for the preprocessor.
    SPLIT_BY = 'word'

    # Usually models have a max length of 384/512 tokens.
    SPLIT_LENGTH = 100

    # Too much overlap can lead to repetitive text, but as a rule of thumb
    # we keep about 10% of the split length.
    SPLIT_OVERLAP = 10

    # The text is cleaned of HTML and other noise regardless; set True to
    # also strip punctuation like ,.; etc. — useful for non-Transformer
    # based models.
    REMOVE_PUNC = False

    # This param is used only when split_by == 'word'.
    RESPECT_SENTENCE_BOUNDARY = True

    # BUGFIX: the original referenced undefined names `file_path` and
    # `file_name` in the params dict below (NameError at runtime); derive
    # both from the function's actual parameter instead.
    file_name = os.path.basename(file_path_input)

    # Initialize the preprocessing pipeline (project-defined helper) and
    # pass the params declared above.
    prep_pipeline = processingpipeline()

    output_pre = prep_pipeline.run(
        file_paths=file_path_input,
        params={
            "FileConverter": {
                "file_path": file_path_input,
                "file_name": file_name,
            },
            "UdfPreProcessor": {
                "remove_punc": REMOVE_PUNC,
                "split_by": SPLIT_BY,
                "split_length": SPLIT_LENGTH,
                "split_overlap": SPLIT_OVERLAP,
                "split_respect_sentence_boundary": RESPECT_SENTENCE_BOUNDARY,
            },
        },
    )

    return output_pre['paraList']