import os

from utils.preprocessing import processingpipeline


def get_paragraphs(file_path_input):
    """Convert a document and split it into paragraph chunks.

    Runs the project's preprocessing pipeline (file conversion followed by
    UDF preprocessing) over the given file and returns the resulting list
    of paragraph strings.

    Args:
        file_path_input: Path to the input document to convert and split.

    Returns:
        The value stored under the pipeline output's ``'paraList'`` key
        (a list of paragraph chunks).
    """
    # Chunking parameters.
    # Models usually have a max input length of 384/512 tokens.
    SPLIT_BY = 'word'
    SPLIT_LENGTH = 100
    # Too much overlap leads to repetitive text; rule of thumb is
    # ~20% of the split length.
    SPLIT_OVERLAP = 10
    # The text is cleaned of HTML and other noise regardless; set this to
    # True to also strip punctuation (useful for non-Transformer models).
    REMOVE_PUNC = False
    # Only honored when SPLIT_BY == 'word'.
    RESPECT_SENTENCE_BOUNDARY = True

    # BUG FIX: the original body referenced undefined names `file_path`
    # and `file_name` (NameError at call time); derive both from the
    # function argument instead.
    file_path = file_path_input
    file_name = os.path.basename(file_path_input)

    # Initialize the preprocessing pipeline and pass the params declared
    # above to each pipeline node.
    prep_pipeline = processingpipeline()
    output_pre = prep_pipeline.run(
        file_paths=file_path_input,
        params={
            "FileConverter": {
                "file_path": file_path,
                "file_name": file_name,
            },
            "UdfPreProcessor": {
                "remove_punc": REMOVE_PUNC,
                "split_by": SPLIT_BY,
                "split_length": SPLIT_LENGTH,
                "split_overlap": SPLIT_OVERLAP,
                "split_respect_sentence_boundary": RESPECT_SENTENCE_BOUNDARY,
            },
        },
    )
    # NOTE: the original also called `output_pre.keys()` as a bare
    # statement (no effect) — removed.
    return output_pre['paraList']