leavoigt commited on
Commit
32a1b64
1 Parent(s): 5b4a98a

Create file_processing.py

Browse files
Files changed (1) hide show
  1. file_processing.py +42 -0
file_processing.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import os

import utils
from utils.preprocessing import processingpipeline
3
+
4
def get_paragraphs(file_input):
    """Split a document into paragraphs using the preprocessing pipeline.

    Parameters
    ----------
    file_input : str
        Path to the input document.  The path is handed to the pipeline's
        FileConverter together with its base name.
        NOTE(review): assumes this is a filesystem path string — confirm
        against the caller (it may be an uploaded-file object).

    Returns
    -------
    list
        The ``'paraList'`` entry produced by the pipeline run.
    """
    # Split unit: with 'word', SPLIT_LENGTH counts words.
    SPLIT_BY = 'word'

    # usually models have max-length of 384/512
    SPLIT_LENGTH = 100

    # too much overlap can lead to repetitive text,
    # but as a rule of thumb we keep ~20% of the split length
    SPLIT_OVERLAP = 10

    # the text is cleaned of HTML and other noise; set True to also remove
    # all punctuation like ,.; etc. — good for non-Transformer based models.
    REMOVE_PUNC = False

    # This param is honoured only when split_by == 'word'.
    RESPECT_SENTENCE_BOUNDARY = True

    # Bug fix: the original referenced undefined names `file_path` and
    # `file_name` (NameError on every call); derive both from the parameter.
    file_path = file_input
    file_name = os.path.basename(file_path)

    # Initialize the preprocessing pipeline and run it with the params above.
    prep_pipeline = processingpipeline()

    output_pre = prep_pipeline.run(
        file_paths=file_path,
        params={
            "FileConverter": {"file_path": file_path,
                              "file_name": file_name},
            "UdfPreProcessor": {"remove_punc": REMOVE_PUNC,
                                "split_by": SPLIT_BY,
                                "split_length": SPLIT_LENGTH,
                                "split_overlap": SPLIT_OVERLAP,
                                "split_respect_sentence_boundary": RESPECT_SENTENCE_BOUNDARY},
        },
    )

    return output_pre['paraList']