vulnerability_2_1

Sleeping

vulnerability_2_1 / file_processing.py

Update file_processing.py

b5c1366 over 1 year ago

1.53 kB

	from utils.preprocessing import processingpipeline

	def get_paragraphs(file_path_input, file_name=None):
	    """Run the preprocessing pipeline on a file and return its paragraph list.

	    The original body passed two undefined names (``file_path`` and
	    ``file_name``) to the ``FileConverter`` node, which raises ``NameError``
	    as soon as the function is called. Both values are now taken from the
	    arguments.

	    Args:
	        file_path_input: Location of the file to process. It is forwarded
	            to the pipeline as ``file_paths`` and to ``FileConverter`` as
	            ``file_path``. Assumed to be a single path (``str`` or
	            path-like) -- TODO confirm against the callers.
	        file_name: Optional name handed to ``FileConverter`` as
	            ``file_name``. Defaults to the basename of ``file_path_input``.
	            Pass it explicitly when the stored path does not carry the
	            original file name (e.g. a temporary file).

	    Returns:
	        The value stored under the ``'paraList'`` key of the pipeline
	        output.

	    Raises:
	        ValueError: If ``file_path_input`` is empty or ``None``.
	        KeyError: If the pipeline output has no ``'paraList'`` entry.
	    """
	    # Local import keeps this block self-contained without touching the
	    # module-level import section.
	    import os

	    if not file_path_input:
	        raise ValueError("file_path_input must be a non-empty file path")

	    if file_name is None:
	        file_name = os.path.basename(file_path_input)

	    # Unit used for splitting the text.
	    SPLIT_BY = 'word'

	    # Chunk size, in SPLIT_BY units. Original author's note: models usually
	    # have a max length of 384/512.
	    SPLIT_LENGTH = 100

	    # Too much overlap can lead to repetitive text. The original note quotes
	    # a rule of thumb of 20% of the split length; the configured value is
	    # 10% of SPLIT_LENGTH and is kept unchanged here.
	    SPLIT_OVERLAP = 10

	    # The text is cleaned of HTML and other unwanted content by the
	    # pipeline; set this to True to also strip punctuation such as ,.;
	    # (original note: good to use for non-Transformer based models).
	    REMOVE_PUNC = False

	    # Per the original note, this is only used when SPLIT_BY == 'word'.
	    RESPECT_SENTENCE_BOUNDARY = True

	    # Per-node parameters, keyed by the node names used in the pipeline.
	    node_params = {
	        "FileConverter": {
	            "file_path": file_path_input,
	            "file_name": file_name,
	        },
	        "UdfPreProcessor": {
	            "remove_punc": REMOVE_PUNC,
	            "split_by": SPLIT_BY,
	            "split_length": SPLIT_LENGTH,
	            "split_overlap": SPLIT_OVERLAP,
	            "split_respect_sentence_boundary": RESPECT_SENTENCE_BOUNDARY,
	        },
	    }

	    prep_pipeline = processingpipeline()
	    output_pre = prep_pipeline.run(file_paths=file_path_input,
	                                   params=node_params)

	    return output_pre['paraList']