Spaces:

fazni
/

Resume-filter-plus-QA-documents

Sleeping

added google palm model

5ad9f7c 12 months ago

1.93 kB

	import re
	from PyPDF2 import PdfReader

	def preprocess_text(text):
	# Remove newlines and tabs
	text = re.sub(r'\n\|\t', '', text)

	# Remove letter combinations between spaces
	text = re.sub(r'\s[A-Z]\s', ' ', text)

	# Remove emails
	text = re.sub(r'\S+@\S+', '', text)

	# Remove dates in the format DD-MM-YYYY or DD/MM/YYYY
	text = re.sub(r'\d{2}[-/]\d{2}[-/]\d{4}', '', text)

	# Remove phone numbers
	text = re.sub(r'\+\d{2}\s?\d{2,3}\s?\d{3,4}\s?\d{4}', '', text)

	# Remove specific text format
	text = re.sub(r'Issued\s\w+\s\d{4}Credential ID \w+', '', text)

	# Remove extra spaces between words
	text = re.sub(r'\s+', ' ', text)

	# Add a space before a word containing a capital letter in the middle
	text = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)

	return text

	def get_pdf_text(pdfs,preprocess=True):
	if preprocess:
	all_text = []
	for pdf in pdfs:
	# Process each uploaded PDF file
	# Reading PDF
	pdf_reader = PdfReader(pdf)

	# Get the filename of the PDF
	filename = pdf.name

	text = ""
	# Reading Each Page
	for page in pdf_reader.pages:
	# Extracting Text in Every Page
	text += page.extract_text()
	# Preprocess the text
	text = preprocess_text(text)
	# Appending to array
	all_text.append({"filename": filename, "text": text})
	return all_text

	else:
	text = ""
	for pdf in pdfs:
	# Process each uploaded PDF file
	# Reading PDF
	pdf_reader = PdfReader(pdf)

	# Reading Each Page
	for page in pdf_reader.pages:
	# Extracting Text in Every Page
	text += page.extract_text()

	# text = preprocess_text(text)
	return text