resume-data-extraction

Build error

App Files Files Community

resume-data-extraction / ResumeReader.py

Keshav4

Duplicate from Sybghat/resume-parser

643a815 over 1 year ago

raw

history blame

4.01 kB

	import re
	import os
	import logging
	import pdfplumber
	import fitz

	class ResumeReader:
	    """Read a resume file (.docx, .pdf or .txt) and split it into cleaned text lines."""

	    # WordprocessingML namespace that qualifies the element tags in word/document.xml.
	    _W_NS = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"

	    # PDF extraction artifacts that are replaced by a single space: the
	    # private-use bullet glyph, "(cid:N)" glyph markers and "• " bullets.
	    _PDF_ARTIFACTS = (r"\uf0b7", r"\(cid:\d{0,3}\)", r"• ")

	    @staticmethod
	    def _clean_lines(text):
	        """
	        Split a text blob into non-empty lines with whitespace collapsed.

	        :param text: Raw text, possibly containing blank lines, tabs and carriage returns
	        :type text: str
	        :return: Stripped, non-empty lines in which every whitespace run is a single space
	        :rtype: list
	        """
	        # str.splitlines() already breaks on "\r" and "\n", and split()/join
	        # collapses tabs and repeated blanks, so no separate replace step is needed.
	        return [" ".join(line.split()) for line in text.splitlines() if line.strip()]

	    @classmethod
	    def _extract_docx_text(cls, docx_file):
	        """
	        Return the body text of a .docx file using only the standard library.

	        :param docx_file: Path or binary file-like object of the .docx archive
	        :return: Paragraph texts joined by newlines
	        :rtype: str
	        :raises zipfile.BadZipFile: if the file is not a zip archive (e.g. legacy .doc)
	        :raises KeyError: if the archive has no ``word/document.xml`` member
	        """
	        # Local imports keep this block self-contained without touching the
	        # file-level import section.
	        import zipfile
	        import xml.etree.ElementTree as ET

	        # NOTE(review): ElementTree is not hardened against maliciously crafted
	        # XML; consider a hardened parser if resumes come from untrusted users.
	        with zipfile.ZipFile(docx_file) as archive:
	            root = ET.fromstring(archive.read("word/document.xml"))

	        w = cls._W_NS
	        chunks = []
	        # A single pre-order pass keeps document order and visits each text
	        # run exactly once, including runs inside tables.
	        for node in root.iter():
	            if node.tag == w + "t":
	                chunks.append(node.text or "")
	            elif node.tag == w + "tab":
	                chunks.append("\t")
	            elif node.tag in (w + "br", w + "cr"):
	                chunks.append("\n")
	            elif node.tag == w + "p" and chunks:
	                # Start every paragraph after the first on a new line.
	                chunks.append("\n")
	        return "".join(chunks)

	    def convert_docx_to_txt(self, docx_file, docx_parser):
	        """
	        A utility function to convert a Microsoft docx file to raw text.

	        The document body is read directly from the docx zip archive. Legacy
	        binary .doc files are not zip archives, so they end up in the error path.

	        :param docx_file: docx file which gets uploaded by the user (path or binary file object)
	        :param docx_parser: Kept for backward compatibility; not used by this implementation
	        :return: Tuple of (cleaned resume lines, raw text of the document);
	                 ``([], " ")`` when the file cannot be read
	        :rtype: tuple
	        """
	        try:
	            text = self._extract_docx_text(docx_file)
	            return self._clean_lines(text), text
	        except Exception as e:
	            # Best-effort contract: an unreadable document yields an empty result.
	            logging.error('Error in docx file:: %s', e)
	            return [], " "

	    def convert_pdf_to_txt(self, pdf_file):
	        """
	        A utility function to convert a machine-readable PDF to raw text.

	        Errors raised while opening or reading the PDF propagate to the caller.

	        :param pdf_file: Path to the .pdf file which should be converted
	        :type pdf_file: str
	        :return: Tuple of (cleaned resume lines, raw text of the pdf);
	                 ``([], " ")`` when the text clean-up fails
	        :rtype: tuple
	        """
	        with fitz.open(pdf_file) as doc:
	            raw_text = "".join(page.get_text() for page in doc)

	        try:
	            # Tabs must become spaces before bullet removal so that a bullet
	            # followed by a tab is matched by the "• " pattern.
	            full_string = raw_text.replace("\r", "\n").replace("\t", " ")

	            # Remove awkward bullet characters and unmapped glyph markers
	            for pattern in self._PDF_ARTIFACTS:
	                full_string = re.sub(pattern, " ", full_string)

	            return self._clean_lines(full_string), raw_text
	        except Exception as e:
	            logging.error('Error in pdf file:: %s', e)
	            return [], " "

	    def read_file(self, file, docx_parser="tika"):
	        """
	        Read a resume and return its cleaned lines.

	        The format is chosen from the end of the file name, case-insensitively.
	        Plain-text files are read as UTF-8 and cleaned the same way as the
	        other formats (stripped, whitespace collapsed, blank lines dropped).

	        :param file: Path of the resume file
	        :param docx_parser: Forwarded to ``convert_docx_to_txt``; by default "tika"
	        :return: List of cleaned resume lines, or ``None`` for an unsupported format
	        """
	        print("Reading the Resume...")
	        path = os.fspath(file)
	        lowered = path.lower()

	        if lowered.endswith(("docx", "doc")):
	            resume_lines, _ = self.convert_docx_to_txt(path, docx_parser)
	        elif lowered.endswith("pdf"):
	            resume_lines, _ = self.convert_pdf_to_txt(path)
	        elif lowered.endswith("txt"):
	            with open(path, "r", encoding="utf-8") as f:
	                resume_lines = self._clean_lines(f.read())
	        else:
	            resume_lines = None

	        return resume_lines