# resume-data-extraction / ResumeReader.py
# Keshav4's picture
# Duplicate from Sybghat/resume-parser
# 643a815
# raw
# history blame
# 4.01 kB
import re
import os
import logging
import pdfplumber
import fitz
class ResumeReader:
    """Read a resume file (.docx, .pdf or .txt) and return its text as a list of lines."""

    # Namespace prefix carried by every element of a .docx ``word/document.xml`` part.
    _WORD_NS = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"

    @staticmethod
    def _clean_lines(text):
        """Split *text* into non-empty lines, stripped and with whitespace runs collapsed.

        ``str.splitlines`` already breaks on ``\\r`` and ``\\n`` and the ``\\s+``
        substitution folds tabs into single spaces, so no separate newline or
        tab normalisation pass is needed before calling this helper.
        """
        stripped_lines = (line.strip() for line in text.splitlines())
        return [re.sub(r"\s+", " ", line) for line in stripped_lines if line]

    def _extract_docx_text(self, docx_file):
        """Return the visible text of a .docx file using only the standard library.

        A .docx file is a zip archive whose main body lives in
        ``word/document.xml``. Elements are walked in document order: each
        paragraph starts a new line, text runs are appended verbatim, and
        tab / line-break elements become ``\\t`` / ``\\n``.

        :raises Exception: whatever ``zipfile`` / ``ElementTree`` raise for a
            missing, non-zip (e.g. legacy binary .doc) or malformed file.
        """
        # Local imports keep this block self-contained; both modules are stdlib.
        import zipfile
        import xml.etree.ElementTree as ET

        # NOTE(review): the archive content is user-supplied. ElementTree does
        # not fetch external entities, but very large or maliciously nested
        # documents are not guarded against here.
        with zipfile.ZipFile(docx_file) as archive:
            root = ET.fromstring(archive.read("word/document.xml"))

        ns = self._WORD_NS
        line_break_tags = (ns + "br", ns + "cr")
        pieces = []
        for node in root.iter():
            if node.tag == ns + "p":
                pieces.append("\n")
            elif node.tag == ns + "t":
                pieces.append(node.text or "")
            elif node.tag == ns + "tab":
                pieces.append("\t")
            elif node.tag in line_break_tags:
                pieces.append("\n")
        return "".join(pieces)

    def convert_docx_to_txt(self, docx_file, docx_parser):
        """
        Convert a Microsoft .docx file to raw text.

        The text is read directly from the document's XML, so no external
        parser is required. Any failure (missing file, legacy .doc format,
        corrupt archive) is logged and reported as an empty result rather
        than raised.

        :param docx_file: path (or binary file object) of the .docx file
        :param docx_parser: accepted for backward compatibility; not used
        :return: ``(resume_lines, text)`` - the cleaned, non-empty lines and
            the extracted raw text; ``([], " ")`` if the file cannot be read
        :rtype: tuple[list[str], str]
        """
        try:
            text = self._extract_docx_text(docx_file)
            resume_lines = self._clean_lines(text)
        except Exception as e:  # best-effort boundary: callers expect an empty result
            logging.error("Error in docx file:: %s", e)
            return [], " "
        return resume_lines, text

    def convert_pdf_to_txt(self, pdf_file):
        """
        Convert a machine-readable PDF to raw text using ``fitz``.

        Any failure while opening or reading the PDF is logged and reported
        as an empty result rather than raised.

        :param pdf_file: path of the .pdf file to convert
        :type pdf_file: str
        :return: ``(resume_lines, raw_text)`` - the cleaned, non-empty lines
            and the unmodified extracted text; ``([], " ")`` on error
        :rtype: tuple[list[str], str]
        """
        try:
            with fitz.open(pdf_file) as doc:
                raw_text = "".join(page.get_text() for page in doc)

            # Tabs become spaces first so that a bullet followed by a tab is
            # still recognised by the "bullet + space" replacement below.
            cleaned = raw_text.replace("\t", " ")
            # Remove bullet glyphs and unresolved "(cid:NNN)" font placeholders.
            cleaned = cleaned.replace("\uf0b7", " ")
            cleaned = re.sub(r"\(cid:\d{0,3}\)", " ", cleaned)
            cleaned = cleaned.replace("• ", " ")

            resume_lines = self._clean_lines(cleaned)
        except Exception as e:  # best-effort boundary: callers expect an empty result
            logging.error("Error in pdf file:: %s", e)
            return [], " "
        return resume_lines, raw_text

    def read_file(self, file, docx_parser="tika"):
        """
        Read a resume and return its lines.

        The file type is chosen from the end of the file name, ignoring case.

        :param file: path of the resume file
        :param docx_parser: forwarded to :meth:`convert_docx_to_txt`
        :return: list of lines for .docx/.doc/.pdf (cleaned) and .txt (exactly
            as returned by ``readlines``), or ``None`` for any other file type
        """
        print("Reading the Resume...")
        file = os.fspath(file)  # accept path-like objects as well as plain strings
        lowered = file.lower()

        if lowered.endswith(("docx", "doc")):
            resume_lines, _ = self.convert_docx_to_txt(file, docx_parser)
        elif lowered.endswith("pdf"):
            resume_lines, _ = self.convert_pdf_to_txt(file)
        elif lowered.endswith("txt"):
            with open(file, "r", encoding="utf-8") as f:
                resume_lines = f.readlines()
        else:
            resume_lines = None
        return resume_lines