# Page-scrape residue preserved from the captured source (not part of the module):
# Spaces:
# Build error
# Build error
import logging
import os
import re
import zipfile
from xml.etree import ElementTree

import fitz
import pdfplumber
class ResumeReader:
    """Read a resume file (.docx, .pdf or .txt) and split it into cleaned text lines."""

    # Namespace prefix carried by every WordprocessingML element in
    # ``word/document.xml`` (ElementTree spells qualified tags as ``{uri}local``).
    _W_NS = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}"

    @staticmethod
    def _split_clean_lines(text):
        """
        Split a text blob into non-empty lines with whitespace runs collapsed.

        :param text: Raw text blob
        :type text: str
        :return: Stripped, whitespace-normalised, non-blank lines
        :rtype: list[str]
        """
        # Normalise carriage returns and tabs first so they neither survive
        # inside a line nor hide a line break.
        normalized = text.replace("\r", "\n").replace("\t", " ")
        return [
            re.sub(r"\s+", " ", line.strip())
            for line in normalized.splitlines()
            if line.strip()  # Blank lines are dropped here
        ]

    def _extract_docx_text(self, docx_file):
        """
        Pull the plain text out of a .docx package using only the standard library.

        A .docx file is a ZIP archive whose main body lives in
        ``word/document.xml``. Text is emitted run by run (``w:r``); each
        closed paragraph (``w:p``) contributes one newline.

        :param docx_file: Path to, or binary file-like object of, a .docx file
        :return: The document text, one paragraph per line
        :rtype: str
        :raises zipfile.BadZipFile: if the input is not a ZIP archive (e.g. legacy .doc)
        :raises KeyError: if the archive has no ``word/document.xml`` member
        :raises xml.etree.ElementTree.ParseError: if the document XML is malformed
        """
        w = self._W_NS
        run_tag = w + "r"
        para_tag = w + "p"
        text_tag = w + "t"
        # Inline run children that stand for a whitespace character.
        inline_whitespace = {w + "tab": "\t", w + "br": "\n", w + "cr": "\n"}

        pieces = []
        # NOTE: ElementTree is not hardened against maliciously crafted XML;
        # confirm this is acceptable if uploads come from untrusted users.
        with zipfile.ZipFile(docx_file) as archive:
            with archive.open("word/document.xml") as xml_stream:
                # iterparse yields "end" events in document order, so a run is
                # always reported before the paragraph that contains it.
                for _event, node in ElementTree.iterparse(xml_stream):
                    if node.tag == run_tag:
                        # Only direct children of a run are inspected: this
                        # skips tab-stop definitions (w:pPr/w:tabs/w:tab),
                        # which share the w:tab tag but are not characters.
                        for child in node:
                            if child.tag == text_tag:
                                pieces.append(child.text or "")
                            elif child.tag in inline_whitespace:
                                pieces.append(inline_whitespace[child.tag])
                    elif node.tag == para_tag:
                        pieces.append("\n")
        return "".join(pieces)

    def convert_docx_to_txt(self, docx_file, docx_parser):
        """
        A utility function to convert a Microsoft .docx file to raw text.

        :param docx_file: Path to, or binary file-like object of, the .docx file
        :param docx_parser: Accepted for backward compatibility; not used, since
            extraction is done with the standard library
        :return: ``(resume_lines, raw_text)``; ``([], " ")`` when the file cannot
            be read (this includes legacy binary .doc files, which are not ZIP
            archives)
        :rtype: tuple[list[str], str]
        """
        # Deliberate best-effort boundary: any failure is logged and reported
        # through the empty fallback result rather than raised.
        try:
            text = self._extract_docx_text(docx_file)
            return self._split_clean_lines(text), text
        except Exception as e:
            logging.error('Error in docx file:: %s', e)
            return [], " "

    def convert_pdf_to_txt(self, pdf_file):
        """
        A utility function to convert a machine-readable PDF to raw text.

        :param pdf_file: Path to the .pdf file which should be converted
        :type pdf_file: str
        :return: ``(resume_lines, raw_text)``; ``([], " ")`` when extraction fails
        :rtype: tuple[list[str], str]
        """
        # Extraction sits inside the try block so that an unreadable PDF yields
        # the fallback result instead of an uncaught exception.
        try:
            with fitz.open(pdf_file) as doc:
                raw_text = "".join(page.get_text() for page in doc)
            # Log only the size: the text itself is personal data.
            logging.debug("Extracted %d characters from pdf", len(raw_text))

            # Tabs become spaces before bullet removal so "•<tab>" is caught too.
            full_string = raw_text.replace("\r", "\n").replace("\t", " ")
            # Remove awkward LaTeX bullet characters
            full_string = re.sub(r"\uf0b7", " ", full_string)
            full_string = re.sub(r"\(cid:\d{0,3}\)", " ", full_string)
            full_string = re.sub(r"• ", " ", full_string)
            return self._split_clean_lines(full_string), raw_text
        except Exception as e:
            logging.error('Error in pdf file:: %s', e)
            return [], " "

    def read_file(self, file, docx_parser="tika"):
        """
        Read a resume and return its cleaned lines.

        :param file: Path of the resume file (.docx, .doc, .pdf or .txt; the
            extension is matched case-insensitively)
        :param docx_parser: Forwarded to :meth:`convert_docx_to_txt`, which
            keeps it only for backward compatibility
        :return: Cleaned, non-empty lines of the resume; ``[]`` when a docx/pdf
            file could not be read; ``None`` for an unsupported extension
        :rtype: list[str] | None
        """
        print("Reading the Resume...")
        file = os.fspath(file)  # Accept both str and path-like objects
        suffix = file.lower()
        if suffix.endswith(("docx", "doc")):
            resume_lines, _raw_text = self.convert_docx_to_txt(file, docx_parser)
        elif suffix.endswith("pdf"):
            resume_lines, _raw_text = self.convert_pdf_to_txt(file)
        elif suffix.endswith("txt"):
            with open(file, "r", encoding="utf-8") as f:
                # Same clean-up as the other formats, so callers always get
                # stripped, non-blank lines.
                resume_lines = self._split_clean_lines(f.read())
        else:
            resume_lines = None
        return resume_lines