import os
import shutil
import tempfile
from itertools import zip_longest
from zipfile import BadZipFile

import numpy as np
import plotly.graph_objects as go
import PyPDF2
from docx import Document
# Langchain document loaders
from langchain.document_loaders import PyPDFLoader  # for pdf files
from langchain.document_loaders import TextLoader  # for text files
from langchain.document_loaders import Docx2txtLoader  # for docx files
from langchain.document_loaders import UnstructuredPowerPointLoader  # for pptx files
from pptx import Presentation
from PyPDF2.errors import PdfReadError

from constants import StreamlitException
from nlp import get_average_similarity_scores

# Directory (relative to the working directory) where uploads are persisted
# so that the path-based Langchain loaders can open them.
_DOWNLOAD_DIR = 'downloaded_files'

_PDF_MIME = "application/pdf"
_DOCX_MIME = "application/vnd.openxmlformats-officedocument.wordprocessingml.document"
_PPT_MIMES = (
    "application/vnd.ms-powerpoint",
    "application/vnd.openxmlformats-officedocument.presentationml.presentation",
)


def load_file(st, uploaded_file):
    """Persist an uploaded file and extract its text.

    The upload is copied to ``downloaded_files/<name>`` and then parsed
    according to its MIME type (``uploaded_file.type``).

    Args:
        st: Unused by this function; kept so existing call sites keep working.
        uploaded_file: The output of ``st.sidebar.file_uploader``. Must
            expose ``.type``, ``.name`` and behave as a seekable binary
            stream.

    Returns:
        On success, a tuple ``(resume_text_raw, lang_loader)``: the extracted
        text and a Langchain loader pointing at the saved copy.
        On failure, a ``StreamlitException`` instance. NOTE: the exception is
        *returned*, not raised, for an unsupported MIME type or unreadable
        content (``PdfReadError`` / ``BadZipFile``); callers must check the
        result type.
    """
    file_type = uploaded_file.type

    os.makedirs(_DOWNLOAD_DIR, exist_ok=True)

    # The file name is supplied by the client; keep only its final component
    # so the copy can never be written outside the download directory.
    download_path = os.path.join(_DOWNLOAD_DIR, os.path.basename(uploaded_file.name))

    # Stream the upload straight to its destination. Going through a
    # NamedTemporaryFile(delete=False) left an orphaned temp file per upload.
    uploaded_file.seek(0)
    with open(download_path, 'wb') as saved_copy:
        shutil.copyfileobj(uploaded_file, saved_copy)
    # Rewind so the extractors below parse the stream from its start rather
    # than from EOF.
    uploaded_file.seek(0)

    try:
        if file_type == _PDF_MIME:
            resume_text_raw = extract_pdf_text(uploaded_file)
            lang_loader = PyPDFLoader(download_path)
        elif file_type == _DOCX_MIME:
            resume_text_raw = extract_word_text(uploaded_file)
            lang_loader = Docx2txtLoader(download_path)
        elif file_type in _PPT_MIMES:
            resume_text_raw = extract_ppt_text(uploaded_file)
            lang_loader = UnstructuredPowerPointLoader(download_path)
        else:
            return StreamlitException("**Error**: Invalid file format. Please upload a Word, PDF, or PowerPoint file.")
    except (PdfReadError, BadZipFile):
        return StreamlitException("**Error**: Invalid file content. Please upload a valid Word, PDF, or PowerPoint file.")

    return resume_text_raw, lang_loader


# Function to extract text from a PDF file
def extract_pdf_text(file):
    """Return the text of every page of a PDF, one line per source line.

    Each extracted line is stripped of surrounding whitespace and terminated
    with ``".\\n"``.

    Args:
        file: A path or binary stream accepted by ``PyPDF2.PdfReader``.

    Returns:
        The concatenated text of all pages.
    """
    pdf_reader = PyPDF2.PdfReader(file)
    pieces = []
    for page in pdf_reader.pages:
        # Treat a page with no extractable text as an empty string.
        page_text = page.extract_text() or ""
        pieces.extend(line.strip() + ".\n" for line in page_text.split('\n'))
    return "".join(pieces)


# Function to extract text from a Word file
def extract_word_text(file):
    """Return the text of a Word document's paragraphs and tables.

    Paragraphs and tables are emitted alternately (paragraph 1, table 1,
    paragraph 2, table 2, ...), not in document order. Each paragraph is
    followed by ``'.\\n'``; each table cell by a tab and each table row by a
    newline.

    Args:
        file: A path or binary stream accepted by ``docx.Document``.

    Returns:
        The concatenated text.
    """
    doc = Document(file)
    pieces = []
    # zip_longest keeps going until BOTH sequences are exhausted, so tables
    # are not lost when a document has more tables than paragraphs.
    for paragraph, table in zip_longest(doc.paragraphs, doc.tables):
        if paragraph is not None:
            pieces.append(paragraph.text + '.\n')
        if table is not None:
            for row in table.rows:
                pieces.extend(cell.text + '\t' for cell in row.cells)
                pieces.append('\n')
    return ''.join(pieces)


# Function to extract text from a PowerPoint file
def extract_ppt_text(file):
    """Return the text of every text-bearing shape in a presentation.

    Args:
        file: A path or binary stream accepted by ``pptx.Presentation``.

    Returns:
        The text of each shape that has a text frame, in slide order,
        separated by newlines.
    """
    prs = Presentation(file)
    shape_texts = []
    for slide in prs.slides:
        for shape in slide.shapes:
            if shape.has_text_frame:
                shape_texts.append(shape.text_frame.text)
    # Separate shapes with a newline; plain concatenation fused the last word
    # of one shape with the first word of the next.
    return "\n".join(shape_texts)


# Function to plot the average similarity score for each job description phrase
def plot_similarity_scores(job_description_phrases, resume_phrases, top_n=10):
    """Build a horizontal bar chart of the best-matching job-description phrases.

    Scores come from ``get_average_similarity_scores``; the i-th score is
    used as the score of the i-th job-description phrase.

    Args:
        job_description_phrases: Sequence of phrases from the job description.
        resume_phrases: Phrases from the resume, passed through to
            ``get_average_similarity_scores``.
        top_n: Number of highest-scoring phrases to display (default 10).

    Returns:
        A ``plotly.graph_objects.Figure`` with one bar per displayed phrase,
        highest score at the top.
    """
    avg_similarity_scores = get_average_similarity_scores(job_description_phrases, resume_phrases)

    # (index, score) pairs, best first; the sort is stable so ties keep
    # their original phrase order.
    ranked = sorted(enumerate(avg_similarity_scores), key=lambda pair: pair[1], reverse=True)[:top_n]
    indices = [idx for idx, _ in ranked]
    top_scores = [score for _, score in ranked]
    y_pos = list(range(len(indices)))

    # Phrases longer than 100 characters are truncated with an ellipsis;
    # shorter ones are right-padded to 75 characters.
    labels = [
        s[:100] + '...' if len(s) > 100 else s.ljust(75)
        for s in np.array(job_description_phrases)[indices]
    ]

    x_ticks = np.round(np.arange(0, 1.2, 0.2), 2)

    fig = go.Figure()
    fig.add_trace(go.Bar(
        y=y_pos,
        x=top_scores,
        orientation='h'
    ))
    fig.update_layout(
        yaxis=dict(
            tickmode="array",
            tickvals=y_pos,
            ticktext=labels,
            tickfont=dict(size=14),
            autorange="reversed",
            side="right",
            automargin=True
        ),
        xaxis=dict(
            tickmode="array",
            tickvals=x_ticks,
            ticktext=x_ticks,
            tickfont=dict(size=14),
            range=[0, 1.05]
        ),
        showlegend=False,
        margin=dict(t=0)
    )
    fig.update_xaxes(title="Average Similarity Score", title_font=dict(size=14))
    return fig