import os
import tempfile
from io import BytesIO

import streamlit as st
import spacy
from PyPDF2 import PdfReader
from pyresparser import ResumeParser
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load the spaCy model for natural language processing
nlp = spacy.load('en_core_web_sm')

# Function to extract text from PDF resumes
def extract_text_from_pdf(file):
    text = ""
    pdf_reader = PdfReader(file)
    for page in pdf_reader.pages:
        # extract_text() can return None for image-only pages
        text += page.extract_text() or ""
    return text

# Function to preprocess text using spaCy: lemmatize and drop
# stop words and non-alphabetic tokens
def preprocess_text(text):
    doc = nlp(text)
    tokens = [token.lemma_ for token in doc if not token.is_stop and token.is_alpha]
    return " ".join(tokens)

# Function to preprocess and combine the relevant resume fields
def preprocess_resume_data(resume_data):
    skills = " ".join(resume_data.get('skills', [])) if resume_data.get('skills') else ""
    experience = " ".join(resume_data.get('experience', [])) if resume_data.get('experience') else ""
    degree = " ".join(resume_data.get('degree', [])) if resume_data.get('degree') else ""
    combined_data = f"{skills} {experience} {degree}"
    return preprocess_text(combined_data)

# Main function to create the Streamlit app
def main():
    st.title("Resume Ranker and Prescreening Software")
    st.write("Upload resumes (in PDF format) and enter job descriptions or keywords to filter and rank them.")

    # Upload resumes (restricted to PDF files)
    uploaded_files = st.file_uploader("Upload Resumes (PDF files)", type=["pdf"], accept_multiple_files=True)

    # Input field for job description or keywords
    job_description = st.text_area("Enter Job Description or Keywords")

    if st.button("Rank Resumes"):
        if not uploaded_files:
            st.warning("Please upload one or more resumes.")
            return
        if not job_description:
            st.warning("Please enter a job description or keywords.")
            return

        # Preprocess the job description
        job_description_processed = preprocess_text(job_description)

        # Fit the vectorizer on the job description; resumes are projected
        # into the same vocabulary via transform() below
        vectorizer = TfidfVectorizer()
        job_vec = vectorizer.fit_transform([job_description_processed])

        # List to store file names along with their matching percentages
        file_responses = []

        # Loop through uploaded resumes
        for file in uploaded_files:
            # Read the uploaded PDF file into memory
            pdf_data = BytesIO(file.read())

            # pyresparser expects a file path, so save the PDF data to a temporary file
            with tempfile.NamedTemporaryFile(delete=False, suffix='.pdf') as temp_file:
                temp_file.write(pdf_data.getvalue())
                temp_file_path = temp_file.name

            try:
                # Parse and preprocess the resume
                data = ResumeParser(temp_file_path).get_extracted_data()
                if data:
                    combined_resume_data = preprocess_resume_data(data)
                    resume_vec = vectorizer.transform([combined_resume_data])
                    # TF-IDF vectors are L2-normalized, so cosine similarity
                    # equals their dot product; scale it to a percentage
                    similarity = cosine_similarity(resume_vec, job_vec)[0][0] * 100
                    # Record the file name and its similarity score
                    file_responses.append((file.name, similarity))
            finally:
                # Delete the temporary file (delete=False means it is not
                # removed automatically when the handle closes)
                os.unlink(temp_file_path)

        # Sort file responses by similarity in descending order
        file_responses.sort(key=lambda x: x[1], reverse=True)

        # Display the ranked file names and match percentages
        st.header("Ranked Resumes")
        for file_name, similarity in file_responses:
            st.write(f"Resume: {file_name}, Match Percentage: {similarity:.2f}%")

if __name__ == "__main__":
    main()
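
# --- Usage note (a sketch; the file name app.py is an assumption, not
# --- something fixed by the script itself) ---
#
# Assuming this script is saved as app.py, it can be launched with:
#
#   streamlit run app.py
#
# pyresparser builds on spaCy (and NLTK resources), so if parsing fails at
# startup, installing the parser and the spaCy model used above usually helps:
#
#   pip install pyresparser
#   python -m spacy download en_core_web_sm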