"""Streamlit app that ranks uploaded resumes against a job description.

Each resume (PDF, DOCX or TXT) is converted to plain text, cleaned,
tokenised/stemmed with NLTK and compared to the job description using
TF-IDF vectors and cosine similarity.  Resumes whose score reaches a
threshold are additionally listed as "shortlisted".
"""
import os
import pickle
import re
import string
import tempfile
from functools import lru_cache
from io import BytesIO

import nltk
import pdfminer
import PyPDF2
import streamlit as st
import textract
from docx import Document
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
from pdfminer.high_level import extract_text
from PyPDF2 import PdfReader
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Tokeniser model and stop-word list used by preprocess_text().
nltk.download('punkt')
nltk.download('stopwords')

# Patterns used by clean_pdf_text(), compiled once instead of on every call.
_URL_RE = re.compile(r'http\S+\s*')
# Word boundaries keep "RT"/"cc" from being cut out of ordinary words
# such as "account" or "SMART".
_RT_CC_RE = re.compile(r'\b(?:RT|cc)\b')
_HASHTAG_RE = re.compile(r'#\S+')
_MENTION_RE = re.compile(r'@\S+')
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(string.punctuation))
_NON_ASCII_RE = re.compile(r'[^\x00-\x7f]')
_WHITESPACE_RE = re.compile(r'\s+')

# Optional title followed by two capitalised words.
_NAME_RE = re.compile(r'(?:Mr\.|Ms\.|Mrs\.)?\s?([A-Z][a-z]+)\s([A-Z][a-z]+)')

# A single stemmer instance is reused across calls.
_STEMMER = PorterStemmer()


@lru_cache(maxsize=1)
def _english_stop_words():
    """Return the NLTK English stop-word list as a set, loaded once."""
    return frozenset(stopwords.words('english'))


def preprocess_text(text):
    """Lower-case, tokenise, drop English stop words and stem *text*.

    Args:
        text: Raw text to normalise.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    stop_words = _english_stop_words()
    return ' '.join(
        _STEMMER.stem(word)
        for word in word_tokenize(text.lower())
        if word not in stop_words
    )


def extract_text_from_pdf(pdf_content):
    """Extract the text of every page of a PDF.

    Args:
        pdf_content: The PDF file content as bytes.

    Returns:
        The concatenated text of all pages.
    """
    pdf_reader = PdfReader(BytesIO(pdf_content))
    # A page without extractable text contributes an empty string rather
    # than breaking the concatenation if the extractor yields None.
    return ''.join(page.extract_text() or '' for page in pdf_reader.pages)


def extract_text_from_docx(docx_content):
    """Extract the paragraph text of a DOCX document.

    Args:
        docx_content: The DOCX file content as bytes.

    Returns:
        All paragraph texts joined by single spaces.
    """
    doc = Document(BytesIO(docx_content))
    return " ".join(paragraph.text for paragraph in doc.paragraphs)


def extract_text_from_txt(txt_content):
    """Decode the content of a plain-text file.

    Args:
        txt_content: The file content as bytes (a ``str`` is returned
            unchanged).

    Returns:
        The content decoded as UTF-8; undecodable bytes are dropped.
    """
    if isinstance(txt_content, str):
        return txt_content
    return txt_content.decode('utf-8', errors='ignore')


# Maps a lower-case file extension to the function that turns the file's
# bytes into text.  Shared by extract_text_from_resume() and
# rank_and_shortlist() so both support exactly the same formats.
_EXTRACTORS = {
    '.pdf': extract_text_from_pdf,
    '.docx': extract_text_from_docx,
    '.txt': extract_text_from_txt,
}


def extract_text_from_resume(file_path):
    """Extract text from a resume stored on disk.

    Args:
        file_path: Path to a ``.pdf``, ``.docx`` or ``.txt`` file.

    Returns:
        The extracted text.

    Raises:
        ValueError: If the file extension is not supported.
        OSError: If the file cannot be read.
    """
    file_extension = os.path.splitext(file_path)[1].lower()
    extractor = _EXTRACTORS.get(file_extension)
    # Reject unsupported formats before touching the file system.
    if extractor is None:
        raise ValueError(
            f"Unsupported file format: {file_extension.lstrip('.')}"
        )
    # The extractors operate on file content, not on paths.
    with open(file_path, 'rb') as resume_file:
        return extractor(resume_file.read())


def clean_pdf_text(text):
    """Strip URLs, tags, punctuation and non-ASCII characters from *text*.

    Args:
        text: Raw extracted resume text.

    Returns:
        The cleaned text with runs of whitespace collapsed to one space.
    """
    text = _URL_RE.sub(' ', text)
    text = _RT_CC_RE.sub(' ', text)
    text = _HASHTAG_RE.sub('', text)
    # Removes everything from "@" up to the next whitespace.
    text = _MENTION_RE.sub(' ', text)
    text = _PUNCTUATION_RE.sub(' ', text)
    text = _NON_ASCII_RE.sub(' ', text)
    text = _WHITESPACE_RE.sub(' ', text)
    return text


def extract_candidate_name(text):
    """Return the first "Firstname Lastname"-shaped match found in *text*.

    This is a heuristic: it picks the first two consecutive capitalised
    words, optionally preceded by ``Mr.``, ``Ms.`` or ``Mrs.``.

    Args:
        text: Resume text to search.

    Returns:
        The matched name (including a title if present), or
        ``"Candidate Name Not Found"`` when nothing matches.
    """
    match = _NAME_RE.search(text)
    if match:
        # The optional leading whitespace in the pattern can end up in the
        # match; do not hand it to the caller.
        return match.group(0).strip()
    return "Candidate Name Not Found"


def calculate_similarity(job_description, cvs, cv_file_names):
    """Score each CV against the job description with TF-IDF cosine similarity.

    Args:
        job_description: Job description text.
        cvs: CV texts, one per candidate.
        cv_file_names: File names, parallel to *cvs*.

    Returns:
        A list of ``(file_name, score)`` tuples sorted by descending score.
    """
    if not cvs:
        return []

    documents = [preprocess_text(job_description)]
    documents.extend(preprocess_text(cv) for cv in cvs)

    try:
        tfidf_matrix = TfidfVectorizer().fit_transform(documents)
    except ValueError:
        # Raised when no document contains a usable term (e.g. only stop
        # words or scanned PDFs without text): nothing can be similar.
        return [(name, 0.0) for name in cv_file_names]

    # Only the job description (row 0) versus the CVs is needed, not the
    # full pairwise matrix.
    similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]
    return sorted(
        zip(cv_file_names, similarity_scores),
        key=lambda item: item[1],
        reverse=True,
    )


def rank_and_shortlist(job_description, cv_files, threshold=0.15):
    """Rank uploaded CV files and select those reaching *threshold*.

    Files with an unsupported extension, or whose text cannot be
    extracted, are skipped with a Streamlit warning.

    Args:
        job_description: Job description text.
        cv_files: Uploaded file objects exposing ``name`` and ``read()``.
        threshold: Minimum similarity score for a CV to be shortlisted.

    Returns:
        A ``(ranked_cvs, shortlisted_cvs)`` tuple of lists of
        ``(file_name, score)`` tuples, both sorted by descending score.
        Both lists are empty when no CV could be processed.
    """
    cv_texts = []
    cv_file_names = []
    for cv_file in cv_files:
        file_extension = os.path.splitext(cv_file.name)[1].lower()
        extractor = _EXTRACTORS.get(file_extension)
        if extractor is None:
            st.warning(f"Unsupported file format: {file_extension}. Skipping file: {cv_file.name}")
            continue
        try:
            cv_text = clean_pdf_text(extractor(cv_file.read()))
        except Exception as e:
            # One unreadable upload must not abort the whole ranking.
            st.warning(f"Error processing file '{cv_file.name}': {str(e)}")
            continue
        cv_texts.append(cv_text)
        cv_file_names.append(cv_file.name)

    if not cv_texts:
        st.error("No valid resumes found. Please upload resumes in supported formats (PDF, DOCX, or TXT).")
        return [], []

    ranked_cvs = calculate_similarity(job_description, cv_texts, cv_file_names)
    shortlisted_cvs = [
        (cv_name, score) for cv_name, score in ranked_cvs if score >= threshold
    ]
    return ranked_cvs, shortlisted_cvs


def main():
    """Render the Streamlit UI and show ranking results on submit."""
    st.title("Resume Ranking App")

    st.write("Upload the Job Description:")
    job_description = st.text_area("Job Description", height=200, key='job_description')

    st.write("Upload the Resumes :")
    cv_files = st.file_uploader("Choose files", accept_multiple_files=True, key='cv_files')

    if not st.button("Submit"):
        return

    # A whitespace-only description carries no terms to match against.
    if not (job_description and job_description.strip() and cv_files):
        st.write("Please upload both the job description and resumes to proceed.")
        return

    # Rank and shortlist candidates
    ranked_cvs, shortlisted_cvs = rank_and_shortlist(job_description, cv_files)
    if not ranked_cvs:
        # rank_and_shortlist() has already reported why nothing was ranked.
        return

    # Display ranking with larger text
    st.markdown("### Ranking of Resumes:")
    for file_name, score in ranked_cvs:
        st.markdown(f"**File Name:** {file_name}, **Similarity Score:** {score:.2f}")

    # Display shortlisted candidates with larger text
    st.markdown("### Shortlisted Candidates:")
    if not shortlisted_cvs:
        st.markdown("None")
    else:
        for file_name, score in shortlisted_cvs:
            st.markdown(f"**File Name:** {file_name}, **Similarity Score:** {score:.2f}")


if __name__ == "__main__":
    main()