# Spaces: Sleeping (hosting-page status text captured with the file; not part of the program)
import streamlit as st | |
import nltk | |
from nltk.corpus import stopwords | |
from nltk.tokenize import word_tokenize | |
from nltk.stem import PorterStemmer | |
from sklearn.feature_extraction.text import TfidfVectorizer | |
from sklearn.metrics.pairwise import cosine_similarity | |
from PyPDF2 import PdfReader | |
import os | |
from io import BytesIO | |
import pickle | |
import pdfminer | |
from pdfminer.high_level import extract_text | |
import re | |
# Fetch the NLTK data this module relies on when it is imported: the Punkt
# tokenizer models used by ``word_tokenize`` and the stop-word lists.
# NLTK 3.9 and later load the tokenizer from the 'punkt_tab' resource instead
# of 'punkt'; requesting both keeps the module working on either side of
# that change.
for _nltk_resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_nltk_resource)
def preprocess_text(text):
    """Normalise raw text into a space-separated string of stemmed tokens.

    The text is lower-cased and tokenised with ``word_tokenize``, English
    stop words are dropped, and every remaining token is reduced with the
    Porter stemmer.

    Args:
        text: The raw text to normalise.

    Returns:
        The surviving stemmed tokens joined by single spaces.
    """
    tokens = word_tokenize(text.lower())
    english_stop_words = set(stopwords.words('english'))
    kept_tokens = [token for token in tokens if token not in english_stop_words]
    porter = PorterStemmer()
    return ' '.join(map(porter.stem, kept_tokens))
def extract_text_from_pdf(pdf_content):
    """Extract the text of every page of an in-memory PDF.

    Args:
        pdf_content: The raw bytes of a PDF document.

    Returns:
        The text of all pages concatenated in page order. No separator is
        inserted between pages.
    """
    reader = PdfReader(BytesIO(pdf_content))
    # ``or ''`` keeps a page that yields a falsy result (for example a page
    # with no extractable text) from breaking the concatenation; ``join``
    # avoids rebuilding the string once per page.
    return ''.join(page.extract_text() or '' for page in reader.pages)
def clean_pdf_text(text):
    """Strip URLs, social-media markers, punctuation and non-ASCII noise.

    The substitutions run in this order:

    1. URLs (anything starting with ``http``) plus trailing whitespace.
    2. The stand-alone tokens ``RT`` and ``cc``.
    3. Hashtags (``#word``) and mentions (``@word``).
    4. ASCII punctuation, then every non-ASCII character.
    5. Runs of whitespace, collapsed to one space.

    Args:
        text: Raw text extracted from a resume.

    Returns:
        The cleaned text. Leading or trailing whitespace is collapsed to a
        single space rather than removed.
    """
    text = re.sub(r'http\S+\s*', ' ', text)
    # Word boundaries are required: a bare 'RT|cc' also matches inside
    # ordinary words and turns "accounting" into "a ounting", which corrupts
    # the vocabulary used for matching.
    text = re.sub(r'\b(?:RT|cc)\b', ' ', text)
    text = re.sub(r'#\S+', '', text)
    text = re.sub(r'@\S+', ' ', text)
    text = re.sub(r'[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', text)
    text = re.sub(r'[^\x00-\x7f]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text
def extract_candidate_name(text):
    """Return the first "Firstname Lastname" pair found in *text*.

    A name is two consecutive capitalised words (one upper-case letter
    followed by lower-case letters) separated by a single whitespace
    character, optionally preceded by ``Mr.``, ``Ms.`` or ``Mrs.``. Adjust
    the pattern if your naming conventions differ.

    Args:
        text: The text to search.

    Returns:
        The matched name, including the title when one is present, or the
        string ``"Candidate Name Not Found"`` when nothing matches.
    """
    pattern = r'(?:Mr\.|Ms\.|Mrs\.)?\s?([A-Z][a-z]+)\s([A-Z][a-z]+)'
    match = re.search(pattern, text)
    if match is None:
        return "Candidate Name Not Found"
    # The optional leading ``\s?`` is part of the overall match, so without
    # stripping the name would come back with a leading space.
    return match.group(0).strip()
def calculate_similarity(job_description, cvs, cv_file_names):
    """Score each CV against the job description and rank the results.

    The job description and all CVs are run through ``preprocess_text`` and
    vectorised together with one ``TfidfVectorizer``. Each CV's score is the
    cosine similarity between its TF-IDF vector and the job description's.

    Args:
        job_description: The job description text.
        cvs: The CV texts, one string per CV.
        cv_file_names: The file names, in the same order as ``cvs``.

    Returns:
        A list of ``(file_name, score)`` tuples sorted by score, highest
        first. Empty when there are no CVs.
    """
    processed_job_desc = preprocess_text(job_description)
    processed_cvs = [preprocess_text(cv) for cv in cvs]
    if not processed_cvs:
        # Nothing to compare against; also avoids handing an empty matrix
        # slice to cosine_similarity below.
        return []

    tfidf_matrix = TfidfVectorizer().fit_transform([processed_job_desc] + processed_cvs)
    # Row 0 is the job description and rows 1.. are the CVs. Comparing only
    # that one row against the CV rows avoids building the full n x n
    # similarity matrix just to read a single row from it.
    similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]
    return sorted(
        zip(cv_file_names, similarity_scores),
        key=lambda pair: pair[1],
        reverse=True,
    )
def rank_and_shortlist(job_description, cv_files, threshold=0.15):
    """Rank uploaded CVs against a job description and pick a shortlist.

    Each file is read, converted to text with ``extract_text_from_pdf``,
    cleaned with ``clean_pdf_text`` and scored by ``calculate_similarity``.

    Args:
        job_description: The job description text.
        cv_files: File objects exposing ``read()`` and a ``name`` attribute.
        threshold: A CV is shortlisted only when its score is strictly
            greater than this value.

    Returns:
        A ``(ranked_cvs, shortlisted_cvs)`` pair. Both are lists of
        ``(file_name, score)`` tuples in the order produced by
        ``calculate_similarity``; the second holds only the entries above
        ``threshold``.
    """
    raw_texts = []
    for uploaded in cv_files:
        raw_texts.append(extract_text_from_pdf(uploaded.read()))
    names = [uploaded.name for uploaded in cv_files]
    cleaned_texts = list(map(clean_pdf_text, raw_texts))

    ranked_cvs = list(calculate_similarity(job_description, cleaned_texts, names))
    shortlisted_cvs = [entry for entry in ranked_cvs if entry[1] > threshold]
    return ranked_cvs, shortlisted_cvs
def _render_scores(entries):
    """Write one markdown line per ``(file name, score)`` pair."""
    for file_name, score in entries:
        st.markdown(f"**File Name:** {file_name}, **Similarity Score:** {score:.2f}")


def main():
    """Render the Streamlit page and show rankings once the form is submitted.

    The page has a text area for the job description, a multi-file PDF
    uploader and a Submit button. After a submit with both inputs present,
    the full ranking is listed, followed by the shortlist (or "None" when
    the shortlist is empty). If either input is missing, a prompt is shown
    instead.
    """
    st.title("Resume Ranking App")
    st.write("Upload the Job Description:")
    job_description = st.text_area("Job Description", height=200, key='job_description')
    st.write("Upload the Resumes (PDFs):")
    cv_files = st.file_uploader("Choose PDF files", accept_multiple_files=True, type=["pdf"], key='cv_files')

    if not st.button("Submit"):
        return
    if not (job_description and cv_files):
        st.write("Please upload both the job description and resumes to proceed.")
        return

    ranked_cvs, shortlisted_cvs = rank_and_shortlist(job_description, cv_files)

    # Markdown headings make the section titles render larger than body text.
    st.markdown("### Ranking of Resumes:")
    _render_scores(ranked_cvs)

    st.markdown("### Shortlisted Candidates:")
    if shortlisted_cvs:
        _render_scores(shortlisted_cvs)
    else:
        st.markdown("None")


if __name__ == "__main__":
    main()