File size: 4,103 Bytes
039d1b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1fe926a
039d1b5
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1fe926a
 
 
 
 
 
 
039d1b5
 
1fe926a
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
import streamlit as st
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from PyPDF2 import PdfReader
import os
from io import BytesIO
import pickle
import pdfminer
from pdfminer.high_level import extract_text
import re

# Fetch the NLTK data packages used by the tokenizer and stop-word lookups
# further down in this file.
for _nltk_resource in ('punkt', 'stopwords'):
    nltk.download(_nltk_resource)

def preprocess_text(text):
    """Normalize text into a string of stemmed, stop-word-free tokens.

    The text is lower-cased and tokenized, English stop words are dropped,
    and every remaining token is reduced to its Porter stem.

    Args:
        text: Raw text to normalize.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    tokens = word_tokenize(text.lower())
    english_stops = set(stopwords.words('english'))
    kept_tokens = [token for token in tokens if token not in english_stops]

    stem = PorterStemmer().stem
    return ' '.join(map(stem, kept_tokens))

def extract_text_from_pdf(pdf_content):
    """Extract the text of every page of an in-memory PDF.

    Args:
        pdf_content: Raw bytes of the PDF document.

    Returns:
        The text of all pages in page order, separated by newlines.
    """
    pdf_reader = PdfReader(BytesIO(pdf_content))
    # Separate pages with a newline so the last word of one page is not fused
    # with the first word of the next, and build the result with one join
    # instead of repeated string concatenation.
    return '\n'.join(page.extract_text() for page in pdf_reader.pages)

def clean_pdf_text(text):
    """Remove URLs, social-media markers, punctuation and non-ASCII characters.

    The substitutions run in this order:
      1. URLs starting with "http" (plus trailing whitespace) become a space.
      2. The standalone tokens "RT" and "cc" become a space.
      3. Hashtags are deleted; @-mentions become two spaces.
      4. ASCII punctuation characters become spaces.
      5. Non-ASCII characters become spaces.
      6. Every run of whitespace collapses to a single space.

    Leading/trailing whitespace is collapsed but not stripped.

    Args:
        text: Raw text, e.g. as extracted from a PDF.

    Returns:
        The cleaned text.
    """
    # Raw strings keep the regex escapes (\S, \s, \b) from being treated as
    # invalid Python string escapes.
    text = re.sub(r'http\S+\s*', ' ', text)
    # Word boundaries keep "cc"/"RT" from being cut out of the middle of
    # ordinary words such as "account" or "success".
    text = re.sub(r'\b(?:RT|cc)\b', ' ', text)
    text = re.sub(r'#\S+', '', text)
    text = re.sub(r'@\S+', '  ', text)
    text = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', text)
    text = re.sub(r'[^\x00-\x7f]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text

def extract_candidate_name(text):
    """Return the first "Firstname Lastname"-shaped match found in the text.

    A name is two consecutive capitalized words (one upper-case letter
    followed by lower-case letters), optionally preceded by "Mr.", "Ms." or
    "Mrs.". The title, when present, is part of the returned value.

    Args:
        text: Text to search.

    Returns:
        The matched name without surrounding whitespace, or the string
        "Candidate Name Not Found" when nothing matches.
    """
    # Modify the regex pattern according to your naming conventions
    pattern = r'(?:Mr\.|Ms\.|Mrs\.)?\s?([A-Z][a-z]+)\s([A-Z][a-z]+)'
    match = re.search(pattern, text)
    if match is None:
        return "Candidate Name Not Found"
    # The optional leading \s? can capture the whitespace before the name;
    # strip it so callers get the bare name.
    return match.group(0).strip()

def calculate_similarity(job_description, cvs, cv_file_names):
    """Score each CV against the job description with TF-IDF cosine similarity.

    The job description and all CVs are preprocessed, vectorized together
    with a single TfidfVectorizer, and each CV vector is compared with the
    job-description vector.

    Args:
        job_description: Job description text.
        cvs: CV texts, one string per CV.
        cv_file_names: Names paired positionally with the entries of cvs.

    Returns:
        A list of (file_name, score) tuples sorted by score, highest first.
        An empty list when no CVs are given.
    """
    processed_cvs = [preprocess_text(cv) for cv in cvs]
    if not processed_cvs:
        # Nothing to rank; skip vectorization altogether.
        return []

    all_text = [preprocess_text(job_description)] + processed_cvs
    tfidf_matrix = TfidfVectorizer().fit_transform(all_text)

    # Row 0 is the job description. Compare it only with the CV rows rather
    # than computing the full pairwise matrix and discarding all but one row.
    similarity_scores = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:])[0]

    return sorted(
        zip(cv_file_names, similarity_scores),
        key=lambda pair: pair[1],
        reverse=True,
    )

def rank_and_shortlist(job_description, cv_files, threshold=0.15):
    """Rank uploaded CVs against a job description and pick the best matches.

    Args:
        job_description: Job description text.
        cv_files: File-like PDF objects exposing read() and a name attribute.
        threshold: A CV is shortlisted when its score is strictly greater
            than this value.

    Returns:
        A pair (ranked_cvs, shortlisted_cvs). Both are lists of
        (file_name, score) tuples in descending score order; the second
        holds only the entries above the threshold.
    """
    raw_texts = []
    for uploaded in cv_files:
        raw_texts.append(extract_text_from_pdf(uploaded.read()))
    names = [uploaded.name for uploaded in cv_files]
    cleaned_texts = list(map(clean_pdf_text, raw_texts))

    ranked_cvs = list(calculate_similarity(job_description, cleaned_texts, names))
    shortlisted_cvs = [entry for entry in ranked_cvs if entry[1] > threshold]

    return ranked_cvs, shortlisted_cvs

def main():
    """Render the Streamlit page: collect inputs, then show ranked resumes.

    The page takes a job description and one or more PDF resumes. After
    "Submit" is pressed with both inputs present, it lists every resume with
    its similarity score, followed by the shortlisted subset.
    """
    st.title("Resume Ranking App")

    st.write("Upload the Job Description:")
    job_description = st.text_area("Job Description", height=200, key='job_description')

    st.write("Upload the Resumes (PDFs):")
    cv_files = st.file_uploader("Choose PDF files", accept_multiple_files=True, type=["pdf"], key='cv_files')

    submitted = st.button("Submit")
    # Show the prompt both before submission and when Submit is pressed with
    # an input missing, so a click never results in a silent no-op.
    if not (submitted and job_description and cv_files):
        st.write("Please upload both the job description and resumes to proceed.")
        return

    # Rank and shortlist candidates
    ranked_cvs, shortlisted_cvs = rank_and_shortlist(job_description, cv_files)

    # Display ranking with larger text
    st.markdown("### Ranking of Resumes:")
    for cv_name, score in ranked_cvs:
        st.markdown(f"**File Name:** {cv_name}, **Similarity Score:** {score:.2f}")

    # Display shortlisted candidates with larger text
    st.markdown("### Shortlisted Candidates:")
    if not shortlisted_cvs:
        st.markdown("None")
        return
    for cv_name, score in shortlisted_cvs:
        st.markdown(f"**File Name:** {cv_name}, **Similarity Score:** {score:.2f}")

# Start the app only when this file is executed as the main module,
# not when it is imported.
if __name__ == "__main__":
    main()