import altair as alt |
import joblib |
import nltk |
import numpy as np |
import pandas as pd |
import re |
import streamlit as st |
import time |
from gensim.corpora import Dictionary |
from gensim.models import KeyedVectors, TfidfModel |
from gensim.similarities import SoftCosineSimilarity, SparseTermSimilarityMatrix, WordEmbeddingSimilarityIndex |
from gensim.similarities.annoy import AnnoyIndexer |
from io import BytesIO |
from nltk import pos_tag, word_tokenize |
from nltk.corpus import stopwords, wordnet |
from nltk.stem import PorterStemmer, WordNetLemmatizer |
from pandas.api.types import is_categorical_dtype, is_numeric_dtype |
from PIL import Image |
from scipy.sparse import csr_matrix, hstack |
# NLTK data packages fetched when this module is imported.
for _resource in ('averaged_perceptron_tagger', 'punkt', 'stopwords', 'wordnet'):
    nltk.download(_resource)

# Module-level text-processing objects shared by the preprocessing functions.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
def addZeroFeatures(matrix, maxFeatures = 18038):
    """Right-pad a sparse feature matrix with all-zero columns.

    Args:
        matrix: 2-D scipy sparse matrix whose columns are features.
        maxFeatures: Number of columns the result should have. Defaults to
            18038, the width this function always padded to before the
            parameter existed.

    Returns:
        ``matrix`` itself when it already has ``maxFeatures`` columns or more;
        otherwise a new CSR matrix with float64 zero columns appended on the
        right so that it has exactly ``maxFeatures`` columns.
    """
    numDocs, numTerms = matrix.shape
    missingFeatures = maxFeatures - numTerms
    if missingFeatures <= 0:
        return matrix
    zeroFeatures = csr_matrix((numDocs, missingFeatures), dtype = np.float64)
    # Ask for CSR explicitly: hstack otherwise returns COO, which would make
    # the result's format depend on whether padding happened.
    return hstack([matrix, zeroFeatures], format = 'csr')
@st.cache_data(max_entries = 1, show_spinner = False)
def classifyResumes(df):
    """Predict an industry category for every resume in ``df``.

    The 'Resume' column is stemmed, vectorised with the stored TF-IDF
    vectorizer, padded, reduced with the stored NCA model and classified with
    the stored KNN model. Progress is reported through a Streamlit progress
    bar and a final ``st.info`` message with the elapsed time.

    Args:
        df: DataFrame containing a 'Resume' column of resume texts.

    Returns:
        A copy of ``df`` with an added categorical 'Industry Category' column.
        The caller's dataframe is left untouched.
    """
    progressBar = st.progress(0)
    progressBar.progress(0, text = "Preprocessing data ...")
    startTime = time.time()
    # Work on a copy: this function is cached, so mutating the argument would
    # change the caller's dataframe only on a cache miss.
    df = df.copy()
    resumeText = df['Resume'].apply(performStemming).values

    progressBar.progress(20, text = "Extracting features ...")
    vectorizer = loadTfidfVectorizer()
    wordFeatures = vectorizer.transform(resumeText)
    wordFeaturesWithZeros = addZeroFeatures(wordFeatures)

    progressBar.progress(40, text = "Reducing dimensionality ...")
    finalFeatures = dimensionalityReduction(wordFeaturesWithZeros)

    progressBar.progress(60, text = "Predicting categories ...")
    knn = loadKnnModel()
    predictedCategories = knn.predict(finalFeatures)

    progressBar.progress(80, text = "Finishing touches ...")
    le = loadLabelEncoder()
    df['Industry Category'] = pd.Categorical(le.inverse_transform(predictedCategories))

    elapsedSeconds = time.time() - startTime
    hours, remainder = divmod(int(elapsedSeconds), 3600)
    minutes, _ = divmod(remainder, 60)
    secondsWithDecimals = '{:.2f}'.format(elapsedSeconds % 60)
    elapsedTimeStr = f'{hours} h : {minutes} m : {secondsWithDecimals} s'

    progressBar.progress(100, text = 'Classification Complete!')
    # Leave the completed bar visible briefly before removing it.
    time.sleep(1)
    progressBar.empty()
    st.info(f'Finished classifying {len(resumeText)} resumes - {elapsedTimeStr}')
    return df
def clickClassify():
    """Set the ``processClf`` flag in Streamlit session state to True."""
    st.session_state['processClf'] = True
def clickRank():
    """Set the ``processRank`` flag in Streamlit session state to True."""
    st.session_state['processRank'] = True
def convertDfToXlsx(df):
    """Serialise ``df`` to an in-memory .xlsx workbook.

    The dataframe is written without its index to a sheet named 'Sheet1', and
    column A is given the '0.00' number format.

    Args:
        df: DataFrame to export.

    Returns:
        The workbook contents as ``bytes``.
    """
    output = BytesIO()
    # The context manager closes the writer even if writing or formatting
    # raises, so the workbook is always finalised or released.
    with pd.ExcelWriter(output, engine = 'xlsxwriter') as writer:
        df.to_excel(writer, index = False, sheet_name = 'Sheet1')
        numberFormat = writer.book.add_format({'num_format': '0.00'})
        writer.sheets['Sheet1'].set_column('A:A', None, numberFormat)
    return output.getvalue()
def createBarChart(df):
    """Build an Altair bar chart of resume counts per 'Industry Category'.

    Args:
        df: DataFrame with an 'Industry Category' column.

    Returns:
        An Altair chart with counts on the x axis and categories on the y axis.
    """
    # Two-column table: one row per category with its number of resumes.
    categoryCounts = (
        df['Industry Category']
        .value_counts()
        .rename_axis('Industry Category')
        .reset_index(name = 'Count')
    )
    countAxis = alt.X('Count:Q', axis = alt.Axis(format = 'd'), title = 'Number of Resumes')
    categoryAxis = alt.Y('Industry Category:N', title = 'Category')
    bars = alt.Chart(categoryCounts).mark_bar(color = '#56B6C2', size = 13)
    encoded = bars.encode(
        x = countAxis,
        y = categoryAxis,
        tooltip = ['Industry Category', 'Count'],
    )
    return encoded.properties(title = 'Number of Resumes per Category')
def dimensionalityReduction(features):
    """Transform ``features`` with the model stored in 'nca_model.joblib'.

    Args:
        features: Sparse feature matrix; it is converted to a dense array
            before being transformed.

    Returns:
        The output of the loaded model's ``transform``.
    """
    reducer = joblib.load('nca_model.joblib')
    denseFeatures = features.toarray()
    return reducer.transform(denseFeatures)
def _filterDataframe(df: pd.DataFrame, keyPrefix: str) -> pd.DataFrame:
    """Render filter widgets for ``df`` and return the filtered rows.

    Args:
        df: DataFrame to filter.
        keyPrefix: Prefix for every Streamlit widget key created here, so that
            several filter panels can coexist on one page.

    Returns:
        ``df`` itself when the "Add filters" toggle is off; otherwise a
        filtered copy.
    """
    modify = st.toggle("Add filters", key = f'{keyPrefix}-1')
    if not modify:
        return df
    df = df.copy()
    with st.container():
        toFilterColumns = st.multiselect("Filter table on", df.columns, key = f'{keyPrefix}-2')
        for position, column in enumerate(toFilterColumns):
            left, right = st.columns((1, 20))
            left.write("↳")
            widgetKey = f'{keyPrefix}-{position}-{column}'
            if isinstance(df[column].dtype, pd.CategoricalDtype):
                choices = df[column].unique()
                userCatInput = right.multiselect(
                    f'Values for {column}',
                    choices,
                    default = list(choices),
                    key = widgetKey
                )
                df = df[df[column].isin(userCatInput)]
            elif is_numeric_dtype(df[column]):
                _min = float(df[column].min())
                _max = float(df[column].max())
                # A constant column (or an empty/all-NaN one) has no usable
                # range: the slider step would be 0 or NaN.
                if not _min < _max:
                    right.caption(f'{column} has no range to filter on')
                    continue
                userNumInput = right.slider(
                    f'Values for {column}',
                    min_value = _min,
                    max_value = _max,
                    value = (_min, _max),
                    step = (_max - _min) / 100,
                    key = widgetKey
                )
                df = df[df[column].between(*userNumInput)]
            else:
                userTextInput = right.text_input(
                    f'Substring or regex in {column}',
                    key = widgetKey
                )
                if userTextInput:
                    values = df[column].astype(str)
                    # Match case-insensitively without lower-casing the
                    # pattern, which would turn escapes like \S into \s.
                    try:
                        matches = values.str.contains(userTextInput, case = False, regex = True)
                    except re.error:
                        # Not a valid regex: treat the input as plain text.
                        matches = values.str.contains(userTextInput, case = False, regex = False)
                    df = df[matches]
    return df


def filterDataframeClf(df: pd.DataFrame) -> pd.DataFrame:
    """Show the filter panel using 'filter-clf' widget keys and apply it to ``df``."""
    return _filterDataframe(df, 'filter-clf')


def filterDataframeRnk(df: pd.DataFrame) -> pd.DataFrame:
    """Show the filter panel using 'filter-rnk' widget keys and apply it to ``df``."""
    return _filterDataframe(df, 'filter-rnk')
def getWordnetPos(tag):
    """Translate a part-of-speech tag into a WordNet POS constant.

    Args:
        tag: Tag string; only its first character is significant.

    Returns:
        The WordNet constant for adjective ('J'), verb ('V'), noun ('N') or
        adverb ('R'); the noun constant for any other tag.
    """
    posByInitial = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'N': wordnet.NOUN,
        'R': wordnet.ADV,
    }
    return posByInitial.get(tag[:1], wordnet.NOUN)
def loadKnnModel():
    """Load and return the object stored in 'knn_model.joblib'."""
    return joblib.load('knn_model.joblib')
def loadLabelEncoder():
    """Load and return the object stored in 'label_encoder.joblib'."""
    return joblib.load('label_encoder.joblib')
def loadTfidfVectorizer():
    """Load and return the object stored in 'tfidf_vectorizer.joblib'."""
    return joblib.load('tfidf_vectorizer.joblib')
# Character class matching any single ASCII punctuation character.
_PUNCTUATION_PATTERN = '[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~""")


def _cleanResumeText(text):
    """Strip URLs, tags, punctuation and non-ASCII characters from ``text``.

    Non-string input (for example an empty spreadsheet cell read as NaN) is
    treated as empty text instead of raising inside ``re.sub``.
    """
    if not isinstance(text, str):
        return ''
    text = re.sub(r'http\S+\s*', ' ', text)    # URLs
    # NOTE(review): this also removes 'cc' inside words (e.g. "account").
    # Left as is because the stored models may depend on this exact
    # preprocessing; confirm before changing.
    text = re.sub(r'RT|cc', ' ', text)
    text = re.sub(r'#\S+', '', text)           # hashtags
    text = re.sub(r'@\S+', ' ', text)          # mentions
    text = re.sub(_PUNCTUATION_PATTERN, ' ', text)
    text = re.sub(r'[^\x00-\x7f]', r' ', text) # non-ASCII characters
    text = re.sub(r'\s+', ' ', text)           # collapse whitespace runs
    return text


def performLemmatization(text):
    """Clean ``text`` and return its lemmatized, stop-word-free tokens.

    Args:
        text: Raw document text.

    Returns:
        List of lower-cased lemmas, one per token that is not a stop word.
    """
    words = word_tokenize(_cleanResumeText(text))
    return [
        lemmatizer.lemmatize(word.lower(), pos = getWordnetPos(pos))
        for word, pos in pos_tag(words) if word.lower() not in stop_words
    ]


def performStemming(text):
    """Clean ``text`` and return its stemmed, stop-word-free tokens as a string.

    Args:
        text: Raw document text.

    Returns:
        The stems of all non-stop-word tokens, joined by single spaces.
    """
    words = word_tokenize(_cleanResumeText(text))
    return ' '.join(
        stemmer.stem(word.lower()) for word in words if word.lower() not in stop_words
    )
# cache_resource keeps a single shared instance of the loaded vectors.
# cache_data is meant for serialisable data and would pickle and copy this
# large model object.
@st.cache_resource
def loadModel():
    """Load the word vectors stored in 'wiki-news-300d-1M-subword.vec'.

    Returns:
        The ``KeyedVectors`` instance read from the word2vec-format file.
    """
    model_path = 'wiki-news-300d-1M-subword.vec'
    return KeyedVectors.load_word2vec_format(model_path)


model = loadModel()
@st.cache_data(max_entries = 1, show_spinner = False)
def rankResumes(text, df):
    """Rank resumes by soft-cosine similarity to a job description.

    The job description and every resume are lemmatized. A dictionary and
    TF-IDF model are built from them, and a term-similarity matrix is derived
    from the module-level word-vector ``model``. Each resume is then scored
    against the job description. Progress is reported through a Streamlit
    progress bar and a final ``st.info`` message with the elapsed time.

    Args:
        text: Job description text.
        df: DataFrame containing a 'Resume' column of resume texts.

    Returns:
        A copy of ``df`` with added 'Similarity Score (-1 to 1)' and 'Rank'
        columns, sorted by 'Rank' (1 = most similar; ties share a rank). The
        caller's dataframe is left untouched.
    """
    progressBar = st.progress(0)
    progressBar.progress(0, text = "Preprocessing data ...")
    startTime = time.time()
    # Work on a copy: this function is cached, so mutating the argument would
    # change the caller's dataframe only on a cache miss.
    df = df.copy()
    jobDescriptionText = performLemmatization(text)
    cleanedResumes = df['Resume'].apply(performLemmatization).tolist()
    documents = [jobDescriptionText] + cleanedResumes

    progressBar.progress(13, text = "Creating a dictionary ...")
    dictionary = Dictionary(documents)

    progressBar.progress(25, text = "Creating a TF-IDF model ...")
    tfidf = TfidfModel(dictionary = dictionary)

    progressBar.progress(38, text = "Creating a Similarity Index...")
    words = [word for word, _ in dictionary.most_common()]
    wordVectors = model.vectors_for_all(words, allow_inference = False)
    indexer = AnnoyIndexer(wordVectors, num_trees = 300)
    similarityIndex = WordEmbeddingSimilarityIndex(wordVectors, kwargs = {'indexer': indexer})

    progressBar.progress(50, text = "Creating a Similarity Matrix...")
    similarityMatrix = SparseTermSimilarityMatrix(similarityIndex, dictionary, tfidf)

    progressBar.progress(63, text = "Setting up job description as the query ...")
    query = tfidf[dictionary.doc2bow(jobDescriptionText)]

    progressBar.progress(75, text = "Calculating semantic similarities ...")
    index = SoftCosineSimilarity(
        tfidf[[dictionary.doc2bow(resume) for resume in cleanedResumes]],
        similarityMatrix
    )
    similarities = index[query]

    progressBar.progress(88, text = "Finishing touches ...")
    df['Similarity Score (-1 to 1)'] = similarities
    df['Rank'] = df['Similarity Score (-1 to 1)'].rank(ascending = False, method = 'dense').astype(int)
    df = df.sort_values(by = 'Rank')

    elapsedSeconds = time.time() - startTime
    hours, remainder = divmod(int(elapsedSeconds), 3600)
    minutes, _ = divmod(remainder, 60)
    secondsWithDecimals = '{:.2f}'.format(elapsedSeconds % 60)
    elapsedTimeStr = f'{hours} h : {minutes} m : {secondsWithDecimals} s'

    progressBar.progress(100, text = 'Ranking Complete!')
    # Leave the completed bar visible briefly before removing it.
    time.sleep(1)
    progressBar.empty()
    st.info(f'Finished ranking {len(df)} resumes - {elapsedTimeStr}')
    return df
def _showGuideImage(fileName):
    """Display one walkthrough screenshot, closing the image file afterwards."""
    with Image.open(fileName) as image:
        st.image(image, use_column_width = True, output_format = "PNG")


def writeGettingStarted():
    """Render the "getting started" page.

    Writes the welcome text, the input guide and the demo walkthrough for the
    Classify and Rank tabs, including the screenshots 'clf-1.png' to
    'clf-4.png' and 'rnk-1.png' to 'rnk-4.png'.
    """
    # NOTE: the Markdown strings below are indented with the code and rely on
    # Streamlit removing common leading whitespace before rendering.
    st.write("""
    ## Hello, Welcome!
    In today's competitive job market, the process of manually screening resumes has become a daunting task for recruiters and hiring managers.
    The sheer volume of applications received for a single job posting can make it extremely time-consuming to identify the most suitable candidates efficiently.
    This often leads to missed opportunities and the potential loss of top-tier talent.
    The ***Resume Screening & Classification*** website application aims to help alleviate the challenges posed by manual resume screening.
    The main objectives are:
    - To classify the resumes into their most suitable job industry category
    - To compare the resumes to the job description and rank them by similarity
    """)
    st.divider()
    st.write("""
    ## Input Guide
    #### For the Job Description:
    Ensure the job description is saved in a text (.txt) file.
    Kindly outline the responsibilities, qualifications, and skills associated with the position.
    #### For the Resumes:
    Resumes must be compiled in an excel (.xlsx) file.
    The organization of columns is up to you but ensure that the "Resume" column is present.
    The values under this column should include all the relevant details for each resume.
    """)
    st.divider()
    st.write("""
    ## Demo Walkthrough
    #### Classify Tab:
    The web app will classify the resumes into their most suitable job industry category.
    Currently the Category Scope consists of the following:
    """)
    column1, column2 = st.columns(2)
    with column1:
        st.write("""
        - Aviation
        - Business development
        - Culinary
        - Education
        - Engineering
        - Finance
        """)
    with column2:
        st.write("""
        - Fitness
        - Healthcare
        - HR
        - Information Technology
        - Public relations
        """)
    with st.expander('Classification Steps'):
        st.write("""
        ##### Upload Resumes & Start Processing:
        - Navigate to the "Classify" tab.
        - Upload the Excel file (.xlsx) containing the resumes you want to classify. Ensure that your Excel file has the "Resume" column containing the resume texts.
        - Click the "Start Processing" button.
        - The app will analyze the resumes and categorize them into job industry categories.
        ######
        """)
        _showGuideImage('clf-1.png')
        st.write("""
        ##### View Bar Chart:
        - A bar chart will appear, showing the number of resumes per category, helping you visualize the distribution.
        ######
        """)
        _showGuideImage('clf-2.png')
        st.write("""
        ##### Add Filters:
        - You can apply filters to the dataframe to narrow down your results.
        ######
        """)
        _showGuideImage('clf-3.png')
        st.write("""
        ##### Download Results:
        - Once you've applied filters or are satisfied with the results, you can download the current dataframe as an Excel file by clicking the "Save Current Output as XLSX" button.
        ####
        """)
        _showGuideImage('clf-4.png')
    st.write("""
    #### Rank Tab:
    The web app will rank the resumes based on their semantic similarity to the job description.
    The similarity score ranges from -1 to 1.
    A score of 1 is achieved when Document A and Document B are identical.
    ##### **Kindly take note:**
    It's important to note that these scores are not absolute and may change when more resumes are added in the comparison.
    The ranking algorithm dynamically adjusts its results based on the entire set of uploaded resumes.
    We recommend considering the scores as a relative measure rather than an absolute determination.
    """)
    with st.expander('Ranking Steps'):
        st.write("""
        ##### Upload Files & Start Processing:
        - Navigate to the "Rank" tab.
        - Upload the job description as a text file. This file should contain the description of the job you want to compare resumes against.
        - Upload the Excel file that contains the resumes you want to rank.
        - Click the "Start Processing" button.
        - The app will analyze the job description and rank the resumes based on their semantic similarity to the job description.
        ######
        """)
        _showGuideImage('rnk-1.png')
        st.write("""
        ##### View Job Description:
        - The output will display the contents of the job description for reference.
        ######
        """)
        _showGuideImage('rnk-2.png')
        st.write("""
        ##### Add Filters:
        - You can apply filters to the dataframe to narrow down your results.
        ######
        """)
        _showGuideImage('rnk-3.png')
        st.write("""
        ##### Download Results:
        - Once you've applied filters or are satisfied with the results, you can download the current dataframe as an Excel file by clicking the "Save Current Output as XLSX" button.
        ####
        """)
        _showGuideImage('rnk-4.png')