import re

import joblib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis
from sklearn.preprocessing import LabelEncoder

file_path = '~/Projects/hau/csstudy/resume-screening-and-classification/knn-trial/datasets/dataset_hr_edited.csv'

resumeDataSet = pd.read_csv(file_path)

# NLTK stopwords require a one-time `nltk.download('stopwords')` beforehand.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

print(resumeDataSet['Category'].value_counts())
|
|
def cleanResume(resumeText):
    """Strip URLs, handles, punctuation, and non-ASCII text, then
    lowercase, remove stopwords, and stem the remaining tokens."""
    resumeText = re.sub(r'http\S+\s*', ' ', resumeText)  # remove URLs
    resumeText = re.sub(r'RT|cc', ' ', resumeText)       # remove RT and cc
    resumeText = re.sub(r'#\S+', '', resumeText)         # remove hashtags
    resumeText = re.sub(r'@\S+', ' ', resumeText)        # remove mentions
    resumeText = re.sub('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""), ' ', resumeText)  # remove punctuation
    resumeText = re.sub(r'[^\x00-\x7f]', ' ', resumeText)  # remove non-ASCII characters
    resumeText = re.sub(r'\s+', ' ', resumeText)         # collapse extra whitespace

    words = resumeText.split()
    words = [stemmer.stem(word.lower()) for word in words if word.lower() not in stop_words]
    return ' '.join(words)


resumeDataSet['cleaned_resume'] = resumeDataSet['Resume'].apply(cleanResume)
|
|
# Encode the category labels as integers; save the encoder so predictions
# can be mapped back to category names later.
le = LabelEncoder()
resumeDataSet['Category'] = le.fit_transform(resumeDataSet['Category'])
joblib.dump(le, 'label_encoder.joblib')

requiredText = resumeDataSet['cleaned_resume'].values
requiredTarget = resumeDataSet['Category'].values
|
# TF-IDF features over the cleaned resumes; the fitted vectorizer is saved
# so the same vocabulary is reused at prediction time.
word_vectorizer = TfidfVectorizer(
    stop_words='english',
    sublinear_tf=True,
    max_features=18038
)
word_vectorizer.fit(requiredText)
joblib.dump(word_vectorizer, 'tfidf_vectorizer.joblib')
WordFeatures = word_vectorizer.transform(requiredText)
|
# Supervised dimensionality reduction: NCA learns a linear projection that
# pulls same-class samples together, which suits the distance-based KNN.
# NCA needs a dense array, hence .toarray() on the sparse TF-IDF matrix.
nca = NeighborhoodComponentsAnalysis(n_components=400, random_state=42)
WordFeatures = nca.fit_transform(WordFeatures.toarray(), requiredTarget)
joblib.dump(nca, 'nca_model.joblib')

X_train, X_test, y_train, y_test = train_test_split(
    WordFeatures, requiredTarget,
    random_state=42, test_size=0.2, shuffle=True, stratify=requiredTarget
)
print(X_train.shape)
print(X_test.shape)
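
# Optional: a minimal GridSearchCV sketch showing how the KNN settings used
# below (n_neighbors=1, manhattan distance, uniform weights) could be chosen.
# The grid here is an illustrative assumption, not the exact search run for
# this dataset; flip the flag to rerun it.
RUN_GRID_SEARCH = False
if RUN_GRID_SEARCH:
    param_grid = {
        'n_neighbors': [1, 3, 5, 7],
        'metric': ['manhattan', 'euclidean'],
        'weights': ['uniform', 'distance'],
    }
    search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5, n_jobs=-1)
    search.fit(X_train, y_train)
    print('Best parameters:', search.best_params_)
    print('Best CV accuracy: {:.2f}'.format(search.best_score_))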
knn = KNeighborsClassifier(
    n_neighbors=1,
    metric='manhattan',
    weights='uniform',
    algorithm='kd_tree',
)
knn.fit(X_train, y_train)

joblib.dump(knn, 'knn_model.joblib')

prediction = knn.predict(X_test)
print('Accuracy of KNeighbors Classifier on training set: {:.2f}'.format(knn.score(X_train, y_train)))
print('Accuracy of KNeighbors Classifier on test set: {:.2f}'.format(knn.score(X_test, y_test)))
print("\nClassification report for classifier %s:\n%s\n" % (knn, metrics.classification_report(y_test, prediction)))
|
# Plot the confusion matrix with the original category names on both axes.
conf_matrix = metrics.confusion_matrix(y_test, prediction)

plt.figure(figsize=(10, 10))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=le.classes_, yticklabels=le.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()
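
# Inference sketch (illustrative): how the four saved artifacts could be
# loaded to classify a new resume. `sample_resume` is a hypothetical input;
# the steps must mirror training: clean -> TF-IDF -> NCA -> KNN -> decode.
le_loaded = joblib.load('label_encoder.joblib')
vectorizer_loaded = joblib.load('tfidf_vectorizer.joblib')
nca_loaded = joblib.load('nca_model.joblib')
knn_loaded = joblib.load('knn_model.joblib')

sample_resume = 'Experienced Python developer skilled in machine learning and data analysis.'
features = vectorizer_loaded.transform([cleanResume(sample_resume)])
features = nca_loaded.transform(features.toarray())
predicted = knn_loaded.predict(features)
print('Predicted category:', le_loaded.inverse_transform(predicted)[0])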
|
|