resume-screening-classification / train_classifier.py
chelscelis's picture
Upload 3 files
77e86cf
raw
history blame
4.7 kB
import joblib
import matplotlib.pyplot as plt
import pandas as pd
import re
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis
from sklearn.preprocessing import LabelEncoder
# Labelled resume data; the script reads the 'Category' column here and the
# 'Resume' column further down.
file_path = '~/Projects/hau/csstudy/resume-screening-and-classification/knn-trial/datasets/dataset_hr_edited.csv'
resumeDataSet = pd.read_csv(file_path)

# NLTK helpers shared by cleanResume(): a Porter stemmer and the English
# stop-word list, held as a set so membership tests are cheap.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be
# downloaded beforehand - confirm for fresh environments.
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

# Print the number of rows per category as a quick class-balance check.
print(resumeDataSet['Category'].value_counts())
def cleanResume(resumeText):
    """Normalise raw resume text into a string of stemmed, lower-cased tokens.

    The text is stripped of URLs, the stand-alone markers "RT" and "cc",
    hashtags, @-mentions, ASCII punctuation and non-ASCII characters.
    Whitespace is then collapsed, English stop words are dropped and each
    remaining word is lower-cased and stemmed.

    Args:
        resumeText: The raw resume text. Must be a ``str``; other values
            (for example NaN from an empty CSV cell) are not handled here.

    Returns:
        The cleaned tokens joined by single spaces (possibly an empty string).

    Note:
        Relies on the module-level ``stop_words`` set and ``stemmer``.
        Any copy of this function used at prediction time must apply the
        same rules, otherwise the features will not match the trained model.
    """
    resumeText = re.sub(r'http\S+\s*', ' ', resumeText)  # remove URLs
    # Word boundaries restrict this to the stand-alone markers; without them
    # the pattern also cuts "cc"/"RT" out of ordinary words ("account" ->
    # "a ount", "SUPPORT" -> "SUPPO ").
    resumeText = re.sub(r'\b(?:RT|cc)\b', ' ', resumeText)  # remove RT and cc
    resumeText = re.sub(r'#\S+', '', resumeText)  # remove hashtags
    resumeText = re.sub(r'@\S+', ' ', resumeText)  # remove mentions
    resumeText = re.sub(
        '[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""),
        ' ',
        resumeText,
    )  # remove punctuations
    resumeText = re.sub(r'[^\x00-\x7f]', ' ', resumeText)  # remove non-ASCII characters
    resumeText = re.sub(r'\s+', ' ', resumeText)  # remove extra whitespace

    # Single pass: lower-case once, skip stop words, stem what is left.
    lowered_words = (word.lower() for word in resumeText.split())
    words = [stemmer.stem(word) for word in lowered_words if word not in stop_words]
    return ' '.join(words)
# Clean every resume once; the cleaned text is what gets vectorised.
resumeDataSet['cleaned_resume'] = resumeDataSet['Resume'].apply(cleanResume)

# Encode the category names as integers and persist the encoder so the
# integer predictions can be mapped back to names later.
le = LabelEncoder()
resumeDataSet['Category'] = le.fit_transform(resumeDataSet['Category'])
le_filename = 'label_encoder.joblib'
joblib.dump(le, le_filename)

requiredText = resumeDataSet['cleaned_resume'].values
requiredTarget = resumeDataSet['Category'].values

# Split BEFORE fitting any transformer. TF-IDF learns its vocabulary/IDF
# weights from the data it is fitted on, and NCA is supervised (it is fitted
# with the labels), so fitting either on the full dataset would leak the
# held-out rows into the model and inflate the reported test accuracy.
train_text, test_text, y_train, y_test = train_test_split(
    requiredText,
    requiredTarget,
    random_state=42,
    test_size=0.2,
    shuffle=True,
    stratify=requiredTarget,
)

# TF-IDF features, fitted on the training text only.
word_vectorizer = TfidfVectorizer(
    stop_words='english',
    sublinear_tf=True,
    max_features=18038,
)
word_vectorizer.fit(train_text)
joblib.dump(word_vectorizer, 'tfidf_vectorizer.joblib')

# Reduce the TF-IDF features to 400 components with NCA. The sparse TF-IDF
# matrix is converted to a dense array before being handed to NCA; the test
# rows are only transformed, never used for fitting.
nca = NeighborhoodComponentsAnalysis(n_components=400, random_state=42)
X_train = nca.fit_transform(word_vectorizer.transform(train_text).toarray(), y_train)
X_test = nca.transform(word_vectorizer.transform(test_text).toarray())
nca_filename = 'nca_model.joblib'
joblib.dump(nca, nca_filename)

print(X_train.shape)
print(X_test.shape)
# n_neighbors_values = [1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 73, 75, 77, 79, 81, 83, 85, 87, 89, 91, 93, 95, 97, 99]
# weights = ["uniform", "distance"]
# metric = ["euclidean", "manhattan", "minkowski", "cosine"]
# algorithm = ['ball_tree', 'kd_tree', 'brute', 'auto']
# param_grid = dict(n_neighbors=n_neighbors_values, weights=weights, metric=metric, algorithm=algorithm)
# knn = KNeighborsClassifier()
# gs = GridSearchCV(estimator=knn, param_grid=param_grid, scoring="accuracy", verbose=1, cv=10, n_jobs=3)
# grid_search = gs.fit(X_train, y_train)
# results_df = pd.DataFrame(grid_search.cv_results_)
# # results_df.to_excel('grid_search_results_with_nca_500.xlsx', index=False)
# # results_df.to_excel('grid_search_results_with_nca_400.xlsx', index=False)
# # results_df.to_excel('grid_search_results_with_nca_300.xlsx', index=False)
# # results_df.to_excel('grid_search_results_no_nca.xlsx', index=False)
# best_score = grid_search.best_score_
# best_parameters = grid_search.best_params_
# print("Best Score:", best_score)
# print("Best Parameters:", best_parameters)
# Fit the final KNN classifier with fixed hyper-parameters and persist it.
knn = KNeighborsClassifier(
    n_neighbors=1,
    metric='manhattan',
    weights='uniform',
    algorithm='kd_tree',
)
knn.fit(X_train, y_train)
knnModel_filename = 'knn_model.joblib'
joblib.dump(knn, knnModel_filename)

# Evaluate on the training and held-out splits.
prediction = knn.predict(X_test)
print('Accuracy of KNeighbors Classifier on training set: {:.2f}'.format(knn.score(X_train, y_train)))
print('Accuracy of KNeighbors Classifier on test set: {:.2f}'.format(knn.score(X_test, y_test)))
print("\n Classification report for classifier %s:\n%s\n" % (knn, metrics.classification_report(y_test, prediction)))

# Pass the full set of encoded labels so the matrix always has one row and
# column per known class. Without `labels`, a class missing from both y_test
# and the predictions is dropped from the matrix, and the tick labels below
# (taken from le.classes_) would no longer line up with the cells.
confusion_matrix = metrics.confusion_matrix(
    y_test,
    prediction,
    labels=le.transform(le.classes_),
)
plt.figure(figsize=(10, 10))
sns.heatmap(
    confusion_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=le.classes_,
    yticklabels=le.classes_,
)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()