"""Train and evaluate a KNN resume-category classifier.

Steps: clean the resume text, encode the category labels, split into
train/test, build TF-IDF features, reduce them with Neighborhood Components
Analysis (NCA), then fit a k-nearest-neighbours classifier.

The fitted label encoder, TF-IDF vectorizer, NCA transform and KNN model are
written with joblib to the current working directory, and a classification
report plus a confusion-matrix heatmap are produced for the held-out 20%.
"""

import re

import joblib
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis
from sklearn.preprocessing import LabelEncoder

file_path = '~/Projects/hau/csstudy/resume-screening-and-classification/knn-trial/datasets/dataset_hr_edited.csv'
resumeDataSet = pd.read_csv(file_path)

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()

print(resumeDataSet['Category'].value_counts())

# Patterns are raw strings (so `\S`, `\s`, `\]` are not treated as string
# escapes) and are compiled once instead of on every call.
_URL_RE = re.compile(r'http\S+\s*')
# Word boundaries are required: a bare 'RT|cc' also cuts those letters out of
# ordinary words such as "accounting" or "SMART".
_RT_CC_RE = re.compile(r'\b(?:RT|cc)\b')
_HASHTAG_RE = re.compile(r'#\S+')
_MENTION_RE = re.compile(r'@\S+')
_PUNCTUATION_RE = re.compile('[%s]' % re.escape(r"""!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~"""))
_NON_ASCII_RE = re.compile(r'[^\x00-\x7f]')


def cleanResume(resumeText):
    """Normalise one resume into a space-separated string of stemmed tokens.

    Removes URLs, standalone "RT"/"cc" markers, hashtags, @-mentions, ASCII
    punctuation and non-ASCII characters; then lower-cases the remaining
    words, drops NLTK English stop words and applies Porter stemming.

    Args:
        resumeText: Raw resume text (a ``str``).

    Returns:
        The cleaned text as a single string of stems joined by one space.
    """
    text = _URL_RE.sub(' ', resumeText)
    text = _RT_CC_RE.sub(' ', text)
    text = _HASHTAG_RE.sub('', text)
    text = _MENTION_RE.sub(' ', text)
    text = _PUNCTUATION_RE.sub(' ', text)
    text = _NON_ASCII_RE.sub(' ', text)
    # str.split() with no argument already collapses runs of whitespace, so
    # no separate whitespace-normalising substitution is needed.
    lowered = (word.lower() for word in text.split())
    # Single pass: filter stop words and stem the survivors.
    return ' '.join(stemmer.stem(word) for word in lowered if word not in stop_words)


resumeDataSet['cleaned_resume'] = resumeDataSet.Resume.apply(cleanResume)

# Encode category names as integers; the encoder is saved so predictions can
# be mapped back to category names later.
le = LabelEncoder()
resumeDataSet['Category'] = le.fit_transform(resumeDataSet['Category'])
le_filename = 'label_encoder.joblib'
joblib.dump(le, le_filename)

requiredText = resumeDataSet['cleaned_resume'].values
requiredTarget = resumeDataSet['Category'].values

# Split BEFORE fitting any transformer. NCA is supervised, so fitting it (or
# the TF-IDF vocabulary/IDF weights) on the full dataset would leak test rows
# and test labels into the features and inflate the reported test accuracy.
# The split parameters themselves are unchanged.
text_train, text_test, y_train, y_test = train_test_split(
    requiredText,
    requiredTarget,
    random_state=42,
    test_size=0.2,
    shuffle=True,
    stratify=requiredTarget,
)

word_vectorizer = TfidfVectorizer(
    stop_words='english',
    sublinear_tf=True,
    max_features=18038,
)
word_vectorizer.fit(text_train)
joblib.dump(word_vectorizer, 'tfidf_vectorizer.joblib')

# NCA needs a dense array, hence .toarray(). It is fitted on the training
# features/labels only; the test features are merely projected.
nca = NeighborhoodComponentsAnalysis(n_components=300, random_state=42)
X_train = nca.fit_transform(word_vectorizer.transform(text_train).toarray(), y_train)
X_test = nca.transform(word_vectorizer.transform(text_test).toarray())
nca_filename = 'nca_model.joblib'
joblib.dump(nca, nca_filename)

print(X_train.shape)
print(X_test.shape)

# Hyper-parameter search, kept for reference (disabled):
# n_neighbors_values = list(range(1, 100, 2))  # odd values 1..99
# weights = ["uniform", "distance"]
# metric = ["euclidean", "manhattan", "minkowski", "cosine"]
# algorithm = ['ball_tree', 'kd_tree', 'brute', 'auto']
# param_grid = dict(n_neighbors=n_neighbors_values, weights=weights, metric=metric, algorithm=algorithm)
# knn = KNeighborsClassifier()
# gs = GridSearchCV(estimator=knn, param_grid=param_grid, scoring="accuracy", verbose=1, cv=10, n_jobs=3)
# grid_search = gs.fit(X_train, y_train)
# best_score = grid_search.best_score_
# best_parameters = grid_search.best_params_
# print("Best Score:", best_score)
# print("Best Parameters:", best_parameters)

knn = KNeighborsClassifier(
    n_neighbors=1,
    metric='manhattan',
    weights='uniform',
    algorithm='ball_tree',
)
knn.fit(X_train, y_train)
knnModel_filename = 'knn_model.joblib'
joblib.dump(knn, knnModel_filename)

prediction = knn.predict(X_test)
# NOTE(review): with n_neighbors=1 each training sample is normally its own
# nearest neighbour, so the training-set accuracy is not an informative number.
print('Accuracy of KNeighbors Classifier on training set: {:.2f}'.format(knn.score(X_train, y_train)))
print('Accuracy of KNeighbors Classifier on test set: {:.2f}'.format(knn.score(X_test, y_test)))
print("\n Classification report for classifier %s:\n%s\n" % (knn, metrics.classification_report(y_test, prediction)))

# Confusion matrix on the held-out set, labelled with the original category names.
confusion_matrix = metrics.confusion_matrix(y_test, prediction)
plt.figure(figsize=(10, 10))
sns.heatmap(
    confusion_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=le.classes_,
    yticklabels=le.classes_,
)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()