import joblib
import matplotlib.pyplot as plt
import pandas as pd
import re
import seaborn as sns
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from sklearn import metrics
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis
from sklearn.preprocessing import LabelEncoder

file_path = '~/Projects/hau/csstudy/resume-screening-and-classification/knn-trial/datasets/dataset_hr_edited.csv'

resumeDataSet = pd.read_csv(file_path)
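# The CSV is expected to provide 'Category' (target label) and 'Resume' (raw text) columns, used below.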

stop_words = set(stopwords.words('english'))
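# (requires the NLTK stopwords corpus: run nltk.download('stopwords') once beforehand)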
stemmer = PorterStemmer()

print(resumeDataSet['Category'].value_counts())

def cleanResume(resumeText):
    resumeText = re.sub(r'http\S+\s*', ' ', resumeText)  # remove URLs
    resumeText = re.sub(r'\bRT\b|\bcc\b', ' ', resumeText)  # remove RT and cc (whole tokens only)
    resumeText = re.sub(r'#\S+', '', resumeText)  # remove hashtags
    resumeText = re.sub(r'@\S+', ' ', resumeText)  # remove mentions
    resumeText = re.sub('[%s]' % re.escape("""!"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~"""), ' ', resumeText)  # remove punctuation
    resumeText = re.sub(r'[^\x00-\x7f]', ' ', resumeText)  # remove non-ASCII characters
    resumeText = re.sub(r'\s+', ' ', resumeText)  # collapse extra whitespace

    # lowercase, drop stopwords, and stem in a single pass
    words = [stemmer.stem(word.lower()) for word in resumeText.split()
             if word.lower() not in stop_words]
    return ' '.join(words)
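# Example (hypothetical input):
#   cleanResume("Worked at https://example.com, managing HR teams!!")
# yields roughly 'work manag hr team' after stopword removal and stemming.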

resumeDataSet['cleaned_resume'] = resumeDataSet['Resume'].apply(cleanResume)

le = LabelEncoder()
resumeDataSet['Category'] = le.fit_transform(resumeDataSet['Category'])
le_filename = 'label_encoder.joblib'
joblib.dump(le, le_filename)
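# The fitted encoder is saved so encoded predictions can later be mapped back
# to category names with inverse_transform.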

requiredText = resumeDataSet['cleaned_resume'].values
requiredTarget = resumeDataSet['Category'].values

word_vectorizer = TfidfVectorizer(
    stop_words='english',
    sublinear_tf=True,
    max_features=18038
)
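# max_features=18038 presumably matches the full vocabulary size of this
# dataset, so no terms are actually dropped.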

word_vectorizer.fit(requiredText)
joblib.dump(word_vectorizer, 'tfidf_vectorizer.joblib')
WordFeatures = word_vectorizer.transform(requiredText)
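# WordFeatures is a sparse matrix with one TF-IDF-weighted row per resume.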

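# NCA learns a supervised linear projection (TF-IDF space down to 400
# components) that pulls same-category resumes closer together, which suits
# the distance-based KNN classifier below. Note the .toarray() call: the
# dense conversion can be memory-hungry for large corpora.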
nca = NeighborhoodComponentsAnalysis(n_components=400, random_state=42)
WordFeatures = nca.fit_transform(WordFeatures.toarray(), requiredTarget)
nca_filename = 'nca_model.joblib'
joblib.dump(nca, nca_filename)

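# Stratified split: each category keeps the same proportion in train and test sets.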
X_train, X_test, y_train, y_test = train_test_split(
    WordFeatures, requiredTarget,
    random_state=42, test_size=0.2, shuffle=True, stratify=requiredTarget
)
print(X_train.shape)
print(X_test.shape)

# n_neighbors_values = list(range(1, 100, 2))  # odd values 1-99
# weights = ["uniform", "distance"]
# metric = ["euclidean", "manhattan", "minkowski", "cosine"]
# algorithm = ['ball_tree', 'kd_tree', 'brute', 'auto']
# param_grid = dict(n_neighbors=n_neighbors_values, weights=weights, metric=metric, algorithm=algorithm)
# knn = KNeighborsClassifier()
# gs = GridSearchCV(estimator=knn, param_grid=param_grid, scoring="accuracy", verbose=1, cv=10, n_jobs=3)
# grid_search = gs.fit(X_train, y_train)
# results_df = pd.DataFrame(grid_search.cv_results_)
# # results_df.to_excel('grid_search_results_with_nca_500.xlsx', index=False)
# # results_df.to_excel('grid_search_results_with_nca_400.xlsx', index=False)
# # results_df.to_excel('grid_search_results_with_nca_300.xlsx', index=False)
# # results_df.to_excel('grid_search_results_no_nca.xlsx', index=False)
# best_score = grid_search.best_score_
# best_parameters = grid_search.best_params_
# print("Best Score:", best_score)
# print("Best Parameters:", best_parameters)

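# The hyperparameters below match options from the grid above and presumably
# reflect its best result from an earlier run.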
knn = KNeighborsClassifier(n_neighbors=1,
                           metric='manhattan',
                           weights='uniform',
                           algorithm='kd_tree')
knn.fit(X_train, y_train)

knnModel_filename = 'knn_model.joblib'
joblib.dump(knn, knnModel_filename)

prediction = knn.predict(X_test)
print('Accuracy of KNeighbors Classifier on training set: {:.2f}'.format(knn.score(X_train, y_train)))
print('Accuracy of KNeighbors Classifier on test set: {:.2f}'.format(knn.score(X_test, y_test)))
print("\nClassification report for classifier %s:\n%s\n" % (knn, metrics.classification_report(y_test, prediction)))

conf_matrix = metrics.confusion_matrix(y_test, prediction)

plt.figure(figsize=(10, 10))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues',
            xticklabels=le.classes_, yticklabels=le.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix')
plt.show()
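
# Sketch: how the saved artifacts could be reloaded to classify a new resume.
# Inference must mirror training exactly: cleanResume -> TF-IDF transform ->
# NCA transform -> KNN predict. `sample_resume` is a hypothetical input, not
# taken from the dataset.
#
# le = joblib.load('label_encoder.joblib')
# word_vectorizer = joblib.load('tfidf_vectorizer.joblib')
# nca = joblib.load('nca_model.joblib')
# knn = joblib.load('knn_model.joblib')
#
# sample_resume = "Experienced HR generalist handling recruitment, payroll, and onboarding."
# cleaned = cleanResume(sample_resume)
# features = nca.transform(word_vectorizer.transform([cleaned]).toarray())
# print(le.inverse_transform(knn.predict(features)))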