chelscelis
committed on
Commit
•
e236812
1
Parent(s):
fc797e5
Upload 5 files
Browse files
- knn_model.joblib +2 -2
- nca_model.joblib +2 -2
- tfidf_vectorizer.joblib +1 -1
- train_classifier.py +6 -6
knn_model.joblib
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:63bcb5d47a13120d48d2fda296c68c5f96102f00ec9d6811909b2625228a5cd3
|
3 |
+
size 4293046
|
nca_model.joblib
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
-
size
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:6f123db7c979eb889c1943ef83369ba45078628adf30063f4908dd1e039d835a
|
3 |
+
size 43294492
|
tfidf_vectorizer.joblib
CHANGED
@@ -1,3 +1,3 @@
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
-
oid sha256:
|
3 |
size 794455
|
|
|
1 |
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3d3defdb1fda27499f742f2da47e46eb9d10bf33fd0bb8922a3ee465d7e8bef6
|
3 |
size 794455
|
train_classifier.py
CHANGED
@@ -11,14 +11,14 @@ from sklearn.model_selection import train_test_split, GridSearchCV
|
|
11 |
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis
|
12 |
from sklearn.preprocessing import LabelEncoder
|
13 |
|
14 |
-
file_path = '~/Projects/hau/csstudy/resume-screening-and-classification/knn-trial/datasets/
|
15 |
|
16 |
-
resumeDataSet = pd.
|
17 |
|
18 |
stop_words = set(stopwords.words('english'))
|
19 |
stemmer = PorterStemmer()
|
20 |
|
21 |
-
print (resumeDataSet['Category'].value_counts())
|
22 |
|
23 |
def cleanResume(resumeText):
|
24 |
resumeText = re.sub('http\S+\s*', ' ', resumeText) # remove URLs
|
@@ -38,12 +38,12 @@ def cleanResume(resumeText):
|
|
38 |
resumeDataSet['cleaned_resume'] = resumeDataSet.Resume.apply(lambda x: cleanResume(x))
|
39 |
|
40 |
le = LabelEncoder()
|
41 |
-
resumeDataSet['Category'] = le.fit_transform(resumeDataSet['Category'])
|
42 |
le_filename = f'label_encoder.joblib'
|
43 |
joblib.dump(le, le_filename)
|
44 |
|
45 |
requiredText = resumeDataSet['cleaned_resume'].values
|
46 |
-
requiredTarget = resumeDataSet['Category'].values
|
47 |
|
48 |
word_vectorizer = TfidfVectorizer(
|
49 |
stop_words='english',
|
@@ -55,7 +55,7 @@ word_vectorizer.fit(requiredText)
|
|
55 |
joblib.dump(word_vectorizer, 'tfidf_vectorizer.joblib')
|
56 |
WordFeatures = word_vectorizer.transform(requiredText)
|
57 |
|
58 |
-
nca = NeighborhoodComponentsAnalysis(n_components=
|
59 |
WordFeatures = nca.fit_transform(WordFeatures.toarray(), requiredTarget)
|
60 |
nca_filename = f'nca_model.joblib'
|
61 |
joblib.dump(nca, nca_filename)
|
|
|
11 |
from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis
|
12 |
from sklearn.preprocessing import LabelEncoder
|
13 |
|
14 |
+
file_path = '~/Projects/hau/csstudy/resume-screening-and-classification/knn-trial/datasets/Labeled_LiveCareer_Resumes_1076.xlsx'
|
15 |
|
16 |
+
resumeDataSet = pd.read_excel(file_path)
|
17 |
|
18 |
stop_words = set(stopwords.words('english'))
|
19 |
stemmer = PorterStemmer()
|
20 |
|
21 |
+
print (resumeDataSet['Actual Category'].value_counts())
|
22 |
|
23 |
def cleanResume(resumeText):
|
24 |
resumeText = re.sub('http\S+\s*', ' ', resumeText) # remove URLs
|
|
|
38 |
resumeDataSet['cleaned_resume'] = resumeDataSet.Resume.apply(lambda x: cleanResume(x))
|
39 |
|
40 |
le = LabelEncoder()
|
41 |
+
resumeDataSet['Actual Category'] = le.fit_transform(resumeDataSet['Actual Category'])
|
42 |
le_filename = f'label_encoder.joblib'
|
43 |
joblib.dump(le, le_filename)
|
44 |
|
45 |
requiredText = resumeDataSet['cleaned_resume'].values
|
46 |
+
requiredTarget = resumeDataSet['Actual Category'].values
|
47 |
|
48 |
word_vectorizer = TfidfVectorizer(
|
49 |
stop_words='english',
|
|
|
55 |
joblib.dump(word_vectorizer, 'tfidf_vectorizer.joblib')
|
56 |
WordFeatures = word_vectorizer.transform(requiredText)
|
57 |
|
58 |
+
nca = NeighborhoodComponentsAnalysis(n_components=300, random_state=42)
|
59 |
WordFeatures = nca.fit_transform(WordFeatures.toarray(), requiredTarget)
|
60 |
nca_filename = f'nca_model.joblib'
|
61 |
joblib.dump(nca, nca_filename)
|