chelscelis committed on
Commit
e236812
1 Parent(s): fc797e5

Upload 5 files

Browse files
knn_model.joblib CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:6ceaeb3f040b36abf9aadbd303aeb88f4745d1a41c620b803d6a56c3229e0dd1
3
- size 5725078
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63bcb5d47a13120d48d2fda296c68c5f96102f00ec9d6811909b2625228a5cd3
3
+ size 4293046
nca_model.joblib CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:c5053da7e87086d6ec242c2ae73df5e0a299ab0cb2dc395c81f8cc625f84987a
3
- size 57724892
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6f123db7c979eb889c1943ef83369ba45078628adf30063f4908dd1e039d835a
3
+ size 43294492
tfidf_vectorizer.joblib CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:986d302aa10cddb969608ad2afe08fade436d19afd812ba508a0c8b4f1498a2b
3
  size 794455
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d3defdb1fda27499f742f2da47e46eb9d10bf33fd0bb8922a3ee465d7e8bef6
3
  size 794455
train_classifier.py CHANGED
@@ -11,14 +11,14 @@ from sklearn.model_selection import train_test_split, GridSearchCV
11
  from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis
12
  from sklearn.preprocessing import LabelEncoder
13
 
14
- file_path = '~/Projects/hau/csstudy/resume-screening-and-classification/knn-trial/datasets/dataset_hr_edited.csv'
15
 
16
- resumeDataSet = pd.read_csv(file_path)
17
 
18
  stop_words = set(stopwords.words('english'))
19
  stemmer = PorterStemmer()
20
 
21
- print (resumeDataSet['Category'].value_counts())
22
 
23
  def cleanResume(resumeText):
24
  resumeText = re.sub('http\S+\s*', ' ', resumeText) # remove URLs
@@ -38,12 +38,12 @@ def cleanResume(resumeText):
38
  resumeDataSet['cleaned_resume'] = resumeDataSet.Resume.apply(lambda x: cleanResume(x))
39
 
40
  le = LabelEncoder()
41
- resumeDataSet['Category'] = le.fit_transform(resumeDataSet['Category'])
42
  le_filename = f'label_encoder.joblib'
43
  joblib.dump(le, le_filename)
44
 
45
  requiredText = resumeDataSet['cleaned_resume'].values
46
- requiredTarget = resumeDataSet['Category'].values
47
 
48
  word_vectorizer = TfidfVectorizer(
49
  stop_words='english',
@@ -55,7 +55,7 @@ word_vectorizer.fit(requiredText)
55
  joblib.dump(word_vectorizer, 'tfidf_vectorizer.joblib')
56
  WordFeatures = word_vectorizer.transform(requiredText)
57
 
58
- nca = NeighborhoodComponentsAnalysis(n_components=400, random_state=42)
59
  WordFeatures = nca.fit_transform(WordFeatures.toarray(), requiredTarget)
60
  nca_filename = f'nca_model.joblib'
61
  joblib.dump(nca, nca_filename)
 
11
  from sklearn.neighbors import KNeighborsClassifier, NeighborhoodComponentsAnalysis
12
  from sklearn.preprocessing import LabelEncoder
13
 
14
+ file_path = '~/Projects/hau/csstudy/resume-screening-and-classification/knn-trial/datasets/Labeled_LiveCareer_Resumes_1076.xlsx'
15
 
16
+ resumeDataSet = pd.read_excel(file_path)
17
 
18
  stop_words = set(stopwords.words('english'))
19
  stemmer = PorterStemmer()
20
 
21
+ print (resumeDataSet['Actual Category'].value_counts())
22
 
23
  def cleanResume(resumeText):
24
  resumeText = re.sub('http\S+\s*', ' ', resumeText) # remove URLs
 
38
  resumeDataSet['cleaned_resume'] = resumeDataSet.Resume.apply(lambda x: cleanResume(x))
39
 
40
  le = LabelEncoder()
41
+ resumeDataSet['Actual Category'] = le.fit_transform(resumeDataSet['Actual Category'])
42
  le_filename = f'label_encoder.joblib'
43
  joblib.dump(le, le_filename)
44
 
45
  requiredText = resumeDataSet['cleaned_resume'].values
46
+ requiredTarget = resumeDataSet['Actual Category'].values
47
 
48
  word_vectorizer = TfidfVectorizer(
49
  stop_words='english',
 
55
  joblib.dump(word_vectorizer, 'tfidf_vectorizer.joblib')
56
  WordFeatures = word_vectorizer.transform(requiredText)
57
 
58
+ nca = NeighborhoodComponentsAnalysis(n_components=300, random_state=42)
59
  WordFeatures = nca.fit_transform(WordFeatures.toarray(), requiredTarget)
60
  nca_filename = f'nca_model.joblib'
61
  joblib.dump(nca, nca_filename)