Upload Job_Recommendation_System.ipynb
Job_Recommendation_System.ipynb  ADDED  (+1230 −0)
RECOMMENDATION MODEL

In [20]:
```python
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
```

In [21]:
```python
# Mock data creation
def create_mock_data():
    users_data = "rematch_train_candidate_field.csv"
    applicants = pd.read_csv(users_data)

    jobs_data = "jobs_data.csv"
    companies = pd.read_csv(jobs_data)

    train_applicants = applicants
    test_data = "1st_test.csv"
    # "/content/sample_data/test_train.csv"
    test_applicants = pd.read_csv(test_data)

    return train_applicants, test_applicants, companies
```

In [22]:
```python
train_user, test_user, jobs = create_mock_data()
```

In [23]:
```python
print(type(train_user))
```
Output:
```
<class 'pandas.core.frame.DataFrame'>
```

In [24]:
```python
print("Training data size:", train_user.shape[0])
print("Test data size:", test_user.shape[0])
```
Output:
```
Training data size: 23724
Test data size: 4745
```

In [25]:
```python
list_hard_skill = [test_user["hard_skill"].iloc[i].replace("[", "").replace("]", "").replace("'", "") for i in range(len(test_user))]
list_soft_skill = [test_user["soft_skill"].iloc[i].replace("[", "").replace("]", "").replace("'", "") for i in range(len(test_user))]
```
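The chained `str.replace` calls above strip the brackets and quotes from the stringified skill lists. A minimal alternative sketch (an assumption, not what the notebook does): parse the list literals with `ast.literal_eval` and join them back into the comma-separated form the TF-IDF step below expects.

```python
# Sketch (assumption): parse stringified lists such as "['act', 'algorithms']"
# with ast.literal_eval instead of chained str.replace, then re-join them.
import ast

def parse_skill_column(series):
    # Each entry is expected to be a Python-list literal; literal_eval returns a list.
    return [", ".join(ast.literal_eval(s)) for s in series]

# Example: parse_skill_column(test_user["hard_skill"]) yields strings like
# "act, advertising sales, algorithms, ..."
```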
|
In [26]:
```python
print(type(list_hard_skill))
```
Output:
```
<class 'list'>
```

In [27]:
```python
test_user["final_hard_skill"] = pd.DataFrame(list_hard_skill)
test_user["final_soft_skill"] = pd.DataFrame(list_soft_skill)
test_user.head(3)
```
Output: first three rows of test_user with columns User ID, candidate_field, label, hard_skill, soft_skill, final_hard_skill, final_soft_skill (user 14649 / it jobs / label 1, user 801 / marketing / label 0, user 4393 / accounting / label 0).
|
In [28]:
```python
list_hard_skill = [train_user["hard_skill"].iloc[i].replace("[", "").replace("]", "").replace("'", "") for i in range(len(train_user))]
list_soft_skill = [train_user["soft_skill"].iloc[i].replace("[", "").replace("]", "").replace("'", "") for i in range(len(train_user))]
```

In [29]:
```python
train_user["final_hard_skill"] = pd.DataFrame(list_hard_skill)
train_user["final_soft_skill"] = pd.DataFrame(list_soft_skill)
train_user.head(3)
```
Output: first three rows of train_user with the same columns (users 1–3: retail & consumer products, sales, healthcare & medical; all label 0).

In [30]:
```python
list_hard_skill = [jobs["Hard Skills"].iloc[i].replace("[", "").replace("]", "").replace("'", "") for i in range(len(jobs))]
list_soft_skill = [jobs["Soft Skills"].iloc[i].replace("[", "").replace("]", "").replace("'", "") for i in range(len(jobs))]
```

In [31]:
```python
jobs["final_hard_skill"] = pd.DataFrame(list_hard_skill)
jobs["final_soft_skill"] = pd.DataFrame(list_soft_skill)
jobs.head(3)
```
Output: first three rows of jobs with columns Job ID, Major, Hard Skills, Soft Skills, final_hard_skill, final_soft_skill (majors: accounting; administration & office support; advertising, arts & media).

In [32]:
```python
# Feature Engineering
def feature_engineering(applicants, companies):
    # Vectorize skills and majors
    tfidf_vectorizer_skills = TfidfVectorizer()
    tfidf_vectorizer_majors = TfidfVectorizer()

    all_skills = pd.concat([applicants['final_hard_skill'], applicants['final_soft_skill'],
                            companies['final_hard_skill'], companies['final_soft_skill']])
    all_majors = pd.concat([applicants['candidate_field'], companies['Major']])

    all_skills_vectorized = tfidf_vectorizer_skills.fit_transform(all_skills)
    all_majors_vectorized = tfidf_vectorizer_majors.fit_transform(all_majors)

    num_applicants = len(applicants)
    num_companies = len(companies)

    # Split the TF-IDF vectors back into applicants and companies
    applicants_skills_vectorized = all_skills_vectorized[:num_applicants*2]  # because each applicant has 2 skill entries
    companies_skills_vectorized = all_skills_vectorized[num_applicants*2:]

    applicants_majors_vectorized = all_majors_vectorized[:num_applicants]
    companies_majors_vectorized = all_majors_vectorized[num_applicants:]

    return (applicants_skills_vectorized, applicants_majors_vectorized,
            companies_skills_vectorized, companies_majors_vectorized, tfidf_vectorizer_skills, tfidf_vectorizer_majors)
```
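One detail worth knowing about the vectorization step: TfidfVectorizer's default tokenizer splits on word boundaries, so multi-word skills such as "customer service" or "business requirements" become separate tokens before weighting. If each comma-separated skill should stay a single feature, a comma-splitting tokenizer is one option; this is a sketch of an alternative, not what the notebook uses.

```python
# Sketch (assumption): keep each comma-separated skill as one TF-IDF feature
# by tokenizing on commas instead of the default word-boundary pattern.
skill_vectorizer = TfidfVectorizer(
    tokenizer=lambda text: [s.strip() for s in text.split(",") if s.strip()],
    token_pattern=None,  # tokenizer overrides token_pattern; None silences the warning
    lowercase=True,
)
```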
|
In [33]:
```python
def compute_similarity(applicants_skills_vectorized, applicants_majors_vectorized,
                       companies_skills_vectorized, companies_majors_vectorized):
    # Calculate similarity based on skills (averaging hard and soft skills similarities)
    applicants_skills = (applicants_skills_vectorized[0::2] + applicants_skills_vectorized[1::2]) / 2
    companies_skills = (companies_skills_vectorized[0::2] + companies_skills_vectorized[1::2]) / 2

    skills_similarity = cosine_similarity(applicants_skills, companies_skills)

    # Calculate similarity based on majors
    majors_similarity = cosine_similarity(applicants_majors_vectorized, companies_majors_vectorized)

    # Ensure the number of companies in both similarities is aligned
    if skills_similarity.shape[1] != majors_similarity.shape[1]:
        min_dim = min(skills_similarity.shape[1], majors_similarity.shape[1])
        skills_similarity = skills_similarity[:, :min_dim]
        majors_similarity = majors_similarity[:, :min_dim]

    # Combine these similarities (simple average for this example)
    combined_similarity = (skills_similarity + majors_similarity) / 2
    return combined_similarity
```
|
563 |
+
"cell_type": "code",
|
564 |
+
"execution_count": 34,
|
565 |
+
"metadata": {
|
566 |
+
"id": "ter3YAzxoelD"
|
567 |
+
},
|
568 |
+
"outputs": [],
|
569 |
+
"source": [
|
570 |
+
"# Recommendation Function\n",
|
571 |
+
"def recommend_jobs(applicants, companies, similarity_scores):\n",
|
572 |
+
" recommendations = {}\n",
|
573 |
+
" for i, applicant in enumerate(applicants['User ID']):\n",
|
574 |
+
" if i < len(similarity_scores):\n",
|
575 |
+
" sorted_company_indices = np.argsort(-similarity_scores[i]) # Descending sort of scores\n",
|
576 |
+
" recommended_companies = companies.iloc[sorted_company_indices]['Major'].values[:3] # Top 3 recommendations\n",
|
577 |
+
" recommendations[applicant] = recommended_companies\n",
|
578 |
+
" return recommendations\n",
|
579 |
+
"\n",
|
580 |
+
"# Testing and Evaluation Function\n",
|
581 |
+
"def print_recommendations(applicants, companies, recommendations):\n",
|
582 |
+
" # This is a mock function since we don't have ground truth to compare to.\n",
|
583 |
+
" # In a real scenario, we would compare against actual matches or use some form of feedback.\n",
|
584 |
+
" print(\"Recommendations for each applicant:\")\n",
|
585 |
+
" for applicant in recommendations:\n",
|
586 |
+
" print(f\"{applicant}: {recommendations[applicant]}\")"
|
587 |
+
]
|
588 |
+
},
|
589 |
+
{
|
590 |
+
"cell_type": "code",
|
591 |
+
"execution_count": null,
|
592 |
+
"metadata": {
|
593 |
+
"colab": {
|
594 |
+
"base_uri": "https://localhost:8080/"
|
595 |
+
},
|
596 |
+
"collapsed": true,
|
597 |
+
"id": "Ajxp0xelIrl2",
|
598 |
+
"outputId": "08bafc5b-73cc-4695-924a-931840047dd5"
|
599 |
+
},
|
600 |
+
"outputs": [],
|
601 |
+
"source": [
|
602 |
+
"# Let's create and process the data, and compute recommendations\n",
|
603 |
+
"# train_applicants, test_applicants, companies = create_mock_data()\n",
|
604 |
+
"applicants_skills_vec, applicants_majors_vec, companies_skills_vec, companies_majors_vec, tfidf_vectorizer_skills, tfidf_vectorizer_majors = feature_engineering(train_user, jobs)\n",
|
605 |
+
"\n",
|
606 |
+
"similarity_scores = compute_similarity(applicants_skills_vec, applicants_majors_vec, companies_skills_vec, companies_majors_vec)\n",
|
607 |
+
"recommendations = recommend_jobs(test_user, jobs, similarity_scores)\n",
|
608 |
+
"\n",
|
609 |
+
"# Output the recommendations to observe the results\n",
|
610 |
+
"print_recommendations(test_user, jobs, recommendations)"
|
611 |
+
]
|
612 |
+
},
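Worth flagging: `similarity_scores` has one row per `train_user` row, while `recommend_jobs` iterates `test_user['User ID']`, so row i of the matrix is keyed by the i-th test user ID. A minimal sketch, assuming the intent is to score the test applicants themselves with the vectorizers fitted above:

```python
# Sketch (assumption): build similarity rows for the test applicants, reusing the
# fitted TF-IDF vectorizers and combining each user's hard and soft skills.
test_skill_text = test_user['final_hard_skill'].fillna("") + ", " + test_user['final_soft_skill'].fillna("")
test_skills_vec = tfidf_vectorizer_skills.transform(test_skill_text)
test_majors_vec = tfidf_vectorizer_majors.transform(test_user['candidate_field'].fillna(""))

job_skill_text = jobs['final_hard_skill'].fillna("") + ", " + jobs['final_soft_skill'].fillna("")
job_skills_vec = tfidf_vectorizer_skills.transform(job_skill_text)
job_majors_vec = tfidf_vectorizer_majors.transform(jobs['Major'])

test_similarity = (cosine_similarity(test_skills_vec, job_skills_vec)
                   + cosine_similarity(test_majors_vec, job_majors_vec)) / 2
test_recommendations = recommend_jobs(test_user, jobs, test_similarity)
```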
|
In [36]:
```python
# Process input skills and recommend jobs
def recommend_jobs_for_input_skills(input_hard_skills, input_soft_skills, input_major, jobs, tfidf_vectorizer_skills, tfidf_vectorizer_majors, companies_skills_vec, companies_majors_vec):
    input_hard_skills_vec = tfidf_vectorizer_skills.transform([input_hard_skills])
    input_soft_skills_vec = tfidf_vectorizer_skills.transform([input_soft_skills])
    input_major_vec = tfidf_vectorizer_majors.transform([input_major])

    # Average the vectorized hard and soft skills
    input_skills_vec = (input_hard_skills_vec + input_soft_skills_vec) / 2

    # Compute similarities
    skills_similarity = cosine_similarity(input_skills_vec, companies_skills_vec)
    major_similarity = cosine_similarity(input_major_vec, companies_majors_vec)

    # Ensure the number of companies in both similarities is aligned
    if skills_similarity.shape[1] != major_similarity.shape[1]:
        min_dim = min(skills_similarity.shape[1], major_similarity.shape[1])
        skills_similarity = skills_similarity[:, :min_dim]
        major_similarity = major_similarity[:, :min_dim]

    # Combine similarities
    combined_similarity = (skills_similarity + major_similarity) / 2

    # Get top 3 job recommendations
    sorted_company_indices = np.argsort(-combined_similarity[0])
    recommended_companies = jobs.iloc[sorted_company_indices]['Major'].values[:3]

    return recommended_companies
```

TEST RECOMMENDED SYSTEM

In [37]:
```python
input_hard_skills = "Java, Excel, Python"
input_soft_skills = "Communication, Teamwork"
input_major = "Economy"

recommended_jobs = recommend_jobs_for_input_skills(input_hard_skills, input_soft_skills, input_major, jobs, tfidf_vectorizer_skills, tfidf_vectorizer_majors, companies_skills_vec, companies_majors_vec)
print("Recommended Jobs based on input skills and major:")
print(recommended_jobs)
```
Output:
```
Recommended Jobs based on input skills and major:
['it jobs' 'sales' 'administration & office support']
```

Evaluating (PENDING)

In [38]:
```python
def create_ground_truth(csv_file_path):
    data = pd.read_csv(csv_file_path)

    # Create the `ground_truth` dictionary
    ground_truth = {}
    for index, row in data.iterrows():
        user_id = row['User ID']
        actual_major = row['candidate_field']

        # Add to the dictionary, assuming each candidate picks only one job
        ground_truth[user_id] = [actual_major]

    return ground_truth

# Use the function above to build `ground_truth`
csv_file_path = '1st_test.csv'
ground_truth = create_ground_truth(csv_file_path)
```
|
In [ ]:
```python
display(ground_truth)
```

In [40]:
```python
def precision_at_k(recommendations, ground_truth, k=3):
    """
    Calculate the precision at k for the recommendation system.

    Parameters:
    - recommendations (dict): Dictionary where keys are user IDs and values are lists of recommended majors.
    - ground_truth (dict): Dictionary where keys are user IDs and values are lists of truly suitable majors.
    - k (int): The number of top recommendations to consider for calculating precision.

    Returns:
    - float: The average precision at k for all users.
    """
    precision_scores = []

    for applicant, recommended_major in recommendations.items():
        if applicant in ground_truth:
            # Get top k recommendations
            top_k_recs = recommended_major[:k]
            # Calculate the number of relevant recommendations
            relevant_recs = sum(1 for major in top_k_recs if major in ground_truth[applicant])
            # Precision at k for this user
            precision = relevant_recs / k
            precision_scores.append(precision)

    # Average precision at k over all users
    average_precision = np.mean(precision_scores) if precision_scores else 0
    return average_precision

avg_precision = precision_at_k(recommendations, ground_truth)
print("Average Precision@3 with 18979 trains and 4745 tests:", avg_precision)
```
Output:
```
Average Precision@3 with 18979 trains and 4745 tests: 0.12764313312258516
```
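To make the metric concrete: with a single ground-truth field per user, each user's Precision@3 is either 1/3 (the true field appears somewhere in the top 3) or 0, so an average of about 0.128 implies roughly 38% of test users have their field among the three recommendations. A small illustrative check (toy data, not from the notebook):

```python
# Illustrative only: one user whose true field appears in the top-3 list.
toy_recs = {101: ["it jobs", "sales", "marketing"]}
toy_truth = {101: ["sales"]}
print(precision_at_k(toy_recs, toy_truth, k=3))  # 1 relevant hit / 3 recommendations = 0.333...
```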
|
In [41]:
```python
def recall_at_k(recommendations, ground_truth, k=3):
    recall_scores = []

    for user_id, recommended_majors in recommendations.items():
        if user_id in ground_truth:
            # Get top k recommendations
            top_k_recs = recommended_majors[:k]
            # Calculate the number of relevant recommendations
            relevant_recs = sum(1 for major in top_k_recs if major in ground_truth[user_id])
            # Calculate the total number of relevant items
            total_relevant = len(ground_truth[user_id])
            # Recall at k for this user
            recall = relevant_recs / total_relevant if total_relevant else 0
            recall_scores.append(recall)

    # Average recall at k over all users
    average_recall = sum(recall_scores) / len(recall_scores) if recall_scores else 0
    return average_recall

# Example usage:
avg_recall = recall_at_k(recommendations, ground_truth)
print("Average Recall@3 with 18979 trains and 4745 tests:", avg_recall)
```
Output:
```
Average Recall@3 with 18979 trains and 4745 tests: 0.38292939936775555
```
|
In [42]:
```python
def f1_score_at_k(recommendations, ground_truth, k=3):
    precision = precision_at_k(recommendations, ground_truth, k)
    recall = recall_at_k(recommendations, ground_truth, k)

    if precision + recall == 0:
        return 0

    f1_score = 2 * (precision * recall) / (precision + recall)
    return f1_score

avg_f1_score = f1_score_at_k(recommendations, ground_truth)

print("Average F1 Score@3:", avg_f1_score)
```
Output:
```
Average F1 Score@3: 0.19146469968387775
```
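A quick consistency check on the three printed numbers: with exactly one ground-truth field per user, Recall@3 is 3 × Precision@3 (3 × 0.12764 ≈ 0.38293), and F1 = 2PR / (P + R) = 2 × 0.12764 × 0.38293 / (0.12764 + 0.38293) ≈ 0.19146, which matches the reported score.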
|
In [43]:
```python
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
```

In [44]:
```python
class FeatureEngineeringTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.tfidf_vectorizer_skills = TfidfVectorizer()
        self.tfidf_vectorizer_majors = TfidfVectorizer()

    def fit(self, X, y=None):
        all_skills = pd.concat([X['final_hard_skill'], X['final_soft_skill']])
        all_majors = X['candidate_field']

        self.tfidf_vectorizer_skills.fit(all_skills)
        self.tfidf_vectorizer_majors.fit(all_majors)
        return self

    def transform(self, X):
        all_skills = pd.concat([X['final_hard_skill'], X['final_soft_skill']])
        all_majors = X['candidate_field']

        applicants_skills_vec = self.tfidf_vectorizer_skills.transform(all_skills)
        applicants_majors_vec = self.tfidf_vectorizer_majors.transform(all_majors)

        return applicants_skills_vec, applicants_majors_vec
```

In [45]:
```python
class JobRecommender(BaseEstimator, TransformerMixin):
    def __init__(self, jobs, tfidf_vectorizer_skills, tfidf_vectorizer_majors, companies_skills_vec, companies_majors_vec):
        self.jobs = jobs
        self.tfidf_vectorizer_skills = tfidf_vectorizer_skills
        self.tfidf_vectorizer_majors = tfidf_vectorizer_majors
        self.companies_skills_vec = companies_skills_vec
        self.companies_majors_vec = companies_majors_vec

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        input_hard_skills_vec = self.tfidf_vectorizer_skills.transform(X['final_hard_skill'])
        input_soft_skills_vec = self.tfidf_vectorizer_skills.transform(X['final_soft_skill'])
        input_major_vec = self.tfidf_vectorizer_majors.transform(X['candidate_field'])

        input_skills_vec = (input_hard_skills_vec + input_soft_skills_vec) / 2

        skills_similarity = cosine_similarity(input_skills_vec, self.companies_skills_vec)
        major_similarity = cosine_similarity(input_major_vec, self.companies_majors_vec)

        if skills_similarity.shape[1] != major_similarity.shape[1]:
            min_dim = min(skills_similarity.shape[1], major_similarity.shape[1])
            skills_similarity = skills_similarity[:, :min_dim]
            major_similarity = major_similarity[:, :min_dim]

        combined_similarity = (skills_similarity + major_similarity) / 2

        recommendations = []
        for i in range(combined_similarity.shape[0]):
            sorted_company_indices = np.argsort(-combined_similarity[i])
            recommended_companies = self.jobs.iloc[sorted_company_indices]['Major'].values[:3]
            recommendations.append(recommended_companies)

        return recommendations
```
|
In [46]:
```python
def create_recommendation_pipeline():
    # Instantiate the feature engineering transformer
    feature_engineering = FeatureEngineeringTransformer()

    # Define the recommendation function as a callable estimator
    def recommend_jobs_function(X, y=None):
        applicants_skills_vec, applicants_majors_vec = feature_engineering.fit_transform(X)
        companies_skills_vec, companies_majors_vec = feature_engineering.tfidf_vectorizer_skills.transform(jobs['final_hard_skill']), feature_engineering.tfidf_vectorizer_majors.transform(jobs['Major'])

        return recommend_jobs_for_input_skills(X['final_hard_skill'], X['final_soft_skill'], X['candidate_field'], jobs, feature_engineering.tfidf_vectorizer_skills, feature_engineering.tfidf_vectorizer_majors, companies_skills_vec, companies_majors_vec)

    pipeline = Pipeline([
        ('feature_engineering', feature_engineering),
        ('recommendation', recommend_jobs_function)
    ])

    return pipeline
recommendation_pipeline = create_recommendation_pipeline()
```

In [47]:
```python
import pickle
def create_recommendation_pipeline(jobs):
    feature_engineering = FeatureEngineeringTransformer()

    # Fit feature engineering transformer to get the vectorizers and company vectors
    applicants_skills_vec, applicants_majors_vec = feature_engineering.fit_transform(train_user)
    companies_skills_vec = feature_engineering.tfidf_vectorizer_skills.transform(jobs['final_hard_skill'])
    companies_majors_vec = feature_engineering.tfidf_vectorizer_majors.transform(jobs['Major'])

    recommender = JobRecommender(jobs, feature_engineering.tfidf_vectorizer_skills, feature_engineering.tfidf_vectorizer_majors, companies_skills_vec, companies_majors_vec)

    pipeline = Pipeline([
        ('feature_engineering', feature_engineering),
        ('recommendation', recommender)
    ])

    return pipeline

# Create the pipeline
recommendation_pipeline = create_recommendation_pipeline(jobs)

# Save the pipeline using pickle
model_path = "recommendation_pipeline.pkl"
with open(model_path, mode="bw") as f:
    pickle.dump(recommendation_pipeline, f)
print("Model components saved successfully!")
```
Output:
```
Model components saved successfully!
```
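One caveat on composing the two steps: `Pipeline.transform` would hand `JobRecommender` the `(skills, majors)` tuple produced by `FeatureEngineeringTransformer`, while `JobRecommender.transform` indexes its input by column names. A small sketch of calling the fitted recommender step directly on a DataFrame (the column values below are illustrative assumptions):

```python
# Sketch (assumption): query the recommender step with a one-row DataFrame
# carrying the columns JobRecommender.transform actually reads.
sample = pd.DataFrame({
    "final_hard_skill": ["python, java, finance, excel"],
    "final_soft_skill": ["communication, teamwork"],
    "candidate_field": ["it jobs"],
})
top3 = recommendation_pipeline.named_steps["recommendation"].transform(sample)
print(top3)  # list with one array of the three highest-scoring majors
```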
|
In [48]:
```python
from huggingface_hub import notebook_login
notebook_login()
```
Output: (Hugging Face login widget)

In [50]:
```python
import shutil
import os
from skops import card, hub_utils
from pathlib import Path

model_path = "recommendation_pipeline.pkl"
local_repo = "job-recommendation-model"
# Clear the existing directory if it exists
if os.path.exists(local_repo):
    shutil.rmtree(local_repo)

sample_data = pd.DataFrame({
    'final_hard_skill': ["Python, Java, Finance, Excel"],
    'final_soft_skill': ["Communication, Teamwork"],
    'candidate_field': [""]
})

# Initialize the local repository
hub_utils.init(
    model=model_path,
    requirements=["scikit-learn", "pandas", "numpy"],
    dst=local_repo,
    task="tabular-classification",
    data=sample_data,
)

# # Create model card metadata manually
# metadata = {
#     "model_type": "Custom Recommendation Model",
#     "model_description": "This is a recommendation model for job applicants based on their skills and majors.",
#     "author": "trangannh",
#     "license": "mit",
#     "citation": """
#     @misc{example2024recommendation,
#         author = {trangannh},
#         title = {Job Recommendation Model},
#         year = {2024},
#         howpublished = {\\url{https://huggingface.co/job-recommendation-model}},
#     }
#     """,
#     "limitations": "This model is not ready to be used in production.",
# }

# # Create and save the model card
# model_card = card.Card(model=model_path, metadata=metadata)

# # Add the get started code
# get_started_code = """
# import pickle
# import pandas as pd

# with open('recommendation_model.pkl', 'rb') as file:
#     tfidf_vectorizer_skills, tfidf_vectorizer_majors, companies_skills_vec, companies_majors_vec = pickle.load(file)

# input_hard_skills = "Python, Java, Finance, Excel"
# input_soft_skills = "Communication, Teamwork"
# input_major = ""
# jobs_data = pd.read_csv("/content/sample_data/jobs_data.csv")

# recommended_jobs = recommend_jobs_for_input_skills(input_hard_skills, input_soft_skills, input_major, jobs_data, 'recommendation_model.pkl')
# print("Recommended Jobs based on input skills and major:")
# print(recommended_jobs)
# """

# model_card.add(
#     get_started_code=get_started_code,
#     model_card_authors="trangannh",
#     model_description="This is a recommendation model for job applicants based on their skills and majors.",
#     limitations="This model is not ready to be used in production."
# )

# # Save the model card
# model_card.save(Path(local_repo) / "README.md")

# Push the repository to Hugging Face Hub
repo_id = "trangannh/job-recommendation-model"
token = ""

hub_utils.push(
    repo_id=repo_id,
    source=local_repo,
    token=token,
    commit_message="Initial commit of the job recommendation model",
    create_remote=True,
)
```
Output:
```
c:\Program Files\Python311\Lib\site-packages\skops\hub_utils\_hf_hub.py:577: FutureWarning: Creating repos on hf.co is subject to strict rate limits now and therefore this feature is to be removed from this library in version 0.10. You can use tools directly available in the huggingface_hub library instead to create and push files.
  warnings.warn(
recommendation_pipeline.pkl: upload progress widget (0.00/163k B)
```
|
In [52]:
```python
import pickle
import pandas as pd

# Load the model (pipeline)
with open('recommendation_pipeline.pkl', 'rb') as file:
    recommendation_pipeline = pickle.load(file)

# Example input data
input_hard_skills = ["Python", "Java", "Finance", "Excel"]
input_soft_skills = ["Communication", "Teamwork"]
input_major = ["Data Science"]
recommended_jobs = recommend_jobs_for_input_skills(input_hard_skills, input_soft_skills, input_major, jobs, tfidf_vectorizer_skills, tfidf_vectorizer_majors, companies_skills_vec, companies_majors_vec)
print("Recommended Jobs based on input skills and major:")
print(recommended_jobs)
```
Output (error):
```
AttributeError                            Traceback (most recent call last)
Cell In[52], line 12
     10 input_soft_skills = ["Communication", "Teamwork"]
     11 input_major = ["Data Science"]
---> 12 recommended_jobs = recommend_jobs_for_input_skills(input_hard_skills, input_soft_skills, input_major, jobs, tfidf_vectorizer_skills, tfidf_vectorizer_majors, companies_skills_vec, companies_majors_vec)

Cell In[36], line 3, in recommend_jobs_for_input_skills(...)
----> 3 input_hard_skills_vec = tfidf_vectorizer_skills.transform([input_hard_skills])

sklearn\feature_extraction\text.py, in TfidfVectorizer.transform -> CountVectorizer._count_vocab -> _analyze -> _preprocess
---> 69 doc = doc.lower()

AttributeError: 'list' object has no attribute 'lower'
```
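The traceback is consistent with the earlier working call in In [37], which passes plain comma-separated strings: `TfidfVectorizer.transform` expects an iterable of strings, so wrapping a list of skills in another list hands the preprocessor a list where it expects one document. A minimal sketch of the fix, assuming the same inputs are intended:

```python
# Sketch (assumption): join the skill/major lists into single comma-separated
# strings before calling the recommender, matching the In [37] usage above.
recommended_jobs = recommend_jobs_for_input_skills(
    ", ".join(input_hard_skills),   # "Python, Java, Finance, Excel"
    ", ".join(input_soft_skills),   # "Communication, Teamwork"
    ", ".join(input_major),         # "Data Science"
    jobs, tfidf_vectorizer_skills, tfidf_vectorizer_majors,
    companies_skills_vec, companies_majors_vec,
)
print(recommended_jobs)
```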
|
(Notebook metadata: Colab provenance, Python 3 kernel, language_info python 3.11.2, nbformat 4.)