Basanth committed on
Commit
91ec262
·
1 Parent(s): 857f872

uploaded all the files

Browse files
app.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import streamlit as st

# set_page_config must be the first Streamlit command executed in the script.
st.set_page_config(layout="wide")

from streamlit_option_menu import option_menu
from apps import home, eda, models, demo


# Session flag shared with the demo page across Streamlit reruns.
if "valid_inputs_received" not in st.session_state:
    st.session_state["valid_inputs_received"] = False

header = st.container()

# Registry of sidebar pages.  The Models page is currently disabled
# (entry commented out), although the module is still imported above.
apps = [
    {"func": home.app, "title": "Home", "icon": "house"},
    {"func": eda.app, "title": "EDA", "icon": "bar-chart"},
    # {"func": models.app, "title": "Models", "icon": "cpu"},
    {"func": demo.app, "title": "Demo", "icon": "cloud-upload"},
]

titles = [app["title"] for app in apps]
titles_lower = [title.lower() for title in titles]  # NOTE(review): unused in this file
icons = [app["icon"] for app in apps]

with st.sidebar:
    selected = option_menu(
        "Main Menu",
        options=titles,
        icons=icons,
        menu_icon="cast",
    )

# Dispatch to the page whose title matches the sidebar selection.
for app in apps:
    if app["title"] == selected:
        app["func"]()
        break
apps/demo.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import streamlit as st
from streamlit_echarts import st_echarts
import pandas as pd
from annotated_text import annotated_text, annotation
from utils import load_skill_extractor, create_ann_list
from utils import get_skill, clean_text, default_text, predict_cat, load_model


# Loaded once at import time.  load_model is cached with st.cache in
# utils.py, so reruns of the Streamlit script reuse the same ktrain
# predictor instead of reloading it from disk.
model = load_model()
# skill_extractor = load_skill_extractor()
def app():
    """Render the Demo page: classify submitted job text and show the result.

    The user submits free text; it is cleaned, run through the cached ktrain
    model, and the predicted job category is displayed next to an ECharts
    gauge showing the prediction probability (0-100).
    """
    st.title("Demo")
    with st.form(key="text_val"):
        input_text = st.text_area('Enter the text here', value=default_text(), height=200)
        submit_button = st.form_submit_button(label="Submit")

    col1, col2 = st.columns(2)

    # Gauge series data; `value` is filled in with the model's probability
    # once the user submits.
    gaugeData = [
        {
            "value": 0,
            # Fixed user-visible typo: was 'Probabiltiy'.
            "name": 'Probability',
            "detail": {
                "valueAnimation": True,
                "offsetCenter": ['0%', '0%']
            }
        }]
    option = {
        "series": [
            {
                "type": "gauge",
                "startAngle": 90,
                "endAngle": -270,
                "pointer": {
                    "show": False,
                },
                "progress": {
                    "show": True,
                    "overlap": False,
                    "roundCap": False,
                    "clip": False,
                    "backgroundColor": '#11D1F9',
                    "itemStyle": {
                        "color": '#E96605',
                        "borderWidth": 0,
                        "borderColor": "light blue"
                    }
                },
                "axisLine": {
                    "lineStyle": {
                        "width": 40
                    }
                },
                "splitLine": {
                    "show": False,
                    "distance": 0,
                    "length": 20
                },
                "axisTick": {
                    "show": False
                },
                "axisLabel": {
                    "show": False,
                    "distance": 50
                },
                "data": gaugeData,
                "detail": {
                    "valueAnimation": True,
                    "offsetCenter": ['0%', '0%'],
                    "width": 40,
                    "height": 14,
                    "fontSize": 24,
                    "color": 'inherit',
                    "borderColor": 'inherit',
                    "borderRadius": 0,
                    "borderWidth": 0,
                    "formatter": '{value}%'
                },
            }
        ]
    }

    # Results area.  NOTE(review): the original wrapped this in
    # st.form(key='result') with no form_submit_button, which Streamlit
    # rejects ("Missing Submit Button"); a plain container renders the
    # same content without the error.
    with st.container():
        if submit_button:
            # Only clean + run inference after the user actually submits,
            # instead of on every Streamlit rerun as the original did.
            cls_text = clean_text(input_text)
            prob, job_cat = predict_cat(model, cls_text)
            gaugeData[0]['value'] = prob
            with col1:
                st.markdown("<h1 style='text-align: center; color: #05A4E9;'>Job Category</h1>", unsafe_allow_html=True)
                # Fixed malformed HTML: the original opening <h1 ...> tag was
                # never closed with '>' before the nested <p> element.
                html_str = f"""
                <h1 style='text-align: center; font: bold 2em Courier;'>
                    <p class="a">{job_cat}</p>
                </h1>
                """
                st.markdown(html_str, unsafe_allow_html=True)
            with col2:
                st.markdown("<h1 style='text-align: center; color: #05A4E9;'>Probability</h1>", unsafe_allow_html=True)
                st_echarts(options=option, key="1")
apps/eda.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
import streamlit as st

PAGE_TITLE = "EDA"


def app():
    """Render the (placeholder) EDA page."""
    st.title(PAGE_TITLE)
apps/home.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
import streamlit as st

# Introductory copy shown under the page title.
_INTRO = '''
In Africa, there is often a disconnection between the skills that job seekers possess and the skills that employers require. This can be due to a lack of access to education and training opportunities, as well as a lack of alignment between the education system and the needs of the job market.
Additionally, many employers in Africa may not have the resources or capacity to provide the necessary training and development for their employees. As a result, there is often a mismatch between the skills that workers have and the skills that employers need, which can make it difficult for workers to find employment or for employers to find qualified candidates.
'''


def app():
    """Render the Home page: project title and introduction text."""
    st.title("Ghana - Understanding The Disconnect between Skills and Jobs")
    st.markdown(_INTRO)
apps/models.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
import streamlit as st

PAGE_TITLE = "Models"


def app():
    """Render the (placeholder) Models page."""
    st.title(PAGE_TITLE)
data/labels.txt ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,0
2
+ Accounting/Finance/Banking,0
3
+ Administrative/Secretarial ,1
4
+ Advertising/Media ,2
5
+ Agricultural ,3
6
+ Communication ,4
7
+ Customer Service ,5
8
+ Education ,6
9
+ "Energy,Oil & Gas ",7
10
+ Engineering/Processing/Manufacturing,8
11
+ Healthcare ,9
12
+ Hospitalilty/Food Service ,10
13
+ Human Resource Management ,11
14
+ IT ,12
15
+ Legal ,13
16
+ Mining ,14
17
+ Other,15
18
+ Purchasing/Procurement ,16
19
+ Quality Control/Assurance ,17
20
+ Sales / Marketing ,18
21
+ Securtity/Law Enforcement ,19
22
+ Supply Chain/Logistics/Warehousing ,20
data/sample.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ An introduction to computers and how they work. Types and Historical Development of Computers. Number systems: binary, octal, hexadecimal, integer and fractional representations, Signed and Unsigned numbers, 1‘s complement, 2‘s complement and Arithmetic Overflows. Integer and floating point arithmetic (IEEE standard 754 Floating point Formats). Data Representation and Manipulation: Bits, bytes and words: Logic operations and Logic gates applications.
models/distilbert/config.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/tmp/tmpqaytn67y",
3
+ "activation": "gelu",
4
+ "architectures": [
5
+ "DistilBertForSequenceClassification"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "dim": 768,
9
+ "dropout": 0.1,
10
+ "hidden_dim": 3072,
11
+ "id2label": {
12
+ "0": "LABEL_0",
13
+ "1": "LABEL_1",
14
+ "2": "LABEL_2",
15
+ "3": "LABEL_3",
16
+ "4": "LABEL_4",
17
+ "5": "LABEL_5",
18
+ "6": "LABEL_6",
19
+ "7": "LABEL_7",
20
+ "8": "LABEL_8",
21
+ "9": "LABEL_9",
22
+ "10": "LABEL_10",
23
+ "11": "LABEL_11",
24
+ "12": "LABEL_12",
25
+ "13": "LABEL_13",
26
+ "14": "LABEL_14",
27
+ "15": "LABEL_15",
28
+ "16": "LABEL_16",
29
+ "17": "LABEL_17",
30
+ "18": "LABEL_18",
31
+ "19": "LABEL_19",
32
+ "20": "LABEL_20"
33
+ },
34
+ "initializer_range": 0.02,
35
+ "label2id": {
36
+ "LABEL_0": 0,
37
+ "LABEL_1": 1,
38
+ "LABEL_10": 10,
39
+ "LABEL_11": 11,
40
+ "LABEL_12": 12,
41
+ "LABEL_13": 13,
42
+ "LABEL_14": 14,
43
+ "LABEL_15": 15,
44
+ "LABEL_16": 16,
45
+ "LABEL_17": 17,
46
+ "LABEL_18": 18,
47
+ "LABEL_19": 19,
48
+ "LABEL_2": 2,
49
+ "LABEL_20": 20,
50
+ "LABEL_3": 3,
51
+ "LABEL_4": 4,
52
+ "LABEL_5": 5,
53
+ "LABEL_6": 6,
54
+ "LABEL_7": 7,
55
+ "LABEL_8": 8,
56
+ "LABEL_9": 9
57
+ },
58
+ "max_position_embeddings": 512,
59
+ "model_type": "distilbert",
60
+ "n_heads": 12,
61
+ "n_layers": 6,
62
+ "pad_token_id": 0,
63
+ "qa_dropout": 0.1,
64
+ "seq_classif_dropout": 0.2,
65
+ "sinusoidal_pos_embds": false,
66
+ "tie_weights_": true,
67
+ "transformers_version": "4.26.0",
68
+ "vocab_size": 30522
69
+ }
models/distilbert/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
models/distilbert/tf_model.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f5ddf0f86b178c98159599cedfbad2b01062ed3290d73f0af7780480ba1106c
3
+ size 268014760
models/distilbert/tf_model.preproc ADDED
Binary file (2.98 kB). View file
 
models/distilbert/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
models/distilbert/tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "do_lower_case": true,
4
+ "mask_token": "[MASK]",
5
+ "model_max_length": 512,
6
+ "name_or_path": "distilbert-base-uncased",
7
+ "pad_token": "[PAD]",
8
+ "sep_token": "[SEP]",
9
+ "special_tokens_map_file": null,
10
+ "strip_accents": null,
11
+ "tokenize_chinese_chars": true,
12
+ "tokenizer_class": "DistilBertTokenizer",
13
+ "unk_token": "[UNK]"
14
+ }
models/distilbert/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ streamlit-option-menu
3
+ streamlit-echarts
4
+ ktrain
5
+ # spacy>=3.0.0,<4.0.0
6
+ # en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.4.1/en_core_web_lg-3.4.1-py3-none-any.whl
7
+ # skillNer
8
+ pandas
9
+ numpy
10
+ ipython
11
+ neattext
12
+ tensorflow
13
+ st_annotated_text==2.0.0
14
+ requests
15
+ # nltk
16
+ # Unidecode
utils.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import sys
import subprocess
import streamlit as st
import numpy as np
from annotated_text import annotation
import collections
import ktrain
import pandas as pd
import os
import neattext.functions as nfx


# Category-label lookup table.  labels.txt holds "<category name>,<code>"
# rows; header=0 skips the first (placeholder) row and `names` labels the
# columns.  NOTE(review): several category names carry trailing spaces and
# typos (e.g. "Hospitalilty", "Securtity") — leave them as-is, since they
# presumably match the labels the model was trained with; verify before
# changing the data file.
label_path = ("./data/labels.txt")
cols = ['cat', 'code']
label_df = pd.read_csv(label_path, names=cols, header=0)
def default_text():
    """Return the sample job-description text shown in the demo text area.

    Reads ./data/sample.txt relative to the app's working directory.
    """
    # Explicit encoding: the platform default (e.g. cp1252 on Windows) can
    # fail on non-ASCII characters in the sample text.
    with open("./data/sample.txt", 'r', encoding="utf-8") as fs:
        text = fs.read()
    return text
# Cache across Streamlit reruns so the (large) ktrain predictor is only
# deserialized once per process.
# NOTE(review): st.cache is deprecated in recent Streamlit releases in
# favour of st.cache_resource — confirm against the pinned version.
@st.cache(allow_output_mutation=True,suppress_st_warning=True)
def load_model():
    """Load and return the ktrain predictor saved under ./models/distilbert/."""
    model_path = "./models/distilbert/"
    model = ktrain.load_predictor(model_path)
    return model
# Cached: the extractor is built once per process.
@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def load_skill_extractor():
    """Build and return a skillNer SkillExtractor backed by spacy.

    spacy/skillNer are imported lazily because they are heavy and this
    feature is currently disabled in the demo page.
    """
    # This function will only be run the first time it's called
    import spacy

    from skillNer.skill_extractor_class import SkillExtractor
    from skillNer.general_params import SKILL_DB

    from spacy.matcher import PhraseMatcher
    # init params of skill extractor
    print('load model')

    # Requires the en_core_web_lg model to have been downloaded beforehand.
    nlp = spacy.load('en_core_web_lg')

    print('load matcher')
    # init skill extractor
    skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher,)
    return skill_extractor
def clean_text(text):
    """Strip noise from *text* with neattext: emails, URLs, dates, HTML tags,
    numbers, punctuation, stopwords and special characters.

    Returns the cleaned string, or None if neattext fails on the input.
    """
    try:
        frame = nfx.TextFrame(text)
        frame = frame.remove_emails()
        frame = frame.remove_urls()
        frame = frame.remove_dates()
        frame = frame.remove_html_tags()
        frame = frame.remove_numbers()
        frame = frame.remove_puncts()
        frame = frame.remove_stopwords()
        frame = frame.remove_special_characters()
        return frame.text
    except Exception as err:
        print(err)
        return None
def predict_cat(model, text):
    """Run *model* on *text* and return (probability_percent, category).

    probability_percent is the top-class probability scaled to an int in
    0-100.  Note the predictor is invoked twice: once for the probability
    vector and once for the label.
    """
    probabilities = model.predict(text, return_proba=True)
    confidence = int(probabilities.max() * 100)
    category = model.predict(text)
    return confidence, category
def grouper(iterable):
    """Yield maximal runs of consecutive integers from *iterable*.

    Input is assumed sorted ascending; each yielded list is a run in which
    successive items differ by at most 1.  Yields nothing for an empty
    iterable.
    """
    prev = None
    group = []
    for item in iterable:
        # Bug fix: the original tested `not prev`, which is also true when
        # prev == 0, so a run starting at 0 swallowed every later item
        # regardless of the gap.  Test identity with None instead.
        if prev is None or item - prev <= 1:
            group.append(item)
        else:
            yield group
            group = [item]
        prev = item
    if group:
        yield group
def get_skill(annotations, skill_extractor=None):
    """Extract a flat string of skills from a skillNer annotation result.

    Parameters
    ----------
    annotations : dict
        Output of SkillExtractor.annotate(); scored n-gram matches are read
        from annotations['results']['ngram_scored'].
    skill_extractor : SkillExtractor, optional
        Used to look up each skill's type in its skills_db.  Bug fix: the
        original referenced an undefined module-level global here, so every
        call raised NameError and fell through to `return None`; callers
        must now pass the extractor in explicitly.

    Returns
    -------
    str or None
        Unique soft and hard skills joined by spaces, or None on any
        failure — including when no extractor is supplied, which matches
        the original's observable behavior.
    """
    try:
        soft_skills = []
        hard_skills = []
        for item in annotations['results']['ngram_scored']:
            skill_id = item['skill_id']
            skill_type = skill_extractor.skills_db[skill_id]['skill_type']
            if skill_type == 'Soft Skill' and item['doc_node_value']:
                soft_skills.append(item['doc_node_value'])
            if skill_type == 'Hard Skill':
                hard_skills.append(item['doc_node_value'])
        parts = list(set(soft_skills)) + list(set(hard_skills))
        # Bug fix: the original concatenated the two joined strings without
        # a separator, fusing the last soft skill with the first hard skill.
        return " ".join(parts)
    except Exception:
        return None
def install(package):
    """Install *package* into the current interpreter's environment via pip."""
    pip_cmd = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_cmd)
def create_ann_list(text, results):
    """Build the ordered sequence of annotated-text fragments for display.

    *text* is the raw input whose tokens (split on single spaces) are matched
    against skillNer *results*; matched spans become colored `annotation`
    objects, unmatched token runs stay plain strings.  Returns the fragments
    ordered by starting token index, ready for annotated_text(*fragments).
    """
    try:
        from skillNer.general_params import SKILL_DB
    except:
        # install skillner if not done yet
        os.system('pip install skillner')
        from skillNer.general_params import SKILL_DB

    # Background colors per skill type.
    type_to_color = {'Hard Skill': "#faa",
                     'Soft Skill': '#afa', 'Certification': '#ff4'}
    text_tokens = text.split(' ')
    # Map: index of a fragment's first token -> fragment (annotation or str).
    annots = {}
    all_res = results['ngram_scored']+results['full_matches']
    ids_done = []
    # create annotations from matches
    for match in all_res:
        id_ = match['skill_id']
        type_ = SKILL_DB[id_]['skill_type']
        span_str = ' '.join([text_tokens[i] for i in match['doc_node_id']])
        annot = annotation(span_str, type_, background=type_to_color[type_],
                           color="#333", margin='2px')
        # NOTE(review): if two matches start at the same token index, the
        # later one silently overwrites the earlier — confirm intended.
        annots[match['doc_node_id'][0]] = annot
        for i in match['doc_node_id']:
            ids_done.append(i)
    # create strs for non annotated text
    non_match_ids = [i for i, _ in enumerate(text_tokens) if i not in ids_done]
    # grouper() merges consecutive unmatched token indices into runs.
    dict_ = dict(enumerate(grouper(non_match_ids), 1))
    for v in dict_.values():
        span = ' '.join([text_tokens[i] for i in v])
        annots[v[0]] = span
        # annotation(token,color="#fff", background="transparent",)
    print(dict_)
    print('-----')
    # print(collections.OrderedDict(sorted(annots.items())))
    # Sort fragments by their starting token index for display order.
    annots_ = collections.OrderedDict(sorted(annots.items())).values()
    return annots_
def create_dfs(results):
    """Convert skillNer match results into two pandas DataFrames.

    Returns (full_df, sub_df): full matches with columns
    ['skill id', 'skill name', 'skill type'], and scored n-gram matches
    with an additional 'score' column.
    """
    try:
        from skillNer.general_params import SKILL_DB
    except:
        # install skillner if not done yet
        os.system('pip install skillner')
        from skillNer.general_params import SKILL_DB

    def base_row(match):
        # Look up the match's skill in the database once per row.
        sid = match['skill_id']
        entry = SKILL_DB[sid]
        return [sid, entry['skill_name'], entry['skill_type']]

    f_arr = [base_row(m) for m in results['full_matches']]
    s_arr = [base_row(m) + [m['score']] for m in results['ngram_scored']]

    full_df = pd.DataFrame(
        f_arr, columns=['skill id', 'skill name', 'skill type'])
    sub_df = pd.DataFrame(
        s_arr, columns=['skill id', 'skill name', 'skill type', 'score'])
    return full_df, sub_df