Spaces:
Runtime error
Runtime error
uploaded all the files
Browse files- app.py +41 -0
- apps/demo.py +113 -0
- apps/eda.py +5 -0
- apps/home.py +12 -0
- apps/models.py +6 -0
- data/labels.txt +22 -0
- data/sample.txt +1 -0
- models/distilbert/config.json +69 -0
- models/distilbert/special_tokens_map.json +7 -0
- models/distilbert/tf_model.h5 +3 -0
- models/distilbert/tf_model.preproc +0 -0
- models/distilbert/tokenizer.json +0 -0
- models/distilbert/tokenizer_config.json +14 -0
- models/distilbert/vocab.txt +0 -0
- requirements.txt +16 -0
- utils.py +180 -0
app.py
ADDED
@@ -0,0 +1,41 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st

st.set_page_config(layout="wide")

from streamlit_option_menu import option_menu
from apps import home, eda, models, demo


# Track whether the user has submitted valid input yet (shared across pages
# via Streamlit's session state).
if "valid_inputs_received" not in st.session_state:
    st.session_state["valid_inputs_received"] = False

# image = Image.open('data/logo.png')
# image=image.resize((100,100))
header = st.container()

# Registry of sub-pages: each entry maps a sidebar title/icon to the page's
# render function.
apps = [
    {"func": home.app, "title": "Home", "icon": "house"},
    {"func": eda.app, "title": "EDA", "icon": "bar-chart"},
    # {"func": models.app, "title": "Models", "icon": "cpu"},
    {"func": demo.app, "title": "Demo", "icon": "cloud-upload"},
]

titles = [app["title"] for app in apps]
titles_lower = [title.lower() for title in titles]
icons = [app["icon"] for app in apps]

with st.sidebar:
    # logo = st.image(image)
    selected = option_menu(
        "Main Menu",
        options=titles,
        icons=icons,
        menu_icon="cast",
    )

# Dispatch to whichever page the user selected in the sidebar.
for app in apps:
    if app["title"] == selected:
        app["func"]()
        break
|
apps/demo.py
ADDED
@@ -0,0 +1,113 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
from streamlit_echarts import st_echarts
import pandas as pd
from annotated_text import annotated_text, annotation
from utils import load_skill_extractor, create_ann_list
from utils import get_skill, clean_text, default_text, predict_cat, load_model


# Load the classifier once at module import (memoized inside load_model).
model = load_model()
# skill_extractor = load_skill_extractor()


def app():
    """Render the Demo page: a text-input form, the predicted job category,
    and an ECharts gauge showing the model's confidence.

    Fixes vs. the original: the results were wrapped in a second st.form
    with no submit button (Streamlit rejects such forms), the result <h1>
    tag was missing its closing '>', the gauge name was misspelled, and
    cleaning + inference ran on every rerun even before the user submitted.
    """
    st.title("Demo")
    with st.form(key="text_val"):
        input_text = st.text_area('Enter the text here', value=default_text(), height=200)
        submit_button = st.form_submit_button(label="Submit")

    col1, col2 = st.columns(2)

    # Gauge series data; 'value' is filled in with the predicted probability
    # (integer percent) after submission.
    gaugeData = [
        {
            "value": 0,
            "name": 'Probability',
            "detail": {
                "valueAnimation": True,
                "offsetCenter": ['0%', '0%']
            }
        }]
    option = {
        "series": [
            {
                "type": "gauge",
                # Full-circle gauge: start at the top and sweep 360 degrees.
                "startAngle": 90,
                "endAngle": -270,
                "pointer": {
                    "show": False,
                },
                "progress": {
                    "show": True,
                    "overlap": False,
                    "roundCap": False,
                    "clip": False,
                    "backgroundColor": '#11D1F9',
                    "itemStyle": {
                        "color": '#E96605',
                        "borderWidth": 0,
                        "borderColor": "light blue"
                    }
                },
                "axisLine": {
                    "lineStyle": {
                        "width": 40
                    }
                },
                "splitLine": {
                    "show": False,
                    "distance": 0,
                    "length": 20
                },
                "axisTick": {
                    "show": False
                },
                "axisLabel": {
                    "show": False,
                    "distance": 50
                },
                "data": gaugeData,
                "detail": {
                    "valueAnimation": True,
                    "offsetCenter": ['0%', '0%'],
                    "width": 40,
                    "height": 14,
                    "fontSize": 24,
                    "color": 'inherit',
                    "borderColor": 'inherit',
                    "borderRadius": 0,
                    "borderWidth": 0,
                    "formatter": '{value}%'
                },
            }
        ]
    }

    if submit_button:
        # Run (expensive) cleaning + model inference only after the user
        # submits, not on every Streamlit rerun.
        cls_text = clean_text(input_text)
        prob, job_cat = predict_cat(model, cls_text)
        gaugeData[0]['value'] = prob
        with col1:
            st.markdown("<h1 style='text-align: center; color: #05A4E9;'>Job Category</h1>", unsafe_allow_html=True)
            html_str = f"""
            <h1 style='text-align: center; font: bold {2}em Courier;'>
                <p class="a">{job_cat}</p>
            </h1>
            """
            st.markdown(html_str, unsafe_allow_html=True)
        with col2:
            st.markdown("<h1 style='text-align: center; color: #05A4E9;'>Probability</h1>", unsafe_allow_html=True)
            st_echarts(options=option, key="1")
|
apps/eda.py
ADDED
@@ -0,0 +1,5 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st


def app():
    """Render the EDA page (placeholder: title only for now)."""
    st.title("EDA")
|
apps/home.py
ADDED
@@ -0,0 +1,12 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st


def app():
    """Render the Home page: project title and background description."""

    st.title("Ghana - Understanding The Disconnect between Skills and Jobs")

    st.markdown('''
In Africa, there is often a disconnection between the skills that job seekers possess and the skills that employers require. This can be due to a lack of access to education and training opportunities, as well as a lack of alignment between the education system and the needs of the job market.
Additionally, many employers in Africa may not have the resources or capacity to provide the necessary training and development for their employees. As a result, there is often a mismatch between the skills that workers have and the skills that employers need, which can make it difficult for workers to find employment or for employers to find qualified candidates.
''')
12 |
+
|
apps/models.py
ADDED
@@ -0,0 +1,6 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st


def app():
    """Render the Models page (placeholder: title only; disabled in app.py's menu)."""
    st.title("Models")
|
data/labels.txt
ADDED
@@ -0,0 +1,22 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
,0
|
2 |
+
Accounting/Finance/Banking,0
|
3 |
+
Administrative/Secretarial ,1
|
4 |
+
Advertising/Media ,2
|
5 |
+
Agricultural ,3
|
6 |
+
Communication ,4
|
7 |
+
Customer Service ,5
|
8 |
+
Education ,6
|
9 |
+
"Energy,Oil & Gas ",7
|
10 |
+
Engineering/Processing/Manufacturing,8
|
11 |
+
Healthcare ,9
|
12 |
+
Hospitalilty/Food Service ,10
|
13 |
+
Human Resource Management ,11
|
14 |
+
IT ,12
|
15 |
+
Legal ,13
|
16 |
+
Mining ,14
|
17 |
+
Other,15
|
18 |
+
Purchasing/Procurement ,16
|
19 |
+
Quality Control/Assurance ,17
|
20 |
+
Sales / Marketing ,18
|
21 |
+
Securtity/Law Enforcement ,19
|
22 |
+
Supply Chain/Logistics/Warehousing ,20
|
data/sample.txt
ADDED
@@ -0,0 +1 @@
|
|
|
|
|
1 |
+
An introduction to computers and how they work. Types and Historical Development of Computers. Number systems: binary, octal, hexadecimal, integer and fractional representations, Signed and Unsigned numbers, 1‘s complement, 2‘s complement and Arithmetic Overflows. Integer and floating point arithmetic (IEEE standard 754 Floating point Formats). Data Representation and Manipulation: Bits, bytes and words: Logic operations and Logic gates applications.
|
models/distilbert/config.json
ADDED
@@ -0,0 +1,69 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"_name_or_path": "/tmp/tmpqaytn67y",
|
3 |
+
"activation": "gelu",
|
4 |
+
"architectures": [
|
5 |
+
"DistilBertForSequenceClassification"
|
6 |
+
],
|
7 |
+
"attention_dropout": 0.1,
|
8 |
+
"dim": 768,
|
9 |
+
"dropout": 0.1,
|
10 |
+
"hidden_dim": 3072,
|
11 |
+
"id2label": {
|
12 |
+
"0": "LABEL_0",
|
13 |
+
"1": "LABEL_1",
|
14 |
+
"2": "LABEL_2",
|
15 |
+
"3": "LABEL_3",
|
16 |
+
"4": "LABEL_4",
|
17 |
+
"5": "LABEL_5",
|
18 |
+
"6": "LABEL_6",
|
19 |
+
"7": "LABEL_7",
|
20 |
+
"8": "LABEL_8",
|
21 |
+
"9": "LABEL_9",
|
22 |
+
"10": "LABEL_10",
|
23 |
+
"11": "LABEL_11",
|
24 |
+
"12": "LABEL_12",
|
25 |
+
"13": "LABEL_13",
|
26 |
+
"14": "LABEL_14",
|
27 |
+
"15": "LABEL_15",
|
28 |
+
"16": "LABEL_16",
|
29 |
+
"17": "LABEL_17",
|
30 |
+
"18": "LABEL_18",
|
31 |
+
"19": "LABEL_19",
|
32 |
+
"20": "LABEL_20"
|
33 |
+
},
|
34 |
+
"initializer_range": 0.02,
|
35 |
+
"label2id": {
|
36 |
+
"LABEL_0": 0,
|
37 |
+
"LABEL_1": 1,
|
38 |
+
"LABEL_10": 10,
|
39 |
+
"LABEL_11": 11,
|
40 |
+
"LABEL_12": 12,
|
41 |
+
"LABEL_13": 13,
|
42 |
+
"LABEL_14": 14,
|
43 |
+
"LABEL_15": 15,
|
44 |
+
"LABEL_16": 16,
|
45 |
+
"LABEL_17": 17,
|
46 |
+
"LABEL_18": 18,
|
47 |
+
"LABEL_19": 19,
|
48 |
+
"LABEL_2": 2,
|
49 |
+
"LABEL_20": 20,
|
50 |
+
"LABEL_3": 3,
|
51 |
+
"LABEL_4": 4,
|
52 |
+
"LABEL_5": 5,
|
53 |
+
"LABEL_6": 6,
|
54 |
+
"LABEL_7": 7,
|
55 |
+
"LABEL_8": 8,
|
56 |
+
"LABEL_9": 9
|
57 |
+
},
|
58 |
+
"max_position_embeddings": 512,
|
59 |
+
"model_type": "distilbert",
|
60 |
+
"n_heads": 12,
|
61 |
+
"n_layers": 6,
|
62 |
+
"pad_token_id": 0,
|
63 |
+
"qa_dropout": 0.1,
|
64 |
+
"seq_classif_dropout": 0.2,
|
65 |
+
"sinusoidal_pos_embds": false,
|
66 |
+
"tie_weights_": true,
|
67 |
+
"transformers_version": "4.26.0",
|
68 |
+
"vocab_size": 30522
|
69 |
+
}
|
models/distilbert/special_tokens_map.json
ADDED
@@ -0,0 +1,7 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cls_token": "[CLS]",
|
3 |
+
"mask_token": "[MASK]",
|
4 |
+
"pad_token": "[PAD]",
|
5 |
+
"sep_token": "[SEP]",
|
6 |
+
"unk_token": "[UNK]"
|
7 |
+
}
|
models/distilbert/tf_model.h5
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2f5ddf0f86b178c98159599cedfbad2b01062ed3290d73f0af7780480ba1106c
|
3 |
+
size 268014760
|
models/distilbert/tf_model.preproc
ADDED
Binary file (2.98 kB). View file
|
|
models/distilbert/tokenizer.json
ADDED
The diff for this file is too large to render.
See raw diff
|
|
models/distilbert/tokenizer_config.json
ADDED
@@ -0,0 +1,14 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cls_token": "[CLS]",
|
3 |
+
"do_lower_case": true,
|
4 |
+
"mask_token": "[MASK]",
|
5 |
+
"model_max_length": 512,
|
6 |
+
"name_or_path": "distilbert-base-uncased",
|
7 |
+
"pad_token": "[PAD]",
|
8 |
+
"sep_token": "[SEP]",
|
9 |
+
"special_tokens_map_file": null,
|
10 |
+
"strip_accents": null,
|
11 |
+
"tokenize_chinese_chars": true,
|
12 |
+
"tokenizer_class": "DistilBertTokenizer",
|
13 |
+
"unk_token": "[UNK]"
|
14 |
+
}
|
models/distilbert/vocab.txt
ADDED
The diff for this file is too large to render.
See raw diff
|
|
requirements.txt
ADDED
@@ -0,0 +1,16 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
streamlit
|
2 |
+
streamlit-option-menu
|
3 |
+
streamlit-echarts
|
4 |
+
ktrain
|
5 |
+
# spacy>=3.0.0,<4.0.0
|
6 |
+
# en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.4.1/en_core_web_lg-3.4.1-py3-none-any.whl
|
7 |
+
# skillNer
|
8 |
+
pandas
|
9 |
+
numpy
|
10 |
+
ipython
|
11 |
+
neattext
|
12 |
+
tensorflow
|
13 |
+
st_annotated_text==2.0.0
|
14 |
+
requests
|
15 |
+
# nltk
|
16 |
+
# Unidecode
|
utils.py
ADDED
@@ -0,0 +1,180 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import sys
|
2 |
+
import subprocess
|
3 |
+
import streamlit as st
|
4 |
+
import numpy as np
|
5 |
+
from annotated_text import annotation
|
6 |
+
import collections
|
7 |
+
import ktrain
|
8 |
+
import pandas as pd
|
9 |
+
import os
|
10 |
+
import neattext.functions as nfx
|
11 |
+
|
12 |
+
|
13 |
+
# Mapping of job-category display names to the integer class codes the
# classifier was trained with (data/labels.txt).
label_path = ("./data/labels.txt")
cols = ['cat', 'code']
# header=0 makes pandas skip the junk first row (",0") while names= supplies
# the actual column names.
label_df = pd.read_csv(label_path, names=cols, header=0)
|
16 |
+
|
17 |
+
|
18 |
+
def default_text():
    """Return the sample course-description text that pre-fills the demo form.

    Reads ./data/sample.txt relative to the app's working directory.

    Returns:
        str: the full contents of the sample file.
    """
    # Explicit encoding: the sample file contains non-ASCII quote characters,
    # so relying on the platform default codec could crash or mangle them.
    with open("./data/sample.txt", 'r', encoding='utf-8') as fs:
        text = fs.read()
    return text
|
22 |
+
|
23 |
+
# st.cache was deprecated and then removed from Streamlit; with the unpinned
# `streamlit` requirement this raises at import time on new versions. Prefer
# the modern cache_resource (made for unserializable objects such as models)
# and fall back to st.cache only on old Streamlit releases.
if hasattr(st, "cache_resource"):
    _model_cache = st.cache_resource
else:
    _model_cache = st.cache(allow_output_mutation=True, suppress_st_warning=True)


@_model_cache
def load_model():
    """Load (once, cached) the ktrain predictor for the DistilBERT job-category classifier.

    Returns:
        ktrain.predictor.Predictor: predictor restored from ./models/distilbert/.
    """
    model_path = "./models/distilbert/"
    model = ktrain.load_predictor(model_path)
    return model
|
28 |
+
|
29 |
+
# Same Streamlit-version shim as for load_model: st.cache no longer exists on
# current Streamlit, so use cache_resource when available.
if hasattr(st, "cache_resource"):
    _extractor_cache = st.cache_resource
else:
    _extractor_cache = st.cache(allow_output_mutation=True, suppress_st_warning=True)


@_extractor_cache
def load_skill_extractor():
    """Build (once, cached) a skillNer SkillExtractor backed by spaCy.

    Returns:
        SkillExtractor: extractor wired to en_core_web_lg and skillNer's SKILL_DB.
    """
    # Imports are local so the heavy spaCy/skillNer stack is only loaded the
    # first time this function actually runs.
    import spacy

    from skillNer.skill_extractor_class import SkillExtractor
    from skillNer.general_params import SKILL_DB

    from spacy.matcher import PhraseMatcher
    # init params of skill extractor
    print('load model')

    nlp = spacy.load('en_core_web_lg')

    print('load matcher')
    # init skill extractor
    skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher,)
    return skill_extractor
|
47 |
+
|
48 |
+
|
49 |
+
|
50 |
+
def clean_text(text):
    """Strip noise from *text* with neattext and return the cleaned string.

    Removes emails, URLs, dates, HTML tags, numbers, punctuation, stopwords
    and special characters. Returns None if cleaning fails for any reason.
    """
    try:
        frame = nfx.TextFrame(text)
        cleaned = (
            frame.remove_emails()
            .remove_urls()
            .remove_dates()
            .remove_html_tags()
            .remove_numbers()
            .remove_puncts()
            .remove_stopwords()
            .remove_special_characters()
        )
        # doc = nlp(result.text)
        # empty_list = []
        # for token in doc:
        #     empty_list.append(token.lemma_)
        # final_string = ' '.join(map(str,empty_list))
        return cleaned.text
    except Exception as exc:
        print(exc)
        return None
|
63 |
+
|
64 |
+
|
65 |
+
def predict_cat(model, text):
    """Classify *text* and return its confidence and category.

    Args:
        model: a ktrain Predictor (exposes predict() and get_classes()).
        text: cleaned job/course description string.

    Returns:
        tuple[int, str]: (probability as integer percent of the top class,
        predicted category label).
    """
    # Single forward pass: the original called model.predict() twice, running
    # the (expensive) transformer inference two times per request. The label
    # is recovered from the argmax of the probability vector via the
    # predictor's class list instead.
    probs = model.predict(text, return_proba=True)
    p = int(probs.max() * 100)
    cat = model.get_classes()[int(probs.argmax())]
    return p, cat
|
69 |
+
|
70 |
+
|
71 |
+
def grouper(iterable):
    """Yield runs of consecutive integers from *iterable* as lists.

    E.g. [1, 2, 3, 7, 8] -> [1, 2, 3], [7, 8]. The input is assumed to be
    sorted ascending (it receives sorted token indices).
    """
    # `prev is None` rather than `not prev`: 0 is falsy, so the original
    # merged a leading index 0 with a non-consecutive following index.
    prev = None
    group = []
    for item in iterable:
        if prev is None or item - prev <= 1:
            group.append(item)
        else:
            yield group
            group = [item]
        prev = item
    if group:
        yield group
|
83 |
+
|
84 |
+
|
85 |
+
def get_skill(annotations):
    """Extract a space-joined string of unique skill surface forms.

    Args:
        annotations: a skillNer annotation dict; reads
            annotations['results']['ngram_scored'].

    Returns:
        str | None: unique soft-skill tokens followed by unique hard-skill
        tokens, space-separated; None if anything fails.
    """
    try:
        # Look skills up in skillNer's SKILL_DB directly (same pattern as
        # create_ann_list/create_dfs). The original referenced a global
        # `skill_extractor` whose loader is commented out, so every call
        # raised NameError and returned None.
        try:
            from skillNer.general_params import SKILL_DB
        except ImportError:
            # install skillner if not done yet
            os.system('pip install skillner')
            from skillNer.general_params import SKILL_DB

        soft_skill = []
        hard_skill = []

        for item in annotations['results']['ngram_scored']:
            skill_type = SKILL_DB[item['skill_id']]['skill_type']
            if skill_type == 'Soft Skill' and item['doc_node_value']:
                soft_skill.append(item['doc_node_value'])
            if skill_type == 'Hard Skill':
                hard_skill.append(item['doc_node_value'])

        sk = " ".join(set(soft_skill))
        hk = " ".join(set(hard_skill))
        # Join with a separator: `sk + hk` fused the last soft skill and the
        # first hard skill into one token.
        return (sk + " " + hk).strip()
    except Exception:
        return None
|
106 |
+
|
107 |
+
|
108 |
+
def install(package):
    """Install *package* into the current interpreter's environment via pip."""
    cmd = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(cmd)
|
110 |
+
|
111 |
+
|
112 |
+
|
113 |
+
|
114 |
+
|
115 |
+
def create_ann_list(text, results):
    """Build the sequence of items fed to st-annotated-text for *text*.

    Matched skill spans become `annotation(...)` objects colored by skill
    type; unmatched stretches stay plain strings. Items are returned in
    document order, keyed by the index of each span's first token.

    Args:
        text: the original text whose whitespace-split tokens were annotated.
        results: skillNer results dict with 'ngram_scored' and 'full_matches'.

    Returns:
        Ordered view of annotation objects and plain strings, in token order.
    """
    try:
        from skillNer.general_params import SKILL_DB
    except:
        # install skillner if not done yet
        os.system('pip install skillner')
        from skillNer.general_params import SKILL_DB

    type_to_color = {'Hard Skill': "#faa",
                     'Soft Skill': '#afa', 'Certification': '#ff4'}
    # Token positions here must mirror skillNer's doc_node_id indexing —
    # assumes skillNer tokenized on single spaces as well; TODO confirm.
    text_tokens = text.split(' ')
    annots = {}
    all_res = results['ngram_scored']+results['full_matches']
    ids_done = []
    # create annotations from matches
    for match in all_res:
        id_ = match['skill_id']
        type_ = SKILL_DB[id_]['skill_type']
        # Reassemble the matched span from its token indices.
        span_str = ' '.join([text_tokens[i] for i in match['doc_node_id']])
        annot = annotation(span_str, type_, background=type_to_color[type_],
                           color="#333", margin='2px')
        # Key by the span's first token index so sorting restores doc order.
        annots[match['doc_node_id'][0]] = annot
        for i in match['doc_node_id']:
            ids_done.append(i)
    # create strs for non annotated text
    non_match_ids = [i for i, _ in enumerate(text_tokens) if i not in ids_done]
    # grouper() merges consecutive unmatched indices into single plain spans.
    dict_ = dict(enumerate(grouper(non_match_ids), 1))
    for v in dict_.values():
        span = ' '.join([text_tokens[i] for i in v])
        annots[v[0]] = span
        # annotation(token,color="#fff", background="transparent",)
    print(dict_)
    print('-----')
    # print(collections.OrderedDict(sorted(annots.items())))
    # Sort by first-token index to interleave annotations and plain spans
    # back into reading order.
    annots_ = collections.OrderedDict(sorted(annots.items())).values()
    return annots_
|
151 |
+
|
152 |
+
|
153 |
+
def create_dfs(results):
    """Tabulate skillNer matches as two DataFrames.

    Args:
        results: skillNer results dict with 'full_matches' and 'ngram_scored'.

    Returns:
        tuple[pd.DataFrame, pd.DataFrame]: (exact matches with id/name/type,
        scored ngram matches with an extra 'score' column).
    """
    try:
        from skillNer.general_params import SKILL_DB
    except:
        # install skillner if not done yet
        os.system('pip install skillner')
        from skillNer.general_params import SKILL_DB

    full_rows = [
        [m['skill_id'],
         SKILL_DB[m['skill_id']]['skill_name'],
         SKILL_DB[m['skill_id']]['skill_type']]
        for m in results['full_matches']
    ]
    scored_rows = [
        [m['skill_id'],
         SKILL_DB[m['skill_id']]['skill_name'],
         SKILL_DB[m['skill_id']]['skill_type'],
         m['score']]
        for m in results['ngram_scored']
    ]

    full_df = pd.DataFrame(
        full_rows, columns=['skill id', 'skill name', 'skill type'])
    sub_df = pd.DataFrame(
        scored_rows, columns=['skill id', 'skill name', 'skill type', 'score'])
    return full_df, sub_df
|