Basanth committed on
Commit
91ec262
·
1 Parent(s): 857f872

uploaded all the files

Browse files
app.py ADDED
@@ -0,0 +1,41 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import streamlit as st

# set_page_config must be the first Streamlit command executed in the script.
st.set_page_config(layout="wide")

from streamlit_option_menu import option_menu
from apps import home, eda, models, demo


# Session flag shared with the demo page across Streamlit reruns.
if "valid_inputs_received" not in st.session_state:
    st.session_state["valid_inputs_received"] = False

header = st.container()

# Registry of sidebar pages.  The Models page is currently disabled
# (entry commented out), although the module is still imported above.
apps = [
    {"func": home.app, "title": "Home", "icon": "house"},
    {"func": eda.app, "title": "EDA", "icon": "bar-chart"},
    # {"func": models.app, "title": "Models", "icon": "cpu"},
    {"func": demo.app, "title": "Demo", "icon": "cloud-upload"},
]

titles = [app["title"] for app in apps]
titles_lower = [title.lower() for title in titles]  # NOTE(review): unused in this file
icons = [app["icon"] for app in apps]

with st.sidebar:
    selected = option_menu(
        "Main Menu",
        options=titles,
        icons=icons,
        menu_icon="cast",
    )

# Dispatch to the page whose title matches the sidebar selection.
for app in apps:
    if app["title"] == selected:
        app["func"]()
        break
apps/demo.py ADDED
@@ -0,0 +1,113 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import streamlit as st
from streamlit_echarts import st_echarts
import pandas as pd
from annotated_text import annotated_text, annotation
from utils import load_skill_extractor, create_ann_list
from utils import get_skill, clean_text, default_text, predict_cat, load_model


# Loaded once at import time.  load_model is cached with st.cache in
# utils.py, so reruns of the Streamlit script reuse the same ktrain
# predictor instead of reloading it from disk.
model = load_model()
# skill_extractor = load_skill_extractor()
def app():
    """Render the Demo page: classify submitted job text and show the result.

    The user submits free text; it is cleaned, run through the cached ktrain
    model, and the predicted job category is displayed next to an ECharts
    gauge showing the prediction probability (0-100).
    """
    st.title("Demo")
    with st.form(key="text_val"):
        input_text = st.text_area('Enter the text here', value=default_text(), height=200)
        submit_button = st.form_submit_button(label="Submit")

    col1, col2 = st.columns(2)

    # Gauge series data; `value` is filled in with the model's probability
    # once the user submits.
    gaugeData = [
        {
            "value": 0,
            # Fixed user-visible typo: was 'Probabiltiy'.
            "name": 'Probability',
            "detail": {
                "valueAnimation": True,
                "offsetCenter": ['0%', '0%']
            }
        }]
    option = {
        "series": [
            {
                "type": "gauge",
                "startAngle": 90,
                "endAngle": -270,
                "pointer": {
                    "show": False,
                },
                "progress": {
                    "show": True,
                    "overlap": False,
                    "roundCap": False,
                    "clip": False,
                    "backgroundColor": '#11D1F9',
                    "itemStyle": {
                        "color": '#E96605',
                        "borderWidth": 0,
                        "borderColor": "light blue"
                    }
                },
                "axisLine": {
                    "lineStyle": {
                        "width": 40
                    }
                },
                "splitLine": {
                    "show": False,
                    "distance": 0,
                    "length": 20
                },
                "axisTick": {
                    "show": False
                },
                "axisLabel": {
                    "show": False,
                    "distance": 50
                },
                "data": gaugeData,
                "detail": {
                    "valueAnimation": True,
                    "offsetCenter": ['0%', '0%'],
                    "width": 40,
                    "height": 14,
                    "fontSize": 24,
                    "color": 'inherit',
                    "borderColor": 'inherit',
                    "borderRadius": 0,
                    "borderWidth": 0,
                    "formatter": '{value}%'
                },
            }
        ]
    }

    # Results area.  NOTE(review): the original wrapped this in
    # st.form(key='result') with no form_submit_button, which Streamlit
    # rejects ("Missing Submit Button"); a plain container renders the
    # same content without the error.
    with st.container():
        if submit_button:
            # Only clean + run inference after the user actually submits,
            # instead of on every Streamlit rerun as the original did.
            cls_text = clean_text(input_text)
            prob, job_cat = predict_cat(model, cls_text)
            gaugeData[0]['value'] = prob
            with col1:
                st.markdown("<h1 style='text-align: center; color: #05A4E9;'>Job Category</h1>", unsafe_allow_html=True)
                # Fixed malformed HTML: the original opening <h1 ...> tag was
                # never closed with '>' before the nested <p> element.
                html_str = f"""
                <h1 style='text-align: center; font: bold 2em Courier;'>
                    <p class="a">{job_cat}</p>
                </h1>
                """
                st.markdown(html_str, unsafe_allow_html=True)
            with col2:
                st.markdown("<h1 style='text-align: center; color: #05A4E9;'>Probability</h1>", unsafe_allow_html=True)
                st_echarts(options=option, key="1")
apps/eda.py ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
import streamlit as st

PAGE_TITLE = "EDA"


def app():
    """Render the (placeholder) EDA page."""
    st.title(PAGE_TITLE)
apps/home.py ADDED
@@ -0,0 +1,12 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
import streamlit as st

# Introductory copy shown under the page title.
_INTRO = '''
In Africa, there is often a disconnection between the skills that job seekers possess and the skills that employers require. This can be due to a lack of access to education and training opportunities, as well as a lack of alignment between the education system and the needs of the job market.
Additionally, many employers in Africa may not have the resources or capacity to provide the necessary training and development for their employees. As a result, there is often a mismatch between the skills that workers have and the skills that employers need, which can make it difficult for workers to find employment or for employers to find qualified candidates.
'''


def app():
    """Render the Home page: project title and introduction text."""
    st.title("Ghana - Understanding The Disconnect between Skills and Jobs")
    st.markdown(_INTRO)
apps/models.py ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
import streamlit as st

PAGE_TITLE = "Models"


def app():
    """Render the (placeholder) Models page."""
    st.title(PAGE_TITLE)
data/labels.txt ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ,0
2
+ Accounting/Finance/Banking,0
3
+ Administrative/Secretarial ,1
4
+ Advertising/Media ,2
5
+ Agricultural ,3
6
+ Communication ,4
7
+ Customer Service ,5
8
+ Education ,6
9
+ "Energy,Oil & Gas ",7
10
+ Engineering/Processing/Manufacturing,8
11
+ Healthcare ,9
12
+ Hospitalilty/Food Service ,10
13
+ Human Resource Management ,11
14
+ IT ,12
15
+ Legal ,13
16
+ Mining ,14
17
+ Other,15
18
+ Purchasing/Procurement ,16
19
+ Quality Control/Assurance ,17
20
+ Sales / Marketing ,18
21
+ Securtity/Law Enforcement ,19
22
+ Supply Chain/Logistics/Warehousing ,20
data/sample.txt ADDED
@@ -0,0 +1 @@
 
 
1
+ An introduction to computers and how they work. Types and Historical Development of Computers. Number systems: binary, octal, hexadecimal, integer and fractional representations, Signed and Unsigned numbers, 1‘s complement, 2‘s complement and Arithmetic Overflows. Integer and floating point arithmetic (IEEE standard 754 Floating point Formats). Data Representation and Manipulation: Bits, bytes and words: Logic operations and Logic gates applications.
models/distilbert/config.json ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "_name_or_path": "/tmp/tmpqaytn67y",
3
+ "activation": "gelu",
4
+ "architectures": [
5
+ "DistilBertForSequenceClassification"
6
+ ],
7
+ "attention_dropout": 0.1,
8
+ "dim": 768,
9
+ "dropout": 0.1,
10
+ "hidden_dim": 3072,
11
+ "id2label": {
12
+ "0": "LABEL_0",
13
+ "1": "LABEL_1",
14
+ "2": "LABEL_2",
15
+ "3": "LABEL_3",
16
+ "4": "LABEL_4",
17
+ "5": "LABEL_5",
18
+ "6": "LABEL_6",
19
+ "7": "LABEL_7",
20
+ "8": "LABEL_8",
21
+ "9": "LABEL_9",
22
+ "10": "LABEL_10",
23
+ "11": "LABEL_11",
24
+ "12": "LABEL_12",
25
+ "13": "LABEL_13",
26
+ "14": "LABEL_14",
27
+ "15": "LABEL_15",
28
+ "16": "LABEL_16",
29
+ "17": "LABEL_17",
30
+ "18": "LABEL_18",
31
+ "19": "LABEL_19",
32
+ "20": "LABEL_20"
33
+ },
34
+ "initializer_range": 0.02,
35
+ "label2id": {
36
+ "LABEL_0": 0,
37
+ "LABEL_1": 1,
38
+ "LABEL_10": 10,
39
+ "LABEL_11": 11,
40
+ "LABEL_12": 12,
41
+ "LABEL_13": 13,
42
+ "LABEL_14": 14,
43
+ "LABEL_15": 15,
44
+ "LABEL_16": 16,
45
+ "LABEL_17": 17,
46
+ "LABEL_18": 18,
47
+ "LABEL_19": 19,
48
+ "LABEL_2": 2,
49
+ "LABEL_20": 20,
50
+ "LABEL_3": 3,
51
+ "LABEL_4": 4,
52
+ "LABEL_5": 5,
53
+ "LABEL_6": 6,
54
+ "LABEL_7": 7,
55
+ "LABEL_8": 8,
56
+ "LABEL_9": 9
57
+ },
58
+ "max_position_embeddings": 512,
59
+ "model_type": "distilbert",
60
+ "n_heads": 12,
61
+ "n_layers": 6,
62
+ "pad_token_id": 0,
63
+ "qa_dropout": 0.1,
64
+ "seq_classif_dropout": 0.2,
65
+ "sinusoidal_pos_embds": false,
66
+ "tie_weights_": true,
67
+ "transformers_version": "4.26.0",
68
+ "vocab_size": 30522
69
+ }
models/distilbert/special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "mask_token": "[MASK]",
4
+ "pad_token": "[PAD]",
5
+ "sep_token": "[SEP]",
6
+ "unk_token": "[UNK]"
7
+ }
models/distilbert/tf_model.h5 ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2f5ddf0f86b178c98159599cedfbad2b01062ed3290d73f0af7780480ba1106c
3
+ size 268014760
models/distilbert/tf_model.preproc ADDED
Binary file (2.98 kB). View file
 
models/distilbert/tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
models/distilbert/tokenizer_config.json ADDED
@@ -0,0 +1,14 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cls_token": "[CLS]",
3
+ "do_lower_case": true,
4
+ "mask_token": "[MASK]",
5
+ "model_max_length": 512,
6
+ "name_or_path": "distilbert-base-uncased",
7
+ "pad_token": "[PAD]",
8
+ "sep_token": "[SEP]",
9
+ "special_tokens_map_file": null,
10
+ "strip_accents": null,
11
+ "tokenize_chinese_chars": true,
12
+ "tokenizer_class": "DistilBertTokenizer",
13
+ "unk_token": "[UNK]"
14
+ }
models/distilbert/vocab.txt ADDED
The diff for this file is too large to render. See raw diff
 
requirements.txt ADDED
@@ -0,0 +1,16 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ streamlit
2
+ streamlit-option-menu
3
+ streamlit-echarts
4
+ ktrain
5
+ # spacy>=3.0.0,<4.0.0
6
+ # en_core_web_lg @ https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.4.1/en_core_web_lg-3.4.1-py3-none-any.whl
7
+ # skillNer
8
+ pandas
9
+ numpy
10
+ ipython
11
+ neattext
12
+ tensorflow
13
+ st_annotated_text==2.0.0
14
+ requests
15
+ # nltk
16
+ # Unidecode
utils.py ADDED
@@ -0,0 +1,180 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
import sys
import subprocess
import streamlit as st
import numpy as np
from annotated_text import annotation
import collections
import ktrain
import pandas as pd
import os
import neattext.functions as nfx


# Category-label lookup table.  labels.txt holds "<category name>,<code>"
# rows; header=0 skips the first (placeholder) row and `names` labels the
# columns.  NOTE(review): several category names carry trailing spaces and
# typos (e.g. "Hospitalilty", "Securtity") — leave them as-is, since they
# presumably match the labels the model was trained with; verify before
# changing the data file.
label_path = ("./data/labels.txt")
cols = ['cat', 'code']
label_df = pd.read_csv(label_path, names=cols, header=0)
def default_text():
    """Return the sample job-description text shown in the demo text area.

    Reads ./data/sample.txt relative to the app's working directory.
    """
    # Explicit encoding: the platform default (e.g. cp1252 on Windows) can
    # fail on non-ASCII characters in the sample text.
    with open("./data/sample.txt", 'r', encoding="utf-8") as fs:
        text = fs.read()
    return text
# Cache across Streamlit reruns so the (large) ktrain predictor is only
# deserialized once per process.
# NOTE(review): st.cache is deprecated in recent Streamlit releases in
# favour of st.cache_resource — confirm against the pinned version.
@st.cache(allow_output_mutation=True,suppress_st_warning=True)
def load_model():
    """Load and return the ktrain predictor saved under ./models/distilbert/."""
    model_path = "./models/distilbert/"
    model = ktrain.load_predictor(model_path)
    return model
# Cached: the extractor is built once per process.
@st.cache(allow_output_mutation=True, suppress_st_warning=True)
def load_skill_extractor():
    """Build and return a skillNer SkillExtractor backed by spacy.

    spacy/skillNer are imported lazily because they are heavy and this
    feature is currently disabled in the demo page.
    """
    # This function will only be run the first time it's called
    import spacy

    from skillNer.skill_extractor_class import SkillExtractor
    from skillNer.general_params import SKILL_DB

    from spacy.matcher import PhraseMatcher
    # init params of skill extractor
    print('load model')

    # Requires the en_core_web_lg model to have been downloaded beforehand.
    nlp = spacy.load('en_core_web_lg')

    print('load matcher')
    # init skill extractor
    skill_extractor = SkillExtractor(nlp, SKILL_DB, PhraseMatcher,)
    return skill_extractor
def clean_text(text):
    """Strip noise from *text* with neattext: emails, URLs, dates, HTML tags,
    numbers, punctuation, stopwords and special characters.

    Returns the cleaned string, or None if neattext fails on the input.
    """
    try:
        frame = nfx.TextFrame(text)
        frame = frame.remove_emails()
        frame = frame.remove_urls()
        frame = frame.remove_dates()
        frame = frame.remove_html_tags()
        frame = frame.remove_numbers()
        frame = frame.remove_puncts()
        frame = frame.remove_stopwords()
        frame = frame.remove_special_characters()
        return frame.text
    except Exception as err:
        print(err)
        return None
def predict_cat(model, text):
    """Run *model* on *text* and return (probability_percent, category).

    probability_percent is the top-class probability scaled to an int in
    0-100.  Note the predictor is invoked twice: once for the probability
    vector and once for the label.
    """
    probabilities = model.predict(text, return_proba=True)
    confidence = int(probabilities.max() * 100)
    category = model.predict(text)
    return confidence, category
def grouper(iterable):
    """Yield maximal runs of consecutive integers from *iterable*.

    Input is assumed sorted ascending; each yielded list is a run in which
    successive items differ by at most 1.  Yields nothing for an empty
    iterable.
    """
    prev = None
    group = []
    for item in iterable:
        # Bug fix: the original tested `not prev`, which is also true when
        # prev == 0, so a run starting at 0 swallowed every later item
        # regardless of the gap.  Test identity with None instead.
        if prev is None or item - prev <= 1:
            group.append(item)
        else:
            yield group
            group = [item]
        prev = item
    if group:
        yield group
def get_skill(annotations, skill_extractor=None):
    """Extract a flat string of skills from a skillNer annotation result.

    Parameters
    ----------
    annotations : dict
        Output of SkillExtractor.annotate(); scored n-gram matches are read
        from annotations['results']['ngram_scored'].
    skill_extractor : SkillExtractor, optional
        Used to look up each skill's type in its skills_db.  Bug fix: the
        original referenced an undefined module-level global here, so every
        call raised NameError and fell through to `return None`; callers
        must now pass the extractor in explicitly.

    Returns
    -------
    str or None
        Unique soft and hard skills joined by spaces, or None on any
        failure — including when no extractor is supplied, which matches
        the original's observable behavior.
    """
    try:
        soft_skills = []
        hard_skills = []
        for item in annotations['results']['ngram_scored']:
            skill_id = item['skill_id']
            skill_type = skill_extractor.skills_db[skill_id]['skill_type']
            if skill_type == 'Soft Skill' and item['doc_node_value']:
                soft_skills.append(item['doc_node_value'])
            if skill_type == 'Hard Skill':
                hard_skills.append(item['doc_node_value'])
        parts = list(set(soft_skills)) + list(set(hard_skills))
        # Bug fix: the original concatenated the two joined strings without
        # a separator, fusing the last soft skill with the first hard skill.
        return " ".join(parts)
    except Exception:
        return None
def install(package):
    """Install *package* into the current interpreter's environment via pip."""
    pip_cmd = [sys.executable, "-m", "pip", "install", package]
    subprocess.check_call(pip_cmd)
def create_ann_list(text, results):
    """Build the ordered sequence of annotated-text fragments for display.

    *text* is the raw input whose tokens (split on single spaces) are matched
    against skillNer *results*; matched spans become colored `annotation`
    objects, unmatched token runs stay plain strings.  Returns the fragments
    ordered by starting token index, ready for annotated_text(*fragments).
    """
    try:
        from skillNer.general_params import SKILL_DB
    except:
        # install skillner if not done yet
        os.system('pip install skillner')
        from skillNer.general_params import SKILL_DB

    # Background colors per skill type.
    type_to_color = {'Hard Skill': "#faa",
                     'Soft Skill': '#afa', 'Certification': '#ff4'}
    text_tokens = text.split(' ')
    # Map: index of a fragment's first token -> fragment (annotation or str).
    annots = {}
    all_res = results['ngram_scored']+results['full_matches']
    ids_done = []
    # create annotations from matches
    for match in all_res:
        id_ = match['skill_id']
        type_ = SKILL_DB[id_]['skill_type']
        span_str = ' '.join([text_tokens[i] for i in match['doc_node_id']])
        annot = annotation(span_str, type_, background=type_to_color[type_],
                           color="#333", margin='2px')
        # NOTE(review): if two matches start at the same token index, the
        # later one silently overwrites the earlier — confirm intended.
        annots[match['doc_node_id'][0]] = annot
        for i in match['doc_node_id']:
            ids_done.append(i)
    # create strs for non annotated text
    non_match_ids = [i for i, _ in enumerate(text_tokens) if i not in ids_done]
    # grouper() merges consecutive unmatched token indices into runs.
    dict_ = dict(enumerate(grouper(non_match_ids), 1))
    for v in dict_.values():
        span = ' '.join([text_tokens[i] for i in v])
        annots[v[0]] = span
        # annotation(token,color="#fff", background="transparent",)
    print(dict_)
    print('-----')
    # print(collections.OrderedDict(sorted(annots.items())))
    # Sort fragments by their starting token index for display order.
    annots_ = collections.OrderedDict(sorted(annots.items())).values()
    return annots_
def create_dfs(results):
    """Convert skillNer match results into two pandas DataFrames.

    Returns (full_df, sub_df): full matches with columns
    ['skill id', 'skill name', 'skill type'], and scored n-gram matches
    with an additional 'score' column.
    """
    try:
        from skillNer.general_params import SKILL_DB
    except:
        # install skillner if not done yet
        os.system('pip install skillner')
        from skillNer.general_params import SKILL_DB

    def base_row(match):
        # Look up the match's skill in the database once per row.
        sid = match['skill_id']
        entry = SKILL_DB[sid]
        return [sid, entry['skill_name'], entry['skill_type']]

    f_arr = [base_row(m) for m in results['full_matches']]
    s_arr = [base_row(m) + [m['score']] for m in results['ngram_scored']]

    full_df = pd.DataFrame(
        f_arr, columns=['skill id', 'skill name', 'skill type'])
    sub_df = pd.DataFrame(
        s_arr, columns=['skill id', 'skill name', 'skill type', 'score'])
    return full_df, sub_df