Upload 4 files

- .gitignore +1 -0
- app.py +21 -0
- pipeline.py +156 -0
- requirements.txt +6 -0
.gitignore
ADDED
@@ -0,0 +1 @@
+bert_fine_tuned
app.py
ADDED
@@ -0,0 +1,21 @@
+import streamlit as st
+from pipeline import get_predictions, load_model
+import time
+
+
+model, tokenizer = load_model()
+
+st.title("Skills Extraction from Job Post")
+
+# Input text area for the job description
+user_input = st.text_area("Enter the job description:", "")
+
+# Button to trigger predictions
+if st.button("Get Predictions"):
+    # model and tokenizer were loaded once at startup above
+    start_time = time.time()
+    predicted_skills = get_predictions(user_input, model, tokenizer, threshold=0.65)
+    execution_time = time.time() - start_time
+
+    st.write("Predicted Skills:", predicted_skills)
+    st.write(f"Execution Time: {execution_time:.4f} seconds")
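Note that app.py calls load_model() at module scope, and Streamlit re-executes the whole script on every interaction, so the model is re-loaded from disk on each rerun. A minimal sketch of one way to cache it, assuming a Streamlit version that provides st.cache_resource (this wrapper is not part of the committed app):

import streamlit as st
from pipeline import load_model

@st.cache_resource  # assumption: a Streamlit release that ships this decorator (>= 1.18)
def get_cached_model():
    # Load the model and tokenizer once per process instead of once per rerun
    return load_model()

model, tokenizer = get_cached_model()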
pipeline.py
ADDED
@@ -0,0 +1,156 @@
+import pandas as pd
+import torch
+import re
+
+import nltk
+from nltk.corpus import stopwords
+from transformers import BertTokenizer, BertForSequenceClassification
+import contractions
+
+# Download the NLTK data this module depends on (a no-op if already present)
+nltk.download('punkt', quiet=True)
+nltk.download('stopwords', quiet=True)
+nltk.download('averaged_perceptron_tagger', quiet=True)
+
+stop_words = set(stopwords.words('english'))
+
+
+def load_model():
+    """Load the fine-tuned BERT classifier and tokenizer from the Hugging Face Hub.
+
+    A local checkpoint under ./bert_fine_tuned can be substituted for the Hub name.
+    """
+    model_name = "azrai99/bert-skills-extraction"
+    model = BertForSequenceClassification.from_pretrained(model_name)
+    tokenizer = BertTokenizer.from_pretrained(model_name)
+    return model, tokenizer
+
+
+def clean(desc):
+    """Expand contractions and strip punctuation that confuses the POS tagger."""
+    desc = contractions.fix(desc)
+    desc = re.sub(r"[!@.$':()]", "", desc)
+    return desc
+
+
+def extract_POS(tagged):
+    """Chunk a POS-tagged token list with four grammar patterns."""
+    # Pattern 1: a typical noun phrase (optional determiner, adjectives, nouns)
+    grammar1 = '''Noun Phrases: {<DT>?<JJ>*<NN|NNS|NNP>+}'''
+    chunkParser = nltk.RegexpParser(grammar1)
+    tree1 = chunkParser.parse(tagged)
+
+    g1_chunks = []
+    for subtree in tree1.subtrees(filter=lambda t: t.label() == 'Noun Phrases'):
+        g1_chunks.append(subtree)
+
+    # Pattern 2: a noun-phrase variation with an optional leading preposition
+    grammar2 = '''NP2: {<IN>?<JJ|NN>*<NNS|NN>}'''
+    chunkParser = nltk.RegexpParser(grammar2)
+    tree2 = chunkParser.parse(tagged)
+
+    g2_chunks = []
+    for subtree in tree2.subtrees(filter=lambda t: t.label() == 'NP2'):
+        g2_chunks.append(subtree)
+
+    # Pattern 3: a verb followed by nouns (e.g. "developing APIs")
+    grammar3 = '''VS: {<VBG|VBZ|VBP|VBD|VB|VBN><NNS|NN>*}'''
+    chunkParser = nltk.RegexpParser(grammar3)
+    tree3 = chunkParser.parse(tagged)
+
+    g3_chunks = []
+    for subtree in tree3.subtrees(filter=lambda t: t.label() == 'VS'):
+        g3_chunks.append(subtree)
+
+    # Pattern 4: comma-separated noun lists, a common way of listing skills
+    grammar4 = '''Commas: {<NN|NNS>*<,><NN|NNS>*<,><NN|NNS>*}'''
+    chunkParser = nltk.RegexpParser(grammar4)
+    tree4 = chunkParser.parse(tagged)
+
+    g4_chunks = []
+    for subtree in tree4.subtrees(filter=lambda t: t.label() == 'Commas'):
+        g4_chunks.append(subtree)
+
+    return g1_chunks, g2_chunks, g3_chunks, g4_chunks
+
+
+def tokenize_and_tag(desc):
+    """Lowercase, tokenize, drop stopwords, and POS-tag a description."""
+    tokens = nltk.word_tokenize(desc.lower())
+    filtered_tokens = [w for w in tokens if w not in stop_words]
+    return nltk.pos_tag(filtered_tokens)
+
+
+def training_set(chunks):
+    '''Build a series of candidate phrases from the chunk data.'''
+    df = pd.DataFrame(chunks)
+    df.fillna('X', inplace=True)
+
+    train = []
+    for row in df.values:
+        phrase = ''
+        for tup in row:
+            # a trailing space keeps the tokens separated
+            phrase += tup[0] + ' '
+        # the encoder provides padding during tokenizing/embedding,
+        # so the 'X' placeholders can simply be stripped here
+        train.append(phrase.replace('X', '').strip())
+
+    df['phrase'] = train
+    return df.phrase
+
+
+def strip_commas(df):
+    '''Split comma-separated phrases into a series of individual n-grams.'''
+    grams = []
+    for sen in df:
+        for word in sen.split(','):
+            grams.append(word)
+    return pd.Series(grams)
+
+
+def generate_phrases(desc):
+    """Generate candidate skill phrases from a raw job description."""
+    tagged = tokenize_and_tag(desc)
+    g1_chunks, g2_chunks, g3_chunks, g4_chunks = extract_POS(tagged)
+    separated_chunks4 = strip_commas(training_set(g4_chunks))
+    phrases = pd.concat([training_set(g1_chunks),
+                         training_set(g2_chunks),
+                         training_set(g3_chunks),
+                         separated_chunks4],
+                        ignore_index=True)
+    return phrases
+
+
+def get_predictions(desc, model, tokenizer, threshold=0.6, return_probabilities=False):
+    # Clean the description and generate candidate phrases
+    desc = clean(desc)
+    phrases = [phrase.strip() for phrase in generate_phrases(desc).tolist()]
+
+    # Tokenize and batch the phrases for the model
+    inputs = tokenizer(phrases, return_tensors="pt", truncation=True, padding=True)
+
+    # Perform inference
+    with torch.no_grad():
+        outputs = model(**inputs)
+
+    # Convert logits to probabilities and threshold the positive class
+    probs = torch.nn.functional.softmax(outputs.logits, dim=1)
+    predictions = (probs[:, 1] > threshold).to(torch.int32)
+
+    out = pd.DataFrame({'Phrase': phrases,
+                        'Class': predictions.tolist(),
+                        'Probability': probs[:, 1].tolist()})
+    skills = out.loc[out['Class'] == 1]
+
+    if return_probabilities:
+        return skills['Phrase'].tolist(), skills['Probability'].tolist()
+    return skills['Phrase'].unique().tolist()
+
+
+def get_predictions_excel(filename, model, tokenizer):
+    """Batch variant; the CSV's description column must be titled 'Job Description'."""
+    df = pd.read_csv(filename)
+    df['Extracted skills'] = df['Job Description'].apply(
+        lambda x: get_predictions(x, model, tokenizer))
+    return df.to_csv('extracted.csv')
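For reference, a minimal way to exercise the pipeline outside Streamlit (the sample description below is made up):

from pipeline import load_model, get_predictions

model, tokenizer = load_model()
desc = "We are looking for an engineer with experience in Python, SQL and machine learning."
print(get_predictions(desc, model, tokenizer, threshold=0.6))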
requirements.txt
ADDED
@@ -0,0 +1,6 @@
+streamlit
+pandas
+nltk
+torch
+contractions
+transformers
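To try the Space locally, the standard Streamlit workflow should apply: install the dependencies with "pip install -r requirements.txt", then launch the app with "streamlit run app.py". pipeline.py fetches the NLTK corpora it needs (punkt, stopwords, averaged_perceptron_tagger) on first import.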