azrai99 committed
Commit 6fd73cf · verified · 1 Parent(s): c173f11

Upload 4 files

Files changed (4)
  1. .gitignore +1 -0
  2. app.py +21 -0
  3. pipeline.py +151 -0
  4. requirements.txt +6 -0
.gitignore ADDED
@@ -0,0 +1 @@
+ bert_fine_tuned
app.py ADDED
@@ -0,0 +1,21 @@
+ import streamlit as st
+ from pipeline import get_predictions, load_model
+ import time
+
+
+ model, tokenizer = load_model()
+
+ st.title("Skills Extraction from Job Post")
+
+ # Input text area for the job description
+ user_input = st.text_area("Enter the job description:", "")
+
+ # Button to trigger predictions
+ if st.button("Get Predictions"):
+     # Run the model and time the prediction
+     start_time = time.time()
+     predicted_skills = get_predictions(user_input, model, tokenizer, threshold=0.65)
+     execution_time = time.time() - start_time
+
+     st.write("Predicted Skills:", predicted_skills)
+     st.write(f"Execution Time: {execution_time:.4f} seconds")
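Streamlit re-runs app.py on every interaction, so the bare load_model() call above reloads the BERT weights each time the button is pressed. A minimal sketch of caching the loader with Streamlit's st.cache_resource decorator (the wrapper name cached_model is illustrative):

import streamlit as st
from pipeline import load_model

@st.cache_resource
def cached_model():
    # Loaded once per process and reused across Streamlit reruns
    return load_model()

model, tokenizer = cached_model()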
pipeline.py ADDED
@@ -0,0 +1,151 @@
+ import pandas as pd
+ import torch
+
+ from transformers import BertTokenizer, BertForSequenceClassification
+ import contractions
+ import re
+ import nltk
+ from nltk.corpus import stopwords
+
+ # Fetch the NLTK data needed for tokenizing, POS tagging and stopword removal
+ nltk.download('punkt', quiet=True)
+ nltk.download('stopwords', quiet=True)
+ nltk.download('averaged_perceptron_tagger', quiet=True)
+
+ stop_words = set(stopwords.words('english'))
+
+
+ # Load the fine-tuned BERT model and tokenizer from the Hugging Face Hub
+ def load_model():
+     model_name = "azrai99/bert-skills-extraction"
+     model = BertForSequenceClassification.from_pretrained(model_name)
+     tokenizer = BertTokenizer.from_pretrained(model_name)
+     return model, tokenizer
+
+
+ def clean(desc):
+     """Expand contractions and strip punctuation that confuses the POS tagger."""
+     desc = contractions.fix(desc)
+     desc = re.sub(r"[!@.$':()]", "", desc)
+     return desc
+
+
+ def extract_POS(tagged):
+     # Pattern 1: typical noun phrase (optional determiner, adjectives, nouns),
+     # appended here to be concatenated with the other patterns later
+     grammar1 = '''Noun Phrases: {<DT>?<JJ>*<NN|NNS|NNP>+}'''
+     chunkParser = nltk.RegexpParser(grammar1)
+     tree1 = chunkParser.parse(tagged)
+
+     g1_chunks = []
+     for subtree in tree1.subtrees(filter=lambda t: t.label() == 'Noun Phrases'):
+         g1_chunks.append(subtree)
+
+     # Pattern 2: a variation of the noun phrase, optionally preceded by a preposition
+     grammar2 = '''NP2: {<IN>?<JJ|NN>*<NNS|NN>}'''
+     chunkParser = nltk.RegexpParser(grammar2)
+     tree2 = chunkParser.parse(tagged)
+
+     g2_chunks = []
+     for subtree in tree2.subtrees(filter=lambda t: t.label() == 'NP2'):
+         g2_chunks.append(subtree)
+
+     # Pattern 3: a verb followed by nouns, e.g. "managing budgets"
+     grammar3 = '''VS: {<VBG|VBZ|VBP|VBD|VB|VBN><NNS|NN>*}'''
+     chunkParser = nltk.RegexpParser(grammar3)
+     tree3 = chunkParser.parse(tagged)
+
+     g3_chunks = []
+     for subtree in tree3.subtrees(filter=lambda t: t.label() == 'VS'):
+         g3_chunks.append(subtree)
+
+     # Pattern 4: comma-separated runs of nouns, a common way of listing skills
+     grammar4 = '''Commas: {<NN|NNS>*<,><NN|NNS>*<,><NN|NNS>*}'''
+     chunkParser = nltk.RegexpParser(grammar4)
+     tree4 = chunkParser.parse(tagged)
+
+     g4_chunks = []
+     for subtree in tree4.subtrees(filter=lambda t: t.label() == 'Commas'):
+         g4_chunks.append(subtree)
+
+     return g1_chunks, g2_chunks, g3_chunks, g4_chunks
+
+
+ def tokenize_and_tag(desc):
+     tokens = nltk.word_tokenize(desc.lower())
+     filtered_tokens = [w for w in tokens if w not in stop_words]
+     tagged = nltk.pos_tag(filtered_tokens)
+     return tagged
+
+
+ def training_set(chunks):
+     '''Create a series of candidate phrases from the extracted chunks.'''
+     df = pd.DataFrame(chunks)
+     df.fillna('X', inplace=True)
+
+     train = []
+     for row in df.values:
+         phrase = ''
+         for tup in row:
+             # a trailing space keeps tokens separated
+             phrase += tup[0] + ' '
+         # 'X' only padded the ragged rows; strip it back out of the phrase
+         train.append(phrase.replace('X', '').strip())
+
+     df['phrase'] = train
+     return df.phrase
+
+
+ def strip_commas(df):
+     '''Split comma-separated phrases into a series of individual n-grams.'''
+     grams = []
+     for sen in df:
+         for word in sen.split(','):
+             grams.append(word)
+     return pd.Series(grams)
+
+
+ def generate_phrases(desc):
+     tagged = tokenize_and_tag(desc)
+     g1_chunks, g2_chunks, g3_chunks, g4_chunks = extract_POS(tagged)
+     separated_chunks4 = strip_commas(training_set(g4_chunks))
+     phrases = pd.concat([training_set(g1_chunks),
+                          training_set(g2_chunks),
+                          training_set(g3_chunks),
+                          separated_chunks4],
+                         ignore_index=True)
+     return phrases
+
+
+ def get_predictions(desc, model, tokenizer, threshold=0.6, return_probabilities=False):
+     # Clean the description and generate candidate phrases
+     desc = clean(desc)
+     phrases = [phrase.strip() for phrase in generate_phrases(desc).tolist()]
+
+     # Tokenize and prepare the phrases for the model
+     inputs = tokenizer(phrases, return_tensors="pt", truncation=True, padding=True)
+
+     # Perform inference
+     with torch.no_grad():
+         outputs = model(**inputs)
+
+     # Convert logits to probabilities and threshold the positive class
+     probs = torch.nn.functional.softmax(outputs.logits, dim=1)
+     predictions = (probs[:, 1] > threshold).to(torch.int32)
+
+     out = pd.DataFrame({'Phrase': phrases,
+                         'Class': predictions.numpy(),
+                         'Probability': probs[:, 1].numpy()})
+     skills = out.loc[out['Class'] == 1]
+
+     if return_probabilities:
+         return skills['Phrase'].tolist(), skills['Probability'].tolist()
+     return skills['Phrase'].unique().tolist()
+
+
+ def get_predictions_excel(filename, model, tokenizer):
+     """The description column must be titled 'Job Description'."""
+     df = pd.read_csv(filename)
+     df['Extracted skills'] = df['Job Description'].apply(
+         lambda x: get_predictions(x, model, tokenizer))
+     return df.to_csv('extracted.csv')
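For reference, a minimal way to exercise the pipeline outside the Streamlit app (the sample job description is illustrative):

from pipeline import load_model, get_predictions

model, tokenizer = load_model()

# An illustrative job description; any free-form posting text works here
desc = "We are looking for a data analyst with strong SQL, Python and Tableau skills."
print(get_predictions(desc, model, tokenizer, threshold=0.65))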
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ streamlit
+ pandas
+ nltk
+ torch
+ contractions
+ transformers
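After installing these requirements, the batch helper in pipeline.py can be exercised as sketched below; jobs.csv is a hypothetical input file with a 'Job Description' column, as get_predictions_excel expects:

from pipeline import load_model, get_predictions_excel

model, tokenizer = load_model()

# 'jobs.csv' is a hypothetical CSV with a 'Job Description' column;
# the helper writes its output to extracted.csv
get_predictions_excel("jobs.csv", model, tokenizer)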