import re

import contractions
import nltk
import pandas as pd
import torch
from transformers import BertTokenizer, BertForSequenceClassification

# NLTK data needed for tokenization, POS tagging, and stop-word filtering.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))


def load_model():
    """Load the pretrained skill-extraction BERT model and its tokenizer."""
    model_name = "azrai99/bert-skills-extraction"
    model = BertForSequenceClassification.from_pretrained(model_name)
    tokenizer = BertTokenizer.from_pretrained(model_name)
    return model, tokenizer


def clean(desc):
    """Expand contractions and strip punctuation before tagging."""
    desc = contractions.fix(desc)
    desc = re.sub(r"[!@.$':()]", "", desc)
    return desc
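
# For example, clean("We're seeking devs (remote).") returns
# "We are seeking devs remote": contractions expanded, punctuation stripped.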


def extract_POS(tagged):
    """Chunk POS-tagged tokens with four grammars; return one chunk list per grammar."""
    # Grammar 1: noun phrases, e.g. "senior data engineer".
    grammar1 = 'Noun Phrases: {<DT>?<JJ>*<NN|NNS|NNP>+}'
    chunkParser = nltk.RegexpParser(grammar1)
    tree1 = chunkParser.parse(tagged)

    g1_chunks = []
    for subtree in tree1.subtrees(filter=lambda t: t.label() == 'Noun Phrases'):
        g1_chunks.append(subtree)

    # Grammar 2: adjective/noun modifiers ending in a noun, with an optional preposition.
    grammar2 = 'NP2: {<IN>?<JJ|NN>*<NNS|NN>}'
    chunkParser = nltk.RegexpParser(grammar2)
    tree2 = chunkParser.parse(tagged)

    g2_chunks = []
    for subtree in tree2.subtrees(filter=lambda t: t.label() == 'NP2'):
        g2_chunks.append(subtree)

    # Grammar 3: verb-led phrases, e.g. "developing models".
    grammar3 = 'VS: {<VBG|VBZ|VBP|VBD|VB|VBN><NNS|NN>*}'
    chunkParser = nltk.RegexpParser(grammar3)
    tree3 = chunkParser.parse(tagged)

    g3_chunks = []
    for subtree in tree3.subtrees(filter=lambda t: t.label() == 'VS'):
        g3_chunks.append(subtree)

    # Grammar 4: comma-separated noun lists, e.g. "python, sql, excel".
    grammar4 = 'Commas: {<NN|NNS>*<,><NN|NNS>*<,><NN|NNS>*}'
    chunkParser = nltk.RegexpParser(grammar4)
    tree4 = chunkParser.parse(tagged)

    g4_chunks = []
    for subtree in tree4.subtrees(filter=lambda t: t.label() == 'Commas'):
        g4_chunks.append(subtree)

    return g1_chunks, g2_chunks, g3_chunks, g4_chunks
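
# Rough illustration (hypothetical input; the exact chunks depend on the
# NLTK tagger version):
#
#   tagged = nltk.pos_tag(nltk.word_tokenize("experience developing models"))
#   g1, g2, g3, g4 = extract_POS(tagged)
#   # g3 would contain a verb-led subtree like (VS developing/VBG models/NNS)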


def tokenize_and_tag(desc):
    """Lowercase, tokenize, drop stop words, and POS-tag a description."""
    tokens = nltk.word_tokenize(desc.lower())
    filtered_tokens = [w for w in tokens if w not in stop_words]
    tagged = nltk.pos_tag(filtered_tokens)
    return tagged


def training_set(chunks):
    """Flatten chunk subtrees into a Series of candidate phrases."""
    df = pd.DataFrame(chunks)
    # Shorter chunks are padded with NaN; fill with a placeholder token
    # that is stripped back out below.
    df.fillna('X', inplace=True)

    train = []
    for row in df.values:
        phrase = ''
        for tup in row:
            # Each cell is a (token, tag) pair; keep only the token.
            phrase += tup[0] + ' '
        train.append(phrase.replace('X', '').strip())

    df['phrase'] = train
    return df.phrase


def strip_commas(df):
    """Split comma-joined phrases into a Series of individual n-grams."""
    grams = []
    for sen in df:
        for word in sen.split(','):
            grams.append(word)
    return pd.Series(grams)


def generate_phrases(desc):
    """Build the pool of candidate skill phrases from a job description."""
    tagged = tokenize_and_tag(desc)
    g1_chunks, g2_chunks, g3_chunks, g4_chunks = extract_POS(tagged)
    # Comma-list chunks (grammar 4) are split into individual terms first.
    separated_chunks4 = strip_commas(training_set(g4_chunks))
    phrases = pd.concat([training_set(g1_chunks),
                         training_set(g2_chunks),
                         training_set(g3_chunks),
                         separated_chunks4],
                        ignore_index=True)
    return phrases
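
# For a description such as "experience developing models in python, sql, excel",
# the candidates would include "experience", "developing models", "python",
# "sql", and "excel" (exact output depends on the tagger).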


def get_predictions(desc, model, tokenizer, threshold=0.6, return_probabilities=False):
    """Classify candidate phrases from a job description and return the skills."""
    desc = clean(desc)

    phrases = generate_phrases(desc).tolist()
    phrases = [phrase.strip() for phrase in phrases]
    if not phrases:
        # No candidate phrases were extracted; nothing to classify.
        return pd.DataFrame(columns=['Phrase', 'Probability']) if return_probabilities else []

    inputs = tokenizer(phrases, return_tensors="pt", truncation=True, padding=True)

    # Inference only; no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)

    probs = torch.nn.functional.softmax(outputs.logits, dim=1)

    # A phrase counts as a skill when its positive-class probability
    # clears the threshold.
    predictions = (probs[:, 1] > threshold).to(torch.int32)

    out = pd.DataFrame({'Phrase': phrases,
                        'Class': predictions.tolist(),
                        'Probability': probs[:, 1].tolist()})
    skills = out.loc[out['Class'] == 1]

    if return_probabilities:
        return skills[['Phrase', 'Probability']]
    return skills['Phrase'].unique().tolist()


def get_predictions_excel(filename):
    """Extract skills for every row of a CSV whose description column is
    titled 'Job Description'; writes the result to extracted.csv."""
    df = pd.read_csv(filename)
    model, tokenizer = load_model()  # load once and reuse across rows
    df['Extracted skills'] = df['Job Description'].apply(
        lambda x: get_predictions(x, model, tokenizer))
    df.to_csv('extracted.csv')
    return df
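

if __name__ == "__main__":
    # Minimal smoke test of the pipeline on a hypothetical description;
    # any job-posting text works.
    model, tokenizer = load_model()
    sample = "Looking for someone with experience developing models in python, sql, excel"
    print(get_predictions(sample, model, tokenizer))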