sky-scribe / knowledge_extraction.py
nkasmanoff's picture
Create knowledge_extraction.py
f8aa4be
raw
history blame
2.29 kB
import spacy
from spacy.matcher import Matcher
def get_entities(sent):
## chunk 1
ent1 = ""
ent2 = ""
prv_tok_dep = "" # dependency tag of previous token in the sentence
prv_tok_text = "" # previous token in the sentence
prefix = ""
modifier = ""
#############################################################
for tok in nlp(sent):
## chunk 2
# if token is a punctuation mark then move on to the next token
if tok.dep_ != "punct":
# check: token is a compound word or not
if tok.dep_ == "compound":
prefix = tok.text
# if the previous word was also a 'compound' then add the current word to it
if prv_tok_dep == "compound":
prefix = prv_tok_text + " " + tok.text
# check: token is a modifier or not
if tok.dep_.endswith("mod") == True:
modifier = tok.text
# if the previous word was also a 'compound' then add the current word to it
if prv_tok_dep == "compound":
modifier = prv_tok_text + " " + tok.text
## chunk 3
if tok.dep_.find("subj") == True:
ent1 = modifier + " " + prefix + " " + tok.text
prefix = ""
modifier = ""
prv_tok_dep = ""
prv_tok_text = ""
## chunk 4
if tok.dep_.find("obj") == True:
ent2 = modifier + " " + prefix + " " + tok.text
## chunk 5
# update variables
prv_tok_dep = tok.dep_
prv_tok_text = tok.text
#############################################################
return [ent1.strip(), ent2.strip()]
def get_relation(sent):
nlp = spacy.load('en_core_web_sm')
doc = nlp(sent)
# Matcher class object
matcher = Matcher(nlp.vocab)
#define the pattern
pattern = [{'DEP':'ROOT'},
{'DEP':'prep','OP':"?"},
{'DEP':'agent','OP':"?"},
{'POS':'ADJ','OP':"?"}]
matcher.add('matching_pattern', patterns=[pattern])
matches = matcher(doc)
k = len(matches) - 1
span = doc[matches[k][1]:matches[k][2]]
return(span.text)