|
import spacy |
|
from spacy.language import Language |
|
import regex |
|
|
|
|
|
# The decorator registers the function under this component name, so no
# separate ``Language.component(..., func=...)`` call is needed afterwards.
@Language.component("entity_punctuation_removal")
def entity_punctuation_removal(doc):
    """Drop entities whose text is a single punctuation character.

    An entity is removed when both of the following hold:

    * its text matches exactly one character of the Unicode punctuation
      class (``P``), and
    * the ``ent_iob_`` tag of its root token is ``"B"``.

    Every other entity is kept, in its original order.

    Args:
        doc: Object exposing a readable and writable ``ents`` sequence whose
            items provide ``text`` and ``root.ent_iob_``.

    Returns:
        The same ``doc`` object, with ``doc.ents`` reassigned to a tuple of
        the surviving entities.
    """
    # Compile once per call instead of looking the pattern up for every
    # entity inside the loop.
    single_punct = regex.compile(r'^\p{P}$')

    # The position of an entity in the list does not affect the decision, so
    # a single pass that keeps the non-matching entities is sufficient and
    # avoids repeatedly shifting the list with ``pop``.
    doc.ents = tuple(
        ent
        for ent in doc.ents
        if not (single_punct.match(ent.text) and ent.root.ent_iob_ == "B")
    )
    return doc