aimlnerd's picture
add
1d6faef
raw
history blame
2.98 kB
from typing import List, Union
from spacy.tokens import Doc
from configuration.config import settings
from spacy.training import docs_to_json
import pycrfsuite
import srsly
# Shared spaCy pipeline loaded from app settings; the key name suggests the
# pretrained Dutch "md" model — TODO confirm against configuration.config.
nlp = settings.spacy_pretrained_model_nl_md
# Token fields kept when flattening spaCy doc2json tokens into CRFsuite tuples.
DOC2JSON_FT = ["id", "orth", "lemma", "ner"]
def get_entity_prediction(lst_token: List, pred: List) -> List:
    """Pair character offsets with predicted entity labels.

    Walks the first document's token offsets alongside the first document's
    CRF predictions, keeps only tokens tagged as an entity (anything other
    than 'O'), and emits (start, end, label) triples with the two-character
    BILOU prefix (e.g. 'B-') stripped from the tag.
    """
    entities = []
    for (_, start, end), tag in zip(lst_token[0], pred[0]):
        if tag == 'O':
            continue
        entities.append((start, end, tag[2:]))
    return entities
def format_prediction(offsets: List, text: str, **kwargs) -> List:
    """Convert (start, end, label) offset triples into annotation dicts.

    Args:
        offsets: list of (start, end, label) tuples indexing into *text*.
        text: the original input text the offsets refer to.
        **kwargs: optional 'model_name' overriding the default source tag.

    Returns:
        List of dicts with the covered text span, character offsets, label,
        source identifier and a fixed score of 1.0 (the CRF tagger exposes
        no per-entity probability here).
    """
    # `or` keeps the original truthiness semantics: an absent, None or empty
    # 'model_name' all fall back to the default source.
    source = kwargs.get("model_name") or "crf-broker"
    return [
        {
            "text": text[start:end],
            "start": start,
            "end": end,
            "label": label,
            "source": source,
            "score": 1.0,
        }
        for start, end, label in offsets
    ]
def convert_spacybilou_to_crfsuitebilou(doc2json, save_path=False):
    """
    Convert from spacybilou to crfsuite bilou format i.e. [[(),()...],[(),()...]....]
    the model will be fed at document level not sentence level
    doc['sentences'] represents one document eg: 1 email or 1 attachment

    Args:
        doc2json: output of spacy.training.docs_to_json.
        save_path: optional path; when truthy the result is also written as JSON.

    Returns:
        One list of (id, orth, lemma, ner) tuples per document.

    format example of a single input token:
    {
        "id":10,
        "orth":"Belgium",
        "space":"",
        "tag":"SPEC|deeleigen",
        "pos":"PROPN",
        "morph":"",
        "lemma":"belgium",
        "head":-1,
        "dep":"flat",
        "ner":"O"
    }
    """
    lst_crf_docs = []
    for doc in doc2json['paragraphs']:
        lst_crf_doc = []
        for sents in doc['sentences']:
            # Select fields explicitly in DOC2JSON_FT order. The original
            # filtered dict comprehension relied on the token dict's insertion
            # order, making the tuple layout an accident of spaCy's
            # serialization rather than a stated contract.
            sentence = [tuple(token[k] for k in DOC2JSON_FT if k in token)
                        for token in sents['tokens']]
            lst_crf_doc.extend(sentence)
        lst_crf_docs.append(lst_crf_doc)
    if save_path:
        srsly.write_json(save_path, lst_crf_docs)
    return lst_crf_docs
def create_raw_data(input_text: str) -> tuple[List, List, Doc]:
    """Tokenize raw text and build CRF-ready input data.

    Runs the module-level spaCy pipeline over *input_text*, records each
    token's (text, start_char, end_char) offsets, and converts the doc to
    the CRFsuite tuple format via convert_spacybilou_to_crfsuitebilou.

    Returns:
        A 3-tuple of (crf_formatted_docs, [token_offsets], spacy_doc).
        Note: the original annotation `Union[Doc, Doc]` was redundant and
        has been collapsed to `Doc`.
    """
    doc = nlp(input_text)
    # (text, start, end) character offsets per token — used downstream to map
    # CRF label predictions back onto the original string.
    lst_tokens = [(token.text, token.idx, token.idx + len(token.text))
                  for token in doc]
    doc2json1 = docs_to_json(doc)
    lst_data1 = convert_spacybilou_to_crfsuitebilou(doc2json=doc2json1)
    return lst_data1, [lst_tokens], doc
def token_feature_engineering(raw_data: List, tokenfeatures) -> List:
    """Extract per-token features for every sentence in *raw_data*.

    Delegates to the feature extractor's ``sent2features`` for each sentence
    and returns the resulting feature sequences as a list.
    """
    return list(map(tokenfeatures.sent2features, raw_data))
def load_crf_model(path: str) -> pycrfsuite.Tagger:
    """Open a serialized CRF model from *path* and return the ready tagger."""
    crf_tagger = pycrfsuite.Tagger()
    crf_tagger.open(path)
    return crf_tagger
def predictor(tagger, x: List) -> List:
    """Runs prediction.

    Args:
        tagger: an opened CRF tagger exposing ``tag(sequence)``
        x: input data — one feature sequence per document

    Returns:
        prediction: List — one predicted label sequence per input sequence
    """
    return list(map(tagger.tag, x))