Spaces:
Runtime error
Runtime error
from typing import List, Union | |
from spacy.tokens import Doc | |
from configuration.config import settings | |
from spacy.training import docs_to_json | |
import pycrfsuite | |
import srsly | |
# spaCy pipeline object supplied by the project settings; create_raw_data()
# calls it directly on raw text.
# NOTE(review): the attribute name suggests a Dutch "nl ... md" model --
# confirm against configuration.config.
nlp = settings.spacy_pretrained_model_nl_md

# Token attributes kept from each docs_to_json token dict when
# convert_spacybilou_to_crfsuitebilou() builds the CRF input tuples.
DOC2JSON_FT = ["id", "orth", "lemma", "ner"]
def get_entity_prediction(lst_token: List, pred: List) -> List:
    """Pair token offsets with their predicted entity labels.

    Only the first document of the batch is inspected (index 0 of both
    arguments). Tokens whose tag is ``'O'`` are dropped.

    Args:
        lst_token: Batch of documents. Each token is indexed at positions 1
            and 2 for its start and end (presumably the
            ``(text, start, end)`` tuples built by create_raw_data).
        pred: Batch of tag sequences aligned with ``lst_token``. For tags
            other than ``'O'`` the first two characters are stripped
            (presumably a BILOU prefix such as ``'B-'``).

    Returns:
        A list of ``(start, end, label)`` tuples, one per non-``'O'`` token.
        Empty when either batch is empty.
    """
    # An empty batch has no first document to inspect.
    if not lst_token or not pred:
        return []
    return [
        (token[1], token[2], tag[2:])
        for token, tag in zip(lst_token[0], pred[0])
        if tag != 'O'
    ]
def format_prediction(offsets: List, text: str, **kwargs) -> List:
    """Turn ``(start, end, label)`` offsets into prediction dictionaries.

    Args:
        offsets: Sequence of items indexable at 0, 1 and 2 for the start
            offset, end offset and label.
        text: The text the offsets refer to; it is sliced with each
            ``start:end`` pair to recover the entity surface form.
        **kwargs: Optional ``model_name``; when present and truthy it is
            reported as the ``"source"`` of every prediction, otherwise
            ``"crf-broker"`` is used.

    Returns:
        One dict per offset with the keys ``"text"``, ``"start"``,
        ``"end"``, ``"label"``, ``"source"`` and ``"score"`` (always 1.0).
    """
    # A missing, None or empty model name falls back to the default source.
    source = kwargs.get('model_name') or "crf-broker"
    return [
        {
            "text": text[span[0]:span[1]],
            "start": span[0],
            "end": span[1],
            "label": span[2],
            "source": source,
            "score": 1.0,
        }
        for span in offsets
    ]
def convert_spacybilou_to_crfsuitebilou(doc2json, save_path=False):
    """Flatten a spaCy ``docs_to_json`` structure into CRF token tuples.

    Each entry of ``doc2json['paragraphs']`` is treated as one document
    (e.g. one email or one attachment). All sentences of a document are
    concatenated, so the model is fed at document level, not sentence
    level. The result has the shape ``[[(), (), ...], [(), (), ...], ...]``.

    Every token dict is reduced to the attributes listed in ``DOC2JSON_FT``.
    The values keep the token dict's own key order, and an attribute missing
    from the token is simply absent from the tuple.

    Example of one input token::

        {
            "id": 10,
            "orth": "Belgium",
            "space": "",
            "tag": "SPEC|deeleigen",
            "pos": "PROPN",
            "morph": "",
            "lemma": "belgium",
            "head": -1,
            "dep": "flat",
            "ner": "O"
        }

    Args:
        doc2json: Mapping with a ``'paragraphs'`` list; each paragraph has a
            ``'sentences'`` list whose items carry a ``'tokens'`` list of
            token dicts.
        save_path: When truthy, the result is also written to this path
            with ``srsly.write_json``.

    Returns:
        A list with one list of token tuples per paragraph.
    """
    crf_docs = []
    for paragraph in doc2json['paragraphs']:
        crf_doc = []
        for sentence in paragraph['sentences']:
            crf_doc.extend(
                tuple(value for key, value in token.items() if key in DOC2JSON_FT)
                for token in sentence['tokens']
            )
        crf_docs.append(crf_doc)
    if save_path:
        srsly.write_json(save_path, crf_docs)
    return crf_docs
def create_raw_data(input_text: str) -> tuple[List, List, Doc]:
    """Run the spaCy pipeline on *input_text* and prepare the CRF inputs.

    Args:
        input_text: Raw text to analyse.

    Returns:
        A 3-tuple ``(crf_docs, token_offsets, doc)`` where

        * ``crf_docs`` is the result of
          ``convert_spacybilou_to_crfsuitebilou`` applied to the
          ``docs_to_json`` export of the parsed document,
        * ``token_offsets`` is a single-element list holding one
          ``(text, start_char, end_char)`` tuple per token,
        * ``doc`` is the object returned by ``nlp(input_text)``.
    """
    doc = nlp(input_text)
    # The end offset is derived from the token text length.
    token_offsets = [
        (token.text, token.idx, token.idx + len(token.text)) for token in doc
    ]
    crf_docs = convert_spacybilou_to_crfsuitebilou(doc2json=docs_to_json(doc))
    # Offsets are wrapped in a list so they form a one-document batch.
    return crf_docs, [token_offsets], doc
def token_feature_engineering(raw_data: List, tokenfeatures) -> List:
    """Turn every raw token sequence into a feature sequence.

    Args:
        raw_data: Iterable of token sequences.
        tokenfeatures: Object exposing ``sent2features(sequence)``.

    Returns:
        A list holding the ``sent2features`` result for each sequence, in
        input order.
    """
    return [tokenfeatures.sent2features(sequence) for sequence in raw_data]
def load_crf_model(path: str) -> pycrfsuite.Tagger:
    """Create a ``pycrfsuite.Tagger`` and open the model stored at *path*.

    Args:
        path: Location of the model file, passed to ``Tagger.open``.

    Returns:
        The tagger instance after ``open(path)`` has been called on it.
    """
    crf_tagger = pycrfsuite.Tagger()
    crf_tagger.open(path)
    return crf_tagger
def predictor(tagger, x: List) -> List:
    """Run the tagger over every feature sequence.

    Args:
        tagger: Object exposing ``tag(sequence)``, e.g. the tagger returned
            by ``load_crf_model``.
        x: Iterable of feature sequences.

    Returns:
        A list holding ``tagger.tag(sequence)`` for each sequence, in input
        order.
    """
    return [tagger.tag(sequence) for sequence in x]