"""Inference helpers for the CRF-based entity broker.

Converts raw text to spaCy JSON, maps it to python-crfsuite token sequences,
runs the CRF tagger, and formats the predicted entities with character offsets.
"""

from typing import List, Tuple

import pycrfsuite
import srsly
from spacy.tokens import Doc
from spacy.training import docs_to_json

from configuration.config import settings

# Pretrained Dutch spaCy model, loaded via the project settings.
nlp = settings.spacy_pretrained_model_nl_md

# Token fields kept when converting spaCy's doc JSON to CRF-suite tuples.
DOC2JSON_FT = ["id", "orth", "lemma", "ner"]


def get_entity_prediction(lst_token: List, pred: List) -> List:
    """Map predicted BILOU tags back to character offsets.

    Assumes a single document: only the first token list and the first
    prediction sequence are used. Returns (start, end, label) triples for
    every token not tagged 'O'.
    """
    lst = []
    for token, pred_token in zip(lst_token[0], pred[0]):
        if pred_token != "O":
            # token = (text, start, end); strip the BILOU prefix (e.g. "B-") from the tag.
            lst.append((token[1], token[2], pred_token[2:]))
    return lst


def format_prediction(offsets: List, text: str, **kwargs) -> List:
    """Turn (start, end, label) offsets into entity dicts for the API response."""
    source = kwargs.get("model_name") or "crf-broker"
    lst = []
    for pred_token in offsets:
        lst.append({
            "text": text[pred_token[0]:pred_token[1]],
            "start": pred_token[0],
            "end": pred_token[1],
            "label": pred_token[2],
            "source": source,
            "score": 1.0,
        })
    return lst


def convert_spacybilou_to_crfsuitebilou(doc2json, save_path=None):
    """Convert spaCy BILOU JSON to the python-crfsuite format.

    The output is a list of documents, each a flat list of token tuples,
    i.e. [[(), (), ...], [(), (), ...], ...]: the model is fed at document
    level, not sentence level. Each entry in doc['sentences'] belongs to one
    document, e.g. one email or one attachment.

    Token format example:
        {
            "id": 10,
            "orth": "Belgium",
            "space": "",
            "tag": "SPEC|deeleigen",
            "pos": "PROPN",
            "morph": "",
            "lemma": "belgium",
            "head": -1,
            "dep": "flat",
            "ner": "O"
        }
    """
    lst_crf_docs = []
    for doc in doc2json["paragraphs"]:
        lst_crf_doc = []
        for sents in doc["sentences"]:
            # Keep only the fields listed in DOC2JSON_FT, in that order.
            sentence = [tuple(token[k] for k in DOC2JSON_FT) for token in sents["tokens"]]
            lst_crf_doc.extend(sentence)
        lst_crf_docs.append(lst_crf_doc)
    if save_path:
        srsly.write_json(save_path, lst_crf_docs)
    return lst_crf_docs


def create_raw_data(input_text: str) -> Tuple[List, List, Doc]:
    """Tokenise the input text and build the CRF-suite token sequences.

    Returns the converted token tuples, the per-token character offsets
    (wrapped in a single-document list), and the spaCy Doc itself.
    """
    doc = nlp(input_text)
    lst_tokens = []
    for token in doc:
        lst_tokens.append((token.text, token.idx, token.idx + len(token.text)))
    doc2json1 = docs_to_json(doc)
    lst_data1 = convert_spacybilou_to_crfsuitebilou(doc2json=doc2json1)
    return lst_data1, [lst_tokens], doc


def token_feature_engineering(raw_data: List, tokenfeatures) -> List:
    """Extract CRF features for each document with the given feature extractor."""
    X = [tokenfeatures.sent2features(s) for s in raw_data]
    return X


def load_crf_model(path: str) -> pycrfsuite.Tagger:
    """Open a trained CRF model from disk and return the tagger."""
    tagger = pycrfsuite.Tagger()
    tagger.open(path)
    return tagger


def predictor(tagger, x: List) -> List:
    """Run prediction.

    Args:
        tagger: trained pycrfsuite.Tagger.
        x: list of feature sequences, one per document.

    Returns:
        List of predicted tag sequences.
    """
    y_pred = [tagger.tag(xseq) for xseq in x]
    return y_pred
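

# ---------------------------------------------------------------------------
# Usage sketch (illustrative only): end-to-end inference with the helpers
# above. The `MinimalTokenFeatures` class and the model path below are
# hypothetical stand-ins; in the real pipeline, `tokenfeatures` is the
# project's own feature extractor exposing `sent2features`, whose features
# must match those the CRF model was trained with, and the .crfsuite path
# comes from configuration.
# ---------------------------------------------------------------------------
if __name__ == "__main__":

    class MinimalTokenFeatures:
        """Hypothetical feature extractor; the real one lives elsewhere in the repo."""

        def sent2features(self, sent: List) -> List:
            # sent is a list of (id, orth, lemma, ner) tuples per document.
            return [
                {"bias": 1.0, "word.lower()": orth.lower(), "lemma": lemma}
                for _, orth, lemma, _ in sent
            ]

    raw_data, lst_tokens, doc = create_raw_data("Jan woont in Antwerpen.")
    X = token_feature_engineering(raw_data, MinimalTokenFeatures())
    tagger = load_crf_model("models/crf_broker.crfsuite")  # hypothetical path
    y_pred = predictor(tagger, X)
    offsets = get_entity_prediction(lst_tokens, y_pred)
    print(format_prediction(offsets, doc.text, model_name="crf-broker"))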