File size: 2,980 Bytes
1d6faef
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
from typing import List, Union

from spacy.tokens import Doc
from configuration.config import settings
from spacy.training import docs_to_json
import pycrfsuite

import srsly

# Shared pipeline loaded once at import time from project settings.
# NOTE(review): presumably a loaded spaCy Language object (Dutch, md) — confirm.
nlp = settings.spacy_pretrained_model_nl_md

# Token fields kept (in this order) when flattening docs_to_json output
# into CRF-suite tuples: (id, orth, lemma, ner).
DOC2JSON_FT = ["id", "orth", "lemma", "ner"]


def get_entity_prediction(lst_token: List, pred: List) -> List:
    """Pair each token's character offsets with its predicted entity label.

    Only the first element of each outer list is consumed (one document).
    Tokens tagged 'O' (outside any entity) are dropped; for the rest the
    two-character BILOU prefix (e.g. 'B-') is stripped from the label.
    """
    entities = []
    for (_, start, end), label in zip(lst_token[0], pred[0]):
        if label == 'O':
            continue
        entities.append((start, end, label[2:]))
    return entities


def format_prediction(offsets: List, text: str, **kwargs) -> List:
    """Convert (start, end, label) triples into prediction dicts.

    Each dict carries the matched text span, its offsets, the label, a
    source tag and a fixed score of 1.0. The keyword argument
    ``model_name`` (when truthy) overrides the default "crf-broker" source.
    """
    source = kwargs.get('model_name') or "crf-broker"
    return [
        {
            "text": text[start:end],
            "start": start,
            "end": end,
            "label": label,
            "source": source,
            "score": 1.0,
        }
        for start, end, label in offsets
    ]


def convert_spacybilou_to_crfsuitebilou(doc2json, save_path=False):
    """Convert spaCy's docs_to_json output into CRF-suite token tuples.

    Output shape: [[(id, orth, lemma, ner), ...], ...] — one inner list per
    document, so the model is fed at document level, not sentence level.
    Each entry in doc2json['paragraphs'] represents one document
    (e.g. one email or one attachment); token dicts look like:

        {
          "id": 10, "orth": "Belgium", "space": "", "tag": "SPEC|deeleigen",
          "pos": "PROPN", "morph": "", "lemma": "belgium", "head": -1,
          "dep": "flat", "ner": "O"
        }

    Args:
        doc2json: dict as produced by spacy.training.docs_to_json.
        save_path: optional path; when truthy the result is also written
            to disk as JSON via srsly.

    Returns:
        List of per-document lists of (id, orth, lemma, ner) tuples.
    """
    lst_crf_docs = []
    for doc in doc2json['paragraphs']:
        lst_crf_doc = []
        for sents in doc['sentences']:
            # Select fields explicitly in DOC2JSON_FT order. The previous
            # filtered-dict ``.values()`` approach depended on the token
            # dict's insertion order and would silently reorder tuple fields
            # if that order ever changed; explicit indexing is deterministic
            # and raises KeyError loudly if a field is missing.
            lst_crf_doc.extend(
                tuple(token[field] for field in DOC2JSON_FT)
                for token in sents['tokens']
            )
        lst_crf_docs.append(lst_crf_doc)
    if save_path:
        srsly.write_json(save_path, lst_crf_docs)
    return lst_crf_docs


def create_raw_data(input_text: str) -> tuple[List, List, Doc]:
    """Tokenize text with the shared spaCy pipeline and prepare CRF input.

    Args:
        input_text: raw text to tokenize.

    Returns:
        A 3-tuple of:
        - document-level token tuples for the CRF (via docs_to_json and
          convert_spacybilou_to_crfsuitebilou),
        - a single-element list holding (text, start, end) per token, used
          later to map predictions back onto character offsets,
        - the spaCy Doc itself.
    """
    doc = nlp(input_text)
    # Character offsets for mapping predictions back onto the raw text.
    lst_tokens = [(token.text, token.idx, token.idx + len(token.text))
                  for token in doc]

    doc2json1 = docs_to_json(doc)
    lst_data1 = convert_spacybilou_to_crfsuitebilou(doc2json=doc2json1)
    # Fixed defect: return annotation was Union[Doc, Doc], which is just Doc.
    return lst_data1, [lst_tokens], doc


def token_feature_engineering(raw_data: List, tokenfeatures) -> List:
    """Build CRF feature representations, one per input token sequence.

    Delegates to ``tokenfeatures.sent2features`` for each sequence.
    """
    return [tokenfeatures.sent2features(sequence) for sequence in raw_data]


def load_crf_model(path: str) -> pycrfsuite.Tagger:
    """Open a trained CRF model from disk and return the ready Tagger."""
    crf_tagger = pycrfsuite.Tagger()
    crf_tagger.open(path)
    return crf_tagger


def predictor(tagger, x: List) -> List:
    """Runs prediction.
    Args:
        tagger: CRF
        x: input data
    Returns:
        prediction: List
    """
    y_pred = []
    for xseq in x:
        y_pred.append(tagger.tag(xseq))
    return y_pred