File size: 2,761 Bytes
6680682
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
from sftp import SpanPredictor
import spacy

import sys
import dataclasses
from typing import List, Optional, Dict, Any


# Module-level singletons: both are created once, when this module is imported,
# and are passed by analyze() into make_prediction() for every sentence.
# NOTE(review): "model.mod.tar.gz" is a relative path — presumably resolved
# against the process working directory; confirm against how this is launched.
predictor = SpanPredictor.from_path("model.mod.tar.gz")
# spaCy pipeline; make_prediction() reads token text and `pos_` from its output.
nlp = spacy.load("xx_sent_ud_sm")


@dataclasses.dataclass
class FrameAnnotation:
    """Base record for one analyzed sentence: its tokens and their POS tags."""

    # Token strings of the sentence, in order.
    tokens: List[str] = dataclasses.field(default_factory=list)
    # One POS tag per token; indexed in parallel with `tokens`.
    pos: List[str] = dataclasses.field(default_factory=list)


@dataclasses.dataclass
class MultiLabelAnnotation(FrameAnnotation):
    """Sentence annotation where every token may carry several frame labels.

    All four lists (`tokens`, `pos`, `frame_list`, `lu_list`) are indexed in
    parallel by token position.
    """

    # For each token, the list of label strings attached to it (may be empty).
    frame_list: List[List[str]] = dataclasses.field(default_factory=list)
    # For each token, an optional string (or None when there is none).
    lu_list: List[Optional[str]] = dataclasses.field(default_factory=list)

    def to_txt(self):
        """Yield one space-separated text line per token.

        Each line has four columns: token, POS tag, the token's frame labels
        joined with ``|``, and the token's `lu_list` entry. An empty label
        list or a falsy `lu_list` entry is written as ``_``.

        Raises:
            IndexError: if `pos`, `frame_list` or `lu_list` is shorter than
                `tokens` (raised lazily, when the offending line is requested).
        """
        for position, token in enumerate(self.tokens):
            pos_tag = self.pos[position]
            frame_column = "|".join(self.frame_list[position]) or "_"
            lu_column = self.lu_list[position] or "_"
            yield f"{token} {pos_tag} {frame_column} {lu_column}"


# reused from "combine_predictions.py" (cloned/lome/src/spanfinder/sociolome)
def convert_to_seq_labels(sentence: List[str], structures: Dict[int, Dict[str, Any]]) -> List[List[str]]:
    """Flatten frame structures into one list of label strings per token.

    Args:
        sentence: The tokens of the sentence; only its length is used.
        structures: Maps a structure id to a dict with the keys ``"target"``
            (an inclusive ``[start, end]`` token span), ``"frame"`` (the frame
            name) and ``"roles"`` (dicts with an inclusive ``"boundary"`` span
            and a ``"label"``).

    Returns:
        A list parallel to `sentence`. Target tokens get ``T:<frame>@<id>``;
        role tokens get ``B:<frame>:<role>@<id>`` on the first token of the
        span and ``I:<frame>:<role>@<id>`` on the rest. The id is zero-padded
        to two digits.

    Raises:
        IndexError: if a span points past the end of `sentence`.
    """
    token_labels = [[] for _ in sentence]

    for frame_id, structure in structures.items():
        target = structure["target"]
        frame_name = structure["frame"]

        # Mark every token of the (inclusive) target span.
        for position in range(target[0], target[1] + 1):
            token_labels[position].append(f"T:{frame_name}@{frame_id:02}")

        for role in structure["roles"]:
            boundary = role["boundary"]
            role_name = role["label"]
            # Offset 0 is the first token of the role span -> "B", the rest -> "I".
            for offset, position in enumerate(range(boundary[0], boundary[1] + 1)):
                bio = "I" if offset else "B"
                token_labels[position].append(f"{bio}:{frame_name}:{role_name}@{frame_id:02}")

    return token_labels

def make_prediction(sentence, spacy_model, predictor):
    """Build a MultiLabelAnnotation for a single sentence.

    Args:
        sentence: Raw sentence text; it is tokenized by `spacy_model`.
        spacy_model: Callable returning a spaCy doc whose tokens expose
            ``text`` and ``pos_``.
        predictor: Object with a ``force_decode`` method. It is called once
            with only the tokens, then once per returned (span, label) pair
            with ``parent_span`` / ``parent_label`` set.

    Returns:
        A MultiLabelAnnotation with tokens, POS tags, per-token frame labels
        from convert_to_seq_labels, and a `lu_list` of all ``None``.
    """
    doc = spacy_model(sentence)
    words = [token.text for token in doc]

    # First pass: no parent given; the third return value is not used.
    target_spans, frame_labels, _ = predictor.force_decode(words)

    # Number the structures by the start position of their target span.
    ordered_targets = sorted(
        zip(target_spans, frame_labels),
        key=lambda pair: pair[0][0],
    )

    structures = {}
    for structure_id, (target, frame) in enumerate(ordered_targets):
        # Second pass: decode again with this target/frame as the parent.
        role_spans, role_labels, _ = predictor.force_decode(
            words, parent_span=target, parent_label=frame
        )

        roles = []
        for span, role_name in zip(role_spans, role_labels):
            # Entries labelled "Target" are dropped; the target is stored separately.
            if role_name != "Target":
                roles.append({"boundary": span, "label": role_name})

        structures[structure_id] = {"target": target, "frame": frame, "roles": roles}

    # NOTE(review): `pos_` is only meaningful if the loaded spaCy pipeline has a
    # component that assigns POS tags — confirm for the model in use.
    return MultiLabelAnnotation(
        tokens=words,
        pos=[token.pos_ for token in doc],
        frame_list=convert_to_seq_labels(words, structures),
        lu_list=[None] * len(words),
    )


def analyze(text):
    """Run make_prediction on every line of `text` and package the results.

    The text is split on ``"\\n"`` only, and every resulting piece (including
    empty ones) is passed to make_prediction with the module-level `nlp` and
    `predictor`.

    Returns:
        A dict with ``"result"`` set to ``"OK"`` and ``"analyses"`` holding one
        plain-dict annotation (via ``dataclasses.asdict``) per line, in order.
    """
    # Predict for all lines first, then serialize, so no conversion happens
    # unless every prediction succeeded.
    predictions = [
        make_prediction(line, nlp, predictor)
        for line in text.split("\n")
    ]
    serialized = list(map(dataclasses.asdict, predictions))
    return {"result": "OK", "analyses": serialized}