import yaml


class Params:
    def __init__(self):
        self.graph_mode = "sequential"               # possibilities: {sequential, node-centric, labeled-edge}
        self.accumulation_steps = 1                  # number of gradient accumulation steps used to reach a larger effective batch size
        self.activation = "relu"                     # transformer (decoder) activation function, supported values: {'relu', 'gelu', 'sigmoid', 'mish'}
        self.predict_intensity = False               # additionally predict sentiment intensity
        self.batch_size = 32                         # batch size (split across multiple GPUs)
        self.beta_2 = 0.98                           # beta 2 parameter for Adam(W) optimizer
        self.blank_weight = 1.0                      # weight of cross-entropy loss for predicting an empty label
        self.char_embedding = True                   # use character embeddings in addition to the pretrained encoder
        self.char_embedding_size = 128               # dimension of the character embedding layer in the character embedding module
        self.decoder_delay_steps = 0                 # number of initial steps with frozen decoder
        self.decoder_learning_rate = 6e-4            # initial decoder learning rate
        self.decoder_weight_decay = 1.2e-6           # amount of weight decay
        self.dropout_anchor = 0.5                    # dropout at the last layer of anchor classifier
        self.dropout_edge_label = 0.5                # dropout at the last layer of edge label classifier
        self.dropout_edge_presence = 0.5             # dropout at the last layer of edge presence classifier
        self.dropout_label = 0.5                     # dropout at the last layer of label classifier
        self.dropout_transformer = 0.5               # dropout for the transformer layers (decoder)
        self.dropout_transformer_attention = 0.1     # dropout for the transformer's attention (decoder)
        self.dropout_word = 0.1                      # probability of dropping out a whole word from the encoder (in favour of char embedding)
        self.encoder = "xlm-roberta-base"            # pretrained encoder model
        self.encoder_delay_steps = 2000              # number of initial steps with frozen XLM-R
        self.encoder_freeze_embedding = True         # freeze the first embedding layer in XLM-R
        self.encoder_learning_rate = 6e-5            # initial encoder learning rate
        self.encoder_weight_decay = 1e-2             # amount of weight decay
        self.lr_decay_multiplier = 100
        self.epochs = 100                            # number of training epochs
        self.focal = True                            # use focal loss for the label prediction
        self.freeze_bert = False                     # freeze the pretrained encoder weights (no fine-tuning)
        self.group_ops = False                       # group 'opN' edge labels into one
        self.hidden_size_ff = 4 * 768                # hidden size of the transformer feed-forward submodule
        self.hidden_size_anchor = 128                # hidden size of the anchor biaffine layer
        self.hidden_size_edge_label = 256            # hidden size of the edge label biaffine layer
        self.hidden_size_edge_presence = 512         # hidden size of the edge presence biaffine layer
        self.layerwise_lr_decay = 1.0                # layerwise decay of learning rate in the encoder
        self.n_attention_heads = 8                   # number of attention heads in the decoding transformer
        self.n_layers = 3                            # number of layers in the decoder
        self.query_length = 4                        # number of queries generated for each input word
        self.pre_norm = True                         # use pre-normalized version of the transformer (as in Transformers without Tears)
        self.warmup_steps = 6000                     # number of warm-up steps for the inverse_sqrt scheduler

    def init_data_paths(self):
        directory_1 = {
            "sequential": "node_centric_mrp",
            "node-centric": "node_centric_mrp",
            "labeled-edge": "labeled_edge_mrp"
        }[self.graph_mode]
        directory_2 = {
            ("darmstadt", "en"): "darmstadt_unis",
            ("mpqa", "en"): "mpqa",
            ("multibooked", "ca"): "multibooked_ca",
            ("multibooked", "eu"): "multibooked_eu",
            ("norec", "no"): "norec",
            ("opener", "en"): "opener_en",
            ("opener", "es"): "opener_es",
        }[(self.framework, self.language)]

        self.training_data = f"{self.data_directory}/{directory_1}/{directory_2}/train.mrp"
        self.validation_data = f"{self.data_directory}/{directory_1}/{directory_2}/dev.mrp"
        self.test_data = f"{self.data_directory}/{directory_1}/{directory_2}/test.mrp"

        self.raw_training_data = f"{self.data_directory}/raw/{directory_2}/train.json"
        self.raw_validation_data = f"{self.data_directory}/raw/{directory_2}/dev.json"
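        # Example (hypothetical values): with data_directory="data",
        # graph_mode="labeled-edge", framework="opener" and language="en",
        # the paths above resolve to e.g.
        #   data/labeled_edge_mrp/opener_en/train.mrp  (training_data)
        #   data/labeled_edge_mrp/opener_en/dev.mrp    (validation_data)
        #   data/raw/opener_en/train.json              (raw_training_data)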

        return self

    def load_state_dict(self, d):
        for k, v in d.items():
            setattr(self, k, v)
        return self

    def state_dict(self):
        members = [attr for attr in dir(self) if not callable(getattr(self, attr)) and not attr.startswith("__")]
        return {k: self.__dict__[k] for k in members}

    def load(self, args):
        with open(args.config, "r", encoding="utf-8") as f:
            params = yaml.safe_load(f)
            self.load_state_dict(params)
        self.init_data_paths()
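
    # A minimal, hypothetical YAML config consumed by load() might look like:
    #
    #   framework: opener
    #   language: en
    #   data_directory: data
    #   graph_mode: labeled-edge
    #   batch_size: 16
    #
    # Each key overwrites the matching default attribute via load_state_dict();
    # framework, language and data_directory have to be supplied by the config,
    # since __init__ does not define them but init_data_paths() uses them.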

    def save(self, path):
        with open(path, "w", encoding="utf-8") as f:
            d = self.state_dict()
            yaml.dump(d, f)
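

# --- Usage sketch (illustrative, not part of the original module) ---
# Assumes an argparse-style object whose `config` attribute points to a YAML
# file like the hypothetical one sketched above; the file names below are
# placeholders.
if __name__ == "__main__":
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("--config", default="config/base.yaml")
    args = parser.parse_args()

    params = Params()
    params.load(args)                    # merge YAML values over the defaults and resolve data paths
    params.save("resolved_params.yaml")  # dump the full state_dict() back to YAML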