Gosse Minnema committed on
Commit
05922fb
1 Parent(s): 7717281

Initial commit

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. .gitignore +1 -0
  2. Dockerfile +5 -0
  3. README.md +5 -6
  4. config/ace/ace.jsonnet +131 -0
  5. config/ace/ft.jsonnet +51 -0
  6. config/ace/pt.jsonnet +69 -0
  7. config/ace/rt.jsonnet +89 -0
  8. config/basic/basic.jsonnet +132 -0
  9. config/basic/ft.jsonnet +51 -0
  10. config/basic/pt.jsonnet +67 -0
  11. config/basic/rt.jsonnet +87 -0
  12. config/env.jsonnet +4 -0
  13. config/fn-evalita/evalita.framenet_xlmr.jsonnet +141 -0
  14. config/fn-evalita/evalita.it_mono.jsonnet +141 -0
  15. config/fn-evalita/evalita.vanilla_xlmr.jsonnet +141 -0
  16. config/fn-evalita/evalita_plus_fn.vanilla_xlmr.freeze.jsonnet +142 -0
  17. config/fn-evalita/evalita_plus_fn.vanilla_xlmr.jsonnet +141 -0
  18. config/fn-kicktionary/kicktionary.concat_clipped.vanilla_xlmr.jsonnet +141 -0
  19. config/fn-kicktionary/kicktionary.football_xlmr.jsonnet +141 -0
  20. config/fn-kicktionary/kicktionary.framenet_xlmr.jsonnet +141 -0
  21. config/fn-kicktionary/kicktionary.vanilla_xlmr.jsonnet +141 -0
  22. config/fn-sonar/sonar-a1.framenet_xlmr.jsonnet +141 -0
  23. config/fn-sonar/sonar-a1.sonar_plus_fn.vanilla_xlmr.jsonnet +142 -0
  24. config/fn-sonar/sonar-a1.vanilla_xlmr.jsonnet +141 -0
  25. config/fn-sonar/sonar-a2.framenet_xlmr.jsonnet +141 -0
  26. config/fn-sonar/sonar-a2.sonar_plus_fn.vanilla_xlmr.jsonnet +141 -0
  27. config/fn-sonar/sonar-a2.vanilla_xlmr.jsonnet +141 -0
  28. config/fn/fn.orig.jsonnet +139 -0
  29. config/fn/fn.train-football.jsonnet +142 -0
  30. config/fn/fn.train3.jsonnet +141 -0
  31. docs/data.md +68 -0
  32. docs/mapping.md +17 -0
  33. docs/training.md +65 -0
  34. evalita_scores.txt +0 -0
  35. model.mod.tar.gz +3 -0
  36. requirements.txt +15 -0
  37. scripts/__pycache__/predict_concrete.cpython-37.pyc +0 -0
  38. scripts/__pycache__/predict_concrete.cpython-38.pyc +0 -0
  39. scripts/__pycache__/predict_concrete.cpython-39.pyc +0 -0
  40. scripts/__pycache__/predict_force.cpython-39.pyc +0 -0
  41. scripts/__pycache__/repl.cpython-39.pyc +0 -0
  42. scripts/aida_experiment/predict_aida.py +42 -0
  43. scripts/aida_experiment/read_aida.py +107 -0
  44. scripts/aida_experiment/test_mapping.py +59 -0
  45. scripts/archive/eval_tie.py +50 -0
  46. scripts/archive/frame_similarity.py +143 -0
  47. scripts/archive/kairos_mapping.py +43 -0
  48. scripts/archive/onto_test.py +34 -0
  49. scripts/archive/predict_better.py +47 -0
  50. scripts/archive/predict_kairos.py +98 -0
.gitignore ADDED
@@ -0,0 +1 @@
 
 
1
+ .venv/
Dockerfile ADDED
@@ -0,0 +1,5 @@
 
 
 
 
 
 
1
+ FROM python:3.9
2
+ WORKDIR /app
3
+ COPY . /app
4
+ RUN pip install -r requirements.txt
5
+ CMD ["python", "-m", "sociolome.lome_webserver", "0.0.0.0"]
README.md CHANGED
@@ -1,10 +1,9 @@
1
  ---
2
- title: Lome Private
3
- emoji: 😻
4
  colorFrom: yellow
5
- colorTo: purple
6
  sdk: docker
7
  pinned: false
8
- ---
9
-
10
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Sociofillmore Public
3
+ emoji: 💻
4
  colorFrom: yellow
5
+ colorTo: red
6
  sdk: docker
7
  pinned: false
8
+ app_port: 5000
9
+ ---
 
config/ace/ace.jsonnet ADDED
@@ -0,0 +1,131 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ local env = import "../env.jsonnet";
2
+
3
+ local dataset_path = env.str("DATA_PATH", "data/ace/events");
4
+ local ontology_path = "data/ace/ontology.tsv";
5
+
6
+ local debug = false;
7
+
8
+ # embedding
9
+ local label_dim = 64;
10
+ local pretrained_model = env.str("ENCODER", "roberta-large");
11
+
12
+ # module
13
+ local dropout = 0.2;
14
+ local bio_dim = 512;
15
+ local bio_layers = 2;
16
+ local span_typing_dims = [256, 256];
17
+ local event_smoothing_factor = env.json("SMOOTHING", "0.0");
18
+ local arg_smoothing_factor = env.json("SMOOTHING", "0.0");
19
+ local layer_fix = 0;
20
+
21
+ # training
22
+ local typing_loss_factor = 8.0;
23
+ local grad_acc = env.json("GRAD_ACC", "1");
24
+ local max_training_tokens = 512;
25
+ local max_inference_tokens = 1024;
26
+ local lr = env.json("LR", "1e-3");
27
+ local cuda_devices = env.json("CUDA_DEVICES", "[0]");
28
+
29
+ {
30
+ dataset_reader: {
31
+ type: "concrete",
32
+ debug: debug,
33
+ pretrained_model: pretrained_model,
34
+ ignore_label: false,
35
+ [ if debug then "max_instances" ]: 128,
36
+ event_smoothing_factor: event_smoothing_factor,
37
+ arg_smoothing_factor: arg_smoothing_factor,  # use the dedicated local declared above rather than the event one
38
+ },
39
+ train_data_path: dataset_path + "/train.tar.gz",
40
+ validation_data_path: dataset_path + "/dev.tar.gz",
41
+ test_data_path: dataset_path + "/test.tar.gz",
42
+
43
+ datasets_for_vocab_creation: ["train"],
44
+
45
+ data_loader: {
46
+ batch_sampler: {
47
+ type: "max_tokens_sampler",
48
+ max_tokens: max_training_tokens,
49
+ sorting_keys: ['tokens']
50
+ }
51
+ },
52
+
53
+ validation_data_loader: {
54
+ batch_sampler: {
55
+ type: "max_tokens_sampler",
56
+ max_tokens: max_inference_tokens,
57
+ sorting_keys: ['tokens']
58
+ }
59
+ },
60
+
61
+ model: {
62
+ type: "span",
63
+ word_embedding: {
64
+ token_embedders: {
65
+ "pieces": {
66
+ type: "pretrained_transformer",
67
+ model_name: pretrained_model,
68
+ }
69
+ },
70
+ },
71
+ span_extractor: {
72
+ type: 'combo',
73
+ sub_extractors: [
74
+ {
75
+ type: 'self_attentive',
76
+ },
77
+ {
78
+ type: 'bidirectional_endpoint',
79
+ }
80
+ ]
81
+ },
82
+ span_finder: {
83
+ type: "bio",
84
+ bio_encoder: {
85
+ type: "lstm",
86
+ hidden_size: bio_dim,
87
+ num_layers: bio_layers,
88
+ bidirectional: true,
89
+ dropout: dropout,
90
+ },
91
+ no_label: false,
92
+ },
93
+ span_typing: {
94
+ type: 'mlp',
95
+ hidden_dims: span_typing_dims,
96
+ },
97
+ metrics: [{type: "srl"}],
98
+
99
+ ontology_path: ontology_path,
100
+ typing_loss_factor: typing_loss_factor,
101
+ label_dim: label_dim,
102
+ max_decoding_spans: 128,
103
+ max_recursion_depth: 2,
104
+ debug: debug,
105
+ },
106
+
107
+ trainer: {
108
+ num_epochs: 128,
109
+ patience: null,
110
+ [if std.length(cuda_devices) == 1 then "cuda_device"]: cuda_devices[0],
111
+ validation_metric: "+arg-c_f",
112
+ num_gradient_accumulation_steps: grad_acc,
113
+ optimizer: {
114
+ type: "transformer",
115
+ base: {
116
+ type: "adam",
117
+ lr: lr,
118
+ },
119
+ embeddings_lr: 0.0,
120
+ encoder_lr: 1e-5,
121
+ pooler_lr: 1e-5,
122
+ layer_fix: layer_fix,
123
+ }
124
+ },
125
+
126
+ cuda_devices:: cuda_devices,
127
+ [if std.length(cuda_devices) > 1 then "distributed"]: {
128
+ "cuda_devices": cuda_devices
129
+ },
130
+ [if std.length(cuda_devices) == 1 then "evaluate_on_test"]: true,
131
+ }
config/ace/ft.jsonnet ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ local env = import "../env.jsonnet";
2
+ local base = import "ace.jsonnet";
3
+
4
+ local pretrained_path = env.str("PRETRAINED_PATH", "cache/ace/best");
5
+ local lr = env.json("FT_LR", "5e-5");  # string default: env.json feeds a string default to std.parseJson
6
+
7
+ # training
8
+ local cuda_devices = base.cuda_devices;
9
+
10
+ {
11
+ dataset_reader: base.dataset_reader,
12
+ train_data_path: base.train_data_path,
13
+ validation_data_path: base.validation_data_path,
14
+ test_data_path: base.test_data_path,
15
+ datasets_for_vocab_creation: ["train"],
16
+ data_loader: base.data_loader,
17
+ validation_data_loader: base.validation_data_loader,
18
+
19
+ model: {
20
+ type: "from_archive",
21
+ archive_file: pretrained_path
22
+ },
23
+ vocabulary: {
24
+ type: "from_files",
25
+ directory: pretrained_path + "/vocabulary"
26
+ },
27
+
28
+ trainer: {
29
+ num_epochs: base.trainer.num_epochs,
30
+ patience: base.trainer.patience,
31
+ [if std.length(cuda_devices) == 1 then "cuda_device"]: cuda_devices[0],
32
+ validation_metric: "+arg-c_f",
33
+ num_gradient_accumulation_steps: base.trainer.num_gradient_accumulation_steps,
34
+ optimizer: {
35
+ type: "transformer",
36
+ base: {
37
+ type: "adam",
38
+ lr: lr,
39
+ },
40
+ embeddings_lr: 0.0,
41
+ encoder_lr: 1e-5,
42
+ pooler_lr: 1e-5,
43
+ layer_fix: base.trainer.optimizer.layer_fix,
44
+ }
45
+ },
46
+
47
+ [if std.length(cuda_devices) > 1 then "distributed"]: {
48
+ "cuda_devices": cuda_devices
49
+ },
50
+ [if std.length(cuda_devices) == 1 then "evaluate_on_test"]: true
51
+ }
config/ace/pt.jsonnet ADDED
@@ -0,0 +1,69 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ local env = import "../env.jsonnet";
2
+ local base = import "ace.jsonnet";
3
+
4
+ local fn_path = "data/framenet/full/full.jsonl";
5
+ local mapping_path = "data/ace/framenet2ace/";
6
+
7
+ local debug = false;
8
+
9
+ # training
10
+ local lr = env.json("PT_LR", "5e-5");
11
+ local cuda_devices = base.cuda_devices;
12
+
13
+ # mapping
14
+ local min_weight = env.json("MIN_WEIGHT", '0.0');
15
+ local max_weight = env.json("MAX_WEIGHT", '5.0');
16
+
17
+ {
18
+ dataset_reader: {
19
+ type: "semantic_role_labeling",
20
+ debug: debug,
21
+ pretrained_model: base.dataset_reader.pretrained_model,
22
+ ignore_label: false,
23
+ [ if debug then "max_instances" ]: 128,
24
+ event_smoothing_factor: base.dataset_reader.event_smoothing_factor,
25
+ arg_smoothing_factor: base.dataset_reader.arg_smoothing_factor,
26
+ ontology_mapping_path: mapping_path + '/ontology_mapping.json',
27
+ min_weight: min_weight,
28
+ max_weight: max_weight,
29
+ },
30
+ validation_dataset_reader: base.dataset_reader,
31
+ train_data_path: fn_path,
32
+ validation_data_path: base.validation_data_path,
33
+ test_data_path: base.test_data_path,
34
+ vocabulary: {
35
+ type: "extend",
36
+ directory: mapping_path + "/vocabulary"
37
+ },
38
+
39
+ datasets_for_vocab_creation: ["train"],
40
+
41
+ data_loader: base.data_loader,
42
+ validation_data_loader: base.validation_data_loader,
43
+
44
+ model: base.model,
45
+
46
+ trainer: {
47
+ num_epochs: base.trainer.num_epochs,
48
+ patience: base.trainer.patience,
49
+ [if std.length(cuda_devices) == 1 then "cuda_device"]: cuda_devices[0],
50
+ validation_metric: "+arg-c_f",
51
+ num_gradient_accumulation_steps: base.trainer.num_gradient_accumulation_steps,
52
+ optimizer: {
53
+ type: "transformer",
54
+ base: {
55
+ type: "adam",
56
+ lr: lr,
57
+ },
58
+ embeddings_lr: 0.0,
59
+ encoder_lr: 1e-5,
60
+ pooler_lr: 1e-5,
61
+ layer_fix: base.trainer.optimizer.layer_fix,
62
+ }
63
+ },
64
+
65
+ [if std.length(cuda_devices) > 1 then "distributed"]: {
66
+ "cuda_devices": cuda_devices
67
+ },
68
+ [if std.length(cuda_devices) == 1 then "evaluate_on_test"]: true
69
+ }
config/ace/rt.jsonnet ADDED
@@ -0,0 +1,89 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ local env = import "../env.jsonnet";
2
+ local base = import "ace.jsonnet";
3
+
4
+ local dataset_path = env.str("DATA_PATH", "data/ace/events");
5
+
6
+ local debug = false;
7
+
8
+ # re-train
9
+ local pretrained_path = env.str("PRETRAINED_PATH", "cache/fn/best");
10
+ local rt_lr = env.json("RT_LR", "5e-5");  # string default: env.json feeds a string default to std.parseJson
11
+
12
+ # module
13
+ local cuda_devices = base.cuda_devices;
14
+
15
+ {
16
+ dataset_reader: base.dataset_reader,
17
+ train_data_path: base.train_data_path,
18
+ validation_data_path: base.validation_data_path,
19
+ test_data_path: base.test_data_path,
20
+
21
+ datasets_for_vocab_creation: ["train"],
22
+
23
+ data_loader: base.data_loader,
24
+ validation_data_loader: base.validation_data_loader,
25
+
26
+ model: {
27
+ type: "span",
28
+ word_embedding: {
29
+ "_pretrained": {
30
+ "archive_file": pretrained_path,
31
+ "module_path": "word_embedding",
32
+ "freeze": false,
33
+ }
34
+ },
35
+ span_extractor: {
36
+ "_pretrained": {
37
+ "archive_file": pretrained_path,
38
+ "module_path": "_span_extractor",
39
+ "freeze": false,
40
+ }
41
+ },
42
+ span_finder: {
43
+ "_pretrained": {
44
+ "archive_file": pretrained_path,
45
+ "module_path": "_span_finder",
46
+ "freeze": false,
47
+ }
48
+ },
49
+ span_typing: {
50
+ type: 'mlp',
51
+ hidden_dims: base.model.span_typing.hidden_dims,
52
+ },
53
+ metrics: [{type: "srl"}],
54
+
55
+ typing_loss_factor: base.model.typing_loss_factor,
56
+ label_dim: base.model.label_dim,
57
+ max_decoding_spans: 128,
58
+ max_recursion_depth: 2,
59
+ debug: debug,
60
+ },
61
+
62
+ trainer: {
63
+ num_epochs: base.trainer.num_epochs,
64
+ patience: base.trainer.patience,
65
+ [if std.length(cuda_devices) == 1 then "cuda_device"]: cuda_devices[0],
66
+ validation_metric: "+arg-c_f",
67
+ num_gradient_accumulation_steps: base.trainer.num_gradient_accumulation_steps,
68
+ optimizer: {
69
+ type: "transformer",
70
+ base: {
71
+ type: "adam",
72
+ lr: base.trainer.optimizer.base.lr,
73
+ },
74
+ embeddings_lr: 0.0,
75
+ encoder_lr: 1e-5,
76
+ pooler_lr: 1e-5,
77
+ layer_fix: base.trainer.optimizer.layer_fix,
78
+ parameter_groups: [
79
+ [['_span_finder.*'], {'lr': rt_lr}],
80
+ [['_span_extractor.*'], {'lr': rt_lr}],
81
+ ]
82
+ }
83
+ },
84
+
85
+ [if std.length(cuda_devices) > 1 then "distributed"]: {
86
+ "cuda_devices": cuda_devices
87
+ },
88
+ [if std.length(cuda_devices) == 1 then "evaluate_on_test"]: true
89
+ }
config/basic/basic.jsonnet ADDED
@@ -0,0 +1,132 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ local env = import "../env.jsonnet";
2
+
3
+ local dataset_path = "data/better/basic/sent/";
4
+ local ontology_path = "data/better/ontology.tsv";
5
+
6
+ local debug = false;
7
+
8
+ # reader
9
+ local pretrained_model = env.str("ENCODER", "xlm-roberta-large");
10
+
11
+ # model
12
+ local label_dim = env.json("LABEL_DIM", "64");
13
+ local dropout = env.json("DROPOUT", "0.2");
14
+ local bio_dim = env.json("BIO_DIM", "512");
15
+ local bio_layers = env.json("BIO_LAYER", "2");
16
+ local span_typing_dims = env.json("TYPING_DIMS", "[256, 256]");
17
+ local typing_loss_factor = env.json("LOSS_FACTOR", "8.0");
18
+
19
+ # loader
20
+ local max_training_tokens = 512;
21
+ local max_inference_tokens = 1024;
22
+
23
+ # training
24
+ local layer_fix = env.json("LAYER_FIX", "0");
25
+ local grad_acc = env.json("GRAD_ACC", "1");
26
+ local cuda_devices = env.json("CUDA_DEVICES", "[-1]");
27
+ local patience = env.json("PATIENCE", "null");
28
+
29
+ {
30
+ dataset_reader: {
31
+ type: "better",
32
+ eval_type: "basic",
33
+ debug: debug,
34
+ pretrained_model: pretrained_model,
35
+ ignore_label: false,
36
+ [ if debug then "max_instances" ]: 128,
37
+ },
38
+ train_data_path: dataset_path + "/basic.eng-provided-72.0pct.train-70.0pct.d.bp.json",
39
+ validation_data_path: dataset_path + "/basic.eng-provided-72.0pct.analysis-15.0pct.ref.d.bp.json",
40
+ test_data_path: dataset_path + "/basic.eng-provided-72.0pct.devtest-15.0pct.ref.d.bp.json",
41
+
42
+ datasets_for_vocab_creation: ["train"],
43
+
44
+ data_loader: {
45
+ batch_sampler: {
46
+ type: "max_tokens_sampler",
47
+ max_tokens: max_training_tokens,
48
+ sorting_keys: ['tokens']
49
+ }
50
+ },
51
+
52
+ validation_data_loader: {
53
+ batch_sampler: {
54
+ type: "max_tokens_sampler",
55
+ max_tokens: max_inference_tokens,
56
+ sorting_keys: ['tokens']
57
+ }
58
+ },
59
+
60
+ model: {
61
+ type: "span",
62
+ word_embedding: {
63
+ token_embedders: {
64
+ "pieces": {
65
+ type: "pretrained_transformer",
66
+ model_name: pretrained_model,
67
+ }
68
+ },
69
+ },
70
+ span_extractor: {
71
+ type: 'combo',
72
+ sub_extractors: [
73
+ {
74
+ type: 'self_attentive',
75
+ },
76
+ {
77
+ type: 'bidirectional_endpoint',
78
+ }
79
+ ]
80
+ },
81
+ span_finder: {
82
+ type: "bio",
83
+ bio_encoder: {
84
+ type: "lstm",
85
+ hidden_size: bio_dim,
86
+ num_layers: bio_layers,
87
+ bidirectional: true,
88
+ dropout: dropout,
89
+ },
90
+ no_label: false,
91
+ },
92
+ span_typing: {
93
+ type: 'mlp',
94
+ hidden_dims: span_typing_dims,
95
+ },
96
+ metrics: [{type: "srl"}],
97
+
98
+ typing_loss_factor: typing_loss_factor,
99
+ ontology_path: ontology_path,
100
+ label_dim: label_dim,
101
+ max_decoding_spans: 128,
102
+ max_recursion_depth: 2,
103
+ debug: debug,
104
+ },
105
+
106
+ trainer: {
107
+ num_epochs: 128,
108
+ patience: patience,
109
+ [if std.length(cuda_devices) == 1 then "cuda_device"]: cuda_devices[0],
110
+ validation_metric: "+em_f",
111
+ grad_norm: 10,
112
+ grad_clipping: 10,
113
+ num_gradient_accumulation_steps: grad_acc,
114
+ optimizer: {
115
+ type: "transformer",
116
+ base: {
117
+ type: "adam",
118
+ lr: 1e-3,
119
+ },
120
+ embeddings_lr: 0.0,
121
+ encoder_lr: 1e-5,
122
+ pooler_lr: 1e-5,
123
+ layer_fix: layer_fix,
124
+ }
125
+ },
126
+
127
+ cuda_devices:: cuda_devices,
128
+ [if std.length(cuda_devices) > 1 then "distributed"]: {
129
+ "cuda_devices": cuda_devices
130
+ },
131
+ [if std.length(cuda_devices) == 1 then "evaluate_on_test"]: true
132
+ }
config/basic/ft.jsonnet ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ local env = import "../env.jsonnet";
2
+ local base = import "basic.jsonnet";
3
+
4
+ local pretrained_path = env.str("PRETRAINED_PATH", "cache/basic/best");
5
+ local lr = env.json("FT_LR", "5e-5");  # string default: env.json feeds a string default to std.parseJson
6
+
7
+ # training
8
+ local cuda_devices = base.cuda_devices;
9
+
10
+ {
11
+ dataset_reader: base.dataset_reader,
12
+ train_data_path: base.train_data_path,
13
+ validation_data_path: base.validation_data_path,
14
+ test_data_path: base.test_data_path,
15
+ datasets_for_vocab_creation: ["train"],
16
+ data_loader: base.data_loader,
17
+ validation_data_loader: base.validation_data_loader,
18
+
19
+ model: {
20
+ type: "from_archive",
21
+ archive_file: pretrained_path
22
+ },
23
+ vocabulary: {
24
+ type: "from_files",
25
+ directory: pretrained_path + "/vocabulary"
26
+ },
27
+
28
+ trainer: {
29
+ num_epochs: base.trainer.num_epochs,
30
+ patience: base.trainer.patience,
31
+ [if std.length(cuda_devices) == 1 then "cuda_device"]: cuda_devices[0],
32
+ validation_metric: "+arg-c_f",
33
+ num_gradient_accumulation_steps: base.trainer.num_gradient_accumulation_steps,
34
+ optimizer: {
35
+ type: "transformer",
36
+ base: {
37
+ type: "adam",
38
+ lr: lr,
39
+ },
40
+ embeddings_lr: 0.0,
41
+ encoder_lr: 1e-5,
42
+ pooler_lr: 1e-5,
43
+ layer_fix: base.trainer.optimizer.layer_fix,
44
+ }
45
+ },
46
+
47
+ [if std.length(cuda_devices) > 1 then "distributed"]: {
48
+ "cuda_devices": cuda_devices
49
+ },
50
+ [if std.length(cuda_devices) == 1 then "evaluate_on_test"]: true
51
+ }
config/basic/pt.jsonnet ADDED
@@ -0,0 +1,67 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ local env = import "../env.jsonnet";
2
+ local base = import "basic.jsonnet";
3
+
4
+ local fn_path = "data/framenet/full/full.jsonl";
5
+ local mapping_path = "data/basic/framenet2better/";
6
+
7
+ local debug = false;
8
+
9
+ # training
10
+ local lr = env.json("PT_LR", "5e-5");
11
+ local cuda_devices = base.cuda_devices;
12
+
13
+ # mapping
14
+ local min_weight = env.json("MIN_WEIGHT", '0.0');
15
+ local max_weight = env.json("MAX_WEIGHT", '5.0');
16
+
17
+ {
18
+ dataset_reader: {
19
+ type: "semantic_role_labeling",
20
+ debug: debug,
21
+ pretrained_model: base.dataset_reader.pretrained_model,
22
+ ignore_label: false,
23
+ [ if debug then "max_instances" ]: 128,
24
+ ontology_mapping_path: mapping_path + '/ontology_mapping.json',
25
+ min_weight: min_weight,
26
+ max_weight: max_weight,
27
+ },
28
+ validation_dataset_reader: base.dataset_reader,
29
+ train_data_path: fn_path,
30
+ validation_data_path: base.validation_data_path,
31
+ test_data_path: base.test_data_path,
32
+ vocabulary: {
33
+ type: "extend",
34
+ directory: mapping_path + "/vocabulary"
35
+ },
36
+
37
+ datasets_for_vocab_creation: ["train"],
38
+
39
+ data_loader: base.data_loader,
40
+ validation_data_loader: base.validation_data_loader,
41
+
42
+ model: base.model,
43
+
44
+ trainer: {
45
+ num_epochs: base.trainer.num_epochs,
46
+ patience: base.trainer.patience,
47
+ [if std.length(cuda_devices) == 1 then "cuda_device"]: cuda_devices[0],
48
+ validation_metric: "+arg-c_f",
49
+ num_gradient_accumulation_steps: base.trainer.num_gradient_accumulation_steps,
50
+ optimizer: {
51
+ type: "transformer",
52
+ base: {
53
+ type: "adam",
54
+ lr: lr,
55
+ },
56
+ embeddings_lr: 0.0,
57
+ encoder_lr: 1e-5,
58
+ pooler_lr: 1e-5,
59
+ layer_fix: base.trainer.optimizer.layer_fix,
60
+ }
61
+ },
62
+
63
+ [if std.length(cuda_devices) > 1 then "distributed"]: {
64
+ "cuda_devices": cuda_devices
65
+ },
66
+ [if std.length(cuda_devices) == 1 then "evaluate_on_test"]: true
67
+ }
config/basic/rt.jsonnet ADDED
@@ -0,0 +1,87 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ local env = import "../env.jsonnet";
2
+ local base = import "basic.jsonnet";
3
+
4
+ local debug = false;
5
+
6
+ # re-train
7
+ local pretrained_path = env.str("PRETRAINED_PATH", "cache/fn/best");
8
+ local rt_lr = env.json("RT_LR", "5e-5");  # string default: env.json feeds a string default to std.parseJson
9
+
10
+ # module
11
+ local cuda_devices = base.cuda_devices;
12
+
13
+ {
14
+ dataset_reader: base.dataset_reader,
15
+ train_data_path: base.train_data_path,
16
+ validation_data_path: base.validation_data_path,
17
+ test_data_path: base.test_data_path,
18
+
19
+ datasets_for_vocab_creation: ["train"],
20
+
21
+ data_loader: base.data_loader,
22
+ validation_data_loader: base.validation_data_loader,
23
+
24
+ model: {
25
+ type: "span",
26
+ word_embedding: {
27
+ "_pretrained": {
28
+ "archive_file": pretrained_path,
29
+ "module_path": "word_embedding",
30
+ "freeze": false,
31
+ }
32
+ },
33
+ span_extractor: {
34
+ "_pretrained": {
35
+ "archive_file": pretrained_path,
36
+ "module_path": "_span_extractor",
37
+ "freeze": false,
38
+ }
39
+ },
40
+ span_finder: {
41
+ "_pretrained": {
42
+ "archive_file": pretrained_path,
43
+ "module_path": "_span_finder",
44
+ "freeze": false,
45
+ }
46
+ },
47
+ span_typing: {
48
+ type: 'mlp',
49
+ hidden_dims: base.model.span_typing.hidden_dims,
50
+ },
51
+ metrics: [{type: "srl"}],
52
+
53
+ typing_loss_factor: base.model.typing_loss_factor,
54
+ label_dim: base.model.label_dim,
55
+ max_decoding_spans: 128,
56
+ max_recursion_depth: 2,
57
+ debug: debug,
58
+ },
59
+
60
+ trainer: {
61
+ num_epochs: base.trainer.num_epochs,
62
+ patience: base.trainer.patience,
63
+ [if std.length(cuda_devices) == 1 then "cuda_device"]: cuda_devices[0],
64
+ validation_metric: "+arg-c_f",
65
+ num_gradient_accumulation_steps: base.trainer.num_gradient_accumulation_steps,
66
+ optimizer: {
67
+ type: "transformer",
68
+ base: {
69
+ type: "adam",
70
+ lr: base.trainer.optimizer.base.lr,
71
+ },
72
+ embeddings_lr: 0.0,
73
+ encoder_lr: 1e-5,
74
+ pooler_lr: 1e-5,
75
+ layer_fix: base.trainer.optimizer.layer_fix,
76
+ parameter_groups: [
77
+ [['_span_finder.*'], {'lr': rt_lr}],
78
+ [['_span_extractor.*'], {'lr': rt_lr}],
79
+ ]
80
+ }
81
+ },
82
+
83
+ [if std.length(cuda_devices) > 1 then "distributed"]: {
84
+ "cuda_devices": cuda_devices
85
+ },
86
+ [if std.length(cuda_devices) == 1 then "evaluate_on_test"]: true
87
+ }
config/env.jsonnet ADDED
@@ -0,0 +1,4 @@
 
 
 
 
 
1
+ {  # Helpers for reading settings: external variables are consulted only when ext var LOGNAME equals "tuning"; otherwise the supplied default is used.
2
+ json: function(name, default) if std.extVar("LOGNAME")=="tuning" then std.parseJson(std.extVar(name)) else if std.isString(default) then std.parseJson(default) else default,  # JSON-decoded value; a non-string default is returned as-is instead of failing in std.parseJson
3
+ str: function(name, default) if std.extVar("LOGNAME")=="tuning" then std.extVar(name) else default  # raw string value, no JSON decoding
4
+ }
config/fn-evalita/evalita.framenet_xlmr.jsonnet ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ local env = import "../env.jsonnet";
2
+
3
+ #local dataset_path = env.str("DATA_PATH", "data/framenet/full");
4
+ local dataset_path = "/home/p289731/cloned/lome/preproc/evalita_jsonl";
5
+ local ontology_path = "data/framenet/ontology.tsv";
6
+
7
+ local debug = false;
8
+
9
+ # reader
10
+ local pretrained_model = "/data/p289731/cloned/lome-models/models/xlm-roberta-framenet/";
11
+ local smoothing_factor = env.json("SMOOTHING", "0.1");
12
+
13
+ # model
14
+ local label_dim = env.json("LABEL_DIM", "64");
15
+ local dropout = env.json("DROPOUT", "0.2");
16
+ local bio_dim = env.json("BIO_DIM", "512");
17
+ local bio_layers = env.json("BIO_LAYER", "2");
18
+ local span_typing_dims = env.json("TYPING_DIMS", "[256, 256]");
19
+ local typing_loss_factor = env.json("LOSS_FACTOR", "8.0");
20
+
21
+ # loader
22
+ local exemplar_ratio = env.json("EXEMPLAR_RATIO", "0.05");
23
+ local max_training_tokens = 512;
24
+ local max_inference_tokens = 1024;
25
+
26
+ # training
27
+ local layer_fix = env.json("LAYER_FIX", "0");
28
+ local grad_acc = env.json("GRAD_ACC", "1");
29
+ #local cuda_devices = env.json("CUDA_DEVICES", "[-1]");
30
+ local cuda_devices = [0];
31
+ local patience = 32;
32
+
33
+ {
34
+ dataset_reader: {
35
+ type: "semantic_role_labeling",
36
+ debug: debug,
37
+ pretrained_model: pretrained_model,
38
+ ignore_label: false,
39
+ [ if debug then "max_instances" ]: 128,
40
+ event_smoothing_factor: smoothing_factor,
41
+ arg_smoothing_factor: smoothing_factor,
42
+ },
43
+ train_data_path: dataset_path + "/evalita_train.jsonl",
44
+ validation_data_path: dataset_path + "/evalita_dev.jsonl",
45
+ test_data_path: dataset_path + "/evalita_test.jsonl",
46
+
47
+ datasets_for_vocab_creation: ["train"],
48
+
49
+ data_loader: {
50
+ batch_sampler: {
51
+ type: "mix_sampler",
52
+ max_tokens: max_training_tokens,
53
+ sorting_keys: ['tokens'],
54
+ sampling_ratios: {
55
+ 'exemplar': 1.0,
56
+ 'full text': 0.0,
57
+ }
58
+ }
59
+ },
60
+
61
+ validation_data_loader: {
62
+ batch_sampler: {
63
+ type: "max_tokens_sampler",
64
+ max_tokens: max_inference_tokens,
65
+ sorting_keys: ['tokens']
66
+ }
67
+ },
68
+
69
+ model: {
70
+ type: "span",
71
+ word_embedding: {
72
+ token_embedders: {
73
+ "pieces": {
74
+ type: "pretrained_transformer",
75
+ model_name: pretrained_model,
76
+ }
77
+ },
78
+ },
79
+ span_extractor: {
80
+ type: 'combo',
81
+ sub_extractors: [
82
+ {
83
+ type: 'self_attentive',
84
+ },
85
+ {
86
+ type: 'bidirectional_endpoint',
87
+ }
88
+ ]
89
+ },
90
+ span_finder: {
91
+ type: "bio",
92
+ bio_encoder: {
93
+ type: "lstm",
94
+ hidden_size: bio_dim,
95
+ num_layers: bio_layers,
96
+ bidirectional: true,
97
+ dropout: dropout,
98
+ },
99
+ no_label: false,
100
+ },
101
+ span_typing: {
102
+ type: 'mlp',
103
+ hidden_dims: span_typing_dims,
104
+ },
105
+ metrics: [{type: "srl"}],
106
+
107
+ typing_loss_factor: typing_loss_factor,
108
+ ontology_path: null,
109
+ label_dim: label_dim,
110
+ max_decoding_spans: 128,
111
+ max_recursion_depth: 2,
112
+ debug: debug,
113
+ },
114
+
115
+ trainer: {
116
+ num_epochs: 128,
117
+ patience: patience,
118
+ [if std.length(cuda_devices) == 1 then "cuda_device"]: cuda_devices[0],
119
+ validation_metric: "+em_f",
120
+ grad_norm: 10,
121
+ grad_clipping: 10,
122
+ num_gradient_accumulation_steps: grad_acc,
123
+ optimizer: {
124
+ type: "transformer",
125
+ base: {
126
+ type: "adam",
127
+ lr: 1e-3,
128
+ },
129
+ embeddings_lr: 0.0,
130
+ encoder_lr: 1e-5,
131
+ pooler_lr: 1e-5,
132
+ layer_fix: layer_fix,
133
+ }
134
+ },
135
+
136
+ cuda_devices:: cuda_devices,
137
+ [if std.length(cuda_devices) > 1 then "distributed"]: {
138
+ "cuda_devices": cuda_devices
139
+ },
140
+ [if std.length(cuda_devices) == 1 then "evaluate_on_test"]: true
141
+ }
config/fn-evalita/evalita.it_mono.jsonnet ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ local env = import "../env.jsonnet";
2
+
3
+ #local dataset_path = env.str("DATA_PATH", "data/framenet/full");
4
+ local dataset_path = "/home/p289731/cloned/lome/preproc/evalita_jsonl";
5
+ local ontology_path = "data/framenet/ontology.tsv";
6
+
7
+ local debug = false;
8
+
9
+ # reader
10
+ local pretrained_model = env.str("ENCODER", "Musixmatch/umberto-commoncrawl-cased-v1");
11
+ local smoothing_factor = env.json("SMOOTHING", "0.1");
12
+
13
+ # model
14
+ local label_dim = env.json("LABEL_DIM", "64");
15
+ local dropout = env.json("DROPOUT", "0.2");
16
+ local bio_dim = env.json("BIO_DIM", "512");
17
+ local bio_layers = env.json("BIO_LAYER", "2");
18
+ local span_typing_dims = env.json("TYPING_DIMS", "[256, 256]");
19
+ local typing_loss_factor = env.json("LOSS_FACTOR", "8.0");
20
+
21
+ # loader
22
+ local exemplar_ratio = env.json("EXEMPLAR_RATIO", "0.05");
23
+ local max_training_tokens = 512;
24
+ local max_inference_tokens = 1024;
25
+
26
+ # training
27
+ local layer_fix = env.json("LAYER_FIX", "0");
28
+ local grad_acc = env.json("GRAD_ACC", "1");
29
+ #local cuda_devices = env.json("CUDA_DEVICES", "[-1]");
30
+ local cuda_devices = [0];
31
+ local patience = 32;
32
+
33
+ {
34
+ dataset_reader: {
35
+ type: "semantic_role_labeling",
36
+ debug: debug,
37
+ pretrained_model: pretrained_model,
38
+ ignore_label: false,
39
+ [ if debug then "max_instances" ]: 128,
40
+ event_smoothing_factor: smoothing_factor,
41
+ arg_smoothing_factor: smoothing_factor,
42
+ },
43
+ train_data_path: dataset_path + "/evalita_train.jsonl",
44
+ validation_data_path: dataset_path + "/evalita_dev.jsonl",
45
+ test_data_path: dataset_path + "/evalita_test.jsonl",
46
+
47
+ datasets_for_vocab_creation: ["train"],
48
+
49
+ data_loader: {
50
+ batch_sampler: {
51
+ type: "mix_sampler",
52
+ max_tokens: max_training_tokens,
53
+ sorting_keys: ['tokens'],
54
+ sampling_ratios: {
55
+ 'exemplar': 1.0,
56
+ 'full text': 0.0,
57
+ }
58
+ }
59
+ },
60
+
61
+ validation_data_loader: {
62
+ batch_sampler: {
63
+ type: "max_tokens_sampler",
64
+ max_tokens: max_inference_tokens,
65
+ sorting_keys: ['tokens']
66
+ }
67
+ },
68
+
69
+ model: {
70
+ type: "span",
71
+ word_embedding: {
72
+ token_embedders: {
73
+ "pieces": {
74
+ type: "pretrained_transformer",
75
+ model_name: pretrained_model,
76
+ }
77
+ },
78
+ },
79
+ span_extractor: {
80
+ type: 'combo',
81
+ sub_extractors: [
82
+ {
83
+ type: 'self_attentive',
84
+ },
85
+ {
86
+ type: 'bidirectional_endpoint',
87
+ }
88
+ ]
89
+ },
90
+ span_finder: {
91
+ type: "bio",
92
+ bio_encoder: {
93
+ type: "lstm",
94
+ hidden_size: bio_dim,
95
+ num_layers: bio_layers,
96
+ bidirectional: true,
97
+ dropout: dropout,
98
+ },
99
+ no_label: false,
100
+ },
101
+ span_typing: {
102
+ type: 'mlp',
103
+ hidden_dims: span_typing_dims,
104
+ },
105
+ metrics: [{type: "srl"}],
106
+
107
+ typing_loss_factor: typing_loss_factor,
108
+ ontology_path: null,
109
+ label_dim: label_dim,
110
+ max_decoding_spans: 128,
111
+ max_recursion_depth: 2,
112
+ debug: debug,
113
+ },
114
+
115
+ trainer: {
116
+ num_epochs: 128,
117
+ patience: patience,
118
+ [if std.length(cuda_devices) == 1 then "cuda_device"]: cuda_devices[0],
119
+ validation_metric: "+em_f",
120
+ grad_norm: 10,
121
+ grad_clipping: 10,
122
+ num_gradient_accumulation_steps: grad_acc,
123
+ optimizer: {
124
+ type: "transformer",
125
+ base: {
126
+ type: "adam",
127
+ lr: 1e-3,
128
+ },
129
+ embeddings_lr: 0.0,
130
+ encoder_lr: 1e-5,
131
+ pooler_lr: 1e-5,
132
+ layer_fix: layer_fix,
133
+ }
134
+ },
135
+
136
+ cuda_devices:: cuda_devices,
137
+ [if std.length(cuda_devices) > 1 then "distributed"]: {
138
+ "cuda_devices": cuda_devices
139
+ },
140
+ [if std.length(cuda_devices) == 1 then "evaluate_on_test"]: true
141
+ }
config/fn-evalita/evalita.vanilla_xlmr.jsonnet ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ local env = import "../env.jsonnet";  # provides env.str / env.json, each called below with a name and a default
2
+
3
+ #local dataset_path = env.str("DATA_PATH", "data/framenet/full");
4
+ local dataset_path = "/home/p289731/cloned/lome/preproc/evalita_jsonl";  # NOTE(review): machine-specific absolute path; the env-driven form above is commented out
5
+ local ontology_path = "data/framenet/ontology.tsv";  # NOTE(review): never referenced; the model block sets ontology_path: null
6
+
7
+ local debug = false;  # forwarded to the reader and the model; when true the reader also gets max_instances: 128
8
+
9
+ # reader: encoder name and the smoothing factor used for both event and arg smoothing
10
+ local pretrained_model = env.str("ENCODER", "xlm-roberta-large");  # used by both the dataset reader and the token embedder
11
+ local smoothing_factor = env.json("SMOOTHING", "0.1");
12
+
13
+ # model: sizes, dropout and loss weight forwarded to the span model
14
+ local label_dim = env.json("LABEL_DIM", "64");
15
+ local dropout = env.json("DROPOUT", "0.2");
16
+ local bio_dim = env.json("BIO_DIM", "512");
17
+ local bio_layers = env.json("BIO_LAYER", "2");
18
+ local span_typing_dims = env.json("TYPING_DIMS", "[256, 256]");
19
+ local typing_loss_factor = env.json("LOSS_FACTOR", "8.0");
20
+
21
+ # loader: token budgets for the batch samplers
22
+ local exemplar_ratio = env.json("EXEMPLAR_RATIO", "0.05");  # NOTE(review): never referenced; sampling_ratios below are literals
23
+ local max_training_tokens = 512;  # max_tokens of the training mix_sampler
24
+ local max_inference_tokens = 1024;  # max_tokens of the validation max_tokens_sampler
25
+
26
+ # training: values consumed by the trainer block
27
+ local layer_fix = env.json("LAYER_FIX", "0");  # forwarded to optimizer.layer_fix
28
+ local grad_acc = env.json("GRAD_ACC", "1");  # forwarded to num_gradient_accumulation_steps
29
+ #local cuda_devices = env.json("CUDA_DEVICES", "[-1]");
30
+ local cuda_devices = [0];  # one entry, so cuda_device and evaluate_on_test are emitted and distributed is not
31
+ local patience = 32;  # forwarded to trainer.patience
32
+
33
+ {  # training config: reader, data paths, loaders, model, trainer and device wiring
34
+ dataset_reader: {
35
+ type: "semantic_role_labeling",
36
+ debug: debug,
37
+ pretrained_model: pretrained_model,
38
+ ignore_label: false,
39
+ [ if debug then "max_instances" ]: 128,  # computed key: present only when debug is true
40
+ event_smoothing_factor: smoothing_factor,
41
+ arg_smoothing_factor: smoothing_factor,  # same factor as event_smoothing_factor
42
+ },
43
+ train_data_path: dataset_path + "/evalita_train.jsonl",  # all three splits live under dataset_path
44
+ validation_data_path: dataset_path + "/evalita_dev.jsonl",
45
+ test_data_path: dataset_path + "/evalita_test.jsonl",
46
+
47
+ datasets_for_vocab_creation: ["train"],
48
+
49
+ data_loader: {
50
+ batch_sampler: {
51
+ type: "mix_sampler",
52
+ max_tokens: max_training_tokens,
53
+ sorting_keys: ['tokens'],
54
+ sampling_ratios: {  # NOTE(review): literal ratios; the exemplar_ratio local is not used
55
+ 'exemplar': 1.0,
56
+ 'full text': 0.0,
57
+ }
58
+ }
59
+ },
60
+
61
+ validation_data_loader: {
62
+ batch_sampler: {
63
+ type: "max_tokens_sampler",
64
+ max_tokens: max_inference_tokens,
65
+ sorting_keys: ['tokens']
66
+ }
67
+ },
68
+
69
+ model: {
70
+ type: "span",
71
+ word_embedding: {
72
+ token_embedders: {
73
+ "pieces": {
74
+ type: "pretrained_transformer",
75
+ model_name: pretrained_model,  # same encoder name the dataset reader receives
76
+ }
77
+ },
78
+ },
79
+ span_extractor: {
80
+ type: 'combo',
81
+ sub_extractors: [
82
+ {
83
+ type: 'self_attentive',
84
+ },
85
+ {
86
+ type: 'bidirectional_endpoint',
87
+ }
88
+ ]
89
+ },
90
+ span_finder: {
91
+ type: "bio",
92
+ bio_encoder: {
93
+ type: "lstm",
94
+ hidden_size: bio_dim,
95
+ num_layers: bio_layers,
96
+ bidirectional: true,
97
+ dropout: dropout,
98
+ },
99
+ no_label: false,
100
+ },
101
+ span_typing: {
102
+ type: 'mlp',
103
+ hidden_dims: span_typing_dims,
104
+ },
105
+ metrics: [{type: "srl"}],
106
+
107
+ typing_loss_factor: typing_loss_factor,
108
+ ontology_path: null,  # NOTE(review): the ontology_path local declared at the top is not used here
109
+ label_dim: label_dim,
110
+ max_decoding_spans: 128,
111
+ max_recursion_depth: 2,
112
+ debug: debug,
113
+ },
114
+
115
+ trainer: {
116
+ num_epochs: 128,
117
+ patience: patience,
118
+ [if std.length(cuda_devices) == 1 then "cuda_device"]: cuda_devices[0],  # present only for a one-entry device list
119
+ validation_metric: "+em_f",  # presumably the leading + means higher is better (AllenNLP convention) - confirm
120
+ grad_norm: 10,
121
+ grad_clipping: 10,
122
+ num_gradient_accumulation_steps: grad_acc,
123
+ optimizer: {
124
+ type: "transformer",
125
+ base: {
126
+ type: "adam",
127
+ lr: 1e-3,
128
+ },
129
+ embeddings_lr: 0.0,  # NOTE(review): zero rate for the embeddings group; presumably leaves embeddings unchanged - confirm in the optimizer
130
+ encoder_lr: 1e-5,
131
+ pooler_lr: 1e-5,
132
+ layer_fix: layer_fix,
133
+ }
134
+ },
135
+
136
+ cuda_devices:: cuda_devices,  # double colon declares a hidden field, left out of the manifested JSON
137
+ [if std.length(cuda_devices) > 1 then "distributed"]: {  # present only for more than one device
138
+ "cuda_devices": cuda_devices
139
+ },
140
+ [if std.length(cuda_devices) == 1 then "evaluate_on_test"]: true  # present only for exactly one device
141
+ }
config/fn-evalita/evalita_plus_fn.vanilla_xlmr.freeze.jsonnet ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ local env = import "../env.jsonnet";  # provides env.str / env.json, each called below with a name and a default
2
+
3
+ #local dataset_path = env.str("DATA_PATH", "data/framenet/full");
4
+ local dataset_path = "/home/p289731/cloned/lome/preproc/evalita_jsonl";  # NOTE(review): machine-specific absolute path; the env-driven form above is commented out
5
+ local ontology_path = "data/framenet/ontology.tsv";  # NOTE(review): never referenced; the model block sets ontology_path: null
6
+
7
+ local debug = false;  # forwarded to the reader and the model; when true the reader also gets max_instances: 128
8
+
9
+ # reader: encoder name and the smoothing factor used for both event and arg smoothing
10
+ local pretrained_model = env.str("ENCODER", "xlm-roberta-large");  # used by both the dataset reader and the token embedder
11
+ local smoothing_factor = env.json("SMOOTHING", "0.1");
12
+
13
+ # model: sizes, dropout and loss weight forwarded to the span model
14
+ local label_dim = env.json("LABEL_DIM", "64");
15
+ local dropout = env.json("DROPOUT", "0.2");
16
+ local bio_dim = env.json("BIO_DIM", "512");
17
+ local bio_layers = env.json("BIO_LAYER", "2");
18
+ local span_typing_dims = env.json("TYPING_DIMS", "[256, 256]");
19
+ local typing_loss_factor = env.json("LOSS_FACTOR", "8.0");
20
+
21
+ # loader: token budgets for the batch samplers
22
+ local exemplar_ratio = env.json("EXEMPLAR_RATIO", "0.05");  # NOTE(review): never referenced; sampling_ratios below are literals
23
+ local max_training_tokens = 512;  # max_tokens of the training mix_sampler
24
+ local max_inference_tokens = 1024;  # max_tokens of the validation max_tokens_sampler
25
+
26
+ # training: values consumed by the trainer block
27
+ local layer_fix = env.json("LAYER_FIX", "0");  # forwarded to optimizer.layer_fix
28
+ local grad_acc = env.json("GRAD_ACC", "1");  # forwarded to num_gradient_accumulation_steps
29
+ #local cuda_devices = env.json("CUDA_DEVICES", "[-1]");
30
+ local cuda_devices = [0];  # one entry, so cuda_device and evaluate_on_test are emitted and distributed is not
31
+ local patience = 32;  # forwarded to trainer.patience
32
+
33
+ {  # training config: reader, data paths, loaders, model, trainer and device wiring
34
+ dataset_reader: {
35
+ type: "semantic_role_labeling",
36
+ debug: debug,
37
+ pretrained_model: pretrained_model,
38
+ ignore_label: false,
39
+ [ if debug then "max_instances" ]: 128,  # computed key: present only when debug is true
40
+ event_smoothing_factor: smoothing_factor,
41
+ arg_smoothing_factor: smoothing_factor,  # same factor as event_smoothing_factor
42
+ },
43
+ train_data_path: dataset_path + "/evalita_plus_fn_train.jsonl",  # all three splits live under dataset_path
44
+ validation_data_path: dataset_path + "/evalita_dev.jsonl",
45
+ test_data_path: dataset_path + "/evalita_test.jsonl",
46
+
47
+ datasets_for_vocab_creation: ["train"],
48
+
49
+ data_loader: {
50
+ batch_sampler: {
51
+ type: "mix_sampler",
52
+ max_tokens: max_training_tokens,
53
+ sorting_keys: ['tokens'],
54
+ sampling_ratios: {  # NOTE(review): literal ratios; the exemplar_ratio local is not used
55
+ 'exemplar': 1.0,
56
+ 'full text': 0.0,
57
+ }
58
+ }
59
+ },
60
+
61
+ validation_data_loader: {
62
+ batch_sampler: {
63
+ type: "max_tokens_sampler",
64
+ max_tokens: max_inference_tokens,
65
+ sorting_keys: ['tokens']
66
+ }
67
+ },
68
+
69
+ model: {
70
+ type: "span",
71
+ word_embedding: {
72
+ token_embedders: {
73
+ "pieces": {
74
+ type: "pretrained_transformer",
75
+ model_name: pretrained_model,  # same encoder name the dataset reader receives
76
+ train_parameters: false  # the one line that differs from evalita_plus_fn.vanilla_xlmr.jsonnet; presumably keeps the transformer weights fixed - confirm
77
+ }
78
+ },
79
+ },
80
+ span_extractor: {
81
+ type: 'combo',
82
+ sub_extractors: [
83
+ {
84
+ type: 'self_attentive',
85
+ },
86
+ {
87
+ type: 'bidirectional_endpoint',
88
+ }
89
+ ]
90
+ },
91
+ span_finder: {
92
+ type: "bio",
93
+ bio_encoder: {
94
+ type: "lstm",
95
+ hidden_size: bio_dim,
96
+ num_layers: bio_layers,
97
+ bidirectional: true,
98
+ dropout: dropout,
99
+ },
100
+ no_label: false,
101
+ },
102
+ span_typing: {
103
+ type: 'mlp',
104
+ hidden_dims: span_typing_dims,
105
+ },
106
+ metrics: [{type: "srl"}],
107
+
108
+ typing_loss_factor: typing_loss_factor,
109
+ ontology_path: null,  # NOTE(review): the ontology_path local declared at the top is not used here
110
+ label_dim: label_dim,
111
+ max_decoding_spans: 128,
112
+ max_recursion_depth: 2,
113
+ debug: debug,
114
+ },
115
+
116
+ trainer: {
117
+ num_epochs: 128,
118
+ patience: patience,
119
+ [if std.length(cuda_devices) == 1 then "cuda_device"]: cuda_devices[0],  # present only for a one-entry device list
120
+ validation_metric: "+em_f",  # presumably the leading + means higher is better (AllenNLP convention) - confirm
121
+ grad_norm: 10,
122
+ grad_clipping: 10,
123
+ num_gradient_accumulation_steps: grad_acc,
124
+ optimizer: {
125
+ type: "transformer",
126
+ base: {
127
+ type: "adam",
128
+ lr: 1e-3,
129
+ },
130
+ embeddings_lr: 0.0,  # NOTE(review): zero rate for the embeddings group; presumably leaves embeddings unchanged - confirm in the optimizer
131
+ encoder_lr: 1e-5,
132
+ pooler_lr: 1e-5,
133
+ layer_fix: layer_fix,
134
+ }
135
+ },
136
+
137
+ cuda_devices:: cuda_devices,  # double colon declares a hidden field, left out of the manifested JSON
138
+ [if std.length(cuda_devices) > 1 then "distributed"]: {  # present only for more than one device
139
+ "cuda_devices": cuda_devices
140
+ },
141
+ [if std.length(cuda_devices) == 1 then "evaluate_on_test"]: true  # present only for exactly one device
142
+ }
config/fn-evalita/evalita_plus_fn.vanilla_xlmr.jsonnet ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ local env = import "../env.jsonnet";  # provides env.str / env.json, each called below with a name and a default
2
+
3
+ #local dataset_path = env.str("DATA_PATH", "data/framenet/full");
4
+ local dataset_path = "/home/p289731/cloned/lome/preproc/evalita_jsonl";  # NOTE(review): machine-specific absolute path; the env-driven form above is commented out
5
+ local ontology_path = "data/framenet/ontology.tsv";  # NOTE(review): never referenced; the model block sets ontology_path: null
6
+
7
+ local debug = false;  # forwarded to the reader and the model; when true the reader also gets max_instances: 128
8
+
9
+ # reader: encoder name and the smoothing factor used for both event and arg smoothing
10
+ local pretrained_model = env.str("ENCODER", "xlm-roberta-large");  # used by both the dataset reader and the token embedder
11
+ local smoothing_factor = env.json("SMOOTHING", "0.1");
12
+
13
+ # model: sizes, dropout and loss weight forwarded to the span model
14
+ local label_dim = env.json("LABEL_DIM", "64");
15
+ local dropout = env.json("DROPOUT", "0.2");
16
+ local bio_dim = env.json("BIO_DIM", "512");
17
+ local bio_layers = env.json("BIO_LAYER", "2");
18
+ local span_typing_dims = env.json("TYPING_DIMS", "[256, 256]");
19
+ local typing_loss_factor = env.json("LOSS_FACTOR", "8.0");
20
+
21
+ # loader: token budgets for the batch samplers
22
+ local exemplar_ratio = env.json("EXEMPLAR_RATIO", "0.05");  # NOTE(review): never referenced; sampling_ratios below are literals
23
+ local max_training_tokens = 512;  # max_tokens of the training mix_sampler
24
+ local max_inference_tokens = 1024;  # max_tokens of the validation max_tokens_sampler
25
+
26
+ # training: values consumed by the trainer block
27
+ local layer_fix = env.json("LAYER_FIX", "0");  # forwarded to optimizer.layer_fix
28
+ local grad_acc = env.json("GRAD_ACC", "1");  # forwarded to num_gradient_accumulation_steps
29
+ #local cuda_devices = env.json("CUDA_DEVICES", "[-1]");
30
+ local cuda_devices = [0];  # one entry, so cuda_device and evaluate_on_test are emitted and distributed is not
31
+ local patience = 32;  # forwarded to trainer.patience
32
+
33
+ {  # training config: reader, data paths, loaders, model, trainer and device wiring
34
+ dataset_reader: {
35
+ type: "semantic_role_labeling",
36
+ debug: debug,
37
+ pretrained_model: pretrained_model,
38
+ ignore_label: false,
39
+ [ if debug then "max_instances" ]: 128,  # computed key: present only when debug is true
40
+ event_smoothing_factor: smoothing_factor,
41
+ arg_smoothing_factor: smoothing_factor,  # same factor as event_smoothing_factor
42
+ },
43
+ train_data_path: dataset_path + "/evalita_plus_fn_train.jsonl",  # all three splits live under dataset_path
44
+ validation_data_path: dataset_path + "/evalita_dev.jsonl",
45
+ test_data_path: dataset_path + "/evalita_test.jsonl",
46
+
47
+ datasets_for_vocab_creation: ["train"],
48
+
49
+ data_loader: {
50
+ batch_sampler: {
51
+ type: "mix_sampler",
52
+ max_tokens: max_training_tokens,
53
+ sorting_keys: ['tokens'],
54
+ sampling_ratios: {  # NOTE(review): literal ratios; the exemplar_ratio local is not used
55
+ 'exemplar': 1.0,
56
+ 'full text': 0.0,
57
+ }
58
+ }
59
+ },
60
+
61
+ validation_data_loader: {
62
+ batch_sampler: {
63
+ type: "max_tokens_sampler",
64
+ max_tokens: max_inference_tokens,
65
+ sorting_keys: ['tokens']
66
+ }
67
+ },
68
+
69
+ model: {
70
+ type: "span",
71
+ word_embedding: {
72
+ token_embedders: {
73
+ "pieces": {
74
+ type: "pretrained_transformer",
75
+ model_name: pretrained_model,  # same encoder name the dataset reader receives
76
+ }
77
+ },
78
+ },
79
+ span_extractor: {
80
+ type: 'combo',
81
+ sub_extractors: [
82
+ {
83
+ type: 'self_attentive',
84
+ },
85
+ {
86
+ type: 'bidirectional_endpoint',
87
+ }
88
+ ]
89
+ },
90
+ span_finder: {
91
+ type: "bio",
92
+ bio_encoder: {
93
+ type: "lstm",
94
+ hidden_size: bio_dim,
95
+ num_layers: bio_layers,
96
+ bidirectional: true,
97
+ dropout: dropout,
98
+ },
99
+ no_label: false,
100
+ },
101
+ span_typing: {
102
+ type: 'mlp',
103
+ hidden_dims: span_typing_dims,
104
+ },
105
+ metrics: [{type: "srl"}],
106
+
107
+ typing_loss_factor: typing_loss_factor,
108
+ ontology_path: null,  # NOTE(review): the ontology_path local declared at the top is not used here
109
+ label_dim: label_dim,
110
+ max_decoding_spans: 128,
111
+ max_recursion_depth: 2,
112
+ debug: debug,
113
+ },
114
+
115
+ trainer: {
116
+ num_epochs: 128,
117
+ patience: patience,
118
+ [if std.length(cuda_devices) == 1 then "cuda_device"]: cuda_devices[0],  # present only for a one-entry device list
119
+ validation_metric: "+em_f",  # presumably the leading + means higher is better (AllenNLP convention) - confirm
120
+ grad_norm: 10,
121
+ grad_clipping: 10,
122
+ num_gradient_accumulation_steps: grad_acc,
123
+ optimizer: {
124
+ type: "transformer",
125
+ base: {
126
+ type: "adam",
127
+ lr: 1e-3,
128
+ },
129
+ embeddings_lr: 0.0,  # NOTE(review): zero rate for the embeddings group; presumably leaves embeddings unchanged - confirm in the optimizer
130
+ encoder_lr: 1e-5,
131
+ pooler_lr: 1e-5,
132
+ layer_fix: layer_fix,
133
+ }
134
+ },
135
+
136
+ cuda_devices:: cuda_devices,  # double colon declares a hidden field, left out of the manifested JSON
137
+ [if std.length(cuda_devices) > 1 then "distributed"]: {  # present only for more than one device
138
+ "cuda_devices": cuda_devices
139
+ },
140
+ [if std.length(cuda_devices) == 1 then "evaluate_on_test"]: true  # present only for exactly one device
141
+ }
config/fn-kicktionary/kicktionary.concat_clipped.vanilla_xlmr.jsonnet ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ local env = import "../env.jsonnet";  # provides env.str / env.json, each called below with a name and a default
2
+
3
+ #local dataset_path = env.str("DATA_PATH", "data/framenet/full");
4
+ local dataset_path = "/home/p289731/cloned/lome/preproc/kicktionary_jsonl";  # NOTE(review): machine-specific absolute path; the env-driven form above is commented out
5
+ local ontology_path = "data/framenet/ontology.tsv";  # NOTE(review): never referenced; the model block sets ontology_path: null
6
+
7
+ local debug = false;  # forwarded to the reader and the model; when true the reader also gets max_instances: 128
8
+
9
+ # reader: encoder name and the smoothing factor used for both event and arg smoothing
10
+ local pretrained_model = env.str("ENCODER", "xlm-roberta-large");  # used by both the dataset reader and the token embedder
11
+ local smoothing_factor = env.json("SMOOTHING", "0.1");
12
+
13
+ # model: sizes, dropout and loss weight forwarded to the span model
14
+ local label_dim = env.json("LABEL_DIM", "64");
15
+ local dropout = env.json("DROPOUT", "0.2");
16
+ local bio_dim = env.json("BIO_DIM", "512");
17
+ local bio_layers = env.json("BIO_LAYER", "2");
18
+ local span_typing_dims = env.json("TYPING_DIMS", "[256, 256]");
19
+ local typing_loss_factor = env.json("LOSS_FACTOR", "8.0");
20
+
21
+ # loader: token budgets for the batch samplers
22
+ local exemplar_ratio = env.json("EXEMPLAR_RATIO", "0.05");  # NOTE(review): never referenced; sampling_ratios below are literals
23
+ local max_training_tokens = 512;  # max_tokens of the training mix_sampler
24
+ local max_inference_tokens = 1024;  # max_tokens of the validation max_tokens_sampler
25
+
26
+ # training: values consumed by the trainer block
27
+ local layer_fix = env.json("LAYER_FIX", "0");  # forwarded to optimizer.layer_fix
28
+ local grad_acc = env.json("GRAD_ACC", "1");  # forwarded to num_gradient_accumulation_steps
29
+ #local cuda_devices = env.json("CUDA_DEVICES", "[-1]");
30
+ local cuda_devices = [0];  # one entry, so cuda_device and evaluate_on_test are emitted and distributed is not
31
+ local patience = 32;  # forwarded to trainer.patience
32
+
33
+ {  # training config: reader, data paths, loaders, model, trainer and device wiring
34
+ dataset_reader: {
35
+ type: "semantic_role_labeling",
36
+ debug: debug,
37
+ pretrained_model: pretrained_model,
38
+ ignore_label: false,
39
+ [ if debug then "max_instances" ]: 128,  # computed key: present only when debug is true
40
+ event_smoothing_factor: smoothing_factor,
41
+ arg_smoothing_factor: smoothing_factor,  # same factor as event_smoothing_factor
42
+ },
43
+ train_data_path: dataset_path + "/kicktionary_exemplars_train.concat_clipped.jsonl",  # all three splits live under dataset_path; only the train file is the concat_clipped variant
44
+ validation_data_path: dataset_path + "/kicktionary_exemplars_dev.jsonl",
45
+ test_data_path: dataset_path + "/kicktionary_exemplars_test.jsonl",
46
+
47
+ datasets_for_vocab_creation: ["train"],
48
+
49
+ data_loader: {
50
+ batch_sampler: {
51
+ type: "mix_sampler",
52
+ max_tokens: max_training_tokens,
53
+ sorting_keys: ['tokens'],
54
+ sampling_ratios: {  # NOTE(review): literal ratios; the exemplar_ratio local is not used
55
+ 'exemplar': 1.0,
56
+ 'full text': 0.0,
57
+ }
58
+ }
59
+ },
60
+
61
+ validation_data_loader: {
62
+ batch_sampler: {
63
+ type: "max_tokens_sampler",
64
+ max_tokens: max_inference_tokens,
65
+ sorting_keys: ['tokens']
66
+ }
67
+ },
68
+
69
+ model: {
70
+ type: "span",
71
+ word_embedding: {
72
+ token_embedders: {
73
+ "pieces": {
74
+ type: "pretrained_transformer",
75
+ model_name: pretrained_model,  # same encoder name the dataset reader receives
76
+ }
77
+ },
78
+ },
79
+ span_extractor: {
80
+ type: 'combo',
81
+ sub_extractors: [
82
+ {
83
+ type: 'self_attentive',
84
+ },
85
+ {
86
+ type: 'bidirectional_endpoint',
87
+ }
88
+ ]
89
+ },
90
+ span_finder: {
91
+ type: "bio",
92
+ bio_encoder: {
93
+ type: "lstm",
94
+ hidden_size: bio_dim,
95
+ num_layers: bio_layers,
96
+ bidirectional: true,
97
+ dropout: dropout,
98
+ },
99
+ no_label: false,
100
+ },
101
+ span_typing: {
102
+ type: 'mlp',
103
+ hidden_dims: span_typing_dims,
104
+ },
105
+ metrics: [{type: "srl"}],
106
+
107
+ typing_loss_factor: typing_loss_factor,
108
+ ontology_path: null,  # NOTE(review): the ontology_path local declared at the top is not used here
109
+ label_dim: label_dim,
110
+ max_decoding_spans: 128,
111
+ max_recursion_depth: 2,
112
+ debug: debug,
113
+ },
114
+
115
+ trainer: {
116
+ num_epochs: 128,
117
+ patience: patience,
118
+ [if std.length(cuda_devices) == 1 then "cuda_device"]: cuda_devices[0],  # present only for a one-entry device list
119
+ validation_metric: "+em_f",  # presumably the leading + means higher is better (AllenNLP convention) - confirm
120
+ grad_norm: 10,
121
+ grad_clipping: 10,
122
+ num_gradient_accumulation_steps: grad_acc,
123
+ optimizer: {
124
+ type: "transformer",
125
+ base: {
126
+ type: "adam",
127
+ lr: 1e-3,
128
+ },
129
+ embeddings_lr: 0.0,  # NOTE(review): zero rate for the embeddings group; presumably leaves embeddings unchanged - confirm in the optimizer
130
+ encoder_lr: 1e-5,
131
+ pooler_lr: 1e-5,
132
+ layer_fix: layer_fix,
133
+ }
134
+ },
135
+
136
+ cuda_devices:: cuda_devices,  # double colon declares a hidden field, left out of the manifested JSON
137
+ [if std.length(cuda_devices) > 1 then "distributed"]: {  # present only for more than one device
138
+ "cuda_devices": cuda_devices
139
+ },
140
+ [if std.length(cuda_devices) == 1 then "evaluate_on_test"]: true  # present only for exactly one device
141
+ }
config/fn-kicktionary/kicktionary.football_xlmr.jsonnet ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ local env = import "../env.jsonnet";  # provides env.str / env.json, each called below with a name and a default
2
+
3
+ #local dataset_path = env.str("DATA_PATH", "data/framenet/full");
4
+ local dataset_path = "/home/p289731/cloned/lome/preproc/kicktionary_jsonl";  # NOTE(review): machine-specific absolute path; the env-driven form above is commented out
5
+ local ontology_path = "data/framenet/ontology.tsv";  # NOTE(review): never referenced; the model block sets ontology_path: null
6
+
7
+ local debug = false;  # forwarded to the reader and the model; when true the reader also gets max_instances: 128
8
+
9
+ # reader: encoder name and the smoothing factor used for both event and arg smoothing
10
+ local pretrained_model = env.str("ENCODER", "/data/p289731/cloned/lome-models/models/xlm-roberta-football/");  # NOTE(review): the default is a machine-specific local checkpoint directory
11
+ local smoothing_factor = env.json("SMOOTHING", "0.1");
12
+
13
+ # model: sizes, dropout and loss weight forwarded to the span model
14
+ local label_dim = env.json("LABEL_DIM", "64");
15
+ local dropout = env.json("DROPOUT", "0.2");
16
+ local bio_dim = env.json("BIO_DIM", "512");
17
+ local bio_layers = env.json("BIO_LAYER", "2");
18
+ local span_typing_dims = env.json("TYPING_DIMS", "[256, 256]");
19
+ local typing_loss_factor = env.json("LOSS_FACTOR", "8.0");
20
+
21
+ # loader: token budgets for the batch samplers
22
+ local exemplar_ratio = env.json("EXEMPLAR_RATIO", "0.05");  # NOTE(review): never referenced; sampling_ratios below are literals
23
+ local max_training_tokens = 512;  # max_tokens of the training mix_sampler
24
+ local max_inference_tokens = 1024;  # max_tokens of the validation max_tokens_sampler
25
+
26
+ # training: values consumed by the trainer block
27
+ local layer_fix = env.json("LAYER_FIX", "0");  # forwarded to optimizer.layer_fix
28
+ local grad_acc = env.json("GRAD_ACC", "1");  # forwarded to num_gradient_accumulation_steps
29
+ #local cuda_devices = env.json("CUDA_DEVICES", "[-1]");
30
+ local cuda_devices = [0];  # one entry, so cuda_device and evaluate_on_test are emitted and distributed is not
31
+ local patience = 32;  # forwarded to trainer.patience
32
+
33
+ {  # training config: reader, data paths, loaders, model, trainer and device wiring
34
+ dataset_reader: {
35
+ type: "semantic_role_labeling",
36
+ debug: debug,
37
+ pretrained_model: pretrained_model,
38
+ ignore_label: false,
39
+ [ if debug then "max_instances" ]: 128,  # computed key: present only when debug is true
40
+ event_smoothing_factor: smoothing_factor,
41
+ arg_smoothing_factor: smoothing_factor,  # same factor as event_smoothing_factor
42
+ },
43
+ train_data_path: dataset_path + "/kicktionary_exemplars_train.jsonl",  # all three splits live under dataset_path
44
+ validation_data_path: dataset_path + "/kicktionary_exemplars_dev.jsonl",
45
+ test_data_path: dataset_path + "/kicktionary_exemplars_test.jsonl",
46
+
47
+ datasets_for_vocab_creation: ["train"],
48
+
49
+ data_loader: {
50
+ batch_sampler: {
51
+ type: "mix_sampler",
52
+ max_tokens: max_training_tokens,
53
+ sorting_keys: ['tokens'],
54
+ sampling_ratios: {  # NOTE(review): literal ratios; the exemplar_ratio local is not used
55
+ 'exemplar': 1.0,
56
+ 'full text': 0.0,
57
+ }
58
+ }
59
+ },
60
+
61
+ validation_data_loader: {
62
+ batch_sampler: {
63
+ type: "max_tokens_sampler",
64
+ max_tokens: max_inference_tokens,
65
+ sorting_keys: ['tokens']
66
+ }
67
+ },
68
+
69
+ model: {
70
+ type: "span",
71
+ word_embedding: {
72
+ token_embedders: {
73
+ "pieces": {
74
+ type: "pretrained_transformer",
75
+ model_name: pretrained_model,  # same encoder name the dataset reader receives
76
+ }
77
+ },
78
+ },
79
+ span_extractor: {
80
+ type: 'combo',
81
+ sub_extractors: [
82
+ {
83
+ type: 'self_attentive',
84
+ },
85
+ {
86
+ type: 'bidirectional_endpoint',
87
+ }
88
+ ]
89
+ },
90
+ span_finder: {
91
+ type: "bio",
92
+ bio_encoder: {
93
+ type: "lstm",
94
+ hidden_size: bio_dim,
95
+ num_layers: bio_layers,
96
+ bidirectional: true,
97
+ dropout: dropout,
98
+ },
99
+ no_label: false,
100
+ },
101
+ span_typing: {
102
+ type: 'mlp',
103
+ hidden_dims: span_typing_dims,
104
+ },
105
+ metrics: [{type: "srl"}],
106
+
107
+ typing_loss_factor: typing_loss_factor,
108
+ ontology_path: null,  # NOTE(review): the ontology_path local declared at the top is not used here
109
+ label_dim: label_dim,
110
+ max_decoding_spans: 128,
111
+ max_recursion_depth: 2,
112
+ debug: debug,
113
+ },
114
+
115
+ trainer: {
116
+ num_epochs: 128,
117
+ patience: patience,
118
+ [if std.length(cuda_devices) == 1 then "cuda_device"]: cuda_devices[0],  # present only for a one-entry device list
119
+ validation_metric: "+em_f",  # presumably the leading + means higher is better (AllenNLP convention) - confirm
120
+ grad_norm: 10,
121
+ grad_clipping: 10,
122
+ num_gradient_accumulation_steps: grad_acc,
123
+ optimizer: {
124
+ type: "transformer",
125
+ base: {
126
+ type: "adam",
127
+ lr: 1e-3,
128
+ },
129
+ embeddings_lr: 0.0,  # NOTE(review): zero rate for the embeddings group; presumably leaves embeddings unchanged - confirm in the optimizer
130
+ encoder_lr: 1e-5,
131
+ pooler_lr: 1e-5,
132
+ layer_fix: layer_fix,
133
+ }
134
+ },
135
+
136
+ cuda_devices:: cuda_devices,  # double colon declares a hidden field, left out of the manifested JSON
137
+ [if std.length(cuda_devices) > 1 then "distributed"]: {  # present only for more than one device
138
+ "cuda_devices": cuda_devices
139
+ },
140
+ [if std.length(cuda_devices) == 1 then "evaluate_on_test"]: true  # present only for exactly one device
141
+ }
config/fn-kicktionary/kicktionary.framenet_xlmr.jsonnet ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ local env = import "../env.jsonnet";  # provides env.str / env.json, each called below with a name and a default
2
+
3
+ #local dataset_path = env.str("DATA_PATH", "data/framenet/full");
4
+ local dataset_path = "/home/p289731/cloned/lome/preproc/kicktionary_jsonl";  # NOTE(review): machine-specific absolute path; the env-driven form above is commented out
5
+ local ontology_path = "data/framenet/ontology.tsv";  # NOTE(review): never referenced; the model block sets ontology_path: null
6
+
7
+ local debug = false;  # forwarded to the reader and the model; when true the reader also gets max_instances: 128
8
+
9
+ # reader: encoder name and the smoothing factor used for both event and arg smoothing
10
+ local pretrained_model = "/data/p289731/cloned/lome-models/models/xlm-roberta-framenet/";  # NOTE(review): hard-coded local checkpoint path; kicktionary.vanilla_xlmr.jsonnet reads this via env.str("ENCODER", ...)
11
+ local smoothing_factor = env.json("SMOOTHING", "0.1");
12
+
13
+ # model: sizes, dropout and loss weight forwarded to the span model
14
+ local label_dim = env.json("LABEL_DIM", "64");
15
+ local dropout = env.json("DROPOUT", "0.2");
16
+ local bio_dim = env.json("BIO_DIM", "512");
17
+ local bio_layers = env.json("BIO_LAYER", "2");
18
+ local span_typing_dims = env.json("TYPING_DIMS", "[256, 256]");
19
+ local typing_loss_factor = env.json("LOSS_FACTOR", "8.0");
20
+
21
+ # loader: token budgets for the batch samplers
22
+ local exemplar_ratio = env.json("EXEMPLAR_RATIO", "0.05");  # NOTE(review): never referenced; sampling_ratios below are literals
23
+ local max_training_tokens = 512;  # max_tokens of the training mix_sampler
24
+ local max_inference_tokens = 1024;  # max_tokens of the validation max_tokens_sampler
25
+
26
+ # training: values consumed by the trainer block
27
+ local layer_fix = env.json("LAYER_FIX", "0");  # forwarded to optimizer.layer_fix
28
+ local grad_acc = env.json("GRAD_ACC", "1");  # forwarded to num_gradient_accumulation_steps
29
+ #local cuda_devices = env.json("CUDA_DEVICES", "[-1]");
30
+ local cuda_devices = [0];  # one entry, so cuda_device and evaluate_on_test are emitted and distributed is not
31
+ local patience = 32;  # forwarded to trainer.patience
32
+
33
+ {  # training config: reader, data paths, loaders, model, trainer and device wiring
34
+ dataset_reader: {
35
+ type: "semantic_role_labeling",
36
+ debug: debug,
37
+ pretrained_model: pretrained_model,
38
+ ignore_label: false,
39
+ [ if debug then "max_instances" ]: 128,  # computed key: present only when debug is true
40
+ event_smoothing_factor: smoothing_factor,
41
+ arg_smoothing_factor: smoothing_factor,  # same factor as event_smoothing_factor
42
+ },
43
+ train_data_path: dataset_path + "/kicktionary_exemplars_train.jsonl",  # all three splits live under dataset_path
44
+ validation_data_path: dataset_path + "/kicktionary_exemplars_dev.jsonl",
45
+ test_data_path: dataset_path + "/kicktionary_exemplars_test.jsonl",
46
+
47
+ datasets_for_vocab_creation: ["train"],
48
+
49
+ data_loader: {
50
+ batch_sampler: {
51
+ type: "mix_sampler",
52
+ max_tokens: max_training_tokens,
53
+ sorting_keys: ['tokens'],
54
+ sampling_ratios: {  # NOTE(review): literal ratios; the exemplar_ratio local is not used
55
+ 'exemplar': 1.0,
56
+ 'full text': 0.0,
57
+ }
58
+ }
59
+ },
60
+
61
+ validation_data_loader: {
62
+ batch_sampler: {
63
+ type: "max_tokens_sampler",
64
+ max_tokens: max_inference_tokens,
65
+ sorting_keys: ['tokens']
66
+ }
67
+ },
68
+
69
+ model: {
70
+ type: "span",
71
+ word_embedding: {
72
+ token_embedders: {
73
+ "pieces": {
74
+ type: "pretrained_transformer",
75
+ model_name: pretrained_model,  # same encoder name the dataset reader receives
76
+ }
77
+ },
78
+ },
79
+ span_extractor: {
80
+ type: 'combo',
81
+ sub_extractors: [
82
+ {
83
+ type: 'self_attentive',
84
+ },
85
+ {
86
+ type: 'bidirectional_endpoint',
87
+ }
88
+ ]
89
+ },
90
+ span_finder: {
91
+ type: "bio",
92
+ bio_encoder: {
93
+ type: "lstm",
94
+ hidden_size: bio_dim,
95
+ num_layers: bio_layers,
96
+ bidirectional: true,
97
+ dropout: dropout,
98
+ },
99
+ no_label: false,
100
+ },
101
+ span_typing: {
102
+ type: 'mlp',
103
+ hidden_dims: span_typing_dims,
104
+ },
105
+ metrics: [{type: "srl"}],
106
+
107
+ typing_loss_factor: typing_loss_factor,
108
+ ontology_path: null,  # NOTE(review): the ontology_path local declared at the top is not used here
109
+ label_dim: label_dim,
110
+ max_decoding_spans: 128,
111
+ max_recursion_depth: 2,
112
+ debug: debug,
113
+ },
114
+
115
+ trainer: {
116
+ num_epochs: 128,
117
+ patience: patience,
118
+ [if std.length(cuda_devices) == 1 then "cuda_device"]: cuda_devices[0],  # present only for a one-entry device list
119
+ validation_metric: "+em_f",  # presumably the leading + means higher is better (AllenNLP convention) - confirm
120
+ grad_norm: 10,
121
+ grad_clipping: 10,
122
+ num_gradient_accumulation_steps: grad_acc,
123
+ optimizer: {
124
+ type: "transformer",
125
+ base: {
126
+ type: "adam",
127
+ lr: 1e-3,
128
+ },
129
+ embeddings_lr: 0.0,  # NOTE(review): zero rate for the embeddings group; presumably leaves embeddings unchanged - confirm in the optimizer
130
+ encoder_lr: 1e-5,
131
+ pooler_lr: 1e-5,
132
+ layer_fix: layer_fix,
133
+ }
134
+ },
135
+
136
+ cuda_devices:: cuda_devices,  # double colon declares a hidden field, left out of the manifested JSON
137
+ [if std.length(cuda_devices) > 1 then "distributed"]: {  # present only for more than one device
138
+ "cuda_devices": cuda_devices
139
+ },
140
+ [if std.length(cuda_devices) == 1 then "evaluate_on_test"]: true  # present only for exactly one device
141
+ }
config/fn-kicktionary/kicktionary.vanilla_xlmr.jsonnet ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ local env = import "../env.jsonnet";  # provides env.str / env.json, each called below with a name and a default
2
+
3
+ #local dataset_path = env.str("DATA_PATH", "data/framenet/full");
4
+ local dataset_path = "/home/p289731/cloned/lome/preproc/kicktionary_jsonl";  # NOTE(review): machine-specific absolute path; the env-driven form above is commented out
5
+ local ontology_path = "data/framenet/ontology.tsv";  # NOTE(review): never referenced; the model block sets ontology_path: null
6
+
7
+ local debug = false;  # forwarded to the reader and the model; when true the reader also gets max_instances: 128
8
+
9
+ # reader: encoder name and the smoothing factor used for both event and arg smoothing
10
+ local pretrained_model = env.str("ENCODER", "xlm-roberta-large");  # used by both the dataset reader and the token embedder
11
+ local smoothing_factor = env.json("SMOOTHING", "0.1");
12
+
13
+ # model: sizes, dropout and loss weight forwarded to the span model
14
+ local label_dim = env.json("LABEL_DIM", "64");
15
+ local dropout = env.json("DROPOUT", "0.2");
16
+ local bio_dim = env.json("BIO_DIM", "512");
17
+ local bio_layers = env.json("BIO_LAYER", "2");
18
+ local span_typing_dims = env.json("TYPING_DIMS", "[256, 256]");
19
+ local typing_loss_factor = env.json("LOSS_FACTOR", "8.0");
20
+
21
+ # loader: token budgets for the batch samplers
22
+ local exemplar_ratio = env.json("EXEMPLAR_RATIO", "0.05");  # NOTE(review): never referenced; sampling_ratios below are literals
23
+ local max_training_tokens = 512;  # max_tokens of the training mix_sampler
24
+ local max_inference_tokens = 1024;  # max_tokens of the validation max_tokens_sampler
25
+
26
+ # training: values consumed by the trainer block
27
+ local layer_fix = env.json("LAYER_FIX", "0");  # forwarded to optimizer.layer_fix
28
+ local grad_acc = env.json("GRAD_ACC", "1");  # forwarded to num_gradient_accumulation_steps
29
+ #local cuda_devices = env.json("CUDA_DEVICES", "[-1]");
30
+ local cuda_devices = [0];  # one entry, so cuda_device and evaluate_on_test are emitted and distributed is not
31
+ local patience = 32;  # forwarded to trainer.patience
32
+
33
+ {  # training config: reader, data paths, loaders, model, trainer and device wiring
34
+ dataset_reader: {
35
+ type: "semantic_role_labeling",
36
+ debug: debug,
37
+ pretrained_model: pretrained_model,
38
+ ignore_label: false,
39
+ [ if debug then "max_instances" ]: 128,  # computed key: present only when debug is true
40
+ event_smoothing_factor: smoothing_factor,
41
+ arg_smoothing_factor: smoothing_factor,  # same factor as event_smoothing_factor
42
+ },
43
+ train_data_path: dataset_path + "/kicktionary_exemplars_train.jsonl",  # all three splits live under dataset_path
44
+ validation_data_path: dataset_path + "/kicktionary_exemplars_dev.jsonl",
45
+ test_data_path: dataset_path + "/kicktionary_exemplars_test.jsonl",
46
+
47
+ datasets_for_vocab_creation: ["train"],
48
+
49
+ data_loader: {
50
+ batch_sampler: {
51
+ type: "mix_sampler",
52
+ max_tokens: max_training_tokens,
53
+ sorting_keys: ['tokens'],
54
+ sampling_ratios: {  # NOTE(review): literal ratios; the exemplar_ratio local is not used
55
+ 'exemplar': 1.0,
56
+ 'full text': 0.0,
57
+ }
58
+ }
59
+ },
60
+
61
+ validation_data_loader: {
62
+ batch_sampler: {
63
+ type: "max_tokens_sampler",
64
+ max_tokens: max_inference_tokens,
65
+ sorting_keys: ['tokens']
66
+ }
67
+ },
68
+
69
+ model: {
70
+ type: "span",
71
+ word_embedding: {
72
+ token_embedders: {
73
+ "pieces": {
74
+ type: "pretrained_transformer",
75
+ model_name: pretrained_model,  # same encoder name the dataset reader receives
76
+ }
77
+ },
78
+ },
79
+ span_extractor: {
80
+ type: 'combo',
81
+ sub_extractors: [
82
+ {
83
+ type: 'self_attentive',
84
+ },
85
+ {
86
+ type: 'bidirectional_endpoint',
87
+ }
88
+ ]
89
+ },
90
+ span_finder: {
91
+ type: "bio",
92
+ bio_encoder: {
93
+ type: "lstm",
94
+ hidden_size: bio_dim,
95
+ num_layers: bio_layers,
96
+ bidirectional: true,
97
+ dropout: dropout,
98
+ },
99
+ no_label: false,
100
+ },
101
+ span_typing: {
102
+ type: 'mlp',
103
+ hidden_dims: span_typing_dims,
104
+ },
105
+ metrics: [{type: "srl"}],
106
+
107
+ typing_loss_factor: typing_loss_factor,
108
+ ontology_path: null,  # NOTE(review): the ontology_path local declared at the top is not used here
109
+ label_dim: label_dim,
110
+ max_decoding_spans: 128,
111
+ max_recursion_depth: 2,
112
+ debug: debug,
113
+ },
114
+
115
+ trainer: {
116
+ num_epochs: 128,
117
+ patience: patience,
118
+ [if std.length(cuda_devices) == 1 then "cuda_device"]: cuda_devices[0],  # present only for a one-entry device list
119
+ validation_metric: "+em_f",  # presumably the leading + means higher is better (AllenNLP convention) - confirm
120
+ grad_norm: 10,
121
+ grad_clipping: 10,
122
+ num_gradient_accumulation_steps: grad_acc,
123
+ optimizer: {
124
+ type: "transformer",
125
+ base: {
126
+ type: "adam",
127
+ lr: 1e-3,
128
+ },
129
+ embeddings_lr: 0.0,  # NOTE(review): zero rate for the embeddings group; presumably leaves embeddings unchanged - confirm in the optimizer
130
+ encoder_lr: 1e-5,
131
+ pooler_lr: 1e-5,
132
+ layer_fix: layer_fix,
133
+ }
134
+ },
135
+
136
+ cuda_devices:: cuda_devices,  # double colon declares a hidden field, left out of the manifested JSON
137
+ [if std.length(cuda_devices) > 1 then "distributed"]: {  # present only for more than one device
138
+ "cuda_devices": cuda_devices
139
+ },
140
+ [if std.length(cuda_devices) == 1 then "evaluate_on_test"]: true  # present only for exactly one device
141
+ }
config/fn-sonar/sonar-a1.framenet_xlmr.jsonnet ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ local env = import "../env.jsonnet";  # provides env.str / env.json, each called below with a name and a default
2
+
3
+ #local dataset_path = env.str("DATA_PATH", "data/framenet/full");
4
+ local dataset_path = "/home/p289731/cloned/lome/preproc/sonar_jsonl";  # NOTE(review): machine-specific absolute path; the env-driven form above is commented out
5
+ local ontology_path = "data/framenet/ontology.tsv";  # NOTE(review): never referenced; the model block sets ontology_path: null
6
+
7
+ local debug = false;  # forwarded to the reader and the model; when true the reader also gets max_instances: 128
8
+
9
+ # reader: encoder name and the smoothing factor used for both event and arg smoothing
10
+ local pretrained_model = "/data/p289731/cloned/lome-models/models/xlm-roberta-framenet/";  # NOTE(review): hard-coded local checkpoint path; sonar-a1.sonar_plus_fn.vanilla_xlmr.jsonnet reads this via env.str("ENCODER", ...)
11
+ local smoothing_factor = env.json("SMOOTHING", "0.1");
12
+
13
+ # model: sizes, dropout and loss weight forwarded to the span model
14
+ local label_dim = env.json("LABEL_DIM", "64");
15
+ local dropout = env.json("DROPOUT", "0.2");
16
+ local bio_dim = env.json("BIO_DIM", "512");
17
+ local bio_layers = env.json("BIO_LAYER", "2");
18
+ local span_typing_dims = env.json("TYPING_DIMS", "[256, 256]");
19
+ local typing_loss_factor = env.json("LOSS_FACTOR", "8.0");
20
+
21
+ # loader: token budgets for the batch samplers
22
+ local exemplar_ratio = env.json("EXEMPLAR_RATIO", "0.05");  # NOTE(review): never referenced; sampling_ratios below are literals
23
+ local max_training_tokens = 512;  # max_tokens of the training mix_sampler
24
+ local max_inference_tokens = 1024;  # max_tokens of the validation max_tokens_sampler
25
+
26
+ # training: values consumed by the trainer block
27
+ local layer_fix = env.json("LAYER_FIX", "0");  # forwarded to optimizer.layer_fix
28
+ local grad_acc = env.json("GRAD_ACC", "1");  # forwarded to num_gradient_accumulation_steps
29
+ #local cuda_devices = env.json("CUDA_DEVICES", "[-1]");
30
+ local cuda_devices = [0];  # one entry, so cuda_device and evaluate_on_test are emitted and distributed is not
31
+ local patience = 32;  # forwarded to trainer.patience
32
+
33
+ {  # training config: reader, data paths, loaders, model, trainer and device wiring
34
+ dataset_reader: {
35
+ type: "semantic_role_labeling",
36
+ debug: debug,
37
+ pretrained_model: pretrained_model,
38
+ ignore_label: false,
39
+ [ if debug then "max_instances" ]: 128,  # computed key: present only when debug is true
40
+ event_smoothing_factor: smoothing_factor,
41
+ arg_smoothing_factor: smoothing_factor,  # same factor as event_smoothing_factor
42
+ },
43
+ train_data_path: dataset_path + "/dutch-sonar-train-A1.jsonl",  # all three splits live under dataset_path
44
+ validation_data_path: dataset_path + "/dutch-sonar-dev-A1.jsonl",
45
+ test_data_path: dataset_path + "/dutch-sonar-test-A1.jsonl",
46
+
47
+ datasets_for_vocab_creation: ["train"],
48
+
49
+ data_loader: {
50
+ batch_sampler: {
51
+ type: "mix_sampler",
52
+ max_tokens: max_training_tokens,
53
+ sorting_keys: ['tokens'],
54
+ sampling_ratios: {  # NOTE(review): literal ratios; the exemplar_ratio local is not used
55
+ 'exemplar': 1.0,
56
+ 'full text': 0.0,
57
+ }
58
+ }
59
+ },
60
+
61
+ validation_data_loader: {
62
+ batch_sampler: {
63
+ type: "max_tokens_sampler",
64
+ max_tokens: max_inference_tokens,
65
+ sorting_keys: ['tokens']
66
+ }
67
+ },
68
+
69
+ model: {
70
+ type: "span",
71
+ word_embedding: {
72
+ token_embedders: {
73
+ "pieces": {
74
+ type: "pretrained_transformer",
75
+ model_name: pretrained_model,  # same encoder name the dataset reader receives
76
+ }
77
+ },
78
+ },
79
+ span_extractor: {
80
+ type: 'combo',
81
+ sub_extractors: [
82
+ {
83
+ type: 'self_attentive',
84
+ },
85
+ {
86
+ type: 'bidirectional_endpoint',
87
+ }
88
+ ]
89
+ },
90
+ span_finder: {
91
+ type: "bio",
92
+ bio_encoder: {
93
+ type: "lstm",
94
+ hidden_size: bio_dim,
95
+ num_layers: bio_layers,
96
+ bidirectional: true,
97
+ dropout: dropout,
98
+ },
99
+ no_label: false,
100
+ },
101
+ span_typing: {
102
+ type: 'mlp',
103
+ hidden_dims: span_typing_dims,
104
+ },
105
+ metrics: [{type: "srl"}],
106
+
107
+ typing_loss_factor: typing_loss_factor,
108
+ ontology_path: null,  # NOTE(review): the ontology_path local declared at the top is not used here
109
+ label_dim: label_dim,
110
+ max_decoding_spans: 128,
111
+ max_recursion_depth: 2,
112
+ debug: debug,
113
+ },
114
+
115
+ trainer: {
116
+ num_epochs: 128,
117
+ patience: patience,
118
+ [if std.length(cuda_devices) == 1 then "cuda_device"]: cuda_devices[0],  # present only for a one-entry device list
119
+ validation_metric: "+em_f",  # presumably the leading + means higher is better (AllenNLP convention) - confirm
120
+ grad_norm: 10,
121
+ grad_clipping: 10,
122
+ num_gradient_accumulation_steps: grad_acc,
123
+ optimizer: {
124
+ type: "transformer",
125
+ base: {
126
+ type: "adam",
127
+ lr: 1e-3,
128
+ },
129
+ embeddings_lr: 0.0,  # NOTE(review): zero rate for the embeddings group; presumably leaves embeddings unchanged - confirm in the optimizer
130
+ encoder_lr: 1e-5,
131
+ pooler_lr: 1e-5,
132
+ layer_fix: layer_fix,
133
+ }
134
+ },
135
+
136
+ cuda_devices:: cuda_devices,  # double colon declares a hidden field, left out of the manifested JSON
137
+ [if std.length(cuda_devices) > 1 then "distributed"]: {  # present only for more than one device
138
+ "cuda_devices": cuda_devices
139
+ },
140
+ [if std.length(cuda_devices) == 1 then "evaluate_on_test"]: true  # present only for exactly one device
141
+ }
config/fn-sonar/sonar-a1.sonar_plus_fn.vanilla_xlmr.jsonnet ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ local env = import "../env.jsonnet";
2
+
3
+ #local dataset_path = env.str("DATA_PATH", "data/framenet/full");
4
+ local dataset_path = "/home/p289731/cloned/lome/preproc/sonar_jsonl";
5
+ local ontology_path = "data/framenet/ontology.tsv";
6
+
7
+ local debug = false;
8
+
9
+ # reader
10
+ local pretrained_model = env.str("ENCODER", "xlm-roberta-large");
11
+ local smoothing_factor = env.json("SMOOTHING", "0.1");
12
+
13
+ # model
14
+ local label_dim = env.json("LABEL_DIM", "64");
15
+ local dropout = env.json("DROPOUT", "0.2");
16
+ local bio_dim = env.json("BIO_DIM", "512");
17
+ local bio_layers = env.json("BIO_LAYER", "2");
18
+ local span_typing_dims = env.json("TYPING_DIMS", "[256, 256]");
19
+ local typing_loss_factor = env.json("LOSS_FACTOR", "8.0");
20
+
21
+ # loader
22
+ local exemplar_ratio = env.json("EXEMPLAR_RATIO", "0.05");
23
+ local max_training_tokens = 512;
24
+ local max_inference_tokens = 1024;
25
+
26
+ # training
27
+ local layer_fix = env.json("LAYER_FIX", "0");
28
+ local grad_acc = env.json("GRAD_ACC", "1");
29
+ #local cuda_devices = env.json("CUDA_DEVICES", "[-1]");
30
+ local cuda_devices = [0];
31
+ local patience = 32;
32
+
33
+ {
34
+ dataset_reader: {
35
+ type: "semantic_role_labeling",
36
+ debug: debug,
37
+ pretrained_model: pretrained_model,
38
+ ignore_label: false,
39
+ [ if debug then "max_instances" ]: 128,
40
+ event_smoothing_factor: smoothing_factor,
41
+ arg_smoothing_factor: smoothing_factor,
42
+ },
43
+
44
+ train_data_path: dataset_path + "/dutch-sonar-train-A1.jsonl",
45
+ validation_data_path: dataset_path + "/dutch-sonar-dev-A1.jsonl",
46
+ test_data_path: dataset_path + "/dutch-sonar-test-A1.jsonl",
47
+
48
+ datasets_for_vocab_creation: ["train"],
49
+
50
+ data_loader: {
51
+ batch_sampler: {
52
+ type: "mix_sampler",
53
+ max_tokens: max_training_tokens,
54
+ sorting_keys: ['tokens'],
55
+ sampling_ratios: {
56
+ 'exemplar': 1.0,
57
+ 'full text': 0.0,
58
+ }
59
+ }
60
+ },
61
+
62
+ validation_data_loader: {
63
+ batch_sampler: {
64
+ type: "max_tokens_sampler",
65
+ max_tokens: max_inference_tokens,
66
+ sorting_keys: ['tokens']
67
+ }
68
+ },
69
+
70
+ model: {
71
+ type: "span",
72
+ word_embedding: {
73
+ token_embedders: {
74
+ "pieces": {
75
+ type: "pretrained_transformer",
76
+ model_name: pretrained_model,
77
+ }
78
+ },
79
+ },
80
+ span_extractor: {
81
+ type: 'combo',
82
+ sub_extractors: [
83
+ {
84
+ type: 'self_attentive',
85
+ },
86
+ {
87
+ type: 'bidirectional_endpoint',
88
+ }
89
+ ]
90
+ },
91
+ span_finder: {
92
+ type: "bio",
93
+ bio_encoder: {
94
+ type: "lstm",
95
+ hidden_size: bio_dim,
96
+ num_layers: bio_layers,
97
+ bidirectional: true,
98
+ dropout: dropout,
99
+ },
100
+ no_label: false,
101
+ },
102
+ span_typing: {
103
+ type: 'mlp',
104
+ hidden_dims: span_typing_dims,
105
+ },
106
+ metrics: [{type: "srl"}],
107
+
108
+ typing_loss_factor: typing_loss_factor,
109
+ ontology_path: null,
110
+ label_dim: label_dim,
111
+ max_decoding_spans: 128,
112
+ max_recursion_depth: 2,
113
+ debug: debug,
114
+ },
115
+
116
+ trainer: {
117
+ num_epochs: 128,
118
+ patience: patience,
119
+ [if std.length(cuda_devices) == 1 then "cuda_device"]: cuda_devices[0],
120
+ validation_metric: "+em_f",
121
+ grad_norm: 10,
122
+ grad_clipping: 10,
123
+ num_gradient_accumulation_steps: grad_acc,
124
+ optimizer: {
125
+ type: "transformer",
126
+ base: {
127
+ type: "adam",
128
+ lr: 1e-3,
129
+ },
130
+ embeddings_lr: 0.0,
131
+ encoder_lr: 1e-5,
132
+ pooler_lr: 1e-5,
133
+ layer_fix: layer_fix,
134
+ }
135
+ },
136
+
137
+ cuda_devices:: cuda_devices,
138
+ [if std.length(cuda_devices) > 1 then "distributed"]: {
139
+ "cuda_devices": cuda_devices
140
+ },
141
+ [if std.length(cuda_devices) == 1 then "evaluate_on_test"]: true
142
+ }
config/fn-sonar/sonar-a1.vanilla_xlmr.jsonnet ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ local env = import "../env.jsonnet";
2
+
3
+ #local dataset_path = env.str("DATA_PATH", "data/framenet/full");
4
+ local dataset_path = "/home/p289731/cloned/lome/preproc/sonar_jsonl";
5
+ local ontology_path = "data/framenet/ontology.tsv";
6
+
7
+ local debug = false;
8
+
9
+ # reader
10
+ local pretrained_model = env.str("ENCODER", "xlm-roberta-large");
11
+ local smoothing_factor = env.json("SMOOTHING", "0.1");
12
+
13
+ # model
14
+ local label_dim = env.json("LABEL_DIM", "64");
15
+ local dropout = env.json("DROPOUT", "0.2");
16
+ local bio_dim = env.json("BIO_DIM", "512");
17
+ local bio_layers = env.json("BIO_LAYER", "2");
18
+ local span_typing_dims = env.json("TYPING_DIMS", "[256, 256]");
19
+ local typing_loss_factor = env.json("LOSS_FACTOR", "8.0");
20
+
21
+ # loader
22
+ local exemplar_ratio = env.json("EXEMPLAR_RATIO", "0.05");
23
+ local max_training_tokens = 512;
24
+ local max_inference_tokens = 1024;
25
+
26
+ # training
27
+ local layer_fix = env.json("LAYER_FIX", "0");
28
+ local grad_acc = env.json("GRAD_ACC", "1");
29
+ #local cuda_devices = env.json("CUDA_DEVICES", "[-1]");
30
+ local cuda_devices = [0];
31
+ local patience = 32;
32
+
33
+ {
34
+ dataset_reader: {
35
+ type: "semantic_role_labeling",
36
+ debug: debug,
37
+ pretrained_model: pretrained_model,
38
+ ignore_label: false,
39
+ [ if debug then "max_instances" ]: 128,
40
+ event_smoothing_factor: smoothing_factor,
41
+ arg_smoothing_factor: smoothing_factor,
42
+ },
43
+ train_data_path: dataset_path + "/dutch-sonar-train-A1.jsonl",
44
+ validation_data_path: dataset_path + "/dutch-sonar-dev-A1.jsonl",
45
+ test_data_path: dataset_path + "/dutch-sonar-test-A1.jsonl",
46
+
47
+ datasets_for_vocab_creation: ["train"],
48
+
49
+ data_loader: {
50
+ batch_sampler: {
51
+ type: "mix_sampler",
52
+ max_tokens: max_training_tokens,
53
+ sorting_keys: ['tokens'],
54
+ sampling_ratios: {
55
+ 'exemplar': 1.0,
56
+ 'full text': 0.0,
57
+ }
58
+ }
59
+ },
60
+
61
+ validation_data_loader: {
62
+ batch_sampler: {
63
+ type: "max_tokens_sampler",
64
+ max_tokens: max_inference_tokens,
65
+ sorting_keys: ['tokens']
66
+ }
67
+ },
68
+
69
+ model: {
70
+ type: "span",
71
+ word_embedding: {
72
+ token_embedders: {
73
+ "pieces": {
74
+ type: "pretrained_transformer",
75
+ model_name: pretrained_model,
76
+ }
77
+ },
78
+ },
79
+ span_extractor: {
80
+ type: 'combo',
81
+ sub_extractors: [
82
+ {
83
+ type: 'self_attentive',
84
+ },
85
+ {
86
+ type: 'bidirectional_endpoint',
87
+ }
88
+ ]
89
+ },
90
+ span_finder: {
91
+ type: "bio",
92
+ bio_encoder: {
93
+ type: "lstm",
94
+ hidden_size: bio_dim,
95
+ num_layers: bio_layers,
96
+ bidirectional: true,
97
+ dropout: dropout,
98
+ },
99
+ no_label: false,
100
+ },
101
+ span_typing: {
102
+ type: 'mlp',
103
+ hidden_dims: span_typing_dims,
104
+ },
105
+ metrics: [{type: "srl"}],
106
+
107
+ typing_loss_factor: typing_loss_factor,
108
+ ontology_path: null,
109
+ label_dim: label_dim,
110
+ max_decoding_spans: 128,
111
+ max_recursion_depth: 2,
112
+ debug: debug,
113
+ },
114
+
115
+ trainer: {
116
+ num_epochs: 128,
117
+ patience: patience,
118
+ [if std.length(cuda_devices) == 1 then "cuda_device"]: cuda_devices[0],
119
+ validation_metric: "+em_f",
120
+ grad_norm: 10,
121
+ grad_clipping: 10,
122
+ num_gradient_accumulation_steps: grad_acc,
123
+ optimizer: {
124
+ type: "transformer",
125
+ base: {
126
+ type: "adam",
127
+ lr: 1e-3,
128
+ },
129
+ embeddings_lr: 0.0,
130
+ encoder_lr: 1e-5,
131
+ pooler_lr: 1e-5,
132
+ layer_fix: layer_fix,
133
+ }
134
+ },
135
+
136
+ cuda_devices:: cuda_devices,
137
+ [if std.length(cuda_devices) > 1 then "distributed"]: {
138
+ "cuda_devices": cuda_devices
139
+ },
140
+ [if std.length(cuda_devices) == 1 then "evaluate_on_test"]: true
141
+ }
config/fn-sonar/sonar-a2.framenet_xlmr.jsonnet ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ local env = import "../env.jsonnet";
2
+
3
+ #local dataset_path = env.str("DATA_PATH", "data/framenet/full");
4
+ local dataset_path = "/home/p289731/cloned/lome/preproc/sonar_jsonl";
5
+ local ontology_path = "data/framenet/ontology.tsv";
6
+
7
+ local debug = false;
8
+
9
+ # reader
10
+ local pretrained_model = "/data/p289731/cloned/lome-models/models/xlm-roberta-framenet/";
11
+ local smoothing_factor = env.json("SMOOTHING", "0.1");
12
+
13
+ # model
14
+ local label_dim = env.json("LABEL_DIM", "64");
15
+ local dropout = env.json("DROPOUT", "0.2");
16
+ local bio_dim = env.json("BIO_DIM", "512");
17
+ local bio_layers = env.json("BIO_LAYER", "2");
18
+ local span_typing_dims = env.json("TYPING_DIMS", "[256, 256]");
19
+ local typing_loss_factor = env.json("LOSS_FACTOR", "8.0");
20
+
21
+ # loader
22
+ local exemplar_ratio = env.json("EXEMPLAR_RATIO", "0.05");
23
+ local max_training_tokens = 512;
24
+ local max_inference_tokens = 1024;
25
+
26
+ # training
27
+ local layer_fix = env.json("LAYER_FIX", "0");
28
+ local grad_acc = env.json("GRAD_ACC", "1");
29
+ #local cuda_devices = env.json("CUDA_DEVICES", "[-1]");
30
+ local cuda_devices = [0];
31
+ local patience = 32;
32
+
33
+ {
34
+ dataset_reader: {
35
+ type: "semantic_role_labeling",
36
+ debug: debug,
37
+ pretrained_model: pretrained_model,
38
+ ignore_label: false,
39
+ [ if debug then "max_instances" ]: 128,
40
+ event_smoothing_factor: smoothing_factor,
41
+ arg_smoothing_factor: smoothing_factor,
42
+ },
43
+ train_data_path: dataset_path + "/dutch-sonar-train-A2.jsonl",
44
+ validation_data_path: dataset_path + "/dutch-sonar-dev-A2.jsonl",
45
+ test_data_path: dataset_path + "/dutch-sonar-test-A2.jsonl",
46
+
47
+ datasets_for_vocab_creation: ["train"],
48
+
49
+ data_loader: {
50
+ batch_sampler: {
51
+ type: "mix_sampler",
52
+ max_tokens: max_training_tokens,
53
+ sorting_keys: ['tokens'],
54
+ sampling_ratios: {
55
+ 'exemplar': 1.0,
56
+ 'full text': 0.0,
57
+ }
58
+ }
59
+ },
60
+
61
+ validation_data_loader: {
62
+ batch_sampler: {
63
+ type: "max_tokens_sampler",
64
+ max_tokens: max_inference_tokens,
65
+ sorting_keys: ['tokens']
66
+ }
67
+ },
68
+
69
+ model: {
70
+ type: "span",
71
+ word_embedding: {
72
+ token_embedders: {
73
+ "pieces": {
74
+ type: "pretrained_transformer",
75
+ model_name: pretrained_model,
76
+ }
77
+ },
78
+ },
79
+ span_extractor: {
80
+ type: 'combo',
81
+ sub_extractors: [
82
+ {
83
+ type: 'self_attentive',
84
+ },
85
+ {
86
+ type: 'bidirectional_endpoint',
87
+ }
88
+ ]
89
+ },
90
+ span_finder: {
91
+ type: "bio",
92
+ bio_encoder: {
93
+ type: "lstm",
94
+ hidden_size: bio_dim,
95
+ num_layers: bio_layers,
96
+ bidirectional: true,
97
+ dropout: dropout,
98
+ },
99
+ no_label: false,
100
+ },
101
+ span_typing: {
102
+ type: 'mlp',
103
+ hidden_dims: span_typing_dims,
104
+ },
105
+ metrics: [{type: "srl"}],
106
+
107
+ typing_loss_factor: typing_loss_factor,
108
+ ontology_path: null,
109
+ label_dim: label_dim,
110
+ max_decoding_spans: 128,
111
+ max_recursion_depth: 2,
112
+ debug: debug,
113
+ },
114
+
115
+ trainer: {
116
+ num_epochs: 128,
117
+ patience: patience,
118
+ [if std.length(cuda_devices) == 1 then "cuda_device"]: cuda_devices[0],
119
+ validation_metric: "+em_f",
120
+ grad_norm: 10,
121
+ grad_clipping: 10,
122
+ num_gradient_accumulation_steps: grad_acc,
123
+ optimizer: {
124
+ type: "transformer",
125
+ base: {
126
+ type: "adam",
127
+ lr: 1e-3,
128
+ },
129
+ embeddings_lr: 0.0,
130
+ encoder_lr: 1e-5,
131
+ pooler_lr: 1e-5,
132
+ layer_fix: layer_fix,
133
+ }
134
+ },
135
+
136
+ cuda_devices:: cuda_devices,
137
+ [if std.length(cuda_devices) > 1 then "distributed"]: {
138
+ "cuda_devices": cuda_devices
139
+ },
140
+ [if std.length(cuda_devices) == 1 then "evaluate_on_test"]: true
141
+ }
config/fn-sonar/sonar-a2.sonar_plus_fn.vanilla_xlmr.jsonnet ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ local env = import "../env.jsonnet";
2
+
3
+ #local dataset_path = env.str("DATA_PATH", "data/framenet/full");
4
+ local dataset_path = "/home/p289731/cloned/lome/preproc/sonar_jsonl";
5
+ local ontology_path = "data/framenet/ontology.tsv";
6
+
7
+ local debug = false;
8
+
9
+ # reader
10
+ local pretrained_model = env.str("ENCODER", "xlm-roberta-large");
11
+ local smoothing_factor = env.json("SMOOTHING", "0.1");
12
+
13
+ # model
14
+ local label_dim = env.json("LABEL_DIM", "64");
15
+ local dropout = env.json("DROPOUT", "0.2");
16
+ local bio_dim = env.json("BIO_DIM", "512");
17
+ local bio_layers = env.json("BIO_LAYER", "2");
18
+ local span_typing_dims = env.json("TYPING_DIMS", "[256, 256]");
19
+ local typing_loss_factor = env.json("LOSS_FACTOR", "8.0");
20
+
21
+ # loader
22
+ local exemplar_ratio = env.json("EXEMPLAR_RATIO", "0.05");
23
+ local max_training_tokens = 512;
24
+ local max_inference_tokens = 1024;
25
+
26
+ # training
27
+ local layer_fix = env.json("LAYER_FIX", "0");
28
+ local grad_acc = env.json("GRAD_ACC", "1");
29
+ #local cuda_devices = env.json("CUDA_DEVICES", "[-1]");
30
+ local cuda_devices = [0];
31
+ local patience = 32;
32
+
33
+ {
34
+ dataset_reader: {
35
+ type: "semantic_role_labeling",
36
+ debug: debug,
37
+ pretrained_model: pretrained_model,
38
+ ignore_label: false,
39
+ [ if debug then "max_instances" ]: 128,
40
+ event_smoothing_factor: smoothing_factor,
41
+ arg_smoothing_factor: smoothing_factor,
42
+ },
43
+ train_data_path: dataset_path + "/dutch-sonar-train-A2.jsonl",
44
+ validation_data_path: dataset_path + "/dutch-sonar-dev-A2.jsonl",
45
+ test_data_path: dataset_path + "/dutch-sonar-test-A2.jsonl",
46
+
47
+ datasets_for_vocab_creation: ["train"],
48
+
49
+ data_loader: {
50
+ batch_sampler: {
51
+ type: "mix_sampler",
52
+ max_tokens: max_training_tokens,
53
+ sorting_keys: ['tokens'],
54
+ sampling_ratios: {
55
+ 'exemplar': 1.0,
56
+ 'full text': 0.0,
57
+ }
58
+ }
59
+ },
60
+
61
+ validation_data_loader: {
62
+ batch_sampler: {
63
+ type: "max_tokens_sampler",
64
+ max_tokens: max_inference_tokens,
65
+ sorting_keys: ['tokens']
66
+ }
67
+ },
68
+
69
+ model: {
70
+ type: "span",
71
+ word_embedding: {
72
+ token_embedders: {
73
+ "pieces": {
74
+ type: "pretrained_transformer",
75
+ model_name: pretrained_model,
76
+ }
77
+ },
78
+ },
79
+ span_extractor: {
80
+ type: 'combo',
81
+ sub_extractors: [
82
+ {
83
+ type: 'self_attentive',
84
+ },
85
+ {
86
+ type: 'bidirectional_endpoint',
87
+ }
88
+ ]
89
+ },
90
+ span_finder: {
91
+ type: "bio",
92
+ bio_encoder: {
93
+ type: "lstm",
94
+ hidden_size: bio_dim,
95
+ num_layers: bio_layers,
96
+ bidirectional: true,
97
+ dropout: dropout,
98
+ },
99
+ no_label: false,
100
+ },
101
+ span_typing: {
102
+ type: 'mlp',
103
+ hidden_dims: span_typing_dims,
104
+ },
105
+ metrics: [{type: "srl"}],
106
+
107
+ typing_loss_factor: typing_loss_factor,
108
+ ontology_path: null,
109
+ label_dim: label_dim,
110
+ max_decoding_spans: 128,
111
+ max_recursion_depth: 2,
112
+ debug: debug,
113
+ },
114
+
115
+ trainer: {
116
+ num_epochs: 128,
117
+ patience: patience,
118
+ [if std.length(cuda_devices) == 1 then "cuda_device"]: cuda_devices[0],
119
+ validation_metric: "+em_f",
120
+ grad_norm: 10,
121
+ grad_clipping: 10,
122
+ num_gradient_accumulation_steps: grad_acc,
123
+ optimizer: {
124
+ type: "transformer",
125
+ base: {
126
+ type: "adam",
127
+ lr: 1e-3,
128
+ },
129
+ embeddings_lr: 0.0,
130
+ encoder_lr: 1e-5,
131
+ pooler_lr: 1e-5,
132
+ layer_fix: layer_fix,
133
+ }
134
+ },
135
+
136
+ cuda_devices:: cuda_devices,
137
+ [if std.length(cuda_devices) > 1 then "distributed"]: {
138
+ "cuda_devices": cuda_devices
139
+ },
140
+ [if std.length(cuda_devices) == 1 then "evaluate_on_test"]: true
141
+ }
config/fn-sonar/sonar-a2.vanilla_xlmr.jsonnet ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ local env = import "../env.jsonnet";
2
+
3
+ #local dataset_path = env.str("DATA_PATH", "data/framenet/full");
4
+ local dataset_path = "/home/p289731/cloned/lome/preproc/sonar_jsonl";
5
+ local ontology_path = "data/framenet/ontology.tsv";
6
+
7
+ local debug = false;
8
+
9
+ # reader
10
+ local pretrained_model = env.str("ENCODER", "xlm-roberta-large");
11
+ local smoothing_factor = env.json("SMOOTHING", "0.1");
12
+
13
+ # model
14
+ local label_dim = env.json("LABEL_DIM", "64");
15
+ local dropout = env.json("DROPOUT", "0.2");
16
+ local bio_dim = env.json("BIO_DIM", "512");
17
+ local bio_layers = env.json("BIO_LAYER", "2");
18
+ local span_typing_dims = env.json("TYPING_DIMS", "[256, 256]");
19
+ local typing_loss_factor = env.json("LOSS_FACTOR", "8.0");
20
+
21
+ # loader
22
+ local exemplar_ratio = env.json("EXEMPLAR_RATIO", "0.05");
23
+ local max_training_tokens = 512;
24
+ local max_inference_tokens = 1024;
25
+
26
+ # training
27
+ local layer_fix = env.json("LAYER_FIX", "0");
28
+ local grad_acc = env.json("GRAD_ACC", "1");
29
+ #local cuda_devices = env.json("CUDA_DEVICES", "[-1]");
30
+ local cuda_devices = [0];
31
+ local patience = 32;
32
+
33
+ {
34
+ dataset_reader: {
35
+ type: "semantic_role_labeling",
36
+ debug: debug,
37
+ pretrained_model: pretrained_model,
38
+ ignore_label: false,
39
+ [ if debug then "max_instances" ]: 128,
40
+ event_smoothing_factor: smoothing_factor,
41
+ arg_smoothing_factor: smoothing_factor,
42
+ },
43
+ train_data_path: dataset_path + "/dutch-sonar-train-A2.jsonl",
44
+ validation_data_path: dataset_path + "/dutch-sonar-dev-A2.jsonl",
45
+ test_data_path: dataset_path + "/dutch-sonar-test-A2.jsonl",
46
+
47
+ datasets_for_vocab_creation: ["train"],
48
+
49
+ data_loader: {
50
+ batch_sampler: {
51
+ type: "mix_sampler",
52
+ max_tokens: max_training_tokens,
53
+ sorting_keys: ['tokens'],
54
+ sampling_ratios: {
55
+ 'exemplar': 1.0,
56
+ 'full text': 0.0,
57
+ }
58
+ }
59
+ },
60
+
61
+ validation_data_loader: {
62
+ batch_sampler: {
63
+ type: "max_tokens_sampler",
64
+ max_tokens: max_inference_tokens,
65
+ sorting_keys: ['tokens']
66
+ }
67
+ },
68
+
69
+ model: {
70
+ type: "span",
71
+ word_embedding: {
72
+ token_embedders: {
73
+ "pieces": {
74
+ type: "pretrained_transformer",
75
+ model_name: pretrained_model,
76
+ }
77
+ },
78
+ },
79
+ span_extractor: {
80
+ type: 'combo',
81
+ sub_extractors: [
82
+ {
83
+ type: 'self_attentive',
84
+ },
85
+ {
86
+ type: 'bidirectional_endpoint',
87
+ }
88
+ ]
89
+ },
90
+ span_finder: {
91
+ type: "bio",
92
+ bio_encoder: {
93
+ type: "lstm",
94
+ hidden_size: bio_dim,
95
+ num_layers: bio_layers,
96
+ bidirectional: true,
97
+ dropout: dropout,
98
+ },
99
+ no_label: false,
100
+ },
101
+ span_typing: {
102
+ type: 'mlp',
103
+ hidden_dims: span_typing_dims,
104
+ },
105
+ metrics: [{type: "srl"}],
106
+
107
+ typing_loss_factor: typing_loss_factor,
108
+ ontology_path: null,
109
+ label_dim: label_dim,
110
+ max_decoding_spans: 128,
111
+ max_recursion_depth: 2,
112
+ debug: debug,
113
+ },
114
+
115
+ trainer: {
116
+ num_epochs: 128,
117
+ patience: patience,
118
+ [if std.length(cuda_devices) == 1 then "cuda_device"]: cuda_devices[0],
119
+ validation_metric: "+em_f",
120
+ grad_norm: 10,
121
+ grad_clipping: 10,
122
+ num_gradient_accumulation_steps: grad_acc,
123
+ optimizer: {
124
+ type: "transformer",
125
+ base: {
126
+ type: "adam",
127
+ lr: 1e-3,
128
+ },
129
+ embeddings_lr: 0.0,
130
+ encoder_lr: 1e-5,
131
+ pooler_lr: 1e-5,
132
+ layer_fix: layer_fix,
133
+ }
134
+ },
135
+
136
+ cuda_devices:: cuda_devices,
137
+ [if std.length(cuda_devices) > 1 then "distributed"]: {
138
+ "cuda_devices": cuda_devices
139
+ },
140
+ [if std.length(cuda_devices) == 1 then "evaluate_on_test"]: true
141
+ }
config/fn/fn.orig.jsonnet ADDED
@@ -0,0 +1,139 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ local env = import "../env.jsonnet";
2
+
3
+ local dataset_path = env.str("DATA_PATH", "data/framenet/full");
4
+ local ontology_path = "data/framenet/ontology.tsv";
5
+
6
+ local debug = false;
7
+
8
+ # reader
9
+ local pretrained_model = env.str("ENCODER", "xlm-roberta-large");
10
+ local smoothing_factor = env.json("SMOOTHING", "0.1");
11
+
12
+ # model
13
+ local label_dim = env.json("LABEL_DIM", "64");
14
+ local dropout = env.json("DROPOUT", "0.2");
15
+ local bio_dim = env.json("BIO_DIM", "512");
16
+ local bio_layers = env.json("BIO_LAYER", "2");
17
+ local span_typing_dims = env.json("TYPING_DIMS", "[256, 256]");
18
+ local typing_loss_factor = env.json("LOSS_FACTOR", "8.0");
19
+
20
+ # loader
21
+ local exemplar_ratio = env.json("EXEMPLAR_RATIO", "0.05");
22
+ local max_training_tokens = 512;
23
+ local max_inference_tokens = 1024;
24
+
25
+ # training
26
+ local layer_fix = env.json("LAYER_FIX", "0");
27
+ local grad_acc = env.json("GRAD_ACC", "1");
28
+ local cuda_devices = env.json("CUDA_DEVICES", "[-1]");
29
+ local patience = env.json("PATIENCE", "null");
30
+
31
+ {
32
+ dataset_reader: {
33
+ type: "semantic_role_labeling",
34
+ debug: debug,
35
+ pretrained_model: pretrained_model,
36
+ ignore_label: false,
37
+ [ if debug then "max_instances" ]: 128,
38
+ event_smoothing_factor: smoothing_factor,
39
+ arg_smoothing_factor: smoothing_factor,
40
+ },
41
+ train_data_path: dataset_path + "/train.jsonl",
42
+ validation_data_path: dataset_path + "/dev.jsonl",
43
+ test_data_path: dataset_path + "/test.jsonl",
44
+
45
+ datasets_for_vocab_creation: ["train"],
46
+
47
+ data_loader: {
48
+ batch_sampler: {
49
+ type: "mix_sampler",
50
+ max_tokens: max_training_tokens,
51
+ sorting_keys: ['tokens'],
52
+ sampling_ratios: {
53
+ 'exemplar': exemplar_ratio,
54
+ 'full text': 1.0,
55
+ }
56
+ }
57
+ },
58
+
59
+ validation_data_loader: {
60
+ batch_sampler: {
61
+ type: "max_tokens_sampler",
62
+ max_tokens: max_inference_tokens,
63
+ sorting_keys: ['tokens']
64
+ }
65
+ },
66
+
67
+ model: {
68
+ type: "span",
69
+ word_embedding: {
70
+ token_embedders: {
71
+ "pieces": {
72
+ type: "pretrained_transformer",
73
+ model_name: pretrained_model,
74
+ }
75
+ },
76
+ },
77
+ span_extractor: {
78
+ type: 'combo',
79
+ sub_extractors: [
80
+ {
81
+ type: 'self_attentive',
82
+ },
83
+ {
84
+ type: 'bidirectional_endpoint',
85
+ }
86
+ ]
87
+ },
88
+ span_finder: {
89
+ type: "bio",
90
+ bio_encoder: {
91
+ type: "lstm",
92
+ hidden_size: bio_dim,
93
+ num_layers: bio_layers,
94
+ bidirectional: true,
95
+ dropout: dropout,
96
+ },
97
+ no_label: false,
98
+ },
99
+ span_typing: {
100
+ type: 'mlp',
101
+ hidden_dims: span_typing_dims,
102
+ },
103
+ metrics: [{type: "srl"}],
104
+
105
+ typing_loss_factor: typing_loss_factor,
106
+ ontology_path: ontology_path,
107
+ label_dim: label_dim,
108
+ max_decoding_spans: 128,
109
+ max_recursion_depth: 2,
110
+ debug: debug,
111
+ },
112
+
113
+ trainer: {
114
+ num_epochs: 128,
115
+ patience: patience,
116
+ [if std.length(cuda_devices) == 1 then "cuda_device"]: cuda_devices[0],
117
+ validation_metric: "+em_f",
118
+ grad_norm: 10,
119
+ grad_clipping: 10,
120
+ num_gradient_accumulation_steps: grad_acc,
121
+ optimizer: {
122
+ type: "transformer",
123
+ base: {
124
+ type: "adam",
125
+ lr: 1e-3,
126
+ },
127
+ embeddings_lr: 0.0,
128
+ encoder_lr: 1e-5,
129
+ pooler_lr: 1e-5,
130
+ layer_fix: layer_fix,
131
+ }
132
+ },
133
+
134
+ cuda_devices:: cuda_devices,
135
+ [if std.length(cuda_devices) > 1 then "distributed"]: {
136
+ "cuda_devices": cuda_devices
137
+ },
138
+ [if std.length(cuda_devices) == 1 then "evaluate_on_test"]: true
139
+ }
config/fn/fn.train-football.jsonnet ADDED
@@ -0,0 +1,142 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ local env = import "../env.jsonnet";
2
+
3
+ #local dataset_path = env.str("DATA_PATH", "data/framenet/full");
4
+ local dataset_path = "/home/p289731/cloned/lome/preproc/framenet_jsonl/full";
5
+ local ontology_path = "data/framenet/ontology.tsv";
6
+
7
+ local debug = false;
8
+
9
+ # reader
10
+ #local pretrained_model = env.str("ENCODER", "xlm-roberta-large");
11
+ local pretrained_model = env.str("ENCODER", "/data/p289731/cloned/lome-models/models/xlm-roberta-football/");
12
+ local smoothing_factor = env.json("SMOOTHING", "0.1");
13
+
14
+ # model
15
+ local label_dim = env.json("LABEL_DIM", "64");
16
+ local dropout = env.json("DROPOUT", "0.2");
17
+ local bio_dim = env.json("BIO_DIM", "512");
18
+ local bio_layers = env.json("BIO_LAYER", "2");
19
+ local span_typing_dims = env.json("TYPING_DIMS", "[256, 256]");
20
+ local typing_loss_factor = env.json("LOSS_FACTOR", "8.0");
21
+
22
+ # loader
23
+ local exemplar_ratio = env.json("EXEMPLAR_RATIO", "0.05");
24
+ local max_training_tokens = 512;
25
+ local max_inference_tokens = 1024;
26
+
27
+ # training
28
+ local layer_fix = env.json("LAYER_FIX", "0");
29
+ local grad_acc = env.json("GRAD_ACC", "1");
30
+ #local cuda_devices = env.json("CUDA_DEVICES", "[-1]");
31
+ local cuda_devices = [0];
32
+ local patience = 32;
33
+
34
+ {
35
+ dataset_reader: {
36
+ type: "semantic_role_labeling",
37
+ debug: debug,
38
+ pretrained_model: "xlm-roberta-large",
39
+ ignore_label: false,
40
+ [ if debug then "max_instances" ]: 128,
41
+ event_smoothing_factor: smoothing_factor,
42
+ arg_smoothing_factor: smoothing_factor,
43
+ },
44
+ train_data_path: dataset_path + "/train.jsonl",
45
+ validation_data_path: dataset_path + "/dev.jsonl",
46
+ test_data_path: dataset_path + "/test.jsonl",
47
+
48
+ datasets_for_vocab_creation: ["train"],
49
+
50
+ data_loader: {
51
+ batch_sampler: {
52
+ type: "mix_sampler",
53
+ max_tokens: max_training_tokens,
54
+ sorting_keys: ['tokens'],
55
+ sampling_ratios: {
56
+ 'exemplar': exemplar_ratio,
57
+ 'full text': 1.0,
58
+ }
59
+ }
60
+ },
61
+
62
+ validation_data_loader: {
63
+ batch_sampler: {
64
+ type: "max_tokens_sampler",
65
+ max_tokens: max_inference_tokens,
66
+ sorting_keys: ['tokens']
67
+ }
68
+ },
69
+
70
+ model: {
71
+ type: "span",
72
+ word_embedding: {
73
+ token_embedders: {
74
+ "pieces": {
75
+ type: "pretrained_transformer",
76
+ model_name: pretrained_model,
77
+ }
78
+ },
79
+ },
80
+ span_extractor: {
81
+ type: 'combo',
82
+ sub_extractors: [
83
+ {
84
+ type: 'self_attentive',
85
+ },
86
+ {
87
+ type: 'bidirectional_endpoint',
88
+ }
89
+ ]
90
+ },
91
+ span_finder: {
92
+ type: "bio",
93
+ bio_encoder: {
94
+ type: "lstm",
95
+ hidden_size: bio_dim,
96
+ num_layers: bio_layers,
97
+ bidirectional: true,
98
+ dropout: dropout,
99
+ },
100
+ no_label: false,
101
+ },
102
+ span_typing: {
103
+ type: 'mlp',
104
+ hidden_dims: span_typing_dims,
105
+ },
106
+ metrics: [{type: "srl"}],
107
+
108
+ typing_loss_factor: typing_loss_factor,
109
+ ontology_path: null,
110
+ label_dim: label_dim,
111
+ max_decoding_spans: 128,
112
+ max_recursion_depth: 2,
113
+ debug: debug,
114
+ },
115
+
116
+ trainer: {
117
+ num_epochs: 128,
118
+ patience: patience,
119
+ [if std.length(cuda_devices) == 1 then "cuda_device"]: cuda_devices[0],
120
+ validation_metric: "+em_f",
121
+ grad_norm: 10,
122
+ grad_clipping: 10,
123
+ num_gradient_accumulation_steps: grad_acc,
124
+ optimizer: {
125
+ type: "transformer",
126
+ base: {
127
+ type: "adam",
128
+ lr: 1e-3,
129
+ },
130
+ embeddings_lr: 0.0,
131
+ encoder_lr: 1e-5,
132
+ pooler_lr: 1e-5,
133
+ layer_fix: layer_fix,
134
+ }
135
+ },
136
+
137
+ cuda_devices:: cuda_devices,
138
+ [if std.length(cuda_devices) > 1 then "distributed"]: {
139
+ "cuda_devices": cuda_devices
140
+ },
141
+ [if std.length(cuda_devices) == 1 then "evaluate_on_test"]: true
142
+ }
config/fn/fn.train3.jsonnet ADDED
@@ -0,0 +1,141 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ local env = import "../env.jsonnet";
2
+
3
+ #local dataset_path = env.str("DATA_PATH", "data/framenet/full");
4
+ local dataset_path = "/home/p289731/cloned/lome/preproc/framenet_jsonl/full";
5
+ local ontology_path = "data/framenet/ontology.tsv";
6
+
7
+ local debug = false;
8
+
9
+ # reader
10
+ local pretrained_model = env.str("ENCODER", "xlm-roberta-large");
11
+ local smoothing_factor = env.json("SMOOTHING", "0.1");
12
+
13
+ # model
14
+ local label_dim = env.json("LABEL_DIM", "64");
15
+ local dropout = env.json("DROPOUT", "0.2");
16
+ local bio_dim = env.json("BIO_DIM", "512");
17
+ local bio_layers = env.json("BIO_LAYER", "2");
18
+ local span_typing_dims = env.json("TYPING_DIMS", "[256, 256]");
19
+ local typing_loss_factor = env.json("LOSS_FACTOR", "8.0");
20
+
21
+ # loader
22
+ local exemplar_ratio = env.json("EXEMPLAR_RATIO", "0.05");
23
+ local max_training_tokens = 512;
24
+ local max_inference_tokens = 1024;
25
+
26
+ # training
27
+ local layer_fix = env.json("LAYER_FIX", "0");
28
+ local grad_acc = env.json("GRAD_ACC", "1");
29
+ #local cuda_devices = env.json("CUDA_DEVICES", "[-1]");
30
+ local cuda_devices = [0];
31
+ local patience = 32;
32
+
33
+ {
34
+ dataset_reader: {
35
+ type: "semantic_role_labeling",
36
+ debug: debug,
37
+ pretrained_model: pretrained_model,
38
+ ignore_label: false,
39
+ [ if debug then "max_instances" ]: 128,
40
+ event_smoothing_factor: smoothing_factor,
41
+ arg_smoothing_factor: smoothing_factor,
42
+ },
43
+ train_data_path: dataset_path + "/train.jsonl",
44
+ validation_data_path: dataset_path + "/dev.jsonl",
45
+ test_data_path: dataset_path + "/test.jsonl",
46
+
47
+ datasets_for_vocab_creation: ["train"],
48
+
49
+ data_loader: {
50
+ batch_sampler: {
51
+ type: "mix_sampler",
52
+ max_tokens: max_training_tokens,
53
+ sorting_keys: ['tokens'],
54
+ sampling_ratios: {
55
+ 'exemplar': exemplar_ratio,
56
+ 'full text': 1.0,
57
+ }
58
+ }
59
+ },
60
+
61
+ validation_data_loader: {
62
+ batch_sampler: {
63
+ type: "max_tokens_sampler",
64
+ max_tokens: max_inference_tokens,
65
+ sorting_keys: ['tokens']
66
+ }
67
+ },
68
+
69
+ model: {
70
+ type: "span",
71
+ word_embedding: {
72
+ token_embedders: {
73
+ "pieces": {
74
+ type: "pretrained_transformer",
75
+ model_name: pretrained_model,
76
+ }
77
+ },
78
+ },
79
+ span_extractor: {
80
+ type: 'combo',
81
+ sub_extractors: [
82
+ {
83
+ type: 'self_attentive',
84
+ },
85
+ {
86
+ type: 'bidirectional_endpoint',
87
+ }
88
+ ]
89
+ },
90
+ span_finder: {
91
+ type: "bio",
92
+ bio_encoder: {
93
+ type: "lstm",
94
+ hidden_size: bio_dim,
95
+ num_layers: bio_layers,
96
+ bidirectional: true,
97
+ dropout: dropout,
98
+ },
99
+ no_label: false,
100
+ },
101
+ span_typing: {
102
+ type: 'mlp',
103
+ hidden_dims: span_typing_dims,
104
+ },
105
+ metrics: [{type: "srl"}],
106
+
107
+ typing_loss_factor: typing_loss_factor,
108
+ ontology_path: null,
109
+ label_dim: label_dim,
110
+ max_decoding_spans: 128,
111
+ max_recursion_depth: 2,
112
+ debug: debug,
113
+ },
114
+
115
+ trainer: {
116
+ num_epochs: 128,
117
+ patience: patience,
118
+ [if std.length(cuda_devices) == 1 then "cuda_device"]: cuda_devices[0],
119
+ validation_metric: "+em_f",
120
+ grad_norm: 10,
121
+ grad_clipping: 10,
122
+ num_gradient_accumulation_steps: grad_acc,
123
+ optimizer: {
124
+ type: "transformer",
125
+ base: {
126
+ type: "adam",
127
+ lr: 1e-3,
128
+ },
129
+ embeddings_lr: 0.0,
130
+ encoder_lr: 1e-5,
131
+ pooler_lr: 1e-5,
132
+ layer_fix: layer_fix,
133
+ }
134
+ },
135
+
136
+ cuda_devices:: cuda_devices,
137
+ [if std.length(cuda_devices) > 1 then "distributed"]: {
138
+ "cuda_devices": cuda_devices
139
+ },
140
+ [if std.length(cuda_devices) == 1 then "evaluate_on_test"]: true
141
+ }
docs/data.md ADDED
@@ -0,0 +1,68 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Data Format
2
+
3
+ You can pass SpanFinder data in any format, as long as you implement a dataset reader that inherits from SpanReader. We also provide a Concrete dataset reader. In addition, SpanFinder comes with its own JSON data format, which enables richer features for training and modeling.
4
+
5
+ The minimal example of the JSON is
6
+
7
+ ```JSON
8
+ {
9
+ "meta": {
10
+ "fully_annotated": true
11
+ },
12
+ "tokens": ["Bob", "attacks", "the", "building", "."],
13
+ "annotations": [
14
+ {
15
+ "span": [1, 1],
16
+ "label": "Attack",
17
+ "children": [
18
+ {
19
+ "span": [0, 0],
20
+ "label": "Assailant",
21
+ "children": []
22
+ },
23
+ {
24
+ "span": [2, 3],
25
+ "label": "Victim",
26
+ "children": []
27
+ }
28
+ ]
29
+ },
30
+ {
31
+ "span": [3, 3],
32
+ "label": "Buildings",
33
+ "children": [
34
+ {
35
+ "span": [3, 3],
36
+ "label": "Building",
37
+ "children": []
38
+ }
39
+ ]
40
+ }
41
+ ]
42
+ }
43
+ ```
44
+
45
+ You can have nested spans with unlimited depth.
46
+
47
+ ## Meta-info for Semantic Role Labeling (SRL)
48
+
49
+ ```JSON
50
+ {
51
+ "ontology": {
52
+ "event": ["Violence-Attack"],
53
+ "argument": ["Agent", "Patient"],
54
+ "link": [[0, 0], [0, 1]]
55
+ },
56
+ "ontology_mapping": {
57
+ "event": {
58
+ "Attack": ["Violence-Attack", 0.8]
59
+ },
60
+ "argument": {
61
+ "Assailant": ["Agent", 0.95],
62
+ "Victim": ["Patient", 0.9]
63
+ }
64
+ }
65
+ }
66
+ ```
67
+
68
+ TODO: Guanghui needs to doc this.
docs/mapping.md ADDED
@@ -0,0 +1,17 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ## Mapping
2
+
3
+ If a file is passed to the predictor,
4
+ the predicted spans will be converted into a new ontology.
5
+ The file format should be
6
+
7
+ `<original parent label>\t<original label>\t<new label>`
8
+
9
+ If the predicted span is labeled as `<original label>`,
10
+ and its parent is labeled as `<original parent label>`,
11
+ it will be re-labeled as `<new label>`.
12
+ If no rules match, the span and all of its descendants will be ignored.
13
+
14
+ The `<original parent label>` is optional.
15
+ If the parent label is `@@VIRTUAL_ROOT@@`, then this rule matches the first layer of spans.
16
+ In semantic parsing, it matches events.
17
+ If the parent label is `*`, it means it can match anything.
docs/training.md ADDED
@@ -0,0 +1,65 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Training Span Finder
2
+
3
+ ## Metrics explanation
4
+
5
+ By default, the following metrics will be used
6
+
7
+ - em: (includes emp, emr, emf) Exact matching metric. A span is exactly matched iff its parent, boundaries, and label are all correctly predicted. Note that if a parent is not correctly predicted, all its children will be treated as false negatives. In other words, errors are propagated.
8
+ - sm: (includes smp, smr, smf) Span matching metric. Similar to EM but will not check the labels. If you observe high EM but low SM, then the typing system is not properly working.
9
+ - finder: (includes finder-p, finder-r, finder-f) A metric to measure how well the model can find spans. Different from SM, in this metric, gold parent will be provided, so the errors will not be propagated.
10
+ - typing_acc: Span typing accuracy with gold parent and gold span boundaries.
11
+
12
+
13
+ Optional metrics that might be useful for SRL-style tasks. Put the following line
14
+
15
+ `metrics: [{type: "srl", check_type: true}],`
16
+
17
+ to the span model in the config file to turn on this feature. You will see the following two metrics:
18
+
19
+ - trigger: (includes trigger-p, trigger-r, trigger-f) It measures how well the system can find the event triggers (or frames in FrameNet). If `check_type` is True, it also checks the event label.
20
+ - role: (includes role-p, role-r, role-f) It measures how well the system can find roles. Note that if the event/trigger is not found, all its children will be treated as false negatives. If `check_type` is True, it also checks the role label.
21
+
22
+ ## Ontology Constraint
23
+
24
+ In some cases, certain spans can only be attached to specific spans.
25
+ E.g., in SRL tasks, event can only be attached to the VirtualRoot, and arguments can only be attached to the events.
26
+ The constraints of FrameNet are stricter: each frame has its own specific frame elements.
27
+
28
+ These constraints can be abstracted as a boolean square matrix whose columns and rows are span labels including VIRTUAL_ROOT.
29
+ Call this matrix `M`: label2 can be label1's child iff `M[label1, label2]` is True.
30
+
31
+ You can specify ontology constraint for SpanFinder with the `ontology_path` argument in the SpanModel class.
32
+ The format of this file is simple. Each line is one row of the `M` matrix:
33
+
34
+ ```parent_label child_label_1 child_label_2```
35
+
36
+ which means child1 and child2 can be attached to the parent.
37
+ Both `parent_label` and `child_label` are strings, and the space between them should be `\t` not ` `.
38
+ If a parent_label is missing from the file, by default all children will be attachable to it.
39
+ If this file is not provided, all labels can be attached to all labels.
40
+
41
+ An example of this file can be found at CLSP grid:
42
+
43
+ ```/home/gqin2/data/framenet/ontology.tsv```
44
+
45
+ ## Typing loss factor
46
+
47
+ (This section might be updated soon -- Guanghui)
48
+
49
+ The loss comes from two sources: SpanFinding and SpanTyping modules.
50
+ The SpanFinding module uses a CRF and uses its probability as the loss, while the SpanTyping module uses cross entropy.
51
+ The two losses are on different scales, so we have to re-scale them.
52
+ The formula is:
53
+
54
+ `loss = finding_loss + typing_loss_factor * typing_loss`
55
+
56
+ Empirically Guanghui finds the optimal `typing_loss_factor` for FrameNet system is 750.
57
+
58
+ In theory, we should put the two losses to the same space. Guanghui is looking into this, and this might be solved in SpanFinder 0.0.2.
59
+
60
+ ## Optimizer
61
+
62
+ A custom optimizer `transformer` is used for span finder.
63
+ It allows you to specify special learning rate for transformer encoder and fix the parameters of certain modules.
64
+ Empirically, fixing the embeddings (so that only the encoder and pooler are fine-tuned) and training with lr=1e-5 yields the best results for FrameNet.
65
+ For usage and more details, see its class doc.
evalita_scores.txt ADDED
File without changes
model.mod.tar.gz ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3f5be5aeef50b2f4840317b8196c51186f9f138a853dc1eb2da980b1947ceb23
3
+ size 1795605184
requirements.txt ADDED
@@ -0,0 +1,15 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ allennlp>=2.0.0
2
+ allennlp-models>=2.0.0
3
+ transformers>=4.0.0 # Why is huggingface so unstable?
4
+ numpy
5
+ torch>=1.7.0,<1.8.0
6
+ tqdm
7
+ nltk
8
+ overrides
9
+ concrete
10
+ flask
11
+ scipy
12
+ https://github.com/explosion/spacy-models/releases/download/it_core_news_md-3.0.0/it_core_news_md-3.0.0-py3-none-any.whl
13
+ https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.0.0/en_core_web_md-3.0.0-py3-none-any.whl
14
+ https://github.com/explosion/spacy-models/releases/download/nl_core_news_md-3.0.0/nl_core_news_md-3.0.0-py3-none-any.whl
15
+ https://github.com/explosion/spacy-models/releases/download/xx_sent_ud_sm-3.0.0/xx_sent_ud_sm-3.0.0-py3-none-any.whl
scripts/__pycache__/predict_concrete.cpython-37.pyc ADDED
Binary file (1.35 kB). View file
 
scripts/__pycache__/predict_concrete.cpython-38.pyc ADDED
Binary file (1.35 kB). View file
 
scripts/__pycache__/predict_concrete.cpython-39.pyc ADDED
Binary file (1.35 kB). View file
 
scripts/__pycache__/predict_force.cpython-39.pyc ADDED
Binary file (1.15 kB). View file
 
scripts/__pycache__/repl.cpython-39.pyc ADDED
Binary file (440 Bytes). View file
 
scripts/aida_experiment/predict_aida.py ADDED
@@ -0,0 +1,42 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import copy
4
+ from collections import defaultdict
5
+ from argparse import ArgumentParser
6
+ from tqdm import tqdm
7
+ import random
8
+ from tqdm import tqdm
9
+ from scripts.predict_concrete import read_kairos
10
+
11
+ from sftp import SpanPredictor
12
+
13
+
14
# Script body: for every annotated span in the AIDA corpus, force-decode the
# span with the trained model and append the top-k span labels (with their
# scores) to the destination file as JSON lines.
parser = ArgumentParser()
parser.add_argument('aida', type=str)
parser.add_argument('model', type=str)
parser.add_argument('dst', type=str)
parser.add_argument('--topk', type=int, default=10)
parser.add_argument('--device', type=int, default=0)
args = parser.parse_args()

k = args.topk
# Close the corpus file as soon as it has been parsed.
with open(args.aida) as corpus_fp:
    corpus = json.load(corpus_fp)
predictor = SpanPredictor.from_path(args.model, cuda_device=args.device)
idx2fn = predictor._model.vocab.get_index_to_token_vocabulary('span_label')
# Fixed seed so that the processing order is reproducible across runs.
random.seed(42)
random.shuffle(corpus)


# Append mode is kept from the original script; the context manager guarantees
# the output is flushed and closed even if a prediction raises half-way.
with open(args.dst, 'a') as output_fp:
    for line in tqdm(corpus):
        tokens, ann = line['tokens'], line['annotation']
        start, end, kairos_label = ann['start_idx'], ann['end_idx'], ann['label']
        # NOTE(review): presumably a score distribution over span labels for
        # the single forced span -- confirm against SpanPredictor.force_decode.
        prob_dist = predictor.force_decode(tokens, [(start, end)])[0]
        topk_indices = prob_dist.argsort(descending=True)[:k]
        prob = prob_dist[topk_indices].tolist()
        frames = [(idx2fn[int(idx)], p) for idx, p in zip(topk_indices, prob)]
        output_fp.write(json.dumps({
            'tokens': tokens,
            'frames': frames,
            'kairos': kairos_label
        }) + '\n')
scripts/aida_experiment/read_aida.py ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import copy
4
+ from collections import defaultdict
5
+ from argparse import ArgumentParser
6
+ from tqdm import tqdm
7
+
8
+
9
def extract_sentences(raw_doc):
    """Group the token offsets of a document by the sentence containing them.

    Sentences and tokens are read from ``raw_doc['_views']['_InitialView']``;
    a missing ``'begin'`` offset is treated as 0.

    Returns a triple ``(sentence_tokens, begin2sentence, end2sentence)``:

    - ``sentence_tokens``: one ``[(start, end), token_spans, event_list]``
      record per sentence; ``event_list`` is left empty here.
    - ``begin2sentence``: token begin offset -> ``(sentence idx, token idx)``.
    - ``end2sentence``: token end offset -> ``(sentence idx, token idx)``.

    Every token must fall inside exactly one sentence (checked by assertions).
    """
    view = raw_doc['_views']['_InitialView']
    sentence_tokens = [
        [(sent.get('begin', 0), sent.get('end')), [], []]
        for sent in view['Sentence']
    ]

    begin2sentence = {}
    end2sentence = {}
    for token in view['Token']:
        tok_begin = token.get('begin', 0)
        tok_end = token.get('end')
        added = False
        for sent_idx, (bound, token_spans, _) in enumerate(sentence_tokens):
            covered = range(*bound)
            # Both the first and the last character must lie in the sentence.
            if tok_begin not in covered or (tok_end - 1) not in covered:
                continue
            assert not added
            position = (sent_idx, len(token_spans))
            begin2sentence[tok_begin] = position
            end2sentence[tok_end] = position
            token_spans.append((tok_begin, tok_end))
            added = True
        assert added
    return sentence_tokens, begin2sentence, end2sentence
27
+
28
+
29
def read_aida2kairos(mapping_path):
    """Read the AIDA -> KAIROS label mapping from a tab-separated file.

    Each non-blank line has the form ``<kairos label>\\t<aida labels>``, where
    the AIDA labels are separated by whitespace and/or commas (commas are
    stripped from the whole line). The placeholder entries ``x``, ``?`` and
    ``x?`` are skipped.

    Returns a dict mapping every AIDA label to its KAIROS label. If an AIDA
    label occurs more than once, a warning is printed and the last occurrence
    wins.
    """
    mapping = dict()
    # Context manager so the handle is closed even if a line is malformed.
    with open(mapping_path) as fp:
        for line in fp:
            line = line.replace('\n', '').replace(',', '')
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing empty line).
                continue
            kairos, aida_list = line.split('\t')
            for aida in aida_list.split():
                # Same tokens the original substring test `aida in 'x?'`
                # matched, spelled out explicitly.
                if aida in ('x', '?', 'x?'):
                    continue
                if aida in mapping:
                    print('warning:', aida, 'already in the mapping, repeated.')
                mapping[aida] = kairos
    return mapping
40
+
41
+
42
def read_aida(corpus_path, mapping_path):
    """Read AIDA annotations and convert them to per-event sentence records.

    ``corpus_path`` is a folder with one sub-folder per event; the part of the
    sub-folder name before the first ``-`` is looked up in the AIDA -> KAIROS
    mapping read from ``mapping_path``. Sub-folders whose event is not in the
    mapping are skipped with a warning. Inside each sub-folder every file whose
    name ends with ``json`` is parsed.

    Annotations are dropped (and counted) when they are negative examples, when
    their character offsets do not align with token boundaries, or when they
    span more than one sentence.

    Returns a list of dicts ``{'tokens': [...], 'annotation': {'start_idx',
    'end_idx', 'label'}}`` where the indices are token indices within the
    sentence and ``label`` is the mapped KAIROS event name.
    """
    print('reading aida data')
    n_negative, n_span_mismatch, n_diff = 0, 0, 0
    outputs = list()
    mapping = read_aida2kairos(mapping_path)
    for event_fn in tqdm(os.listdir(corpus_path)):
        event_name = event_fn.split('-')[0]
        if event_name not in mapping:
            print('warning:', event_name, 'not in the mapping.')
            continue
        event_name = mapping[event_name]

        for doc_name in os.listdir(os.path.join(corpus_path, event_fn)):
            if not doc_name.endswith('json'):
                continue
            # Close each document right after parsing; a large corpus would
            # otherwise accumulate open handles until garbage collection.
            with open(os.path.join(corpus_path, event_fn, doc_name)) as doc_fp:
                raw_doc = json.load(doc_fp)
            sentences, begin2sentence, end2sentence = extract_sentences(raw_doc)
            for fss_no, fss in raw_doc['_referenced_fss'].items():
                # Entry '1' holds the document text (read below), not an annotation.
                if fss_no == '1':
                    continue
                begin, end, is_negative = fss['begin'], fss['end'], fss['negative_example']
                if is_negative:
                    n_negative += 1
                    continue
                if begin not in begin2sentence or end not in end2sentence:
                    n_span_mismatch += 1
                    continue
                (b_idx_sent, b_idx_token), (e_idx_sent, e_idx_token) = begin2sentence[begin], end2sentence[end]
                if b_idx_sent != e_idx_sent:
                    n_diff += 1
                    continue
                sentences[b_idx_sent][2].append([b_idx_token, e_idx_token])

            text = raw_doc['_referenced_fss']['1']['sofaString']

            for _, tokens, events in sentences:
                # Turn (start, end) character offsets into token strings.
                tokens = [text[start:end] for start, end in tokens]
                for (start, end) in events:
                    outputs.append({
                        'tokens': copy.deepcopy(tokens),
                        'annotation': {
                            'start_idx': start,
                            'end_idx': end,
                            'label': event_name,
                        }
                    })

    print(f'Loaded {len(outputs)} annotations.')
    print(f'{n_negative} negative annotations are ignored.')
    print(f'{n_span_mismatch} mismatched annotations are ignored.')
    print(f'{n_diff} annotations across sentences are ignored.')

    return outputs
95
+
96
+
97
if __name__ == '__main__':
    # CLI: read_aida.py <aida corpus folder> <aida2kairos mapping> <output json>
    parser = ArgumentParser()
    parser.add_argument('aida', type=str)
    parser.add_argument('aida2kairos', type=str)
    parser.add_argument('dst', type=str)
    args = parser.parse_args()

    aida = read_aida(args.aida, args.aida2kairos)

    # Write through a context manager so the file is flushed and closed
    # deterministically instead of relying on garbage collection.
    with open(args.dst, 'w') as dst_fp:
        json.dump(aida, dst_fp)
107
+
scripts/aida_experiment/test_mapping.py ADDED
@@ -0,0 +1,59 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ import os
3
+ import copy
4
+ from collections import defaultdict
5
+ from argparse import ArgumentParser
6
+ from tqdm import tqdm
7
+ import random
8
+ from tqdm import tqdm
9
+ from scripts.predict_concrete import read_kairos
10
+
11
+ from sftp import SpanPredictor
12
+
13
+
14
# Script setup: parse the CLI, load the corpus, the FrameNet -> KAIROS mapping
# and the model, then shuffle the corpus with a fixed seed.
parser = ArgumentParser()
parser.add_argument('aida', type=str)
parser.add_argument('model', type=str)
parser.add_argument('fn2kairos', type=str, default=None)
parser.add_argument('--device', type=int, default=3)
args = parser.parse_args()

# Close the corpus file as soon as it has been parsed.
with open(args.aida) as corpus_fp:
    corpus = json.load(corpus_fp)
mapping = read_kairos(args.fn2kairos)
predictor = SpanPredictor.from_path(args.model, cuda_device=args.device)
random.seed(42)
random.shuffle(corpus)
# Number of sentences sent to the predictor per call.
batch_size = 128
27
+
28
+
29
def batchify(a_list):
    """Yield consecutive chunks of ``a_list`` as lists.

    Every chunk holds ``batch_size`` items (module-level setting) except
    possibly the last one, which holds the remainder. Nothing is yielded for
    an empty input.
    """
    bucket = []
    for element in a_list:
        bucket.append(element)
        if len(bucket) != batch_size:
            continue
        yield bucket
        bucket = []
    # Flush the final, partially filled chunk.
    if bucket:
        yield bucket
38
+
39
+
40
batches = list(batchify(corpus))


# Each corpus item carries one gold annotation. Count how often some predicted
# span matches its boundaries (span match) and additionally its label (exact
# match).
n_total = n_pos = n_span_match = 0
for lines in tqdm(batches):
    # Count the items actually in this batch: the last batch is usually
    # smaller than batch_size, so adding batch_size would inflate the total.
    n_total += len(lines)
    prediction_lines = predictor.predict_batch_sentences(
        [line['tokens'] for line in lines], max_tokens=1024, ontology_mapping=mapping
    )
    for preds, ann in zip(prediction_lines, lines):
        ann = ann['annotation']
        preds = preds['prediction']
        for pred in preds:
            if pred['start_idx'] == ann['start_idx'] and pred['end_idx'] == ann['end_idx']:
                n_span_match += 1
                if pred['label'] == ann['label']:
                    n_pos += 1

# Avoid ZeroDivisionError on an empty corpus; both ratios are then 0.
denominator = max(n_total, 1)
print(f'exact match precision: {n_pos * 100 / denominator:.3f}')
print(f'span only precision: {n_span_match * 100 / denominator:.3f}')
scripts/archive/eval_tie.py ADDED
@@ -0,0 +1,50 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import sys
3
+ import json
4
+ from pprint import pprint
5
+ from collections import defaultdict
6
+
7
+ from sftp.metrics.exact_match import ExactMatch
8
+
9
+
10
def evaluate():
    """Score predicted frames against gold frames with exact/span matching.

    Reads two JSON-lines paths from ``sys.argv[1:]``: the gold file and the
    prediction file. Both are keyed by ``meta['sentence ID']``; a sentence may
    have several prediction lines (one per predicted frame). Frames get parent
    index 0 and each frame element gets the 1-based index of its frame as
    parent. The ``ExactMatch(True)`` and ``ExactMatch(False)`` metrics are
    printed under the headings ``EM`` and ``SM``.
    """
    em = ExactMatch(True)
    sm = ExactMatch(False)
    gold_file, pred_file = sys.argv[1:]

    # Parse every gold line once and close the file when done.
    test_sentences = dict()
    with open(gold_file) as gold_fp:
        for line in gold_fp:
            gold_entry = json.loads(line)
            test_sentences[gold_entry['meta']['sentence ID']] = gold_entry

    pred_sentences = defaultdict(list)
    with open(pred_file) as pred_fp:
        for line in pred_fp:
            one_pred = json.loads(line)
            pred_sentences[one_pred['meta']['sentence ID']].append(one_pred)

    span_keys = ["start_idx", "end_idx", "label"]
    for sent_id, gold_sent in test_sentences.items():
        # Sentences without any prediction count as having no predicted spans.
        pred_sent = pred_sentences.get(sent_id, [])
        pred_frames, pred_fes = [], []
        for fr_idx, fr in enumerate(pred_sent):
            pred_frames.append({key: fr[key] for key in span_keys})
            pred_frames[-1]['parent'] = 0
            for fe in fr['children']:
                pred_fes.append({key: fe[key] for key in span_keys})
                pred_fes[-1]['parent'] = fr_idx+1
        pred_to_eval = pred_frames + pred_fes

        gold_frames, gold_fes = [], []
        for fr_idx, fr in enumerate(gold_sent['frame']):
            gold_frames.append({
                'start_idx': fr['target'][0], 'end_idx': fr['target'][-1], "label": fr['name'], 'parent': 0
            })
            for start_idx, end_idx, fe_name in fr['fe']:
                gold_fes.append({
                    "start_idx": start_idx, "end_idx": end_idx, "label": fe_name, "parent": fr_idx+1
                })
        gold_to_eval = gold_frames + gold_fes
        em(pred_to_eval, gold_to_eval)
        sm(pred_to_eval, gold_to_eval)

    print('EM')
    pprint(em.get_metric(True))
    print('SM')
    pprint(sm.get_metric(True))
47
+
48
+
49
+ if __name__ == '__main__':
50
+ evaluate()
scripts/archive/frame_similarity.py ADDED
@@ -0,0 +1,143 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from argparse import ArgumentParser
2
+ from collections import defaultdict
3
+
4
+ from torch import nn
5
+ from copy import deepcopy
6
+ import torch
7
+ import os
8
+ import json
9
+
10
+ from sftp import SpanPredictor
11
+ import nltk
12
+
13
+
14
def shift_grid_cos_sim(mat: torch.Tensor):
    """Return the pairwise cosine similarity of the rows of a 2-D tensor.

    The cosine values are shifted from ``[-1, 1]`` to ``[0, 1]`` via
    ``(cos + 1) / 2``. For an input of shape ``(n, d)`` the result has shape
    ``(n, n)``.
    """
    n_rows = mat.shape[0]
    # Build two (n, n, d) views so every row is paired with every other row.
    along_cols = mat.unsqueeze(0).expand(n_rows, -1, -1)
    along_rows = mat.unsqueeze(1).expand(-1, n_rows, -1)
    raw = nn.functional.cosine_similarity(along_cols, along_rows, dim=2)
    return (raw + 1) / 2
20
+
21
+
22
def all_frames():
    """Fetch the ``framenet_v17`` NLTK resource and return its frames."""
    nltk.download('framenet_v17')
    return nltk.corpus.framenet.frames()
26
+
27
+
28
def extract_relations(fr):
    """List the frames related to ``fr`` through its frame relations.

    For each relation in ``fr.frameRelations`` the ``subFrameName`` and
    ``superFrameName`` entries are inspected (in that order); entries equal to
    ``fr.name`` itself are skipped. Returns a list of ``(frame name, role)``
    tuples where role is ``'subFrame'`` or ``'superFrame'``.
    """
    excluded = {fr.name}
    related = []
    for relation in fr.frameRelations:
        for field in ('subFrameName', 'superFrameName'):
            other_name = relation[field]
            if other_name not in excluded:
                # Strip the trailing 'Name' to get the role tag.
                related.append((other_name, field[:-4]))
    return related
38
+
39
+
40
def run():
    """Dump frame-to-frame similarity tables derived from a trained model.

    CLI arguments: the model archive, the destination folder, a KAIROS gold
    mapping JSON, and ``--topk`` (number of most similar frames to list).

    Two similarity matrices are computed with ``shift_grid_cos_sim``: one from
    the span-typing label embeddings and one from the weight of the last
    span-typing MLP layer. For each, the folder ``<dst>/emb`` or ``<dst>/mlp``
    receives ``raw.json``, ``similarity.tsv``, ``ontology.tsv`` and
    ``kairos_sheet.tsv``.
    """
    parser = ArgumentParser()
    parser.add_argument('archive', metavar='ARCHIVE_PATH', type=str)
    parser.add_argument('dst', metavar='DESTINATION', type=str)
    parser.add_argument('kairos', metavar='KAIROS', type=str)
    parser.add_argument('--topk', metavar='TOPK', type=int, default=10)
    args = parser.parse_args()

    predictor = SpanPredictor.from_path(args.archive, cuda_device=-1)
    with open(args.kairos) as kairos_fp:
        kairos_gold_mapping = json.load(kairos_fp)

    label_emb = predictor._model._span_typing.label_emb.weight.clone().detach()
    idx2label = predictor._model.vocab.get_index_to_token_vocabulary('span_label')

    emb_sim = shift_grid_cos_sim(label_emb)
    # Fetch the frame list once and reuse it below instead of re-running
    # all_frames() (and its download check) for every ranking.
    frames = list(all_frames())
    fr2definition = {fr.name: (fr.URL, fr.definition) for fr in frames}

    last_mlp = predictor._model._span_typing.MLPs[-1].weight.detach().clone()
    mlp_sim = shift_grid_cos_sim(last_mlp)

    def rank_frame(sim):
        # For every frame label, list the other frame labels by descending
        # similarity; labels that are not FrameNet frames are ignored.
        rank = sim.argsort(1, True)
        scores = sim.gather(1, rank)
        mapping = {
            fr.name: {
                'similarity': list(),
                'ontology': extract_relations(fr),
                'URL': fr.URL,
                'definition': fr.definition
            } for fr in frames
        }
        for left_idx, (right_indices, match_scores) in enumerate(zip(rank, scores)):
            left_label = idx2label[left_idx]
            if left_label not in mapping:
                continue
            for right_idx, s in zip(right_indices, match_scores):
                right_label = idx2label[int(right_idx)]
                if right_label not in mapping or right_idx == left_idx:
                    continue
                mapping[left_label]['similarity'].append((right_label, float(s)))
        return mapping

    emb_map = rank_frame(emb_sim)
    mlp_map = rank_frame(mlp_sim)

    def dump(mapping, folder_path):
        # Write the raw mapping plus three tab-separated views of it. Every
        # file is written through a context manager so it is flushed/closed.
        os.makedirs(folder_path, exist_ok=True)
        with open(os.path.join(folder_path, 'raw.json'), 'w') as fp:
            json.dump(mapping, fp)
        sim_lines, onto_lines = list(), list()

        for fr, values in mapping.items():
            sim_line = [
                fr,
                values['definition'],
                values['URL'],
            ]
            onto_line = deepcopy(sim_line)
            for rel_fr_name, rel_type in values['ontology']:
                onto_line.append(f'{rel_fr_name} ({rel_type})')
            onto_lines.append('\t'.join(onto_line))
            if len(values['similarity']) > 0:
                for sim_fr_name, score in values['similarity'][:args.topk]:
                    sim_line.append(f'{sim_fr_name} ({score:.3f})')
                sim_lines.append('\t'.join(sim_line))

        with open(os.path.join(folder_path, 'similarity.tsv'), 'w') as fp:
            fp.write('\n'.join(sim_lines))
        with open(os.path.join(folder_path, 'ontology.tsv'), 'w') as fp:
            fp.write('\n'.join(onto_lines))

        # One 'GOLD' row per gold frame of a KAIROS event, followed by rows
        # for its top-k most similar frames.
        kairos_dump = list()
        for kairos_event, kairos_content in kairos_gold_mapping.items():
            for gold_fr in kairos_content['framenet']:
                gold_fr = gold_fr['label']
                if gold_fr not in fr2definition:
                    continue
                kairos_dump.append([
                    'GOLD',
                    gold_fr,
                    kairos_event,
                    fr2definition[gold_fr][0],
                    fr2definition[gold_fr][1],
                    str(kairos_content['description']),
                    '1.00'
                ])
                for ass_fr, sim_score in mapping[gold_fr]['similarity'][:args.topk]:
                    kairos_dump.append([
                        '',
                        ass_fr,
                        kairos_event,
                        fr2definition[ass_fr][0],
                        fr2definition[ass_fr][1],
                        str(kairos_content['description']),
                        f'{sim_score:.2f}'
                    ])
        kairos_dump = list(map(lambda line: '\t'.join(line), kairos_dump))
        with open(os.path.join(folder_path, 'kairos_sheet.tsv'), 'w') as fp:
            fp.write('\n'.join(kairos_dump))

    dump(mlp_map, os.path.join(args.dst, 'mlp'))
    dump(emb_map, os.path.join(args.dst, 'emb'))
140
+
141
+
142
+ if __name__ == '__main__':
143
+ run()
scripts/archive/kairos_mapping.py ADDED
@@ -0,0 +1,43 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import argparse
2
+ import os
3
+ import json
4
+
5
+
6
def main():
    """Re-label span finder predictions with KAIROS event types.

    CLI arguments: the mapping JSON (KAIROS event -> ``{'framenet': [{'label':
    ...}, ...]}``), the span finder results (JSON lines, each with a
    ``'prediction'`` list), and the output path.

    Every predicted frame whose label appears in the mapping is renamed to the
    KAIROS event; all other frames are dropped. If a frame is listed under
    several events, a notice is printed and the last event wins. The result is
    written as JSON lines and a summary is printed.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('map', metavar='MappingFile', type=str, help="Mapping JSON file.")
    parser.add_argument('src', metavar='SourceFile', type=str, help="Results of span finder.")
    parser.add_argument('dst', metavar='Destination', type=str, help="Output path.")
    args = parser.parse_args()
    assert os.path.exists(args.map), "Mapping file doesn't exist."
    assert os.path.exists(args.src), "Source file not found."

    with open(args.map) as map_fp:
        k_raw = json.load(map_fp)
    # Invert the mapping: FrameNet frame label -> KAIROS event.
    k_map = dict()
    for kairos_event, content in k_raw.items():
        for fr in content['framenet']:
            if fr['label'] in k_map:
                print("Duplicate frame: " + fr['label'])
            k_map[fr['label']] = kairos_event
    with open(args.src) as src_fp:
        inputs = list(map(json.loads, src_fp.readlines()))

    n_total = n_mapped = 0

    for line in inputs:
        new_frames = list()
        n_total += len(line['prediction'])
        for fr in line['prediction']:
            if fr['label'] in k_map:
                fr['label'] = k_map[fr['label']]
                new_frames.append(fr)
                n_mapped += 1
        line['prediction'] = new_frames

    with open(args.dst, 'w') as fp:
        fp.write('\n'.join(map(json.dumps, inputs)))

    print(f'Done. Among {n_total} frames, {n_mapped} are mapped to KAIROS ontology, others are omitted.')
40
+
41
+
42
+ if __name__ == '__main__':
43
+ main()
scripts/archive/onto_test.py ADDED
@@ -0,0 +1,34 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import json
2
+ from tools.framenet.naive_identifier import FrameIdentifier
3
+
4
# Script body: evaluate the naive FrameIdentifier on the FrameNet test split.
# Every gold frame contributes one (frame name, target words, lexical unit)
# test case; failures are printed as tab-separated rows, followed by the
# counts and precision/recall.
test_file_path = '/home/gqin2/data/framenet/full/test.jsonl'
# Close the test file once all lines have been parsed.
with open(test_file_path) as test_fp:
    test_sentences = [
        json.loads(line) for line in test_fp
    ]
test_set = []
for ann in test_sentences:
    for fr in ann['frame']:
        test_set.append((fr['name'], ann['text'][fr['target'][0]: fr['target'][-1]+1], fr['lu']))

fi = FrameIdentifier()


tp = fp = fn = 0
fails = []
for frame, target_words, lu in test_set:
    pred = fi(target_words)
    if frame in pred:
        tp += 1
        # Every other predicted frame for this target is a false positive.
        fp += len(pred) - 1
    else:
        fp += len(pred)
        fn += 1
        fails.append((frame, target_words, pred, lu))

fails.sort(key=lambda x: x[0])
for frame, target_words, pred, lu in fails:
    print(frame, ' '.join(target_words), ' '.join(pred), lu, sep='\t')

print(f'tp={tp}, fp={fp}, fn={fn}')
# Report 0.0 instead of raising ZeroDivisionError when there is nothing to
# divide by (empty test set or no predictions at all).
precision = tp / (tp + fp) if tp + fp else 0.0
recall = tp / (tp + fn) if tp + fn else 0.0
print(f'precision={precision}')
print(f'recall={recall}')
scripts/archive/predict_better.py ADDED
@@ -0,0 +1,47 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import *
2
+ import torch
3
+ import json
4
+ import argparse
5
+ import os
6
+ from tqdm import tqdm
7
+
8
+ from sftp.predictor import SpanPredictor
9
+ from sftp.models import SpanModel
10
+ from sftp.data_reader import BetterDatasetReader
11
+
12
+
13
def predict_doc(predictor, json_path: str):
    """Run the predictor over every entry of a JSON document.

    The file at ``json_path`` must hold a dict with an ``'entries'`` mapping.
    Each entry is passed to ``predictor.predict_json``; the predicted triggers
    are stored on the entry under ``'trigger span'`` as a list of
    ``{'span': [start, end], 'argument': [[start, end], ...]}``.

    Returns the loaded document with its entries updated in place.
    """
    # Close the source file as soon as it has been parsed.
    with open(json_path) as src_fp:
        src = json.load(src_fp)
    for doc_name, entry in tqdm(list(src['entries'].items())):
        pred = predictor.predict_json(entry)
        triggers = list()
        for trigger in pred['prediction']:
            children = [
                [child['start_idx'], child['end_idx']]
                for child in trigger['children']
            ]
            triggers.append({
                "span": [trigger['start_idx'], trigger['end_idx']],
                "argument": children
            })
        entry['trigger span'] = triggers
    return src
28
+
29
+
30
if __name__ == '__main__':
    # CLI: predict every *.json / *.valid file found under the source path and
    # write the results to <destination>/<model folder name>/<file name>.
    parser = argparse.ArgumentParser()
    # The three paths are mandatory: without them the os.path calls below
    # would fail with an opaque TypeError on None.
    parser.add_argument('-a', type=str, required=True, help='archive path')
    parser.add_argument('-s', type=str, required=True, help='source path')
    parser.add_argument('-d', type=str, required=True, help='destination path')
    parser.add_argument('-c', type=int, default=0, help='cuda device')
    args = parser.parse_args()
    predictor_ = SpanPredictor.from_path(os.path.join(args.a, 'model.tar.gz'), 'span', cuda_device=args.c)
    # normpath drops a trailing separator, which would otherwise make
    # basename() return '' and lose the model name.
    model_name = os.path.basename(os.path.normpath(args.a))
    tgt_path = os.path.join(args.d, model_name)
    os.makedirs(tgt_path, exist_ok=True)
    for root, _, files in os.walk(args.s):
        for fn in files:
            if not fn.endswith('json') and not fn.endswith('valid'):
                continue
            processed_json = predict_doc(predictor_, os.path.join(root, fn))
            with open(os.path.join(tgt_path, fn), 'w') as fp:
                json.dump(processed_json, fp)
scripts/archive/predict_kairos.py ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import argparse
3
+ from xml.etree import ElementTree
4
+ import copy
5
+ from operator import attrgetter
6
+ import json
7
+ import logging
8
+
9
+ from sftp import SpanPredictor
10
+
11
+
12
def predict_kairos(model_archive, source_folder, onto_map):
    """Predict frames for all XML files in a folder and map them to KAIROS.

    Args:
        model_archive: path of the model archive passed to
            ``SpanPredictor.from_path``.
        source_folder: folder that is searched recursively for ``.xml`` files.
        onto_map: path of a JSON file mapping each KAIROS event to a dict
            whose ``'framenet'`` list holds ``{'label': <frame>}`` items.

    In each XML file, the children of the root are treated as documents, the
    first child of a document as its text, and the children of the text as
    segments. The ``TOKEN`` children of a segment, ordered by their
    ``start_char`` attribute, form one sentence.

    Returns a list with one prediction dict per segment. Predicted frames whose
    label is in the mapping are renamed to the KAIROS event, all others are
    dropped; ``'meta'`` holds the document attributes plus the segment
    attributes under ``'seg'``.
    """
    xml_files = list()
    for root, _, files in os.walk(source_folder):
        for f in files:
            if f.endswith('.xml'):
                xml_files.append(os.path.join(root, f))
    logging.info(f'{len(xml_files)} files are found:')
    for fn in xml_files:
        logging.info(' - ' + fn)

    logging.info('Loading ontology from ' + onto_map)
    with open(onto_map) as onto_fp:
        onto_raw = json.load(onto_fp)
    # Invert the mapping: FrameNet frame label -> KAIROS event.
    k_map = dict()
    for kairos_event, content in onto_raw.items():
        for fr in content['framenet']:
            if fr['label'] in k_map:
                logging.info("Duplicate frame: " + fr['label'])
            k_map[fr['label']] = kairos_event

    logging.info('Loading model from ' + model_archive + ' ...')
    predictor = SpanPredictor.from_path(model_archive)

    predictions = list()

    for fn in xml_files:
        logging.info('Now processing ' + os.path.basename(fn))
        tree = ElementTree.parse(fn).getroot()
        for doc in tree:
            doc_meta = copy.deepcopy(doc.attrib)
            text = list(doc)[0]
            for seg in text:
                seg_meta = copy.deepcopy(doc_meta)
                seg_meta['seg'] = copy.deepcopy(seg.attrib)
                tokens = [child for child in seg if child.tag == 'TOKEN']
                # XML attribute values are strings; compare offsets as ints so
                # that e.g. '10' sorts after '9' and the word order is kept.
                tokens.sort(key=lambda t: int(t.attrib['start_char']))
                words = list(map(attrgetter('text'), tokens))
                one_pred = predictor.predict_sentence(words)
                one_pred['meta'] = seg_meta

                new_frames = list()
                for fr in one_pred['prediction']:
                    if fr['label'] in k_map:
                        fr['label'] = k_map[fr['label']]
                        new_frames.append(fr)
                one_pred['prediction'] = new_frames

                predictions.append(one_pred)

    logging.info('Finished Prediction.')

    return predictions
62
+
63
+
64
def do_task(input_dir, model_archive, onto_map):
    """
    Entry point invoked by the KAIROS infrastructure code, once per
    TASK1 input.

    Delegates to `predict_kairos`, using `input_dir` as the source folder,
    and returns its result.
    """
    call_kwargs = dict(
        onto_map=onto_map,
        source_folder=input_dir,
        model_archive=model_archive,
    )
    return predict_kairos(**call_kwargs)
73
+
74
+
75
def run():
    """Command-line entry point: predict on a folder of XMLs, save as JSON lines.

    Parses the model archive, source folder, ontology map and destination
    path, runs `predict_kairos`, and writes one JSON object per line to the
    destination file (its parent folder is created if needed).
    """
    parser = argparse.ArgumentParser(description='Span Finder for KAIROS Quizlet4\n')
    parser.add_argument('model_archive', metavar='MODEL_ARCHIVE', type=str, help='Path to model archive file.')
    parser.add_argument('source_folder', metavar='SOURCE_FOLDER', type=str, help='Path to the folder that contains the XMLs.')
    parser.add_argument('onto_map', metavar='ONTO_MAP', type=str, help='Path to the ontology JSON.')
    parser.add_argument('destination', metavar='DESTINATION', type=str, help='Output path. (jsonl file path)')
    args = parser.parse_args()

    logging.basicConfig(level='INFO', format="%(asctime)s %(name)-12s %(levelname)-8s %(message)s")

    predictions = predict_kairos(model_archive=args.model_archive,
                                 source_folder=args.source_folder,
                                 onto_map=args.onto_map)

    logging.info('Saving to ' + args.destination + ' ...')
    # A bare file name has an empty dirname, and os.makedirs('') raises
    # FileNotFoundError; only create the folder when there is one.
    destination_dir = os.path.dirname(args.destination)
    if destination_dir:
        os.makedirs(destination_dir, exist_ok=True)
    with open(args.destination, 'w') as fp:
        fp.write('\n'.join(map(json.dumps, predictions)))

    logging.info('Done.')
95
+
96
+
97
+ if __name__ == '__main__':
98
+ run()