kormilitzin committed on
Commit
aa0e64c
1 Parent(s): c097d43

Update spaCy pipeline

Browse files
.gitattributes CHANGED
@@ -30,3 +30,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
30
  *strings.json filter=lfs diff=lfs merge=lfs -text
31
  vectors filter=lfs diff=lfs merge=lfs -text
32
  model filter=lfs diff=lfs merge=lfs -text
 
 
30
  *strings.json filter=lfs diff=lfs merge=lfs -text
31
  vectors filter=lfs diff=lfs merge=lfs -text
32
  model filter=lfs diff=lfs merge=lfs -text
33
+ vocab/key2row filter=lfs diff=lfs merge=lfs -text
README.md CHANGED
@@ -14,25 +14,25 @@ model-index:
14
  metrics:
15
  - name: NER Precision
16
  type: precision
17
- value: 0.8910716705
18
  - name: NER Recall
19
  type: recall
20
- value: 0.9043035886
21
  - name: NER F Score
22
  type: f_score
23
- value: 0.8976388699
24
  ---
25
  | Feature | Description |
26
  | --- | --- |
27
  | **Name** | `en_core_med7_trf` |
28
- | **Version** | `3.1.3.1` |
29
- | **spaCy** | `>=3.1.4,<3.2.0` |
30
  | **Default Pipeline** | `transformer`, `ner` |
31
  | **Components** | `transformer`, `ner` |
32
- | **Vectors** | 0 keys, 0 unique vectors (0 dimensions) |
33
  | **Sources** | n/a |
34
  | **License** | `MIT` |
35
- | **Author** | [Andrey Kormilitzin](kormilitzin.com) |
36
 
37
  ### Label Scheme
38
 
@@ -50,8 +50,8 @@ model-index:
50
 
51
  | Type | Score |
52
  | --- | --- |
53
- | `ENTS_F` | 89.76 |
54
- | `ENTS_P` | 89.11 |
55
- | `ENTS_R` | 90.43 |
56
- | `TRANSFORMER_LOSS` | 209606.10 |
57
- | `NER_LOSS` | 874893.84 |
 
14
  metrics:
15
  - name: NER Precision
16
  type: precision
17
+ value: 0.8822157434
18
  - name: NER Recall
19
  type: recall
20
+ value: 0.925382263
21
  - name: NER F Score
22
  type: f_score
23
+ value: 0.9032835821
24
  ---
25
  | Feature | Description |
26
  | --- | --- |
27
  | **Name** | `en_core_med7_trf` |
28
+ | **Version** | `3.4.2.1` |
29
+ | **spaCy** | `>=3.4.2,<3.5.0` |
30
  | **Default Pipeline** | `transformer`, `ner` |
31
  | **Components** | `transformer`, `ner` |
32
+ | **Vectors** | 514157 keys, 514157 unique vectors (300 dimensions) |
33
  | **Sources** | n/a |
34
  | **License** | `MIT` |
35
+ | **Author** | [Andrey Kormilitzin](https://www.kormilitzin.com/) |
36
 
37
  ### Label Scheme
38
 
 
50
 
51
  | Type | Score |
52
  | --- | --- |
53
+ | `ENTS_F` | 90.33 |
54
+ | `ENTS_P` | 88.22 |
55
+ | `ENTS_R` | 92.54 |
56
+ | `TRANSFORMER_LOSS` | 2502627.06 |
57
+ | `NER_LOSS` | 114576.77 |
config.cfg CHANGED
@@ -1,8 +1,8 @@
1
  [paths]
2
- train = "./data/spacy_format/train_443.spacy"
3
- dev = "./data/spacy_format/dev_443.spacy"
4
- vectors = null
5
- init_tok2vec = null
6
 
7
  [system]
8
  gpu_allocator = "pytorch"
@@ -11,7 +11,7 @@ seed = 0
11
  [nlp]
12
  lang = "en"
13
  pipeline = ["transformer","ner"]
14
- batch_size = 2
15
  disabled = []
16
  before_creation = null
17
  after_creation = null
@@ -24,13 +24,14 @@ tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
24
  factory = "ner"
25
  incorrect_spans_key = null
26
  moves = null
 
27
  update_with_oracle_cut_size = 100
28
 
29
  [components.ner.model]
30
  @architectures = "spacy.TransitionBasedParser.v2"
31
  state_type = "ner"
32
  extra_state_tokens = false
33
- hidden_width = 64
34
  maxout_pieces = 2
35
  use_upper = false
36
  nO = null
@@ -47,17 +48,22 @@ max_batch_items = 4096
47
  set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
48
 
49
  [components.transformer.model]
50
- @architectures = "spacy-transformers.TransformerModel.v1"
51
- name = "roberta-base"
 
52
 
53
  [components.transformer.model.get_spans]
54
  @span_getters = "spacy-transformers.strided_spans.v1"
55
- window = 128
56
  stride = 96
57
 
 
 
58
  [components.transformer.model.tokenizer_config]
59
  use_fast = true
60
 
 
 
61
  [corpora]
62
 
63
  [corpora.dev]
@@ -83,9 +89,9 @@ train_corpus = "corpora.train"
83
  seed = ${system.seed}
84
  gpu_allocator = ${system.gpu_allocator}
85
  dropout = 0.1
86
- patience = 2000
87
  max_epochs = 0
88
- max_steps = 60000
89
  eval_frequency = 200
90
  frozen_components = []
91
  annotating_components = []
@@ -94,13 +100,13 @@ before_to_disk = null
94
  [training.batcher]
95
  @batchers = "spacy.batch_by_padded.v1"
96
  discard_oversize = true
97
- size = 2000
98
  buffer = 256
99
  get_length = null
100
 
101
  [training.logger]
102
  @loggers = "spacy.ConsoleLogger.v1"
103
- progress_bar = false
104
 
105
  [training.optimizer]
106
  @optimizers = "Adam.v1"
@@ -130,10 +136,14 @@ ents_per_type = null
130
  vectors = ${paths.vectors}
131
  init_tok2vec = ${paths.init_tok2vec}
132
  vocab_data = null
133
- lookups = null
134
  before_init = null
135
  after_init = null
136
 
137
  [initialize.components]
138
 
 
 
 
 
 
139
  [initialize.tokenizer]
 
1
  [paths]
2
+ train = "./data/spacy_format/train_med7_v34.spacy"
3
+ dev = "./data/spacy_format/dev_med7_v34.spacy"
4
+ vectors = "en_core_web_lg"
5
+ init_tok2vec = "/mnt/sdf/andrey/projects/med7_v3/output_pretrain_lg/model169.bin"
6
 
7
  [system]
8
  gpu_allocator = "pytorch"
 
11
  [nlp]
12
  lang = "en"
13
  pipeline = ["transformer","ner"]
14
+ batch_size = 64
15
  disabled = []
16
  before_creation = null
17
  after_creation = null
 
24
  factory = "ner"
25
  incorrect_spans_key = null
26
  moves = null
27
+ scorer = {"@scorers":"spacy.ner_scorer.v1"}
28
  update_with_oracle_cut_size = 100
29
 
30
  [components.ner.model]
31
  @architectures = "spacy.TransitionBasedParser.v2"
32
  state_type = "ner"
33
  extra_state_tokens = false
34
+ hidden_width = 128
35
  maxout_pieces = 2
36
  use_upper = false
37
  nO = null
 
48
  set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
49
 
50
  [components.transformer.model]
51
+ @architectures = "spacy-transformers.TransformerModel.v3"
52
+ name = "/mnt/sdf/andrey/projects/med7_v3/RoBERTa-base-PM-M3-hf"
53
+ mixed_precision = false
54
 
55
  [components.transformer.model.get_spans]
56
  @span_getters = "spacy-transformers.strided_spans.v1"
57
+ window = 256
58
  stride = 96
59
 
60
+ [components.transformer.model.grad_scaler_config]
61
+
62
  [components.transformer.model.tokenizer_config]
63
  use_fast = true
64
 
65
+ [components.transformer.model.transformer_config]
66
+
67
  [corpora]
68
 
69
  [corpora.dev]
 
89
  seed = ${system.seed}
90
  gpu_allocator = ${system.gpu_allocator}
91
  dropout = 0.1
92
+ patience = 3600
93
  max_epochs = 0
94
+ max_steps = 20000
95
  eval_frequency = 200
96
  frozen_components = []
97
  annotating_components = []
 
100
  [training.batcher]
101
  @batchers = "spacy.batch_by_padded.v1"
102
  discard_oversize = true
103
+ size = 1000
104
  buffer = 256
105
  get_length = null
106
 
107
  [training.logger]
108
  @loggers = "spacy.ConsoleLogger.v1"
109
+ progress_bar = true
110
 
111
  [training.optimizer]
112
  @optimizers = "Adam.v1"
 
136
  vectors = ${paths.vectors}
137
  init_tok2vec = ${paths.init_tok2vec}
138
  vocab_data = null
 
139
  before_init = null
140
  after_init = null
141
 
142
  [initialize.components]
143
 
144
+ [initialize.lookups]
145
+ @misc = "spacy.LookupsDataLoader.v1"
146
+ lang = ${nlp.lang}
147
+ tables = ["lexeme_norm"]
148
+
149
  [initialize.tokenizer]
en_core_med7_trf-any-py3-none-any.whl CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:796d6aa625f06a7a532c1716943a7f3881299c87749328c8b407963686da19d3
3
- size 445043002
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:a3e0f27b7379e0fa26d9b23b63f7d24154208ff3fdd04796da4e4bc9c01a6e01
3
+ size 1018868436
meta.json CHANGED
@@ -1,19 +1,19 @@
1
  {
2
  "lang":"en",
3
  "name":"core_med7_trf",
4
- "version":"3.1.3.1",
5
  "description":"",
6
  "author":"Andrey Kormilitzin",
7
  "email":"kormilitzin@gmail.com",
8
- "url":"kormilitzin.com",
9
  "license":"MIT",
10
- "spacy_version":">=3.1.4,<3.2.0",
11
- "spacy_git_version":"8bda39f08",
12
  "vectors":{
13
- "width":0,
14
- "vectors":0,
15
- "keys":0,
16
- "name":null
17
  },
18
  "labels":{
19
  "transformer":[
@@ -41,50 +41,50 @@
41
 
42
  ],
43
  "performance":{
44
- "ents_f":0.8976388699,
45
- "ents_p":0.8910716705,
46
- "ents_r":0.9043035886,
47
  "ents_per_type":{
48
  "DRUG":{
49
- "p":0.8898463844,
50
- "r":0.9310074481,
51
- "f":0.9099616858
52
  },
53
- "DOSAGE":{
54
- "p":0.8890554723,
55
- "r":0.8930722892,
56
- "f":0.8910593539
57
  },
58
  "STRENGTH":{
59
- "p":0.9363057325,
60
- "r":0.9625818522,
61
- "f":0.9492619926
62
  },
63
  "FREQUENCY":{
64
- "p":0.7804597701,
65
- "r":0.7786697248,
66
- "f":0.7795637199
67
  },
68
  "FORM":{
69
- "p":0.9034676664,
70
- "r":0.8950789229,
71
- "f":0.8992537313
72
  },
73
- "ROUTE":{
74
- "p":0.949945593,
75
- "r":0.9247881356,
76
- "f":0.9371980676
77
  },
78
  "DURATION":{
79
- "p":0.7111111111,
80
- "r":0.6666666667,
81
- "f":0.688172043
82
  }
83
  },
84
- "transformer_loss":2096.0609702383,
85
- "ner_loss":8748.9384091962
86
  },
87
  "requirements":[
88
- "spacy-transformers>=1.1.2,<1.2.0"
89
  ]
90
  }
 
1
  {
2
  "lang":"en",
3
  "name":"core_med7_trf",
4
+ "version":"3.4.2.1",
5
  "description":"",
6
  "author":"Andrey Kormilitzin",
7
  "email":"kormilitzin@gmail.com",
8
+ "url":"https://www.kormilitzin.com/",
9
  "license":"MIT",
10
+ "spacy_version":">=3.4.2,<3.5.0",
11
+ "spacy_git_version":"Unknown",
12
  "vectors":{
13
+ "width":300,
14
+ "vectors":514157,
15
+ "keys":514157,
16
+ "name":"en_vectors"
17
  },
18
  "labels":{
19
  "transformer":[
 
41
 
42
  ],
43
  "performance":{
44
+ "ents_f":0.9032835821,
45
+ "ents_p":0.8822157434,
46
+ "ents_r":0.925382263,
47
  "ents_per_type":{
48
  "DRUG":{
49
+ "p":0.8804185351,
50
+ "r":0.9349206349,
51
+ "f":0.9068514242
52
  },
53
+ "ROUTE":{
54
+ "p":0.9444444444,
55
+ "r":0.9739583333,
56
+ "f":0.958974359
57
  },
58
  "STRENGTH":{
59
+ "p":0.9190283401,
60
+ "r":0.9380165289,
61
+ "f":0.9284253579
62
  },
63
  "FREQUENCY":{
64
+ "p":0.7512437811,
65
+ "r":0.7947368421,
66
+ "f":0.7723785166
67
  },
68
  "FORM":{
69
+ "p":0.9449541284,
70
+ "r":0.9493087558,
71
+ "f":0.9471264368
72
  },
73
+ "DOSAGE":{
74
+ "p":0.872611465,
75
+ "r":0.958041958,
76
+ "f":0.9133333333
77
  },
78
  "DURATION":{
79
+ "p":0.64,
80
+ "r":0.7619047619,
81
+ "f":0.6956521739
82
  }
83
  },
84
+ "transformer_loss":25026.2706476041,
85
+ "ner_loss":1145.767683525
86
  },
87
  "requirements":[
88
+ "spacy-transformers>=1.1.6,<1.2.0"
89
  ]
90
  }
ner/model CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:efdefbc26ff3a7778078533cf5a62d1b09cf3f8f58830170295ddfad4e3547a1
3
- size 244778
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3cce46a8bbd4b30427fcc65d07f6d1391d90c46b43ff7d555518c660158e0df9
3
+ size 487730
ner/moves CHANGED
@@ -1 +1 @@
1
- ��moves��{"0":{},"1":{"DRUG":25068,"STRENGTH":19845,"DOSAGE":16674,"FREQUENCY":15332,"FORM":12981,"ROUTE":8181,"DURATION":1940},"2":{"DRUG":25068,"STRENGTH":19845,"DOSAGE":16674,"FREQUENCY":15332,"FORM":12981,"ROUTE":8181,"DURATION":1940},"3":{"DRUG":25068,"STRENGTH":19845,"DOSAGE":16674,"FREQUENCY":15332,"FORM":12981,"ROUTE":8181,"DURATION":1940},"4":{"DRUG":25068,"STRENGTH":19845,"DOSAGE":16674,"FREQUENCY":15332,"FORM":12981,"ROUTE":8181,"DURATION":1940,"":1},"5":{"":1}}�cfg��neg_key�
 
1
+ ��moves��{"0":{},"1":{"DRUG":27417,"STRENGTH":21625,"DOSAGE":18350,"FREQUENCY":16642,"FORM":14267,"ROUTE":8996,"DURATION":2140},"2":{"DRUG":27417,"STRENGTH":21625,"DOSAGE":18350,"FREQUENCY":16642,"FORM":14267,"ROUTE":8996,"DURATION":2140},"3":{"DRUG":27417,"STRENGTH":21625,"DOSAGE":18350,"FREQUENCY":16642,"FORM":14267,"ROUTE":8996,"DURATION":2140},"4":{"DRUG":27417,"STRENGTH":21625,"DOSAGE":18350,"FREQUENCY":16642,"FORM":14267,"ROUTE":8996,"DURATION":2140,"":1},"5":{"":1}}�cfg��neg_key�
tokenizer CHANGED
The diff for this file is too large to render. See raw diff
 
vocab/key2row CHANGED
@@ -1 +1,3 @@
1
-
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:31566ae010da3d399eb1d930ae142757afd2601034a4be3bdb00d18881c8c06a
3
+ size 7066303
vocab/lookups.bin CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:76be8b528d0075f7aae98d6fa57a6d3c83ae480a8469e668d7b0af968995ac71
3
- size 1
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:1ddd140ecac6a8c4592e9146d6e30074569ffaed97ee51edc9587dc510f8934c
3
+ size 69982
vocab/strings.json CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:bbbc0cbaea5d8f3124a60adbf573d2826f8294f5d0d10719045d57e792253753
3
- size 833590
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:0b1693baa4ec8d99e20fa93b38ecafa6b4f49f244472d455f35e7605f20345e0
3
+ size 10856674
vocab/vectors CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:14772b683e726436d5948ad3fff2b43d036ef2ebbe3458aafed6004e05a40706
3
- size 128
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:234dcf234bfdf01775ae6182715d55eaacfcde8555b189f25440b56d3c39fd5d
3
+ size 616988528
vocab/vectors.cfg ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ {
2
+ "mode":"default"
3
+ }