2023-10-24 22:47:35,121 ---------------------------------------------------------------------------------------------------- 2023-10-24 22:47:35,122 Model: "SequenceTagger( (embeddings): TransformerWordEmbeddings( (model): BertModel( (embeddings): BertEmbeddings( (word_embeddings): Embedding(64001, 768) (position_embeddings): Embedding(512, 768) (token_type_embeddings): Embedding(2, 768) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) (encoder): BertEncoder( (layer): ModuleList( (0): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (1): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (2): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (3): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (4): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (5): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (6): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (7): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (8): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (9): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (10): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (11): BertLayer( (attention): BertAttention( (self): BertSelfAttention( (query): Linear(in_features=768, out_features=768, bias=True) (key): Linear(in_features=768, out_features=768, bias=True) (value): Linear(in_features=768, out_features=768, bias=True) (dropout): Dropout(p=0.1, inplace=False) ) (output): BertSelfOutput( (dense): Linear(in_features=768, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) (intermediate): BertIntermediate( (dense): Linear(in_features=768, out_features=3072, bias=True) (intermediate_act_fn): GELUActivation() ) (output): BertOutput( (dense): Linear(in_features=3072, out_features=768, bias=True) (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True) (dropout): Dropout(p=0.1, inplace=False) ) ) ) ) (pooler): BertPooler( (dense): Linear(in_features=768, out_features=768, bias=True) (activation): Tanh() ) ) ) (locked_dropout): LockedDropout(p=0.5) (linear): Linear(in_features=768, out_features=13, bias=True) (loss_function): CrossEntropyLoss() )" 2023-10-24 22:47:35,122 ---------------------------------------------------------------------------------------------------- 2023-10-24 22:47:35,122 MultiCorpus: 5777 train + 722 dev + 723 test sentences - NER_ICDAR_EUROPEANA Corpus: 5777 train + 722 dev + 723 test sentences - /home/ubuntu/.flair/datasets/ner_icdar_europeana/nl 2023-10-24 22:47:35,122 ---------------------------------------------------------------------------------------------------- 2023-10-24 22:47:35,122 Train: 5777 sentences 2023-10-24 22:47:35,122 (train_with_dev=False, train_with_test=False) 2023-10-24 22:47:35,122 ---------------------------------------------------------------------------------------------------- 2023-10-24 22:47:35,122 Training Params: 2023-10-24 22:47:35,122 - learning_rate: "5e-05" 2023-10-24 22:47:35,123 - mini_batch_size: "8" 2023-10-24 22:47:35,123 - max_epochs: "10" 2023-10-24 22:47:35,123 - shuffle: "True" 2023-10-24 22:47:35,123 ---------------------------------------------------------------------------------------------------- 2023-10-24 22:47:35,123 Plugins: 2023-10-24 22:47:35,123 - TensorboardLogger 2023-10-24 22:47:35,123 - LinearScheduler | warmup_fraction: '0.1' 2023-10-24 22:47:35,123 ---------------------------------------------------------------------------------------------------- 2023-10-24 22:47:35,123 Final evaluation on model from best epoch (best-model.pt) 2023-10-24 22:47:35,123 - metric: "('micro avg', 'f1-score')" 2023-10-24 22:47:35,123 ---------------------------------------------------------------------------------------------------- 2023-10-24 22:47:35,123 Computation: 2023-10-24 22:47:35,123 - compute on device: cuda:0 2023-10-24 22:47:35,123 - embedding storage: none 2023-10-24 22:47:35,123 ---------------------------------------------------------------------------------------------------- 2023-10-24 22:47:35,123 Model training base path: "hmbench-icdar/nl-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs8-wsFalse-e10-lr5e-05-poolingfirst-layers-1-crfFalse-1" 2023-10-24 22:47:35,123 ---------------------------------------------------------------------------------------------------- 2023-10-24 22:47:35,123 ---------------------------------------------------------------------------------------------------- 2023-10-24 22:47:35,123 Logging anything other than scalars to TensorBoard is currently not supported. 2023-10-24 22:47:43,574 epoch 1 - iter 72/723 - loss 1.90099622 - time (sec): 8.45 - samples/sec: 2093.32 - lr: 0.000005 - momentum: 0.000000 2023-10-24 22:47:52,328 epoch 1 - iter 144/723 - loss 1.09087996 - time (sec): 17.20 - samples/sec: 2047.05 - lr: 0.000010 - momentum: 0.000000 2023-10-24 22:48:01,244 epoch 1 - iter 216/723 - loss 0.77964600 - time (sec): 26.12 - samples/sec: 2072.80 - lr: 0.000015 - momentum: 0.000000 2023-10-24 22:48:09,443 epoch 1 - iter 288/723 - loss 0.63708806 - time (sec): 34.32 - samples/sec: 2055.00 - lr: 0.000020 - momentum: 0.000000 2023-10-24 22:48:17,547 epoch 1 - iter 360/723 - loss 0.54313247 - time (sec): 42.42 - samples/sec: 2054.17 - lr: 0.000025 - momentum: 0.000000 2023-10-24 22:48:25,862 epoch 1 - iter 432/723 - loss 0.48284963 - time (sec): 50.74 - samples/sec: 2053.75 - lr: 0.000030 - momentum: 0.000000 2023-10-24 22:48:34,173 epoch 1 - iter 504/723 - loss 0.43417529 - time (sec): 59.05 - samples/sec: 2046.07 - lr: 0.000035 - momentum: 0.000000 2023-10-24 22:48:43,285 epoch 1 - iter 576/723 - loss 0.39662721 - time (sec): 68.16 - samples/sec: 2037.13 - lr: 0.000040 - momentum: 0.000000 2023-10-24 22:48:51,944 epoch 1 - iter 648/723 - loss 0.36490823 - time (sec): 76.82 - samples/sec: 2044.94 - lr: 0.000045 - momentum: 0.000000 2023-10-24 22:49:01,070 epoch 1 - iter 720/723 - loss 0.33855338 - time (sec): 85.95 - samples/sec: 2044.95 - lr: 0.000050 - momentum: 0.000000 2023-10-24 22:49:01,321 ---------------------------------------------------------------------------------------------------- 2023-10-24 22:49:01,321 EPOCH 1 done: loss 0.3381 - lr: 0.000050 2023-10-24 22:49:04,603 DEV : loss 0.13358977437019348 - f1-score (micro avg) 0.5559 2023-10-24 22:49:04,615 saving best model 2023-10-24 22:49:05,173 ---------------------------------------------------------------------------------------------------- 2023-10-24 22:49:13,532 epoch 2 - iter 72/723 - loss 0.11563641 - time (sec): 8.36 - samples/sec: 2039.66 - lr: 0.000049 - momentum: 0.000000 2023-10-24 22:49:21,466 epoch 2 - iter 144/723 - loss 0.11274613 - time (sec): 16.29 - samples/sec: 2051.20 - lr: 0.000049 - momentum: 0.000000 2023-10-24 22:49:29,822 epoch 2 - iter 216/723 - loss 0.10902633 - time (sec): 24.65 - samples/sec: 2053.51 - lr: 0.000048 - momentum: 0.000000 2023-10-24 22:49:38,964 epoch 2 - iter 288/723 - loss 0.10367207 - time (sec): 33.79 - samples/sec: 2049.90 - lr: 0.000048 - momentum: 0.000000 2023-10-24 22:49:48,274 epoch 2 - iter 360/723 - loss 0.09939488 - time (sec): 43.10 - samples/sec: 2053.73 - lr: 0.000047 - momentum: 0.000000 2023-10-24 22:49:57,605 epoch 2 - iter 432/723 - loss 0.09748051 - time (sec): 52.43 - samples/sec: 2046.48 - lr: 0.000047 - momentum: 0.000000 2023-10-24 22:50:06,005 epoch 2 - iter 504/723 - loss 0.09514922 - time (sec): 60.83 - samples/sec: 2045.92 - lr: 0.000046 - momentum: 0.000000 2023-10-24 22:50:13,666 epoch 2 - iter 576/723 - loss 0.09875432 - time (sec): 68.49 - samples/sec: 2047.89 - lr: 0.000046 - momentum: 0.000000 2023-10-24 22:50:22,101 epoch 2 - iter 648/723 - loss 0.09818627 - time (sec): 76.93 - samples/sec: 2048.27 - lr: 0.000045 - momentum: 0.000000 2023-10-24 22:50:30,670 epoch 2 - iter 720/723 - loss 0.09775475 - time (sec): 85.50 - samples/sec: 2053.69 - lr: 0.000044 - momentum: 0.000000 2023-10-24 22:50:30,916 ---------------------------------------------------------------------------------------------------- 2023-10-24 22:50:30,916 EPOCH 2 done: loss 0.0977 - lr: 0.000044 2023-10-24 22:50:34,642 DEV : loss 0.09923986345529556 - f1-score (micro avg) 0.7434 2023-10-24 22:50:34,654 saving best model 2023-10-24 22:50:35,378 ---------------------------------------------------------------------------------------------------- 2023-10-24 22:50:44,044 epoch 3 - iter 72/723 - loss 0.07128608 - time (sec): 8.66 - samples/sec: 2017.83 - lr: 0.000044 - momentum: 0.000000 2023-10-24 22:50:52,525 epoch 3 - iter 144/723 - loss 0.05959143 - time (sec): 17.15 - samples/sec: 2039.09 - lr: 0.000043 - momentum: 0.000000 2023-10-24 22:51:00,812 epoch 3 - iter 216/723 - loss 0.06657607 - time (sec): 25.43 - samples/sec: 2055.01 - lr: 0.000043 - momentum: 0.000000 2023-10-24 22:51:09,580 epoch 3 - iter 288/723 - loss 0.06577637 - time (sec): 34.20 - samples/sec: 2060.35 - lr: 0.000042 - momentum: 0.000000 2023-10-24 22:51:18,393 epoch 3 - iter 360/723 - loss 0.06437789 - time (sec): 43.01 - samples/sec: 2050.83 - lr: 0.000042 - momentum: 0.000000 2023-10-24 22:51:27,526 epoch 3 - iter 432/723 - loss 0.06488597 - time (sec): 52.15 - samples/sec: 2052.47 - lr: 0.000041 - momentum: 0.000000 2023-10-24 22:51:35,847 epoch 3 - iter 504/723 - loss 0.06625285 - time (sec): 60.47 - samples/sec: 2041.55 - lr: 0.000041 - momentum: 0.000000 2023-10-24 22:51:44,188 epoch 3 - iter 576/723 - loss 0.06508266 - time (sec): 68.81 - samples/sec: 2036.26 - lr: 0.000040 - momentum: 0.000000 2023-10-24 22:51:52,876 epoch 3 - iter 648/723 - loss 0.06525563 - time (sec): 77.50 - samples/sec: 2036.55 - lr: 0.000039 - momentum: 0.000000 2023-10-24 22:52:01,622 epoch 3 - iter 720/723 - loss 0.06417277 - time (sec): 86.24 - samples/sec: 2039.48 - lr: 0.000039 - momentum: 0.000000 2023-10-24 22:52:01,826 ---------------------------------------------------------------------------------------------------- 2023-10-24 22:52:01,827 EPOCH 3 done: loss 0.0642 - lr: 0.000039 2023-10-24 22:52:05,562 DEV : loss 0.08431313186883926 - f1-score (micro avg) 0.8162 2023-10-24 22:52:05,574 saving best model 2023-10-24 22:52:06,287 ---------------------------------------------------------------------------------------------------- 2023-10-24 22:52:14,602 epoch 4 - iter 72/723 - loss 0.04275318 - time (sec): 8.31 - samples/sec: 2104.71 - lr: 0.000038 - momentum: 0.000000 2023-10-24 22:52:23,182 epoch 4 - iter 144/723 - loss 0.04122534 - time (sec): 16.89 - samples/sec: 2056.08 - lr: 0.000038 - momentum: 0.000000 2023-10-24 22:52:30,986 epoch 4 - iter 216/723 - loss 0.04387147 - time (sec): 24.70 - samples/sec: 2053.76 - lr: 0.000037 - momentum: 0.000000 2023-10-24 22:52:39,468 epoch 4 - iter 288/723 - loss 0.04477684 - time (sec): 33.18 - samples/sec: 2026.76 - lr: 0.000037 - momentum: 0.000000 2023-10-24 22:52:48,425 epoch 4 - iter 360/723 - loss 0.04492124 - time (sec): 42.14 - samples/sec: 2037.30 - lr: 0.000036 - momentum: 0.000000 2023-10-24 22:52:57,346 epoch 4 - iter 432/723 - loss 0.04583451 - time (sec): 51.06 - samples/sec: 2038.10 - lr: 0.000036 - momentum: 0.000000 2023-10-24 22:53:06,429 epoch 4 - iter 504/723 - loss 0.04611188 - time (sec): 60.14 - samples/sec: 2037.05 - lr: 0.000035 - momentum: 0.000000 2023-10-24 22:53:15,120 epoch 4 - iter 576/723 - loss 0.04469752 - time (sec): 68.83 - samples/sec: 2040.59 - lr: 0.000034 - momentum: 0.000000 2023-10-24 22:53:23,881 epoch 4 - iter 648/723 - loss 0.04421455 - time (sec): 77.59 - samples/sec: 2036.88 - lr: 0.000034 - momentum: 0.000000 2023-10-24 22:53:32,384 epoch 4 - iter 720/723 - loss 0.04381095 - time (sec): 86.10 - samples/sec: 2041.83 - lr: 0.000033 - momentum: 0.000000 2023-10-24 22:53:32,612 ---------------------------------------------------------------------------------------------------- 2023-10-24 22:53:32,612 EPOCH 4 done: loss 0.0440 - lr: 0.000033 2023-10-24 22:53:36,048 DEV : loss 0.0921085774898529 - f1-score (micro avg) 0.8061 2023-10-24 22:53:36,059 ---------------------------------------------------------------------------------------------------- 2023-10-24 22:53:45,135 epoch 5 - iter 72/723 - loss 0.02981632 - time (sec): 9.07 - samples/sec: 2016.43 - lr: 0.000033 - momentum: 0.000000 2023-10-24 22:53:54,262 epoch 5 - iter 144/723 - loss 0.03293464 - time (sec): 18.20 - samples/sec: 1965.96 - lr: 0.000032 - momentum: 0.000000 2023-10-24 22:54:02,987 epoch 5 - iter 216/723 - loss 0.02903229 - time (sec): 26.93 - samples/sec: 1981.65 - lr: 0.000032 - momentum: 0.000000 2023-10-24 22:54:12,480 epoch 5 - iter 288/723 - loss 0.02921469 - time (sec): 36.42 - samples/sec: 1984.35 - lr: 0.000031 - momentum: 0.000000 2023-10-24 22:54:20,920 epoch 5 - iter 360/723 - loss 0.03035987 - time (sec): 44.86 - samples/sec: 1994.23 - lr: 0.000031 - momentum: 0.000000 2023-10-24 22:54:29,626 epoch 5 - iter 432/723 - loss 0.03109785 - time (sec): 53.57 - samples/sec: 2009.90 - lr: 0.000030 - momentum: 0.000000 2023-10-24 22:54:37,390 epoch 5 - iter 504/723 - loss 0.03348098 - time (sec): 61.33 - samples/sec: 2014.44 - lr: 0.000029 - momentum: 0.000000 2023-10-24 22:54:46,312 epoch 5 - iter 576/723 - loss 0.03272635 - time (sec): 70.25 - samples/sec: 2013.74 - lr: 0.000029 - momentum: 0.000000 2023-10-24 22:54:54,679 epoch 5 - iter 648/723 - loss 0.03250020 - time (sec): 78.62 - samples/sec: 2009.22 - lr: 0.000028 - momentum: 0.000000 2023-10-24 22:55:03,105 epoch 5 - iter 720/723 - loss 0.03217828 - time (sec): 87.04 - samples/sec: 2015.60 - lr: 0.000028 - momentum: 0.000000 2023-10-24 22:55:03,508 ---------------------------------------------------------------------------------------------------- 2023-10-24 22:55:03,508 EPOCH 5 done: loss 0.0323 - lr: 0.000028 2023-10-24 22:55:06,952 DEV : loss 0.12793748080730438 - f1-score (micro avg) 0.8201 2023-10-24 22:55:06,964 saving best model 2023-10-24 22:55:07,671 ---------------------------------------------------------------------------------------------------- 2023-10-24 22:55:16,444 epoch 6 - iter 72/723 - loss 0.01771827 - time (sec): 8.77 - samples/sec: 1953.71 - lr: 0.000027 - momentum: 0.000000 2023-10-24 22:55:24,853 epoch 6 - iter 144/723 - loss 0.02179612 - time (sec): 17.18 - samples/sec: 1999.81 - lr: 0.000027 - momentum: 0.000000 2023-10-24 22:55:34,164 epoch 6 - iter 216/723 - loss 0.02091712 - time (sec): 26.49 - samples/sec: 2012.54 - lr: 0.000026 - momentum: 0.000000 2023-10-24 22:55:42,847 epoch 6 - iter 288/723 - loss 0.02115975 - time (sec): 35.18 - samples/sec: 1995.33 - lr: 0.000026 - momentum: 0.000000 2023-10-24 22:55:51,268 epoch 6 - iter 360/723 - loss 0.02279607 - time (sec): 43.60 - samples/sec: 2003.54 - lr: 0.000025 - momentum: 0.000000 2023-10-24 22:55:59,917 epoch 6 - iter 432/723 - loss 0.02299399 - time (sec): 52.25 - samples/sec: 2015.37 - lr: 0.000024 - momentum: 0.000000 2023-10-24 22:56:08,369 epoch 6 - iter 504/723 - loss 0.02253209 - time (sec): 60.70 - samples/sec: 2032.10 - lr: 0.000024 - momentum: 0.000000 2023-10-24 22:56:16,964 epoch 6 - iter 576/723 - loss 0.02318425 - time (sec): 69.29 - samples/sec: 2032.54 - lr: 0.000023 - momentum: 0.000000 2023-10-24 22:56:25,279 epoch 6 - iter 648/723 - loss 0.02365793 - time (sec): 77.61 - samples/sec: 2042.94 - lr: 0.000023 - momentum: 0.000000 2023-10-24 22:56:33,600 epoch 6 - iter 720/723 - loss 0.02442479 - time (sec): 85.93 - samples/sec: 2044.40 - lr: 0.000022 - momentum: 0.000000 2023-10-24 22:56:33,868 ---------------------------------------------------------------------------------------------------- 2023-10-24 22:56:33,869 EPOCH 6 done: loss 0.0244 - lr: 0.000022 2023-10-24 22:56:37,590 DEV : loss 0.13028167188167572 - f1-score (micro avg) 0.8206 2023-10-24 22:56:37,602 saving best model 2023-10-24 22:56:38,305 ---------------------------------------------------------------------------------------------------- 2023-10-24 22:56:46,715 epoch 7 - iter 72/723 - loss 0.01204270 - time (sec): 8.41 - samples/sec: 2127.69 - lr: 0.000022 - momentum: 0.000000 2023-10-24 22:56:55,800 epoch 7 - iter 144/723 - loss 0.01568493 - time (sec): 17.49 - samples/sec: 2021.54 - lr: 0.000021 - momentum: 0.000000 2023-10-24 22:57:04,172 epoch 7 - iter 216/723 - loss 0.01685436 - time (sec): 25.87 - samples/sec: 2034.94 - lr: 0.000021 - momentum: 0.000000 2023-10-24 22:57:12,899 epoch 7 - iter 288/723 - loss 0.01586004 - time (sec): 34.59 - samples/sec: 2049.11 - lr: 0.000020 - momentum: 0.000000 2023-10-24 22:57:22,002 epoch 7 - iter 360/723 - loss 0.01669915 - time (sec): 43.70 - samples/sec: 2039.42 - lr: 0.000019 - momentum: 0.000000 2023-10-24 22:57:30,281 epoch 7 - iter 432/723 - loss 0.01664053 - time (sec): 51.98 - samples/sec: 2027.06 - lr: 0.000019 - momentum: 0.000000 2023-10-24 22:57:38,651 epoch 7 - iter 504/723 - loss 0.01665777 - time (sec): 60.35 - samples/sec: 2027.21 - lr: 0.000018 - momentum: 0.000000 2023-10-24 22:57:47,215 epoch 7 - iter 576/723 - loss 0.01672193 - time (sec): 68.91 - samples/sec: 2029.42 - lr: 0.000018 - momentum: 0.000000 2023-10-24 22:57:56,065 epoch 7 - iter 648/723 - loss 0.01634593 - time (sec): 77.76 - samples/sec: 2032.35 - lr: 0.000017 - momentum: 0.000000 2023-10-24 22:58:04,677 epoch 7 - iter 720/723 - loss 0.01622942 - time (sec): 86.37 - samples/sec: 2032.46 - lr: 0.000017 - momentum: 0.000000 2023-10-24 22:58:05,043 ---------------------------------------------------------------------------------------------------- 2023-10-24 22:58:05,043 EPOCH 7 done: loss 0.0162 - lr: 0.000017 2023-10-24 22:58:08,477 DEV : loss 0.16047385334968567 - f1-score (micro avg) 0.8284 2023-10-24 22:58:08,489 saving best model 2023-10-24 22:58:09,187 ---------------------------------------------------------------------------------------------------- 2023-10-24 22:58:18,118 epoch 8 - iter 72/723 - loss 0.00701010 - time (sec): 8.93 - samples/sec: 1976.12 - lr: 0.000016 - momentum: 0.000000 2023-10-24 22:58:27,226 epoch 8 - iter 144/723 - loss 0.00950195 - time (sec): 18.04 - samples/sec: 1966.17 - lr: 0.000016 - momentum: 0.000000 2023-10-24 22:58:35,453 epoch 8 - iter 216/723 - loss 0.01036607 - time (sec): 26.26 - samples/sec: 2020.43 - lr: 0.000015 - momentum: 0.000000 2023-10-24 22:58:44,819 epoch 8 - iter 288/723 - loss 0.01038046 - time (sec): 35.63 - samples/sec: 2054.12 - lr: 0.000014 - momentum: 0.000000 2023-10-24 22:58:53,155 epoch 8 - iter 360/723 - loss 0.01050105 - time (sec): 43.97 - samples/sec: 2051.11 - lr: 0.000014 - momentum: 0.000000 2023-10-24 22:59:01,634 epoch 8 - iter 432/723 - loss 0.01077764 - time (sec): 52.45 - samples/sec: 2053.71 - lr: 0.000013 - momentum: 0.000000 2023-10-24 22:59:10,359 epoch 8 - iter 504/723 - loss 0.01155176 - time (sec): 61.17 - samples/sec: 2044.08 - lr: 0.000013 - momentum: 0.000000 2023-10-24 22:59:18,085 epoch 8 - iter 576/723 - loss 0.01170645 - time (sec): 68.90 - samples/sec: 2035.43 - lr: 0.000012 - momentum: 0.000000 2023-10-24 22:59:26,361 epoch 8 - iter 648/723 - loss 0.01132420 - time (sec): 77.17 - samples/sec: 2036.57 - lr: 0.000012 - momentum: 0.000000 2023-10-24 22:59:35,154 epoch 8 - iter 720/723 - loss 0.01110679 - time (sec): 85.97 - samples/sec: 2041.54 - lr: 0.000011 - momentum: 0.000000 2023-10-24 22:59:35,626 ---------------------------------------------------------------------------------------------------- 2023-10-24 22:59:35,626 EPOCH 8 done: loss 0.0111 - lr: 0.000011 2023-10-24 22:59:39,060 DEV : loss 0.17271144688129425 - f1-score (micro avg) 0.8152 2023-10-24 22:59:39,072 ---------------------------------------------------------------------------------------------------- 2023-10-24 22:59:48,021 epoch 9 - iter 72/723 - loss 0.00436475 - time (sec): 8.95 - samples/sec: 2093.86 - lr: 0.000011 - momentum: 0.000000 2023-10-24 22:59:55,966 epoch 9 - iter 144/723 - loss 0.00673411 - time (sec): 16.89 - samples/sec: 2075.39 - lr: 0.000010 - momentum: 0.000000 2023-10-24 23:00:05,069 epoch 9 - iter 216/723 - loss 0.00655734 - time (sec): 26.00 - samples/sec: 2059.97 - lr: 0.000009 - momentum: 0.000000 2023-10-24 23:00:13,711 epoch 9 - iter 288/723 - loss 0.00707595 - time (sec): 34.64 - samples/sec: 2050.59 - lr: 0.000009 - momentum: 0.000000 2023-10-24 23:00:22,414 epoch 9 - iter 360/723 - loss 0.00702570 - time (sec): 43.34 - samples/sec: 2037.79 - lr: 0.000008 - momentum: 0.000000 2023-10-24 23:00:30,933 epoch 9 - iter 432/723 - loss 0.00653348 - time (sec): 51.86 - samples/sec: 2047.10 - lr: 0.000008 - momentum: 0.000000 2023-10-24 23:00:39,615 epoch 9 - iter 504/723 - loss 0.00730415 - time (sec): 60.54 - samples/sec: 2047.35 - lr: 0.000007 - momentum: 0.000000 2023-10-24 23:00:47,843 epoch 9 - iter 576/723 - loss 0.00716059 - time (sec): 68.77 - samples/sec: 2051.75 - lr: 0.000007 - momentum: 0.000000 2023-10-24 23:00:56,415 epoch 9 - iter 648/723 - loss 0.00716027 - time (sec): 77.34 - samples/sec: 2049.01 - lr: 0.000006 - momentum: 0.000000 2023-10-24 23:01:05,125 epoch 9 - iter 720/723 - loss 0.00769558 - time (sec): 86.05 - samples/sec: 2043.30 - lr: 0.000006 - momentum: 0.000000 2023-10-24 23:01:05,342 ---------------------------------------------------------------------------------------------------- 2023-10-24 23:01:05,342 EPOCH 9 done: loss 0.0077 - lr: 0.000006 2023-10-24 23:01:09,068 DEV : loss 0.18762636184692383 - f1-score (micro avg) 0.8138 2023-10-24 23:01:09,080 ---------------------------------------------------------------------------------------------------- 2023-10-24 23:01:17,501 epoch 10 - iter 72/723 - loss 0.00700037 - time (sec): 8.42 - samples/sec: 2072.79 - lr: 0.000005 - momentum: 0.000000 2023-10-24 23:01:25,989 epoch 10 - iter 144/723 - loss 0.00538041 - time (sec): 16.91 - samples/sec: 2100.23 - lr: 0.000004 - momentum: 0.000000 2023-10-24 23:01:34,941 epoch 10 - iter 216/723 - loss 0.00524047 - time (sec): 25.86 - samples/sec: 2105.37 - lr: 0.000004 - momentum: 0.000000 2023-10-24 23:01:44,291 epoch 10 - iter 288/723 - loss 0.00587406 - time (sec): 35.21 - samples/sec: 2067.64 - lr: 0.000003 - momentum: 0.000000 2023-10-24 23:01:52,710 epoch 10 - iter 360/723 - loss 0.00546117 - time (sec): 43.63 - samples/sec: 2052.35 - lr: 0.000003 - momentum: 0.000000 2023-10-24 23:02:01,634 epoch 10 - iter 432/723 - loss 0.00525314 - time (sec): 52.55 - samples/sec: 2031.85 - lr: 0.000002 - momentum: 0.000000 2023-10-24 23:02:10,260 epoch 10 - iter 504/723 - loss 0.00559956 - time (sec): 61.18 - samples/sec: 2028.96 - lr: 0.000002 - momentum: 0.000000 2023-10-24 23:02:18,569 epoch 10 - iter 576/723 - loss 0.00567395 - time (sec): 69.49 - samples/sec: 2036.85 - lr: 0.000001 - momentum: 0.000000 2023-10-24 23:02:27,420 epoch 10 - iter 648/723 - loss 0.00543119 - time (sec): 78.34 - samples/sec: 2025.44 - lr: 0.000001 - momentum: 0.000000 2023-10-24 23:02:35,701 epoch 10 - iter 720/723 - loss 0.00550425 - time (sec): 86.62 - samples/sec: 2029.78 - lr: 0.000000 - momentum: 0.000000 2023-10-24 23:02:35,912 ---------------------------------------------------------------------------------------------------- 2023-10-24 23:02:35,913 EPOCH 10 done: loss 0.0055 - lr: 0.000000 2023-10-24 23:02:39,645 DEV : loss 0.19829346239566803 - f1-score (micro avg) 0.8156 2023-10-24 23:02:40,213 ---------------------------------------------------------------------------------------------------- 2023-10-24 23:02:40,214 Loading model from best epoch ... 2023-10-24 23:02:42,032 SequenceTagger predicts: Dictionary with 13 tags: O, S-LOC, B-LOC, E-LOC, I-LOC, S-PER, B-PER, E-PER, I-PER, S-ORG, B-ORG, E-ORG, I-ORG 2023-10-24 23:02:45,283 Results: - F-score (micro) 0.8006 - F-score (macro) 0.6747 - Accuracy 0.6799 By class: precision recall f1-score support PER 0.8527 0.7925 0.8215 482 LOC 0.8801 0.8013 0.8389 458 ORG 0.4231 0.3188 0.3636 69 micro avg 0.8408 0.7641 0.8006 1009 macro avg 0.7186 0.6376 0.6747 1009 weighted avg 0.8357 0.7641 0.7981 1009 2023-10-24 23:02:45,283 ----------------------------------------------------------------------------------------------------