2023-10-25 17:53:07,735 ----------------------------------------------------------------------------------------------------
2023-10-25 17:53:07,736 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(64001, 768)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
            (intermediate): BertIntermediate(
              (dense): Linear(in_features=768, out_features=3072, bias=True)
              (intermediate_act_fn): GELUActivation()
            )
            (output): BertOutput(
              (dense): Linear(in_features=3072, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
        )
      )
      (pooler): BertPooler(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (activation): Tanh()
      )
    )
  )
  (locked_dropout): LockedDropout(p=0.5)
  (linear): Linear(in_features=768, out_features=13, bias=True)
  (loss_function): CrossEntropyLoss()
)"
2023-10-25 17:53:07,736 ----------------------------------------------------------------------------------------------------
2023-10-25 17:53:07,736 MultiCorpus: 14465 train + 1392 dev + 2432 test sentences
 - NER_HIPE_2022 Corpus: 14465 train + 1392 dev + 2432 test sentences - /home/ubuntu/.flair/datasets/ner_hipe_2022/v2.1/letemps/fr/with_doc_seperator
2023-10-25 17:53:07,736 ----------------------------------------------------------------------------------------------------
2023-10-25 17:53:07,736 Train:  14465 sentences
2023-10-25 17:53:07,736         (train_with_dev=False, train_with_test=False)
2023-10-25 17:53:07,736 ----------------------------------------------------------------------------------------------------
2023-10-25 17:53:07,736 Training Params:
2023-10-25 17:53:07,736  - learning_rate: "3e-05"
2023-10-25 17:53:07,736  - mini_batch_size: "4"
2023-10-25 17:53:07,736  - max_epochs: "10"
2023-10-25 17:53:07,736  - shuffle: "True"
2023-10-25 17:53:07,736 ----------------------------------------------------------------------------------------------------
2023-10-25 17:53:07,737 Plugins:
2023-10-25 17:53:07,737  - TensorboardLogger
2023-10-25 17:53:07,737  - LinearScheduler | warmup_fraction: '0.1'
2023-10-25 17:53:07,737 ----------------------------------------------------------------------------------------------------
2023-10-25 17:53:07,737 Final evaluation on model from best epoch (best-model.pt)
2023-10-25 17:53:07,737  - metric: "('micro avg', 'f1-score')"
2023-10-25 17:53:07,737 ----------------------------------------------------------------------------------------------------
2023-10-25 17:53:07,737 Computation:
2023-10-25 17:53:07,737  - compute on device: cuda:0
2023-10-25 17:53:07,737  - embedding storage: none
2023-10-25 17:53:07,737 ----------------------------------------------------------------------------------------------------
2023-10-25 17:53:07,737 Model training base path: "hmbench-letemps/fr-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs4-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-5"
2023-10-25 17:53:07,737 ----------------------------------------------------------------------------------------------------
2023-10-25 17:53:07,737 ----------------------------------------------------------------------------------------------------
2023-10-25 17:53:07,737 Logging anything other than scalars to TensorBoard is currently not supported.
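The header above fully determines the training setup. Below is a minimal sketch of how this run could be reproduced with Flair's Python API; the hyperparameters are read off the Training Params and Plugins blocks and the base path (bs4, e10, lr3e-05, poolingfirst, layers-1, crfFalse), but the exact NER_HIPE_2022 constructor arguments and SequenceTagger flags are assumptions, not the original script.

```python
from flair.datasets import NER_HIPE_2022
from flair.embeddings import TransformerWordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# HIPE-2022 "letemps" corpus (French), as listed in the MultiCorpus entry above.
corpus = NER_HIPE_2022(dataset_name="letemps", language="fr")
label_dict = corpus.make_label_dictionary(label_type="ner")

# hmBERT 64k embeddings: last transformer layer only ("-1"), first-subtoken
# pooling, fine-tuned end to end (cf. "poolingfirst-layers-1" in the base path).
embeddings = TransformerWordEmbeddings(
    model="dbmdz/bert-base-historic-multilingual-64k-td-cased",
    layers="-1",
    subtoken_pooling="first",
    fine_tune=True,
)

# No RNN and no CRF ("crfFalse"), which yields the LockedDropout(0.5) +
# Linear(768 -> 13) head shown in the model printout above.
tagger = SequenceTagger(
    embeddings=embeddings,
    tag_dictionary=label_dict,
    tag_type="ner",
    use_rnn=False,
    use_crf=False,
    reproject_embeddings=False,
)

# fine_tune() defaults to AdamW with a linear warmup schedule, consistent with
# the LinearScheduler plugin (warmup_fraction 0.1) and the zero-momentum
# entries in the iteration logs below.
trainer = ModelTrainer(tagger, corpus)
trainer.fine_tune(
    "hmbench-letemps/fr-dbmdz/bert-base-historic-multilingual-64k-td-cased"
    "-bs4-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-5",
    learning_rate=3e-5,
    mini_batch_size=4,
    max_epochs=10,
)
```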
2023-10-25 17:53:30,177 epoch 1 - iter 361/3617 - loss 1.04448431 - time (sec): 22.44 - samples/sec: 1725.27 - lr: 0.000003 - momentum: 0.000000
2023-10-25 17:53:52,736 epoch 1 - iter 722/3617 - loss 0.62986808 - time (sec): 45.00 - samples/sec: 1706.63 - lr: 0.000006 - momentum: 0.000000
2023-10-25 17:54:15,328 epoch 1 - iter 1083/3617 - loss 0.47556906 - time (sec): 67.59 - samples/sec: 1695.64 - lr: 0.000009 - momentum: 0.000000
2023-10-25 17:54:37,780 epoch 1 - iter 1444/3617 - loss 0.39046577 - time (sec): 90.04 - samples/sec: 1697.01 - lr: 0.000012 - momentum: 0.000000
2023-10-25 17:55:00,290 epoch 1 - iter 1805/3617 - loss 0.33554242 - time (sec): 112.55 - samples/sec: 1688.24 - lr: 0.000015 - momentum: 0.000000
2023-10-25 17:55:22,907 epoch 1 - iter 2166/3617 - loss 0.30184490 - time (sec): 135.17 - samples/sec: 1681.32 - lr: 0.000018 - momentum: 0.000000
2023-10-25 17:55:45,800 epoch 1 - iter 2527/3617 - loss 0.27353951 - time (sec): 158.06 - samples/sec: 1683.62 - lr: 0.000021 - momentum: 0.000000
2023-10-25 17:56:08,540 epoch 1 - iter 2888/3617 - loss 0.25459014 - time (sec): 180.80 - samples/sec: 1674.95 - lr: 0.000024 - momentum: 0.000000
2023-10-25 17:56:31,402 epoch 1 - iter 3249/3617 - loss 0.23801927 - time (sec): 203.66 - samples/sec: 1673.95 - lr: 0.000027 - momentum: 0.000000
2023-10-25 17:56:54,160 epoch 1 - iter 3610/3617 - loss 0.22532715 - time (sec): 226.42 - samples/sec: 1675.31 - lr: 0.000030 - momentum: 0.000000
2023-10-25 17:56:54,573 ----------------------------------------------------------------------------------------------------
2023-10-25 17:56:54,573 EPOCH 1 done: loss 0.2251 - lr: 0.000030
2023-10-25 17:56:59,104 DEV : loss 0.14202608168125153 - f1-score (micro avg) 0.6083
2023-10-25 17:56:59,127 saving best model
2023-10-25 17:56:59,676 ----------------------------------------------------------------------------------------------------
2023-10-25 17:57:22,365 epoch 2 - iter 361/3617 - loss 0.09325387 - time (sec): 22.69 - samples/sec: 1653.93 - lr: 0.000030 - momentum: 0.000000
2023-10-25 17:57:44,957 epoch 2 - iter 722/3617 - loss 0.09875007 - time (sec): 45.28 - samples/sec: 1653.86 - lr: 0.000029 - momentum: 0.000000
2023-10-25 17:58:07,620 epoch 2 - iter 1083/3617 - loss 0.09947355 - time (sec): 67.94 - samples/sec: 1662.40 - lr: 0.000029 - momentum: 0.000000
2023-10-25 17:58:30,191 epoch 2 - iter 1444/3617 - loss 0.10125065 - time (sec): 90.51 - samples/sec: 1665.36 - lr: 0.000029 - momentum: 0.000000
2023-10-25 17:58:53,134 epoch 2 - iter 1805/3617 - loss 0.10017178 - time (sec): 113.46 - samples/sec: 1680.24 - lr: 0.000028 - momentum: 0.000000
2023-10-25 17:59:15,812 epoch 2 - iter 2166/3617 - loss 0.10001999 - time (sec): 136.13 - samples/sec: 1678.91 - lr: 0.000028 - momentum: 0.000000
2023-10-25 17:59:38,412 epoch 2 - iter 2527/3617 - loss 0.09792164 - time (sec): 158.73 - samples/sec: 1676.16 - lr: 0.000028 - momentum: 0.000000
2023-10-25 18:00:01,213 epoch 2 - iter 2888/3617 - loss 0.09669901 - time (sec): 181.54 - samples/sec: 1675.51 - lr: 0.000027 - momentum: 0.000000
2023-10-25 18:00:24,017 epoch 2 - iter 3249/3617 - loss 0.09713692 - time (sec): 204.34 - samples/sec: 1674.30 - lr: 0.000027 - momentum: 0.000000
2023-10-25 18:00:46,631 epoch 2 - iter 3610/3617 - loss 0.09773196 - time (sec): 226.95 - samples/sec: 1671.96 - lr: 0.000027 - momentum: 0.000000
2023-10-25 18:00:47,034 ----------------------------------------------------------------------------------------------------
2023-10-25 18:00:47,034 EPOCH 2 done: loss 0.0977 - lr: 0.000027
2023-10-25 18:00:51,792 DEV : loss 0.14715531468391418 - f1-score (micro avg) 0.6016
2023-10-25 18:00:51,815 ----------------------------------------------------------------------------------------------------
2023-10-25 18:01:15,000 epoch 3 - iter 361/3617 - loss 0.06760835 - time (sec): 23.18 - samples/sec: 1667.13 - lr: 0.000026 - momentum: 0.000000
2023-10-25 18:01:38,218 epoch 3 - iter 722/3617 - loss 0.06991201 - time (sec): 46.40 - samples/sec: 1723.47 - lr: 0.000026 - momentum: 0.000000
2023-10-25 18:02:00,928 epoch 3 - iter 1083/3617 - loss 0.06632920 - time (sec): 69.11 - samples/sec: 1713.65 - lr: 0.000026 - momentum: 0.000000
2023-10-25 18:02:23,506 epoch 3 - iter 1444/3617 - loss 0.06914393 - time (sec): 91.69 - samples/sec: 1705.05 - lr: 0.000025 - momentum: 0.000000
2023-10-25 18:02:45,976 epoch 3 - iter 1805/3617 - loss 0.07123450 - time (sec): 114.16 - samples/sec: 1692.95 - lr: 0.000025 - momentum: 0.000000
2023-10-25 18:03:08,412 epoch 3 - iter 2166/3617 - loss 0.07224670 - time (sec): 136.60 - samples/sec: 1684.65 - lr: 0.000025 - momentum: 0.000000
2023-10-25 18:03:30,979 epoch 3 - iter 2527/3617 - loss 0.07309131 - time (sec): 159.16 - samples/sec: 1683.48 - lr: 0.000024 - momentum: 0.000000
2023-10-25 18:03:53,575 epoch 3 - iter 2888/3617 - loss 0.07511697 - time (sec): 181.76 - samples/sec: 1677.20 - lr: 0.000024 - momentum: 0.000000
2023-10-25 18:04:16,265 epoch 3 - iter 3249/3617 - loss 0.07486815 - time (sec): 204.45 - samples/sec: 1674.48 - lr: 0.000024 - momentum: 0.000000
2023-10-25 18:04:38,616 epoch 3 - iter 3610/3617 - loss 0.07515962 - time (sec): 226.80 - samples/sec: 1672.69 - lr: 0.000023 - momentum: 0.000000
2023-10-25 18:04:39,041 ----------------------------------------------------------------------------------------------------
2023-10-25 18:04:39,041 EPOCH 3 done: loss 0.0751 - lr: 0.000023
2023-10-25 18:04:43,797 DEV : loss 0.2039371132850647 - f1-score (micro avg) 0.6501
2023-10-25 18:04:43,819 saving best model
2023-10-25 18:04:44,590 ----------------------------------------------------------------------------------------------------
2023-10-25 18:05:07,373 epoch 4 - iter 361/3617 - loss 0.05450055 - time (sec): 22.78 - samples/sec: 1685.26 - lr: 0.000023 - momentum: 0.000000
2023-10-25 18:05:29,930 epoch 4 - iter 722/3617 - loss 0.04868394 - time (sec): 45.34 - samples/sec: 1680.30 - lr: 0.000023 - momentum: 0.000000
2023-10-25 18:05:52,692 epoch 4 - iter 1083/3617 - loss 0.04724397 - time (sec): 68.10 - samples/sec: 1692.84 - lr: 0.000022 - momentum: 0.000000
2023-10-25 18:06:15,338 epoch 4 - iter 1444/3617 - loss 0.04713960 - time (sec): 90.75 - samples/sec: 1698.77 - lr: 0.000022 - momentum: 0.000000
2023-10-25 18:06:37,961 epoch 4 - iter 1805/3617 - loss 0.04948343 - time (sec): 113.37 - samples/sec: 1696.17 - lr: 0.000022 - momentum: 0.000000
2023-10-25 18:07:00,499 epoch 4 - iter 2166/3617 - loss 0.04977775 - time (sec): 135.91 - samples/sec: 1685.63 - lr: 0.000021 - momentum: 0.000000
2023-10-25 18:07:22,960 epoch 4 - iter 2527/3617 - loss 0.04973117 - time (sec): 158.37 - samples/sec: 1680.12 - lr: 0.000021 - momentum: 0.000000
2023-10-25 18:07:45,582 epoch 4 - iter 2888/3617 - loss 0.05015542 - time (sec): 180.99 - samples/sec: 1677.24 - lr: 0.000021 - momentum: 0.000000
2023-10-25 18:08:08,593 epoch 4 - iter 3249/3617 - loss 0.05042575 - time (sec): 204.00 - samples/sec: 1670.53 - lr: 0.000020 - momentum: 0.000000
2023-10-25 18:08:31,332 epoch 4 - iter 3610/3617 - loss 0.05053351 - time (sec): 226.74 - samples/sec: 1673.42 - lr: 0.000020 - momentum: 0.000000
2023-10-25 18:08:31,747 ----------------------------------------------------------------------------------------------------
2023-10-25 18:08:31,748 EPOCH 4 done: loss 0.0507 - lr: 0.000020
2023-10-25 18:08:36,510 DEV : loss 0.24384552240371704 - f1-score (micro avg) 0.6139
2023-10-25 18:08:36,532 ----------------------------------------------------------------------------------------------------
2023-10-25 18:08:59,252 epoch 5 - iter 361/3617 - loss 0.03078975 - time (sec): 22.72 - samples/sec: 1723.25 - lr: 0.000020 - momentum: 0.000000
2023-10-25 18:09:22,026 epoch 5 - iter 722/3617 - loss 0.03379188 - time (sec): 45.49 - samples/sec: 1704.90 - lr: 0.000019 - momentum: 0.000000
2023-10-25 18:09:44,670 epoch 5 - iter 1083/3617 - loss 0.03494238 - time (sec): 68.14 - samples/sec: 1697.74 - lr: 0.000019 - momentum: 0.000000
2023-10-25 18:10:06,931 epoch 5 - iter 1444/3617 - loss 0.03589940 - time (sec): 90.40 - samples/sec: 1670.73 - lr: 0.000019 - momentum: 0.000000
2023-10-25 18:10:29,820 epoch 5 - iter 1805/3617 - loss 0.03741139 - time (sec): 113.29 - samples/sec: 1679.50 - lr: 0.000018 - momentum: 0.000000
2023-10-25 18:10:52,691 epoch 5 - iter 2166/3617 - loss 0.03745558 - time (sec): 136.16 - samples/sec: 1683.12 - lr: 0.000018 - momentum: 0.000000
2023-10-25 18:11:15,175 epoch 5 - iter 2527/3617 - loss 0.03750218 - time (sec): 158.64 - samples/sec: 1674.48 - lr: 0.000018 - momentum: 0.000000
2023-10-25 18:11:37,722 epoch 5 - iter 2888/3617 - loss 0.03678568 - time (sec): 181.19 - samples/sec: 1672.19 - lr: 0.000017 - momentum: 0.000000
2023-10-25 18:12:00,225 epoch 5 - iter 3249/3617 - loss 0.03673079 - time (sec): 203.69 - samples/sec: 1666.92 - lr: 0.000017 - momentum: 0.000000
2023-10-25 18:12:23,199 epoch 5 - iter 3610/3617 - loss 0.03582200 - time (sec): 226.67 - samples/sec: 1674.22 - lr: 0.000017 - momentum: 0.000000
2023-10-25 18:12:23,601 ----------------------------------------------------------------------------------------------------
2023-10-25 18:12:23,601 EPOCH 5 done: loss 0.0358 - lr: 0.000017
2023-10-25 18:12:28,874 DEV : loss 0.29047203063964844 - f1-score (micro avg) 0.6375
2023-10-25 18:12:28,897 ----------------------------------------------------------------------------------------------------
2023-10-25 18:12:51,381 epoch 6 - iter 361/3617 - loss 0.02612742 - time (sec): 22.48 - samples/sec: 1642.43 - lr: 0.000016 - momentum: 0.000000
2023-10-25 18:13:13,926 epoch 6 - iter 722/3617 - loss 0.02633927 - time (sec): 45.03 - samples/sec: 1658.37 - lr: 0.000016 - momentum: 0.000000
2023-10-25 18:13:36,722 epoch 6 - iter 1083/3617 - loss 0.02499591 - time (sec): 67.82 - samples/sec: 1674.19 - lr: 0.000016 - momentum: 0.000000
2023-10-25 18:13:59,270 epoch 6 - iter 1444/3617 - loss 0.02561653 - time (sec): 90.37 - samples/sec: 1675.88 - lr: 0.000015 - momentum: 0.000000
2023-10-25 18:14:22,086 epoch 6 - iter 1805/3617 - loss 0.02554121 - time (sec): 113.19 - samples/sec: 1672.16 - lr: 0.000015 - momentum: 0.000000
2023-10-25 18:14:44,503 epoch 6 - iter 2166/3617 - loss 0.02594605 - time (sec): 135.61 - samples/sec: 1667.09 - lr: 0.000015 - momentum: 0.000000
2023-10-25 18:15:06,952 epoch 6 - iter 2527/3617 - loss 0.02566906 - time (sec): 158.05 - samples/sec: 1662.64 - lr: 0.000014 - momentum: 0.000000
2023-10-25 18:15:29,691 epoch 6 - iter 2888/3617 - loss 0.02638375 - time (sec): 180.79 - samples/sec: 1669.32 - lr: 0.000014 - momentum: 0.000000
2023-10-25 18:15:52,576 epoch 6 - iter 3249/3617 - loss 0.02624045 - time (sec): 203.68 - samples/sec: 1673.50 - lr: 0.000014 - momentum: 0.000000
2023-10-25 18:16:15,346 epoch 6 - iter 3610/3617 - loss 0.02618026 - time (sec): 226.45 - samples/sec: 1674.81 - lr: 0.000013 - momentum: 0.000000
2023-10-25 18:16:15,770 ----------------------------------------------------------------------------------------------------
2023-10-25 18:16:15,771 EPOCH 6 done: loss 0.0261 - lr: 0.000013
2023-10-25 18:16:21,044 DEV : loss 0.35754987597465515 - f1-score (micro avg) 0.6486
2023-10-25 18:16:21,067 ----------------------------------------------------------------------------------------------------
2023-10-25 18:16:43,741 epoch 7 - iter 361/3617 - loss 0.02669619 - time (sec): 22.67 - samples/sec: 1694.80 - lr: 0.000013 - momentum: 0.000000
2023-10-25 18:17:06,457 epoch 7 - iter 722/3617 - loss 0.02133193 - time (sec): 45.39 - samples/sec: 1690.52 - lr: 0.000013 - momentum: 0.000000
2023-10-25 18:17:29,252 epoch 7 - iter 1083/3617 - loss 0.01919200 - time (sec): 68.18 - samples/sec: 1696.94 - lr: 0.000012 - momentum: 0.000000
2023-10-25 18:17:51,921 epoch 7 - iter 1444/3617 - loss 0.01837169 - time (sec): 90.85 - samples/sec: 1700.41 - lr: 0.000012 - momentum: 0.000000
2023-10-25 18:18:14,289 epoch 7 - iter 1805/3617 - loss 0.01828680 - time (sec): 113.22 - samples/sec: 1683.38 - lr: 0.000012 - momentum: 0.000000
2023-10-25 18:18:36,956 epoch 7 - iter 2166/3617 - loss 0.01791469 - time (sec): 135.89 - samples/sec: 1672.40 - lr: 0.000011 - momentum: 0.000000
2023-10-25 18:18:59,536 epoch 7 - iter 2527/3617 - loss 0.01755958 - time (sec): 158.47 - samples/sec: 1666.66 - lr: 0.000011 - momentum: 0.000000
2023-10-25 18:19:22,334 epoch 7 - iter 2888/3617 - loss 0.01737112 - time (sec): 181.27 - samples/sec: 1667.36 - lr: 0.000011 - momentum: 0.000000
2023-10-25 18:19:44,988 epoch 7 - iter 3249/3617 - loss 0.01742728 - time (sec): 203.92 - samples/sec: 1671.26 - lr: 0.000010 - momentum: 0.000000
2023-10-25 18:20:07,652 epoch 7 - iter 3610/3617 - loss 0.01735064 - time (sec): 226.58 - samples/sec: 1673.83 - lr: 0.000010 - momentum: 0.000000
2023-10-25 18:20:08,070 ----------------------------------------------------------------------------------------------------
2023-10-25 18:20:08,070 EPOCH 7 done: loss 0.0174 - lr: 0.000010
2023-10-25 18:20:13,372 DEV : loss 0.3536568582057953 - f1-score (micro avg) 0.6385
2023-10-25 18:20:13,396 ----------------------------------------------------------------------------------------------------
2023-10-25 18:20:35,975 epoch 8 - iter 361/3617 - loss 0.01206039 - time (sec): 22.58 - samples/sec: 1635.31 - lr: 0.000010 - momentum: 0.000000
2023-10-25 18:20:58,625 epoch 8 - iter 722/3617 - loss 0.01278203 - time (sec): 45.23 - samples/sec: 1648.62 - lr: 0.000009 - momentum: 0.000000
2023-10-25 18:21:21,240 epoch 8 - iter 1083/3617 - loss 0.01364976 - time (sec): 67.84 - samples/sec: 1649.06 - lr: 0.000009 - momentum: 0.000000
2023-10-25 18:21:43,895 epoch 8 - iter 1444/3617 - loss 0.01307669 - time (sec): 90.50 - samples/sec: 1654.06 - lr: 0.000009 - momentum: 0.000000
2023-10-25 18:22:06,544 epoch 8 - iter 1805/3617 - loss 0.01174791 - time (sec): 113.15 - samples/sec: 1658.98 - lr: 0.000008 - momentum: 0.000000
2023-10-25 18:22:29,142 epoch 8 - iter 2166/3617 - loss 0.01152362 - time (sec): 135.75 - samples/sec: 1658.48 - lr: 0.000008 - momentum: 0.000000
2023-10-25 18:22:51,686 epoch 8 - iter 2527/3617 - loss 0.01119897 - time (sec): 158.29 - samples/sec: 1659.80 - lr: 0.000008 - momentum: 0.000000
2023-10-25 18:23:14,109 epoch 8 - iter 2888/3617 - loss 0.01063586 - time (sec): 180.71 - samples/sec: 1657.02 - lr: 0.000007 - momentum: 0.000000
2023-10-25 18:23:37,077 epoch 8 - iter 3249/3617 - loss 0.01119434 - time (sec): 203.68 - samples/sec: 1663.50 - lr: 0.000007 - momentum: 0.000000
2023-10-25 18:24:00,281 epoch 8 - iter 3610/3617 - loss 0.01118139 - time (sec): 226.88 - samples/sec: 1671.90 - lr: 0.000007 - momentum: 0.000000
2023-10-25 18:24:00,698 ----------------------------------------------------------------------------------------------------
2023-10-25 18:24:00,698 EPOCH 8 done: loss 0.0112 - lr: 0.000007
2023-10-25 18:24:06,004 DEV : loss 0.37936946749687195 - f1-score (micro avg) 0.6465
2023-10-25 18:24:06,027 ----------------------------------------------------------------------------------------------------
2023-10-25 18:24:28,653 epoch 9 - iter 361/3617 - loss 0.00453700 - time (sec): 22.62 - samples/sec: 1683.71 - lr: 0.000006 - momentum: 0.000000
2023-10-25 18:24:51,313 epoch 9 - iter 722/3617 - loss 0.00589753 - time (sec): 45.29 - samples/sec: 1687.02 - lr: 0.000006 - momentum: 0.000000
2023-10-25 18:25:14,005 epoch 9 - iter 1083/3617 - loss 0.00678198 - time (sec): 67.98 - samples/sec: 1685.07 - lr: 0.000006 - momentum: 0.000000
2023-10-25 18:25:36,711 epoch 9 - iter 1444/3617 - loss 0.00698786 - time (sec): 90.68 - samples/sec: 1676.57 - lr: 0.000005 - momentum: 0.000000
2023-10-25 18:25:59,553 epoch 9 - iter 1805/3617 - loss 0.00683517 - time (sec): 113.52 - samples/sec: 1676.20 - lr: 0.000005 - momentum: 0.000000
2023-10-25 18:26:22,155 epoch 9 - iter 2166/3617 - loss 0.00688052 - time (sec): 136.13 - samples/sec: 1673.27 - lr: 0.000005 - momentum: 0.000000
2023-10-25 18:26:44,919 epoch 9 - iter 2527/3617 - loss 0.00701541 - time (sec): 158.89 - samples/sec: 1669.59 - lr: 0.000004 - momentum: 0.000000
2023-10-25 18:27:07,409 epoch 9 - iter 2888/3617 - loss 0.00693063 - time (sec): 181.38 - samples/sec: 1666.37 - lr: 0.000004 - momentum: 0.000000
2023-10-25 18:27:29,896 epoch 9 - iter 3249/3617 - loss 0.00674220 - time (sec): 203.87 - samples/sec: 1666.51 - lr: 0.000004 - momentum: 0.000000
2023-10-25 18:27:52,714 epoch 9 - iter 3610/3617 - loss 0.00681525 - time (sec): 226.69 - samples/sec: 1672.04 - lr: 0.000003 - momentum: 0.000000
2023-10-25 18:27:53,186 ----------------------------------------------------------------------------------------------------
2023-10-25 18:27:53,186 EPOCH 9 done: loss 0.0068 - lr: 0.000003
2023-10-25 18:27:57,959 DEV : loss 0.41782665252685547 - f1-score (micro avg) 0.6447
2023-10-25 18:27:57,982 ----------------------------------------------------------------------------------------------------
2023-10-25 18:28:21,037 epoch 10 - iter 361/3617 - loss 0.00866093 - time (sec): 23.05 - samples/sec: 1651.05 - lr: 0.000003 - momentum: 0.000000
2023-10-25 18:28:43,699 epoch 10 - iter 722/3617 - loss 0.00532429 - time (sec): 45.72 - samples/sec: 1676.97 - lr: 0.000003 - momentum: 0.000000
2023-10-25 18:29:06,533 epoch 10 - iter 1083/3617 - loss 0.00622948 - time (sec): 68.55 - samples/sec: 1677.53 - lr: 0.000002 - momentum: 0.000000
2023-10-25 18:29:29,165 epoch 10 - iter 1444/3617 - loss 0.00568970 - time (sec): 91.18 - samples/sec: 1669.35 - lr: 0.000002 - momentum: 0.000000
2023-10-25 18:29:51,935 epoch 10 - iter 1805/3617 - loss 0.00558160 - time (sec): 113.95 - samples/sec: 1675.59 - lr: 0.000002 - momentum: 0.000000
2023-10-25 18:30:14,674 epoch 10 - iter 2166/3617 - loss 0.00514416 - time (sec): 136.69 - samples/sec: 1674.83 - lr: 0.000001 - momentum: 0.000000
2023-10-25 18:30:37,246 epoch 10 - iter 2527/3617 - loss 0.00503834 - time (sec): 159.26 - samples/sec: 1670.17 - lr: 0.000001 - momentum: 0.000000
2023-10-25 18:30:59,921 epoch 10 - iter 2888/3617 - loss 0.00495744 - time (sec): 181.94 - samples/sec: 1672.40 - lr: 0.000001 - momentum: 0.000000
2023-10-25 18:31:22,734 epoch 10 - iter 3249/3617 - loss 0.00479787 - time (sec): 204.75 - samples/sec: 1675.27 - lr: 0.000000 - momentum: 0.000000
2023-10-25 18:31:45,145 epoch 10 - iter 3610/3617 - loss 0.00477649 - time (sec): 227.16 - samples/sec: 1669.56 - lr: 0.000000 - momentum: 0.000000
2023-10-25 18:31:45,583 ----------------------------------------------------------------------------------------------------
2023-10-25 18:31:45,583 EPOCH 10 done: loss 0.0048 - lr: 0.000000
2023-10-25 18:31:50,356 DEV : loss 0.416111558675766 - f1-score (micro avg) 0.6427
2023-10-25 18:31:50,932 ----------------------------------------------------------------------------------------------------
2023-10-25 18:31:50,933 Loading model from best epoch ...
2023-10-25 18:31:52,701 SequenceTagger predicts: Dictionary with 13 tags: O, S-loc, B-loc, E-loc, I-loc, S-pers, B-pers, E-pers, I-pers, S-org, B-org, E-org, I-org
2023-10-25 18:31:58,354 
Results:
- F-score (micro) 0.6515
- F-score (macro) 0.448
- Accuracy 0.4966

By class:
              precision    recall  f1-score   support

         loc     0.6294    0.7817    0.6974       591
        pers     0.5663    0.7535    0.6466       357
         org     0.0000    0.0000    0.0000        79

   micro avg     0.6007    0.7118    0.6515      1027
   macro avg     0.3986    0.5117    0.4480      1027
weighted avg     0.5591    0.7118    0.6261      1027
2023-10-25 18:31:58,354 ----------------------------------------------------------------------------------------------------
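The gap between the micro (0.6515) and macro (0.448) F-scores follows directly from the per-class table: macro F1 averages the three class F1 scores equally, so the org class (all 79 gold mentions missed, F1 0.0) drags it down, while micro F1 pools true/false positives and negatives across classes. A quick arithmetic check against the reported aggregates:

```python
# Per-class F1 scores copied from the "By class" table above.
f1 = {"loc": 0.6974, "pers": 0.6466, "org": 0.0}

# Macro F1: unweighted mean over classes -> 0.448, matching "F-score (macro)".
print(round(sum(f1.values()) / len(f1), 4))

# Micro F1 from the pooled "micro avg" precision/recall row -> 0.6515,
# matching "F-score (micro)".
p, r = 0.6007, 0.7118
print(round(2 * p * r / (p + r), 4))
```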
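Finally, a short usage sketch that assumes only the artifacts named in the log (best-model.pt under the training base path, and the 13-tag BIOES dictionary over loc / pers / org): it loads the best checkpoint with Flair and tags an illustrative sentence.

```python
from flair.data import Sentence
from flair.models import SequenceTagger

# Load the best checkpoint from the base path given in the log header.
tagger = SequenceTagger.load(
    "hmbench-letemps/fr-dbmdz/bert-base-historic-multilingual-64k-td-cased"
    "-bs4-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-5/best-model.pt"
)

# Illustrative French sentence; the model predicts loc / pers / org spans.
sentence = Sentence("Le Temps est un journal publié à Genève .")
tagger.predict(sentence)

# Each predicted span carries its label and a confidence score.
for span in sentence.get_spans("ner"):
    print(span.text, span.get_label("ner").value, span.get_label("ner").score)
```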