2023-10-25 12:28:37,998 ----------------------------------------------------------------------------------------------------
2023-10-25 12:28:37,999 Model: "SequenceTagger(
  (embeddings): TransformerWordEmbeddings(
    (model): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(64001, 768)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_features=768, out_features=768, bias=True)
                (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
            )
            (intermediate): BertIntermediate(
              (dense): Linear(in_features=768, out_features=3072, bias=True)
              (intermediate_act_fn): GELUActivation()
            )
            (output): BertOutput(
              (dense): Linear(in_features=3072, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
          )
        )
      )
      (pooler): BertPooler(
        (dense): Linear(in_features=768, out_features=768, bias=True)
        (activation): Tanh()
      )
    )
  )
  (locked_dropout): LockedDropout(p=0.5)
  (linear): Linear(in_features=768, out_features=13, bias=True)
  (loss_function): CrossEntropyLoss()
)"
2023-10-25 12:28:37,999 ----------------------------------------------------------------------------------------------------
2023-10-25 12:28:37,999 MultiCorpus: 14465 train + 1392 dev + 2432 test sentences
 - NER_HIPE_2022 Corpus: 14465 train + 1392 dev + 2432 test sentences - /home/ubuntu/.flair/datasets/ner_hipe_2022/v2.1/letemps/fr/with_doc_seperator
2023-10-25 12:28:37,999 ----------------------------------------------------------------------------------------------------
2023-10-25 12:28:37,999 Train: 14465 sentences
2023-10-25 12:28:37,999 (train_with_dev=False, train_with_test=False)
2023-10-25 12:28:37,999 ----------------------------------------------------------------------------------------------------
2023-10-25 12:28:37,999 Training Params:
2023-10-25 12:28:37,999 - learning_rate: "3e-05"
2023-10-25 12:28:37,999 - mini_batch_size: "8"
2023-10-25 12:28:37,999 - max_epochs: "10"
2023-10-25 12:28:37,999 - shuffle: "True"
2023-10-25 12:28:37,999 ----------------------------------------------------------------------------------------------------
2023-10-25 12:28:37,999 Plugins:
2023-10-25 12:28:37,999 - TensorboardLogger
2023-10-25 12:28:37,999 - LinearScheduler | warmup_fraction: '0.1'
2023-10-25 12:28:37,999 ----------------------------------------------------------------------------------------------------
2023-10-25 12:28:37,999 Final evaluation on model from best epoch (best-model.pt)
2023-10-25 12:28:37,999 - metric: "('micro avg', 'f1-score')"
2023-10-25 12:28:37,999 ----------------------------------------------------------------------------------------------------
2023-10-25 12:28:37,999 Computation:
2023-10-25 12:28:37,999 - compute on device: cuda:0
2023-10-25 12:28:37,999 - embedding storage: none
2023-10-25 12:28:38,000 ----------------------------------------------------------------------------------------------------
2023-10-25 12:28:38,000 Model training base path: "hmbench-letemps/fr-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs8-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-3"
2023-10-25 12:28:38,000 ----------------------------------------------------------------------------------------------------
2023-10-25 12:28:38,000 ----------------------------------------------------------------------------------------------------
2023-10-25 12:28:38,000 Logging anything other than scalars to TensorBoard is currently not supported.
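The configuration above maps onto Flair's fine-tuning API. Below is a minimal sketch of a script that would produce a comparable run, under these assumptions: Flair >= 0.12, the Hugging Face checkpoint dbmdz/bert-base-historic-multilingual-64k-td-cased (inferred from the base path), and the flair.datasets.NER_HIPE_2022 loader; exact constructor arguments may differ between Flair versions, and the TensorboardLogger plugin is omitted.

from flair.datasets import NER_HIPE_2022
from flair.embeddings import TransformerWordEmbeddings
from flair.models import SequenceTagger
from flair.trainers import ModelTrainer

# Corpus as logged above (letemps/fr, HIPE-2022 v2.1); the loader arguments
# are an assumption and may differ between Flair versions.
corpus = NER_HIPE_2022(dataset_name="letemps", language="fr")
label_dict = corpus.make_label_dictionary(label_type="ner")

# Matches the printed model: last transformer layer only ("layers-1" in the
# base path), first-subtoken pooling ("poolingfirst"), fine-tuned end to end.
embeddings = TransformerWordEmbeddings(
    "dbmdz/bert-base-historic-multilingual-64k-td-cased",  # inferred from the base path
    layers="-1",
    subtoken_pooling="first",
    fine_tune=True,
)

# Plain linear head, no CRF and no RNN ("crfFalse" in the base path; the
# printout shows only locked_dropout + Linear(768, 13) + CrossEntropyLoss).
tagger = SequenceTagger(
    hidden_size=256,  # unused when use_rnn=False
    embeddings=embeddings,
    tag_dictionary=label_dict,
    tag_type="ner",
    use_crf=False,
    use_rnn=False,
    reproject_embeddings=False,
)

# Values from the "Training Params" block above; fine_tune() attaches the
# linear warmup/decay schedule with warmup_fraction 0.1 by default.
trainer = ModelTrainer(tagger, corpus)
trainer.fine_tune(
    "hmbench-letemps/fr-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs8-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-3",
    learning_rate=3e-05,
    mini_batch_size=8,
    max_epochs=10,
)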
2023-10-25 12:28:53,436 epoch 1 - iter 180/1809 - loss 1.32066192 - time (sec): 15.44 - samples/sec: 2398.78 - lr: 0.000003 - momentum: 0.000000
2023-10-25 12:29:09,601 epoch 1 - iter 360/1809 - loss 0.75278801 - time (sec): 31.60 - samples/sec: 2431.30 - lr: 0.000006 - momentum: 0.000000
2023-10-25 12:29:25,308 epoch 1 - iter 540/1809 - loss 0.55980290 - time (sec): 47.31 - samples/sec: 2422.47 - lr: 0.000009 - momentum: 0.000000
2023-10-25 12:29:41,168 epoch 1 - iter 720/1809 - loss 0.45294865 - time (sec): 63.17 - samples/sec: 2423.46 - lr: 0.000012 - momentum: 0.000000
2023-10-25 12:29:57,000 epoch 1 - iter 900/1809 - loss 0.38916200 - time (sec): 79.00 - samples/sec: 2409.00 - lr: 0.000015 - momentum: 0.000000
2023-10-25 12:30:12,872 epoch 1 - iter 1080/1809 - loss 0.34304204 - time (sec): 94.87 - samples/sec: 2402.45 - lr: 0.000018 - momentum: 0.000000
2023-10-25 12:30:28,488 epoch 1 - iter 1260/1809 - loss 0.30889726 - time (sec): 110.49 - samples/sec: 2404.56 - lr: 0.000021 - momentum: 0.000000
2023-10-25 12:30:44,387 epoch 1 - iter 1440/1809 - loss 0.28449702 - time (sec): 126.39 - samples/sec: 2398.06 - lr: 0.000024 - momentum: 0.000000
2023-10-25 12:31:00,218 epoch 1 - iter 1620/1809 - loss 0.26437763 - time (sec): 142.22 - samples/sec: 2391.42 - lr: 0.000027 - momentum: 0.000000
2023-10-25 12:31:16,209 epoch 1 - iter 1800/1809 - loss 0.24798561 - time (sec): 158.21 - samples/sec: 2390.63 - lr: 0.000030 - momentum: 0.000000
2023-10-25 12:31:16,974 ----------------------------------------------------------------------------------------------------
2023-10-25 12:31:16,974 EPOCH 1 done: loss 0.2473 - lr: 0.000030
2023-10-25 12:31:21,487 DEV : loss 0.10075430572032928 - f1-score (micro avg) 0.6039
2023-10-25 12:31:21,509 saving best model
2023-10-25 12:31:22,067 ----------------------------------------------------------------------------------------------------
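Note on the lr column: the LinearScheduler with warmup_fraction '0.1' ramps the learning rate linearly from ~0 up to the peak 3e-05 over the first 10% of the 18090 total steps (10 epochs x 1809 iterations), which is exactly epoch 1 above, then decays it linearly to 0 by the end of epoch 10. A minimal sketch of that schedule in plain Python (an illustration with hypothetical names, not Flair's implementation):

def linear_warmup_lr(step, peak_lr=3e-05, total_steps=18090, warmup_fraction=0.1):
    # Linear warmup to peak_lr, then linear decay to 0.
    warmup_steps = int(total_steps * warmup_fraction)  # 1809 here, i.e. one epoch
    if step < warmup_steps:
        return peak_lr * step / warmup_steps
    return peak_lr * (total_steps - step) / (total_steps - warmup_steps)

# linear_warmup_lr(180) ~ 0.000003 and linear_warmup_lr(1800) ~ 0.000030, matching
# the first and last iter lines of epoch 1 above. The constant "momentum: 0.000000"
# is consistent with an optimizer without a classic momentum term (Flair's
# fine_tune() defaults to AdamW).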
2023-10-25 12:31:37,430 epoch 2 - iter 180/1809 - loss 0.07768380 - time (sec): 15.36 - samples/sec: 2376.17 - lr: 0.000030 - momentum: 0.000000
2023-10-25 12:31:53,589 epoch 2 - iter 360/1809 - loss 0.07934728 - time (sec): 31.52 - samples/sec: 2340.48 - lr: 0.000029 - momentum: 0.000000
2023-10-25 12:32:09,858 epoch 2 - iter 540/1809 - loss 0.08305144 - time (sec): 47.79 - samples/sec: 2365.38 - lr: 0.000029 - momentum: 0.000000
2023-10-25 12:32:25,618 epoch 2 - iter 720/1809 - loss 0.08318685 - time (sec): 63.55 - samples/sec: 2375.08 - lr: 0.000029 - momentum: 0.000000
2023-10-25 12:32:41,497 epoch 2 - iter 900/1809 - loss 0.08289216 - time (sec): 79.43 - samples/sec: 2379.62 - lr: 0.000028 - momentum: 0.000000
2023-10-25 12:32:57,235 epoch 2 - iter 1080/1809 - loss 0.08295050 - time (sec): 95.17 - samples/sec: 2378.34 - lr: 0.000028 - momentum: 0.000000
2023-10-25 12:33:12,805 epoch 2 - iter 1260/1809 - loss 0.08218539 - time (sec): 110.74 - samples/sec: 2385.51 - lr: 0.000028 - momentum: 0.000000
2023-10-25 12:33:28,522 epoch 2 - iter 1440/1809 - loss 0.08166311 - time (sec): 126.45 - samples/sec: 2390.88 - lr: 0.000027 - momentum: 0.000000
2023-10-25 12:33:44,142 epoch 2 - iter 1620/1809 - loss 0.08087213 - time (sec): 142.07 - samples/sec: 2394.69 - lr: 0.000027 - momentum: 0.000000
2023-10-25 12:34:00,351 epoch 2 - iter 1800/1809 - loss 0.08180993 - time (sec): 158.28 - samples/sec: 2389.13 - lr: 0.000027 - momentum: 0.000000
2023-10-25 12:34:01,173 ----------------------------------------------------------------------------------------------------
2023-10-25 12:34:01,173 EPOCH 2 done: loss 0.0817 - lr: 0.000027
2023-10-25 12:34:06,428 DEV : loss 0.11919340491294861 - f1-score (micro avg) 0.6265
2023-10-25 12:34:06,451 saving best model
2023-10-25 12:34:07,159 ----------------------------------------------------------------------------------------------------
2023-10-25 12:34:22,732 epoch 3 - iter 180/1809 - loss 0.05754362 - time (sec): 15.57 - samples/sec: 2359.02 - lr: 0.000026 - momentum: 0.000000
2023-10-25 12:34:39,144 epoch 3 - iter 360/1809 - loss 0.05445932 - time (sec): 31.98 - samples/sec: 2359.41 - lr: 0.000026 - momentum: 0.000000
2023-10-25 12:34:55,128 epoch 3 - iter 540/1809 - loss 0.10839689 - time (sec): 47.97 - samples/sec: 2375.14 - lr: 0.000026 - momentum: 0.000000
2023-10-25 12:35:10,938 epoch 3 - iter 720/1809 - loss 0.11176576 - time (sec): 63.78 - samples/sec: 2398.12 - lr: 0.000025 - momentum: 0.000000
2023-10-25 12:35:26,644 epoch 3 - iter 900/1809 - loss 0.10133908 - time (sec): 79.48 - samples/sec: 2390.92 - lr: 0.000025 - momentum: 0.000000
2023-10-25 12:35:42,693 epoch 3 - iter 1080/1809 - loss 0.09494598 - time (sec): 95.53 - samples/sec: 2394.48 - lr: 0.000025 - momentum: 0.000000
2023-10-25 12:35:58,527 epoch 3 - iter 1260/1809 - loss 0.08876149 - time (sec): 111.37 - samples/sec: 2396.98 - lr: 0.000024 - momentum: 0.000000
2023-10-25 12:36:14,350 epoch 3 - iter 1440/1809 - loss 0.08488484 - time (sec): 127.19 - samples/sec: 2392.20 - lr: 0.000024 - momentum: 0.000000
2023-10-25 12:36:29,593 epoch 3 - iter 1620/1809 - loss 0.08167938 - time (sec): 142.43 - samples/sec: 2379.70 - lr: 0.000024 - momentum: 0.000000
2023-10-25 12:36:45,814 epoch 3 - iter 1800/1809 - loss 0.08095225 - time (sec): 158.65 - samples/sec: 2381.77 - lr: 0.000023 - momentum: 0.000000
2023-10-25 12:36:46,679 ----------------------------------------------------------------------------------------------------
2023-10-25 12:36:46,680 EPOCH 3 done: loss 0.0808 - lr: 0.000023
2023-10-25 12:36:51,961 DEV : loss 0.17484049499034882 - f1-score (micro avg) 0.6384
2023-10-25 12:36:51,984 saving best model
2023-10-25 12:36:52,764 ----------------------------------------------------------------------------------------------------
2023-10-25 12:37:08,603 epoch 4 - iter 180/1809 - loss 0.03876512 - time (sec): 15.84 - samples/sec: 2387.38 - lr: 0.000023 - momentum: 0.000000
2023-10-25 12:37:24,227 epoch 4 - iter 360/1809 - loss 0.03787875 - time (sec): 31.46 - samples/sec: 2389.93 - lr: 0.000023 - momentum: 0.000000
2023-10-25 12:37:40,405 epoch 4 - iter 540/1809 - loss 0.03825259 - time (sec): 47.64 - samples/sec: 2379.49 - lr: 0.000022 - momentum: 0.000000
2023-10-25 12:37:56,195 epoch 4 - iter 720/1809 - loss 0.03869128 - time (sec): 63.43 - samples/sec: 2376.11 - lr: 0.000022 - momentum: 0.000000
2023-10-25 12:38:12,183 epoch 4 - iter 900/1809 - loss 0.03899276 - time (sec): 79.42 - samples/sec: 2382.59 - lr: 0.000022 - momentum: 0.000000
2023-10-25 12:38:27,874 epoch 4 - iter 1080/1809 - loss 0.04024557 - time (sec): 95.11 - samples/sec: 2388.88 - lr: 0.000021 - momentum: 0.000000
2023-10-25 12:38:43,516 epoch 4 - iter 1260/1809 - loss 0.04175868 - time (sec): 110.75 - samples/sec: 2385.51 - lr: 0.000021 - momentum: 0.000000
2023-10-25 12:38:59,438 epoch 4 - iter 1440/1809 - loss 0.04172356 - time (sec): 126.67 - samples/sec: 2378.74 - lr: 0.000021 - momentum: 0.000000
2023-10-25 12:39:15,161 epoch 4 - iter 1620/1809 - loss 0.04187760 - time (sec): 142.40 - samples/sec: 2379.62 - lr: 0.000020 - momentum: 0.000000
2023-10-25 12:39:31,322 epoch 4 - iter 1800/1809 - loss 0.04269496 - time (sec): 158.56 - samples/sec: 2382.15 - lr: 0.000020 - momentum: 0.000000
2023-10-25 12:39:32,197 ----------------------------------------------------------------------------------------------------
2023-10-25 12:39:32,197 EPOCH 4 done: loss 0.0426 - lr: 0.000020
2023-10-25 12:39:36,946 DEV : loss 0.1871713101863861 - f1-score (micro avg) 0.617
2023-10-25 12:39:36,968 ----------------------------------------------------------------------------------------------------
2023-10-25 12:39:53,069 epoch 5 - iter 180/1809 - loss 0.02777427 - time (sec): 16.10 - samples/sec: 2334.53 - lr: 0.000020 - momentum: 0.000000
2023-10-25 12:40:09,463 epoch 5 - iter 360/1809 - loss 0.02582892 - time (sec): 32.49 - samples/sec: 2352.02 - lr: 0.000019 - momentum: 0.000000
2023-10-25 12:40:25,230 epoch 5 - iter 540/1809 - loss 0.02797722 - time (sec): 48.26 - samples/sec: 2376.31 - lr: 0.000019 - momentum: 0.000000
2023-10-25 12:40:40,871 epoch 5 - iter 720/1809 - loss 0.02666957 - time (sec): 63.90 - samples/sec: 2396.91 - lr: 0.000019 - momentum: 0.000000
2023-10-25 12:40:57,048 epoch 5 - iter 900/1809 - loss 0.02759579 - time (sec): 80.08 - samples/sec: 2390.81 - lr: 0.000018 - momentum: 0.000000
2023-10-25 12:41:12,856 epoch 5 - iter 1080/1809 - loss 0.02814669 - time (sec): 95.89 - samples/sec: 2387.46 - lr: 0.000018 - momentum: 0.000000
2023-10-25 12:41:28,645 epoch 5 - iter 1260/1809 - loss 0.02850952 - time (sec): 111.68 - samples/sec: 2385.17 - lr: 0.000018 - momentum: 0.000000
2023-10-25 12:41:44,229 epoch 5 - iter 1440/1809 - loss 0.02843264 - time (sec): 127.26 - samples/sec: 2391.30 - lr: 0.000017 - momentum: 0.000000
2023-10-25 12:41:59,925 epoch 5 - iter 1620/1809 - loss 0.02870411 - time (sec): 142.96 - samples/sec: 2387.86 - lr: 0.000017 - momentum: 0.000000
2023-10-25 12:42:15,858 epoch 5 - iter 1800/1809 - loss 0.02880197 - time (sec): 158.89 - samples/sec: 2381.32 - lr: 0.000017 - momentum: 0.000000
2023-10-25 12:42:16,584 ----------------------------------------------------------------------------------------------------
2023-10-25 12:42:16,584 EPOCH 5 done: loss 0.0287 - lr: 0.000017
2023-10-25 12:42:21,352 DEV : loss 0.2866155803203583 - f1-score (micro avg) 0.645
2023-10-25 12:42:21,375 saving best model
2023-10-25 12:42:22,152 ----------------------------------------------------------------------------------------------------
2023-10-25 12:42:38,353 epoch 6 - iter 180/1809 - loss 0.01495851 - time (sec): 16.20 - samples/sec: 2401.90 - lr: 0.000016 - momentum: 0.000000
2023-10-25 12:42:54,262 epoch 6 - iter 360/1809 - loss 0.01766455 - time (sec): 32.11 - samples/sec: 2369.80 - lr: 0.000016 - momentum: 0.000000
2023-10-25 12:43:09,989 epoch 6 - iter 540/1809 - loss 0.01859450 - time (sec): 47.84 - samples/sec: 2364.59 - lr: 0.000016 - momentum: 0.000000
2023-10-25 12:43:25,708 epoch 6 - iter 720/1809 - loss 0.01990834 - time (sec): 63.56 - samples/sec: 2393.20 - lr: 0.000015 - momentum: 0.000000
2023-10-25 12:43:41,465 epoch 6 - iter 900/1809 - loss 0.02012503 - time (sec): 79.31 - samples/sec: 2387.22 - lr: 0.000015 - momentum: 0.000000
2023-10-25 12:43:57,375 epoch 6 - iter 1080/1809 - loss 0.02035323 - time (sec): 95.22 - samples/sec: 2388.70 - lr: 0.000015 - momentum: 0.000000
2023-10-25 12:44:13,594 epoch 6 - iter 1260/1809 - loss 0.02009955 - time (sec): 111.44 - samples/sec: 2381.17 - lr: 0.000014 - momentum: 0.000000
2023-10-25 12:44:29,665 epoch 6 - iter 1440/1809 - loss 0.01997538 - time (sec): 127.51 - samples/sec: 2383.47 - lr: 0.000014 - momentum: 0.000000
2023-10-25 12:44:45,501 epoch 6 - iter 1620/1809 - loss 0.01992365 - time (sec): 143.35 - samples/sec: 2380.95 - lr: 0.000014 - momentum: 0.000000
2023-10-25 12:45:01,008 epoch 6 - iter 1800/1809 - loss 0.02013927 - time (sec): 158.86 - samples/sec: 2380.41 - lr: 0.000013 - momentum: 0.000000
2023-10-25 12:45:01,759 ----------------------------------------------------------------------------------------------------
2023-10-25 12:45:01,759 EPOCH 6 done: loss 0.0201 - lr: 0.000013
2023-10-25 12:45:06,544 DEV : loss 0.37807849049568176 - f1-score (micro avg) 0.6363
2023-10-25 12:45:06,567 ----------------------------------------------------------------------------------------------------
2023-10-25 12:45:22,138 epoch 7 - iter 180/1809 - loss 0.01284640 - time (sec): 15.57 - samples/sec: 2403.58 - lr: 0.000013 - momentum: 0.000000
2023-10-25 12:45:37,980 epoch 7 - iter 360/1809 - loss 0.01313214 - time (sec): 31.41 - samples/sec: 2372.35 - lr: 0.000013 - momentum: 0.000000
2023-10-25 12:45:53,544 epoch 7 - iter 540/1809 - loss 0.01317521 - time (sec): 46.98 - samples/sec: 2376.23 - lr: 0.000012 - momentum: 0.000000
2023-10-25 12:46:09,419 epoch 7 - iter 720/1809 - loss 0.01465604 - time (sec): 62.85 - samples/sec: 2375.28 - lr: 0.000012 - momentum: 0.000000
2023-10-25 12:46:25,283 epoch 7 - iter 900/1809 - loss 0.01445898 - time (sec): 78.72 - samples/sec: 2374.15 - lr: 0.000012 - momentum: 0.000000
2023-10-25 12:46:41,406 epoch 7 - iter 1080/1809 - loss 0.01378515 - time (sec): 94.84 - samples/sec: 2381.54 - lr: 0.000011 - momentum: 0.000000
2023-10-25 12:46:57,319 epoch 7 - iter 1260/1809 - loss 0.01405092 - time (sec): 110.75 - samples/sec: 2391.78 - lr: 0.000011 - momentum: 0.000000
2023-10-25 12:47:13,451 epoch 7 - iter 1440/1809 - loss 0.01485727 - time (sec): 126.88 - samples/sec: 2389.04 - lr: 0.000011 - momentum: 0.000000
2023-10-25 12:47:29,366 epoch 7 - iter 1620/1809 - loss 0.01448345 - time (sec): 142.80 - samples/sec: 2383.94 - lr: 0.000010 - momentum: 0.000000
2023-10-25 12:47:45,272 epoch 7 - iter 1800/1809 - loss 0.01468468 - time (sec): 158.70 - samples/sec: 2382.12 - lr: 0.000010 - momentum: 0.000000
2023-10-25 12:47:45,994 ----------------------------------------------------------------------------------------------------
2023-10-25 12:47:45,994 EPOCH 7 done: loss 0.0147 - lr: 0.000010
2023-10-25 12:47:51,314 DEV : loss 0.38382917642593384 - f1-score (micro avg) 0.6421
2023-10-25 12:47:51,337 ----------------------------------------------------------------------------------------------------
2023-10-25 12:48:07,499 epoch 8 - iter 180/1809 - loss 0.00759326 - time (sec): 16.16 - samples/sec: 2409.98 - lr: 0.000010 - momentum: 0.000000
2023-10-25 12:48:23,665 epoch 8 - iter 360/1809 - loss 0.00817682 - time (sec): 32.33 - samples/sec: 2351.45 - lr: 0.000009 - momentum: 0.000000
2023-10-25 12:48:39,685 epoch 8 - iter 540/1809 - loss 0.00774880 - time (sec): 48.35 - samples/sec: 2361.41 - lr: 0.000009 - momentum: 0.000000
2023-10-25 12:48:55,677 epoch 8 - iter 720/1809 - loss 0.00830876 - time (sec): 64.34 - samples/sec: 2375.37 - lr: 0.000009 - momentum: 0.000000
2023-10-25 12:49:11,519 epoch 8 - iter 900/1809 - loss 0.00898372 - time (sec): 80.18 - samples/sec: 2376.67 - lr: 0.000008 - momentum: 0.000000
2023-10-25 12:49:27,154 epoch 8 - iter 1080/1809 - loss 0.00905103 - time (sec): 95.82 - samples/sec: 2364.51 - lr: 0.000008 - momentum: 0.000000
2023-10-25 12:49:42,765 epoch 8 - iter 1260/1809 - loss 0.00901134 - time (sec): 111.43 - samples/sec: 2370.76 - lr: 0.000008 - momentum: 0.000000
2023-10-25 12:49:58,768 epoch 8 - iter 1440/1809 - loss 0.00911879 - time (sec): 127.43 - samples/sec: 2379.44 - lr: 0.000007 - momentum: 0.000000
2023-10-25 12:50:14,163 epoch 8 - iter 1620/1809 - loss 0.00980469 - time (sec): 142.83 - samples/sec: 2378.78 - lr: 0.000007 - momentum: 0.000000
2023-10-25 12:50:30,039 epoch 8 - iter 1800/1809 - loss 0.00951385 - time (sec): 158.70 - samples/sec: 2382.46 - lr: 0.000007 - momentum: 0.000000
2023-10-25 12:50:30,814 ----------------------------------------------------------------------------------------------------
2023-10-25 12:50:30,814 EPOCH 8 done: loss 0.0095 - lr: 0.000007
2023-10-25 12:50:36,114 DEV : loss 0.3850279152393341 - f1-score (micro avg) 0.6606
2023-10-25 12:50:36,137 saving best model
2023-10-25 12:50:36,798 ----------------------------------------------------------------------------------------------------
2023-10-25 12:50:52,245 epoch 9 - iter 180/1809 - loss 0.00647474 - time (sec): 15.45 - samples/sec: 2393.45 - lr: 0.000006 - momentum: 0.000000
2023-10-25 12:51:08,375 epoch 9 - iter 360/1809 - loss 0.00509171 - time (sec): 31.58 - samples/sec: 2400.18 - lr: 0.000006 - momentum: 0.000000
2023-10-25 12:51:24,236 epoch 9 - iter 540/1809 - loss 0.00664059 - time (sec): 47.44 - samples/sec: 2402.03 - lr: 0.000006 - momentum: 0.000000
2023-10-25 12:51:39,708 epoch 9 - iter 720/1809 - loss 0.00626044 - time (sec): 62.91 - samples/sec: 2395.19 - lr: 0.000005 - momentum: 0.000000
2023-10-25 12:51:55,581 epoch 9 - iter 900/1809 - loss 0.00660259 - time (sec): 78.78 - samples/sec: 2392.98 - lr: 0.000005 - momentum: 0.000000
2023-10-25 12:52:12,089 epoch 9 - iter 1080/1809 - loss 0.00656851 - time (sec): 95.29 - samples/sec: 2386.15 - lr: 0.000005 - momentum: 0.000000
2023-10-25 12:52:28,445 epoch 9 - iter 1260/1809 - loss 0.00630130 - time (sec): 111.65 - samples/sec: 2379.83 - lr: 0.000004 - momentum: 0.000000
2023-10-25 12:52:44,518 epoch 9 - iter 1440/1809 - loss 0.00621822 - time (sec): 127.72 - samples/sec: 2384.06 - lr: 0.000004 - momentum: 0.000000
2023-10-25 12:52:59,613 epoch 9 - iter 1620/1809 - loss 0.00603142 - time (sec): 142.81 - samples/sec: 2378.33 - lr: 0.000004 - momentum: 0.000000
2023-10-25 12:53:15,534 epoch 9 - iter 1800/1809 - loss 0.00568062 - time (sec): 158.73 - samples/sec: 2383.07 - lr: 0.000003 - momentum: 0.000000
2023-10-25 12:53:16,303 ----------------------------------------------------------------------------------------------------
2023-10-25 12:53:16,304 EPOCH 9 done: loss 0.0057 - lr: 0.000003
2023-10-25 12:53:21,631 DEV : loss 0.4284880757331848 - f1-score (micro avg) 0.6479
2023-10-25 12:53:21,654 ----------------------------------------------------------------------------------------------------
2023-10-25 12:53:37,461 epoch 10 - iter 180/1809 - loss 0.00257309 - time (sec): 15.81 - samples/sec: 2408.54 - lr: 0.000003 - momentum: 0.000000
2023-10-25 12:53:53,223 epoch 10 - iter 360/1809 - loss 0.00263089 - time (sec): 31.57 - samples/sec: 2405.48 - lr: 0.000003 - momentum: 0.000000
2023-10-25 12:54:09,057 epoch 10 - iter 540/1809 - loss 0.00294215 - time (sec): 47.40 - samples/sec: 2395.49 - lr: 0.000002 - momentum: 0.000000
2023-10-25 12:54:24,791 epoch 10 - iter 720/1809 - loss 0.00342558 - time (sec): 63.14 - samples/sec: 2387.73 - lr: 0.000002 - momentum: 0.000000
2023-10-25 12:54:40,381 epoch 10 - iter 900/1809 - loss 0.00403372 - time (sec): 78.73 - samples/sec: 2379.45 - lr: 0.000002 - momentum: 0.000000
2023-10-25 12:54:56,034 epoch 10 - iter 1080/1809 - loss 0.00393697 - time (sec): 94.38 - samples/sec: 2380.89 - lr: 0.000001 - momentum: 0.000000
2023-10-25 12:55:12,157 epoch 10 - iter 1260/1809 - loss 0.00409778 - time (sec): 110.50 - samples/sec: 2383.26 - lr: 0.000001 - momentum: 0.000000
2023-10-25 12:55:28,481 epoch 10 - iter 1440/1809 - loss 0.00378507 - time (sec): 126.83 - samples/sec: 2385.83 - lr: 0.000001 - momentum: 0.000000
2023-10-25 12:55:44,632 epoch 10 - iter 1620/1809 - loss 0.00378607 - time (sec): 142.98 - samples/sec: 2384.08 - lr: 0.000000 - momentum: 0.000000
2023-10-25 12:56:00,190 epoch 10 - iter 1800/1809 - loss 0.00380232 - time (sec): 158.54 - samples/sec: 2385.62 - lr: 0.000000 - momentum: 0.000000
2023-10-25 12:56:00,899 ----------------------------------------------------------------------------------------------------
2023-10-25 12:56:00,899 EPOCH 10 done: loss 0.0038 - lr: 0.000000
2023-10-25 12:56:06,221 DEV : loss 0.4174613654613495 - f1-score (micro avg) 0.6436
2023-10-25 12:56:06,807 ----------------------------------------------------------------------------------------------------
2023-10-25 12:56:06,808 Loading model from best epoch ...
2023-10-25 12:56:08,576 SequenceTagger predicts: Dictionary with 13 tags: O, S-loc, B-loc, E-loc, I-loc, S-pers, B-pers, E-pers, I-pers, S-org, B-org, E-org, I-org
2023-10-25 12:56:14,318 Results:
- F-score (micro) 0.673
- F-score (macro) 0.5346
- Accuracy 0.5199

By class:
              precision    recall  f1-score   support

         loc     0.6676    0.7716    0.7159       591
        pers     0.6022    0.7675    0.6749       357
         org     0.3023    0.1646    0.2131        79

   micro avg     0.6291    0.7235    0.6730      1027
   macro avg     0.5241    0.5679    0.5346      1027
weighted avg     0.6168    0.7235    0.6629      1027

2023-10-25 12:56:14,318 ----------------------------------------------------------------------------------------------------
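For reference, the macro F-score is the unweighted mean of the three per-class F1 values, (0.7159 + 0.6749 + 0.2131) / 3 ~ 0.5346, so the weak org class (support 79) drags it well below the micro average of 0.6730. A minimal sketch of loading the saved checkpoint and tagging a sentence with standard Flair calls (the example sentence is hypothetical):

from flair.data import Sentence
from flair.models import SequenceTagger

# best-model.pt is written under the training base path logged above.
tagger = SequenceTagger.load(
    "hmbench-letemps/fr-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs8-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-3/best-model.pt"
)

sentence = Sentence("Le Temps est un journal publié à Genève.")  # hypothetical input
tagger.predict(sentence)

# Spans are decoded from the 13-tag BIOES dictionary listed above (loc/pers/org).
for span in sentence.get_spans("ner"):
    print(span.text, span.get_label("ner").value, round(span.get_label("ner").score, 4))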