2023-10-25 08:56:05,291 ----------------------------------------------------------------------------------------------------
2023-10-25 08:56:05,292 Model: "SequenceTagger(
(embeddings): TransformerWordEmbeddings(
(model): BertModel(
(embeddings): BertEmbeddings(
(word_embeddings): Embedding(64001, 768)
(position_embeddings): Embedding(512, 768)
(token_type_embeddings): Embedding(2, 768)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(encoder): BertEncoder(
(layer): ModuleList(
(0): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(1): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(2): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(3): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(4): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(5): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(6): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(7): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(8): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(9): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(10): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(11): BertLayer(
(attention): BertAttention(
(self): BertSelfAttention(
(query): Linear(in_features=768, out_features=768, bias=True)
(key): Linear(in_features=768, out_features=768, bias=True)
(value): Linear(in_features=768, out_features=768, bias=True)
(dropout): Dropout(p=0.1, inplace=False)
)
(output): BertSelfOutput(
(dense): Linear(in_features=768, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
(intermediate): BertIntermediate(
(dense): Linear(in_features=768, out_features=3072, bias=True)
(intermediate_act_fn): GELUActivation()
)
(output): BertOutput(
(dense): Linear(in_features=3072, out_features=768, bias=True)
(LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
(dropout): Dropout(p=0.1, inplace=False)
)
)
)
)
(pooler): BertPooler(
(dense): Linear(in_features=768, out_features=768, bias=True)
(activation): Tanh()
)
)
)
(locked_dropout): LockedDropout(p=0.5)
(linear): Linear(in_features=768, out_features=13, bias=True)
(loss_function): CrossEntropyLoss()
)"
2023-10-25 08:56:05,292 ----------------------------------------------------------------------------------------------------
2023-10-25 08:56:05,292 MultiCorpus: 14465 train + 1392 dev + 2432 test sentences
- NER_HIPE_2022 Corpus: 14465 train + 1392 dev + 2432 test sentences - /home/ubuntu/.flair/datasets/ner_hipe_2022/v2.1/letemps/fr/with_doc_seperator
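Loading this corpus is a one-liner in Flair; a sketch, assuming the NER_HIPE_2022 loader in flair.datasets and its dataset_name/language arguments:

from flair.datasets import NER_HIPE_2022

corpus = NER_HIPE_2022(dataset_name="letemps", language="fr")
print(corpus)  # expected: 14465 train + 1392 dev + 2432 test sentences

# Alternative to the hand-built dictionary in the sketch above:
# label_dict = corpus.make_label_dictionary(label_type="ner")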
2023-10-25 08:56:05,292 ----------------------------------------------------------------------------------------------------
2023-10-25 08:56:05,292 Train: 14465 sentences
2023-10-25 08:56:05,292 (train_with_dev=False, train_with_test=False)
2023-10-25 08:56:05,292 ----------------------------------------------------------------------------------------------------
2023-10-25 08:56:05,292 Training Params:
2023-10-25 08:56:05,292 - learning_rate: "3e-05"
2023-10-25 08:56:05,292 - mini_batch_size: "4"
2023-10-25 08:56:05,292 - max_epochs: "10"
2023-10-25 08:56:05,292 - shuffle: "True"
2023-10-25 08:56:05,292 ----------------------------------------------------------------------------------------------------
2023-10-25 08:56:05,292 Plugins:
2023-10-25 08:56:05,292 - TensorboardLogger
2023-10-25 08:56:05,292 - LinearScheduler | warmup_fraction: '0.1'
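The lr column in the per-iteration lines below follows this plugin directly: with 3617 iterations per epoch over 10 epochs (36,170 steps total) and warmup_fraction 0.1, the learning rate ramps up to 3e-05 across exactly epoch 1 and then decays linearly to zero. A minimal sketch of that schedule (Flair's own implementation may differ in detail):

def linear_schedule_lr(step, total_steps=36_170, warmup_fraction=0.1, peak_lr=3e-05):
    warmup_steps = int(total_steps * warmup_fraction)  # 3617 = exactly one epoch here
    if step < warmup_steps:
        return peak_lr * step / warmup_steps
    return peak_lr * (total_steps - step) / (total_steps - warmup_steps)

# Matches the log: step 361 -> ~3e-06 (epoch 1, iter 361); step 3610 -> ~3e-05;
# from epoch 2 onwards the value decays towards 0 at the end of epoch 10.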
2023-10-25 08:56:05,292 ----------------------------------------------------------------------------------------------------
2023-10-25 08:56:05,292 Final evaluation on model from best epoch (best-model.pt)
2023-10-25 08:56:05,292 - metric: "('micro avg', 'f1-score')"
2023-10-25 08:56:05,292 ----------------------------------------------------------------------------------------------------
2023-10-25 08:56:05,292 Computation:
2023-10-25 08:56:05,292 - compute on device: cuda:0
2023-10-25 08:56:05,292 - embedding storage: none
2023-10-25 08:56:05,292 ----------------------------------------------------------------------------------------------------
2023-10-25 08:56:05,292 Model training base path: "hmbench-letemps/fr-dbmdz/bert-base-historic-multilingual-64k-td-cased-bs4-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-1"
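Putting the pieces together, the run was presumably launched with something like the call below. ModelTrainer.fine_tune applies AdamW plus the linear warmup schedule by default; the exact invocation (including how the TensorboardLogger plugin was wired in) is an assumption, since the script itself is not in this log:

from flair.trainers import ModelTrainer

# tagger and corpus as in the sketches above
trainer = ModelTrainer(tagger, corpus)
trainer.fine_tune(
    "hmbench-letemps/fr-dbmdz/bert-base-historic-multilingual-64k-td-cased"
    "-bs4-wsFalse-e10-lr3e-05-poolingfirst-layers-1-crfFalse-1",
    learning_rate=3e-05,
    mini_batch_size=4,
    max_epochs=10,  # shuffle=True is the default
)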
2023-10-25 08:56:05,293 ----------------------------------------------------------------------------------------------------
2023-10-25 08:56:05,293 ----------------------------------------------------------------------------------------------------
2023-10-25 08:56:05,293 Logging anything other than scalars to TensorBoard is currently not supported.
2023-10-25 08:56:27,757 epoch 1 - iter 361/3617 - loss 1.29876432 - time (sec): 22.46 - samples/sec: 1685.27 - lr: 0.000003 - momentum: 0.000000
2023-10-25 08:56:50,077 epoch 1 - iter 722/3617 - loss 0.75385707 - time (sec): 44.78 - samples/sec: 1679.34 - lr: 0.000006 - momentum: 0.000000
2023-10-25 08:57:12,715 epoch 1 - iter 1083/3617 - loss 0.54794241 - time (sec): 67.42 - samples/sec: 1685.85 - lr: 0.000009 - momentum: 0.000000
2023-10-25 08:57:35,339 epoch 1 - iter 1444/3617 - loss 0.44521586 - time (sec): 90.05 - samples/sec: 1685.05 - lr: 0.000012 - momentum: 0.000000
2023-10-25 08:57:57,806 epoch 1 - iter 1805/3617 - loss 0.38171890 - time (sec): 112.51 - samples/sec: 1680.63 - lr: 0.000015 - momentum: 0.000000
2023-10-25 08:58:20,893 epoch 1 - iter 2166/3617 - loss 0.33646336 - time (sec): 135.60 - samples/sec: 1675.78 - lr: 0.000018 - momentum: 0.000000
2023-10-25 08:58:43,512 epoch 1 - iter 2527/3617 - loss 0.30293723 - time (sec): 158.22 - samples/sec: 1678.77 - lr: 0.000021 - momentum: 0.000000
2023-10-25 08:59:06,150 epoch 1 - iter 2888/3617 - loss 0.27974922 - time (sec): 180.86 - samples/sec: 1676.54 - lr: 0.000024 - momentum: 0.000000
2023-10-25 08:59:28,993 epoch 1 - iter 3249/3617 - loss 0.26111850 - time (sec): 203.70 - samples/sec: 1675.74 - lr: 0.000027 - momentum: 0.000000
2023-10-25 08:59:51,479 epoch 1 - iter 3610/3617 - loss 0.24710193 - time (sec): 226.19 - samples/sec: 1675.77 - lr: 0.000030 - momentum: 0.000000
2023-10-25 08:59:51,943 ----------------------------------------------------------------------------------------------------
2023-10-25 08:59:51,944 EPOCH 1 done: loss 0.2467 - lr: 0.000030
2023-10-25 08:59:56,474 DEV : loss 0.14493782818317413 - f1-score (micro avg) 0.5921
2023-10-25 08:59:56,496 saving best model
2023-10-25 08:59:56,967 ----------------------------------------------------------------------------------------------------
2023-10-25 09:00:19,557 epoch 2 - iter 361/3617 - loss 0.09481662 - time (sec): 22.59 - samples/sec: 1695.11 - lr: 0.000030 - momentum: 0.000000
2023-10-25 09:00:42,519 epoch 2 - iter 722/3617 - loss 0.10727292 - time (sec): 45.55 - samples/sec: 1694.09 - lr: 0.000029 - momentum: 0.000000
2023-10-25 09:01:05,260 epoch 2 - iter 1083/3617 - loss 0.10816822 - time (sec): 68.29 - samples/sec: 1694.24 - lr: 0.000029 - momentum: 0.000000
2023-10-25 09:01:27,920 epoch 2 - iter 1444/3617 - loss 0.10358701 - time (sec): 90.95 - samples/sec: 1688.21 - lr: 0.000029 - momentum: 0.000000
2023-10-25 09:01:50,585 epoch 2 - iter 1805/3617 - loss 0.10315773 - time (sec): 113.62 - samples/sec: 1685.42 - lr: 0.000028 - momentum: 0.000000
2023-10-25 09:02:13,160 epoch 2 - iter 2166/3617 - loss 0.10157426 - time (sec): 136.19 - samples/sec: 1679.24 - lr: 0.000028 - momentum: 0.000000
2023-10-25 09:02:35,597 epoch 2 - iter 2527/3617 - loss 0.10001100 - time (sec): 158.63 - samples/sec: 1675.89 - lr: 0.000028 - momentum: 0.000000
2023-10-25 09:02:58,307 epoch 2 - iter 2888/3617 - loss 0.09732052 - time (sec): 181.34 - samples/sec: 1677.76 - lr: 0.000027 - momentum: 0.000000
2023-10-25 09:03:21,029 epoch 2 - iter 3249/3617 - loss 0.09809576 - time (sec): 204.06 - samples/sec: 1675.59 - lr: 0.000027 - momentum: 0.000000
2023-10-25 09:03:43,417 epoch 2 - iter 3610/3617 - loss 0.09810723 - time (sec): 226.45 - samples/sec: 1674.16 - lr: 0.000027 - momentum: 0.000000
2023-10-25 09:03:43,852 ----------------------------------------------------------------------------------------------------
2023-10-25 09:03:43,852 EPOCH 2 done: loss 0.0980 - lr: 0.000027
2023-10-25 09:03:49,086 DEV : loss 0.1498355269432068 - f1-score (micro avg) 0.6537
2023-10-25 09:03:49,108 saving best model
2023-10-25 09:03:49,728 ----------------------------------------------------------------------------------------------------
2023-10-25 09:04:12,340 epoch 3 - iter 361/3617 - loss 0.08371286 - time (sec): 22.61 - samples/sec: 1661.67 - lr: 0.000026 - momentum: 0.000000
2023-10-25 09:04:35,198 epoch 3 - iter 722/3617 - loss 0.08266269 - time (sec): 45.47 - samples/sec: 1671.47 - lr: 0.000026 - momentum: 0.000000
2023-10-25 09:04:57,535 epoch 3 - iter 1083/3617 - loss 0.07533014 - time (sec): 67.81 - samples/sec: 1676.54 - lr: 0.000026 - momentum: 0.000000
2023-10-25 09:05:20,055 epoch 3 - iter 1444/3617 - loss 0.07921444 - time (sec): 90.33 - samples/sec: 1672.99 - lr: 0.000025 - momentum: 0.000000
2023-10-25 09:05:42,693 epoch 3 - iter 1805/3617 - loss 0.07689623 - time (sec): 112.96 - samples/sec: 1679.89 - lr: 0.000025 - momentum: 0.000000
2023-10-25 09:06:05,757 epoch 3 - iter 2166/3617 - loss 0.07594405 - time (sec): 136.03 - samples/sec: 1684.69 - lr: 0.000025 - momentum: 0.000000
2023-10-25 09:06:28,208 epoch 3 - iter 2527/3617 - loss 0.07505941 - time (sec): 158.48 - samples/sec: 1678.13 - lr: 0.000024 - momentum: 0.000000
2023-10-25 09:06:51,076 epoch 3 - iter 2888/3617 - loss 0.07488029 - time (sec): 181.35 - samples/sec: 1685.62 - lr: 0.000024 - momentum: 0.000000
2023-10-25 09:07:13,857 epoch 3 - iter 3249/3617 - loss 0.07618760 - time (sec): 204.13 - samples/sec: 1680.18 - lr: 0.000024 - momentum: 0.000000
2023-10-25 09:07:36,286 epoch 3 - iter 3610/3617 - loss 0.07650149 - time (sec): 226.56 - samples/sec: 1674.18 - lr: 0.000023 - momentum: 0.000000
2023-10-25 09:07:36,709 ----------------------------------------------------------------------------------------------------
2023-10-25 09:07:36,709 EPOCH 3 done: loss 0.0764 - lr: 0.000023
2023-10-25 09:07:41,464 DEV : loss 0.19308863580226898 - f1-score (micro avg) 0.6209
2023-10-25 09:07:41,486 ----------------------------------------------------------------------------------------------------
2023-10-25 09:08:04,147 epoch 4 - iter 361/3617 - loss 0.04740247 - time (sec): 22.66 - samples/sec: 1676.03 - lr: 0.000023 - momentum: 0.000000
2023-10-25 09:08:27,052 epoch 4 - iter 722/3617 - loss 0.04393513 - time (sec): 45.57 - samples/sec: 1694.68 - lr: 0.000023 - momentum: 0.000000
2023-10-25 09:08:49,468 epoch 4 - iter 1083/3617 - loss 0.04673719 - time (sec): 67.98 - samples/sec: 1670.64 - lr: 0.000022 - momentum: 0.000000
2023-10-25 09:09:12,096 epoch 4 - iter 1444/3617 - loss 0.04771808 - time (sec): 90.61 - samples/sec: 1670.66 - lr: 0.000022 - momentum: 0.000000
2023-10-25 09:09:34,752 epoch 4 - iter 1805/3617 - loss 0.04805421 - time (sec): 113.27 - samples/sec: 1672.54 - lr: 0.000022 - momentum: 0.000000
2023-10-25 09:09:57,527 epoch 4 - iter 2166/3617 - loss 0.04811747 - time (sec): 136.04 - samples/sec: 1675.32 - lr: 0.000021 - momentum: 0.000000
2023-10-25 09:10:20,097 epoch 4 - iter 2527/3617 - loss 0.04985558 - time (sec): 158.61 - samples/sec: 1672.88 - lr: 0.000021 - momentum: 0.000000
2023-10-25 09:10:43,113 epoch 4 - iter 2888/3617 - loss 0.04967931 - time (sec): 181.63 - samples/sec: 1666.35 - lr: 0.000021 - momentum: 0.000000
2023-10-25 09:11:05,900 epoch 4 - iter 3249/3617 - loss 0.04983072 - time (sec): 204.41 - samples/sec: 1665.34 - lr: 0.000020 - momentum: 0.000000
2023-10-25 09:11:28,853 epoch 4 - iter 3610/3617 - loss 0.05188277 - time (sec): 227.37 - samples/sec: 1667.33 - lr: 0.000020 - momentum: 0.000000
2023-10-25 09:11:29,293 ----------------------------------------------------------------------------------------------------
2023-10-25 09:11:29,293 EPOCH 4 done: loss 0.0518 - lr: 0.000020
2023-10-25 09:11:34,065 DEV : loss 0.25538942217826843 - f1-score (micro avg) 0.6376
2023-10-25 09:11:34,087 ----------------------------------------------------------------------------------------------------
2023-10-25 09:11:56,512 epoch 5 - iter 361/3617 - loss 0.03131284 - time (sec): 22.42 - samples/sec: 1629.91 - lr: 0.000020 - momentum: 0.000000
2023-10-25 09:12:19,313 epoch 5 - iter 722/3617 - loss 0.03223206 - time (sec): 45.22 - samples/sec: 1639.25 - lr: 0.000019 - momentum: 0.000000
2023-10-25 09:12:42,036 epoch 5 - iter 1083/3617 - loss 0.03088082 - time (sec): 67.95 - samples/sec: 1652.27 - lr: 0.000019 - momentum: 0.000000
2023-10-25 09:13:04,677 epoch 5 - iter 1444/3617 - loss 0.03409690 - time (sec): 90.59 - samples/sec: 1655.84 - lr: 0.000019 - momentum: 0.000000
2023-10-25 09:13:27,356 epoch 5 - iter 1805/3617 - loss 0.03218071 - time (sec): 113.27 - samples/sec: 1668.62 - lr: 0.000018 - momentum: 0.000000
2023-10-25 09:13:49,986 epoch 5 - iter 2166/3617 - loss 0.03391101 - time (sec): 135.90 - samples/sec: 1665.03 - lr: 0.000018 - momentum: 0.000000
2023-10-25 09:14:12,624 epoch 5 - iter 2527/3617 - loss 0.03493067 - time (sec): 158.54 - samples/sec: 1662.52 - lr: 0.000018 - momentum: 0.000000
2023-10-25 09:14:35,385 epoch 5 - iter 2888/3617 - loss 0.03495628 - time (sec): 181.30 - samples/sec: 1670.98 - lr: 0.000017 - momentum: 0.000000
2023-10-25 09:14:58,001 epoch 5 - iter 3249/3617 - loss 0.03497871 - time (sec): 203.91 - samples/sec: 1670.29 - lr: 0.000017 - momentum: 0.000000
2023-10-25 09:15:20,904 epoch 5 - iter 3610/3617 - loss 0.03564780 - time (sec): 226.82 - samples/sec: 1672.37 - lr: 0.000017 - momentum: 0.000000
2023-10-25 09:15:21,319 ----------------------------------------------------------------------------------------------------
2023-10-25 09:15:21,319 EPOCH 5 done: loss 0.0357 - lr: 0.000017
2023-10-25 09:15:26,608 DEV : loss 0.3036385476589203 - f1-score (micro avg) 0.6379
2023-10-25 09:15:26,630 ----------------------------------------------------------------------------------------------------
2023-10-25 09:15:49,295 epoch 6 - iter 361/3617 - loss 0.01822029 - time (sec): 22.66 - samples/sec: 1605.12 - lr: 0.000016 - momentum: 0.000000
2023-10-25 09:16:12,087 epoch 6 - iter 722/3617 - loss 0.02217639 - time (sec): 45.46 - samples/sec: 1662.06 - lr: 0.000016 - momentum: 0.000000
2023-10-25 09:16:34,918 epoch 6 - iter 1083/3617 - loss 0.02506345 - time (sec): 68.29 - samples/sec: 1664.01 - lr: 0.000016 - momentum: 0.000000
2023-10-25 09:16:57,390 epoch 6 - iter 1444/3617 - loss 0.02414606 - time (sec): 90.76 - samples/sec: 1655.24 - lr: 0.000015 - momentum: 0.000000
2023-10-25 09:17:20,050 epoch 6 - iter 1805/3617 - loss 0.02424517 - time (sec): 113.42 - samples/sec: 1662.94 - lr: 0.000015 - momentum: 0.000000
2023-10-25 09:17:42,507 epoch 6 - iter 2166/3617 - loss 0.02407469 - time (sec): 135.88 - samples/sec: 1663.03 - lr: 0.000015 - momentum: 0.000000
2023-10-25 09:18:05,243 epoch 6 - iter 2527/3617 - loss 0.02329897 - time (sec): 158.61 - samples/sec: 1665.13 - lr: 0.000014 - momentum: 0.000000
2023-10-25 09:18:28,017 epoch 6 - iter 2888/3617 - loss 0.02317000 - time (sec): 181.39 - samples/sec: 1670.08 - lr: 0.000014 - momentum: 0.000000
2023-10-25 09:18:50,676 epoch 6 - iter 3249/3617 - loss 0.02253595 - time (sec): 204.04 - samples/sec: 1670.18 - lr: 0.000014 - momentum: 0.000000
2023-10-25 09:19:13,356 epoch 6 - iter 3610/3617 - loss 0.02298512 - time (sec): 226.73 - samples/sec: 1671.45 - lr: 0.000013 - momentum: 0.000000
2023-10-25 09:19:13,810 ----------------------------------------------------------------------------------------------------
2023-10-25 09:19:13,810 EPOCH 6 done: loss 0.0230 - lr: 0.000013
2023-10-25 09:19:19,090 DEV : loss 0.3258330523967743 - f1-score (micro avg) 0.6394
2023-10-25 09:19:19,113 ----------------------------------------------------------------------------------------------------
2023-10-25 09:19:41,742 epoch 7 - iter 361/3617 - loss 0.01238420 - time (sec): 22.63 - samples/sec: 1691.85 - lr: 0.000013 - momentum: 0.000000
2023-10-25 09:20:04,086 epoch 7 - iter 722/3617 - loss 0.01141601 - time (sec): 44.97 - samples/sec: 1678.52 - lr: 0.000013 - momentum: 0.000000
2023-10-25 09:20:26,635 epoch 7 - iter 1083/3617 - loss 0.01410956 - time (sec): 67.52 - samples/sec: 1670.11 - lr: 0.000012 - momentum: 0.000000
2023-10-25 09:20:49,285 epoch 7 - iter 1444/3617 - loss 0.01436451 - time (sec): 90.17 - samples/sec: 1676.14 - lr: 0.000012 - momentum: 0.000000
2023-10-25 09:21:12,335 epoch 7 - iter 1805/3617 - loss 0.01504339 - time (sec): 113.22 - samples/sec: 1693.17 - lr: 0.000012 - momentum: 0.000000
2023-10-25 09:21:34,746 epoch 7 - iter 2166/3617 - loss 0.01505583 - time (sec): 135.63 - samples/sec: 1683.15 - lr: 0.000011 - momentum: 0.000000
2023-10-25 09:21:57,667 epoch 7 - iter 2527/3617 - loss 0.01548792 - time (sec): 158.55 - samples/sec: 1678.86 - lr: 0.000011 - momentum: 0.000000
2023-10-25 09:22:20,310 epoch 7 - iter 2888/3617 - loss 0.01540908 - time (sec): 181.20 - samples/sec: 1677.51 - lr: 0.000011 - momentum: 0.000000
2023-10-25 09:22:43,073 epoch 7 - iter 3249/3617 - loss 0.01583643 - time (sec): 203.96 - samples/sec: 1679.41 - lr: 0.000010 - momentum: 0.000000
2023-10-25 09:23:05,722 epoch 7 - iter 3610/3617 - loss 0.01543481 - time (sec): 226.61 - samples/sec: 1673.87 - lr: 0.000010 - momentum: 0.000000
2023-10-25 09:23:06,127 ----------------------------------------------------------------------------------------------------
2023-10-25 09:23:06,128 EPOCH 7 done: loss 0.0155 - lr: 0.000010
2023-10-25 09:23:10,894 DEV : loss 0.3687475621700287 - f1-score (micro avg) 0.6512
2023-10-25 09:23:10,917 ----------------------------------------------------------------------------------------------------
2023-10-25 09:23:34,320 epoch 8 - iter 361/3617 - loss 0.01011873 - time (sec): 23.40 - samples/sec: 1642.59 - lr: 0.000010 - momentum: 0.000000
2023-10-25 09:23:57,093 epoch 8 - iter 722/3617 - loss 0.01183084 - time (sec): 46.18 - samples/sec: 1647.32 - lr: 0.000009 - momentum: 0.000000
2023-10-25 09:24:19,987 epoch 8 - iter 1083/3617 - loss 0.01114849 - time (sec): 69.07 - samples/sec: 1675.33 - lr: 0.000009 - momentum: 0.000000
2023-10-25 09:24:42,267 epoch 8 - iter 1444/3617 - loss 0.01144658 - time (sec): 91.35 - samples/sec: 1671.61 - lr: 0.000009 - momentum: 0.000000
2023-10-25 09:25:04,971 epoch 8 - iter 1805/3617 - loss 0.01085694 - time (sec): 114.05 - samples/sec: 1671.04 - lr: 0.000008 - momentum: 0.000000
2023-10-25 09:25:27,776 epoch 8 - iter 2166/3617 - loss 0.01113943 - time (sec): 136.86 - samples/sec: 1670.33 - lr: 0.000008 - momentum: 0.000000
2023-10-25 09:25:50,272 epoch 8 - iter 2527/3617 - loss 0.01110272 - time (sec): 159.35 - samples/sec: 1665.95 - lr: 0.000008 - momentum: 0.000000
2023-10-25 09:26:13,117 epoch 8 - iter 2888/3617 - loss 0.01112695 - time (sec): 182.20 - samples/sec: 1667.86 - lr: 0.000007 - momentum: 0.000000
2023-10-25 09:26:35,738 epoch 8 - iter 3249/3617 - loss 0.01071467 - time (sec): 204.82 - samples/sec: 1667.74 - lr: 0.000007 - momentum: 0.000000
2023-10-25 09:26:58,274 epoch 8 - iter 3610/3617 - loss 0.01074639 - time (sec): 227.36 - samples/sec: 1668.14 - lr: 0.000007 - momentum: 0.000000
2023-10-25 09:26:58,691 ----------------------------------------------------------------------------------------------------
2023-10-25 09:26:58,691 EPOCH 8 done: loss 0.0107 - lr: 0.000007
2023-10-25 09:27:03,463 DEV : loss 0.38349881768226624 - f1-score (micro avg) 0.6433
2023-10-25 09:27:03,486 ----------------------------------------------------------------------------------------------------
2023-10-25 09:27:26,470 epoch 9 - iter 361/3617 - loss 0.00556864 - time (sec): 22.98 - samples/sec: 1698.80 - lr: 0.000006 - momentum: 0.000000
2023-10-25 09:27:49,214 epoch 9 - iter 722/3617 - loss 0.00783730 - time (sec): 45.73 - samples/sec: 1713.36 - lr: 0.000006 - momentum: 0.000000
2023-10-25 09:28:11,732 epoch 9 - iter 1083/3617 - loss 0.00688603 - time (sec): 68.25 - samples/sec: 1699.80 - lr: 0.000006 - momentum: 0.000000
2023-10-25 09:28:34,228 epoch 9 - iter 1444/3617 - loss 0.00661452 - time (sec): 90.74 - samples/sec: 1681.63 - lr: 0.000005 - momentum: 0.000000
2023-10-25 09:28:57,185 epoch 9 - iter 1805/3617 - loss 0.00671017 - time (sec): 113.70 - samples/sec: 1690.74 - lr: 0.000005 - momentum: 0.000000
2023-10-25 09:29:19,774 epoch 9 - iter 2166/3617 - loss 0.00667753 - time (sec): 136.29 - samples/sec: 1681.32 - lr: 0.000005 - momentum: 0.000000
2023-10-25 09:29:42,402 epoch 9 - iter 2527/3617 - loss 0.00799751 - time (sec): 158.92 - samples/sec: 1675.07 - lr: 0.000004 - momentum: 0.000000
2023-10-25 09:30:05,056 epoch 9 - iter 2888/3617 - loss 0.00813035 - time (sec): 181.57 - samples/sec: 1675.38 - lr: 0.000004 - momentum: 0.000000
2023-10-25 09:30:28,208 epoch 9 - iter 3249/3617 - loss 0.00804585 - time (sec): 204.72 - samples/sec: 1670.75 - lr: 0.000004 - momentum: 0.000000
2023-10-25 09:30:50,683 epoch 9 - iter 3610/3617 - loss 0.00784812 - time (sec): 227.20 - samples/sec: 1668.02 - lr: 0.000003 - momentum: 0.000000
2023-10-25 09:30:51,156 ----------------------------------------------------------------------------------------------------
2023-10-25 09:30:51,156 EPOCH 9 done: loss 0.0079 - lr: 0.000003
2023-10-25 09:30:55,937 DEV : loss 0.3988388478755951 - f1-score (micro avg) 0.6402
2023-10-25 09:30:55,959 ----------------------------------------------------------------------------------------------------
2023-10-25 09:31:18,550 epoch 10 - iter 361/3617 - loss 0.00169395 - time (sec): 22.59 - samples/sec: 1691.67 - lr: 0.000003 - momentum: 0.000000
2023-10-25 09:31:41,128 epoch 10 - iter 722/3617 - loss 0.00257176 - time (sec): 45.17 - samples/sec: 1691.41 - lr: 0.000003 - momentum: 0.000000
2023-10-25 09:32:03,905 epoch 10 - iter 1083/3617 - loss 0.00388498 - time (sec): 67.95 - samples/sec: 1670.61 - lr: 0.000002 - momentum: 0.000000
2023-10-25 09:32:26,672 epoch 10 - iter 1444/3617 - loss 0.00415693 - time (sec): 90.71 - samples/sec: 1674.51 - lr: 0.000002 - momentum: 0.000000
2023-10-25 09:32:49,198 epoch 10 - iter 1805/3617 - loss 0.00422595 - time (sec): 113.24 - samples/sec: 1665.99 - lr: 0.000002 - momentum: 0.000000
2023-10-25 09:33:11,815 epoch 10 - iter 2166/3617 - loss 0.00444188 - time (sec): 135.86 - samples/sec: 1665.22 - lr: 0.000001 - momentum: 0.000000
2023-10-25 09:33:34,466 epoch 10 - iter 2527/3617 - loss 0.00456308 - time (sec): 158.51 - samples/sec: 1659.07 - lr: 0.000001 - momentum: 0.000000
2023-10-25 09:33:57,358 epoch 10 - iter 2888/3617 - loss 0.00457433 - time (sec): 181.40 - samples/sec: 1663.72 - lr: 0.000001 - momentum: 0.000000
2023-10-25 09:34:20,142 epoch 10 - iter 3249/3617 - loss 0.00465404 - time (sec): 204.18 - samples/sec: 1668.71 - lr: 0.000000 - momentum: 0.000000
2023-10-25 09:34:42,847 epoch 10 - iter 3610/3617 - loss 0.00478068 - time (sec): 226.89 - samples/sec: 1672.23 - lr: 0.000000 - momentum: 0.000000
2023-10-25 09:34:43,247 ----------------------------------------------------------------------------------------------------
2023-10-25 09:34:43,247 EPOCH 10 done: loss 0.0048 - lr: 0.000000
2023-10-25 09:34:48,560 DEV : loss 0.42030808329582214 - f1-score (micro avg) 0.6507
2023-10-25 09:34:49,057 ----------------------------------------------------------------------------------------------------
2023-10-25 09:34:49,058 Loading model from best epoch ...
2023-10-25 09:34:50,737 SequenceTagger predicts: Dictionary with 13 tags: O, S-loc, B-loc, E-loc, I-loc, S-pers, B-pers, E-pers, I-pers, S-org, B-org, E-org, I-org
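The dictionary uses the BIOES scheme: S- for single-token entities, B-/I-/E- for the begin, inside, and end of multi-token ones, plus O for non-entities. A toy illustration of how such a tag sequence decodes into spans (the sentence is made up, not taken from the evaluated data):

tokens = ["Le", "Temps", ",", "Genève"]
tags   = ["B-org", "E-org", "O", "S-loc"]

spans, start = [], None
for i, tag in enumerate(tags):
    prefix, _, label = tag.partition("-")
    if prefix == "S":                            # single-token entity
        spans.append((label, tokens[i:i + 1]))
    elif prefix == "B":                          # entity opens
        start = i
    elif prefix == "E" and start is not None:    # entity closes
        spans.append((label, tokens[start:i + 1]))
        start = None

print(spans)  # [('org', ['Le', 'Temps']), ('loc', ['Genève'])]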
2023-10-25 09:34:56,439
Results:
- F-score (micro) 0.6562
- F-score (macro) 0.4469
- Accuracy 0.499
By class:
              precision    recall  f1-score   support

         loc     0.6340    0.8088    0.7108       591
        pers     0.5688    0.7059    0.6300       357
         org     0.0000    0.0000    0.0000        79

   micro avg     0.6093    0.7108    0.6562      1027
   macro avg     0.4009    0.5049    0.4469      1027
weighted avg     0.5626    0.7108    0.6280      1027
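The aggregate rows are consistent with the per-class scores; a quick check, with all values copied from the table above:

per_class = {
    "loc":  {"f1": 0.7108, "support": 591},
    "pers": {"f1": 0.6300, "support": 357},
    "org":  {"f1": 0.0000, "support": 79},
}
total = sum(c["support"] for c in per_class.values())  # 1027

macro_f1 = sum(c["f1"] for c in per_class.values()) / len(per_class)
weighted_f1 = sum(c["f1"] * c["support"] for c in per_class.values()) / total

# Micro F1 is the harmonic mean of the micro-averaged precision and recall.
p, r = 0.6093, 0.7108
micro_f1 = 2 * p * r / (p + r)

print(f"micro {micro_f1:.4f}  macro {macro_f1:.4f}  weighted {weighted_f1:.4f}")
# -> micro 0.6561  macro 0.4469  weighted 0.6280
#    (micro differs from the reported 0.6562 only because the printed
#     precision and recall are themselves rounded)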
2023-10-25 09:34:56,439 ----------------------------------------------------------------------------------------------------