{ "_name_or_path": "/localdata/jamesbr/dev/pretrained_checkpoints/pytorch_bert_large_phase1/", "architectures": [ "PoptorchPipelinedBertForPretraining" ], "async_dataloader": true, "attention_probs_dropout_prob": 0.0, "auto_loss_scaling": false, "batch_size": 2, "batches_per_step": 1, "checkpoint_output_dir": "/localdata/jamesbr/dev/pretrained_checkpoints/pytorch_bert_large_phase2", "checkpoint_steps": null, "compile_only": false, "config": null, "custom_ops": true, "dataloader_workers": 64, "dataset": "pretraining", "disable_progress_bar": true, "embedding_serialization_factor": 2, "enable_half_first_order_momentum": false, "enable_half_partials": true, "executable_cache_dir": "", "file_buffer_size": 10, "global_batch_size": 16384, "gradient_accumulation": 2048, "gradient_checkpointing": false, "hidden_act": "gelu", "hidden_dropout_prob": 0.1, "hidden_size": 1024, "initializer_range": 0.02, "input_files": [ "data/wikipedia/384/*.tfrecord" ], "intermediate_size": 4096, "ipus_per_replica": 4, "layer_norm_eps": 0.001, "layers_per_ipu": [ 3, 7, 7, 7 ], "learning_rate": 0.002828, "loss_scaling": 8192.0, "lr_schedule": "linear", "lr_warmup": 0.128, "mask_tokens": 56, "matmul_proportion": [ 0.15, 0.25, 0.25, 0.25 ], "max_position_embeddings": 512, "model_type": "bert", "num_attention_heads": 16, "num_epochs": null, "num_hidden_layers": 24, "optimizer": "LAMB", "optimizer_state_offchip": true, "pad_token_id": 0, "position_embedding_type": "absolute", "pred_head_transform": true, "pretrained_checkpoint": "/localdata/jamesbr/dev/pretrained_checkpoints/pytorch_bert_large_phase1/", "profile": false, "profile_dir": "profile", "random_seed": 42, "recompute_checkpoint_every_layer": true, "replicated_tensor_sharding": true, "replication_factor": 4, "restore_steps_and_optimizer": false, "samples_per_step": 16384, "sdk_version": "poplar_sdk-ubuntu_18_04-2.3.0-EA.1+716-757737e247", "sequence_length": 384, "squad_do_training": true, "squad_do_validation": true, "synthetic_data": false, "training_steps": 2137, "transformers_version": "4.7.0", "type_vocab_size": 2, "use_cache": true, "use_popdist": false, "vocab_size": 30522, "wandb": true, "wandb_param_steps": null, "weight_decay": 0.01 }