diff --git "a/detailed_log.txt" "b/detailed_log.txt" new file mode 100644--- /dev/null +++ "b/detailed_log.txt" @@ -0,0 +1,9883 @@ +--------------------------------------------- +✓ cuDNN found, will run with flash-attention +✓ OpenMP found +✓ NCCL found, OK to train with multiple GPUs +✓ MPI enabled +✓ nvcc found, including GPU/CUDA support +--------------------------------------------- +/usr/local/cuda/bin//nvcc -O3 -t=0 --use_fast_math -std=c++17 --generate-code arch=compute_89,code=[compute_89,sm_89] -DENABLE_CUDNN -DMULTI_GPU -DUSE_MPI -DENABLE_BF16 train_gpt2.cu build/cudnn_att. +o -lcublas -lcublasLt -lcudnn -L/usr/lib/x86_64-linux-gnu/openmpi/lib/ -I/home/jrahn/cudnn-frontend/include -I/usr/lib/x86_64-linux-gnu/openmpi/include/ -lnccl -lmpi -o train_gpt2cu ++-----------------------+----------------------------------------------------+ +| Parameter | Value | ++-----------------------+----------------------------------------------------+ +| train data pattern | dev/data/edu_fineweb10B/edu_fineweb_train_*.bin | +| val data pattern | dev/data/edu_fineweb10B/edu_fineweb_val_*.bin | +| output log dir | log_gpt3_125M_edu_v4 | +| checkpoint_every | 500 | +| resume | 1 | +| micro batch size B | 16 | +| sequence length T | 2048 | +| total batch size | 524288 | +| LR scheduler | cosine | +| learning rate (LR) | 6.000000e-03 | +| warmup iterations | 1000 | +| final LR fraction | 1.000000e-01 | +| weight decay | 1.000000e-01 | +| skip update lossz | 5.000000 | +| skip update gradz | 5.000000 | +| max_steps | 18794 | +| val_loss_every | 250 | +| val_max_steps | 20 | +| sample_every | 1000 | +| genT | 144 | +| overfit_single_batch | 0 | +| use_master_weights | enabled | +| gelu_fusion | 1 | +| recompute | 0 | ++-----------------------+----------------------------------------------------+ +| device | NVIDIA GeForce RTX 4090 | +| peak TFlops | 165.2 | +| precision | BF16 | ++-----------------------+----------------------------------------------------+ +| weight init method | gpt3:c768 | +| max_sequence_length T | 2048 | +| vocab_size V | 50257 | +| padded_vocab_size Vp | 50304 | +| num_layers L | 12 | +| num_heads NH | 12 | +| channels C | 768 | +| num_parameters | 125262336 | ++-----------------------+----------------------------------------------------+ +| train_num_batches | 18794 | +| val_num_batches | 20 | ++-----------------------+----------------------------------------------------+ +| run hellaswag | yes | ++-----------------------+----------------------------------------------------+ +| num_processes | 2 | +| zero_stage | 1 | ++-----------------------+----------------------------------------------------+ +num_parameters: 125262336 => bytes: 250524672 +allocated 238 MiB for model parameters +batch_size B=16 * seq_len T=2048 * num_processes=2 and total_batch_size=524288 +=> setting grad_accum_steps=8 +created directory: log_gpt3_125M_edu_v4 +allocating 238 MiB for parameter gradients +allocating 12720 MiB for activations +allocating 238 MiB for AdamW optimizer state m +allocating 238 MiB for AdamW optimizer state v +allocating 238 MiB for master copy of params +device memory usage: 14465 MiB / 24210 MiB +memory per sequence: 795 MiB + -> estimated maximum batch size: 28 +val loss 10.958850 +step 1/18794 | loss 10.957932 (+nanz)| norm 14.8325 (+nanz)| lr 6.00e-06 | 2106.97 ms | 65.1% bf16 MFU | 248835 tok/s +step 2/18794 | loss 10.647232 (+nanz)| norm 11.6522 (+nanz)| lr 1.20e-05 | 1963.95 ms | 69.9% bf16 MFU | 266955 tok/s +step 3/18794 | loss 10.284852 (+nanz)| norm 8.0668 (+nanz)| lr 1.80e-05 | 1966.70 ms | 69.8% bf16 MFU | 266764 tok/s +step 4/18794 | loss 10.015448 (+nanz)| norm 5.3442 (+nanz)| lr 2.40e-05 | 1965.97 ms | 69.8% bf16 MFU | 266735 tok/s +step 5/18794 | loss 9.821831 (+nanz)| norm 3.6961 (+nanz)| lr 3.00e-05 | 1967.33 ms | 69.8% bf16 MFU | 266671 tok/s +step 6/18794 | loss 9.690645 (+nanz)| norm 2.8645 (+nanz)| lr 3.60e-05 | 1958.37 ms | 70.1% bf16 MFU | 266902 tok/s +step 7/18794 | loss 9.613273 (+nanz)| norm 2.4678 (+nanz)| lr 4.20e-05 | 1950.19 ms | 70.4% bf16 MFU | 267268 tok/s +step 8/18794 | loss 9.548866 (+nanz)| norm 2.3496 (+nanz)| lr 4.80e-05 | 1961.44 ms | 70.0% bf16 MFU | 267273 tok/s +step 9/18794 | loss 9.556955 (+nanz)| norm 2.1852 (+nanz)| lr 5.40e-05 | 1966.49 ms | 69.8% bf16 MFU | 267174 tok/s +step 10/18794 | loss 9.476960 (+nanz)| norm 2.1752 (+nanz)| lr 6.00e-05 | 1968.06 ms | 69.7% bf16 MFU | 267069 tok/s +step 11/18794 | loss 9.410295 (+nanz)| norm 2.1467 (+nanz)| lr 6.60e-05 | 1957.13 ms | 70.1% bf16 MFU | 267171 tok/s +step 12/18794 | loss 9.357857 (+nanz)| norm 2.0524 (+nanz)| lr 7.20e-05 | 1937.30 ms | 70.8% bf16 MFU | 267572 tok/s +step 13/18794 | loss 9.287427 (+nanz)| norm 2.0341 (+nanz)| lr 7.80e-05 | 1953.06 ms | 70.3% bf16 MFU | 267667 tok/s +step 14/18794 | loss 9.187832 (+nanz)| norm 1.9434 (+nanz)| lr 8.40e-05 | 1967.28 ms | 69.8% bf16 MFU | 267547 tok/s +step 15/18794 | loss 9.090523 (+nanz)| norm 1.9971 (+nanz)| lr 9.00e-05 | 1952.98 ms | 70.3% bf16 MFU | 267636 tok/s +step 16/18794 | loss 9.023990 (+nanz)| norm 2.6988 (+nanz)| lr 9.60e-05 | 1953.85 ms | 70.2% bf16 MFU | 267701 tok/s +step 17/18794 | loss 8.978894 (+nanz)| norm 6.2132 (+nanz)| lr 1.02e-04 | 1953.14 ms | 70.3% bf16 MFU | 267766 tok/s +step 18/18794 | loss 8.867225 (+nanz)| norm 1.8514 (+nanz)| lr 1.08e-04 | 1960.11 ms | 70.0% bf16 MFU | 267742 tok/s +step 19/18794 | loss 8.767125 (+nanz)| norm 2.1801 (+nanz)| lr 1.14e-04 | 1963.53 ms | 69.9% bf16 MFU | 267681 tok/s +step 20/18794 | loss 8.670886 (+nanz)| norm 1.7281 (+nanz)| lr 1.20e-04 | 1961.18 ms | 70.0% bf16 MFU | 267653 tok/s +step 21/18794 | loss 8.580118 (+nanz)| norm 1.6662 (+nanz)| lr 1.26e-04 | 1958.28 ms | 70.1% bf16 MFU | 267659 tok/s +step 22/18794 | loss 8.484600 (+nanz)| norm 3.6865 (+nanz)| lr 1.32e-04 | 1968.52 ms | 69.7% bf16 MFU | 267559 tok/s +step 23/18794 | loss 8.429165 (+nanz)| norm 1.4634 (+nanz)| lr 1.38e-04 | 1961.05 ms | 70.0% bf16 MFU | 267543 tok/s +step 24/18794 | loss 8.309236 (+nanz)| norm 1.4821 (+nanz)| lr 1.44e-04 | 1965.63 ms | 69.8% bf16 MFU | 267485 tok/s +step 25/18794 | loss 8.250509 (+nanz)| norm 1.4547 (+nanz)| lr 1.50e-04 | 1967.20 ms | 69.8% bf16 MFU | 267416 tok/s +step 26/18794 | loss 8.184395 (+nanz)| norm 1.2386 (+nanz)| lr 1.56e-04 | 1975.82 ms | 69.5% bf16 MFU | 267273 tok/s +step 27/18794 | loss 8.078316 (+nanz)| norm 1.8914 (+nanz)| lr 1.62e-04 | 1968.07 ms | 69.7% bf16 MFU | 267214 tok/s +step 28/18794 | loss 8.013226 (+nanz)| norm 1.4886 (+nanz)| lr 1.68e-04 | 1972.47 ms | 69.6% bf16 MFU | 267120 tok/s +step 29/18794 | loss 7.917800 (+nanz)| norm 1.1583 (+nanz)| lr 1.74e-04 | 1969.44 ms | 69.7% bf16 MFU | 267060 tok/s +step 30/18794 | loss 7.846300 (+nanz)| norm 1.0771 (+nanz)| lr 1.80e-04 | 1970.14 ms | 69.7% bf16 MFU | 266999 tok/s +step 31/18794 | loss 7.825881 (+nanz)| norm 0.9824 (+nanz)| lr 1.86e-04 | 1966.91 ms | 69.8% bf16 MFU | 266971 tok/s +step 32/18794 | loss 7.674847 (+nanz)| norm 1.3876 (+nanz)| lr 1.92e-04 | 1976.05 ms | 69.4% bf16 MFU | 266867 tok/s +step 33/18794 | loss 7.631570 (+nanz)| norm 0.8591 (+nanz)| lr 1.98e-04 | 1959.71 ms | 70.0% bf16 MFU | 266909 tok/s +step 34/18794 | loss 7.591294 (+nanz)| norm 0.7261 (+nanz)| lr 2.04e-04 | 1965.28 ms | 69.8% bf16 MFU | 266900 tok/s +step 35/18794 | loss 7.500128 (+nanz)| norm 0.9628 (+nanz)| lr 2.10e-04 | 1986.03 ms | 69.1% bf16 MFU | 266724 tok/s +step 36/18794 | loss 7.459189 (+nanz)| norm 0.7117 (+nanz)| lr 2.16e-04 | 1968.22 ms | 69.7% bf16 MFU | 266703 tok/s +step 37/18794 | loss 7.436276 (+nanz)| norm 0.6327 (+nanz)| lr 2.22e-04 | 1973.53 ms | 69.5% bf16 MFU | 266641 tok/s +step 38/18794 | loss 7.358282 (+nanz)| norm 0.8081 (+nanz)| lr 2.28e-04 | 1965.92 ms | 69.8% bf16 MFU | 266644 tok/s +step 39/18794 | loss 7.358347 (+nanz)| norm 0.9673 (+nanz)| lr 2.34e-04 | 1969.65 ms | 69.7% bf16 MFU | 266617 tok/s +step 40/18794 | loss 7.339409 (+nanz)| norm 1.1227 (+nanz)| lr 2.40e-04 | 1973.52 ms | 69.5% bf16 MFU | 266562 tok/s +step 41/18794 | loss 7.274670 (+nanz)| norm 0.6873 (+nanz)| lr 2.46e-04 | 1969.86 ms | 69.7% bf16 MFU | 266538 tok/s +step 42/18794 | loss 7.291101 (+nanz)| norm 0.9747 (+nanz)| lr 2.52e-04 | 1978.30 ms | 69.4% bf16 MFU | 266452 tok/s +step 43/18794 | loss 7.171454 (+nanz)| norm 0.5175 (+nanz)| lr 2.58e-04 | 1974.11 ms | 69.5% bf16 MFU | 266403 tok/s +step 44/18794 | loss 7.230802 (+nanz)| norm 1.0031 (+nanz)| lr 2.64e-04 | 1960.01 ms | 70.0% bf16 MFU | 266464 tok/s +step 45/18794 | loss 7.206894 (+nanz)| norm 1.1178 (+nanz)| lr 2.70e-04 | 1962.84 ms | 69.9% bf16 MFU | 266500 tok/s +step 46/18794 | loss 7.213326 (+nanz)| norm 0.8976 (+nanz)| lr 2.76e-04 | 1974.45 ms | 69.5% bf16 MFU | 266446 tok/s +step 47/18794 | loss 7.136472 (+nanz)| norm 1.4114 (+nanz)| lr 2.82e-04 | 1977.38 ms | 69.4% bf16 MFU | 266374 tok/s +step 48/18794 | loss 7.145808 (+nanz)| norm 1.3279 (+nanz)| lr 2.88e-04 | 1970.00 ms | 69.7% bf16 MFU | 266361 tok/s +step 49/18794 | loss 7.110090 (+nanz)| norm 1.5498 (+nanz)| lr 2.94e-04 | 1992.14 ms | 68.9% bf16 MFU | 266187 tok/s +step 50/18794 | loss 7.068109 (+nanz)| norm 0.8171 (+nanz)| lr 3.00e-04 | 1979.46 ms | 69.3% bf16 MFU | 266115 tok/s +step 51/18794 | loss 7.060719 (+nanz)| norm 0.7178 (+nanz)| lr 3.06e-04 | 1971.71 ms | 69.6% bf16 MFU | 266104 tok/s +step 52/18794 | loss 6.977152 (+nanz)| norm 0.7924 (+nanz)| lr 3.12e-04 | 1960.82 ms | 70.0% bf16 MFU | 266173 tok/s +step 53/18794 | loss 7.044573 (+nanz)| norm 1.1689 (+nanz)| lr 3.18e-04 | 1966.29 ms | 69.8% bf16 MFU | 266198 tok/s +step 54/18794 | loss 6.935850 (+nanz)| norm 0.7291 (+nanz)| lr 3.24e-04 | 1980.39 ms | 69.3% bf16 MFU | 266120 tok/s +step 55/18794 | loss 6.922439 (+nanz)| norm 0.8902 (+nanz)| lr 3.30e-04 | 1973.55 ms | 69.5% bf16 MFU | 266095 tok/s +step 56/18794 | loss 6.942529 (+nanz)| norm 0.8514 (+nanz)| lr 3.36e-04 | 1968.19 ms | 69.7% bf16 MFU | 266110 tok/s +step 57/18794 | loss 6.928077 (+nanz)| norm 0.8583 (+nanz)| lr 3.42e-04 | 1972.05 ms | 69.6% bf16 MFU | 266097 tok/s +step 58/18794 | loss 6.882896 (+nanz)| norm 0.7840 (+nanz)| lr 3.48e-04 | 1980.29 ms | 69.3% bf16 MFU | 266026 tok/s +step 59/18794 | loss 6.904822 (+nanz)| norm 0.9601 (+nanz)| lr 3.54e-04 | 1979.34 ms | 69.3% bf16 MFU | 265966 tok/s +step 60/18794 | loss 6.844017 (+nanz)| norm 1.5510 (+nanz)| lr 3.60e-04 | 1978.65 ms | 69.4% bf16 MFU | 265913 tok/s +step 61/18794 | loss 6.847663 (+nanz)| norm 1.4161 (+nanz)| lr 3.66e-04 | 1986.01 ms | 69.1% bf16 MFU | 265813 tok/s +step 62/18794 | loss 6.798135 (+nanz)| norm 0.8004 (+nanz)| lr 3.72e-04 | 1979.22 ms | 69.3% bf16 MFU | 265765 tok/s +step 63/18794 | loss 6.809881 (+nanz)| norm 0.9810 (+nanz)| lr 3.78e-04 | 1983.75 ms | 69.2% bf16 MFU | 265688 tok/s +step 64/18794 | loss 6.785971 (+nanz)| norm 0.9197 (+nanz)| lr 3.84e-04 | 1981.66 ms | 69.3% bf16 MFU | 265630 tok/s +step 65/18794 | loss 6.765566 (+nanz)| norm 0.9862 (+nanz)| lr 3.90e-04 | 1981.60 ms | 69.3% bf16 MFU | 265575 tok/s +step 66/18794 | loss 6.753748 (+nanz)| norm 0.7280 (+nanz)| lr 3.96e-04 | 1985.34 ms | 69.1% bf16 MFU | 265497 tok/s +step 67/18794 | loss 6.669776 (+nanz)| norm 0.5994 (+nanz)| lr 4.02e-04 | 1979.93 ms | 69.3% bf16 MFU | 265461 tok/s +step 68/18794 | loss 6.708775 (+nanz)| norm 0.4639 (+nanz)| lr 4.08e-04 | 1976.87 ms | 69.4% bf16 MFU | 265448 tok/s +step 69/18794 | loss 6.680964 (+nanz)| norm 0.5768 (+nanz)| lr 4.14e-04 | 1988.66 ms | 69.0% bf16 MFU | 265355 tok/s +step 70/18794 | loss 6.668404 (+nanz)| norm 0.6937 (+nanz)| lr 4.20e-04 | 1981.75 ms | 69.2% bf16 MFU | 265314 tok/s +step 71/18794 | loss 6.654617 (+nanz)| norm 0.9221 (+nanz)| lr 4.26e-04 | 1985.24 ms | 69.1% bf16 MFU | 265251 tok/s +step 72/18794 | loss 6.686837 (+nanz)| norm 0.9637 (+nanz)| lr 4.32e-04 | 1974.74 ms | 69.5% bf16 MFU | 265264 tok/s +step 73/18794 | loss 6.670592 (+nanz)| norm 0.9296 (+nanz)| lr 4.38e-04 | 1978.19 ms | 69.4% bf16 MFU | 265252 tok/s +step 74/18794 | loss 6.589166 (+nanz)| norm 0.8237 (+nanz)| lr 4.44e-04 | 1976.39 ms | 69.4% bf16 MFU | 265253 tok/s +step 75/18794 | loss 6.604837 (+nanz)| norm 0.9459 (+nanz)| lr 4.50e-04 | 1987.33 ms | 69.1% bf16 MFU | 265180 tok/s +step 76/18794 | loss 6.640738 (+nanz)| norm 0.6654 (+nanz)| lr 4.56e-04 | 1977.30 ms | 69.4% bf16 MFU | 265178 tok/s +step 77/18794 | loss 6.631350 (+nanz)| norm 0.8150 (+nanz)| lr 4.62e-04 | 1974.51 ms | 69.5% bf16 MFU | 265196 tok/s +step 78/18794 | loss 6.629026 (+nanz)| norm 0.7011 (+nanz)| lr 4.68e-04 | 1982.45 ms | 69.2% bf16 MFU | 265159 tok/s +step 79/18794 | loss 6.534187 (+nanz)| norm 0.6185 (+nanz)| lr 4.74e-04 | 1983.07 ms | 69.2% bf16 MFU | 265119 tok/s +step 80/18794 | loss 6.658986 (+nanz)| norm 0.6389 (+nanz)| lr 4.80e-04 | 1978.75 ms | 69.4% bf16 MFU | 265111 tok/s +step 81/18794 | loss 6.527759 (+nanz)| norm 0.7542 (+nanz)| lr 4.86e-04 | 1979.48 ms | 69.3% bf16 MFU | 265099 tok/s +step 82/18794 | loss 6.540927 (+nanz)| norm 0.6284 (+nanz)| lr 4.92e-04 | 1975.00 ms | 69.5% bf16 MFU | 265117 tok/s +step 83/18794 | loss 6.551311 (+nanz)| norm 0.5422 (+nanz)| lr 4.98e-04 | 1981.27 ms | 69.3% bf16 MFU | 265092 tok/s +step 84/18794 | loss 6.481877 (+nanz)| norm 0.5236 (+nanz)| lr 5.04e-04 | 1980.87 ms | 69.3% bf16 MFU | 265071 tok/s +step 85/18794 | loss 6.483929 (+nanz)| norm 0.5102 (+nanz)| lr 5.10e-04 | 1989.62 ms | 69.0% bf16 MFU | 264992 tok/s +step 86/18794 | loss 6.485063 (+nanz)| norm 0.8189 (+nanz)| lr 5.16e-04 | 1976.46 ms | 69.4% bf16 MFU | 265006 tok/s +step 87/18794 | loss 6.407083 (+nanz)| norm 0.6851 (+nanz)| lr 5.22e-04 | 1988.04 ms | 69.0% bf16 MFU | 264941 tok/s +step 88/18794 | loss 6.493220 (+nanz)| norm 0.6791 (+nanz)| lr 5.28e-04 | 1981.51 ms | 69.3% bf16 MFU | 264923 tok/s +step 89/18794 | loss 6.501971 (+nanz)| norm 0.6648 (+nanz)| lr 5.34e-04 | 1974.33 ms | 69.5% bf16 MFU | 264955 tok/s +step 90/18794 | loss 6.470716 (+nanz)| norm 0.4423 (+nanz)| lr 5.40e-04 | 1991.05 ms | 68.9% bf16 MFU | 264872 tok/s +step 91/18794 | loss 6.436247 (+nanz)| norm 0.6337 (+nanz)| lr 5.46e-04 | 1974.49 ms | 69.5% bf16 MFU | 264906 tok/s +step 92/18794 | loss 6.457773 (+nanz)| norm 1.1615 (+nanz)| lr 5.52e-04 | 1977.37 ms | 69.4% bf16 MFU | 264918 tok/s +step 93/18794 | loss 6.436251 (+nanz)| norm 1.3972 (+nanz)| lr 5.58e-04 | 1970.38 ms | 69.6% bf16 MFU | 264976 tok/s +step 94/18794 | loss 6.432745 (+nanz)| norm 0.9421 (+nanz)| lr 5.64e-04 | 1977.37 ms | 69.4% bf16 MFU | 264985 tok/s +step 95/18794 | loss 6.414155 (+nanz)| norm 0.7087 (+nanz)| lr 5.70e-04 | 1983.88 ms | 69.2% bf16 MFU | 264949 tok/s +step 96/18794 | loss 6.437037 (+nanz)| norm 0.5508 (+nanz)| lr 5.76e-04 | 1993.15 ms | 68.9% bf16 MFU | 264853 tok/s +step 97/18794 | loss 6.355496 (+nanz)| norm 0.6069 (+nanz)| lr 5.82e-04 | 1987.74 ms | 69.0% bf16 MFU | 264798 tok/s +step 98/18794 | loss 6.348640 (+nanz)| norm 0.5535 (+nanz)| lr 5.88e-04 | 1978.85 ms | 69.3% bf16 MFU | 264805 tok/s +step 99/18794 | loss 6.366690 (+nanz)| norm 0.7484 (+nanz)| lr 5.94e-04 | 1988.20 ms | 69.0% bf16 MFU | 264750 tok/s +step 100/18794 | loss 6.386213 (+nanz)| norm 0.9887 (+nanz)| lr 6.00e-04 | 1986.17 ms | 69.1% bf16 MFU | 264711 tok/s +step 101/18794 | loss 6.336051 (-1.02z)| norm 0.9227 (-0.33z)| lr 6.06e-04 | 1979.59 ms | 69.3% bf16 MFU | 264717 tok/s +step 102/18794 | loss 6.360234 (-1.00z)| norm 0.9812 (-0.30z)| lr 6.12e-04 | 1971.39 ms | 69.6% bf16 MFU | 264779 tok/s +step 103/18794 | loss 6.452271 (-0.91z)| norm 1.1023 (-0.16z)| lr 6.18e-04 | 1972.76 ms | 69.6% bf16 MFU | 264829 tok/s +step 104/18794 | loss 6.359668 (-0.99z)| norm 0.8052 (-0.49z)| lr 6.24e-04 | 1979.82 ms | 69.3% bf16 MFU | 264828 tok/s +step 105/18794 | loss 6.330804 (-1.01z)| norm 0.7845 (-0.50z)| lr 6.30e-04 | 1988.98 ms | 69.0% bf16 MFU | 264766 tok/s +step 106/18794 | loss 6.371106 (-0.96z)| norm 0.7565 (-0.52z)| lr 6.36e-04 | 1993.86 ms | 68.8% bf16 MFU | 264675 tok/s +step 107/18794 | loss 6.333128 (-0.99z)| norm 0.5974 (-0.71z)| lr 6.42e-04 | 1981.87 ms | 69.2% bf16 MFU | 264668 tok/s +step 108/18794 | loss 6.343441 (-0.97z)| norm 1.2005 (+0.10z)| lr 6.48e-04 | 2002.28 ms | 68.5% bf16 MFU | 264527 tok/s +step 109/18794 | loss 6.311188 (-1.00z)| norm 1.0886 (-0.03z)| lr 6.54e-04 | 1986.92 ms | 69.1% bf16 MFU | 264494 tok/s +step 110/18794 | loss 6.351082 (-0.95z)| norm 0.7752 (-0.44z)| lr 6.60e-04 | 1968.40 ms | 69.7% bf16 MFU | 264587 tok/s +step 111/18794 | loss 6.372776 (-0.92z)| norm 0.9745 (-0.16z)| lr 6.66e-04 | 1993.57 ms | 68.8% bf16 MFU | 264507 tok/s +step 112/18794 | loss 6.363757 (-0.92z)| norm 1.1222 (+0.06z)| lr 6.72e-04 | 1972.83 ms | 69.6% bf16 MFU | 264569 tok/s +step 113/18794 | loss 6.296664 (-1.01z)| norm 0.6589 (-0.57z)| lr 6.78e-04 | 1982.02 ms | 69.2% bf16 MFU | 264567 tok/s +step 114/18794 | loss 6.356906 (-0.92z)| norm 0.8109 (-0.34z)| lr 6.84e-04 | 1988.21 ms | 69.0% bf16 MFU | 264524 tok/s +step 115/18794 | loss 6.267276 (-1.04z)| norm 0.6813 (-0.51z)| lr 6.90e-04 | 1993.05 ms | 68.9% bf16 MFU | 264450 tok/s +step 116/18794 | loss 6.247368 (-1.07z)| norm 0.5691 (-0.66z)| lr 6.96e-04 | 1976.20 ms | 69.4% bf16 MFU | 264493 tok/s +step 117/18794 | loss 6.255921 (-1.06z)| norm 0.4657 (-1.13z)| lr 7.02e-04 | 1973.56 ms | 69.5% bf16 MFU | 264551 tok/s +step 118/18794 | loss 6.315602 (-0.97z)| norm 0.6704 (-0.65z)| lr 7.08e-04 | 1982.46 ms | 69.2% bf16 MFU | 264547 tok/s +step 119/18794 | loss 6.299328 (-0.99z)| norm 0.9492 (+0.03z)| lr 7.14e-04 | 1981.61 ms | 69.3% bf16 MFU | 264548 tok/s +step 120/18794 | loss 6.216838 (-1.13z)| norm 1.0580 (+0.31z)| lr 7.20e-04 | 1996.58 ms | 68.7% bf16 MFU | 264450 tok/s +step 121/18794 | loss 6.225911 (-1.12z)| norm 0.8403 (-0.21z)| lr 7.26e-04 | 1997.76 ms | 68.7% bf16 MFU | 264349 tok/s +step 122/18794 | loss 6.271805 (-1.03z)| norm 0.4421 (-1.54z)| lr 7.32e-04 | 1973.97 ms | 69.5% bf16 MFU | 264412 tok/s +step 123/18794 | loss 6.162679 (-1.24z)| norm 0.5514 (-1.15z)| lr 7.38e-04 | 1990.71 ms | 68.9% bf16 MFU | 264360 tok/s +step 124/18794 | loss 6.277824 (-1.02z)| norm 0.5085 (-1.28z)| lr 7.44e-04 | 1989.53 ms | 69.0% bf16 MFU | 264318 tok/s +step 125/18794 | loss 6.180478 (-1.22z)| norm 0.5195 (-1.23z)| lr 7.50e-04 | 1996.47 ms | 68.7% bf16 MFU | 264232 tok/s +step 126/18794 | loss 6.256961 (-1.06z)| norm 0.5547 (-1.08z)| lr 7.56e-04 | 1978.34 ms | 69.4% bf16 MFU | 264271 tok/s +step 127/18794 | loss 6.232870 (-1.11z)| norm 0.7085 (-0.52z)| lr 7.62e-04 | 1990.92 ms | 68.9% bf16 MFU | 264224 tok/s +step 128/18794 | loss 6.203641 (-1.18z)| norm 0.6933 (-0.57z)| lr 7.68e-04 | 1974.05 ms | 69.5% bf16 MFU | 264293 tok/s +step 129/18794 | loss 6.177493 (-1.24z)| norm 0.5251 (-1.22z)| lr 7.74e-04 | 1999.50 ms | 68.6% bf16 MFU | 264189 tok/s +step 130/18794 | loss 6.142615 (-1.32z)| norm 0.4947 (-1.31z)| lr 7.80e-04 | 1981.50 ms | 69.3% bf16 MFU | 264209 tok/s +step 131/18794 | loss 6.116244 (-1.39z)| norm 0.5373 (-1.12z)| lr 7.86e-04 | 1975.63 ms | 69.5% bf16 MFU | 264267 tok/s +step 132/18794 | loss 6.148183 (-1.30z)| norm 0.5007 (-1.25z)| lr 7.92e-04 | 1988.61 ms | 69.0% bf16 MFU | 264236 tok/s +step 133/18794 | loss 6.139348 (-1.32z)| norm 0.5147 (-1.17z)| lr 7.98e-04 | 1990.87 ms | 68.9% bf16 MFU | 264192 tok/s +step 134/18794 | loss 6.250252 (-1.01z)| norm 0.8511 (+0.18z)| lr 8.04e-04 | 1975.73 ms | 69.5% bf16 MFU | 264250 tok/s +step 135/18794 | loss 6.174953 (-1.21z)| norm 1.1381 (+1.32z)| lr 8.10e-04 | 1991.07 ms | 68.9% bf16 MFU | 264204 tok/s +step 136/18794 | loss 6.138414 (-1.30z)| norm 0.7918 (-0.07z)| lr 8.16e-04 | 1985.92 ms | 69.1% bf16 MFU | 264194 tok/s +step 137/18794 | loss 6.127423 (-1.33z)| norm 0.6544 (-0.62z)| lr 8.22e-04 | 1973.50 ms | 69.5% bf16 MFU | 264267 tok/s +step 138/18794 | loss 6.141684 (-1.27z)| norm 0.5159 (-1.16z)| lr 8.28e-04 | 1979.20 ms | 69.3% bf16 MFU | 264299 tok/s +step 139/18794 | loss 6.113013 (-1.35z)| norm 0.5568 (-0.98z)| lr 8.34e-04 | 1982.88 ms | 69.2% bf16 MFU | 264304 tok/s +step 140/18794 | loss 6.139874 (-1.26z)| norm 0.4741 (-1.28z)| lr 8.40e-04 | 1975.50 ms | 69.5% bf16 MFU | 264359 tok/s +step 141/18794 | loss 6.153937 (-1.20z)| norm 0.8053 (+0.03z)| lr 8.46e-04 | 1971.01 ms | 69.6% bf16 MFU | 264441 tok/s +step 142/18794 | loss 6.155858 (-1.19z)| norm 0.6588 (-0.54z)| lr 8.52e-04 | 1975.23 ms | 69.5% bf16 MFU | 264490 tok/s +step 143/18794 | loss 6.134356 (-1.24z)| norm 0.5050 (-1.15z)| lr 8.58e-04 | 1992.02 ms | 68.9% bf16 MFU | 264426 tok/s +step 144/18794 | loss 6.078365 (-1.42z)| norm 0.4042 (-1.51z)| lr 8.64e-04 | 1983.59 ms | 69.2% bf16 MFU | 264420 tok/s +step 145/18794 | loss 6.098634 (-1.34z)| norm 0.4076 (-1.47z)| lr 8.70e-04 | 1974.77 ms | 69.5% bf16 MFU | 264474 tok/s +step 146/18794 | loss 6.125590 (-1.24z)| norm 0.4053 (-1.45z)| lr 8.76e-04 | 1982.33 ms | 69.2% bf16 MFU | 264474 tok/s +step 147/18794 | loss 6.079147 (-1.39z)| norm 0.3991 (-1.46z)| lr 8.82e-04 | 1991.84 ms | 68.9% bf16 MFU | 264411 tok/s +step 148/18794 | loss 6.125799 (-1.21z)| norm 0.4094 (-1.40z)| lr 8.88e-04 | 1977.77 ms | 69.4% bf16 MFU | 264445 tok/s +step 149/18794 | loss 6.067526 (-1.42z)| norm 0.4239 (-1.36z)| lr 8.94e-04 | 1985.28 ms | 69.1% bf16 MFU | 264427 tok/s +step 150/18794 | loss 6.109659 (-1.25z)| norm 0.5446 (-0.84z)| lr 9.00e-04 | 1983.71 ms | 69.2% bf16 MFU | 264421 tok/s +step 151/18794 | loss 6.030828 (-1.54z)| norm 0.6375 (-0.44z)| lr 9.06e-04 | 1991.47 ms | 68.9% bf16 MFU | 264363 tok/s +step 152/18794 | loss 6.056729 (-1.42z)| norm 0.6769 (-0.27z)| lr 9.12e-04 | 1990.72 ms | 68.9% bf16 MFU | 264313 tok/s +step 153/18794 | loss 6.049511 (-1.44z)| norm 0.6895 (-0.20z)| lr 9.18e-04 | 1982.71 ms | 69.2% bf16 MFU | 264319 tok/s +step 154/18794 | loss 6.059674 (-1.38z)| norm 0.8159 (+0.33z)| lr 9.24e-04 | 1974.95 ms | 69.5% bf16 MFU | 264376 tok/s +step 155/18794 | loss 6.094950 (-1.22z)| norm 0.7297 (-0.03z)| lr 9.30e-04 | 1982.50 ms | 69.2% bf16 MFU | 264381 tok/s +step 156/18794 | loss 6.133526 (-1.05z)| norm 0.8273 (+0.39z)| lr 9.36e-04 | 1974.59 ms | 69.5% bf16 MFU | 264437 tok/s +step 157/18794 | loss 6.119155 (-1.10z)| norm 1.0360 (+1.27z)| lr 9.42e-04 | 1968.94 ms | 69.7% bf16 MFU | 264530 tok/s +step 158/18794 | loss 6.095830 (-1.19z)| norm 0.7803 (+0.18z)| lr 9.48e-04 | 1987.08 ms | 69.1% bf16 MFU | 264495 tok/s +step 159/18794 | loss 6.103395 (-1.15z)| norm 0.6308 (-0.44z)| lr 9.54e-04 | 1981.87 ms | 69.2% bf16 MFU | 264498 tok/s +step 160/18794 | loss 6.054783 (-1.36z)| norm 0.5318 (-0.87z)| lr 9.60e-04 | 1979.95 ms | 69.3% bf16 MFU | 264513 tok/s +step 161/18794 | loss 6.051837 (-1.36z)| norm 0.4026 (-1.47z)| lr 9.66e-04 | 1988.41 ms | 69.0% bf16 MFU | 264471 tok/s +step 162/18794 | loss 6.027490 (-1.45z)| norm 0.4514 (-1.22z)| lr 9.72e-04 | 1973.51 ms | 69.5% bf16 MFU | 264530 tok/s +step 163/18794 | loss 6.040817 (-1.38z)| norm 0.3968 (-1.44z)| lr 9.78e-04 | 1971.06 ms | 69.6% bf16 MFU | 264604 tok/s +step 164/18794 | loss 6.049406 (-1.32z)| norm 0.4797 (-1.03z)| lr 9.84e-04 | 1984.80 ms | 69.1% bf16 MFU | 264581 tok/s +step 165/18794 | loss 6.015545 (-1.47z)| norm 0.5089 (-0.88z)| lr 9.90e-04 | 1981.20 ms | 69.3% bf16 MFU | 264584 tok/s +step 166/18794 | loss 5.991831 (-1.57z)| norm 0.4120 (-1.31z)| lr 9.96e-04 | 1979.97 ms | 69.3% bf16 MFU | 264594 tok/s +step 167/18794 | loss 6.033115 (-1.34z)| norm 0.5018 (-0.88z)| lr 1.00e-03 | 1985.69 ms | 69.1% bf16 MFU | 264566 tok/s +step 168/18794 | loss 5.967680 (-1.65z)| norm 0.5790 (-0.53z)| lr 1.01e-03 | 1979.03 ms | 69.3% bf16 MFU | 264584 tok/s +step 169/18794 | loss 6.032029 (-1.31z)| norm 0.5851 (-0.50z)| lr 1.01e-03 | 1977.94 ms | 69.4% bf16 MFU | 264608 tok/s +step 170/18794 | loss 6.001100 (-1.45z)| norm 0.6450 (-0.22z)| lr 1.02e-03 | 1985.79 ms | 69.1% bf16 MFU | 264579 tok/s +step 171/18794 | loss 6.018784 (-1.33z)| norm 0.5501 (-0.65z)| lr 1.03e-03 | 1978.06 ms | 69.4% bf16 MFU | 264602 tok/s +step 172/18794 | loss 5.972751 (-1.56z)| norm 0.6392 (-0.22z)| lr 1.03e-03 | 1977.56 ms | 69.4% bf16 MFU | 264628 tok/s +step 173/18794 | loss 5.996784 (-1.42z)| norm 0.4880 (-0.91z)| lr 1.04e-03 | 1981.55 ms | 69.3% bf16 MFU | 264626 tok/s +step 174/18794 | loss 5.976997 (-1.50z)| norm 0.3812 (-1.39z)| lr 1.04e-03 | 1971.15 ms | 69.6% bf16 MFU | 264694 tok/s +step 175/18794 | loss 5.994715 (-1.38z)| norm 0.4207 (-1.18z)| lr 1.05e-03 | 1986.22 ms | 69.1% bf16 MFU | 264657 tok/s +step 176/18794 | loss 5.984017 (-1.43z)| norm 0.3970 (-1.27z)| lr 1.06e-03 | 1985.67 ms | 69.1% bf16 MFU | 264626 tok/s +step 177/18794 | loss 5.961663 (-1.54z)| norm 0.4128 (-1.17z)| lr 1.06e-03 | 1971.01 ms | 69.6% bf16 MFU | 264695 tok/s +step 178/18794 | loss 5.913654 (-1.79z)| norm 0.4255 (-1.09z)| lr 1.07e-03 | 1979.10 ms | 69.3% bf16 MFU | 264706 tok/s +step 179/18794 | loss 5.979064 (-1.39z)| norm 0.4949 (-0.76z)| lr 1.07e-03 | 1986.96 ms | 69.1% bf16 MFU | 264664 tok/s +step 180/18794 | loss 5.911456 (-1.77z)| norm 0.6287 (-0.14z)| lr 1.08e-03 | 1985.73 ms | 69.1% bf16 MFU | 264632 tok/s +step 181/18794 | loss 5.969683 (-1.40z)| norm 0.8489 (+0.87z)| lr 1.09e-03 | 1974.29 ms | 69.5% bf16 MFU | 264678 tok/s +step 182/18794 | loss 5.965319 (-1.41z)| norm 0.7023 (+0.19z)| lr 1.09e-03 | 1974.91 ms | 69.5% bf16 MFU | 264718 tok/s +step 183/18794 | loss 5.914716 (-1.69z)| norm 0.6302 (-0.15z)| lr 1.10e-03 | 1978.77 ms | 69.4% bf16 MFU | 264730 tok/s +step 184/18794 | loss 5.948279 (-1.46z)| norm 0.5987 (-0.30z)| lr 1.10e-03 | 1983.93 ms | 69.2% bf16 MFU | 264707 tok/s +step 185/18794 | loss 5.911939 (-1.65z)| norm 0.4506 (-0.98z)| lr 1.11e-03 | 1983.98 ms | 69.2% bf16 MFU | 264684 tok/s +step 186/18794 | loss 5.879314 (-1.82z)| norm 0.3700 (-1.32z)| lr 1.12e-03 | 1976.79 ms | 69.4% bf16 MFU | 264711 tok/s +step 187/18794 | loss 5.895102 (-1.68z)| norm 0.3417 (-1.42z)| lr 1.12e-03 | 1977.06 ms | 69.4% bf16 MFU | 264735 tok/s +step 188/18794 | loss 5.885301 (-1.71z)| norm 0.3205 (-1.48z)| lr 1.13e-03 | 1975.84 ms | 69.5% bf16 MFU | 264766 tok/s +step 189/18794 | loss 5.841142 (-1.95z)| norm 0.3730 (-1.22z)| lr 1.13e-03 | 1972.11 ms | 69.6% bf16 MFU | 264820 tok/s +step 190/18794 | loss 5.851277 (-1.85z)| norm 0.4581 (-0.85z)| lr 1.14e-03 | 1972.37 ms | 69.6% bf16 MFU | 264870 tok/s +step 191/18794 | loss 5.827429 (-1.95z)| norm 0.4556 (-0.85z)| lr 1.15e-03 | 1993.48 ms | 68.8% bf16 MFU | 264776 tok/s +step 192/18794 | loss 5.892002 (-1.53z)| norm 0.4496 (-0.86z)| lr 1.15e-03 | 2033.05 ms | 67.5% bf16 MFU | 264432 tok/s +step 193/18794 | loss 5.906301 (-1.42z)| norm 0.4756 (-0.75z)| lr 1.16e-03 | 2033.35 ms | 67.5% bf16 MFU | 264102 tok/s +step 194/18794 | loss 5.868995 (-1.63z)| norm 0.4946 (-0.64z)| lr 1.16e-03 | 2031.10 ms | 67.6% bf16 MFU | 263804 tok/s +step 195/18794 | loss 5.908206 (-1.36z)| norm 0.4215 (-0.98z)| lr 1.17e-03 | 2033.56 ms | 67.5% bf16 MFU | 263504 tok/s +step 196/18794 | loss 5.853652 (-1.67z)| norm 0.5268 (-0.47z)| lr 1.18e-03 | 2032.22 ms | 67.5% bf16 MFU | 263228 tok/s +step 197/18794 | loss 5.855215 (-1.63z)| norm 0.6436 (+0.10z)| lr 1.18e-03 | 2031.72 ms | 67.5% bf16 MFU | 262970 tok/s +step 198/18794 | loss 5.869034 (-1.51z)| norm 0.5833 (-0.20z)| lr 1.19e-03 | 2033.61 ms | 67.5% bf16 MFU | 262712 tok/s +step 199/18794 | loss 5.865210 (-1.51z)| norm 0.5883 (-0.16z)| lr 1.19e-03 | 2032.67 ms | 67.5% bf16 MFU | 262473 tok/s +step 200/18794 | loss 5.887379 (-1.34z)| norm 0.5197 (-0.48z)| lr 1.20e-03 | 2015.98 ms | 68.1% bf16 MFU | 262352 tok/s +step 201/18794 | loss 5.846540 (-1.57z)| norm 0.4016 (-1.04z)| lr 1.21e-03 | 2025.46 ms | 67.8% bf16 MFU | 262177 tok/s +step 202/18794 | loss 5.856839 (-1.48z)| norm 0.3984 (-1.04z)| lr 1.21e-03 | 2029.11 ms | 67.6% bf16 MFU | 261987 tok/s +step 203/18794 | loss 5.791916 (-1.87z)| norm 0.4404 (-0.82z)| lr 1.22e-03 | 2031.29 ms | 67.6% bf16 MFU | 261793 tok/s +step 204/18794 | loss 5.824479 (-1.63z)| norm 0.4522 (-0.75z)| lr 1.22e-03 | 2024.14 ms | 67.8% bf16 MFU | 261655 tok/s +step 205/18794 | loss 5.848730 (-1.45z)| norm 0.4671 (-0.65z)| lr 1.23e-03 | 2018.28 ms | 68.0% bf16 MFU | 261560 tok/s +step 206/18794 | loss 5.792617 (-1.78z)| norm 0.6003 (+0.05z)| lr 1.24e-03 | 2032.47 ms | 67.5% bf16 MFU | 261380 tok/s +step 207/18794 | loss 5.810371 (-1.63z)| norm 0.5232 (-0.35z)| lr 1.24e-03 | 2011.13 ms | 68.2% bf16 MFU | 261346 tok/s +step 208/18794 | loss 5.784674 (-1.77z)| norm 0.6079 (+0.13z)| lr 1.25e-03 | 2032.14 ms | 67.5% bf16 MFU | 261178 tok/s +step 209/18794 | loss 5.827262 (-1.46z)| norm 0.4830 (-0.55z)| lr 1.25e-03 | 2027.16 ms | 67.7% bf16 MFU | 261051 tok/s +step 210/18794 | loss 5.830685 (-1.42z)| norm 0.5614 (-0.09z)| lr 1.26e-03 | 2027.08 ms | 67.7% bf16 MFU | 260931 tok/s +step 211/18794 | loss 5.898026 (-0.96z)| norm 0.6010 (+0.17z)| lr 1.27e-03 | 2012.95 ms | 68.2% bf16 MFU | 260907 tok/s +step 212/18794 | loss 5.845257 (-1.30z)| norm 0.4540 (-0.70z)| lr 1.27e-03 | 2019.85 ms | 67.9% bf16 MFU | 260840 tok/s +step 213/18794 | loss 5.880744 (-1.04z)| norm 0.5332 (-0.20z)| lr 1.28e-03 | 2022.81 ms | 67.8% bf16 MFU | 260757 tok/s +step 214/18794 | loss 5.818168 (-1.46z)| norm 0.5305 (-0.20z)| lr 1.28e-03 | 2026.93 ms | 67.7% bf16 MFU | 260653 tok/s +step 215/18794 | loss 5.784591 (-1.67z)| norm 0.6125 (+0.32z)| lr 1.29e-03 | 2024.60 ms | 67.8% bf16 MFU | 260568 tok/s +step 216/18794 | loss 5.805360 (-1.49z)| norm 0.5569 (-0.03z)| lr 1.30e-03 | 2028.16 ms | 67.7% bf16 MFU | 260465 tok/s +step 217/18794 | loss 5.807807 (-1.45z)| norm 0.5120 (-0.32z)| lr 1.30e-03 | 2026.61 ms | 67.7% bf16 MFU | 260377 tok/s +step 218/18794 | loss 5.784245 (-1.60z)| norm 0.5695 (+0.06z)| lr 1.31e-03 | 2008.58 ms | 68.3% bf16 MFU | 260409 tok/s +step 219/18794 | loss 5.816504 (-1.35z)| norm 0.5728 (+0.10z)| lr 1.31e-03 | 2013.38 ms | 68.2% bf16 MFU | 260409 tok/s +step 220/18794 | loss 5.792661 (-1.49z)| norm 0.5585 (+0.05z)| lr 1.32e-03 | 2034.05 ms | 67.5% bf16 MFU | 260276 tok/s +step 221/18794 | loss 5.697205 (-2.13z)| norm 0.5237 (-0.18z)| lr 1.33e-03 | 2016.71 ms | 68.0% bf16 MFU | 260261 tok/s +step 222/18794 | loss 5.812505 (-1.28z)| norm 0.6778 (+0.90z)| lr 1.33e-03 | 2025.23 ms | 67.8% bf16 MFU | 260192 tok/s +step 223/18794 | loss 5.737046 (-1.79z)| norm 0.6179 (+0.47z)| lr 1.34e-03 | 2019.44 ms | 68.0% bf16 MFU | 260163 tok/s +step 224/18794 | loss 5.741765 (-1.73z)| norm 0.5171 (-0.25z)| lr 1.34e-03 | 2010.33 ms | 68.3% bf16 MFU | 260195 tok/s +step 225/18794 | loss 5.730837 (-1.77z)| norm 0.4856 (-0.47z)| lr 1.35e-03 | 2004.30 ms | 68.5% bf16 MFU | 260264 tok/s +step 226/18794 | loss 5.776760 (-1.42z)| norm 0.4848 (-0.47z)| lr 1.36e-03 | 2018.36 ms | 68.0% bf16 MFU | 260239 tok/s +step 227/18794 | loss 5.740831 (-1.65z)| norm 0.4232 (-0.88z)| lr 1.36e-03 | 2011.31 ms | 68.2% bf16 MFU | 260260 tok/s +step 228/18794 | loss 5.726169 (-1.73z)| norm 0.4067 (-0.98z)| lr 1.37e-03 | 2031.30 ms | 67.6% bf16 MFU | 260153 tok/s +step 229/18794 | loss 5.758032 (-1.46z)| norm 0.4013 (-1.01z)| lr 1.37e-03 | 2003.89 ms | 68.5% bf16 MFU | 260227 tok/s +step 230/18794 | loss 5.742394 (-1.55z)| norm 0.3998 (-1.01z)| lr 1.38e-03 | 2004.58 ms | 68.5% bf16 MFU | 260293 tok/s +step 231/18794 | loss 5.702116 (-1.80z)| norm 0.4327 (-0.76z)| lr 1.39e-03 | 2025.55 ms | 67.8% bf16 MFU | 260220 tok/s +step 232/18794 | loss 5.660011 (-2.05z)| norm 0.5409 (-0.01z)| lr 1.39e-03 | 2011.32 ms | 68.2% bf16 MFU | 260242 tok/s +step 233/18794 | loss 5.710970 (-1.65z)| norm 0.6863 (+0.99z)| lr 1.40e-03 | 2018.47 ms | 68.0% bf16 MFU | 260217 tok/s +step 234/18794 | loss 5.688470 (-1.79z)| norm 0.5776 (+0.26z)| lr 1.40e-03 | 2018.74 ms | 68.0% bf16 MFU | 260192 tok/s +step 235/18794 | loss 5.670560 (-1.88z)| norm 0.5324 (-0.02z)| lr 1.41e-03 | 2019.51 ms | 68.0% bf16 MFU | 260163 tok/s +step 236/18794 | loss 5.705377 (-1.59z)| norm 0.4802 (-0.42z)| lr 1.42e-03 | 2028.17 ms | 67.7% bf16 MFU | 260080 tok/s +step 237/18794 | loss 5.732061 (-1.37z)| norm 0.4944 (-0.29z)| lr 1.42e-03 | 2012.97 ms | 68.2% bf16 MFU | 260099 tok/s +step 238/18794 | loss 5.757503 (-1.16z)| norm 0.5101 (-0.16z)| lr 1.43e-03 | 2017.84 ms | 68.0% bf16 MFU | 260085 tok/s +step 239/18794 | loss 5.696414 (-1.58z)| norm 0.6307 (+0.81z)| lr 1.43e-03 | 2020.78 ms | 67.9% bf16 MFU | 260053 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.379935 +step 240/18794 | loss 5.733541 (-1.28z)| norm 0.8374 (+2.38z)| lr 1.44e-03 | 2012.71 ms | 68.2% bf16 MFU | 260075 tok/s +step 241/18794 | loss 5.727137 (-1.31z)| norm 0.7860 (+1.98z)| lr 1.45e-03 | 2011.37 ms | 68.2% bf16 MFU | 260104 tok/s +step 242/18794 | loss 5.731881 (-1.26z)| norm 0.7160 (+1.42z)| lr 1.45e-03 | 2018.17 ms | 68.0% bf16 MFU | 260088 tok/s +step 243/18794 | loss 5.697467 (-1.49z)| norm 0.5901 (+0.42z)| lr 1.46e-03 | 2009.04 ms | 68.3% bf16 MFU | 260132 tok/s +step 244/18794 | loss 5.716289 (-1.32z)| norm 0.6006 (+0.49z)| lr 1.46e-03 | 2020.34 ms | 67.9% bf16 MFU | 260101 tok/s +step 245/18794 | loss 5.661497 (-1.70z)| norm 0.6796 (+1.09z)| lr 1.47e-03 | 2020.49 ms | 67.9% bf16 MFU | 260070 tok/s +step 246/18794 | loss 5.658538 (-1.70z)| norm 0.4985 (-0.34z)| lr 1.48e-03 | 2021.24 ms | 67.9% bf16 MFU | 260036 tok/s +step 247/18794 | loss 5.676478 (-1.53z)| norm 0.5012 (-0.33z)| lr 1.48e-03 | 2018.91 ms | 68.0% bf16 MFU | 260019 tok/s +step 248/18794 | loss 5.662783 (-1.61z)| norm 0.4677 (-0.60z)| lr 1.49e-03 | 2000.79 ms | 68.6% bf16 MFU | 260120 tok/s +step 249/18794 | loss 5.645741 (-1.70z)| norm 0.4972 (-0.37z)| lr 1.49e-03 | 2003.12 ms | 68.5% bf16 MFU | 260200 tok/s +step 250/18794 | loss 5.683871 (-1.38z)| norm 0.3935 (-1.18z)| lr 1.50e-03 | 2031.40 ms | 67.6% bf16 MFU | 260095 tok/s +val loss 5.684191 +HellaSwag: 2417/10042 = 0.240689: 0/1256 +step 251/18794 | loss 5.622016 (-1.81z)| norm 0.4228 (-0.93z)| lr 1.51e-03 | 2014.32 ms | 68.1% bf16 MFU | 260104 tok/s +step 252/18794 | loss 5.622469 (-1.77z)| norm 0.4138 (-0.98z)| lr 1.51e-03 | 2020.50 ms | 67.9% bf16 MFU | 260073 tok/s +step 253/18794 | loss 5.653227 (-1.51z)| norm 0.4779 (-0.46z)| lr 1.52e-03 | 2027.19 ms | 67.7% bf16 MFU | 260001 tok/s +step 254/18794 | loss 5.572978 (-2.06z)| norm 0.5501 (+0.14z)| lr 1.52e-03 | 2005.59 ms | 68.4% bf16 MFU | 260072 tok/s +step 255/18794 | loss 5.640574 (-1.53z)| norm 0.7151 (+1.50z)| lr 1.53e-03 | 2005.57 ms | 68.4% bf16 MFU | 260139 tok/s +step 256/18794 | loss 5.633920 (-1.56z)| norm 0.6749 (+1.21z)| lr 1.54e-03 | 2013.03 ms | 68.2% bf16 MFU | 260154 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.378849 +step 257/18794 | loss 5.683317 (-1.17z)| norm 0.7916 (+2.38z)| lr 1.54e-03 | 2005.74 ms | 68.4% bf16 MFU | 260216 tok/s +step 258/18794 | loss 5.649968 (-1.41z)| norm 0.6011 (+0.69z)| lr 1.55e-03 | 2001.33 ms | 68.6% bf16 MFU | 260304 tok/s +step 259/18794 | loss 5.689627 (-1.08z)| norm 0.6478 (+1.12z)| lr 1.55e-03 | 2021.47 ms | 67.9% bf16 MFU | 260257 tok/s +step 260/18794 | loss 5.638161 (-1.47z)| norm 0.5981 (+0.65z)| lr 1.56e-03 | 2012.66 ms | 68.2% bf16 MFU | 260269 tok/s +step 261/18794 | loss 5.614759 (-1.63z)| norm 0.5498 (+0.19z)| lr 1.57e-03 | 2001.40 ms | 68.6% bf16 MFU | 260353 tok/s +step 262/18794 | loss 5.618372 (-1.58z)| norm 0.5238 (-0.06z)| lr 1.57e-03 | 1998.27 ms | 68.7% bf16 MFU | 260454 tok/s +step 263/18794 | loss 5.567494 (-1.95z)| norm 0.4990 (-0.30z)| lr 1.58e-03 | 2009.28 ms | 68.3% bf16 MFU | 260478 tok/s +step 264/18794 | loss 5.640225 (-1.34z)| norm 0.7255 (+1.78z)| lr 1.58e-03 | 1997.55 ms | 68.7% bf16 MFU | 260577 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.397469 +step 265/18794 | loss 5.676762 (-1.02z)| norm 0.8033 (+2.40z)| lr 1.59e-03 | 2020.60 ms | 67.9% bf16 MFU | 260522 tok/s +step 266/18794 | loss 5.622447 (-1.45z)| norm 0.5407 (+0.03z)| lr 1.60e-03 | 2002.09 ms | 68.5% bf16 MFU | 260589 tok/s +step 267/18794 | loss 5.592034 (-1.67z)| norm 0.5337 (-0.04z)| lr 1.60e-03 | 1998.24 ms | 68.7% bf16 MFU | 260679 tok/s +step 268/18794 | loss 5.608940 (-1.50z)| norm 0.4953 (-0.38z)| lr 1.61e-03 | 2006.21 ms | 68.4% bf16 MFU | 260711 tok/s +step 269/18794 | loss 5.597702 (-1.57z)| norm 0.4668 (-0.62z)| lr 1.61e-03 | 2019.55 ms | 68.0% bf16 MFU | 260656 tok/s +step 270/18794 | loss 5.511796 (-2.24z)| norm 0.4278 (-0.96z)| lr 1.62e-03 | 1998.20 ms | 68.7% bf16 MFU | 260742 tok/s +step 271/18794 | loss 5.531953 (-2.03z)| norm 0.4167 (-1.04z)| lr 1.63e-03 | 1999.00 ms | 68.7% bf16 MFU | 260819 tok/s +step 272/18794 | loss 5.582949 (-1.57z)| norm 0.5846 (+0.48z)| lr 1.63e-03 | 2016.73 ms | 68.0% bf16 MFU | 260776 tok/s +step 273/18794 | loss 5.577705 (-1.59z)| norm 0.7280 (+1.72z)| lr 1.64e-03 | 2005.54 ms | 68.4% bf16 MFU | 260809 tok/s +step 274/18794 | loss 5.634440 (-1.09z)| norm 0.7122 (+1.55z)| lr 1.64e-03 | 2027.63 ms | 67.7% bf16 MFU | 260697 tok/s +step 275/18794 | loss 5.587319 (-1.47z)| norm 0.5216 (-0.15z)| lr 1.65e-03 | 1995.70 ms | 68.8% bf16 MFU | 260797 tok/s +step 276/18794 | loss 5.604403 (-1.31z)| norm 0.7636 (+1.95z)| lr 1.66e-03 | 2002.83 ms | 68.5% bf16 MFU | 260846 tok/s +step 277/18794 | loss 5.601707 (-1.31z)| norm 0.7580 (+1.85z)| lr 1.66e-03 | 2002.67 ms | 68.5% bf16 MFU | 260894 tok/s +step 278/18794 | loss 5.584887 (-1.43z)| norm 0.7207 (+1.49z)| lr 1.67e-03 | 2014.52 ms | 68.1% bf16 MFU | 260862 tok/s +step 279/18794 | loss 5.567986 (-1.56z)| norm 0.6969 (+1.26z)| lr 1.67e-03 | 2007.63 ms | 68.4% bf16 MFU | 260876 tok/s +step 280/18794 | loss 5.519891 (-1.94z)| norm 0.6067 (+0.48z)| lr 1.68e-03 | 1999.98 ms | 68.6% bf16 MFU | 260940 tok/s +step 281/18794 | loss 5.513309 (-1.96z)| norm 0.5817 (+0.30z)| lr 1.69e-03 | 2007.33 ms | 68.4% bf16 MFU | 260952 tok/s +step 282/18794 | loss 5.534681 (-1.74z)| norm 0.5429 (-0.03z)| lr 1.69e-03 | 2003.52 ms | 68.5% bf16 MFU | 260988 tok/s +step 283/18794 | loss 5.487073 (-2.11z)| norm 0.4982 (-0.42z)| lr 1.70e-03 | 1997.51 ms | 68.7% bf16 MFU | 261063 tok/s +step 284/18794 | loss 5.544518 (-1.58z)| norm 0.5432 (-0.01z)| lr 1.70e-03 | 2009.38 ms | 68.3% bf16 MFU | 261055 tok/s +step 285/18794 | loss 5.549733 (-1.50z)| norm 0.6191 (+0.66z)| lr 1.71e-03 | 1997.99 ms | 68.7% bf16 MFU | 261123 tok/s +step 286/18794 | loss 5.586639 (-1.15z)| norm 0.7135 (+1.49z)| lr 1.72e-03 | 1998.88 ms | 68.7% bf16 MFU | 261181 tok/s +step 287/18794 | loss 5.572063 (-1.26z)| norm 0.7165 (+1.49z)| lr 1.72e-03 | 2007.15 ms | 68.4% bf16 MFU | 261183 tok/s +step 288/18794 | loss 5.619515 (-0.82z)| norm 0.6166 (+0.56z)| lr 1.73e-03 | 2012.12 ms | 68.2% bf16 MFU | 261152 tok/s +step 289/18794 | loss 5.561523 (-1.32z)| norm 0.5895 (+0.29z)| lr 1.73e-03 | 2009.48 ms | 68.3% bf16 MFU | 261140 tok/s +step 290/18794 | loss 5.543962 (-1.45z)| norm 0.5993 (+0.38z)| lr 1.74e-03 | 2002.97 ms | 68.5% bf16 MFU | 261170 tok/s +step 291/18794 | loss 5.519061 (-1.63z)| norm 0.5196 (-0.39z)| lr 1.75e-03 | 1999.08 ms | 68.6% bf16 MFU | 261225 tok/s +step 292/18794 | loss 5.541005 (-1.41z)| norm 0.4749 (-0.82z)| lr 1.75e-03 | 1999.93 ms | 68.6% bf16 MFU | 261272 tok/s +step 293/18794 | loss 5.571271 (-1.12z)| norm 0.5565 (-0.05z)| lr 1.76e-03 | 2003.84 ms | 68.5% bf16 MFU | 261290 tok/s +step 294/18794 | loss 5.488443 (-1.84z)| norm 0.5192 (-0.41z)| lr 1.76e-03 | 1996.98 ms | 68.7% bf16 MFU | 261353 tok/s +step 295/18794 | loss 5.514744 (-1.57z)| norm 0.5866 (+0.23z)| lr 1.77e-03 | 1989.36 ms | 69.0% bf16 MFU | 261462 tok/s +step 296/18794 | loss 5.482217 (-1.83z)| norm 0.6398 (+0.73z)| lr 1.78e-03 | 2004.98 ms | 68.4% bf16 MFU | 261464 tok/s +step 297/18794 | loss 5.502029 (-1.61z)| norm 0.5693 (+0.05z)| lr 1.78e-03 | 2006.44 ms | 68.4% bf16 MFU | 261456 tok/s +step 298/18794 | loss 5.465326 (-1.91z)| norm 0.4482 (-1.10z)| lr 1.79e-03 | 2020.85 ms | 67.9% bf16 MFU | 261355 tok/s +step 299/18794 | loss 5.457705 (-1.93z)| norm 0.3952 (-1.57z)| lr 1.79e-03 | 2001.99 ms | 68.5% bf16 MFU | 261381 tok/s +step 300/18794 | loss 5.425887 (-2.17z)| norm 0.4415 (-1.12z)| lr 1.80e-03 | 1989.85 ms | 69.0% bf16 MFU | 261486 tok/s +step 301/18794 | loss 5.471793 (-1.72z)| norm 0.3954 (-1.55z)| lr 1.81e-03 | 1996.80 ms | 68.7% bf16 MFU | 261540 tok/s +step 302/18794 | loss 5.416865 (-2.16z)| norm 0.4266 (-1.26z)| lr 1.81e-03 | 1997.95 ms | 68.7% bf16 MFU | 261584 tok/s +step 303/18794 | loss 5.453120 (-1.79z)| norm 0.5518 (-0.09z)| lr 1.82e-03 | 2010.77 ms | 68.2% bf16 MFU | 261542 tok/s +step 304/18794 | loss 5.473255 (-1.57z)| norm 0.7416 (+1.68z)| lr 1.82e-03 | 2003.65 ms | 68.5% bf16 MFU | 261548 tok/s +step 305/18794 | loss 5.499430 (-1.32z)| norm 0.7161 (+1.41z)| lr 1.83e-03 | 2003.25 ms | 68.5% bf16 MFU | 261556 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.379621 +step 306/18794 | loss 5.539678 (-0.94z)| norm 0.8289 (+2.38z)| lr 1.84e-03 | 2006.64 ms | 68.4% bf16 MFU | 261543 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.835547 +step 307/18794 | loss 5.491996 (-1.34z)| norm 0.8957 (+2.84z)| lr 1.84e-03 | 1990.01 ms | 69.0% bf16 MFU | 261638 tok/s +step 308/18794 | loss 5.513101 (-1.13z)| norm 0.7355 (+1.41z)| lr 1.85e-03 | 2003.77 ms | 68.5% bf16 MFU | 261639 tok/s +step 309/18794 | loss 5.456086 (-1.61z)| norm 0.6010 (+0.23z)| lr 1.85e-03 | 2003.17 ms | 68.5% bf16 MFU | 261644 tok/s +step 310/18794 | loss 5.443693 (-1.69z)| norm 0.5438 (-0.27z)| lr 1.86e-03 | 2010.22 ms | 68.3% bf16 MFU | 261602 tok/s +step 311/18794 | loss 5.448972 (-1.63z)| norm 0.5197 (-0.47z)| lr 1.87e-03 | 1999.09 ms | 68.6% bf16 MFU | 261635 tok/s +step 312/18794 | loss 5.472372 (-1.40z)| norm 0.5002 (-0.65z)| lr 1.87e-03 | 1990.83 ms | 68.9% bf16 MFU | 261721 tok/s +step 313/18794 | loss 5.452665 (-1.57z)| norm 0.5167 (-0.50z)| lr 1.88e-03 | 1987.96 ms | 69.0% bf16 MFU | 261821 tok/s +step 314/18794 | loss 5.455966 (-1.51z)| norm 0.6028 (+0.24z)| lr 1.88e-03 | 1998.00 ms | 68.7% bf16 MFU | 261851 tok/s +step 315/18794 | loss 5.449029 (-1.55z)| norm 0.7424 (+1.44z)| lr 1.89e-03 | 2012.98 ms | 68.2% bf16 MFU | 261781 tok/s +step 316/18794 | loss 5.495254 (-1.09z)| norm 0.5842 (+0.07z)| lr 1.90e-03 | 1994.89 ms | 68.8% bf16 MFU | 261832 tok/s +step 317/18794 | loss 5.447978 (-1.52z)| norm 0.6019 (+0.21z)| lr 1.90e-03 | 1990.47 ms | 68.9% bf16 MFU | 261911 tok/s +step 318/18794 | loss 5.437606 (-1.59z)| norm 0.7025 (+1.07z)| lr 1.91e-03 | 1981.69 ms | 69.2% bf16 MFU | 262043 tok/s +step 319/18794 | loss 5.387040 (-2.05z)| norm 0.6419 (+0.54z)| lr 1.91e-03 | 1991.29 ms | 68.9% bf16 MFU | 262106 tok/s +step 320/18794 | loss 5.418797 (-1.71z)| norm 0.6802 (+0.85z)| lr 1.92e-03 | 1992.57 ms | 68.9% bf16 MFU | 262157 tok/s +step 321/18794 | loss 5.446161 (-1.41z)| norm 0.6621 (+0.69z)| lr 1.93e-03 | 1996.60 ms | 68.7% bf16 MFU | 262178 tok/s +step 322/18794 | loss 5.421937 (-1.62z)| norm 0.6121 (+0.26z)| lr 1.93e-03 | 1989.49 ms | 69.0% bf16 MFU | 262246 tok/s +step 323/18794 | loss 5.371294 (-2.06z)| norm 0.5310 (-0.42z)| lr 1.94e-03 | 1992.32 ms | 68.9% bf16 MFU | 262291 tok/s +step 324/18794 | loss 5.408476 (-1.66z)| norm 0.5786 (-0.02z)| lr 1.94e-03 | 1998.35 ms | 68.7% bf16 MFU | 262295 tok/s +step 325/18794 | loss 5.392613 (-1.78z)| norm 0.5007 (-0.69z)| lr 1.95e-03 | 2000.85 ms | 68.6% bf16 MFU | 262282 tok/s +step 326/18794 | loss 5.413141 (-1.55z)| norm 0.5013 (-0.69z)| lr 1.96e-03 | 1991.71 ms | 68.9% bf16 MFU | 262329 tok/s +step 327/18794 | loss 5.400131 (-1.65z)| norm 0.6850 (+0.87z)| lr 1.96e-03 | 1997.48 ms | 68.7% bf16 MFU | 262337 tok/s +step 328/18794 | loss 5.379533 (-1.80z)| norm 0.6698 (+0.73z)| lr 1.97e-03 | 2003.09 ms | 68.5% bf16 MFU | 262307 tok/s +step 329/18794 | loss 5.367753 (-1.88z)| norm 0.5467 (-0.37z)| lr 1.97e-03 | 2003.87 ms | 68.5% bf16 MFU | 262273 tok/s +step 330/18794 | loss 5.363682 (-1.88z)| norm 0.6110 (+0.19z)| lr 1.98e-03 | 2012.40 ms | 68.2% bf16 MFU | 262186 tok/s +step 331/18794 | loss 5.359096 (-1.87z)| norm 0.4887 (-0.92z)| lr 1.99e-03 | 1993.48 ms | 68.8% bf16 MFU | 262227 tok/s +step 332/18794 | loss 5.359021 (-1.82z)| norm 0.3848 (-1.82z)| lr 1.99e-03 | 1996.41 ms | 68.7% bf16 MFU | 262246 tok/s +step 333/18794 | loss 5.325089 (-2.08z)| norm 0.3847 (-1.77z)| lr 2.00e-03 | 1985.98 ms | 69.1% bf16 MFU | 262334 tok/s +step 334/18794 | loss 5.377131 (-1.56z)| norm 0.4477 (-1.20z)| lr 2.00e-03 | 1986.29 ms | 69.1% bf16 MFU | 262415 tok/s +step 335/18794 | loss 5.339820 (-1.86z)| norm 0.5714 (-0.12z)| lr 2.01e-03 | 1994.36 ms | 68.8% bf16 MFU | 262438 tok/s +step 336/18794 | loss 5.336205 (-1.85z)| norm 0.7194 (+1.15z)| lr 2.02e-03 | 2002.18 ms | 68.5% bf16 MFU | 262409 tok/s +step 337/18794 | loss 5.345093 (-1.73z)| norm 0.6806 (+0.79z)| lr 2.02e-03 | 1980.02 ms | 69.3% bf16 MFU | 262528 tok/s +step 338/18794 | loss 5.285058 (-2.23z)| norm 0.6088 (+0.16z)| lr 2.03e-03 | 1989.23 ms | 69.0% bf16 MFU | 262580 tok/s +step 339/18794 | loss 5.365898 (-1.46z)| norm 0.6178 (+0.24z)| lr 2.03e-03 | 1989.95 ms | 69.0% bf16 MFU | 262624 tok/s +step 340/18794 | loss 5.325638 (-1.80z)| norm 0.5676 (-0.18z)| lr 2.04e-03 | 1985.43 ms | 69.1% bf16 MFU | 262697 tok/s +step 341/18794 | loss 5.300858 (-1.98z)| norm 0.5723 (-0.12z)| lr 2.05e-03 | 1990.70 ms | 68.9% bf16 MFU | 262730 tok/s +step 342/18794 | loss 5.252754 (-2.35z)| norm 0.4996 (-0.76z)| lr 2.05e-03 | 1976.89 ms | 69.4% bf16 MFU | 262854 tok/s +step 343/18794 | loss 5.316021 (-1.75z)| norm 0.5485 (-0.31z)| lr 2.06e-03 | 1983.50 ms | 69.2% bf16 MFU | 262928 tok/s +step 344/18794 | loss 5.233337 (-2.42z)| norm 0.4811 (-0.91z)| lr 2.06e-03 | 1984.37 ms | 69.2% bf16 MFU | 262992 tok/s +step 345/18794 | loss 5.244279 (-2.25z)| norm 0.4574 (-1.11z)| lr 2.07e-03 | 1989.42 ms | 69.0% bf16 MFU | 263019 tok/s +step 346/18794 | loss 5.207941 (-2.47z)| norm 0.4070 (-1.54z)| lr 2.08e-03 | 1991.72 ms | 68.9% bf16 MFU | 263030 tok/s +step 347/18794 | loss 5.216374 (-2.32z)| norm 0.3844 (-1.71z)| lr 2.08e-03 | 1992.14 ms | 68.9% bf16 MFU | 263037 tok/s +step 348/18794 | loss 5.261379 (-1.89z)| norm 0.4233 (-1.36z)| lr 2.09e-03 | 1993.79 ms | 68.8% bf16 MFU | 263033 tok/s +step 349/18794 | loss 5.230487 (-2.09z)| norm 0.5475 (-0.27z)| lr 2.09e-03 | 1999.02 ms | 68.6% bf16 MFU | 262995 tok/s +step 350/18794 | loss 5.298957 (-1.50z)| norm 0.5451 (-0.30z)| lr 2.10e-03 | 1990.32 ms | 68.9% bf16 MFU | 263016 tok/s +step 351/18794 | loss 5.209485 (-2.17z)| norm 0.4654 (-1.03z)| lr 2.11e-03 | 1986.22 ms | 69.1% bf16 MFU | 263064 tok/s +step 352/18794 | loss 5.180718 (-2.32z)| norm 0.4764 (-0.94z)| lr 2.11e-03 | 1990.16 ms | 69.0% bf16 MFU | 263082 tok/s +step 353/18794 | loss 5.156572 (-2.42z)| norm 0.5114 (-0.63z)| lr 2.12e-03 | 2003.33 ms | 68.5% bf16 MFU | 263014 tok/s +step 354/18794 | loss 5.254745 (-1.61z)| norm 0.5219 (-0.53z)| lr 2.12e-03 | 1988.58 ms | 69.0% bf16 MFU | 263046 tok/s +step 355/18794 | loss 5.187152 (-2.08z)| norm 0.5088 (-0.63z)| lr 2.13e-03 | 1983.61 ms | 69.2% bf16 MFU | 263109 tok/s +step 356/18794 | loss 5.196142 (-1.95z)| norm 0.4865 (-0.82z)| lr 2.14e-03 | 1983.38 ms | 69.2% bf16 MFU | 263170 tok/s +step 357/18794 | loss 5.215989 (-1.77z)| norm 0.4653 (-1.00z)| lr 2.14e-03 | 1984.93 ms | 69.1% bf16 MFU | 263219 tok/s +step 358/18794 | loss 5.194363 (-1.88z)| norm 0.4373 (-1.23z)| lr 2.15e-03 | 1981.51 ms | 69.3% bf16 MFU | 263287 tok/s +step 359/18794 | loss 5.145642 (-2.19z)| norm 0.5238 (-0.43z)| lr 2.15e-03 | 1990.34 ms | 68.9% bf16 MFU | 263294 tok/s +step 360/18794 | loss 5.125802 (-2.27z)| norm 0.5607 (-0.08z)| lr 2.16e-03 | 1989.88 ms | 69.0% bf16 MFU | 263303 tok/s +step 361/18794 | loss 5.162319 (-1.95z)| norm 0.6059 (+0.33z)| lr 2.17e-03 | 1982.94 ms | 69.2% bf16 MFU | 263358 tok/s +step 362/18794 | loss 5.200895 (-1.63z)| norm 0.5301 (-0.37z)| lr 2.17e-03 | 1993.35 ms | 68.8% bf16 MFU | 263341 tok/s +step 363/18794 | loss 5.147289 (-1.96z)| norm 0.4973 (-0.67z)| lr 2.18e-03 | 1977.77 ms | 69.4% bf16 MFU | 263428 tok/s +step 364/18794 | loss 5.175058 (-1.72z)| norm 0.6171 (+0.44z)| lr 2.18e-03 | 1997.11 ms | 68.7% bf16 MFU | 263383 tok/s +step 365/18794 | loss 5.190548 (-1.58z)| norm 0.6086 (+0.39z)| lr 2.19e-03 | 1983.51 ms | 69.2% bf16 MFU | 263430 tok/s +step 366/18794 | loss 5.199216 (-1.49z)| norm 0.5944 (+0.25z)| lr 2.20e-03 | 1977.67 ms | 69.4% bf16 MFU | 263514 tok/s +step 367/18794 | loss 5.145988 (-1.83z)| norm 0.5343 (-0.32z)| lr 2.20e-03 | 1985.52 ms | 69.1% bf16 MFU | 263541 tok/s +step 368/18794 | loss 5.146990 (-1.78z)| norm 0.5133 (-0.52z)| lr 2.21e-03 | 1982.23 ms | 69.2% bf16 MFU | 263588 tok/s +step 369/18794 | loss 5.146338 (-1.74z)| norm 0.5646 (-0.04z)| lr 2.21e-03 | 1983.65 ms | 69.2% bf16 MFU | 263624 tok/s +step 370/18794 | loss 5.184324 (-1.44z)| norm 0.5943 (+0.23z)| lr 2.22e-03 | 1992.38 ms | 68.9% bf16 MFU | 263600 tok/s +step 371/18794 | loss 5.173238 (-1.48z)| norm 0.6061 (+0.33z)| lr 2.23e-03 | 1982.53 ms | 69.2% bf16 MFU | 263643 tok/s +step 372/18794 | loss 5.160808 (-1.54z)| norm 0.5802 (+0.08z)| lr 2.23e-03 | 1982.73 ms | 69.2% bf16 MFU | 263682 tok/s +step 373/18794 | loss 5.137239 (-1.66z)| norm 0.5593 (-0.11z)| lr 2.24e-03 | 1982.05 ms | 69.2% bf16 MFU | 263724 tok/s +step 374/18794 | loss 5.158814 (-1.49z)| norm 0.5951 (+0.25z)| lr 2.24e-03 | 1977.40 ms | 69.4% bf16 MFU | 263795 tok/s +step 375/18794 | loss 5.147228 (-1.54z)| norm 0.6265 (+0.56z)| lr 2.25e-03 | 1983.67 ms | 69.2% bf16 MFU | 263820 tok/s +step 376/18794 | loss 5.170184 (-1.35z)| norm 0.7248 (+1.55z)| lr 2.26e-03 | 1977.97 ms | 69.4% bf16 MFU | 263882 tok/s +step 377/18794 | loss 5.148895 (-1.47z)| norm 0.6925 (+1.25z)| lr 2.26e-03 | 1983.47 ms | 69.2% bf16 MFU | 263905 tok/s +step 378/18794 | loss 5.215775 (-0.99z)| norm 0.6591 (+0.92z)| lr 2.27e-03 | 1983.92 ms | 69.2% bf16 MFU | 263923 tok/s +step 379/18794 | loss 5.169726 (-1.29z)| norm 0.7229 (+1.56z)| lr 2.27e-03 | 1983.72 ms | 69.2% bf16 MFU | 263941 tok/s +step 380/18794 | loss 5.155372 (-1.36z)| norm 0.7395 (+1.70z)| lr 2.28e-03 | 1975.26 ms | 69.5% bf16 MFU | 264016 tok/s +step 381/18794 | loss 5.140301 (-1.44z)| norm 0.7735 (+1.98z)| lr 2.29e-03 | 1978.23 ms | 69.4% bf16 MFU | 264066 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.191095 +step 382/18794 | loss 5.200845 (-0.99z)| norm 0.8032 (+2.19z)| lr 2.29e-03 | 2022.14 ms | 67.9% bf16 MFU | 263827 tok/s +step 383/18794 | loss 5.252527 (-0.61z)| norm 0.6672 (+0.87z)| lr 2.30e-03 | 2030.92 ms | 67.6% bf16 MFU | 263543 tok/s +step 384/18794 | loss 5.197436 (-0.98z)| norm 0.7660 (+1.77z)| lr 2.30e-03 | 2038.82 ms | 67.3% bf16 MFU | 263223 tok/s +step 385/18794 | loss 5.180929 (-1.08z)| norm 0.7887 (+1.93z)| lr 2.31e-03 | 2037.71 ms | 67.3% bf16 MFU | 262927 tok/s +step 386/18794 | loss 5.166327 (-1.17z)| norm 0.7505 (+1.56z)| lr 2.32e-03 | 2037.13 ms | 67.4% bf16 MFU | 262649 tok/s +step 387/18794 | loss 5.190500 (-0.98z)| norm 0.7953 (+1.95z)| lr 2.32e-03 | 2037.68 ms | 67.3% bf16 MFU | 262381 tok/s +step 388/18794 | loss 5.179419 (-1.04z)| norm 0.7531 (+1.53z)| lr 2.33e-03 | 2037.46 ms | 67.4% bf16 MFU | 262128 tok/s +step 389/18794 | loss 5.195993 (-0.90z)| norm 0.5379 (-0.40z)| lr 2.33e-03 | 2030.20 ms | 67.6% bf16 MFU | 261934 tok/s +step 390/18794 | loss 5.145897 (-1.26z)| norm 0.5427 (-0.36z)| lr 2.34e-03 | 2032.79 ms | 67.5% bf16 MFU | 261733 tok/s +step 391/18794 | loss 5.147607 (-1.23z)| norm 0.4915 (-0.81z)| lr 2.35e-03 | 2026.82 ms | 67.7% bf16 MFU | 261580 tok/s +step 392/18794 | loss 5.087079 (-1.66z)| norm 0.4685 (-1.02z)| lr 2.35e-03 | 2024.65 ms | 67.8% bf16 MFU | 261449 tok/s +step 393/18794 | loss 5.111802 (-1.45z)| norm 0.4336 (-1.31z)| lr 2.36e-03 | 2027.77 ms | 67.7% bf16 MFU | 261304 tok/s +step 394/18794 | loss 5.117350 (-1.38z)| norm 0.4577 (-1.09z)| lr 2.36e-03 | 2029.44 ms | 67.6% bf16 MFU | 261156 tok/s +step 395/18794 | loss 5.056396 (-1.81z)| norm 0.4423 (-1.20z)| lr 2.37e-03 | 2022.12 ms | 67.9% bf16 MFU | 261062 tok/s +step 396/18794 | loss 5.056265 (-1.77z)| norm 0.4724 (-0.92z)| lr 2.38e-03 | 2035.95 ms | 67.4% bf16 MFU | 260885 tok/s +step 397/18794 | loss 5.050602 (-1.78z)| norm 0.5113 (-0.57z)| lr 2.38e-03 | 2022.54 ms | 67.9% bf16 MFU | 260802 tok/s +step 398/18794 | loss 5.048429 (-1.75z)| norm 0.5435 (-0.30z)| lr 2.39e-03 | 2031.05 ms | 67.6% bf16 MFU | 260668 tok/s +step 399/18794 | loss 5.090292 (-1.40z)| norm 0.4758 (-0.91z)| lr 2.39e-03 | 2019.96 ms | 67.9% bf16 MFU | 260613 tok/s +step 400/18794 | loss 5.109375 (-1.23z)| norm 0.4185 (-1.41z)| lr 2.40e-03 | 2026.41 ms | 67.7% bf16 MFU | 260518 tok/s +step 401/18794 | loss 5.064665 (-1.54z)| norm 0.3527 (-1.98z)| lr 2.41e-03 | 2025.85 ms | 67.7% bf16 MFU | 260432 tok/s +step 402/18794 | loss 5.034820 (-1.72z)| norm 0.3547 (-1.94z)| lr 2.41e-03 | 2022.88 ms | 67.8% bf16 MFU | 260370 tok/s +step 403/18794 | loss 5.056283 (-1.53z)| norm 0.3403 (-2.00z)| lr 2.42e-03 | 2030.25 ms | 67.6% bf16 MFU | 260263 tok/s +step 404/18794 | loss 5.022710 (-1.75z)| norm 0.3905 (-1.54z)| lr 2.42e-03 | 2037.60 ms | 67.3% bf16 MFU | 260115 tok/s +step 405/18794 | loss 5.027251 (-1.68z)| norm 0.4162 (-1.30z)| lr 2.43e-03 | 2016.90 ms | 68.0% bf16 MFU | 260107 tok/s +step 406/18794 | loss 5.017587 (-1.73z)| norm 0.4482 (-1.01z)| lr 2.44e-03 | 2035.48 ms | 67.4% bf16 MFU | 259980 tok/s +step 407/18794 | loss 5.001162 (-1.82z)| norm 0.3952 (-1.48z)| lr 2.44e-03 | 2016.61 ms | 68.1% bf16 MFU | 259980 tok/s +step 408/18794 | loss 4.955299 (-2.13z)| norm 0.3888 (-1.50z)| lr 2.45e-03 | 2031.56 ms | 67.6% bf16 MFU | 259885 tok/s +step 409/18794 | loss 5.002597 (-1.73z)| norm 0.3858 (-1.50z)| lr 2.45e-03 | 2021.53 ms | 67.9% bf16 MFU | 259858 tok/s +step 410/18794 | loss 5.053460 (-1.32z)| norm 0.4030 (-1.32z)| lr 2.46e-03 | 2023.05 ms | 67.8% bf16 MFU | 259823 tok/s +step 411/18794 | loss 4.946524 (-2.08z)| norm 0.4415 (-0.97z)| lr 2.47e-03 | 2028.80 ms | 67.6% bf16 MFU | 259753 tok/s +step 412/18794 | loss 4.966113 (-1.90z)| norm 0.5341 (-0.16z)| lr 2.47e-03 | 2038.30 ms | 67.3% bf16 MFU | 259626 tok/s +step 413/18794 | loss 5.046289 (-1.27z)| norm 0.5684 (+0.14z)| lr 2.48e-03 | 2023.54 ms | 67.8% bf16 MFU | 259600 tok/s +step 414/18794 | loss 5.054710 (-1.18z)| norm 0.5858 (+0.30z)| lr 2.48e-03 | 2016.28 ms | 68.1% bf16 MFU | 259621 tok/s +step 415/18794 | loss 4.993882 (-1.63z)| norm 0.5586 (+0.07z)| lr 2.49e-03 | 2022.87 ms | 67.8% bf16 MFU | 259599 tok/s +step 416/18794 | loss 4.986835 (-1.66z)| norm 0.4587 (-0.81z)| lr 2.50e-03 | 2029.86 ms | 67.6% bf16 MFU | 259534 tok/s +step 417/18794 | loss 4.982957 (-1.67z)| norm 0.5169 (-0.28z)| lr 2.50e-03 | 2019.39 ms | 68.0% bf16 MFU | 259538 tok/s +step 418/18794 | loss 4.988125 (-1.60z)| norm 0.5204 (-0.23z)| lr 2.51e-03 | 2022.23 ms | 67.9% bf16 MFU | 259525 tok/s +step 419/18794 | loss 4.945721 (-1.89z)| norm 0.4892 (-0.50z)| lr 2.51e-03 | 2022.72 ms | 67.8% bf16 MFU | 259508 tok/s +step 420/18794 | loss 4.972808 (-1.65z)| norm 0.4376 (-0.95z)| lr 2.52e-03 | 2034.39 ms | 67.5% bf16 MFU | 259418 tok/s +step 421/18794 | loss 4.983346 (-1.55z)| norm 0.4436 (-0.88z)| lr 2.53e-03 | 2020.45 ms | 67.9% bf16 MFU | 259422 tok/s +step 422/18794 | loss 4.972611 (-1.61z)| norm 0.4215 (-1.06z)| lr 2.53e-03 | 2025.23 ms | 67.8% bf16 MFU | 259395 tok/s +step 423/18794 | loss 5.011748 (-1.27z)| norm 0.4613 (-0.69z)| lr 2.54e-03 | 2019.59 ms | 68.0% bf16 MFU | 259405 tok/s +step 424/18794 | loss 4.954322 (-1.71z)| norm 0.4678 (-0.62z)| lr 2.54e-03 | 2019.93 ms | 67.9% bf16 MFU | 259413 tok/s +step 425/18794 | loss 4.968606 (-1.56z)| norm 0.5720 (+0.31z)| lr 2.55e-03 | 2029.50 ms | 67.6% bf16 MFU | 259359 tok/s +step 426/18794 | loss 4.984251 (-1.42z)| norm 0.6013 (+0.57z)| lr 2.56e-03 | 2013.77 ms | 68.1% bf16 MFU | 259408 tok/s +step 427/18794 | loss 4.968798 (-1.52z)| norm 0.5829 (+0.41z)| lr 2.56e-03 | 2015.39 ms | 68.1% bf16 MFU | 259445 tok/s +step 428/18794 | loss 4.917060 (-1.92z)| norm 0.5114 (-0.22z)| lr 2.57e-03 | 2013.25 ms | 68.2% bf16 MFU | 259494 tok/s +step 429/18794 | loss 4.949207 (-1.62z)| norm 0.4831 (-0.47z)| lr 2.57e-03 | 2019.07 ms | 68.0% bf16 MFU | 259503 tok/s +step 430/18794 | loss 4.975273 (-1.38z)| norm 0.5720 (+0.34z)| lr 2.58e-03 | 2023.20 ms | 67.8% bf16 MFU | 259484 tok/s +step 431/18794 | loss 4.924659 (-1.78z)| norm 0.5612 (+0.24z)| lr 2.59e-03 | 2019.70 ms | 67.9% bf16 MFU | 259489 tok/s +step 432/18794 | loss 4.972746 (-1.35z)| norm 0.6453 (+0.99z)| lr 2.59e-03 | 2033.96 ms | 67.5% bf16 MFU | 259403 tok/s +step 433/18794 | loss 4.965122 (-1.39z)| norm 0.6418 (+0.94z)| lr 2.60e-03 | 2021.64 ms | 67.9% bf16 MFU | 259400 tok/s +step 434/18794 | loss 4.960112 (-1.42z)| norm 0.6239 (+0.76z)| lr 2.60e-03 | 2010.15 ms | 68.3% bf16 MFU | 259471 tok/s +step 435/18794 | loss 4.924661 (-1.70z)| norm 0.5479 (+0.05z)| lr 2.61e-03 | 2017.20 ms | 68.0% bf16 MFU | 259493 tok/s +step 436/18794 | loss 5.001094 (-1.01z)| norm 0.6654 (+1.16z)| lr 2.62e-03 | 2013.81 ms | 68.1% bf16 MFU | 259536 tok/s +step 437/18794 | loss 4.934471 (-1.59z)| norm 0.5958 (+0.52z)| lr 2.62e-03 | 2021.74 ms | 67.9% bf16 MFU | 259525 tok/s +step 438/18794 | loss 4.993465 (-1.03z)| norm 0.6084 (+0.64z)| lr 2.63e-03 | 2023.90 ms | 67.8% bf16 MFU | 259501 tok/s +step 439/18794 | loss 5.010374 (-0.87z)| norm 0.5657 (+0.24z)| lr 2.63e-03 | 2016.47 ms | 68.1% bf16 MFU | 259526 tok/s +step 440/18794 | loss 4.941997 (-1.49z)| norm 0.5824 (+0.40z)| lr 2.64e-03 | 2022.39 ms | 67.9% bf16 MFU | 259512 tok/s +step 441/18794 | loss 5.004111 (-0.89z)| norm 0.6642 (+1.15z)| lr 2.65e-03 | 2017.90 ms | 68.0% bf16 MFU | 259527 tok/s +step 442/18794 | loss 4.956699 (-1.32z)| norm 0.5482 (+0.06z)| lr 2.65e-03 | 2007.23 ms | 68.4% bf16 MFU | 259611 tok/s +step 443/18794 | loss 4.967956 (-1.19z)| norm 0.5516 (+0.09z)| lr 2.66e-03 | 2023.80 ms | 67.8% bf16 MFU | 259583 tok/s +step 444/18794 | loss 4.963030 (-1.22z)| norm 0.5060 (-0.34z)| lr 2.66e-03 | 2017.15 ms | 68.0% bf16 MFU | 259600 tok/s +step 445/18794 | loss 4.875178 (-2.03z)| norm 0.5077 (-0.33z)| lr 2.67e-03 | 2007.40 ms | 68.4% bf16 MFU | 259679 tok/s +step 446/18794 | loss 4.926902 (-1.49z)| norm 0.6125 (+0.64z)| lr 2.68e-03 | 2017.69 ms | 68.0% bf16 MFU | 259687 tok/s +step 447/18794 | loss 4.946258 (-1.28z)| norm 0.5916 (+0.43z)| lr 2.68e-03 | 2013.60 ms | 68.2% bf16 MFU | 259722 tok/s +step 448/18794 | loss 4.955804 (-1.17z)| norm 0.6351 (+0.83z)| lr 2.69e-03 | 2011.02 ms | 68.2% bf16 MFU | 259771 tok/s +step 449/18794 | loss 4.927568 (-1.41z)| norm 0.5241 (-0.23z)| lr 2.69e-03 | 2007.46 ms | 68.4% bf16 MFU | 259841 tok/s +step 450/18794 | loss 4.888379 (-1.78z)| norm 0.4649 (-0.79z)| lr 2.70e-03 | 1999.35 ms | 68.6% bf16 MFU | 259960 tok/s +step 451/18794 | loss 4.921796 (-1.42z)| norm 0.4163 (-1.24z)| lr 2.71e-03 | 2008.36 ms | 68.3% bf16 MFU | 260015 tok/s +step 452/18794 | loss 4.871396 (-1.86z)| norm 0.3828 (-1.54z)| lr 2.71e-03 | 2008.17 ms | 68.3% bf16 MFU | 260068 tok/s +step 453/18794 | loss 4.870020 (-1.83z)| norm 0.3694 (-1.63z)| lr 2.72e-03 | 2008.70 ms | 68.3% bf16 MFU | 260115 tok/s +step 454/18794 | loss 4.881065 (-1.69z)| norm 0.3816 (-1.49z)| lr 2.72e-03 | 2016.10 ms | 68.1% bf16 MFU | 260112 tok/s +step 455/18794 | loss 4.834404 (-2.08z)| norm 0.4266 (-1.06z)| lr 2.73e-03 | 2001.01 ms | 68.6% bf16 MFU | 260207 tok/s +step 456/18794 | loss 4.916978 (-1.26z)| norm 0.4735 (-0.63z)| lr 2.74e-03 | 2028.28 ms | 67.7% bf16 MFU | 260121 tok/s +step 457/18794 | loss 4.906874 (-1.34z)| norm 0.6032 (+0.54z)| lr 2.74e-03 | 2008.66 ms | 68.3% bf16 MFU | 260166 tok/s +step 458/18794 | loss 4.845137 (-1.88z)| norm 0.5936 (+0.44z)| lr 2.75e-03 | 2001.40 ms | 68.6% bf16 MFU | 260255 tok/s +step 459/18794 | loss 4.830215 (-1.96z)| norm 0.4918 (-0.49z)| lr 2.75e-03 | 2015.35 ms | 68.1% bf16 MFU | 260250 tok/s +step 460/18794 | loss 4.838639 (-1.83z)| norm 0.4726 (-0.65z)| lr 2.76e-03 | 2015.14 ms | 68.1% bf16 MFU | 260246 tok/s +step 461/18794 | loss 4.873362 (-1.48z)| norm 0.4918 (-0.47z)| lr 2.77e-03 | 2004.79 ms | 68.5% bf16 MFU | 260310 tok/s +step 462/18794 | loss 4.826913 (-1.86z)| norm 0.4068 (-1.23z)| lr 2.77e-03 | 2011.75 ms | 68.2% bf16 MFU | 260325 tok/s +step 463/18794 | loss 4.824348 (-1.83z)| norm 0.4159 (-1.13z)| lr 2.78e-03 | 2008.13 ms | 68.3% bf16 MFU | 260363 tok/s +step 464/18794 | loss 4.813902 (-1.88z)| norm 0.4523 (-0.79z)| lr 2.78e-03 | 1993.95 ms | 68.8% bf16 MFU | 260492 tok/s +step 465/18794 | loss 4.849277 (-1.53z)| norm 0.4803 (-0.52z)| lr 2.79e-03 | 2015.68 ms | 68.1% bf16 MFU | 260472 tok/s +step 466/18794 | loss 4.794618 (-1.97z)| norm 0.4577 (-0.71z)| lr 2.80e-03 | 2015.51 ms | 68.1% bf16 MFU | 260455 tok/s +step 467/18794 | loss 4.812098 (-1.77z)| norm 0.3831 (-1.36z)| lr 2.80e-03 | 2008.15 ms | 68.3% bf16 MFU | 260486 tok/s +step 468/18794 | loss 4.838734 (-1.50z)| norm 0.4574 (-0.69z)| lr 2.81e-03 | 2012.18 ms | 68.2% bf16 MFU | 260490 tok/s +step 469/18794 | loss 4.801888 (-1.78z)| norm 0.5592 (+0.22z)| lr 2.81e-03 | 2009.85 ms | 68.3% bf16 MFU | 260508 tok/s +step 470/18794 | loss 4.829907 (-1.50z)| norm 0.4900 (-0.39z)| lr 2.82e-03 | 2013.62 ms | 68.2% bf16 MFU | 260501 tok/s +step 471/18794 | loss 4.818987 (-1.57z)| norm 0.4141 (-1.05z)| lr 2.83e-03 | 2008.96 ms | 68.3% bf16 MFU | 260525 tok/s +step 472/18794 | loss 4.760488 (-2.02z)| norm 0.4356 (-0.84z)| lr 2.83e-03 | 2023.65 ms | 67.8% bf16 MFU | 260453 tok/s +step 473/18794 | loss 4.811631 (-1.54z)| norm 0.4655 (-0.56z)| lr 2.84e-03 | 2003.99 ms | 68.5% bf16 MFU | 260511 tok/s +step 474/18794 | loss 4.772402 (-1.84z)| norm 0.4585 (-0.61z)| lr 2.84e-03 | 1991.00 ms | 68.9% bf16 MFU | 260652 tok/s +step 475/18794 | loss 4.781573 (-1.72z)| norm 0.4482 (-0.69z)| lr 2.85e-03 | 1991.13 ms | 68.9% bf16 MFU | 260785 tok/s +step 476/18794 | loss 4.764314 (-1.82z)| norm 0.4587 (-0.58z)| lr 2.86e-03 | 2010.60 ms | 68.3% bf16 MFU | 260784 tok/s +step 477/18794 | loss 4.755266 (-1.85z)| norm 0.3634 (-1.41z)| lr 2.86e-03 | 2004.80 ms | 68.5% bf16 MFU | 260821 tok/s +step 478/18794 | loss 4.824397 (-1.25z)| norm 0.3628 (-1.39z)| lr 2.87e-03 | 1996.04 ms | 68.8% bf16 MFU | 260913 tok/s +step 479/18794 | loss 4.758991 (-1.77z)| norm 0.3988 (-1.05z)| lr 2.87e-03 | 2005.75 ms | 68.4% bf16 MFU | 260937 tok/s +step 480/18794 | loss 4.744379 (-1.85z)| norm 0.4607 (-0.47z)| lr 2.88e-03 | 2014.17 ms | 68.1% bf16 MFU | 260905 tok/s +step 481/18794 | loss 4.767738 (-1.62z)| norm 0.5890 (+0.76z)| lr 2.89e-03 | 2006.41 ms | 68.4% bf16 MFU | 260925 tok/s +step 482/18794 | loss 4.793625 (-1.38z)| norm 0.5442 (+0.37z)| lr 2.89e-03 | 2001.19 ms | 68.6% bf16 MFU | 260978 tok/s +step 483/18794 | loss 4.797338 (-1.35z)| norm 0.5562 (+0.51z)| lr 2.90e-03 | 2001.49 ms | 68.6% bf16 MFU | 261027 tok/s +step 484/18794 | loss 4.750348 (-1.73z)| norm 0.5708 (+0.70z)| lr 2.90e-03 | 1994.78 ms | 68.8% bf16 MFU | 261117 tok/s +step 485/18794 | loss 4.780175 (-1.45z)| norm 0.6257 (+1.34z)| lr 2.91e-03 | 2017.12 ms | 68.0% bf16 MFU | 261057 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.149148 +step 486/18794 | loss 4.788260 (-1.36z)| norm 0.6973 (+2.15z)| lr 2.92e-03 | 1996.02 ms | 68.8% bf16 MFU | 261137 tok/s +step 487/18794 | loss 4.795587 (-1.28z)| norm 0.5334 (+0.40z)| lr 2.92e-03 | 1988.18 ms | 69.0% bf16 MFU | 261266 tok/s +step 488/18794 | loss 4.749807 (-1.68z)| norm 0.6022 (+1.26z)| lr 2.93e-03 | 2011.18 ms | 68.2% bf16 MFU | 261237 tok/s +step 489/18794 | loss 4.798392 (-1.22z)| norm 0.5466 (+0.59z)| lr 2.93e-03 | 1999.35 ms | 68.6% bf16 MFU | 261286 tok/s +step 490/18794 | loss 4.778294 (-1.40z)| norm 0.4842 (-0.15z)| lr 2.94e-03 | 2010.66 ms | 68.3% bf16 MFU | 261260 tok/s +step 491/18794 | loss 4.729326 (-1.84z)| norm 0.4656 (-0.37z)| lr 2.95e-03 | 2004.58 ms | 68.5% bf16 MFU | 261274 tok/s +step 492/18794 | loss 4.760635 (-1.51z)| norm 0.4786 (-0.22z)| lr 2.95e-03 | 1994.09 ms | 68.8% bf16 MFU | 261356 tok/s +step 493/18794 | loss 4.738129 (-1.70z)| norm 0.4553 (-0.50z)| lr 2.96e-03 | 1987.77 ms | 69.0% bf16 MFU | 261476 tok/s +step 494/18794 | loss 4.779846 (-1.27z)| norm 0.4362 (-0.73z)| lr 2.96e-03 | 1996.99 ms | 68.7% bf16 MFU | 261529 tok/s +step 495/18794 | loss 4.723882 (-1.79z)| norm 0.4345 (-0.75z)| lr 2.97e-03 | 1997.20 ms | 68.7% bf16 MFU | 261579 tok/s +step 496/18794 | loss 4.780063 (-1.21z)| norm 0.6089 (+1.32z)| lr 2.98e-03 | 2013.70 ms | 68.1% bf16 MFU | 261518 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.002488 +step 497/18794 | loss 4.735669 (-1.61z)| norm 0.6709 (+2.00z)| lr 2.98e-03 | 1992.72 ms | 68.9% bf16 MFU | 261597 tok/s +step 498/18794 | loss 4.761840 (-1.33z)| norm 0.5488 (+0.57z)| lr 2.99e-03 | 2019.87 ms | 67.9% bf16 MFU | 261495 tok/s +step 499/18794 | loss 4.791490 (-1.02z)| norm 0.5407 (+0.47z)| lr 2.99e-03 | 1994.56 ms | 68.8% bf16 MFU | 261563 tok/s +step 500/18794 | loss 4.835478 (-0.57z)| norm 0.5660 (+0.75z)| lr 3.00e-03 | 1990.08 ms | 69.0% bf16 MFU | 261658 tok/s +val loss 4.760242 +HellaSwag: 2479/10042 = 0.246863Swag: 990/1256: 0/1256 +Writing state to log_gpt3_125M_edu_v4/state_00000500_00001.bin +Writing checkpoint at step 500 +Writing model to log_gpt3_125M_edu_v4/model_00000500.bin +Writing state to log_gpt3_125M_edu_v4/state_00000500_00000.bin +step 501/18794 | loss 4.762667 (-1.29z)| norm 0.4610 (-0.50z)| lr 3.01e-03 | 1998.72 ms | 68.7% bf16 MFU | 261691 tok/s +step 502/18794 | loss 4.707852 (-1.82z)| norm 0.4156 (-1.05z)| lr 3.01e-03 | 2008.29 ms | 68.3% bf16 MFU | 261659 tok/s +step 503/18794 | loss 4.679261 (-2.06z)| norm 0.4180 (-1.05z)| lr 3.02e-03 | 2013.44 ms | 68.2% bf16 MFU | 261596 tok/s +step 504/18794 | loss 4.713466 (-1.67z)| norm 0.4112 (-1.14z)| lr 3.02e-03 | 1993.15 ms | 68.9% bf16 MFU | 261668 tok/s +step 505/18794 | loss 4.721491 (-1.56z)| norm 0.3818 (-1.48z)| lr 3.03e-03 | 2004.57 ms | 68.5% bf16 MFU | 261662 tok/s +step 506/18794 | loss 4.679281 (-1.94z)| norm 0.3938 (-1.32z)| lr 3.04e-03 | 2001.96 ms | 68.5% bf16 MFU | 261674 tok/s +step 507/18794 | loss 4.680508 (-1.87z)| norm 0.4627 (-0.50z)| lr 3.04e-03 | 1996.39 ms | 68.7% bf16 MFU | 261721 tok/s +step 508/18794 | loss 4.732440 (-1.33z)| norm 0.5493 (+0.53z)| lr 3.05e-03 | 1992.48 ms | 68.9% bf16 MFU | 261791 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.046763 +step 509/18794 | loss 4.755693 (-1.07z)| norm 0.6776 (+2.05z)| lr 3.05e-03 | 2002.83 ms | 68.5% bf16 MFU | 261790 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.319504 +step 510/18794 | loss 4.803032 (-0.58z)| norm 0.7069 (+2.32z)| lr 3.06e-03 | 2007.80 ms | 68.3% bf16 MFU | 261757 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.978562 +step 511/18794 | loss 4.824780 (-0.35z)| norm 0.7769 (+2.98z)| lr 3.07e-03 | 2000.01 ms | 68.6% bf16 MFU | 261776 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.398039 +step 512/18794 | loss 4.801355 (-0.58z)| norm 0.7342 (+2.40z)| lr 3.07e-03 | 1991.07 ms | 68.9% bf16 MFU | 261854 tok/s +step 513/18794 | loss 4.720808 (-1.38z)| norm 0.4773 (-0.43z)| lr 3.08e-03 | 2008.74 ms | 68.3% bf16 MFU | 261811 tok/s +step 514/18794 | loss 4.698964 (-1.59z)| norm 0.5166 (+0.01z)| lr 3.08e-03 | 1993.76 ms | 68.8% bf16 MFU | 261869 tok/s +step 515/18794 | loss 4.713140 (-1.41z)| norm 0.4651 (-0.55z)| lr 3.09e-03 | 1986.35 ms | 69.1% bf16 MFU | 261973 tok/s +step 516/18794 | loss 4.685336 (-1.66z)| norm 0.4115 (-1.13z)| lr 3.10e-03 | 1985.32 ms | 69.1% bf16 MFU | 262078 tok/s +step 517/18794 | loss 4.680099 (-1.68z)| norm 0.3667 (-1.59z)| lr 3.10e-03 | 1986.07 ms | 69.1% bf16 MFU | 262173 tok/s +step 518/18794 | loss 4.646480 (-1.98z)| norm 0.4087 (-1.11z)| lr 3.11e-03 | 1987.93 ms | 69.0% bf16 MFU | 262251 tok/s +step 519/18794 | loss 4.731759 (-1.08z)| norm 0.3434 (-1.78z)| lr 3.11e-03 | 1994.93 ms | 68.8% bf16 MFU | 262279 tok/s +step 520/18794 | loss 4.592619 (-2.42z)| norm 0.3682 (-1.49z)| lr 3.12e-03 | 2009.38 ms | 68.3% bf16 MFU | 262211 tok/s +step 521/18794 | loss 4.667464 (-1.63z)| norm 0.4675 (-0.45z)| lr 3.13e-03 | 1993.50 ms | 68.8% bf16 MFU | 262251 tok/s +step 522/18794 | loss 4.668019 (-1.59z)| norm 0.4511 (-0.62z)| lr 3.13e-03 | 1997.48 ms | 68.7% bf16 MFU | 262262 tok/s +step 523/18794 | loss 4.606981 (-2.15z)| norm 0.3931 (-1.22z)| lr 3.14e-03 | 1994.38 ms | 68.8% bf16 MFU | 262293 tok/s +step 524/18794 | loss 4.688198 (-1.31z)| norm 0.3643 (-1.50z)| lr 3.14e-03 | 1994.87 ms | 68.8% bf16 MFU | 262319 tok/s +step 525/18794 | loss 4.603275 (-2.10z)| norm 0.3581 (-1.53z)| lr 3.15e-03 | 2002.23 ms | 68.5% bf16 MFU | 262296 tok/s +step 526/18794 | loss 4.639210 (-1.71z)| norm 0.3653 (-1.42z)| lr 3.16e-03 | 2009.56 ms | 68.3% bf16 MFU | 262226 tok/s +step 527/18794 | loss 4.629413 (-1.77z)| norm 0.4959 (-0.07z)| lr 3.16e-03 | 2001.22 ms | 68.6% bf16 MFU | 262214 tok/s +step 528/18794 | loss 4.688142 (-1.17z)| norm 0.6374 (+1.36z)| lr 3.17e-03 | 1994.29 ms | 68.8% bf16 MFU | 262248 tok/s +step 529/18794 | loss 4.632371 (-1.67z)| norm 0.5485 (+0.45z)| lr 3.17e-03 | 2004.26 ms | 68.5% bf16 MFU | 262215 tok/s +step 530/18794 | loss 4.634892 (-1.62z)| norm 0.5221 (+0.18z)| lr 3.18e-03 | 1997.30 ms | 68.7% bf16 MFU | 262229 tok/s +step 531/18794 | loss 4.659113 (-1.35z)| norm 0.5204 (+0.17z)| lr 3.19e-03 | 1987.13 ms | 69.1% bf16 MFU | 262310 tok/s +step 532/18794 | loss 4.640724 (-1.50z)| norm 0.5159 (+0.14z)| lr 3.19e-03 | 1994.34 ms | 68.8% bf16 MFU | 262339 tok/s +step 533/18794 | loss 4.638815 (-1.50z)| norm 0.4890 (-0.13z)| lr 3.20e-03 | 1992.50 ms | 68.9% bf16 MFU | 262378 tok/s +step 534/18794 | loss 4.684336 (-1.03z)| norm 0.4862 (-0.14z)| lr 3.20e-03 | 1997.87 ms | 68.7% bf16 MFU | 262380 tok/s +step 535/18794 | loss 4.619651 (-1.63z)| norm 0.5081 (+0.09z)| lr 3.21e-03 | 1989.77 ms | 69.0% bf16 MFU | 262436 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.271701 +step 536/18794 | loss 4.620218 (-1.61z)| norm 0.7172 (+2.27z)| lr 3.22e-03 | 2001.47 ms | 68.6% bf16 MFU | 262412 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.696743 +step 537/18794 | loss 4.669328 (-1.10z)| norm 0.7683 (+2.70z)| lr 3.22e-03 | 1991.66 ms | 68.9% bf16 MFU | 262453 tok/s +step 538/18794 | loss 4.661611 (-1.16z)| norm 0.6876 (+1.85z)| lr 3.23e-03 | 1984.68 ms | 69.1% bf16 MFU | 262539 tok/s +step 539/18794 | loss 4.630980 (-1.46z)| norm 0.5455 (+0.43z)| lr 3.23e-03 | 1988.76 ms | 69.0% bf16 MFU | 262593 tok/s +step 540/18794 | loss 4.661208 (-1.13z)| norm 0.6245 (+1.22z)| lr 3.24e-03 | 1992.11 ms | 68.9% bf16 MFU | 262623 tok/s +step 541/18794 | loss 4.624328 (-1.51z)| norm 0.5451 (+0.44z)| lr 3.25e-03 | 1989.03 ms | 69.0% bf16 MFU | 262671 tok/s +step 542/18794 | loss 4.742402 (-0.23z)| norm 0.6001 (+0.99z)| lr 3.25e-03 | 1991.77 ms | 68.9% bf16 MFU | 262699 tok/s +step 543/18794 | loss 4.675340 (-0.95z)| norm 0.4641 (-0.37z)| lr 3.26e-03 | 1980.88 ms | 69.3% bf16 MFU | 262798 tok/s +step 544/18794 | loss 4.698338 (-0.68z)| norm 0.5814 (+0.80z)| lr 3.26e-03 | 1982.03 ms | 69.2% bf16 MFU | 262884 tok/s +step 545/18794 | loss 4.676870 (-0.90z)| norm 0.5453 (+0.43z)| lr 3.27e-03 | 2002.02 ms | 68.5% bf16 MFU | 262834 tok/s +step 546/18794 | loss 4.672376 (-0.94z)| norm 0.6013 (+1.00z)| lr 3.28e-03 | 1987.04 ms | 69.1% bf16 MFU | 262885 tok/s +step 547/18794 | loss 4.640373 (-1.30z)| norm 0.4685 (-0.32z)| lr 3.28e-03 | 1994.83 ms | 68.8% bf16 MFU | 262882 tok/s +step 548/18794 | loss 4.596738 (-1.80z)| norm 0.4176 (-0.82z)| lr 3.29e-03 | 1991.58 ms | 68.9% bf16 MFU | 262900 tok/s +step 549/18794 | loss 4.653918 (-1.10z)| norm 0.4258 (-0.72z)| lr 3.29e-03 | 1990.86 ms | 68.9% bf16 MFU | 262922 tok/s +step 550/18794 | loss 4.654435 (-1.08z)| norm 0.4679 (-0.30z)| lr 3.30e-03 | 1983.87 ms | 69.2% bf16 MFU | 262990 tok/s +step 551/18794 | loss 4.639891 (-1.25z)| norm 0.4723 (-0.26z)| lr 3.31e-03 | 1983.90 ms | 69.2% bf16 MFU | 263054 tok/s +step 552/18794 | loss 4.650084 (-1.10z)| norm 0.5560 (+0.57z)| lr 3.31e-03 | 1982.12 ms | 69.2% bf16 MFU | 263127 tok/s +step 553/18794 | loss 4.622615 (-1.42z)| norm 0.5496 (+0.49z)| lr 3.32e-03 | 1985.36 ms | 69.1% bf16 MFU | 263174 tok/s +step 554/18794 | loss 4.609957 (-1.56z)| norm 0.4933 (-0.10z)| lr 3.32e-03 | 1995.87 ms | 68.8% bf16 MFU | 263150 tok/s +step 555/18794 | loss 4.573041 (-1.98z)| norm 0.4300 (-0.75z)| lr 3.33e-03 | 1982.15 ms | 69.2% bf16 MFU | 263218 tok/s +step 556/18794 | loss 4.602679 (-1.59z)| norm 0.4560 (-0.48z)| lr 3.34e-03 | 1988.53 ms | 69.0% bf16 MFU | 263240 tok/s +step 557/18794 | loss 4.562425 (-2.08z)| norm 0.3594 (-1.45z)| lr 3.34e-03 | 1982.17 ms | 69.2% bf16 MFU | 263303 tok/s +step 558/18794 | loss 4.615428 (-1.36z)| norm 0.3663 (-1.35z)| lr 3.35e-03 | 1980.22 ms | 69.3% bf16 MFU | 263376 tok/s +step 559/18794 | loss 4.566083 (-1.96z)| norm 0.3851 (-1.14z)| lr 3.35e-03 | 1978.29 ms | 69.4% bf16 MFU | 263458 tok/s +step 560/18794 | loss 4.550830 (-2.10z)| norm 0.4176 (-0.80z)| lr 3.36e-03 | 1982.45 ms | 69.2% bf16 MFU | 263508 tok/s +step 561/18794 | loss 4.537251 (-2.22z)| norm 0.4606 (-0.36z)| lr 3.37e-03 | 1978.72 ms | 69.4% bf16 MFU | 263581 tok/s +step 562/18794 | loss 4.537645 (-2.15z)| norm 0.4965 (-0.00z)| lr 3.37e-03 | 1982.88 ms | 69.2% bf16 MFU | 263622 tok/s +step 563/18794 | loss 4.598423 (-1.36z)| norm 0.5124 (+0.15z)| lr 3.38e-03 | 1985.65 ms | 69.1% bf16 MFU | 263643 tok/s +step 564/18794 | loss 4.561529 (-1.78z)| norm 0.4427 (-0.56z)| lr 3.38e-03 | 1977.08 ms | 69.4% bf16 MFU | 263720 tok/s +step 565/18794 | loss 4.546164 (-1.93z)| norm 0.4456 (-0.53z)| lr 3.39e-03 | 1977.73 ms | 69.4% bf16 MFU | 263789 tok/s +step 566/18794 | loss 4.569580 (-1.60z)| norm 0.4226 (-0.76z)| lr 3.40e-03 | 1978.99 ms | 69.3% bf16 MFU | 263846 tok/s +step 567/18794 | loss 4.592929 (-1.28z)| norm 0.3425 (-1.56z)| lr 3.40e-03 | 1977.48 ms | 69.4% bf16 MFU | 263910 tok/s +step 568/18794 | loss 4.552102 (-1.76z)| norm 0.4234 (-0.74z)| lr 3.41e-03 | 1984.13 ms | 69.2% bf16 MFU | 263926 tok/s +step 569/18794 | loss 4.526662 (-2.02z)| norm 0.4694 (-0.26z)| lr 3.41e-03 | 1977.94 ms | 69.4% bf16 MFU | 263984 tok/s +step 570/18794 | loss 4.513006 (-2.13z)| norm 0.5809 (+0.85z)| lr 3.42e-03 | 1977.89 ms | 69.4% bf16 MFU | 264038 tok/s +step 571/18794 | loss 4.569942 (-1.41z)| norm 0.5709 (+0.74z)| lr 3.43e-03 | 1978.27 ms | 69.4% bf16 MFU | 264087 tok/s +step 572/18794 | loss 4.530555 (-1.84z)| norm 0.4787 (-0.20z)| lr 3.43e-03 | 1985.73 ms | 69.1% bf16 MFU | 264084 tok/s +step 573/18794 | loss 4.455596 (-2.64z)| norm 0.4352 (-0.63z)| lr 3.44e-03 | 2040.18 ms | 67.3% bf16 MFU | 263729 tok/s +step 574/18794 | loss 4.521343 (-1.81z)| norm 0.3718 (-1.25z)| lr 3.44e-03 | 2032.05 ms | 67.5% bf16 MFU | 263443 tok/s +step 575/18794 | loss 4.552639 (-1.42z)| norm 0.3488 (-1.46z)| lr 3.45e-03 | 2036.85 ms | 67.4% bf16 MFU | 263141 tok/s +step 576/18794 | loss 4.480052 (-2.19z)| norm 0.3599 (-1.33z)| lr 3.46e-03 | 2039.70 ms | 67.3% bf16 MFU | 262836 tok/s +step 577/18794 | loss 4.484824 (-2.07z)| norm 0.4236 (-0.71z)| lr 3.46e-03 | 2034.49 ms | 67.5% bf16 MFU | 262579 tok/s +step 578/18794 | loss 4.511733 (-1.73z)| norm 0.4673 (-0.29z)| lr 3.47e-03 | 2031.73 ms | 67.5% bf16 MFU | 262353 tok/s +step 579/18794 | loss 4.485445 (-1.97z)| norm 0.4489 (-0.48z)| lr 3.47e-03 | 2038.91 ms | 67.3% bf16 MFU | 262092 tok/s +step 580/18794 | loss 4.486703 (-1.90z)| norm 0.4208 (-0.76z)| lr 3.48e-03 | 2037.28 ms | 67.4% bf16 MFU | 261855 tok/s +step 581/18794 | loss 4.504705 (-1.66z)| norm 0.4185 (-0.77z)| lr 3.49e-03 | 2029.07 ms | 67.6% bf16 MFU | 261682 tok/s +step 582/18794 | loss 4.497266 (-1.70z)| norm 0.4550 (-0.39z)| lr 3.49e-03 | 2032.18 ms | 67.5% bf16 MFU | 261497 tok/s +step 583/18794 | loss 4.483597 (-1.81z)| norm 0.4455 (-0.48z)| lr 3.50e-03 | 2023.82 ms | 67.8% bf16 MFU | 261375 tok/s +step 584/18794 | loss 4.521232 (-1.37z)| norm 0.4808 (-0.12z)| lr 3.50e-03 | 2038.65 ms | 67.3% bf16 MFU | 261165 tok/s +step 585/18794 | loss 4.442704 (-2.15z)| norm 0.4687 (-0.22z)| lr 3.51e-03 | 2011.02 ms | 68.2% bf16 MFU | 261142 tok/s +step 586/18794 | loss 4.518201 (-1.33z)| norm 0.5217 (+0.34z)| lr 3.52e-03 | 2023.12 ms | 67.8% bf16 MFU | 261043 tok/s +step 587/18794 | loss 4.467430 (-1.83z)| norm 0.6027 (+1.16z)| lr 3.52e-03 | 2013.03 ms | 68.2% bf16 MFU | 261013 tok/s +step 588/18794 | loss 4.487544 (-1.58z)| norm 0.5272 (+0.40z)| lr 3.53e-03 | 2021.92 ms | 67.9% bf16 MFU | 260927 tok/s +step 589/18794 | loss 4.508039 (-1.34z)| norm 0.5284 (+0.41z)| lr 3.53e-03 | 2031.38 ms | 67.6% bf16 MFU | 260786 tok/s +step 590/18794 | loss 4.457037 (-1.83z)| norm 0.6244 (+1.38z)| lr 3.54e-03 | 2037.58 ms | 67.4% bf16 MFU | 260612 tok/s +step 591/18794 | loss 4.454750 (-1.81z)| norm 0.5355 (+0.46z)| lr 3.55e-03 | 2035.04 ms | 67.4% bf16 MFU | 260463 tok/s +step 592/18794 | loss 4.482360 (-1.49z)| norm 0.4750 (-0.16z)| lr 3.55e-03 | 2033.04 ms | 67.5% bf16 MFU | 260334 tok/s +step 593/18794 | loss 4.469319 (-1.59z)| norm 0.5206 (+0.30z)| lr 3.56e-03 | 2029.62 ms | 67.6% bf16 MFU | 260233 tok/s +step 594/18794 | loss 4.515333 (-1.09z)| norm 0.5060 (+0.14z)| lr 3.56e-03 | 2032.34 ms | 67.5% bf16 MFU | 260120 tok/s +step 595/18794 | loss 4.471631 (-1.51z)| norm 0.4203 (-0.74z)| lr 3.57e-03 | 2037.30 ms | 67.4% bf16 MFU | 259981 tok/s +step 596/18794 | loss 4.504319 (-1.15z)| norm 0.5265 (+0.36z)| lr 3.58e-03 | 2018.69 ms | 68.0% bf16 MFU | 259968 tok/s +step 597/18794 | loss 4.553084 (-0.63z)| norm 0.5261 (+0.38z)| lr 3.58e-03 | 2022.68 ms | 67.8% bf16 MFU | 259930 tok/s +step 598/18794 | loss 4.470038 (-1.48z)| norm 0.3716 (-1.22z)| lr 3.59e-03 | 2034.43 ms | 67.5% bf16 MFU | 259819 tok/s +step 599/18794 | loss 4.439683 (-1.76z)| norm 0.3682 (-1.23z)| lr 3.59e-03 | 2038.94 ms | 67.3% bf16 MFU | 259685 tok/s +step 600/18794 | loss 4.440223 (-1.74z)| norm 0.3312 (-1.57z)| lr 3.60e-03 | 2019.64 ms | 67.9% bf16 MFU | 259680 tok/s +step 601/18794 | loss 4.378475 (-2.34z)| norm 0.4033 (-0.82z)| lr 3.61e-03 | 2015.88 ms | 68.1% bf16 MFU | 259700 tok/s +step 602/18794 | loss 4.359603 (-2.44z)| norm 0.4490 (-0.36z)| lr 3.61e-03 | 2027.47 ms | 67.7% bf16 MFU | 259645 tok/s +step 603/18794 | loss 4.438810 (-1.58z)| norm 0.4164 (-0.69z)| lr 3.62e-03 | 2025.16 ms | 67.8% bf16 MFU | 259607 tok/s +step 604/18794 | loss 4.368391 (-2.23z)| norm 0.4342 (-0.51z)| lr 3.62e-03 | 2031.67 ms | 67.5% bf16 MFU | 259529 tok/s +step 605/18794 | loss 4.503200 (-0.85z)| norm 0.4329 (-0.53z)| lr 3.63e-03 | 2022.28 ms | 67.9% bf16 MFU | 259516 tok/s +step 606/18794 | loss 4.487054 (-0.99z)| norm 0.4090 (-0.78z)| lr 3.64e-03 | 2016.51 ms | 68.1% bf16 MFU | 259540 tok/s +step 607/18794 | loss 4.448552 (-1.35z)| norm 0.4564 (-0.29z)| lr 3.64e-03 | 2024.86 ms | 67.8% bf16 MFU | 259509 tok/s +step 608/18794 | loss 4.380719 (-1.99z)| norm 0.4209 (-0.64z)| lr 3.65e-03 | 2016.89 ms | 68.0% bf16 MFU | 259531 tok/s +step 609/18794 | loss 4.408525 (-1.68z)| norm 0.3990 (-0.86z)| lr 3.65e-03 | 2015.99 ms | 68.1% bf16 MFU | 259558 tok/s +step 610/18794 | loss 4.422856 (-1.52z)| norm 0.3984 (-0.85z)| lr 3.66e-03 | 2022.84 ms | 67.8% bf16 MFU | 259539 tok/s +step 611/18794 | loss 4.405999 (-1.68z)| norm 0.3504 (-1.39z)| lr 3.67e-03 | 2016.60 ms | 68.1% bf16 MFU | 259561 tok/s +step 612/18794 | loss 4.384451 (-1.88z)| norm 0.3884 (-0.96z)| lr 3.67e-03 | 2017.19 ms | 68.0% bf16 MFU | 259579 tok/s +step 613/18794 | loss 4.381234 (-1.88z)| norm 0.4605 (-0.11z)| lr 3.68e-03 | 2023.99 ms | 67.8% bf16 MFU | 259552 tok/s +step 614/18794 | loss 4.427692 (-1.36z)| norm 0.4418 (-0.32z)| lr 3.68e-03 | 2024.68 ms | 67.8% bf16 MFU | 259522 tok/s +step 615/18794 | loss 4.370833 (-1.91z)| norm 0.4021 (-0.78z)| lr 3.69e-03 | 2039.26 ms | 67.3% bf16 MFU | 259400 tok/s +step 616/18794 | loss 4.394032 (-1.63z)| norm 0.4307 (-0.44z)| lr 3.70e-03 | 2006.57 ms | 68.4% bf16 MFU | 259495 tok/s +step 617/18794 | loss 4.405563 (-1.48z)| norm 0.4020 (-0.79z)| lr 3.70e-03 | 1997.86 ms | 68.7% bf16 MFU | 259641 tok/s +step 618/18794 | loss 4.411732 (-1.39z)| norm 0.4413 (-0.33z)| lr 3.71e-03 | 2012.49 ms | 68.2% bf16 MFU | 259685 tok/s +step 619/18794 | loss 4.423120 (-1.25z)| norm 0.4867 (+0.20z)| lr 3.71e-03 | 2014.07 ms | 68.1% bf16 MFU | 259716 tok/s +step 620/18794 | loss 4.416792 (-1.29z)| norm 0.4721 (+0.01z)| lr 3.72e-03 | 2016.52 ms | 68.1% bf16 MFU | 259730 tok/s +step 621/18794 | loss 4.343882 (-1.99z)| norm 0.4607 (-0.13z)| lr 3.73e-03 | 2022.61 ms | 67.8% bf16 MFU | 259704 tok/s +step 622/18794 | loss 4.465836 (-0.72z)| norm 0.4286 (-0.51z)| lr 3.73e-03 | 2026.19 ms | 67.7% bf16 MFU | 259657 tok/s +step 623/18794 | loss 4.391116 (-1.46z)| norm 0.4087 (-0.76z)| lr 3.74e-03 | 2009.64 ms | 68.3% bf16 MFU | 259718 tok/s +step 624/18794 | loss 4.372056 (-1.62z)| norm 0.4077 (-0.78z)| lr 3.74e-03 | 2024.24 ms | 67.8% bf16 MFU | 259683 tok/s +step 625/18794 | loss 4.383979 (-1.47z)| norm 0.4330 (-0.48z)| lr 3.75e-03 | 2032.01 ms | 67.5% bf16 MFU | 259599 tok/s +step 626/18794 | loss 4.420850 (-1.07z)| norm 0.3984 (-0.92z)| lr 3.76e-03 | 2008.71 ms | 68.3% bf16 MFU | 259670 tok/s +step 627/18794 | loss 4.339737 (-1.84z)| norm 0.4299 (-0.52z)| lr 3.76e-03 | 2008.53 ms | 68.3% bf16 MFU | 259738 tok/s +step 628/18794 | loss 4.470793 (-0.51z)| norm 0.4603 (-0.13z)| lr 3.77e-03 | 2020.02 ms | 67.9% bf16 MFU | 259728 tok/s +step 629/18794 | loss 4.481112 (-0.39z)| norm 0.4322 (-0.47z)| lr 3.77e-03 | 2004.79 ms | 68.5% bf16 MFU | 259818 tok/s +step 630/18794 | loss 4.376218 (-1.43z)| norm 0.3625 (-1.32z)| lr 3.78e-03 | 2027.02 ms | 67.7% bf16 MFU | 259759 tok/s +step 631/18794 | loss 4.391897 (-1.25z)| norm 0.4486 (-0.23z)| lr 3.79e-03 | 2005.94 ms | 68.4% bf16 MFU | 259840 tok/s +step 632/18794 | loss 4.399451 (-1.15z)| norm 0.4187 (-0.59z)| lr 3.79e-03 | 2029.16 ms | 67.6% bf16 MFU | 259766 tok/s +step 633/18794 | loss 4.355806 (-1.56z)| norm 0.3570 (-1.34z)| lr 3.80e-03 | 2005.08 ms | 68.4% bf16 MFU | 259852 tok/s +step 634/18794 | loss 4.396313 (-1.13z)| norm 0.3798 (-1.04z)| lr 3.80e-03 | 2009.13 ms | 68.3% bf16 MFU | 259907 tok/s +step 635/18794 | loss 4.409910 (-0.97z)| norm 0.3631 (-1.22z)| lr 3.81e-03 | 2008.02 ms | 68.3% bf16 MFU | 259967 tok/s +step 636/18794 | loss 4.401234 (-1.04z)| norm 0.3920 (-0.86z)| lr 3.82e-03 | 2024.97 ms | 67.8% bf16 MFU | 259914 tok/s +step 637/18794 | loss 4.373430 (-1.31z)| norm 0.3821 (-1.02z)| lr 3.82e-03 | 2013.31 ms | 68.2% bf16 MFU | 259939 tok/s +step 638/18794 | loss 4.351817 (-1.50z)| norm 0.4158 (-0.54z)| lr 3.83e-03 | 2001.86 ms | 68.6% bf16 MFU | 260037 tok/s +step 639/18794 | loss 4.335689 (-1.64z)| norm 0.4608 (+0.14z)| lr 3.83e-03 | 2013.49 ms | 68.2% bf16 MFU | 260054 tok/s +step 640/18794 | loss 4.353545 (-1.43z)| norm 0.4057 (-0.67z)| lr 3.84e-03 | 2028.92 ms | 67.6% bf16 MFU | 259972 tok/s +step 641/18794 | loss 4.350306 (-1.43z)| norm 0.3755 (-1.12z)| lr 3.85e-03 | 2007.17 ms | 68.4% bf16 MFU | 260034 tok/s +step 642/18794 | loss 4.344846 (-1.49z)| norm 0.3970 (-0.77z)| lr 3.85e-03 | 2025.25 ms | 67.8% bf16 MFU | 259976 tok/s +step 643/18794 | loss 4.289202 (-2.04z)| norm 0.4053 (-0.63z)| lr 3.86e-03 | 2022.90 ms | 67.8% bf16 MFU | 259936 tok/s +step 644/18794 | loss 4.317124 (-1.73z)| norm 0.4386 (-0.08z)| lr 3.86e-03 | 2009.64 ms | 68.3% bf16 MFU | 259983 tok/s +step 645/18794 | loss 4.395044 (-0.86z)| norm 0.4410 (-0.02z)| lr 3.87e-03 | 2007.97 ms | 68.3% bf16 MFU | 260039 tok/s +step 646/18794 | loss 4.361982 (-1.22z)| norm 0.4685 (+0.47z)| lr 3.88e-03 | 2014.14 ms | 68.1% bf16 MFU | 260053 tok/s +step 647/18794 | loss 4.367708 (-1.14z)| norm 0.5271 (+1.45z)| lr 3.88e-03 | 2009.34 ms | 68.3% bf16 MFU | 260096 tok/s +step 648/18794 | loss 4.350439 (-1.31z)| norm 0.4599 (+0.30z)| lr 3.89e-03 | 1997.93 ms | 68.7% bf16 MFU | 260212 tok/s +step 649/18794 | loss 4.381421 (-0.94z)| norm 0.4768 (+0.58z)| lr 3.89e-03 | 2008.64 ms | 68.3% bf16 MFU | 260252 tok/s +step 650/18794 | loss 4.411654 (-0.57z)| norm 0.4904 (+0.81z)| lr 3.90e-03 | 2007.55 ms | 68.4% bf16 MFU | 260298 tok/s +step 651/18794 | loss 4.351044 (-1.29z)| norm 0.4937 (+0.86z)| lr 3.91e-03 | 2004.31 ms | 68.5% bf16 MFU | 260362 tok/s +step 652/18794 | loss 4.436907 (-0.22z)| norm 0.4862 (+0.75z)| lr 3.91e-03 | 2001.06 ms | 68.6% bf16 MFU | 260444 tok/s +step 653/18794 | loss 4.348199 (-1.32z)| norm 0.4572 (+0.28z)| lr 3.92e-03 | 2008.19 ms | 68.3% bf16 MFU | 260476 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.485522 +step 654/18794 | loss 4.399676 (-0.65z)| norm 0.5885 (+2.49z)| lr 3.92e-03 | 2004.33 ms | 68.5% bf16 MFU | 260531 tok/s +step 655/18794 | loss 4.379100 (-0.90z)| norm 0.5447 (+1.70z)| lr 3.93e-03 | 2015.92 ms | 68.1% bf16 MFU | 260508 tok/s +step 656/18794 | loss 4.349248 (-1.27z)| norm 0.3833 (-0.99z)| lr 3.94e-03 | 2021.91 ms | 67.9% bf16 MFU | 260448 tok/s +step 657/18794 | loss 4.494708 (+0.69z)| norm 0.4534 (+0.16z)| lr 3.94e-03 | 2016.03 ms | 68.1% bf16 MFU | 260428 tok/s +step 658/18794 | loss 4.439048 (-0.04z)| norm 0.5309 (+1.44z)| lr 3.95e-03 | 2006.71 ms | 68.4% bf16 MFU | 260470 tok/s +step 659/18794 | loss 4.365601 (-1.05z)| norm 0.5222 (+1.26z)| lr 3.95e-03 | 2018.81 ms | 68.0% bf16 MFU | 260432 tok/s +step 660/18794 | loss 4.413867 (-0.35z)| norm 0.4820 (+0.58z)| lr 3.96e-03 | 2018.92 ms | 68.0% bf16 MFU | 260395 tok/s +step 661/18794 | loss 4.363253 (-1.05z)| norm 0.4895 (+0.70z)| lr 3.97e-03 | 2008.59 ms | 68.3% bf16 MFU | 260426 tok/s +step 662/18794 | loss 4.320436 (-1.63z)| norm 0.4641 (+0.28z)| lr 3.97e-03 | 2016.38 ms | 68.1% bf16 MFU | 260405 tok/s +step 663/18794 | loss 4.354590 (-1.13z)| norm 0.4783 (+0.53z)| lr 3.98e-03 | 2015.34 ms | 68.1% bf16 MFU | 260393 tok/s +step 664/18794 | loss 4.377376 (-0.78z)| norm 0.4678 (+0.35z)| lr 3.98e-03 | 2008.09 ms | 68.3% bf16 MFU | 260427 tok/s +step 665/18794 | loss 4.344653 (-1.24z)| norm 0.3795 (-1.12z)| lr 3.99e-03 | 1988.51 ms | 69.0% bf16 MFU | 260589 tok/s +step 666/18794 | loss 4.341962 (-1.27z)| norm 0.3827 (-1.06z)| lr 4.00e-03 | 2004.66 ms | 68.5% bf16 MFU | 260636 tok/s +step 667/18794 | loss 4.309615 (-1.74z)| norm 0.3810 (-1.10z)| lr 4.00e-03 | 1997.76 ms | 68.7% bf16 MFU | 260726 tok/s +step 668/18794 | loss 4.308010 (-1.74z)| norm 0.3816 (-1.08z)| lr 4.01e-03 | 2002.59 ms | 68.5% bf16 MFU | 260780 tok/s +step 669/18794 | loss 4.352355 (-1.04z)| norm 0.3729 (-1.20z)| lr 4.01e-03 | 1999.11 ms | 68.6% bf16 MFU | 260854 tok/s +step 670/18794 | loss 4.318930 (-1.52z)| norm 0.3648 (-1.32z)| lr 4.02e-03 | 2009.18 ms | 68.3% bf16 MFU | 260859 tok/s +step 671/18794 | loss 4.227127 (-2.85z)| norm 0.3341 (-1.81z)| lr 4.03e-03 | 2008.76 ms | 68.3% bf16 MFU | 260866 tok/s +step 672/18794 | loss 4.340902 (-1.10z)| norm 0.3827 (-0.97z)| lr 4.03e-03 | 2015.37 ms | 68.1% bf16 MFU | 260830 tok/s +step 673/18794 | loss 4.277920 (-2.00z)| norm 0.3954 (-0.74z)| lr 4.04e-03 | 1999.91 ms | 68.6% bf16 MFU | 260896 tok/s +step 674/18794 | loss 4.296216 (-1.69z)| norm 0.4669 (+0.45z)| lr 4.04e-03 | 2011.15 ms | 68.2% bf16 MFU | 260886 tok/s +step 675/18794 | loss 4.288519 (-1.78z)| norm 0.5326 (+1.54z)| lr 4.05e-03 | 2012.54 ms | 68.2% bf16 MFU | 260867 tok/s +step 676/18794 | loss 4.330425 (-1.12z)| norm 0.4922 (+0.84z)| lr 4.06e-03 | 1993.26 ms | 68.8% bf16 MFU | 260975 tok/s +step 677/18794 | loss 4.286633 (-1.75z)| norm 0.5092 (+1.11z)| lr 4.06e-03 | 1994.04 ms | 68.8% bf16 MFU | 261073 tok/s +step 678/18794 | loss 4.306878 (-1.41z)| norm 0.4537 (+0.16z)| lr 4.07e-03 | 2014.50 ms | 68.1% bf16 MFU | 261032 tok/s +step 679/18794 | loss 4.298038 (-1.52z)| norm 0.4000 (-0.74z)| lr 4.07e-03 | 2016.66 ms | 68.0% bf16 MFU | 260979 tok/s +step 680/18794 | loss 4.288396 (-1.62z)| norm 0.3449 (-1.65z)| lr 4.08e-03 | 1997.57 ms | 68.7% bf16 MFU | 261054 tok/s +step 681/18794 | loss 4.282952 (-1.67z)| norm 0.3754 (-1.12z)| lr 4.09e-03 | 1995.24 ms | 68.8% bf16 MFU | 261139 tok/s +step 682/18794 | loss 4.347997 (-0.68z)| norm 0.4519 (+0.16z)| lr 4.09e-03 | 2002.25 ms | 68.5% bf16 MFU | 261175 tok/s +step 683/18794 | loss 4.297488 (-1.42z)| norm 0.4195 (-0.38z)| lr 4.10e-03 | 2002.40 ms | 68.5% bf16 MFU | 261208 tok/s +step 684/18794 | loss 4.295298 (-1.43z)| norm 0.4582 (+0.27z)| lr 4.10e-03 | 2007.35 ms | 68.4% bf16 MFU | 261207 tok/s +step 685/18794 | loss 4.296962 (-1.37z)| norm 0.4382 (-0.06z)| lr 4.11e-03 | 2016.03 ms | 68.1% bf16 MFU | 261149 tok/s +step 686/18794 | loss 4.318360 (-1.03z)| norm 0.3106 (-2.13z)| lr 4.12e-03 | 2001.85 ms | 68.6% bf16 MFU | 261187 tok/s +step 687/18794 | loss 4.335634 (-0.75z)| norm 0.3571 (-1.36z)| lr 4.12e-03 | 1999.21 ms | 68.6% bf16 MFU | 261240 tok/s +step 688/18794 | loss 4.244465 (-2.11z)| norm 0.3620 (-1.25z)| lr 4.13e-03 | 1990.64 ms | 68.9% bf16 MFU | 261347 tok/s +step 689/18794 | loss 4.291774 (-1.36z)| norm 0.3757 (-1.00z)| lr 4.13e-03 | 2006.96 ms | 68.4% bf16 MFU | 261341 tok/s +step 690/18794 | loss 4.291769 (-1.33z)| norm 0.3899 (-0.76z)| lr 4.14e-03 | 1988.94 ms | 69.0% bf16 MFU | 261454 tok/s +step 691/18794 | loss 4.260149 (-1.78z)| norm 0.3313 (-1.78z)| lr 4.15e-03 | 2008.49 ms | 68.3% bf16 MFU | 261433 tok/s +step 692/18794 | loss 4.274506 (-1.53z)| norm 0.3798 (-0.88z)| lr 4.15e-03 | 2016.00 ms | 68.1% bf16 MFU | 261365 tok/s +step 693/18794 | loss 4.280154 (-1.41z)| norm 0.4458 (+0.33z)| lr 4.16e-03 | 1997.88 ms | 68.7% bf16 MFU | 261418 tok/s +step 694/18794 | loss 4.287840 (-1.28z)| norm 0.4439 (+0.31z)| lr 4.16e-03 | 2009.59 ms | 68.3% bf16 MFU | 261391 tok/s +step 695/18794 | loss 4.251679 (-1.81z)| norm 0.4035 (-0.44z)| lr 4.17e-03 | 2002.37 ms | 68.5% bf16 MFU | 261413 tok/s +step 696/18794 | loss 4.273579 (-1.45z)| norm 0.3646 (-1.14z)| lr 4.18e-03 | 2007.99 ms | 68.3% bf16 MFU | 261398 tok/s +step 697/18794 | loss 4.291760 (-1.17z)| norm 0.3029 (-2.24z)| lr 4.18e-03 | 2000.37 ms | 68.6% bf16 MFU | 261433 tok/s +step 698/18794 | loss 4.272252 (-1.47z)| norm 0.3551 (-1.26z)| lr 4.19e-03 | 1985.49 ms | 69.1% bf16 MFU | 261564 tok/s +step 699/18794 | loss 4.234186 (-2.04z)| norm 0.3413 (-1.50z)| lr 4.19e-03 | 2015.95 ms | 68.1% bf16 MFU | 261489 tok/s +step 700/18794 | loss 4.248653 (-1.76z)| norm 0.3646 (-1.09z)| lr 4.20e-03 | 2007.04 ms | 68.4% bf16 MFU | 261476 tok/s +step 701/18794 | loss 4.283490 (-1.17z)| norm 0.3700 (-0.98z)| lr 4.21e-03 | 1994.20 ms | 68.8% bf16 MFU | 261548 tok/s +step 702/18794 | loss 4.246678 (-1.72z)| norm 0.3538 (-1.25z)| lr 4.21e-03 | 1993.67 ms | 68.8% bf16 MFU | 261619 tok/s +step 703/18794 | loss 4.222952 (-2.04z)| norm 0.3511 (-1.28z)| lr 4.22e-03 | 1997.47 ms | 68.7% bf16 MFU | 261662 tok/s +step 704/18794 | loss 4.257443 (-1.46z)| norm 0.4065 (-0.26z)| lr 4.22e-03 | 2005.92 ms | 68.4% bf16 MFU | 261647 tok/s +step 705/18794 | loss 4.188526 (-2.49z)| norm 0.3800 (-0.73z)| lr 4.23e-03 | 2001.76 ms | 68.6% bf16 MFU | 261661 tok/s +step 706/18794 | loss 4.256605 (-1.41z)| norm 0.3956 (-0.45z)| lr 4.24e-03 | 2003.07 ms | 68.5% bf16 MFU | 261665 tok/s +step 707/18794 | loss 4.242246 (-1.60z)| norm 0.4190 (-0.01z)| lr 4.24e-03 | 2005.29 ms | 68.4% bf16 MFU | 261654 tok/s +step 708/18794 | loss 4.255528 (-1.36z)| norm 0.3780 (-0.75z)| lr 4.25e-03 | 1993.90 ms | 68.8% bf16 MFU | 261719 tok/s +step 709/18794 | loss 4.202250 (-2.13z)| norm 0.4720 (+0.94z)| lr 4.25e-03 | 1998.25 ms | 68.7% bf16 MFU | 261751 tok/s +step 710/18794 | loss 4.243997 (-1.45z)| norm 0.5135 (+1.65z)| lr 4.26e-03 | 1993.52 ms | 68.8% bf16 MFU | 261814 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.053452 +step 711/18794 | loss 4.258078 (-1.21z)| norm 0.6018 (+3.05z)| lr 4.27e-03 | 1993.41 ms | 68.8% bf16 MFU | 261873 tok/s +step 712/18794 | loss 4.269752 (-1.01z)| norm 0.4906 (+1.12z)| lr 4.27e-03 | 1987.85 ms | 69.0% bf16 MFU | 261967 tok/s +step 713/18794 | loss 4.259801 (-1.14z)| norm 0.4932 (+1.16z)| lr 4.28e-03 | 2003.26 ms | 68.5% bf16 MFU | 261955 tok/s +step 714/18794 | loss 4.270757 (-0.95z)| norm 0.4267 (+0.03z)| lr 4.28e-03 | 2000.76 ms | 68.6% bf16 MFU | 261959 tok/s +step 715/18794 | loss 4.257916 (-1.13z)| norm 0.4353 (+0.17z)| lr 4.29e-03 | 1999.57 ms | 68.6% bf16 MFU | 261971 tok/s +step 716/18794 | loss 4.241468 (-1.35z)| norm 0.3967 (-0.48z)| lr 4.30e-03 | 1981.97 ms | 69.2% bf16 MFU | 262099 tok/s +step 717/18794 | loss 4.207572 (-1.82z)| norm 0.3563 (-1.15z)| lr 4.30e-03 | 2000.51 ms | 68.6% bf16 MFU | 262098 tok/s +step 718/18794 | loss 4.206422 (-1.79z)| norm 0.3868 (-0.63z)| lr 4.31e-03 | 2015.27 ms | 68.1% bf16 MFU | 262001 tok/s +step 719/18794 | loss 4.205329 (-1.77z)| norm 0.3811 (-0.71z)| lr 4.31e-03 | 1992.37 ms | 68.9% bf16 MFU | 262058 tok/s +step 720/18794 | loss 4.235990 (-1.28z)| norm 0.4343 (+0.20z)| lr 4.32e-03 | 1991.53 ms | 68.9% bf16 MFU | 262118 tok/s +step 721/18794 | loss 4.299239 (-0.33z)| norm 0.4414 (+0.32z)| lr 4.33e-03 | 1981.04 ms | 69.3% bf16 MFU | 262245 tok/s +step 722/18794 | loss 4.283781 (-0.55z)| norm 0.5044 (+1.37z)| lr 4.33e-03 | 2000.04 ms | 68.6% bf16 MFU | 262240 tok/s +step 723/18794 | loss 4.239981 (-1.19z)| norm 0.4571 (+0.56z)| lr 4.34e-03 | 1991.66 ms | 68.9% bf16 MFU | 262290 tok/s +step 724/18794 | loss 4.262256 (-0.84z)| norm 0.3798 (-0.73z)| lr 4.34e-03 | 1992.87 ms | 68.9% bf16 MFU | 262329 tok/s +step 725/18794 | loss 4.259332 (-0.86z)| norm 0.3932 (-0.50z)| lr 4.35e-03 | 1984.56 ms | 69.2% bf16 MFU | 262422 tok/s +step 726/18794 | loss 4.219957 (-1.44z)| norm 0.4165 (-0.11z)| lr 4.36e-03 | 1993.16 ms | 68.9% bf16 MFU | 262453 tok/s +step 727/18794 | loss 4.262080 (-0.78z)| norm 0.3855 (-0.62z)| lr 4.36e-03 | 1991.21 ms | 68.9% bf16 MFU | 262496 tok/s +step 728/18794 | loss 4.220911 (-1.40z)| norm 0.3321 (-1.48z)| lr 4.37e-03 | 1985.39 ms | 69.1% bf16 MFU | 262574 tok/s +step 729/18794 | loss 4.197929 (-1.75z)| norm 0.3985 (-0.37z)| lr 4.37e-03 | 2013.33 ms | 68.2% bf16 MFU | 262466 tok/s +step 730/18794 | loss 4.226079 (-1.28z)| norm 0.3717 (-0.82z)| lr 4.38e-03 | 1991.29 ms | 68.9% bf16 MFU | 262507 tok/s +step 731/18794 | loss 4.203160 (-1.60z)| norm 0.3748 (-0.76z)| lr 4.39e-03 | 1984.01 ms | 69.2% bf16 MFU | 262595 tok/s +step 732/18794 | loss 4.205846 (-1.53z)| norm 0.4275 (+0.12z)| lr 4.39e-03 | 1997.90 ms | 68.7% bf16 MFU | 262586 tok/s +step 733/18794 | loss 4.268108 (-0.53z)| norm 0.4221 (+0.02z)| lr 4.40e-03 | 2001.00 ms | 68.6% bf16 MFU | 262557 tok/s +step 734/18794 | loss 4.249732 (-0.81z)| norm 0.3858 (-0.59z)| lr 4.40e-03 | 1993.11 ms | 68.9% bf16 MFU | 262582 tok/s +step 735/18794 | loss 4.229982 (-1.10z)| norm 0.3628 (-0.97z)| lr 4.41e-03 | 1992.29 ms | 68.9% bf16 MFU | 262611 tok/s +step 736/18794 | loss 4.237489 (-0.97z)| norm 0.3836 (-0.63z)| lr 4.42e-03 | 2007.17 ms | 68.4% bf16 MFU | 262541 tok/s +step 737/18794 | loss 4.229218 (-1.08z)| norm 0.3385 (-1.36z)| lr 4.42e-03 | 1992.28 ms | 68.9% bf16 MFU | 262572 tok/s +step 738/18794 | loss 4.211500 (-1.34z)| norm 0.3138 (-1.72z)| lr 4.43e-03 | 1992.41 ms | 68.9% bf16 MFU | 262600 tok/s +step 739/18794 | loss 4.213445 (-1.28z)| norm 0.3131 (-1.69z)| lr 4.43e-03 | 1985.03 ms | 69.1% bf16 MFU | 262676 tok/s +step 740/18794 | loss 4.190899 (-1.61z)| norm 0.3052 (-1.77z)| lr 4.44e-03 | 1999.88 ms | 68.6% bf16 MFU | 262650 tok/s +step 741/18794 | loss 4.204192 (-1.36z)| norm 0.3233 (-1.47z)| lr 4.45e-03 | 1983.25 ms | 69.2% bf16 MFU | 262736 tok/s +step 742/18794 | loss 4.222373 (-1.05z)| norm 0.3643 (-0.82z)| lr 4.45e-03 | 2006.00 ms | 68.4% bf16 MFU | 262667 tok/s +step 743/18794 | loss 4.158266 (-2.01z)| norm 0.3998 (-0.26z)| lr 4.46e-03 | 1987.56 ms | 69.0% bf16 MFU | 262723 tok/s +step 744/18794 | loss 4.200009 (-1.33z)| norm 0.4245 (+0.13z)| lr 4.46e-03 | 1981.69 ms | 69.3% bf16 MFU | 262815 tok/s +step 745/18794 | loss 4.184433 (-1.54z)| norm 0.3704 (-0.70z)| lr 4.47e-03 | 1986.16 ms | 69.1% bf16 MFU | 262873 tok/s +step 746/18794 | loss 4.199565 (-1.28z)| norm 0.3639 (-0.79z)| lr 4.48e-03 | 1984.62 ms | 69.1% bf16 MFU | 262938 tok/s +step 747/18794 | loss 4.171486 (-1.68z)| norm 0.3930 (-0.32z)| lr 4.48e-03 | 1977.03 ms | 69.4% bf16 MFU | 263050 tok/s +step 748/18794 | loss 4.206606 (-1.11z)| norm 0.3857 (-0.42z)| lr 4.49e-03 | 1994.15 ms | 68.8% bf16 MFU | 263044 tok/s +step 749/18794 | loss 4.173955 (-1.59z)| norm 0.3393 (-1.14z)| lr 4.49e-03 | 1984.83 ms | 69.1% bf16 MFU | 263099 tok/s +step 750/18794 | loss 4.164562 (-1.71z)| norm 0.4107 (+0.01z)| lr 4.50e-03 | 1995.18 ms | 68.8% bf16 MFU | 263083 tok/s +val loss 4.240807 +HellaSwag: 2547/10042 = 0.253635: 0/1256 +step 751/18794 | loss 4.229559 (-0.68z)| norm 0.4839 (+1.19z)| lr 4.51e-03 | 1991.95 ms | 68.9% bf16 MFU | 263089 tok/s +step 752/18794 | loss 4.195314 (-1.21z)| norm 0.5263 (+1.84z)| lr 4.51e-03 | 1980.61 ms | 69.3% bf16 MFU | 263170 tok/s +step 753/18794 | loss 4.210708 (-0.94z)| norm 0.5020 (+1.44z)| lr 4.52e-03 | 1987.31 ms | 69.1% bf16 MFU | 263202 tok/s +step 754/18794 | loss 4.249963 (-0.29z)| norm 0.4754 (+1.07z)| lr 4.52e-03 | 1993.65 ms | 68.8% bf16 MFU | 263191 tok/s +step 755/18794 | loss 4.209426 (-0.94z)| norm 0.3680 (-0.67z)| lr 4.53e-03 | 1986.52 ms | 69.1% bf16 MFU | 263228 tok/s +step 756/18794 | loss 4.250149 (-0.25z)| norm 0.3675 (-0.67z)| lr 4.54e-03 | 1990.66 ms | 68.9% bf16 MFU | 263235 tok/s +step 757/18794 | loss 4.186779 (-1.36z)| norm 0.3455 (-1.02z)| lr 4.54e-03 | 1980.53 ms | 69.3% bf16 MFU | 263309 tok/s +step 758/18794 | loss 4.204380 (-1.05z)| norm 0.3957 (-0.17z)| lr 4.55e-03 | 1980.04 ms | 69.3% bf16 MFU | 263383 tok/s +step 759/18794 | loss 4.228749 (-0.57z)| norm 0.3949 (-0.16z)| lr 4.55e-03 | 1979.34 ms | 69.3% bf16 MFU | 263458 tok/s +step 760/18794 | loss 4.198781 (-1.16z)| norm 0.3258 (-1.33z)| lr 4.56e-03 | 1979.41 ms | 69.3% bf16 MFU | 263529 tok/s +step 761/18794 | loss 4.179088 (-1.54z)| norm 0.2776 (-2.11z)| lr 4.57e-03 | 1982.15 ms | 69.2% bf16 MFU | 263578 tok/s +step 762/18794 | loss 4.175288 (-1.59z)| norm 0.2846 (-1.93z)| lr 4.57e-03 | 1980.40 ms | 69.3% bf16 MFU | 263636 tok/s +step 763/18794 | loss 4.214512 (-0.77z)| norm 0.3189 (-1.33z)| lr 4.58e-03 | 2014.27 ms | 68.1% bf16 MFU | 263468 tok/s +step 764/18794 | loss 4.237261 (-0.27z)| norm 0.3315 (-1.09z)| lr 4.58e-03 | 2044.32 ms | 67.1% bf16 MFU | 263118 tok/s +step 765/18794 | loss 4.196296 (-1.15z)| norm 0.3550 (-0.69z)| lr 4.59e-03 | 2035.23 ms | 67.4% bf16 MFU | 262842 tok/s +step 766/18794 | loss 4.184866 (-1.38z)| norm 0.3391 (-0.94z)| lr 4.60e-03 | 2031.94 ms | 67.5% bf16 MFU | 262601 tok/s +step 767/18794 | loss 4.197164 (-1.08z)| norm 0.3485 (-0.78z)| lr 4.60e-03 | 2039.35 ms | 67.3% bf16 MFU | 262325 tok/s +step 768/18794 | loss 4.180806 (-1.42z)| norm 0.3324 (-1.03z)| lr 4.61e-03 | 2017.46 ms | 68.0% bf16 MFU | 262203 tok/s +step 769/18794 | loss 4.135164 (-2.40z)| norm 0.3057 (-1.46z)| lr 4.61e-03 | 2039.49 ms | 67.3% bf16 MFU | 261946 tok/s +step 770/18794 | loss 4.177958 (-1.41z)| norm 0.3742 (-0.32z)| lr 4.62e-03 | 2041.91 ms | 67.2% bf16 MFU | 261687 tok/s +step 771/18794 | loss 4.181101 (-1.32z)| norm 0.3900 (-0.07z)| lr 4.63e-03 | 2039.24 ms | 67.3% bf16 MFU | 261458 tok/s +step 772/18794 | loss 4.212601 (-0.60z)| norm 0.3717 (-0.37z)| lr 4.63e-03 | 2028.69 ms | 67.6% bf16 MFU | 261307 tok/s +step 773/18794 | loss 4.181496 (-1.29z)| norm 0.3309 (-1.04z)| lr 4.64e-03 | 2043.95 ms | 67.1% bf16 MFU | 261067 tok/s +step 774/18794 | loss 4.223967 (-0.31z)| norm 0.3606 (-0.53z)| lr 4.64e-03 | 2045.98 ms | 67.1% bf16 MFU | 260826 tok/s +step 775/18794 | loss 4.202987 (-0.77z)| norm 0.3554 (-0.60z)| lr 4.65e-03 | 2044.77 ms | 67.1% bf16 MFU | 260605 tok/s +step 776/18794 | loss 4.159578 (-1.75z)| norm 0.3836 (-0.10z)| lr 4.66e-03 | 2041.45 ms | 67.2% bf16 MFU | 260416 tok/s +step 777/18794 | loss 4.189489 (-1.03z)| norm 0.4194 (+0.55z)| lr 4.66e-03 | 2047.41 ms | 67.0% bf16 MFU | 260199 tok/s +step 778/18794 | loss 4.171482 (-1.43z)| norm 0.4160 (+0.50z)| lr 4.67e-03 | 2041.29 ms | 67.2% bf16 MFU | 260031 tok/s +step 779/18794 | loss 4.229708 (-0.05z)| norm 0.4672 (+1.38z)| lr 4.67e-03 | 2035.05 ms | 67.4% bf16 MFU | 259911 tok/s +step 780/18794 | loss 4.210605 (-0.49z)| norm 0.3916 (+0.04z)| lr 4.68e-03 | 2026.02 ms | 67.7% bf16 MFU | 259854 tok/s +step 781/18794 | loss 4.179633 (-1.20z)| norm 0.3562 (-0.58z)| lr 4.69e-03 | 2028.66 ms | 67.6% bf16 MFU | 259783 tok/s +step 782/18794 | loss 4.185621 (-1.06z)| norm 0.3697 (-0.33z)| lr 4.69e-03 | 2036.68 ms | 67.4% bf16 MFU | 259665 tok/s +step 783/18794 | loss 4.211334 (-0.40z)| norm 0.4090 (+0.37z)| lr 4.70e-03 | 2032.93 ms | 67.5% bf16 MFU | 259577 tok/s +step 784/18794 | loss 4.175429 (-1.29z)| norm 0.4744 (+1.53z)| lr 4.70e-03 | 2021.12 ms | 67.9% bf16 MFU | 259568 tok/s +step 785/18794 | loss 4.244406 (+0.48z)| norm 0.4747 (+1.51z)| lr 4.71e-03 | 2029.79 ms | 67.6% bf16 MFU | 259505 tok/s +step 786/18794 | loss 4.208621 (-0.42z)| norm 0.4915 (+1.77z)| lr 4.72e-03 | 2025.79 ms | 67.7% bf16 MFU | 259470 tok/s +step 787/18794 | loss 4.241129 (+0.48z)| norm 0.4297 (+0.67z)| lr 4.72e-03 | 2021.41 ms | 67.9% bf16 MFU | 259465 tok/s +step 788/18794 | loss 4.216643 (-0.19z)| norm 0.4942 (+1.75z)| lr 4.73e-03 | 2038.04 ms | 67.3% bf16 MFU | 259354 tok/s +step 789/18794 | loss 4.241521 (+0.53z)| norm 0.4960 (+1.74z)| lr 4.73e-03 | 2036.80 ms | 67.4% bf16 MFU | 259257 tok/s +reducing beta2 to 0.9 and lr/wd by 0.920 due to grad z-score of 3.802327 +step 790/18794 | loss 4.256606 (+0.98z)| norm 0.6384 (+3.80z)| lr 4.36e-03 | 2038.19 ms | 67.3% bf16 MFU | 259155 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.714471 +step 791/18794 | loss 4.191497 (-0.87z)| norm 0.5775 (+2.71z)| lr 4.75e-03 | 2023.16 ms | 67.8% bf16 MFU | 259155 tok/s +step 792/18794 | loss 4.257862 (+1.05z)| norm 0.4340 (+0.53z)| lr 4.75e-03 | 2033.18 ms | 67.5% bf16 MFU | 259090 tok/s +step 793/18794 | loss 4.204922 (-0.47z)| norm 0.4424 (+0.65z)| lr 4.76e-03 | 2022.45 ms | 67.9% bf16 MFU | 259098 tok/s +step 794/18794 | loss 4.180096 (-1.18z)| norm 0.5051 (+1.58z)| lr 4.76e-03 | 2034.93 ms | 67.4% bf16 MFU | 259025 tok/s +step 795/18794 | loss 4.180001 (-1.16z)| norm 0.3883 (-0.17z)| lr 4.77e-03 | 2016.37 ms | 68.1% bf16 MFU | 259074 tok/s +step 796/18794 | loss 4.188583 (-0.89z)| norm 0.3498 (-0.75z)| lr 4.78e-03 | 2022.08 ms | 67.9% bf16 MFU | 259085 tok/s +step 797/18794 | loss 4.196993 (-0.62z)| norm 0.2799 (-1.78z)| lr 4.78e-03 | 2031.21 ms | 67.6% bf16 MFU | 259036 tok/s +step 798/18794 | loss 4.226426 (+0.30z)| norm 0.3016 (-1.44z)| lr 4.79e-03 | 2028.13 ms | 67.7% bf16 MFU | 259010 tok/s +step 799/18794 | loss 4.208326 (-0.26z)| norm 0.3145 (-1.24z)| lr 4.79e-03 | 2016.79 ms | 68.0% bf16 MFU | 259058 tok/s +step 800/18794 | loss 4.180962 (-1.08z)| norm 0.3565 (-0.62z)| lr 4.80e-03 | 2036.83 ms | 67.4% bf16 MFU | 258975 tok/s +step 801/18794 | loss 4.215302 (+0.00z)| norm 0.4029 (+0.06z)| lr 4.81e-03 | 2034.38 ms | 67.5% bf16 MFU | 258912 tok/s +step 802/18794 | loss 4.169074 (-1.43z)| norm 0.3348 (-0.94z)| lr 4.81e-03 | 2035.35 ms | 67.4% bf16 MFU | 258846 tok/s +step 803/18794 | loss 4.124839 (-2.69z)| norm 0.3516 (-0.69z)| lr 4.82e-03 | 2013.93 ms | 68.1% bf16 MFU | 258920 tok/s +step 804/18794 | loss 4.172917 (-1.21z)| norm 0.3640 (-0.50z)| lr 4.82e-03 | 2038.96 ms | 67.3% bf16 MFU | 258831 tok/s +step 805/18794 | loss 4.131321 (-2.39z)| norm 0.3328 (-0.95z)| lr 4.83e-03 | 2016.62 ms | 68.1% bf16 MFU | 258888 tok/s +step 806/18794 | loss 4.131385 (-2.31z)| norm 0.3320 (-0.95z)| lr 4.84e-03 | 2010.97 ms | 68.2% bf16 MFU | 258980 tok/s +step 807/18794 | loss 4.134926 (-2.13z)| norm 0.3764 (-0.29z)| lr 4.84e-03 | 2028.76 ms | 67.6% bf16 MFU | 258952 tok/s +step 808/18794 | loss 4.135045 (-2.07z)| norm 0.3392 (-0.83z)| lr 4.85e-03 | 2030.91 ms | 67.6% bf16 MFU | 258912 tok/s +step 809/18794 | loss 4.173725 (-0.97z)| norm 0.3058 (-1.29z)| lr 4.85e-03 | 2030.63 ms | 67.6% bf16 MFU | 258876 tok/s +step 810/18794 | loss 4.178339 (-0.82z)| norm 0.3568 (-0.53z)| lr 4.86e-03 | 2021.53 ms | 67.9% bf16 MFU | 258900 tok/s +step 811/18794 | loss 4.167383 (-1.11z)| norm 0.3901 (-0.01z)| lr 4.87e-03 | 2038.74 ms | 67.3% bf16 MFU | 258813 tok/s +step 812/18794 | loss 4.148971 (-1.59z)| norm 0.4345 (+0.69z)| lr 4.87e-03 | 2036.80 ms | 67.4% bf16 MFU | 258743 tok/s +step 813/18794 | loss 4.124070 (-2.23z)| norm 0.3406 (-0.76z)| lr 4.88e-03 | 2022.28 ms | 67.9% bf16 MFU | 258768 tok/s +step 814/18794 | loss 4.137527 (-1.82z)| norm 0.3470 (-0.65z)| lr 4.88e-03 | 2014.48 ms | 68.1% bf16 MFU | 258843 tok/s +step 815/18794 | loss 4.115592 (-2.35z)| norm 0.4296 (+0.65z)| lr 4.89e-03 | 2037.63 ms | 67.3% bf16 MFU | 258766 tok/s +step 816/18794 | loss 4.172841 (-0.77z)| norm 0.4146 (+0.42z)| lr 4.90e-03 | 2008.61 ms | 68.3% bf16 MFU | 258879 tok/s +step 817/18794 | loss 4.185699 (-0.41z)| norm 0.3569 (-0.49z)| lr 4.90e-03 | 2021.29 ms | 67.9% bf16 MFU | 258904 tok/s +step 818/18794 | loss 4.118554 (-2.17z)| norm 0.3624 (-0.40z)| lr 4.91e-03 | 2034.47 ms | 67.5% bf16 MFU | 258844 tok/s +step 819/18794 | loss 4.151232 (-1.27z)| norm 0.3421 (-0.71z)| lr 4.91e-03 | 2008.36 ms | 68.3% bf16 MFU | 258954 tok/s +step 820/18794 | loss 4.115005 (-2.16z)| norm 0.3324 (-0.85z)| lr 4.92e-03 | 2020.95 ms | 67.9% bf16 MFU | 258978 tok/s +step 821/18794 | loss 4.099901 (-2.51z)| norm 0.3700 (-0.25z)| lr 4.93e-03 | 2026.94 ms | 67.7% bf16 MFU | 258962 tok/s +step 822/18794 | loss 4.104650 (-2.34z)| norm 0.3794 (-0.08z)| lr 4.93e-03 | 2023.31 ms | 67.8% bf16 MFU | 258970 tok/s +step 823/18794 | loss 4.145939 (-1.23z)| norm 0.3745 (-0.15z)| lr 4.94e-03 | 2017.84 ms | 68.0% bf16 MFU | 259013 tok/s +step 824/18794 | loss 4.120066 (-1.87z)| norm 0.3902 (+0.10z)| lr 4.94e-03 | 1991.87 ms | 68.9% bf16 MFU | 259223 tok/s +step 825/18794 | loss 4.120040 (-1.83z)| norm 0.3464 (-0.59z)| lr 4.95e-03 | 1999.07 ms | 68.6% bf16 MFU | 259375 tok/s +step 826/18794 | loss 4.101874 (-2.23z)| norm 0.2803 (-1.62z)| lr 4.96e-03 | 2016.30 ms | 68.1% bf16 MFU | 259408 tok/s +step 827/18794 | loss 4.106978 (-2.05z)| norm 0.2354 (-2.25z)| lr 4.96e-03 | 2016.49 ms | 68.1% bf16 MFU | 259437 tok/s +step 828/18794 | loss 4.096829 (-2.22z)| norm 0.2391 (-2.13z)| lr 4.97e-03 | 2030.92 ms | 67.6% bf16 MFU | 259373 tok/s +step 829/18794 | loss 4.079123 (-2.55z)| norm 0.2489 (-1.92z)| lr 4.97e-03 | 2024.41 ms | 67.8% bf16 MFU | 259353 tok/s +step 830/18794 | loss 4.120295 (-1.52z)| norm 0.3205 (-0.85z)| lr 4.98e-03 | 2014.67 ms | 68.1% bf16 MFU | 259397 tok/s +step 831/18794 | loss 4.140466 (-1.02z)| norm 0.4088 (+0.46z)| lr 4.99e-03 | 2021.75 ms | 67.9% bf16 MFU | 259394 tok/s +step 832/18794 | loss 4.143535 (-0.93z)| norm 0.3974 (+0.29z)| lr 4.99e-03 | 2036.73 ms | 67.4% bf16 MFU | 259295 tok/s +step 833/18794 | loss 4.099985 (-1.93z)| norm 0.3942 (+0.25z)| lr 5.00e-03 | 2019.26 ms | 68.0% bf16 MFU | 259312 tok/s +step 834/18794 | loss 4.090281 (-2.10z)| norm 0.3678 (-0.14z)| lr 5.00e-03 | 2009.69 ms | 68.3% bf16 MFU | 259391 tok/s +step 835/18794 | loss 4.078008 (-2.31z)| norm 0.3256 (-0.76z)| lr 5.01e-03 | 1999.28 ms | 68.6% bf16 MFU | 259533 tok/s +step 836/18794 | loss 4.148624 (-0.66z)| norm 0.2783 (-1.43z)| lr 5.02e-03 | 2019.45 ms | 68.0% bf16 MFU | 259537 tok/s +step 837/18794 | loss 4.069115 (-2.42z)| norm 0.2532 (-1.76z)| lr 5.02e-03 | 2014.30 ms | 68.1% bf16 MFU | 259575 tok/s +step 838/18794 | loss 4.097969 (-1.71z)| norm 0.2686 (-1.52z)| lr 5.03e-03 | 1997.63 ms | 68.7% bf16 MFU | 259719 tok/s +step 839/18794 | loss 4.141218 (-0.73z)| norm 0.3224 (-0.75z)| lr 5.03e-03 | 2003.59 ms | 68.5% bf16 MFU | 259817 tok/s +step 840/18794 | loss 4.077302 (-2.09z)| norm 0.3303 (-0.64z)| lr 5.04e-03 | 2007.94 ms | 68.3% bf16 MFU | 259881 tok/s +step 841/18794 | loss 4.097626 (-1.61z)| norm 0.3297 (-0.65z)| lr 5.05e-03 | 2001.94 ms | 68.5% bf16 MFU | 259981 tok/s +step 842/18794 | loss 4.125050 (-0.99z)| norm 0.3819 (+0.10z)| lr 5.05e-03 | 2005.03 ms | 68.4% bf16 MFU | 260057 tok/s +step 843/18794 | loss 4.128376 (-0.91z)| norm 0.4391 (+0.92z)| lr 5.06e-03 | 1998.74 ms | 68.7% bf16 MFU | 260169 tok/s +step 844/18794 | loss 4.123151 (-1.00z)| norm 0.3963 (+0.31z)| lr 5.06e-03 | 2010.48 ms | 68.3% bf16 MFU | 260200 tok/s +step 845/18794 | loss 4.084263 (-1.79z)| norm 0.3479 (-0.39z)| lr 5.07e-03 | 2001.58 ms | 68.6% bf16 MFU | 260287 tok/s +step 846/18794 | loss 4.114887 (-1.12z)| norm 0.3921 (+0.24z)| lr 5.08e-03 | 2007.01 ms | 68.4% bf16 MFU | 260334 tok/s +step 847/18794 | loss 4.072550 (-1.95z)| norm 0.4599 (+1.20z)| lr 5.08e-03 | 2015.71 ms | 68.1% bf16 MFU | 260322 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.580946 +step 848/18794 | loss 4.137464 (-0.59z)| norm 0.5646 (+2.58z)| lr 5.09e-03 | 2014.11 ms | 68.1% bf16 MFU | 260321 tok/s +step 849/18794 | loss 4.207891 (+0.86z)| norm 0.3788 (+0.01z)| lr 5.09e-03 | 2013.10 ms | 68.2% bf16 MFU | 260327 tok/s +step 850/18794 | loss 4.131241 (-0.72z)| norm 0.4074 (+0.41z)| lr 5.10e-03 | 1984.13 ms | 69.2% bf16 MFU | 260523 tok/s +step 851/18794 | loss 4.160220 (-0.10z)| norm 0.3758 (-0.02z)| lr 5.11e-03 | 2013.63 ms | 68.2% bf16 MFU | 260515 tok/s +step 852/18794 | loss 4.116199 (-1.00z)| norm 0.4255 (+0.71z)| lr 5.11e-03 | 2025.69 ms | 67.7% bf16 MFU | 260430 tok/s +step 853/18794 | loss 4.170016 (+0.13z)| norm 0.5064 (+1.86z)| lr 5.12e-03 | 1989.20 ms | 69.0% bf16 MFU | 260587 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.016839 +step 854/18794 | loss 4.128336 (-0.73z)| norm 0.5195 (+2.02z)| lr 5.12e-03 | 1993.13 ms | 68.9% bf16 MFU | 260710 tok/s +step 855/18794 | loss 4.210484 (+1.01z)| norm 0.4008 (+0.34z)| lr 5.13e-03 | 2021.89 ms | 67.9% bf16 MFU | 260640 tok/s +step 856/18794 | loss 4.155339 (-0.14z)| norm 0.3434 (-0.46z)| lr 5.14e-03 | 2014.82 ms | 68.1% bf16 MFU | 260619 tok/s +step 857/18794 | loss 4.119396 (-0.90z)| norm 0.2958 (-1.12z)| lr 5.14e-03 | 2016.01 ms | 68.1% bf16 MFU | 260591 tok/s +step 858/18794 | loss 4.144351 (-0.35z)| norm 0.3354 (-0.56z)| lr 5.15e-03 | 2008.45 ms | 68.3% bf16 MFU | 260613 tok/s +step 859/18794 | loss 4.103805 (-1.20z)| norm 0.4159 (+0.56z)| lr 5.15e-03 | 1989.62 ms | 69.0% bf16 MFU | 260758 tok/s +step 860/18794 | loss 4.115718 (-0.93z)| norm 0.4259 (+0.69z)| lr 5.16e-03 | 1998.04 ms | 68.7% bf16 MFU | 260840 tok/s +step 861/18794 | loss 4.128197 (-0.64z)| norm 0.3599 (-0.25z)| lr 5.17e-03 | 2002.96 ms | 68.5% bf16 MFU | 260886 tok/s +step 862/18794 | loss 4.137033 (-0.44z)| norm 0.3461 (-0.45z)| lr 5.17e-03 | 2000.99 ms | 68.6% bf16 MFU | 260943 tok/s +step 863/18794 | loss 4.116230 (-0.88z)| norm 0.3501 (-0.40z)| lr 5.18e-03 | 2015.79 ms | 68.1% bf16 MFU | 260900 tok/s +step 864/18794 | loss 4.126096 (-0.65z)| norm 0.3485 (-0.43z)| lr 5.18e-03 | 2010.93 ms | 68.2% bf16 MFU | 260891 tok/s +step 865/18794 | loss 4.176356 (+0.46z)| norm 0.3609 (-0.25z)| lr 5.19e-03 | 2015.47 ms | 68.1% bf16 MFU | 260853 tok/s +step 866/18794 | loss 4.118066 (-0.81z)| norm 0.3591 (-0.28z)| lr 5.20e-03 | 2017.54 ms | 68.0% bf16 MFU | 260804 tok/s +step 867/18794 | loss 4.161514 (+0.16z)| norm 0.3713 (-0.11z)| lr 5.20e-03 | 2016.73 ms | 68.0% bf16 MFU | 260762 tok/s +step 868/18794 | loss 4.131470 (-0.49z)| norm 0.3033 (-1.07z)| lr 5.21e-03 | 1992.85 ms | 68.9% bf16 MFU | 260878 tok/s +step 869/18794 | loss 4.074656 (-1.72z)| norm 0.3461 (-0.47z)| lr 5.21e-03 | 2000.19 ms | 68.6% bf16 MFU | 260940 tok/s +step 870/18794 | loss 4.105682 (-1.02z)| norm 0.4025 (+0.33z)| lr 5.22e-03 | 2000.36 ms | 68.6% bf16 MFU | 260998 tok/s +step 871/18794 | loss 4.107969 (-0.95z)| norm 0.4203 (+0.58z)| lr 5.23e-03 | 2007.67 ms | 68.4% bf16 MFU | 261005 tok/s +step 872/18794 | loss 4.058880 (-1.96z)| norm 0.3714 (-0.12z)| lr 5.23e-03 | 2004.58 ms | 68.5% bf16 MFU | 261032 tok/s +step 873/18794 | loss 4.127708 (-0.47z)| norm 0.3963 (+0.23z)| lr 5.24e-03 | 2014.99 ms | 68.1% bf16 MFU | 260990 tok/s +step 874/18794 | loss 4.060672 (-1.87z)| norm 0.3209 (-0.84z)| lr 5.24e-03 | 1992.18 ms | 68.9% bf16 MFU | 261099 tok/s +step 875/18794 | loss 4.083861 (-1.34z)| norm 0.3139 (-0.93z)| lr 5.25e-03 | 2002.86 ms | 68.5% bf16 MFU | 261133 tok/s +step 876/18794 | loss 4.107159 (-0.83z)| norm 0.3082 (-1.00z)| lr 5.26e-03 | 2025.44 ms | 67.8% bf16 MFU | 261019 tok/s +step 877/18794 | loss 4.089521 (-1.18z)| norm 0.3034 (-1.05z)| lr 5.26e-03 | 2011.38 ms | 68.2% bf16 MFU | 261001 tok/s +step 878/18794 | loss 4.049126 (-1.98z)| norm 0.3049 (-1.00z)| lr 5.27e-03 | 1989.36 ms | 69.0% bf16 MFU | 261128 tok/s +step 879/18794 | loss 4.049494 (-1.93z)| norm 0.2670 (-1.50z)| lr 5.27e-03 | 2008.15 ms | 68.3% bf16 MFU | 261126 tok/s +step 880/18794 | loss 4.092959 (-1.00z)| norm 0.2668 (-1.47z)| lr 5.28e-03 | 1992.16 ms | 68.9% bf16 MFU | 261228 tok/s +step 881/18794 | loss 4.068583 (-1.48z)| norm 0.3161 (-0.78z)| lr 5.29e-03 | 1992.20 ms | 68.9% bf16 MFU | 261325 tok/s +step 882/18794 | loss 4.091638 (-0.98z)| norm 0.3399 (-0.45z)| lr 5.29e-03 | 2001.25 ms | 68.6% bf16 MFU | 261358 tok/s +step 883/18794 | loss 4.047282 (-1.85z)| norm 0.2787 (-1.27z)| lr 5.30e-03 | 2006.39 ms | 68.4% bf16 MFU | 261356 tok/s +step 884/18794 | loss 4.128409 (-0.18z)| norm 0.2722 (-1.33z)| lr 5.30e-03 | 1983.93 ms | 69.2% bf16 MFU | 261501 tok/s +step 885/18794 | loss 4.073815 (-1.29z)| norm 0.3288 (-0.54z)| lr 5.31e-03 | 1997.19 ms | 68.7% bf16 MFU | 261552 tok/s +step 886/18794 | loss 4.106949 (-0.58z)| norm 0.3996 (+0.46z)| lr 5.32e-03 | 1983.95 ms | 69.2% bf16 MFU | 261688 tok/s +step 887/18794 | loss 4.103860 (-0.63z)| norm 0.3558 (-0.14z)| lr 5.32e-03 | 2005.30 ms | 68.4% bf16 MFU | 261676 tok/s +step 888/18794 | loss 4.052711 (-1.71z)| norm 0.3585 (-0.09z)| lr 5.33e-03 | 2004.24 ms | 68.5% bf16 MFU | 261671 tok/s +step 889/18794 | loss 4.097389 (-0.72z)| norm 0.3913 (+0.40z)| lr 5.33e-03 | 1992.31 ms | 68.9% bf16 MFU | 261746 tok/s +step 890/18794 | loss 4.051426 (-1.75z)| norm 0.2940 (-1.05z)| lr 5.34e-03 | 2018.00 ms | 68.0% bf16 MFU | 261649 tok/s +step 891/18794 | loss 4.097257 (-0.68z)| norm 0.2916 (-1.10z)| lr 5.35e-03 | 2006.04 ms | 68.4% bf16 MFU | 261634 tok/s +step 892/18794 | loss 4.058050 (-1.60z)| norm 0.3772 (+0.34z)| lr 5.35e-03 | 2009.82 ms | 68.3% bf16 MFU | 261595 tok/s +step 893/18794 | loss 4.052755 (-1.70z)| norm 0.3722 (+0.27z)| lr 5.36e-03 | 2005.71 ms | 68.4% bf16 MFU | 261585 tok/s +step 894/18794 | loss 4.143751 (+0.51z)| norm 0.4016 (+0.82z)| lr 5.36e-03 | 1982.64 ms | 69.2% bf16 MFU | 261728 tok/s +step 895/18794 | loss 4.104679 (-0.43z)| norm 0.4079 (+0.92z)| lr 5.37e-03 | 2000.80 ms | 68.6% bf16 MFU | 261744 tok/s +step 896/18794 | loss 4.054218 (-1.63z)| norm 0.4208 (+1.13z)| lr 5.38e-03 | 1988.65 ms | 69.0% bf16 MFU | 261838 tok/s +step 897/18794 | loss 4.051517 (-1.67z)| norm 0.4255 (+1.19z)| lr 5.38e-03 | 2007.32 ms | 68.4% bf16 MFU | 261806 tok/s +step 898/18794 | loss 4.070123 (-1.21z)| norm 0.3807 (+0.39z)| lr 5.39e-03 | 2000.09 ms | 68.6% bf16 MFU | 261822 tok/s +step 899/18794 | loss 4.042198 (-1.89z)| norm 0.3360 (-0.39z)| lr 5.39e-03 | 1983.94 ms | 69.2% bf16 MFU | 261944 tok/s +step 900/18794 | loss 4.096564 (-0.49z)| norm 0.3417 (-0.29z)| lr 5.40e-03 | 1982.18 ms | 69.2% bf16 MFU | 262072 tok/s +step 901/18794 | loss 4.067157 (-1.24z)| norm 0.2896 (-1.18z)| lr 5.41e-03 | 1988.93 ms | 69.0% bf16 MFU | 262149 tok/s +step 902/18794 | loss 4.015620 (-2.52z)| norm 0.2665 (-1.56z)| lr 5.41e-03 | 1995.71 ms | 68.8% bf16 MFU | 262177 tok/s +step 903/18794 | loss 4.055884 (-1.44z)| norm 0.2725 (-1.42z)| lr 5.42e-03 | 2002.32 ms | 68.5% bf16 MFU | 262160 tok/s +step 904/18794 | loss 4.093532 (-0.45z)| norm 0.3203 (-0.60z)| lr 5.42e-03 | 1990.49 ms | 68.9% bf16 MFU | 262222 tok/s +step 905/18794 | loss 4.086950 (-0.61z)| norm 0.3908 (+0.60z)| lr 5.43e-03 | 2007.14 ms | 68.4% bf16 MFU | 262171 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.161101 +step 906/18794 | loss 4.142853 (+0.84z)| norm 0.4870 (+2.16z)| lr 5.44e-03 | 1993.62 ms | 68.8% bf16 MFU | 262212 tok/s +step 907/18794 | loss 4.102249 (-0.21z)| norm 0.3780 (+0.34z)| lr 5.44e-03 | 1981.77 ms | 69.2% bf16 MFU | 262329 tok/s +step 908/18794 | loss 4.040511 (-1.77z)| norm 0.3680 (+0.17z)| lr 5.45e-03 | 2016.40 ms | 68.1% bf16 MFU | 262213 tok/s +step 909/18794 | loss 4.080678 (-0.72z)| norm 0.3805 (+0.37z)| lr 5.45e-03 | 1988.34 ms | 69.0% bf16 MFU | 262287 tok/s +step 910/18794 | loss 4.115909 (+0.22z)| norm 0.3101 (-0.80z)| lr 5.46e-03 | 2003.56 ms | 68.5% bf16 MFU | 262256 tok/s +step 911/18794 | loss 4.098819 (-0.22z)| norm 0.3348 (-0.38z)| lr 5.47e-03 | 2003.69 ms | 68.5% bf16 MFU | 262226 tok/s +step 912/18794 | loss 4.066014 (-1.08z)| norm 0.3501 (-0.11z)| lr 5.47e-03 | 1978.21 ms | 69.4% bf16 MFU | 262367 tok/s +step 913/18794 | loss 4.074070 (-0.84z)| norm 0.3136 (-0.72z)| lr 5.48e-03 | 1982.16 ms | 69.2% bf16 MFU | 262473 tok/s +step 914/18794 | loss 4.040489 (-1.70z)| norm 0.3315 (-0.41z)| lr 5.48e-03 | 2017.33 ms | 68.0% bf16 MFU | 262344 tok/s +step 915/18794 | loss 4.073431 (-0.81z)| norm 0.3634 (+0.13z)| lr 5.49e-03 | 1978.69 ms | 69.4% bf16 MFU | 262476 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.037445 +step 916/18794 | loss 4.107963 (+0.12z)| norm 0.4787 (+2.04z)| lr 5.50e-03 | 1994.23 ms | 68.8% bf16 MFU | 262497 tok/s +step 917/18794 | loss 4.075378 (-0.74z)| norm 0.4131 (+0.93z)| lr 5.50e-03 | 1991.76 ms | 68.9% bf16 MFU | 262533 tok/s +step 918/18794 | loss 4.083177 (-0.52z)| norm 0.3418 (-0.24z)| lr 5.51e-03 | 1977.48 ms | 69.4% bf16 MFU | 262663 tok/s +step 919/18794 | loss 4.088358 (-0.36z)| norm 0.3712 (+0.24z)| lr 5.51e-03 | 1991.99 ms | 68.9% bf16 MFU | 262690 tok/s +step 920/18794 | loss 4.082216 (-0.52z)| norm 0.3372 (-0.33z)| lr 5.52e-03 | 2004.70 ms | 68.5% bf16 MFU | 262632 tok/s +step 921/18794 | loss 3.995125 (-2.79z)| norm 0.2724 (-1.37z)| lr 5.53e-03 | 2011.32 ms | 68.2% bf16 MFU | 262534 tok/s +step 922/18794 | loss 4.077301 (-0.60z)| norm 0.2747 (-1.30z)| lr 5.53e-03 | 2002.70 ms | 68.5% bf16 MFU | 262497 tok/s +step 923/18794 | loss 4.078315 (-0.56z)| norm 0.3495 (-0.08z)| lr 5.54e-03 | 1984.45 ms | 69.2% bf16 MFU | 262582 tok/s +step 924/18794 | loss 4.069963 (-0.76z)| norm 0.3628 (+0.14z)| lr 5.54e-03 | 1993.46 ms | 68.8% bf16 MFU | 262603 tok/s +step 925/18794 | loss 4.016793 (-2.11z)| norm 0.2969 (-0.92z)| lr 5.55e-03 | 1991.99 ms | 68.9% bf16 MFU | 262633 tok/s +step 926/18794 | loss 4.009457 (-2.22z)| norm 0.2974 (-0.92z)| lr 5.56e-03 | 1997.78 ms | 68.7% bf16 MFU | 262623 tok/s +step 927/18794 | loss 4.087643 (-0.23z)| norm 0.2962 (-0.96z)| lr 5.56e-03 | 1991.50 ms | 68.9% bf16 MFU | 262655 tok/s +step 928/18794 | loss 4.034255 (-1.55z)| norm 0.2853 (-1.17z)| lr 5.57e-03 | 1985.87 ms | 69.1% bf16 MFU | 262722 tok/s +step 929/18794 | loss 4.070918 (-0.62z)| norm 0.2658 (-1.50z)| lr 5.57e-03 | 1982.73 ms | 69.2% bf16 MFU | 262808 tok/s +step 930/18794 | loss 4.033844 (-1.52z)| norm 0.3061 (-0.82z)| lr 5.58e-03 | 1976.70 ms | 69.4% bf16 MFU | 262929 tok/s +step 931/18794 | loss 4.066938 (-0.68z)| norm 0.3027 (-0.86z)| lr 5.59e-03 | 1983.79 ms | 69.2% bf16 MFU | 262997 tok/s +step 932/18794 | loss 4.009040 (-2.06z)| norm 0.3401 (-0.22z)| lr 5.59e-03 | 1994.87 ms | 68.8% bf16 MFU | 262988 tok/s +step 933/18794 | loss 4.036781 (-1.35z)| norm 0.3564 (+0.06z)| lr 5.60e-03 | 1994.79 ms | 68.8% bf16 MFU | 262980 tok/s +step 934/18794 | loss 4.030365 (-1.48z)| norm 0.3651 (+0.20z)| lr 5.60e-03 | 1985.43 ms | 69.1% bf16 MFU | 263034 tok/s +step 935/18794 | loss 4.116284 (+0.58z)| norm 0.4314 (+1.29z)| lr 5.61e-03 | 1977.66 ms | 69.4% bf16 MFU | 263138 tok/s +step 936/18794 | loss 4.048021 (-1.04z)| norm 0.3565 (+0.03z)| lr 5.62e-03 | 1979.82 ms | 69.3% bf16 MFU | 263222 tok/s +step 937/18794 | loss 3.984747 (-2.47z)| norm 0.3107 (-0.76z)| lr 5.62e-03 | 1978.55 ms | 69.4% bf16 MFU | 263310 tok/s +step 938/18794 | loss 3.973603 (-2.61z)| norm 0.3371 (-0.33z)| lr 5.63e-03 | 1983.88 ms | 69.2% bf16 MFU | 263358 tok/s +step 939/18794 | loss 4.061679 (-0.60z)| norm 0.3358 (-0.35z)| lr 5.63e-03 | 1978.13 ms | 69.4% bf16 MFU | 263442 tok/s +step 940/18794 | loss 3.942828 (-3.11z)| norm 0.3201 (-0.62z)| lr 5.64e-03 | 2001.82 ms | 68.6% bf16 MFU | 263365 tok/s +step 941/18794 | loss 4.046959 (-0.85z)| norm 0.3069 (-0.84z)| lr 5.65e-03 | 1993.42 ms | 68.8% bf16 MFU | 263348 tok/s +step 942/18794 | loss 4.040483 (-0.97z)| norm 0.3759 (+0.35z)| lr 5.65e-03 | 1980.64 ms | 69.3% bf16 MFU | 263416 tok/s +step 943/18794 | loss 4.005013 (-1.68z)| norm 0.3563 (+0.02z)| lr 5.66e-03 | 1978.90 ms | 69.3% bf16 MFU | 263492 tok/s +step 944/18794 | loss 4.051805 (-0.67z)| norm 0.3274 (-0.47z)| lr 5.66e-03 | 1978.78 ms | 69.4% bf16 MFU | 263565 tok/s +step 945/18794 | loss 4.020381 (-1.32z)| norm 0.3043 (-0.86z)| lr 5.67e-03 | 1979.24 ms | 69.3% bf16 MFU | 263631 tok/s +step 946/18794 | loss 4.062976 (-0.41z)| norm 0.3954 (+0.72z)| lr 5.68e-03 | 1980.65 ms | 69.3% bf16 MFU | 263685 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.105304 +step 947/18794 | loss 4.062917 (-0.41z)| norm 0.5398 (+3.11z)| lr 5.68e-03 | 1982.16 ms | 69.2% bf16 MFU | 263726 tok/s +step 948/18794 | loss 4.042525 (-0.82z)| norm 0.4103 (+1.02z)| lr 5.69e-03 | 1988.35 ms | 69.0% bf16 MFU | 263724 tok/s +step 949/18794 | loss 4.016838 (-1.36z)| norm 0.3487 (-0.07z)| lr 5.69e-03 | 1995.46 ms | 68.8% bf16 MFU | 263674 tok/s +step 950/18794 | loss 4.068976 (-0.21z)| norm 0.3082 (-0.78z)| lr 5.70e-03 | 1983.52 ms | 69.2% bf16 MFU | 263707 tok/s +step 951/18794 | loss 4.026485 (-1.13z)| norm 0.3086 (-0.76z)| lr 5.71e-03 | 1983.93 ms | 69.2% bf16 MFU | 263735 tok/s +step 952/18794 | loss 4.026212 (-1.11z)| norm 0.3050 (-0.81z)| lr 5.71e-03 | 1979.27 ms | 69.3% bf16 MFU | 263793 tok/s +step 953/18794 | loss 4.000232 (-1.66z)| norm 0.2961 (-0.96z)| lr 5.72e-03 | 1979.50 ms | 69.3% bf16 MFU | 263846 tok/s +step 954/18794 | loss 4.025471 (-1.08z)| norm 0.3281 (-0.35z)| lr 5.72e-03 | 2024.50 ms | 67.8% bf16 MFU | 263602 tok/s +step 955/18794 | loss 4.033526 (-0.90z)| norm 0.3217 (-0.46z)| lr 5.73e-03 | 2041.30 ms | 67.2% bf16 MFU | 263264 tok/s +step 956/18794 | loss 3.984772 (-2.00z)| norm 0.2905 (-1.06z)| lr 5.74e-03 | 2042.42 ms | 67.2% bf16 MFU | 262936 tok/s +step 957/18794 | loss 4.001049 (-1.58z)| norm 0.2933 (-1.01z)| lr 5.74e-03 | 2040.94 ms | 67.2% bf16 MFU | 262633 tok/s +step 958/18794 | loss 4.006310 (-1.43z)| norm 0.2848 (-1.16z)| lr 5.75e-03 | 2040.64 ms | 67.2% bf16 MFU | 262348 tok/s +step 959/18794 | loss 3.989683 (-1.77z)| norm 0.3148 (-0.56z)| lr 5.75e-03 | 2038.99 ms | 67.3% bf16 MFU | 262087 tok/s +step 960/18794 | loss 3.995135 (-1.61z)| norm 0.3034 (-0.77z)| lr 5.76e-03 | 2033.74 ms | 67.5% bf16 MFU | 261872 tok/s +step 961/18794 | loss 4.021252 (-0.99z)| norm 0.3102 (-0.62z)| lr 5.77e-03 | 2033.53 ms | 67.5% bf16 MFU | 261670 tok/s +step 962/18794 | loss 3.986036 (-1.76z)| norm 0.3243 (-0.34z)| lr 5.77e-03 | 2033.46 ms | 67.5% bf16 MFU | 261478 tok/s +step 963/18794 | loss 3.961659 (-2.25z)| norm 0.3649 (+0.47z)| lr 5.78e-03 | 2026.66 ms | 67.7% bf16 MFU | 261339 tok/s +step 964/18794 | loss 3.979121 (-1.81z)| norm 0.4093 (+1.32z)| lr 5.78e-03 | 2032.63 ms | 67.5% bf16 MFU | 261169 tok/s +step 965/18794 | loss 3.955607 (-2.30z)| norm 0.3861 (+0.86z)| lr 5.79e-03 | 2025.68 ms | 67.7% bf16 MFU | 261051 tok/s +step 966/18794 | loss 3.969290 (-1.94z)| norm 0.2984 (-0.84z)| lr 5.80e-03 | 2034.32 ms | 67.5% bf16 MFU | 260885 tok/s +step 967/18794 | loss 4.008534 (-1.05z)| norm 0.3116 (-0.58z)| lr 5.80e-03 | 2028.16 ms | 67.7% bf16 MFU | 260766 tok/s +step 968/18794 | loss 3.963070 (-2.04z)| norm 0.3362 (-0.10z)| lr 5.81e-03 | 2024.66 ms | 67.8% bf16 MFU | 260675 tok/s +step 969/18794 | loss 3.993742 (-1.32z)| norm 0.3953 (+1.04z)| lr 5.81e-03 | 2020.31 ms | 67.9% bf16 MFU | 260617 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.518403 +step 970/18794 | loss 4.050469 (-0.02z)| norm 0.4753 (+2.52z)| lr 5.82e-03 | 2023.87 ms | 67.8% bf16 MFU | 260539 tok/s +step 971/18794 | loss 4.058697 (+0.18z)| norm 0.4255 (+1.57z)| lr 5.83e-03 | 2032.14 ms | 67.5% bf16 MFU | 260411 tok/s +step 972/18794 | loss 4.048565 (-0.05z)| norm 0.4141 (+1.34z)| lr 5.83e-03 | 2039.89 ms | 67.3% bf16 MFU | 260242 tok/s +step 973/18794 | loss 3.986968 (-1.44z)| norm 0.3337 (-0.17z)| lr 5.84e-03 | 2032.34 ms | 67.5% bf16 MFU | 260128 tok/s +step 974/18794 | loss 3.998726 (-1.15z)| norm 0.3113 (-0.59z)| lr 5.84e-03 | 2037.88 ms | 67.3% bf16 MFU | 259985 tok/s +step 975/18794 | loss 3.961995 (-1.93z)| norm 0.3653 (+0.42z)| lr 5.85e-03 | 2034.79 ms | 67.4% bf16 MFU | 259869 tok/s +step 976/18794 | loss 3.953552 (-2.06z)| norm 0.3756 (+0.61z)| lr 5.86e-03 | 2024.30 ms | 67.8% bf16 MFU | 259826 tok/s +step 977/18794 | loss 4.009466 (-0.80z)| norm 0.3351 (-0.16z)| lr 5.86e-03 | 2018.42 ms | 68.0% bf16 MFU | 259822 tok/s +step 978/18794 | loss 4.010533 (-0.77z)| norm 0.3202 (-0.45z)| lr 5.87e-03 | 2038.69 ms | 67.3% bf16 MFU | 259689 tok/s +step 979/18794 | loss 3.934611 (-2.36z)| norm 0.3213 (-0.45z)| lr 5.87e-03 | 2039.69 ms | 67.3% bf16 MFU | 259557 tok/s +step 980/18794 | loss 3.916381 (-2.63z)| norm 0.3120 (-0.64z)| lr 5.88e-03 | 2012.25 ms | 68.2% bf16 MFU | 259607 tok/s +step 981/18794 | loss 3.946698 (-1.94z)| norm 0.2817 (-1.21z)| lr 5.89e-03 | 2034.33 ms | 67.5% bf16 MFU | 259512 tok/s +step 982/18794 | loss 3.943955 (-1.94z)| norm 0.2722 (-1.37z)| lr 5.89e-03 | 2023.49 ms | 67.8% bf16 MFU | 259492 tok/s +step 983/18794 | loss 3.964423 (-1.49z)| norm 0.2664 (-1.47z)| lr 5.90e-03 | 2041.14 ms | 67.2% bf16 MFU | 259360 tok/s +step 984/18794 | loss 3.943629 (-1.87z)| norm 0.2628 (-1.53z)| lr 5.90e-03 | 2012.19 ms | 68.2% bf16 MFU | 259420 tok/s +step 985/18794 | loss 3.945312 (-1.78z)| norm 0.3313 (-0.24z)| lr 5.91e-03 | 2029.35 ms | 67.6% bf16 MFU | 259367 tok/s +step 986/18794 | loss 3.973089 (-1.21z)| norm 0.4419 (+1.83z)| lr 5.92e-03 | 2022.69 ms | 67.8% bf16 MFU | 259358 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.736006 +step 987/18794 | loss 3.992093 (-0.82z)| norm 0.4975 (+2.74z)| lr 5.92e-03 | 2032.37 ms | 67.5% bf16 MFU | 259289 tok/s +step 988/18794 | loss 3.956742 (-1.49z)| norm 0.3529 (+0.13z)| lr 5.93e-03 | 2022.86 ms | 67.8% bf16 MFU | 259283 tok/s +step 989/18794 | loss 4.010759 (-0.41z)| norm 0.3394 (-0.10z)| lr 5.93e-03 | 2027.10 ms | 67.7% bf16 MFU | 259251 tok/s +step 990/18794 | loss 3.963907 (-1.31z)| norm 0.2946 (-0.91z)| lr 5.94e-03 | 2030.49 ms | 67.6% bf16 MFU | 259199 tok/s +step 991/18794 | loss 3.954001 (-1.48z)| norm 0.2837 (-1.11z)| lr 5.95e-03 | 2021.49 ms | 67.9% bf16 MFU | 259207 tok/s +step 992/18794 | loss 4.048676 (+0.39z)| norm 0.2586 (-1.52z)| lr 5.95e-03 | 2012.26 ms | 68.2% bf16 MFU | 259274 tok/s +step 993/18794 | loss 3.943250 (-1.64z)| norm 0.2260 (-2.04z)| lr 5.96e-03 | 2017.05 ms | 68.0% bf16 MFU | 259307 tok/s +step 994/18794 | loss 3.963942 (-1.23z)| norm 0.2480 (-1.61z)| lr 5.96e-03 | 2031.71 ms | 67.5% bf16 MFU | 259244 tok/s +step 995/18794 | loss 3.926036 (-1.93z)| norm 0.2978 (-0.73z)| lr 5.97e-03 | 2024.70 ms | 67.8% bf16 MFU | 259229 tok/s +step 996/18794 | loss 3.972229 (-1.00z)| norm 0.3381 (-0.01z)| lr 5.98e-03 | 2017.31 ms | 68.0% bf16 MFU | 259262 tok/s +step 997/18794 | loss 3.964625 (-1.13z)| norm 0.3221 (-0.28z)| lr 5.98e-03 | 1995.71 ms | 68.8% bf16 MFU | 259435 tok/s +step 998/18794 | loss 3.953213 (-1.32z)| norm 0.3118 (-0.45z)| lr 5.99e-03 | 2000.61 ms | 68.6% bf16 MFU | 259566 tok/s +step 999/18794 | loss 3.998179 (-0.44z)| norm 0.3313 (-0.10z)| lr 5.99e-03 | 2023.62 ms | 67.8% bf16 MFU | 259542 tok/s +step 1000/18794 | loss 3.929062 (-1.74z)| norm 0.3279 (-0.16z)| lr 6.00e-03 | 2025.14 ms | 67.8% bf16 MFU | 259509 tok/s +val loss 3.988947 +HellaSwag: 2538/10042 = 0.252739Swag: 990/1256: 0/1256 +Writing state to log_gpt3_125M_edu_v4/state_00001000_00001.bin +generating: +--- +|Tars||Paracontic patients can make your damaged plant as its own roots. Magnines. Here are a few natural wonders: +- Rats. The bugs are a great choice for tooth sensitivity. With the vital tools our bodies can help fix the ailments that occur at a later stage. Eating so enjoy tasting tasty people, reduce the like +lihood of developing the disease. +- Eye therapies. Your body needs support when you have these deep coping skills. +- Use their thyroid and bone broth every night to protect vision and balance. +- See every single day of the day +- Donance your back with this book.<|endoftext|>CEP is a fun, educational experience that is generous when one's +--- +Writing checkpoint at step 1000 +Writing model to log_gpt3_125M_edu_v4/model_00001000.bin +Writing state to log_gpt3_125M_edu_v4/state_00001000_00000.bin +step 1001/18794 | loss 4.030472 (+0.23z)| norm 0.3684 (+0.55z)| lr 6.00e-03 | 2020.95 ms | 67.9% bf16 MFU | 259505 tok/s +step 1002/18794 | loss 3.951807 (-1.28z)| norm 0.3761 (+0.67z)| lr 6.00e-03 | 2021.66 ms | 67.9% bf16 MFU | 259497 tok/s +step 1003/18794 | loss 3.940065 (-1.47z)| norm 0.3832 (+0.78z)| lr 6.00e-03 | 2023.31 ms | 67.8% bf16 MFU | 259478 tok/s +step 1004/18794 | loss 4.029163 (+0.25z)| norm 0.4024 (+1.11z)| lr 6.00e-03 | 2016.62 ms | 68.1% bf16 MFU | 259503 tok/s +step 1005/18794 | loss 3.970728 (-0.86z)| norm 0.3793 (+0.70z)| lr 6.00e-03 | 2032.89 ms | 67.5% bf16 MFU | 259423 tok/s +step 1006/18794 | loss 3.964808 (-0.97z)| norm 0.3894 (+0.92z)| lr 6.00e-03 | 2020.56 ms | 67.9% bf16 MFU | 259426 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.144245 +step 1007/18794 | loss 3.983066 (-0.59z)| norm 0.4582 (+2.14z)| lr 6.00e-03 | 2022.00 ms | 67.9% bf16 MFU | 259419 tok/s +step 1008/18794 | loss 4.007574 (-0.09z)| norm 0.4525 (+1.99z)| lr 6.00e-03 | 2017.73 ms | 68.0% bf16 MFU | 259440 tok/s +step 1009/18794 | loss 4.014793 (+0.08z)| norm 0.3870 (+0.82z)| lr 6.00e-03 | 2002.75 ms | 68.5% bf16 MFU | 259558 tok/s +step 1010/18794 | loss 3.930901 (-1.63z)| norm 0.3531 (+0.20z)| lr 6.00e-03 | 2020.16 ms | 67.9% bf16 MFU | 259556 tok/s +step 1011/18794 | loss 3.983633 (-0.52z)| norm 0.3205 (-0.38z)| lr 6.00e-03 | 2010.30 ms | 68.3% bf16 MFU | 259618 tok/s +step 1012/18794 | loss 3.923619 (-1.74z)| norm 0.3479 (+0.11z)| lr 6.00e-03 | 2019.63 ms | 67.9% bf16 MFU | 259617 tok/s +step 1013/18794 | loss 3.951391 (-1.14z)| norm 0.3117 (-0.54z)| lr 6.00e-03 | 2020.79 ms | 67.9% bf16 MFU | 259609 tok/s +step 1014/18794 | loss 3.953026 (-1.08z)| norm 0.3067 (-0.62z)| lr 6.00e-03 | 2020.87 ms | 67.9% bf16 MFU | 259600 tok/s +step 1015/18794 | loss 3.948114 (-1.16z)| norm 0.3139 (-0.48z)| lr 6.00e-03 | 2030.87 ms | 67.6% bf16 MFU | 259528 tok/s +step 1016/18794 | loss 3.936494 (-1.39z)| norm 0.3085 (-0.56z)| lr 6.00e-03 | 1993.95 ms | 68.8% bf16 MFU | 259699 tok/s +step 1017/18794 | loss 3.930916 (-1.48z)| norm 0.2990 (-0.72z)| lr 6.00e-03 | 1986.86 ms | 69.1% bf16 MFU | 259908 tok/s +step 1018/18794 | loss 3.952284 (-1.01z)| norm 0.2922 (-0.83z)| lr 6.00e-03 | 2017.17 ms | 68.0% bf16 MFU | 259908 tok/s +step 1019/18794 | loss 3.980936 (-0.37z)| norm 0.2996 (-0.69z)| lr 6.00e-03 | 2015.28 ms | 68.1% bf16 MFU | 259920 tok/s +step 1020/18794 | loss 3.988199 (-0.19z)| norm 0.2712 (-1.19z)| lr 6.00e-03 | 2027.53 ms | 67.7% bf16 MFU | 259853 tok/s +step 1021/18794 | loss 3.970093 (-0.59z)| norm 0.2691 (-1.22z)| lr 6.00e-03 | 2018.20 ms | 68.0% bf16 MFU | 259850 tok/s +step 1022/18794 | loss 3.935930 (-1.34z)| norm 0.2362 (-1.80z)| lr 6.00e-03 | 2011.37 ms | 68.2% bf16 MFU | 259890 tok/s +step 1023/18794 | loss 3.909161 (-1.90z)| norm 0.2483 (-1.55z)| lr 6.00e-03 | 2025.35 ms | 67.8% bf16 MFU | 259839 tok/s +step 1024/18794 | loss 3.900469 (-2.04z)| norm 0.2529 (-1.43z)| lr 6.00e-03 | 2012.61 ms | 68.2% bf16 MFU | 259872 tok/s +step 1025/18794 | loss 3.889063 (-2.22z)| norm 0.2785 (-0.97z)| lr 6.00e-03 | 2017.45 ms | 68.0% bf16 MFU | 259872 tok/s +step 1026/18794 | loss 3.952182 (-0.82z)| norm 0.2947 (-0.68z)| lr 6.00e-03 | 2001.63 ms | 68.6% bf16 MFU | 259975 tok/s +step 1027/18794 | loss 3.938226 (-1.11z)| norm 0.2653 (-1.19z)| lr 6.00e-03 | 2011.74 ms | 68.2% bf16 MFU | 260007 tok/s +step 1028/18794 | loss 3.915414 (-1.58z)| norm 0.2432 (-1.56z)| lr 6.00e-03 | 2012.30 ms | 68.2% bf16 MFU | 260034 tok/s +step 1029/18794 | loss 3.902703 (-1.82z)| norm 0.2647 (-1.18z)| lr 6.00e-03 | 2009.99 ms | 68.3% bf16 MFU | 260074 tok/s +step 1030/18794 | loss 3.935101 (-1.09z)| norm 0.3338 (+0.01z)| lr 6.00e-03 | 1998.68 ms | 68.7% bf16 MFU | 260186 tok/s +step 1031/18794 | loss 3.931273 (-1.16z)| norm 0.4229 (+1.53z)| lr 6.00e-03 | 2001.08 ms | 68.6% bf16 MFU | 260277 tok/s +step 1032/18794 | loss 3.929935 (-1.16z)| norm 0.3590 (+0.43z)| lr 6.00e-03 | 2009.99 ms | 68.3% bf16 MFU | 260306 tok/s +step 1033/18794 | loss 4.009334 (+0.60z)| norm 0.2722 (-1.05z)| lr 6.00e-03 | 2003.12 ms | 68.5% bf16 MFU | 260377 tok/s +step 1034/18794 | loss 3.871233 (-2.38z)| norm 0.3974 (+1.09z)| lr 6.00e-03 | 2006.85 ms | 68.4% bf16 MFU | 260421 tok/s +step 1035/18794 | loss 3.903436 (-1.68z)| norm 0.3831 (+0.86z)| lr 6.00e-03 | 2011.05 ms | 68.2% bf16 MFU | 260435 tok/s +step 1036/18794 | loss 3.952451 (-0.57z)| norm 0.3395 (+0.11z)| lr 6.00e-03 | 2009.62 ms | 68.3% bf16 MFU | 260457 tok/s +step 1037/18794 | loss 3.912005 (-1.45z)| norm 0.3150 (-0.31z)| lr 6.00e-03 | 2002.44 ms | 68.5% bf16 MFU | 260526 tok/s +step 1038/18794 | loss 3.995284 (+0.41z)| norm 0.3961 (+1.07z)| lr 6.00e-03 | 2004.97 ms | 68.4% bf16 MFU | 260574 tok/s +step 1039/18794 | loss 3.920133 (-1.25z)| norm 0.3952 (+1.04z)| lr 6.00e-03 | 2024.53 ms | 67.8% bf16 MFU | 260494 tok/s +step 1040/18794 | loss 3.922049 (-1.20z)| norm 0.3907 (+0.95z)| lr 6.00e-03 | 2004.96 ms | 68.4% bf16 MFU | 260544 tok/s +step 1041/18794 | loss 3.956826 (-0.40z)| norm 0.2901 (-0.76z)| lr 6.00e-03 | 2001.10 ms | 68.6% bf16 MFU | 260617 tok/s +step 1042/18794 | loss 3.937843 (-0.82z)| norm 0.3315 (-0.05z)| lr 6.00e-03 | 2011.26 ms | 68.2% bf16 MFU | 260620 tok/s +step 1043/18794 | loss 3.960168 (-0.30z)| norm 0.3357 (+0.02z)| lr 6.00e-03 | 1997.61 ms | 68.7% bf16 MFU | 260712 tok/s +step 1044/18794 | loss 3.920147 (-1.20z)| norm 0.3097 (-0.42z)| lr 6.00e-03 | 1998.48 ms | 68.7% bf16 MFU | 260793 tok/s +step 1045/18794 | loss 3.942568 (-0.66z)| norm 0.2844 (-0.84z)| lr 6.00e-03 | 2009.87 ms | 68.3% bf16 MFU | 260796 tok/s +step 1046/18794 | loss 3.923782 (-1.09z)| norm 0.2531 (-1.34z)| lr 6.00e-03 | 2001.42 ms | 68.6% bf16 MFU | 260854 tok/s +step 1047/18794 | loss 3.906821 (-1.48z)| norm 0.2978 (-0.58z)| lr 6.00e-03 | 2008.32 ms | 68.3% bf16 MFU | 260865 tok/s +step 1048/18794 | loss 3.910279 (-1.37z)| norm 0.2916 (-0.68z)| lr 6.00e-03 | 2016.38 ms | 68.1% bf16 MFU | 260822 tok/s +step 1049/18794 | loss 3.934774 (-0.76z)| norm 0.2832 (-0.82z)| lr 6.00e-03 | 1999.68 ms | 68.6% bf16 MFU | 260890 tok/s +step 1050/18794 | loss 3.945045 (-0.49z)| norm 0.2649 (-1.14z)| lr 6.00e-03 | 2001.47 ms | 68.6% bf16 MFU | 260943 tok/s +step 1051/18794 | loss 3.937863 (-0.66z)| norm 0.2685 (-1.06z)| lr 6.00e-03 | 2003.15 ms | 68.5% bf16 MFU | 260983 tok/s +step 1052/18794 | loss 3.892390 (-1.78z)| norm 0.2465 (-1.43z)| lr 6.00e-03 | 2010.58 ms | 68.3% bf16 MFU | 260972 tok/s +step 1053/18794 | loss 3.927824 (-0.86z)| norm 0.3078 (-0.34z)| lr 6.00e-03 | 2003.67 ms | 68.5% bf16 MFU | 261006 tok/s +step 1054/18794 | loss 3.947374 (-0.35z)| norm 0.2962 (-0.54z)| lr 6.00e-03 | 2001.62 ms | 68.6% bf16 MFU | 261053 tok/s +step 1055/18794 | loss 3.911594 (-1.25z)| norm 0.2666 (-1.05z)| lr 6.00e-03 | 2013.74 ms | 68.1% bf16 MFU | 261018 tok/s +step 1056/18794 | loss 3.920886 (-0.99z)| norm 0.2818 (-0.78z)| lr 6.00e-03 | 2006.13 ms | 68.4% bf16 MFU | 261034 tok/s +step 1057/18794 | loss 3.918159 (-1.04z)| norm 0.2731 (-0.93z)| lr 6.00e-03 | 1995.09 ms | 68.8% bf16 MFU | 261122 tok/s +step 1058/18794 | loss 3.864209 (-2.35z)| norm 0.2572 (-1.20z)| lr 6.00e-03 | 2001.63 ms | 68.6% bf16 MFU | 261162 tok/s +step 1059/18794 | loss 3.890266 (-1.65z)| norm 0.2252 (-1.72z)| lr 6.00e-03 | 2010.08 ms | 68.3% bf16 MFU | 261146 tok/s +step 1060/18794 | loss 3.907886 (-1.18z)| norm 0.2551 (-1.18z)| lr 6.00e-03 | 2011.37 ms | 68.2% bf16 MFU | 261122 tok/s +step 1061/18794 | loss 3.890495 (-1.59z)| norm 0.2688 (-0.94z)| lr 6.00e-03 | 2009.07 ms | 68.3% bf16 MFU | 261113 tok/s +step 1062/18794 | loss 3.890875 (-1.54z)| norm 0.2597 (-1.08z)| lr 6.00e-03 | 2022.91 ms | 67.8% bf16 MFU | 261017 tok/s +step 1063/18794 | loss 3.877436 (-1.82z)| norm 0.2490 (-1.23z)| lr 6.00e-03 | 2001.69 ms | 68.6% bf16 MFU | 261062 tok/s +step 1064/18794 | loss 3.908567 (-1.04z)| norm 0.3170 (-0.07z)| lr 6.00e-03 | 2004.00 ms | 68.5% bf16 MFU | 261090 tok/s +step 1065/18794 | loss 3.891781 (-1.42z)| norm 0.3664 (+0.78z)| lr 6.00e-03 | 2011.77 ms | 68.2% bf16 MFU | 261066 tok/s +step 1066/18794 | loss 3.903565 (-1.11z)| norm 0.3087 (-0.21z)| lr 6.00e-03 | 2002.26 ms | 68.5% bf16 MFU | 261105 tok/s +step 1067/18794 | loss 3.883197 (-1.57z)| norm 0.3280 (+0.12z)| lr 6.00e-03 | 1996.61 ms | 68.7% bf16 MFU | 261179 tok/s +step 1068/18794 | loss 3.871098 (-1.81z)| norm 0.2932 (-0.47z)| lr 6.00e-03 | 2001.00 ms | 68.6% bf16 MFU | 261221 tok/s +step 1069/18794 | loss 3.893174 (-1.26z)| norm 0.2754 (-0.76z)| lr 6.00e-03 | 1987.38 ms | 69.1% bf16 MFU | 261350 tok/s +step 1070/18794 | loss 3.914533 (-0.74z)| norm 0.3285 (+0.19z)| lr 6.00e-03 | 1996.64 ms | 68.7% bf16 MFU | 261412 tok/s +step 1071/18794 | loss 3.910049 (-0.85z)| norm 0.3174 (+0.01z)| lr 6.00e-03 | 2014.50 ms | 68.1% bf16 MFU | 261354 tok/s +step 1072/18794 | loss 3.874689 (-1.73z)| norm 0.3021 (-0.25z)| lr 6.00e-03 | 2003.64 ms | 68.5% bf16 MFU | 261370 tok/s +step 1073/18794 | loss 3.894247 (-1.20z)| norm 0.2682 (-0.87z)| lr 6.00e-03 | 1994.41 ms | 68.8% bf16 MFU | 261445 tok/s +step 1074/18794 | loss 3.857311 (-2.09z)| norm 0.2622 (-0.96z)| lr 6.00e-03 | 2012.64 ms | 68.2% bf16 MFU | 261398 tok/s +step 1075/18794 | loss 3.895249 (-1.10z)| norm 0.2662 (-0.87z)| lr 6.00e-03 | 1993.48 ms | 68.8% bf16 MFU | 261478 tok/s +step 1076/18794 | loss 3.872346 (-1.64z)| norm 0.2755 (-0.68z)| lr 6.00e-03 | 1999.62 ms | 68.6% bf16 MFU | 261514 tok/s +step 1077/18794 | loss 3.977657 (+1.00z)| norm 0.3007 (-0.21z)| lr 6.00e-03 | 2001.43 ms | 68.6% bf16 MFU | 261536 tok/s +step 1078/18794 | loss 3.933058 (-0.10z)| norm 0.3245 (+0.23z)| lr 6.00e-03 | 1999.85 ms | 68.6% bf16 MFU | 261567 tok/s +step 1079/18794 | loss 3.863784 (-1.83z)| norm 0.2769 (-0.64z)| lr 6.00e-03 | 1995.76 ms | 68.8% bf16 MFU | 261624 tok/s +step 1080/18794 | loss 3.920648 (-0.40z)| norm 0.2768 (-0.64z)| lr 6.00e-03 | 2010.81 ms | 68.2% bf16 MFU | 261580 tok/s +step 1081/18794 | loss 3.907380 (-0.72z)| norm 0.2656 (-0.84z)| lr 6.00e-03 | 1994.01 ms | 68.8% bf16 MFU | 261647 tok/s +step 1082/18794 | loss 3.868696 (-1.65z)| norm 0.2884 (-0.42z)| lr 6.00e-03 | 1989.26 ms | 69.0% bf16 MFU | 261743 tok/s +step 1083/18794 | loss 3.874532 (-1.47z)| norm 0.3051 (-0.12z)| lr 6.00e-03 | 2010.53 ms | 68.3% bf16 MFU | 261694 tok/s +step 1084/18794 | loss 3.834712 (-2.36z)| norm 0.2996 (-0.24z)| lr 6.00e-03 | 1999.42 ms | 68.6% bf16 MFU | 261720 tok/s +step 1085/18794 | loss 3.953327 (+0.48z)| norm 0.2789 (-0.61z)| lr 6.00e-03 | 1988.03 ms | 69.0% bf16 MFU | 261821 tok/s +step 1086/18794 | loss 3.856275 (-1.79z)| norm 0.2865 (-0.45z)| lr 6.00e-03 | 1999.70 ms | 68.6% bf16 MFU | 261839 tok/s +step 1087/18794 | loss 3.892816 (-0.91z)| norm 0.2249 (-1.66z)| lr 6.00e-03 | 1997.17 ms | 68.7% bf16 MFU | 261873 tok/s +step 1088/18794 | loss 3.824443 (-2.43z)| norm 0.2789 (-0.56z)| lr 6.00e-03 | 1993.56 ms | 68.8% bf16 MFU | 261928 tok/s +step 1089/18794 | loss 3.899989 (-0.67z)| norm 0.3094 (+0.06z)| lr 6.00e-03 | 2000.13 ms | 68.6% bf16 MFU | 261938 tok/s +step 1090/18794 | loss 3.873236 (-1.27z)| norm 0.2629 (-0.87z)| lr 6.00e-03 | 2003.22 ms | 68.5% bf16 MFU | 261928 tok/s +step 1091/18794 | loss 3.887755 (-0.92z)| norm 0.2431 (-1.25z)| lr 6.00e-03 | 1987.08 ms | 69.1% bf16 MFU | 262024 tok/s +step 1092/18794 | loss 3.920364 (-0.13z)| norm 0.2797 (-0.53z)| lr 6.00e-03 | 2000.95 ms | 68.6% bf16 MFU | 262023 tok/s +step 1093/18794 | loss 3.858699 (-1.59z)| norm 0.2773 (-0.59z)| lr 6.00e-03 | 2002.70 ms | 68.5% bf16 MFU | 262012 tok/s +step 1094/18794 | loss 3.821113 (-2.40z)| norm 0.2581 (-0.99z)| lr 6.00e-03 | 1996.12 ms | 68.7% bf16 MFU | 262044 tok/s +step 1095/18794 | loss 3.892412 (-0.72z)| norm 0.3004 (-0.12z)| lr 6.00e-03 | 2003.66 ms | 68.5% bf16 MFU | 262025 tok/s +step 1096/18794 | loss 3.925145 (+0.05z)| norm 0.3003 (-0.12z)| lr 6.00e-03 | 1994.01 ms | 68.8% bf16 MFU | 262070 tok/s +step 1097/18794 | loss 3.836798 (-1.96z)| norm 0.3175 (+0.23z)| lr 6.00e-03 | 2000.51 ms | 68.6% bf16 MFU | 262071 tok/s +step 1098/18794 | loss 3.950274 (+0.66z)| norm 0.3471 (+0.83z)| lr 6.00e-03 | 1997.52 ms | 68.7% bf16 MFU | 262091 tok/s +step 1099/18794 | loss 3.913408 (-0.17z)| norm 0.3531 (+0.94z)| lr 6.00e-03 | 2004.57 ms | 68.5% bf16 MFU | 262063 tok/s +step 1100/18794 | loss 3.873201 (-1.10z)| norm 0.3509 (+0.89z)| lr 6.00e-03 | 1995.80 ms | 68.8% bf16 MFU | 262095 tok/s +step 1101/18794 | loss 3.921382 (+0.06z)| norm 0.3117 (+0.11z)| lr 6.00e-03 | 1994.50 ms | 68.8% bf16 MFU | 262134 tok/s +step 1102/18794 | loss 3.882558 (-0.87z)| norm 0.2544 (-1.04z)| lr 6.00e-03 | 1992.13 ms | 68.9% bf16 MFU | 262186 tok/s +step 1103/18794 | loss 3.881316 (-0.88z)| norm 0.2985 (-0.12z)| lr 6.00e-03 | 1990.72 ms | 68.9% bf16 MFU | 262245 tok/s +step 1104/18794 | loss 3.911115 (-0.14z)| norm 0.3201 (+0.35z)| lr 6.00e-03 | 1999.04 ms | 68.6% bf16 MFU | 262246 tok/s +step 1105/18794 | loss 3.857122 (-1.46z)| norm 0.3062 (+0.07z)| lr 6.00e-03 | 2007.17 ms | 68.4% bf16 MFU | 262194 tok/s +step 1106/18794 | loss 3.819560 (-2.31z)| norm 0.3167 (+0.32z)| lr 6.00e-03 | 2007.30 ms | 68.4% bf16 MFU | 262144 tok/s +step 1107/18794 | loss 3.861072 (-1.28z)| norm 0.3100 (+0.22z)| lr 6.00e-03 | 2001.52 ms | 68.6% bf16 MFU | 262134 tok/s +step 1108/18794 | loss 3.891806 (-0.50z)| norm 0.2659 (-0.81z)| lr 6.00e-03 | 2007.91 ms | 68.3% bf16 MFU | 262083 tok/s +step 1109/18794 | loss 3.900812 (-0.25z)| norm 0.3745 (+1.89z)| lr 6.00e-03 | 1984.68 ms | 69.1% bf16 MFU | 262187 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.540296 +step 1110/18794 | loss 3.909810 (-0.01z)| norm 0.4036 (+2.54z)| lr 6.00e-03 | 2001.16 ms | 68.6% bf16 MFU | 262177 tok/s +step 1111/18794 | loss 3.942509 (+0.87z)| norm 0.3542 (+1.32z)| lr 6.00e-03 | 1987.99 ms | 69.0% bf16 MFU | 262255 tok/s +step 1112/18794 | loss 3.869750 (-1.05z)| norm 0.2931 (-0.14z)| lr 6.00e-03 | 2006.06 ms | 68.4% bf16 MFU | 262210 tok/s +step 1113/18794 | loss 3.901742 (-0.19z)| norm 0.3019 (+0.08z)| lr 6.00e-03 | 1990.59 ms | 68.9% bf16 MFU | 262268 tok/s +step 1114/18794 | loss 3.850905 (-1.51z)| norm 0.3041 (+0.13z)| lr 6.00e-03 | 1996.09 ms | 68.8% bf16 MFU | 262288 tok/s +step 1115/18794 | loss 3.864381 (-1.13z)| norm 0.2729 (-0.62z)| lr 6.00e-03 | 1998.61 ms | 68.7% bf16 MFU | 262290 tok/s +step 1116/18794 | loss 3.897267 (-0.25z)| norm 0.2361 (-1.48z)| lr 6.00e-03 | 1992.03 ms | 68.9% bf16 MFU | 262335 tok/s +step 1117/18794 | loss 3.884545 (-0.58z)| norm 0.2775 (-0.47z)| lr 6.00e-03 | 1993.62 ms | 68.8% bf16 MFU | 262367 tok/s +step 1118/18794 | loss 3.828908 (-2.00z)| norm 0.2705 (-0.64z)| lr 6.00e-03 | 2009.95 ms | 68.3% bf16 MFU | 262291 tok/s +step 1119/18794 | loss 3.871629 (-0.86z)| norm 0.2686 (-0.67z)| lr 6.00e-03 | 1987.19 ms | 69.1% bf16 MFU | 262368 tok/s +step 1120/18794 | loss 3.882022 (-0.57z)| norm 0.2859 (-0.26z)| lr 6.00e-03 | 1986.42 ms | 69.1% bf16 MFU | 262447 tok/s +step 1121/18794 | loss 3.846965 (-1.51z)| norm 0.2663 (-0.73z)| lr 6.00e-03 | 1995.22 ms | 68.8% bf16 MFU | 262463 tok/s +step 1122/18794 | loss 3.870447 (-0.84z)| norm 0.2641 (-0.80z)| lr 6.00e-03 | 1991.26 ms | 68.9% bf16 MFU | 262504 tok/s +step 1123/18794 | loss 3.936936 (+0.99z)| norm 0.2723 (-0.61z)| lr 6.00e-03 | 1991.02 ms | 68.9% bf16 MFU | 262546 tok/s +step 1124/18794 | loss 3.858868 (-1.14z)| norm 0.2764 (-0.52z)| lr 6.00e-03 | 1991.69 ms | 68.9% bf16 MFU | 262580 tok/s +step 1125/18794 | loss 3.828916 (-1.91z)| norm 0.2605 (-0.90z)| lr 6.00e-03 | 1991.94 ms | 68.9% bf16 MFU | 262611 tok/s +step 1126/18794 | loss 3.892376 (-0.19z)| norm 0.2572 (-0.96z)| lr 6.00e-03 | 1988.54 ms | 69.0% bf16 MFU | 262664 tok/s +step 1127/18794 | loss 3.899808 (+0.02z)| norm 0.2458 (-1.23z)| lr 6.00e-03 | 2000.99 ms | 68.6% bf16 MFU | 262631 tok/s +step 1128/18794 | loss 3.823432 (-2.00z)| norm 0.2500 (-1.13z)| lr 6.00e-03 | 2011.14 ms | 68.2% bf16 MFU | 262534 tok/s +step 1129/18794 | loss 3.868340 (-0.79z)| norm 0.2489 (-1.15z)| lr 6.00e-03 | 1996.34 ms | 68.7% bf16 MFU | 262539 tok/s +step 1130/18794 | loss 3.948652 (+1.35z)| norm 0.2497 (-1.11z)| lr 6.00e-03 | 1987.06 ms | 69.1% bf16 MFU | 262604 tok/s +step 1131/18794 | loss 3.846231 (-1.34z)| norm 0.2418 (-1.31z)| lr 6.00e-03 | 1988.29 ms | 69.0% bf16 MFU | 262658 tok/s +step 1132/18794 | loss 3.829458 (-1.74z)| norm 0.2801 (-0.34z)| lr 6.00e-03 | 1991.40 ms | 68.9% bf16 MFU | 262689 tok/s +step 1133/18794 | loss 3.851321 (-1.17z)| norm 0.2863 (-0.18z)| lr 6.00e-03 | 1982.70 ms | 69.2% bf16 MFU | 262776 tok/s +step 1134/18794 | loss 3.812701 (-2.16z)| norm 0.2474 (-1.16z)| lr 6.00e-03 | 1987.37 ms | 69.1% bf16 MFU | 262828 tok/s +step 1135/18794 | loss 3.813087 (-2.08z)| norm 0.2195 (-1.86z)| lr 6.00e-03 | 1992.41 ms | 68.9% bf16 MFU | 262844 tok/s +step 1136/18794 | loss 3.853893 (-1.00z)| norm 0.2532 (-0.96z)| lr 6.00e-03 | 1995.21 ms | 68.8% bf16 MFU | 262840 tok/s +step 1137/18794 | loss 3.829744 (-1.59z)| norm 0.2766 (-0.33z)| lr 6.00e-03 | 1981.41 ms | 69.3% bf16 MFU | 262928 tok/s +step 1138/18794 | loss 3.830067 (-1.58z)| norm 0.2567 (-0.85z)| lr 6.00e-03 | 1987.28 ms | 69.1% bf16 MFU | 262973 tok/s +step 1139/18794 | loss 3.832463 (-1.48z)| norm 0.2547 (-0.91z)| lr 6.00e-03 | 1988.53 ms | 69.0% bf16 MFU | 263007 tok/s +step 1140/18794 | loss 3.815383 (-1.88z)| norm 0.2457 (-1.17z)| lr 6.00e-03 | 1986.02 ms | 69.1% bf16 MFU | 263056 tok/s +step 1141/18794 | loss 3.858514 (-0.74z)| norm 0.3035 (+0.55z)| lr 6.00e-03 | 1980.11 ms | 69.3% bf16 MFU | 263142 tok/s +step 1142/18794 | loss 3.866275 (-0.52z)| norm 0.3378 (+1.56z)| lr 6.00e-03 | 1983.26 ms | 69.2% bf16 MFU | 263203 tok/s +step 1143/18794 | loss 3.831213 (-1.43z)| norm 0.2775 (-0.21z)| lr 6.00e-03 | 1979.13 ms | 69.3% bf16 MFU | 263288 tok/s +step 1144/18794 | loss 3.888130 (+0.10z)| norm 0.2633 (-0.62z)| lr 6.00e-03 | 1996.67 ms | 68.7% bf16 MFU | 263253 tok/s +step 1145/18794 | loss 3.908798 (+0.67z)| norm 0.2252 (-1.72z)| lr 6.00e-03 | 2040.60 ms | 67.3% bf16 MFU | 262937 tok/s +step 1146/18794 | loss 3.866022 (-0.47z)| norm 0.2434 (-1.18z)| lr 6.00e-03 | 2042.25 ms | 67.2% bf16 MFU | 262626 tok/s +step 1147/18794 | loss 3.809361 (-1.96z)| norm 0.2429 (-1.17z)| lr 6.00e-03 | 2041.12 ms | 67.2% bf16 MFU | 262338 tok/s +step 1148/18794 | loss 3.814755 (-1.77z)| norm 0.2263 (-1.62z)| lr 6.00e-03 | 2033.86 ms | 67.5% bf16 MFU | 262110 tok/s +step 1149/18794 | loss 3.771915 (-2.77z)| norm 0.2385 (-1.24z)| lr 6.00e-03 | 2034.04 ms | 67.5% bf16 MFU | 261892 tok/s +step 1150/18794 | loss 3.797918 (-2.05z)| norm 0.2378 (-1.25z)| lr 6.00e-03 | 2036.30 ms | 67.4% bf16 MFU | 261671 tok/s +step 1151/18794 | loss 3.803836 (-1.86z)| norm 0.2543 (-0.77z)| lr 6.00e-03 | 2027.12 ms | 67.7% bf16 MFU | 261519 tok/s +step 1152/18794 | loss 3.833518 (-1.08z)| norm 0.3179 (+1.02z)| lr 6.00e-03 | 2039.61 ms | 67.3% bf16 MFU | 261296 tok/s +step 1153/18794 | loss 3.808617 (-1.67z)| norm 0.3162 (+0.97z)| lr 6.00e-03 | 2027.90 ms | 67.7% bf16 MFU | 261158 tok/s +step 1154/18794 | loss 3.825588 (-1.23z)| norm 0.2842 (+0.06z)| lr 6.00e-03 | 2024.76 ms | 67.8% bf16 MFU | 261047 tok/s +step 1155/18794 | loss 3.840328 (-0.84z)| norm 0.2804 (-0.05z)| lr 6.00e-03 | 2026.27 ms | 67.7% bf16 MFU | 260932 tok/s +step 1156/18794 | loss 3.795307 (-1.92z)| norm 0.2481 (-0.95z)| lr 6.00e-03 | 2035.09 ms | 67.4% bf16 MFU | 260767 tok/s +step 1157/18794 | loss 3.797947 (-1.81z)| norm 0.2169 (-1.79z)| lr 6.00e-03 | 2042.34 ms | 67.2% bf16 MFU | 260564 tok/s +step 1158/18794 | loss 3.852661 (-0.45z)| norm 0.2347 (-1.28z)| lr 6.00e-03 | 2031.92 ms | 67.5% bf16 MFU | 260437 tok/s +step 1159/18794 | loss 3.848372 (-0.54z)| norm 0.2117 (-1.90z)| lr 6.00e-03 | 2020.69 ms | 67.9% bf16 MFU | 260388 tok/s +step 1160/18794 | loss 3.741335 (-3.02z)| norm 0.2319 (-1.33z)| lr 6.00e-03 | 2040.37 ms | 67.3% bf16 MFU | 260217 tok/s +step 1161/18794 | loss 3.860042 (-0.20z)| norm 0.2485 (-0.87z)| lr 6.00e-03 | 2022.17 ms | 67.9% bf16 MFU | 260169 tok/s +step 1162/18794 | loss 3.867365 (-0.02z)| norm 0.2892 (+0.23z)| lr 6.00e-03 | 2029.22 ms | 67.6% bf16 MFU | 260079 tok/s +step 1163/18794 | loss 3.858110 (-0.23z)| norm 0.3179 (+0.99z)| lr 6.00e-03 | 2027.27 ms | 67.7% bf16 MFU | 260006 tok/s +step 1164/18794 | loss 3.817861 (-1.17z)| norm 0.2669 (-0.38z)| lr 6.00e-03 | 2042.07 ms | 67.2% bf16 MFU | 259843 tok/s +step 1165/18794 | loss 3.870800 (+0.09z)| norm 0.2370 (-1.19z)| lr 6.00e-03 | 2030.13 ms | 67.6% bf16 MFU | 259764 tok/s +step 1166/18794 | loss 3.804397 (-1.45z)| norm 0.2355 (-1.21z)| lr 6.00e-03 | 2039.12 ms | 67.3% bf16 MFU | 259631 tok/s +step 1167/18794 | loss 3.777524 (-2.02z)| norm 0.2373 (-1.14z)| lr 6.00e-03 | 2022.11 ms | 67.9% bf16 MFU | 259613 tok/s +step 1168/18794 | loss 3.757017 (-2.39z)| norm 0.2488 (-0.80z)| lr 6.00e-03 | 2026.24 ms | 67.7% bf16 MFU | 259570 tok/s +step 1169/18794 | loss 3.793241 (-1.54z)| norm 0.3085 (+0.85z)| lr 6.00e-03 | 2034.23 ms | 67.5% bf16 MFU | 259478 tok/s +step 1170/18794 | loss 3.810095 (-1.14z)| norm 0.2696 (-0.22z)| lr 6.00e-03 | 2021.38 ms | 67.9% bf16 MFU | 259473 tok/s +step 1171/18794 | loss 3.780346 (-1.76z)| norm 0.2501 (-0.75z)| lr 6.00e-03 | 2030.55 ms | 67.6% bf16 MFU | 259409 tok/s +step 1172/18794 | loss 3.797254 (-1.36z)| norm 0.2275 (-1.35z)| lr 6.00e-03 | 2026.66 ms | 67.7% bf16 MFU | 259374 tok/s +step 1173/18794 | loss 3.787835 (-1.53z)| norm 0.2296 (-1.27z)| lr 6.00e-03 | 2033.01 ms | 67.5% bf16 MFU | 259299 tok/s +step 1174/18794 | loss 3.777590 (-1.70z)| norm 0.2716 (-0.11z)| lr 6.00e-03 | 2027.08 ms | 67.7% bf16 MFU | 259266 tok/s +step 1175/18794 | loss 3.801400 (-1.17z)| norm 0.3070 (+0.86z)| lr 6.00e-03 | 2017.93 ms | 68.0% bf16 MFU | 259294 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.059270 +step 1176/18794 | loss 3.783963 (-1.51z)| norm 0.3529 (+2.06z)| lr 6.00e-03 | 2021.57 ms | 67.9% bf16 MFU | 259296 tok/s +step 1177/18794 | loss 3.818852 (-0.77z)| norm 0.2882 (+0.31z)| lr 6.00e-03 | 2026.33 ms | 67.7% bf16 MFU | 259268 tok/s +step 1178/18794 | loss 3.887831 (+0.75z)| norm 0.2144 (-1.65z)| lr 6.00e-03 | 2037.12 ms | 67.4% bf16 MFU | 259173 tok/s +step 1179/18794 | loss 3.877009 (+0.50z)| norm 0.2238 (-1.37z)| lr 6.00e-03 | 2023.10 ms | 67.8% bf16 MFU | 259172 tok/s +step 1180/18794 | loss 3.844720 (-0.19z)| norm 0.2517 (-0.61z)| lr 6.00e-03 | 2034.87 ms | 67.4% bf16 MFU | 259096 tok/s +step 1181/18794 | loss 3.839004 (-0.30z)| norm 0.2794 (+0.12z)| lr 6.00e-03 | 2026.01 ms | 67.7% bf16 MFU | 259080 tok/s +step 1182/18794 | loss 3.758813 (-2.02z)| norm 0.2673 (-0.20z)| lr 6.00e-03 | 2028.23 ms | 67.7% bf16 MFU | 259051 tok/s +step 1183/18794 | loss 3.819636 (-0.68z)| norm 0.2394 (-0.92z)| lr 6.00e-03 | 2031.07 ms | 67.6% bf16 MFU | 259005 tok/s +step 1184/18794 | loss 3.798753 (-1.12z)| norm 0.2163 (-1.50z)| lr 6.00e-03 | 2020.66 ms | 67.9% bf16 MFU | 259028 tok/s +step 1185/18794 | loss 3.841682 (-0.17z)| norm 0.2593 (-0.36z)| lr 6.00e-03 | 2032.11 ms | 67.5% bf16 MFU | 258977 tok/s +step 1186/18794 | loss 3.844631 (-0.10z)| norm 0.2959 (+0.60z)| lr 6.00e-03 | 2021.76 ms | 67.9% bf16 MFU | 258994 tok/s +step 1187/18794 | loss 3.844508 (-0.10z)| norm 0.2258 (-1.25z)| lr 6.00e-03 | 2009.47 ms | 68.3% bf16 MFU | 259090 tok/s +step 1188/18794 | loss 3.753458 (-2.07z)| norm 0.2416 (-0.82z)| lr 6.00e-03 | 2017.38 ms | 68.0% bf16 MFU | 259130 tok/s +step 1189/18794 | loss 3.833887 (-0.30z)| norm 0.2584 (-0.37z)| lr 6.00e-03 | 2023.71 ms | 67.8% bf16 MFU | 259127 tok/s +step 1190/18794 | loss 3.773081 (-1.60z)| norm 0.2277 (-1.16z)| lr 6.00e-03 | 2023.52 ms | 67.8% bf16 MFU | 259125 tok/s +step 1191/18794 | loss 3.793362 (-1.13z)| norm 0.2717 (-0.01z)| lr 6.00e-03 | 2018.51 ms | 68.0% bf16 MFU | 259156 tok/s +step 1192/18794 | loss 3.718129 (-2.66z)| norm 0.3298 (+1.48z)| lr 6.00e-03 | 2020.93 ms | 67.9% bf16 MFU | 259170 tok/s +step 1193/18794 | loss 3.828518 (-0.31z)| norm 0.2953 (+0.58z)| lr 6.00e-03 | 2017.88 ms | 68.0% bf16 MFU | 259202 tok/s +step 1194/18794 | loss 3.786360 (-1.19z)| norm 0.3434 (+1.78z)| lr 6.00e-03 | 2016.14 ms | 68.1% bf16 MFU | 259244 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.725068 +step 1195/18794 | loss 3.918746 (+1.58z)| norm 0.3852 (+2.73z)| lr 6.00e-03 | 2007.22 ms | 68.4% bf16 MFU | 259342 tok/s +step 1196/18794 | loss 3.846912 (+0.10z)| norm 0.3358 (+1.49z)| lr 6.00e-03 | 2018.56 ms | 68.0% bf16 MFU | 259362 tok/s +step 1197/18794 | loss 3.825081 (-0.36z)| norm 0.2962 (+0.53z)| lr 6.00e-03 | 2010.72 ms | 68.3% bf16 MFU | 259431 tok/s +step 1198/18794 | loss 3.785199 (-1.20z)| norm 0.2927 (+0.46z)| lr 6.00e-03 | 2011.52 ms | 68.2% bf16 MFU | 259492 tok/s +step 1199/18794 | loss 3.751454 (-1.88z)| norm 0.2502 (-0.58z)| lr 6.00e-03 | 2003.55 ms | 68.5% bf16 MFU | 259601 tok/s +step 1200/18794 | loss 3.774833 (-1.35z)| norm 0.2676 (-0.12z)| lr 6.00e-03 | 2015.77 ms | 68.1% bf16 MFU | 259626 tok/s +step 1201/18794 | loss 3.739282 (-2.06z)| norm 0.2537 (-0.47z)| lr 6.00e-03 | 2008.81 ms | 68.3% bf16 MFU | 259694 tok/s +step 1202/18794 | loss 3.751550 (-1.75z)| norm 0.2212 (-1.29z)| lr 6.00e-03 | 2010.81 ms | 68.2% bf16 MFU | 259746 tok/s +step 1203/18794 | loss 3.796512 (-0.79z)| norm 0.2383 (-0.84z)| lr 6.00e-03 | 2010.49 ms | 68.3% bf16 MFU | 259798 tok/s +step 1204/18794 | loss 3.690581 (-2.89z)| norm 0.2541 (-0.41z)| lr 6.00e-03 | 2011.59 ms | 68.2% bf16 MFU | 259839 tok/s +step 1205/18794 | loss 3.819682 (-0.24z)| norm 0.2307 (-1.00z)| lr 6.00e-03 | 2014.41 ms | 68.1% bf16 MFU | 259861 tok/s +step 1206/18794 | loss 3.762866 (-1.38z)| norm 0.2203 (-1.24z)| lr 6.00e-03 | 2018.08 ms | 68.0% bf16 MFU | 259858 tok/s +step 1207/18794 | loss 3.880692 (+1.01z)| norm 0.2171 (-1.30z)| lr 6.00e-03 | 2023.00 ms | 67.8% bf16 MFU | 259823 tok/s +step 1208/18794 | loss 3.771179 (-1.19z)| norm 0.2157 (-1.31z)| lr 6.00e-03 | 1998.35 ms | 68.7% bf16 MFU | 259950 tok/s +step 1209/18794 | loss 3.812192 (-0.34z)| norm 0.2304 (-0.93z)| lr 6.00e-03 | 2012.08 ms | 68.2% bf16 MFU | 259981 tok/s +step 1210/18794 | loss 3.818313 (-0.20z)| norm 0.2628 (-0.04z)| lr 6.00e-03 | 2033.46 ms | 67.5% bf16 MFU | 259873 tok/s +step 1211/18794 | loss 3.843717 (+0.36z)| norm 0.3044 (+1.20z)| lr 6.00e-03 | 2007.87 ms | 68.3% bf16 MFU | 259935 tok/s +step 1212/18794 | loss 3.773494 (-1.12z)| norm 0.2982 (+1.01z)| lr 6.00e-03 | 2027.03 ms | 67.7% bf16 MFU | 259871 tok/s +step 1213/18794 | loss 3.783365 (-0.89z)| norm 0.2679 (+0.14z)| lr 6.00e-03 | 2020.98 ms | 67.9% bf16 MFU | 259849 tok/s +step 1214/18794 | loss 3.759477 (-1.37z)| norm 0.2295 (-0.98z)| lr 6.00e-03 | 2012.22 ms | 68.2% bf16 MFU | 259884 tok/s +step 1215/18794 | loss 3.843248 (+0.42z)| norm 0.2200 (-1.23z)| lr 6.00e-03 | 2011.09 ms | 68.2% bf16 MFU | 259925 tok/s +step 1216/18794 | loss 3.812913 (-0.21z)| norm 0.2189 (-1.25z)| lr 6.00e-03 | 2023.38 ms | 67.8% bf16 MFU | 259884 tok/s +step 1217/18794 | loss 3.785561 (-0.79z)| norm 0.1878 (-2.09z)| lr 6.00e-03 | 2010.67 ms | 68.3% bf16 MFU | 259928 tok/s +step 1218/18794 | loss 3.760916 (-1.30z)| norm 0.1767 (-2.31z)| lr 6.00e-03 | 2016.15 ms | 68.1% bf16 MFU | 259933 tok/s +step 1219/18794 | loss 3.784159 (-0.78z)| norm 0.2008 (-1.61z)| lr 6.00e-03 | 2012.26 ms | 68.2% bf16 MFU | 259964 tok/s +step 1220/18794 | loss 3.746400 (-1.56z)| norm 0.2312 (-0.76z)| lr 6.00e-03 | 2011.95 ms | 68.2% bf16 MFU | 259995 tok/s +step 1221/18794 | loss 3.825796 (+0.15z)| norm 0.2531 (-0.15z)| lr 6.00e-03 | 2010.38 ms | 68.3% bf16 MFU | 260035 tok/s +step 1222/18794 | loss 3.796750 (-0.46z)| norm 0.3076 (+1.32z)| lr 6.00e-03 | 2011.35 ms | 68.2% bf16 MFU | 260066 tok/s +step 1223/18794 | loss 3.808658 (-0.18z)| norm 0.3173 (+1.55z)| lr 6.00e-03 | 2008.48 ms | 68.3% bf16 MFU | 260115 tok/s +step 1224/18794 | loss 3.848831 (+0.73z)| norm 0.3006 (+1.09z)| lr 6.00e-03 | 2012.43 ms | 68.2% bf16 MFU | 260135 tok/s +step 1225/18794 | loss 3.823432 (+0.15z)| norm 0.3073 (+1.25z)| lr 6.00e-03 | 2015.11 ms | 68.1% bf16 MFU | 260138 tok/s +step 1226/18794 | loss 3.762980 (-1.19z)| norm 0.2715 (+0.29z)| lr 6.00e-03 | 2021.70 ms | 67.9% bf16 MFU | 260097 tok/s +step 1227/18794 | loss 3.819838 (+0.12z)| norm 0.2607 (+0.00z)| lr 6.00e-03 | 2002.42 ms | 68.5% bf16 MFU | 260184 tok/s +step 1228/18794 | loss 3.826809 (+0.28z)| norm 0.2460 (-0.39z)| lr 6.00e-03 | 2027.58 ms | 67.7% bf16 MFU | 260103 tok/s +step 1229/18794 | loss 3.773824 (-0.92z)| norm 0.2220 (-1.01z)| lr 6.00e-03 | 1996.20 ms | 68.7% bf16 MFU | 260230 tok/s +step 1230/18794 | loss 3.760720 (-1.24z)| norm 0.1784 (-2.09z)| lr 6.00e-03 | 2005.02 ms | 68.4% bf16 MFU | 260293 tok/s +step 1231/18794 | loss 3.845699 (+0.83z)| norm 0.1840 (-1.90z)| lr 6.00e-03 | 2001.99 ms | 68.5% bf16 MFU | 260373 tok/s +step 1232/18794 | loss 3.718763 (-2.18z)| norm 0.2303 (-0.71z)| lr 6.00e-03 | 2009.75 ms | 68.3% bf16 MFU | 260398 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.442298 +step 1233/18794 | loss 3.786862 (-0.55z)| norm 0.3584 (+2.44z)| lr 6.00e-03 | 2016.88 ms | 68.0% bf16 MFU | 260375 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.250859 +step 1234/18794 | loss 3.784820 (-0.59z)| norm 0.3541 (+2.25z)| lr 6.00e-03 | 1994.05 ms | 68.8% bf16 MFU | 260503 tok/s +step 1235/18794 | loss 3.807608 (-0.05z)| norm 0.2553 (-0.13z)| lr 6.00e-03 | 2016.59 ms | 68.1% bf16 MFU | 260477 tok/s +step 1236/18794 | loss 3.818798 (+0.23z)| norm 0.2595 (-0.03z)| lr 6.00e-03 | 2004.01 ms | 68.5% bf16 MFU | 260534 tok/s +step 1237/18794 | loss 3.762451 (-1.10z)| norm 0.2513 (-0.22z)| lr 6.00e-03 | 2000.40 ms | 68.6% bf16 MFU | 260612 tok/s +step 1238/18794 | loss 3.778500 (-0.70z)| norm 0.2521 (-0.20z)| lr 6.00e-03 | 2022.43 ms | 67.9% bf16 MFU | 260543 tok/s +step 1239/18794 | loss 3.743359 (-1.50z)| norm 0.2609 (+0.01z)| lr 6.00e-03 | 2017.40 ms | 68.0% bf16 MFU | 260510 tok/s +step 1240/18794 | loss 3.779199 (-0.65z)| norm 0.2725 (+0.29z)| lr 6.00e-03 | 2012.01 ms | 68.2% bf16 MFU | 260514 tok/s +step 1241/18794 | loss 3.812318 (+0.14z)| norm 0.2542 (-0.15z)| lr 6.00e-03 | 1998.90 ms | 68.7% bf16 MFU | 260603 tok/s +step 1242/18794 | loss 3.779822 (-0.61z)| norm 0.2382 (-0.52z)| lr 6.00e-03 | 2006.41 ms | 68.4% bf16 MFU | 260638 tok/s +step 1243/18794 | loss 3.745776 (-1.39z)| norm 0.2554 (-0.09z)| lr 6.00e-03 | 2008.56 ms | 68.3% bf16 MFU | 260657 tok/s +step 1244/18794 | loss 3.768357 (-0.84z)| norm 0.3142 (+1.34z)| lr 6.00e-03 | 1997.86 ms | 68.7% bf16 MFU | 260746 tok/s +step 1245/18794 | loss 3.780200 (-0.54z)| norm 0.3105 (+1.22z)| lr 6.00e-03 | 2011.09 ms | 68.2% bf16 MFU | 260743 tok/s +step 1246/18794 | loss 3.818780 (+0.43z)| norm 0.3256 (+1.55z)| lr 6.00e-03 | 2011.06 ms | 68.2% bf16 MFU | 260741 tok/s +step 1247/18794 | loss 3.730904 (-1.73z)| norm 0.3056 (+1.05z)| lr 6.00e-03 | 2010.08 ms | 68.3% bf16 MFU | 260746 tok/s +step 1248/18794 | loss 3.737425 (-1.53z)| norm 0.2886 (+0.63z)| lr 6.00e-03 | 2000.43 ms | 68.6% bf16 MFU | 260813 tok/s +step 1249/18794 | loss 3.759829 (-0.98z)| norm 0.2818 (+0.45z)| lr 6.00e-03 | 2000.63 ms | 68.6% bf16 MFU | 260875 tok/s +step 1250/18794 | loss 3.726093 (-1.76z)| norm 0.2062 (-1.34z)| lr 6.00e-03 | 2003.10 ms | 68.5% bf16 MFU | 260918 tok/s +val loss 3.805704 +HellaSwag: 2646/10042 = 0.263493: 0/1256 +step 1251/18794 | loss 3.785520 (-0.33z)| norm 0.2247 (-0.89z)| lr 6.00e-03 | 1995.94 ms | 68.8% bf16 MFU | 261006 tok/s +step 1252/18794 | loss 3.769048 (-0.71z)| norm 0.2651 (+0.08z)| lr 6.00e-03 | 2001.98 ms | 68.5% bf16 MFU | 261050 tok/s +step 1253/18794 | loss 3.766061 (-0.77z)| norm 0.2240 (-0.89z)| lr 6.00e-03 | 2007.68 ms | 68.4% bf16 MFU | 261055 tok/s +step 1254/18794 | loss 3.810132 (+0.29z)| norm 0.2169 (-1.04z)| lr 6.00e-03 | 1990.68 ms | 68.9% bf16 MFU | 261171 tok/s +step 1255/18794 | loss 3.741731 (-1.32z)| norm 0.1963 (-1.50z)| lr 6.00e-03 | 2021.32 ms | 67.9% bf16 MFU | 261081 tok/s +step 1256/18794 | loss 3.717123 (-1.86z)| norm 0.1951 (-1.49z)| lr 6.00e-03 | 2011.87 ms | 68.2% bf16 MFU | 261057 tok/s +step 1257/18794 | loss 3.715598 (-1.84z)| norm 0.2475 (-0.27z)| lr 6.00e-03 | 2002.56 ms | 68.5% bf16 MFU | 261094 tok/s +step 1258/18794 | loss 3.769350 (-0.58z)| norm 0.2634 (+0.10z)| lr 6.00e-03 | 2013.83 ms | 68.1% bf16 MFU | 261057 tok/s +step 1259/18794 | loss 3.715086 (-1.80z)| norm 0.2468 (-0.31z)| lr 6.00e-03 | 2003.55 ms | 68.5% bf16 MFU | 261088 tok/s +step 1260/18794 | loss 3.755187 (-0.88z)| norm 0.2089 (-1.20z)| lr 6.00e-03 | 2003.98 ms | 68.5% bf16 MFU | 261115 tok/s +step 1261/18794 | loss 3.802911 (+0.24z)| norm 0.1918 (-1.57z)| lr 6.00e-03 | 2016.31 ms | 68.1% bf16 MFU | 261060 tok/s +step 1262/18794 | loss 3.768385 (-0.55z)| norm 0.2145 (-1.02z)| lr 6.00e-03 | 1992.74 ms | 68.9% bf16 MFU | 261162 tok/s +step 1263/18794 | loss 3.779319 (-0.28z)| norm 0.2366 (-0.49z)| lr 6.00e-03 | 1999.54 ms | 68.6% bf16 MFU | 261214 tok/s +step 1264/18794 | loss 3.679880 (-2.55z)| norm 0.2544 (-0.07z)| lr 6.00e-03 | 1997.60 ms | 68.7% bf16 MFU | 261277 tok/s +step 1265/18794 | loss 3.748309 (-0.94z)| norm 0.2335 (-0.56z)| lr 6.00e-03 | 2001.85 ms | 68.6% bf16 MFU | 261308 tok/s +step 1266/18794 | loss 3.724731 (-1.47z)| norm 0.1931 (-1.48z)| lr 6.00e-03 | 1989.30 ms | 69.0% bf16 MFU | 261420 tok/s +step 1267/18794 | loss 3.765050 (-0.52z)| norm 0.2165 (-0.93z)| lr 6.00e-03 | 2002.79 ms | 68.5% bf16 MFU | 261438 tok/s +step 1268/18794 | loss 3.772887 (-0.34z)| norm 0.2034 (-1.21z)| lr 6.00e-03 | 1999.57 ms | 68.6% bf16 MFU | 261476 tok/s +step 1269/18794 | loss 3.782727 (-0.11z)| norm 0.2472 (-0.19z)| lr 6.00e-03 | 2002.27 ms | 68.5% bf16 MFU | 261495 tok/s +step 1270/18794 | loss 3.731224 (-1.29z)| norm 0.2744 (+0.44z)| lr 6.00e-03 | 1991.35 ms | 68.9% bf16 MFU | 261584 tok/s +step 1271/18794 | loss 3.749602 (-0.85z)| norm 0.2911 (+0.81z)| lr 6.00e-03 | 1990.41 ms | 68.9% bf16 MFU | 261675 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.165823 +step 1272/18794 | loss 3.765445 (-0.48z)| norm 0.3530 (+2.17z)| lr 6.00e-03 | 2011.50 ms | 68.2% bf16 MFU | 261624 tok/s +step 1273/18794 | loss 3.725796 (-1.37z)| norm 0.3016 (+0.99z)| lr 6.00e-03 | 2009.61 ms | 68.3% bf16 MFU | 261587 tok/s +step 1274/18794 | loss 3.786631 (+0.03z)| norm 0.3430 (+1.87z)| lr 6.00e-03 | 2000.50 ms | 68.6% bf16 MFU | 261612 tok/s +step 1275/18794 | loss 3.754033 (-0.71z)| norm 0.2605 (+0.05z)| lr 6.00e-03 | 2011.79 ms | 68.2% bf16 MFU | 261561 tok/s +step 1276/18794 | loss 3.726710 (-1.31z)| norm 0.2400 (-0.39z)| lr 6.00e-03 | 2007.41 ms | 68.4% bf16 MFU | 261542 tok/s +step 1277/18794 | loss 3.764413 (-0.44z)| norm 0.2319 (-0.56z)| lr 6.00e-03 | 2003.37 ms | 68.5% bf16 MFU | 261550 tok/s +step 1278/18794 | loss 3.670133 (-2.52z)| norm 0.1852 (-1.61z)| lr 6.00e-03 | 1989.01 ms | 69.0% bf16 MFU | 261652 tok/s +step 1279/18794 | loss 3.776950 (-0.09z)| norm 0.2021 (-1.21z)| lr 6.00e-03 | 1997.18 ms | 68.7% bf16 MFU | 261695 tok/s +step 1280/18794 | loss 3.753130 (-0.62z)| norm 0.2170 (-0.86z)| lr 6.00e-03 | 1994.66 ms | 68.8% bf16 MFU | 261753 tok/s +step 1281/18794 | loss 3.774424 (-0.11z)| norm 0.1898 (-1.44z)| lr 6.00e-03 | 2002.40 ms | 68.5% bf16 MFU | 261757 tok/s +step 1282/18794 | loss 3.716207 (-1.46z)| norm 0.2134 (-0.90z)| lr 6.00e-03 | 1999.61 ms | 68.6% bf16 MFU | 261779 tok/s +step 1283/18794 | loss 3.753329 (-0.58z)| norm 0.2786 (+0.53z)| lr 6.00e-03 | 2006.41 ms | 68.4% bf16 MFU | 261755 tok/s +step 1284/18794 | loss 3.774200 (-0.08z)| norm 0.3110 (+1.22z)| lr 6.00e-03 | 1988.31 ms | 69.0% bf16 MFU | 261852 tok/s +step 1285/18794 | loss 3.713373 (-1.48z)| norm 0.2730 (+0.38z)| lr 6.00e-03 | 2002.37 ms | 68.5% bf16 MFU | 261851 tok/s +step 1286/18794 | loss 3.734879 (-0.96z)| norm 0.3075 (+1.14z)| lr 6.00e-03 | 1998.40 ms | 68.7% bf16 MFU | 261876 tok/s +step 1287/18794 | loss 3.708794 (-1.55z)| norm 0.2950 (+0.84z)| lr 6.00e-03 | 1994.60 ms | 68.8% bf16 MFU | 261925 tok/s +step 1288/18794 | loss 3.770839 (-0.08z)| norm 0.2985 (+0.90z)| lr 6.00e-03 | 1989.92 ms | 69.0% bf16 MFU | 262002 tok/s +step 1289/18794 | loss 3.786319 (+0.30z)| norm 0.2484 (-0.19z)| lr 6.00e-03 | 1994.43 ms | 68.8% bf16 MFU | 262046 tok/s +step 1290/18794 | loss 3.712624 (-1.44z)| norm 0.2683 (+0.24z)| lr 6.00e-03 | 1992.16 ms | 68.9% bf16 MFU | 262102 tok/s +step 1291/18794 | loss 3.761933 (-0.26z)| norm 0.3369 (+1.70z)| lr 6.00e-03 | 1999.87 ms | 68.6% bf16 MFU | 262105 tok/s +step 1292/18794 | loss 3.750499 (-0.54z)| norm 0.3213 (+1.37z)| lr 6.00e-03 | 1988.45 ms | 69.0% bf16 MFU | 262183 tok/s +step 1293/18794 | loss 3.716841 (-1.32z)| norm 0.2736 (+0.35z)| lr 6.00e-03 | 1988.29 ms | 69.0% bf16 MFU | 262258 tok/s +step 1294/18794 | loss 3.736025 (-0.85z)| norm 0.2394 (-0.38z)| lr 6.00e-03 | 2001.79 ms | 68.6% bf16 MFU | 262241 tok/s +step 1295/18794 | loss 3.770535 (+0.01z)| norm 0.1787 (-1.72z)| lr 6.00e-03 | 1995.06 ms | 68.8% bf16 MFU | 262269 tok/s +step 1296/18794 | loss 3.682008 (-2.19z)| norm 0.1656 (-1.97z)| lr 6.00e-03 | 2003.77 ms | 68.5% bf16 MFU | 262238 tok/s +step 1297/18794 | loss 3.686408 (-2.02z)| norm 0.1724 (-1.77z)| lr 6.00e-03 | 2003.64 ms | 68.5% bf16 MFU | 262209 tok/s +step 1298/18794 | loss 3.725184 (-1.03z)| norm 0.1549 (-2.09z)| lr 6.00e-03 | 2005.33 ms | 68.4% bf16 MFU | 262171 tok/s +step 1299/18794 | loss 3.678895 (-2.12z)| norm 0.1690 (-1.74z)| lr 6.00e-03 | 2007.36 ms | 68.4% bf16 MFU | 262122 tok/s +step 1300/18794 | loss 3.727018 (-0.93z)| norm 0.1837 (-1.39z)| lr 6.00e-03 | 1995.78 ms | 68.8% bf16 MFU | 262151 tok/s +step 1301/18794 | loss 3.677494 (-2.08z)| norm 0.2146 (-0.72z)| lr 6.00e-03 | 1991.71 ms | 68.9% bf16 MFU | 262205 tok/s +step 1302/18794 | loss 3.785848 (+0.50z)| norm 0.2651 (+0.35z)| lr 6.00e-03 | 1990.91 ms | 68.9% bf16 MFU | 262262 tok/s +step 1303/18794 | loss 3.728508 (-0.85z)| norm 0.3245 (+1.58z)| lr 6.00e-03 | 1988.91 ms | 69.0% bf16 MFU | 262329 tok/s +step 1304/18794 | loss 3.728753 (-0.86z)| norm 0.3040 (+1.13z)| lr 6.00e-03 | 1990.29 ms | 69.0% bf16 MFU | 262384 tok/s +step 1305/18794 | loss 3.710957 (-1.27z)| norm 0.3186 (+1.41z)| lr 6.00e-03 | 1983.72 ms | 69.2% bf16 MFU | 262479 tok/s +step 1306/18794 | loss 3.707492 (-1.33z)| norm 0.2925 (+0.85z)| lr 6.00e-03 | 1979.16 ms | 69.3% bf16 MFU | 262600 tok/s +step 1307/18794 | loss 3.663427 (-2.36z)| norm 0.2172 (-0.71z)| lr 6.00e-03 | 1989.73 ms | 69.0% bf16 MFU | 262645 tok/s +step 1308/18794 | loss 3.724766 (-0.86z)| norm 0.2165 (-0.73z)| lr 6.00e-03 | 1987.99 ms | 69.0% bf16 MFU | 262699 tok/s +step 1309/18794 | loss 3.825743 (+1.57z)| norm 0.2439 (-0.16z)| lr 6.00e-03 | 1982.18 ms | 69.2% bf16 MFU | 262789 tok/s +step 1310/18794 | loss 3.831989 (+1.70z)| norm 0.2499 (-0.04z)| lr 6.00e-03 | 1995.36 ms | 68.8% bf16 MFU | 262788 tok/s +step 1311/18794 | loss 3.686053 (-1.75z)| norm 0.2586 (+0.15z)| lr 6.00e-03 | 1992.31 ms | 68.9% bf16 MFU | 262806 tok/s +step 1312/18794 | loss 3.822895 (+1.50z)| norm 0.2494 (-0.03z)| lr 6.00e-03 | 1986.02 ms | 69.1% bf16 MFU | 262865 tok/s +step 1313/18794 | loss 3.727228 (-0.75z)| norm 0.2141 (-0.76z)| lr 6.00e-03 | 1982.87 ms | 69.2% bf16 MFU | 262942 tok/s +step 1314/18794 | loss 3.701907 (-1.33z)| norm 0.1767 (-1.51z)| lr 6.00e-03 | 1986.34 ms | 69.1% bf16 MFU | 262993 tok/s +step 1315/18794 | loss 3.708042 (-1.17z)| norm 0.2543 (+0.09z)| lr 6.00e-03 | 1996.02 ms | 68.8% bf16 MFU | 262976 tok/s +step 1316/18794 | loss 3.733129 (-0.56z)| norm 0.2879 (+0.77z)| lr 6.00e-03 | 1990.45 ms | 68.9% bf16 MFU | 262998 tok/s +step 1317/18794 | loss 3.779294 (+0.55z)| norm 0.2472 (-0.08z)| lr 6.00e-03 | 1994.54 ms | 68.8% bf16 MFU | 262991 tok/s +step 1318/18794 | loss 3.679636 (-1.79z)| norm 0.1866 (-1.36z)| lr 6.00e-03 | 1979.00 ms | 69.3% bf16 MFU | 263088 tok/s +step 1319/18794 | loss 3.739966 (-0.35z)| norm 0.1824 (-1.44z)| lr 6.00e-03 | 1991.10 ms | 68.9% bf16 MFU | 263099 tok/s +step 1320/18794 | loss 3.732722 (-0.52z)| norm 0.1814 (-1.43z)| lr 6.00e-03 | 1984.79 ms | 69.1% bf16 MFU | 263152 tok/s +step 1321/18794 | loss 3.719052 (-0.83z)| norm 0.1623 (-1.78z)| lr 6.00e-03 | 1984.85 ms | 69.1% bf16 MFU | 263201 tok/s +step 1322/18794 | loss 3.646155 (-2.47z)| norm 0.1637 (-1.71z)| lr 6.00e-03 | 1994.23 ms | 68.8% bf16 MFU | 263186 tok/s +step 1323/18794 | loss 3.763797 (+0.28z)| norm 0.1938 (-1.08z)| lr 6.00e-03 | 1994.84 ms | 68.8% bf16 MFU | 263168 tok/s +step 1324/18794 | loss 3.699978 (-1.20z)| norm 0.2780 (+0.63z)| lr 6.00e-03 | 1991.45 ms | 68.9% bf16 MFU | 263173 tok/s +step 1325/18794 | loss 3.789468 (+0.96z)| norm 0.3237 (+1.55z)| lr 6.00e-03 | 1980.96 ms | 69.3% bf16 MFU | 263248 tok/s +step 1326/18794 | loss 3.769172 (+0.46z)| norm 0.3251 (+1.55z)| lr 6.00e-03 | 1988.93 ms | 69.0% bf16 MFU | 263266 tok/s +step 1327/18794 | loss 3.742667 (-0.16z)| norm 0.3147 (+1.32z)| lr 6.00e-03 | 1978.40 ms | 69.4% bf16 MFU | 263353 tok/s +step 1328/18794 | loss 3.699284 (-1.21z)| norm 0.2598 (+0.23z)| lr 6.00e-03 | 1984.64 ms | 69.1% bf16 MFU | 263394 tok/s +step 1329/18794 | loss 3.753973 (+0.15z)| norm 0.2875 (+0.77z)| lr 6.00e-03 | 1988.39 ms | 69.0% bf16 MFU | 263408 tok/s +step 1330/18794 | loss 3.689659 (-1.41z)| norm 0.2721 (+0.45z)| lr 6.00e-03 | 1988.04 ms | 69.0% bf16 MFU | 263423 tok/s +step 1331/18794 | loss 3.702958 (-1.08z)| norm 0.2415 (-0.18z)| lr 6.00e-03 | 1980.93 ms | 69.3% bf16 MFU | 263485 tok/s +step 1332/18794 | loss 3.735477 (-0.26z)| norm 0.2884 (+0.75z)| lr 6.00e-03 | 1980.92 ms | 69.3% bf16 MFU | 263545 tok/s +step 1333/18794 | loss 3.777386 (+0.80z)| norm 0.2209 (-0.59z)| lr 6.00e-03 | 1982.95 ms | 69.2% bf16 MFU | 263587 tok/s +step 1334/18794 | loss 3.733438 (-0.30z)| norm 0.2373 (-0.24z)| lr 6.00e-03 | 1981.34 ms | 69.3% bf16 MFU | 263639 tok/s +step 1335/18794 | loss 3.692781 (-1.31z)| norm 0.2696 (+0.44z)| lr 6.00e-03 | 2018.52 ms | 68.0% bf16 MFU | 263444 tok/s +step 1336/18794 | loss 3.776278 (+0.85z)| norm 0.3038 (+1.15z)| lr 6.00e-03 | 2042.81 ms | 67.2% bf16 MFU | 263104 tok/s +step 1337/18794 | loss 3.711499 (-0.82z)| norm 0.2777 (+0.59z)| lr 6.00e-03 | 2041.75 ms | 67.2% bf16 MFU | 262788 tok/s +step 1338/18794 | loss 3.759186 (+0.42z)| norm 0.3053 (+1.15z)| lr 6.00e-03 | 2033.84 ms | 67.5% bf16 MFU | 262538 tok/s +step 1339/18794 | loss 3.675588 (-1.71z)| norm 0.3081 (+1.19z)| lr 6.00e-03 | 2035.71 ms | 67.4% bf16 MFU | 262288 tok/s +step 1340/18794 | loss 3.767016 (+0.64z)| norm 0.2806 (+0.62z)| lr 6.00e-03 | 2035.47 ms | 67.4% bf16 MFU | 262052 tok/s +step 1341/18794 | loss 3.684815 (-1.45z)| norm 0.2467 (-0.07z)| lr 6.00e-03 | 2039.17 ms | 67.3% bf16 MFU | 261805 tok/s +step 1342/18794 | loss 3.723592 (-0.43z)| norm 0.1757 (-1.51z)| lr 6.00e-03 | 2037.35 ms | 67.4% bf16 MFU | 261582 tok/s +step 1343/18794 | loss 3.710773 (-0.75z)| norm 0.1885 (-1.22z)| lr 6.00e-03 | 2034.31 ms | 67.5% bf16 MFU | 261389 tok/s +step 1344/18794 | loss 3.723254 (-0.42z)| norm 0.2027 (-0.92z)| lr 6.00e-03 | 2032.74 ms | 67.5% bf16 MFU | 261216 tok/s +step 1345/18794 | loss 3.783307 (+1.13z)| norm 0.2424 (-0.10z)| lr 6.00e-03 | 2032.26 ms | 67.5% bf16 MFU | 261054 tok/s +step 1346/18794 | loss 3.706008 (-0.85z)| norm 0.2725 (+0.54z)| lr 5.99e-03 | 2025.96 ms | 67.7% bf16 MFU | 260940 tok/s +step 1347/18794 | loss 3.722321 (-0.42z)| norm 0.2320 (-0.29z)| lr 5.99e-03 | 2041.59 ms | 67.2% bf16 MFU | 260734 tok/s +step 1348/18794 | loss 3.738281 (-0.00z)| norm 0.2099 (-0.74z)| lr 5.99e-03 | 2032.30 ms | 67.5% bf16 MFU | 260596 tok/s +step 1349/18794 | loss 3.621673 (-2.91z)| norm 0.1944 (-1.04z)| lr 5.99e-03 | 2025.39 ms | 67.8% bf16 MFU | 260509 tok/s +step 1350/18794 | loss 3.712175 (-0.62z)| norm 0.2129 (-0.66z)| lr 5.99e-03 | 2036.47 ms | 67.4% bf16 MFU | 260356 tok/s +step 1351/18794 | loss 3.665051 (-1.76z)| norm 0.1875 (-1.17z)| lr 5.99e-03 | 2022.41 ms | 67.9% bf16 MFU | 260300 tok/s +step 1352/18794 | loss 3.636125 (-2.39z)| norm 0.1719 (-1.46z)| lr 5.99e-03 | 2021.82 ms | 67.9% bf16 MFU | 260251 tok/s +step 1353/18794 | loss 3.739116 (+0.12z)| norm 0.2108 (-0.66z)| lr 5.99e-03 | 2025.68 ms | 67.7% bf16 MFU | 260179 tok/s +step 1354/18794 | loss 3.740472 (+0.18z)| norm 0.2346 (-0.17z)| lr 5.99e-03 | 2033.50 ms | 67.5% bf16 MFU | 260062 tok/s +step 1355/18794 | loss 3.698742 (-0.84z)| norm 0.2095 (-0.69z)| lr 5.99e-03 | 2039.85 ms | 67.3% bf16 MFU | 259910 tok/s +step 1356/18794 | loss 3.741131 (+0.20z)| norm 0.2120 (-0.65z)| lr 5.99e-03 | 2034.63 ms | 67.4% bf16 MFU | 259798 tok/s +step 1357/18794 | loss 3.738039 (+0.12z)| norm 0.2838 (+0.82z)| lr 5.99e-03 | 2021.57 ms | 67.9% bf16 MFU | 259776 tok/s +step 1358/18794 | loss 3.698260 (-0.85z)| norm 0.3418 (+1.96z)| lr 5.99e-03 | 2017.98 ms | 68.0% bf16 MFU | 259777 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.371703 +step 1359/18794 | loss 3.791393 (+1.42z)| norm 0.3668 (+2.37z)| lr 5.99e-03 | 2034.20 ms | 67.5% bf16 MFU | 259675 tok/s +step 1360/18794 | loss 3.734971 (+0.04z)| norm 0.3012 (+1.07z)| lr 5.99e-03 | 2025.31 ms | 67.8% bf16 MFU | 259635 tok/s +step 1361/18794 | loss 3.738223 (+0.14z)| norm 0.2458 (-0.03z)| lr 5.99e-03 | 2027.19 ms | 67.7% bf16 MFU | 259585 tok/s +step 1362/18794 | loss 3.771976 (+0.98z)| norm 0.1990 (-0.94z)| lr 5.99e-03 | 2026.86 ms | 67.7% bf16 MFU | 259539 tok/s +step 1363/18794 | loss 3.755959 (+0.59z)| norm 0.1761 (-1.37z)| lr 5.99e-03 | 2031.39 ms | 67.6% bf16 MFU | 259467 tok/s +step 1364/18794 | loss 3.740612 (+0.19z)| norm 0.1922 (-1.04z)| lr 5.99e-03 | 2024.90 ms | 67.8% bf16 MFU | 259439 tok/s +step 1365/18794 | loss 3.695093 (-0.94z)| norm 0.2119 (-0.65z)| lr 5.99e-03 | 2030.65 ms | 67.6% bf16 MFU | 259377 tok/s +step 1366/18794 | loss 3.683971 (-1.20z)| norm 0.2299 (-0.31z)| lr 5.99e-03 | 2031.67 ms | 67.5% bf16 MFU | 259311 tok/s +step 1367/18794 | loss 3.727232 (-0.11z)| norm 0.2291 (-0.33z)| lr 5.99e-03 | 2027.25 ms | 67.7% bf16 MFU | 259276 tok/s +step 1368/18794 | loss 3.728218 (-0.07z)| norm 0.2051 (-0.80z)| lr 5.99e-03 | 2025.70 ms | 67.7% bf16 MFU | 259253 tok/s +step 1369/18794 | loss 3.716316 (-0.36z)| norm 0.2450 (-0.02z)| lr 5.99e-03 | 2019.58 ms | 68.0% bf16 MFU | 259271 tok/s +step 1370/18794 | loss 3.706583 (-0.60z)| norm 0.2385 (-0.14z)| lr 5.99e-03 | 2024.14 ms | 67.8% bf16 MFU | 259258 tok/s +step 1371/18794 | loss 3.705206 (-0.63z)| norm 0.1933 (-1.00z)| lr 5.99e-03 | 2037.79 ms | 67.3% bf16 MFU | 259159 tok/s +step 1372/18794 | loss 3.676830 (-1.32z)| norm 0.1786 (-1.27z)| lr 5.99e-03 | 2030.94 ms | 67.6% bf16 MFU | 259109 tok/s +step 1373/18794 | loss 3.659394 (-1.71z)| norm 0.1684 (-1.44z)| lr 5.99e-03 | 2031.55 ms | 67.6% bf16 MFU | 259057 tok/s +step 1374/18794 | loss 3.702856 (-0.62z)| norm 0.1920 (-0.96z)| lr 5.99e-03 | 2026.49 ms | 67.7% bf16 MFU | 259040 tok/s +step 1375/18794 | loss 3.686838 (-1.00z)| norm 0.2184 (-0.43z)| lr 5.99e-03 | 2023.69 ms | 67.8% bf16 MFU | 259042 tok/s +step 1376/18794 | loss 3.703794 (-0.57z)| norm 0.2153 (-0.48z)| lr 5.99e-03 | 2022.19 ms | 67.9% bf16 MFU | 259053 tok/s +step 1377/18794 | loss 3.649855 (-1.86z)| norm 0.2287 (-0.21z)| lr 5.99e-03 | 2034.99 ms | 67.4% bf16 MFU | 258982 tok/s +step 1378/18794 | loss 3.693669 (-0.79z)| norm 0.2540 (+0.28z)| lr 5.99e-03 | 2009.47 ms | 68.3% bf16 MFU | 259079 tok/s +step 1379/18794 | loss 3.723310 (-0.04z)| norm 0.2706 (+0.60z)| lr 5.99e-03 | 2026.43 ms | 67.7% bf16 MFU | 259061 tok/s +step 1380/18794 | loss 3.741787 (+0.42z)| norm 0.2582 (+0.34z)| lr 5.99e-03 | 2025.93 ms | 67.7% bf16 MFU | 259047 tok/s +step 1381/18794 | loss 3.786784 (+1.54z)| norm 0.2788 (+0.74z)| lr 5.99e-03 | 2023.34 ms | 67.8% bf16 MFU | 259051 tok/s +step 1382/18794 | loss 3.764750 (+0.97z)| norm 0.2958 (+1.06z)| lr 5.99e-03 | 2024.70 ms | 67.8% bf16 MFU | 259046 tok/s +step 1383/18794 | loss 3.748316 (+0.57z)| norm 0.2081 (-0.68z)| lr 5.99e-03 | 2018.20 ms | 68.0% bf16 MFU | 259082 tok/s +step 1384/18794 | loss 3.669848 (-1.36z)| norm 0.1944 (-0.94z)| lr 5.99e-03 | 2024.82 ms | 67.8% bf16 MFU | 259075 tok/s +step 1385/18794 | loss 3.694619 (-0.74z)| norm 0.1508 (-1.77z)| lr 5.99e-03 | 2014.86 ms | 68.1% bf16 MFU | 259132 tok/s +step 1386/18794 | loss 3.694055 (-0.74z)| norm 0.1774 (-1.22z)| lr 5.99e-03 | 2025.50 ms | 67.8% bf16 MFU | 259117 tok/s +step 1387/18794 | loss 3.746390 (+0.55z)| norm 0.1868 (-1.01z)| lr 5.99e-03 | 2023.09 ms | 67.8% bf16 MFU | 259119 tok/s +step 1388/18794 | loss 3.744141 (+0.50z)| norm 0.2135 (-0.46z)| lr 5.99e-03 | 2027.64 ms | 67.7% bf16 MFU | 259092 tok/s +step 1389/18794 | loss 3.709376 (-0.35z)| norm 0.2746 (+0.75z)| lr 5.99e-03 | 2024.01 ms | 67.8% bf16 MFU | 259089 tok/s +step 1390/18794 | loss 3.726776 (+0.09z)| norm 0.2963 (+1.18z)| lr 5.99e-03 | 2018.54 ms | 68.0% bf16 MFU | 259121 tok/s +step 1391/18794 | loss 3.678496 (-1.10z)| norm 0.2947 (+1.17z)| lr 5.99e-03 | 2014.97 ms | 68.1% bf16 MFU | 259175 tok/s +step 1392/18794 | loss 3.742313 (+0.50z)| norm 0.2576 (+0.44z)| lr 5.99e-03 | 2033.45 ms | 67.5% bf16 MFU | 259108 tok/s +step 1393/18794 | loss 3.739687 (+0.43z)| norm 0.2321 (-0.07z)| lr 5.99e-03 | 2015.13 ms | 68.1% bf16 MFU | 259161 tok/s +step 1394/18794 | loss 3.708678 (-0.34z)| norm 0.2592 (+0.48z)| lr 5.99e-03 | 1999.32 ms | 68.6% bf16 MFU | 259315 tok/s +step 1395/18794 | loss 3.724983 (+0.08z)| norm 0.2548 (+0.37z)| lr 5.99e-03 | 2017.71 ms | 68.0% bf16 MFU | 259341 tok/s +step 1396/18794 | loss 3.728288 (+0.15z)| norm 0.2505 (+0.27z)| lr 5.99e-03 | 2008.35 ms | 68.3% bf16 MFU | 259427 tok/s +step 1397/18794 | loss 3.663588 (-1.48z)| norm 0.2033 (-0.72z)| lr 5.99e-03 | 2017.45 ms | 68.0% bf16 MFU | 259449 tok/s +step 1398/18794 | loss 3.745404 (+0.58z)| norm 0.2228 (-0.33z)| lr 5.99e-03 | 2008.83 ms | 68.3% bf16 MFU | 259526 tok/s +step 1399/18794 | loss 3.702021 (-0.52z)| norm 0.2348 (-0.09z)| lr 5.99e-03 | 2017.74 ms | 68.0% bf16 MFU | 259542 tok/s +step 1400/18794 | loss 3.750963 (+0.71z)| norm 0.2345 (-0.11z)| lr 5.99e-03 | 2019.23 ms | 68.0% bf16 MFU | 259547 tok/s +step 1401/18794 | loss 3.706679 (-0.42z)| norm 0.2269 (-0.28z)| lr 5.99e-03 | 2010.79 ms | 68.2% bf16 MFU | 259607 tok/s +step 1402/18794 | loss 3.687640 (-0.88z)| norm 0.2438 (+0.09z)| lr 5.99e-03 | 2005.40 ms | 68.4% bf16 MFU | 259699 tok/s +step 1403/18794 | loss 3.651587 (-1.76z)| norm 0.1998 (-0.85z)| lr 5.99e-03 | 2009.35 ms | 68.3% bf16 MFU | 259760 tok/s +step 1404/18794 | loss 3.712473 (-0.22z)| norm 0.1905 (-1.03z)| lr 5.99e-03 | 2017.61 ms | 68.0% bf16 MFU | 259765 tok/s +step 1405/18794 | loss 3.673302 (-1.19z)| norm 0.2270 (-0.21z)| lr 5.99e-03 | 2017.70 ms | 68.0% bf16 MFU | 259769 tok/s +step 1406/18794 | loss 3.675004 (-1.13z)| norm 0.2697 (+0.76z)| lr 5.99e-03 | 2019.56 ms | 68.0% bf16 MFU | 259760 tok/s +step 1407/18794 | loss 3.726084 (+0.12z)| norm 0.2628 (+0.59z)| lr 5.99e-03 | 2004.20 ms | 68.5% bf16 MFU | 259852 tok/s +step 1408/18794 | loss 3.733768 (+0.31z)| norm 0.2301 (-0.15z)| lr 5.99e-03 | 2018.33 ms | 68.0% bf16 MFU | 259848 tok/s +step 1409/18794 | loss 3.795876 (+1.92z)| norm 0.2411 (+0.10z)| lr 5.99e-03 | 2018.16 ms | 68.0% bf16 MFU | 259845 tok/s +step 1410/18794 | loss 3.705353 (-0.38z)| norm 0.2216 (-0.33z)| lr 5.99e-03 | 2013.24 ms | 68.2% bf16 MFU | 259873 tok/s +step 1411/18794 | loss 3.718572 (-0.04z)| norm 0.2734 (+0.83z)| lr 5.99e-03 | 2014.65 ms | 68.1% bf16 MFU | 259892 tok/s +step 1412/18794 | loss 3.720499 (+0.04z)| norm 0.3048 (+1.51z)| lr 5.99e-03 | 2024.58 ms | 67.8% bf16 MFU | 259845 tok/s +step 1413/18794 | loss 3.714381 (-0.13z)| norm 0.2316 (-0.13z)| lr 5.99e-03 | 2010.58 ms | 68.3% bf16 MFU | 259891 tok/s +step 1414/18794 | loss 3.724337 (+0.15z)| norm 0.2395 (+0.04z)| lr 5.99e-03 | 2003.53 ms | 68.5% bf16 MFU | 259981 tok/s +step 1415/18794 | loss 3.653624 (-1.78z)| norm 0.2261 (-0.26z)| lr 5.99e-03 | 1996.71 ms | 68.7% bf16 MFU | 260110 tok/s +step 1416/18794 | loss 3.702200 (-0.44z)| norm 0.1967 (-0.90z)| lr 5.99e-03 | 2000.69 ms | 68.6% bf16 MFU | 260208 tok/s +step 1417/18794 | loss 3.666369 (-1.40z)| norm 0.1882 (-1.08z)| lr 5.99e-03 | 2018.24 ms | 68.0% bf16 MFU | 260186 tok/s +step 1418/18794 | loss 3.648906 (-1.85z)| norm 0.1884 (-1.07z)| lr 5.99e-03 | 2010.19 ms | 68.3% bf16 MFU | 260217 tok/s +step 1419/18794 | loss 3.639180 (-2.05z)| norm 0.2220 (-0.33z)| lr 5.99e-03 | 2017.67 ms | 68.0% bf16 MFU | 260199 tok/s +step 1420/18794 | loss 3.703513 (-0.32z)| norm 0.2712 (+0.77z)| lr 5.99e-03 | 2014.94 ms | 68.1% bf16 MFU | 260199 tok/s +step 1421/18794 | loss 3.645586 (-1.82z)| norm 0.2877 (+1.13z)| lr 5.99e-03 | 2003.47 ms | 68.5% bf16 MFU | 260273 tok/s +step 1422/18794 | loss 3.713548 (-0.05z)| norm 0.3035 (+1.46z)| lr 5.99e-03 | 2015.60 ms | 68.1% bf16 MFU | 260266 tok/s +step 1423/18794 | loss 3.892399 (+4.27z)| norm 0.2796 (+0.89z)| lr 5.99e-03 | 1990.00 ms | 69.0% bf16 MFU | 260425 tok/s +step 1424/18794 | loss 3.689274 (-0.66z)| norm 0.3041 (+1.44z)| lr 5.99e-03 | 2002.80 ms | 68.5% bf16 MFU | 260493 tok/s +step 1425/18794 | loss 3.691220 (-0.60z)| norm 0.2900 (+1.14z)| lr 5.99e-03 | 2004.23 ms | 68.5% bf16 MFU | 260548 tok/s +step 1426/18794 | loss 3.728459 (+0.33z)| norm 0.3247 (+1.95z)| lr 5.99e-03 | 2005.77 ms | 68.4% bf16 MFU | 260590 tok/s +step 1427/18794 | loss 3.704605 (-0.26z)| norm 0.2891 (+1.14z)| lr 5.99e-03 | 2011.06 ms | 68.2% bf16 MFU | 260596 tok/s +step 1428/18794 | loss 3.720549 (+0.14z)| norm 0.2689 (+0.66z)| lr 5.99e-03 | 2006.31 ms | 68.4% bf16 MFU | 260632 tok/s +step 1429/18794 | loss 3.708790 (-0.15z)| norm 0.2300 (-0.24z)| lr 5.99e-03 | 2013.61 ms | 68.2% bf16 MFU | 260619 tok/s +step 1430/18794 | loss 3.700722 (-0.35z)| norm 0.2461 (+0.15z)| lr 5.99e-03 | 2016.81 ms | 68.0% bf16 MFU | 260586 tok/s +step 1431/18794 | loss 3.680065 (-0.86z)| norm 0.1980 (-0.97z)| lr 5.99e-03 | 2006.96 ms | 68.4% bf16 MFU | 260618 tok/s +step 1432/18794 | loss 3.707480 (-0.17z)| norm 0.2134 (-0.59z)| lr 5.99e-03 | 2002.69 ms | 68.5% bf16 MFU | 260677 tok/s +step 1433/18794 | loss 3.693510 (-0.50z)| norm 0.2198 (-0.44z)| lr 5.99e-03 | 2026.79 ms | 67.7% bf16 MFU | 260577 tok/s +step 1434/18794 | loss 3.661017 (-1.30z)| norm 0.2131 (-0.60z)| lr 5.99e-03 | 2017.05 ms | 68.0% bf16 MFU | 260545 tok/s +step 1435/18794 | loss 3.707709 (-0.13z)| norm 0.2048 (-0.78z)| lr 5.99e-03 | 1994.16 ms | 68.8% bf16 MFU | 260663 tok/s +step 1436/18794 | loss 3.672307 (-1.00z)| norm 0.1853 (-1.21z)| lr 5.99e-03 | 2006.29 ms | 68.4% bf16 MFU | 260696 tok/s +step 1437/18794 | loss 3.737923 (+0.66z)| norm 0.1788 (-1.34z)| lr 5.99e-03 | 1999.46 ms | 68.6% bf16 MFU | 260772 tok/s +step 1438/18794 | loss 3.714601 (+0.08z)| norm 0.2184 (-0.39z)| lr 5.99e-03 | 2002.53 ms | 68.5% bf16 MFU | 260824 tok/s +step 1439/18794 | loss 3.665054 (-1.18z)| norm 0.2736 (+0.95z)| lr 5.99e-03 | 2021.82 ms | 67.9% bf16 MFU | 260748 tok/s +step 1440/18794 | loss 3.705171 (-0.15z)| norm 0.2332 (-0.02z)| lr 5.99e-03 | 1994.92 ms | 68.8% bf16 MFU | 260852 tok/s +step 1441/18794 | loss 3.695386 (-0.40z)| norm 0.2229 (-0.26z)| lr 5.99e-03 | 2003.07 ms | 68.5% bf16 MFU | 260896 tok/s +step 1442/18794 | loss 3.693690 (-0.44z)| norm 0.1965 (-0.91z)| lr 5.99e-03 | 2010.26 ms | 68.3% bf16 MFU | 260892 tok/s +step 1443/18794 | loss 3.664763 (-1.16z)| norm 0.1685 (-1.58z)| lr 5.99e-03 | 2004.23 ms | 68.5% bf16 MFU | 260926 tok/s +step 1444/18794 | loss 3.684587 (-0.64z)| norm 0.1825 (-1.23z)| lr 5.99e-03 | 1991.59 ms | 68.9% bf16 MFU | 261043 tok/s +step 1445/18794 | loss 3.729060 (+0.51z)| norm 0.2224 (-0.26z)| lr 5.99e-03 | 2014.27 ms | 68.1% bf16 MFU | 261005 tok/s +step 1446/18794 | loss 3.666265 (-1.10z)| norm 0.2541 (+0.51z)| lr 5.99e-03 | 1999.32 ms | 68.6% bf16 MFU | 261066 tok/s +step 1447/18794 | loss 3.685862 (-0.58z)| norm 0.2540 (+0.50z)| lr 5.99e-03 | 1991.69 ms | 68.9% bf16 MFU | 261175 tok/s +step 1448/18794 | loss 3.679603 (-0.73z)| norm 0.2199 (-0.33z)| lr 5.99e-03 | 2010.18 ms | 68.3% bf16 MFU | 261157 tok/s +step 1449/18794 | loss 3.631589 (-1.98z)| norm 0.1903 (-1.04z)| lr 5.99e-03 | 2012.10 ms | 68.2% bf16 MFU | 261128 tok/s +step 1450/18794 | loss 3.705101 (-0.07z)| norm 0.1664 (-1.59z)| lr 5.99e-03 | 1989.26 ms | 69.0% bf16 MFU | 261249 tok/s +step 1451/18794 | loss 3.691624 (-0.43z)| norm 0.1684 (-1.53z)| lr 5.99e-03 | 2010.38 ms | 68.3% bf16 MFU | 261226 tok/s +step 1452/18794 | loss 3.658935 (-1.30z)| norm 0.1806 (-1.24z)| lr 5.99e-03 | 1992.86 ms | 68.9% bf16 MFU | 261319 tok/s +step 1453/18794 | loss 3.647251 (-1.57z)| norm 0.2172 (-0.37z)| lr 5.99e-03 | 2009.08 ms | 68.3% bf16 MFU | 261301 tok/s +step 1454/18794 | loss 3.711305 (+0.11z)| norm 0.2498 (+0.40z)| lr 5.99e-03 | 2014.32 ms | 68.1% bf16 MFU | 261250 tok/s +step 1455/18794 | loss 3.679178 (-0.73z)| norm 0.2123 (-0.49z)| lr 5.99e-03 | 1995.82 ms | 68.8% bf16 MFU | 261322 tok/s +step 1456/18794 | loss 3.702783 (-0.10z)| norm 0.1850 (-1.13z)| lr 5.99e-03 | 2004.14 ms | 68.5% bf16 MFU | 261336 tok/s +step 1457/18794 | loss 3.715754 (+0.24z)| norm 0.2075 (-0.58z)| lr 5.99e-03 | 1995.79 ms | 68.8% bf16 MFU | 261404 tok/s +step 1458/18794 | loss 3.658488 (-1.24z)| norm 0.2438 (+0.32z)| lr 5.99e-03 | 1994.07 ms | 68.8% bf16 MFU | 261480 tok/s +step 1459/18794 | loss 3.706393 (+0.03z)| norm 0.2123 (-0.45z)| lr 5.99e-03 | 2010.23 ms | 68.3% bf16 MFU | 261447 tok/s +step 1460/18794 | loss 3.711331 (+0.17z)| norm 0.1741 (-1.43z)| lr 5.99e-03 | 1995.76 ms | 68.8% bf16 MFU | 261509 tok/s +step 1461/18794 | loss 3.653745 (-1.34z)| norm 0.1849 (-1.12z)| lr 5.99e-03 | 2007.51 ms | 68.4% bf16 MFU | 261492 tok/s +step 1462/18794 | loss 3.669247 (-0.91z)| norm 0.1819 (-1.19z)| lr 5.99e-03 | 1989.76 ms | 69.0% bf16 MFU | 261592 tok/s +step 1463/18794 | loss 3.646021 (-1.51z)| norm 0.1982 (-0.78z)| lr 5.99e-03 | 2001.05 ms | 68.6% bf16 MFU | 261613 tok/s +step 1464/18794 | loss 3.637848 (-1.69z)| norm 0.2204 (-0.20z)| lr 5.99e-03 | 2000.95 ms | 68.6% bf16 MFU | 261633 tok/s +step 1465/18794 | loss 3.676306 (-0.65z)| norm 0.2318 (+0.10z)| lr 5.99e-03 | 2007.58 ms | 68.4% bf16 MFU | 261609 tok/s +step 1466/18794 | loss 3.658618 (-1.11z)| norm 0.2102 (-0.47z)| lr 5.99e-03 | 2008.06 ms | 68.3% bf16 MFU | 261583 tok/s +step 1467/18794 | loss 3.681809 (-0.48z)| norm 0.1977 (-0.79z)| lr 5.99e-03 | 1995.64 ms | 68.8% bf16 MFU | 261640 tok/s +step 1468/18794 | loss 3.680254 (-0.51z)| norm 0.2369 (+0.24z)| lr 5.99e-03 | 2001.07 ms | 68.6% bf16 MFU | 261658 tok/s +step 1469/18794 | loss 3.644194 (-1.44z)| norm 0.2415 (+0.36z)| lr 5.99e-03 | 2010.10 ms | 68.3% bf16 MFU | 261617 tok/s +step 1470/18794 | loss 3.664926 (-0.88z)| norm 0.2556 (+0.73z)| lr 5.99e-03 | 2011.86 ms | 68.2% bf16 MFU | 261566 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.006957 +step 1471/18794 | loss 3.679335 (-0.49z)| norm 0.3066 (+2.01z)| lr 5.99e-03 | 1996.94 ms | 68.7% bf16 MFU | 261615 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.319780 +step 1472/18794 | loss 3.740198 (+1.08z)| norm 0.3218 (+2.32z)| lr 5.99e-03 | 2010.96 ms | 68.2% bf16 MFU | 261570 tok/s +step 1473/18794 | loss 3.657220 (-1.08z)| norm 0.2565 (+0.64z)| lr 5.99e-03 | 2003.47 ms | 68.5% bf16 MFU | 261576 tok/s +step 1474/18794 | loss 3.652805 (-1.18z)| norm 0.2312 (-0.02z)| lr 5.99e-03 | 1988.64 ms | 69.0% bf16 MFU | 261679 tok/s +step 1475/18794 | loss 3.648506 (-1.27z)| norm 0.1990 (-0.84z)| lr 5.99e-03 | 1996.35 ms | 68.7% bf16 MFU | 261726 tok/s +step 1476/18794 | loss 3.675594 (-0.57z)| norm 0.1909 (-1.04z)| lr 5.99e-03 | 2009.32 ms | 68.3% bf16 MFU | 261686 tok/s +step 1477/18794 | loss 3.669401 (-0.73z)| norm 0.2067 (-0.63z)| lr 5.99e-03 | 1995.82 ms | 68.8% bf16 MFU | 261737 tok/s +step 1478/18794 | loss 3.643536 (-1.37z)| norm 0.2047 (-0.67z)| lr 5.99e-03 | 2004.19 ms | 68.5% bf16 MFU | 261730 tok/s +step 1479/18794 | loss 3.689531 (-0.19z)| norm 0.2352 (+0.12z)| lr 5.99e-03 | 2003.23 ms | 68.5% bf16 MFU | 261729 tok/s +step 1480/18794 | loss 3.675031 (-0.55z)| norm 0.2381 (+0.21z)| lr 5.99e-03 | 2001.68 ms | 68.6% bf16 MFU | 261739 tok/s +step 1481/18794 | loss 3.645352 (-1.30z)| norm 0.2290 (-0.02z)| lr 5.99e-03 | 1989.76 ms | 69.0% bf16 MFU | 261827 tok/s +step 1482/18794 | loss 3.686508 (-0.20z)| norm 0.2126 (-0.43z)| lr 5.99e-03 | 2003.07 ms | 68.5% bf16 MFU | 261822 tok/s +step 1483/18794 | loss 3.711324 (+0.47z)| norm 0.2069 (-0.58z)| lr 5.99e-03 | 2003.01 ms | 68.5% bf16 MFU | 261819 tok/s +step 1484/18794 | loss 3.613861 (-2.09z)| norm 0.1925 (-0.96z)| lr 5.99e-03 | 1995.48 ms | 68.8% bf16 MFU | 261865 tok/s +step 1485/18794 | loss 3.683402 (-0.26z)| norm 0.1939 (-0.95z)| lr 5.99e-03 | 1993.45 ms | 68.8% bf16 MFU | 261922 tok/s +step 1486/18794 | loss 3.668028 (-0.65z)| norm 0.1849 (-1.19z)| lr 5.99e-03 | 1999.18 ms | 68.6% bf16 MFU | 261938 tok/s +step 1487/18794 | loss 3.629926 (-1.62z)| norm 0.2080 (-0.58z)| lr 5.99e-03 | 2002.85 ms | 68.5% bf16 MFU | 261930 tok/s +step 1488/18794 | loss 3.644334 (-1.22z)| norm 0.2308 (+0.03z)| lr 5.99e-03 | 1988.91 ms | 69.0% bf16 MFU | 262014 tok/s +step 1489/18794 | loss 3.694633 (+0.11z)| norm 0.2360 (+0.18z)| lr 5.99e-03 | 1997.15 ms | 68.7% bf16 MFU | 262039 tok/s +step 1490/18794 | loss 3.718899 (+0.75z)| norm 0.2641 (+0.97z)| lr 5.99e-03 | 1996.51 ms | 68.7% bf16 MFU | 262067 tok/s +step 1491/18794 | loss 3.600454 (-2.29z)| norm 0.2395 (+0.31z)| lr 5.99e-03 | 2002.44 ms | 68.5% bf16 MFU | 262055 tok/s +step 1492/18794 | loss 3.641166 (-1.22z)| norm 0.1933 (-0.97z)| lr 5.99e-03 | 2010.68 ms | 68.3% bf16 MFU | 261990 tok/s +step 1493/18794 | loss 3.701661 (+0.35z)| norm 0.2206 (-0.20z)| lr 5.99e-03 | 1998.70 ms | 68.7% bf16 MFU | 262006 tok/s +step 1494/18794 | loss 3.629070 (-1.50z)| norm 0.2009 (-0.73z)| lr 5.99e-03 | 1978.39 ms | 69.4% bf16 MFU | 262156 tok/s +step 1495/18794 | loss 3.639526 (-1.21z)| norm 0.1678 (-1.62z)| lr 5.99e-03 | 1985.66 ms | 69.1% bf16 MFU | 262250 tok/s +step 1496/18794 | loss 3.687382 (+0.03z)| norm 0.1768 (-1.34z)| lr 5.99e-03 | 1997.56 ms | 68.7% bf16 MFU | 262261 tok/s +step 1497/18794 | loss 3.693322 (+0.18z)| norm 0.2453 (+0.53z)| lr 5.99e-03 | 1996.05 ms | 68.8% bf16 MFU | 262281 tok/s +step 1498/18794 | loss 3.628359 (-1.47z)| norm 0.2972 (+1.91z)| lr 5.99e-03 | 1996.45 ms | 68.7% bf16 MFU | 262297 tok/s +step 1499/18794 | loss 3.693366 (+0.21z)| norm 0.2333 (+0.18z)| lr 5.99e-03 | 1986.19 ms | 69.1% bf16 MFU | 262381 tok/s +step 1500/18794 | loss 3.686620 (+0.05z)| norm 0.1895 (-0.99z)| lr 5.99e-03 | 1988.75 ms | 69.0% bf16 MFU | 262443 tok/s +val loss 3.683795 +Writing state to log_gpt3_125M_edu_v4/state_00001500_00001.bin +HellaSwag: 2593/10042 = 0.258215 +Writing checkpoint at step 1500 +Writing model to log_gpt3_125M_edu_v4/model_00001500.bin +Writing state to log_gpt3_125M_edu_v4/state_00001500_00000.bin +step 1501/18794 | loss 3.636293 (-1.24z)| norm 0.1750 (-1.35z)| lr 5.99e-03 | 1989.49 ms | 69.0% bf16 MFU | 262497 tok/s +step 1502/18794 | loss 3.606933 (-1.95z)| norm 0.1830 (-1.12z)| lr 5.99e-03 | 2000.21 ms | 68.6% bf16 MFU | 262478 tok/s +step 1503/18794 | loss 3.627193 (-1.41z)| norm 0.2011 (-0.64z)| lr 5.99e-03 | 1995.20 ms | 68.8% bf16 MFU | 262493 tok/s +step 1504/18794 | loss 3.669230 (-0.34z)| norm 0.2184 (-0.19z)| lr 5.99e-03 | 1990.73 ms | 68.9% bf16 MFU | 262537 tok/s +step 1505/18794 | loss 3.668762 (-0.35z)| norm 0.1982 (-0.71z)| lr 5.99e-03 | 1989.43 ms | 69.0% bf16 MFU | 262587 tok/s +step 1506/18794 | loss 3.692886 (+0.26z)| norm 0.2402 (+0.41z)| lr 5.99e-03 | 1987.30 ms | 69.1% bf16 MFU | 262648 tok/s +step 1507/18794 | loss 3.647335 (-0.88z)| norm 0.2072 (-0.46z)| lr 5.99e-03 | 1994.13 ms | 68.8% bf16 MFU | 262662 tok/s +step 1508/18794 | loss 3.715288 (+0.86z)| norm 0.1808 (-1.14z)| lr 5.99e-03 | 1986.29 ms | 69.1% bf16 MFU | 262726 tok/s +step 1509/18794 | loss 3.689981 (+0.25z)| norm 0.2508 (+0.72z)| lr 5.99e-03 | 1980.17 ms | 69.3% bf16 MFU | 262828 tok/s +step 1510/18794 | loss 3.646190 (-0.90z)| norm 0.2744 (+1.32z)| lr 5.99e-03 | 1983.78 ms | 69.2% bf16 MFU | 262901 tok/s +step 1511/18794 | loss 3.654955 (-0.65z)| norm 0.2329 (+0.24z)| lr 5.99e-03 | 1989.44 ms | 69.0% bf16 MFU | 262933 tok/s +step 1512/18794 | loss 3.668438 (-0.28z)| norm 0.2579 (+0.94z)| lr 5.99e-03 | 1988.28 ms | 69.0% bf16 MFU | 262971 tok/s +step 1513/18794 | loss 3.679958 (+0.04z)| norm 0.2977 (+1.96z)| lr 5.99e-03 | 1982.83 ms | 69.2% bf16 MFU | 263043 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.035186 +step 1514/18794 | loss 3.686729 (+0.24z)| norm 0.3028 (+2.04z)| lr 5.99e-03 | 1994.09 ms | 68.8% bf16 MFU | 263037 tok/s +step 1515/18794 | loss 3.629785 (-1.30z)| norm 0.2786 (+1.38z)| lr 5.99e-03 | 2007.06 ms | 68.4% bf16 MFU | 262946 tok/s +step 1516/18794 | loss 3.663399 (-0.38z)| norm 0.2723 (+1.19z)| lr 5.99e-03 | 1981.74 ms | 69.2% bf16 MFU | 263027 tok/s +step 1517/18794 | loss 3.614713 (-1.67z)| norm 0.2331 (+0.17z)| lr 5.99e-03 | 1983.13 ms | 69.2% bf16 MFU | 263094 tok/s +step 1518/18794 | loss 3.656790 (-0.54z)| norm 0.2341 (+0.19z)| lr 5.99e-03 | 1988.67 ms | 69.0% bf16 MFU | 263121 tok/s +step 1519/18794 | loss 3.651679 (-0.68z)| norm 0.2300 (+0.08z)| lr 5.99e-03 | 1985.30 ms | 69.1% bf16 MFU | 263170 tok/s +step 1520/18794 | loss 3.681149 (+0.12z)| norm 0.2026 (-0.62z)| lr 5.99e-03 | 1986.99 ms | 69.1% bf16 MFU | 263204 tok/s +step 1521/18794 | loss 3.627722 (-1.32z)| norm 0.1826 (-1.12z)| lr 5.99e-03 | 1987.37 ms | 69.1% bf16 MFU | 263234 tok/s +step 1522/18794 | loss 3.656049 (-0.54z)| norm 0.2059 (-0.49z)| lr 5.99e-03 | 1979.56 ms | 69.3% bf16 MFU | 263315 tok/s +step 1523/18794 | loss 3.675921 (+0.07z)| norm 0.2502 (+0.71z)| lr 5.99e-03 | 1980.23 ms | 69.3% bf16 MFU | 263387 tok/s +step 1524/18794 | loss 3.703066 (+0.96z)| norm 0.2493 (+0.71z)| lr 5.99e-03 | 1980.86 ms | 69.3% bf16 MFU | 263452 tok/s +step 1525/18794 | loss 3.628729 (-1.47z)| norm 0.1923 (-0.85z)| lr 5.99e-03 | 1979.06 ms | 69.3% bf16 MFU | 263525 tok/s +step 1526/18794 | loss 3.619241 (-1.75z)| norm 0.1938 (-0.80z)| lr 5.99e-03 | 2040.72 ms | 67.2% bf16 MFU | 263195 tok/s +step 1527/18794 | loss 3.586121 (-2.71z)| norm 0.2189 (-0.05z)| lr 5.99e-03 | 2040.89 ms | 67.2% bf16 MFU | 262880 tok/s +step 1528/18794 | loss 3.656713 (-0.45z)| norm 0.2821 (+1.82z)| lr 5.99e-03 | 2034.78 ms | 67.4% bf16 MFU | 262619 tok/s +step 1529/18794 | loss 3.580274 (-2.77z)| norm 0.2455 (+0.73z)| lr 5.99e-03 | 2039.87 ms | 67.3% bf16 MFU | 262339 tok/s +step 1530/18794 | loss 3.619047 (-1.53z)| norm 0.2274 (+0.20z)| lr 5.99e-03 | 2033.82 ms | 67.5% bf16 MFU | 262111 tok/s +step 1531/18794 | loss 3.666513 (-0.06z)| norm 0.2641 (+1.26z)| lr 5.99e-03 | 2034.06 ms | 67.5% bf16 MFU | 261893 tok/s +step 1532/18794 | loss 3.641150 (-0.82z)| norm 0.2480 (+0.77z)| lr 5.99e-03 | 2034.16 ms | 67.5% bf16 MFU | 261686 tok/s +step 1533/18794 | loss 3.669850 (+0.08z)| norm 0.2075 (-0.41z)| lr 5.99e-03 | 2023.12 ms | 67.8% bf16 MFU | 261559 tok/s +step 1534/18794 | loss 3.661628 (-0.18z)| norm 0.1955 (-0.75z)| lr 5.99e-03 | 2040.14 ms | 67.3% bf16 MFU | 261330 tok/s +step 1535/18794 | loss 3.657996 (-0.28z)| norm 0.2002 (-0.61z)| lr 5.99e-03 | 2041.24 ms | 67.2% bf16 MFU | 261106 tok/s +step 1536/18794 | loss 3.635868 (-0.96z)| norm 0.2248 (+0.09z)| lr 5.99e-03 | 2025.20 ms | 67.8% bf16 MFU | 260995 tok/s +step 1537/18794 | loss 3.666813 (+0.03z)| norm 0.2031 (-0.55z)| lr 5.99e-03 | 2042.60 ms | 67.2% bf16 MFU | 260779 tok/s +step 1538/18794 | loss 3.629015 (-1.16z)| norm 0.1960 (-0.76z)| lr 5.99e-03 | 2043.15 ms | 67.2% bf16 MFU | 260570 tok/s +step 1539/18794 | loss 3.594855 (-2.19z)| norm 0.1929 (-0.83z)| lr 5.99e-03 | 2025.91 ms | 67.7% bf16 MFU | 260481 tok/s +step 1540/18794 | loss 3.613668 (-1.56z)| norm 0.1763 (-1.30z)| lr 5.99e-03 | 2022.55 ms | 67.9% bf16 MFU | 260418 tok/s +step 1541/18794 | loss 3.714554 (+1.59z)| norm 0.1965 (-0.69z)| lr 5.99e-03 | 2036.29 ms | 67.4% bf16 MFU | 260271 tok/s +step 1542/18794 | loss 3.634654 (-0.88z)| norm 0.2197 (-0.02z)| lr 5.99e-03 | 2028.09 ms | 67.7% bf16 MFU | 260183 tok/s +step 1543/18794 | loss 3.615253 (-1.45z)| norm 0.2139 (-0.20z)| lr 5.99e-03 | 2026.40 ms | 67.7% bf16 MFU | 260110 tok/s +step 1544/18794 | loss 3.646307 (-0.49z)| norm 0.2218 (+0.02z)| lr 5.99e-03 | 2030.90 ms | 67.6% bf16 MFU | 260013 tok/s +step 1545/18794 | loss 3.593491 (-2.07z)| norm 0.1986 (-0.67z)| lr 5.99e-03 | 2040.71 ms | 67.2% bf16 MFU | 259858 tok/s +step 1546/18794 | loss 3.646980 (-0.42z)| norm 0.1989 (-0.64z)| lr 5.99e-03 | 2041.12 ms | 67.2% bf16 MFU | 259708 tok/s +step 1547/18794 | loss 3.637270 (-0.70z)| norm 0.1851 (-1.04z)| lr 5.99e-03 | 2031.75 ms | 67.5% bf16 MFU | 259625 tok/s +step 1548/18794 | loss 3.574576 (-2.52z)| norm 0.1963 (-0.69z)| lr 5.99e-03 | 2026.31 ms | 67.7% bf16 MFU | 259581 tok/s +step 1549/18794 | loss 3.591212 (-1.98z)| norm 0.1977 (-0.65z)| lr 5.99e-03 | 2040.59 ms | 67.3% bf16 MFU | 259448 tok/s +step 1550/18794 | loss 3.667670 (+0.28z)| norm 0.1823 (-1.13z)| lr 5.99e-03 | 2028.78 ms | 67.6% bf16 MFU | 259397 tok/s +step 1551/18794 | loss 3.658585 (+0.02z)| norm 0.2300 (+0.30z)| lr 5.99e-03 | 2027.06 ms | 67.7% bf16 MFU | 259359 tok/s +step 1552/18794 | loss 3.593668 (-1.86z)| norm 0.2336 (+0.40z)| lr 5.99e-03 | 2020.05 ms | 67.9% bf16 MFU | 259369 tok/s +step 1553/18794 | loss 3.656942 (-0.01z)| norm 0.2445 (+0.72z)| lr 5.99e-03 | 2011.44 ms | 68.2% bf16 MFU | 259433 tok/s +step 1554/18794 | loss 3.587749 (-1.99z)| norm 0.2474 (+0.81z)| lr 5.99e-03 | 2039.65 ms | 67.3% bf16 MFU | 259314 tok/s +step 1555/18794 | loss 3.643680 (-0.35z)| norm 0.2510 (+0.90z)| lr 5.99e-03 | 2022.08 ms | 67.9% bf16 MFU | 259312 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.464173 +step 1556/18794 | loss 3.652572 (-0.08z)| norm 0.3052 (+2.46z)| lr 5.99e-03 | 2034.46 ms | 67.5% bf16 MFU | 259232 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.300953 +step 1557/18794 | loss 3.580085 (-2.15z)| norm 0.3028 (+2.30z)| lr 5.99e-03 | 2015.49 ms | 68.1% bf16 MFU | 259276 tok/s +step 1558/18794 | loss 3.640689 (-0.38z)| norm 0.2233 (-0.00z)| lr 5.99e-03 | 2018.62 ms | 68.0% bf16 MFU | 259299 tok/s +step 1559/18794 | loss 3.662803 (+0.28z)| norm 0.1875 (-1.03z)| lr 5.99e-03 | 2020.42 ms | 67.9% bf16 MFU | 259309 tok/s +step 1560/18794 | loss 3.553160 (-2.83z)| norm 0.1936 (-0.87z)| lr 5.99e-03 | 2034.05 ms | 67.5% bf16 MFU | 259231 tok/s +step 1561/18794 | loss 3.621813 (-0.85z)| norm 0.2238 (+0.00z)| lr 5.99e-03 | 2019.61 ms | 67.9% bf16 MFU | 259249 tok/s +step 1562/18794 | loss 3.698979 (+1.34z)| norm 0.2116 (-0.37z)| lr 5.99e-03 | 2031.96 ms | 67.5% bf16 MFU | 259188 tok/s +step 1563/18794 | loss 3.581686 (-1.93z)| norm 0.1956 (-0.84z)| lr 5.99e-03 | 2017.93 ms | 68.0% bf16 MFU | 259219 tok/s +step 1564/18794 | loss 3.589633 (-1.67z)| norm 0.1907 (-0.97z)| lr 5.99e-03 | 2021.29 ms | 67.9% bf16 MFU | 259227 tok/s +step 1565/18794 | loss 3.596491 (-1.45z)| norm 0.1733 (-1.45z)| lr 5.99e-03 | 2032.86 ms | 67.5% bf16 MFU | 259161 tok/s +step 1566/18794 | loss 3.614130 (-0.95z)| norm 0.1500 (-2.06z)| lr 5.99e-03 | 2016.07 ms | 68.1% bf16 MFU | 259206 tok/s +step 1567/18794 | loss 3.547495 (-2.64z)| norm 0.1414 (-2.24z)| lr 5.99e-03 | 2025.54 ms | 67.8% bf16 MFU | 259188 tok/s +step 1568/18794 | loss 3.604930 (-1.11z)| norm 0.1395 (-2.21z)| lr 5.99e-03 | 2034.63 ms | 67.4% bf16 MFU | 259112 tok/s +step 1569/18794 | loss 3.605512 (-1.08z)| norm 0.1464 (-1.96z)| lr 5.99e-03 | 2018.33 ms | 68.0% bf16 MFU | 259145 tok/s +step 1570/18794 | loss 3.562808 (-2.11z)| norm 0.1644 (-1.45z)| lr 5.99e-03 | 2015.92 ms | 68.1% bf16 MFU | 259191 tok/s +step 1571/18794 | loss 3.601078 (-1.12z)| norm 0.2036 (-0.39z)| lr 5.99e-03 | 2012.28 ms | 68.2% bf16 MFU | 259259 tok/s +step 1572/18794 | loss 3.553286 (-2.29z)| norm 0.2203 (+0.09z)| lr 5.99e-03 | 2027.16 ms | 67.7% bf16 MFU | 259228 tok/s +step 1573/18794 | loss 3.633974 (-0.23z)| norm 0.2671 (+1.41z)| lr 5.99e-03 | 2021.01 ms | 67.9% bf16 MFU | 259237 tok/s +step 1574/18794 | loss 3.608675 (-0.86z)| norm 0.2652 (+1.34z)| lr 5.99e-03 | 2011.26 ms | 68.2% bf16 MFU | 259309 tok/s +step 1575/18794 | loss 3.629354 (-0.33z)| norm 0.2542 (+1.01z)| lr 5.99e-03 | 2004.85 ms | 68.5% bf16 MFU | 259419 tok/s +step 1576/18794 | loss 3.600464 (-1.04z)| norm 0.2916 (+1.99z)| lr 5.99e-03 | 2027.25 ms | 67.7% bf16 MFU | 259379 tok/s +step 1577/18794 | loss 3.665214 (+0.60z)| norm 0.2792 (+1.61z)| lr 5.99e-03 | 2022.99 ms | 67.8% bf16 MFU | 259369 tok/s +step 1578/18794 | loss 3.658370 (+0.42z)| norm 0.2957 (+1.99z)| lr 5.99e-03 | 2010.84 ms | 68.2% bf16 MFU | 259437 tok/s +step 1579/18794 | loss 3.579708 (-1.53z)| norm 0.2378 (+0.45z)| lr 5.99e-03 | 2020.72 ms | 67.9% bf16 MFU | 259438 tok/s +step 1580/18794 | loss 3.574275 (-1.63z)| norm 0.2178 (-0.07z)| lr 5.99e-03 | 2014.98 ms | 68.1% bf16 MFU | 259476 tok/s +step 1581/18794 | loss 3.689806 (+1.23z)| norm 0.2169 (-0.09z)| lr 5.99e-03 | 2010.85 ms | 68.2% bf16 MFU | 259538 tok/s +step 1582/18794 | loss 3.696254 (+1.39z)| norm 0.2037 (-0.44z)| lr 5.99e-03 | 2028.87 ms | 67.6% bf16 MFU | 259482 tok/s +step 1583/18794 | loss 3.634800 (-0.11z)| norm 0.1958 (-0.65z)| lr 5.99e-03 | 2020.39 ms | 67.9% bf16 MFU | 259483 tok/s +step 1584/18794 | loss 3.619030 (-0.51z)| norm 0.1765 (-1.15z)| lr 5.99e-03 | 2011.18 ms | 68.2% bf16 MFU | 259543 tok/s +step 1585/18794 | loss 3.625754 (-0.33z)| norm 0.2164 (-0.10z)| lr 5.99e-03 | 2024.88 ms | 67.8% bf16 MFU | 259512 tok/s +step 1586/18794 | loss 3.701828 (+1.57z)| norm 0.2451 (+0.64z)| lr 5.99e-03 | 2002.97 ms | 68.5% bf16 MFU | 259624 tok/s +step 1587/18794 | loss 3.643804 (+0.11z)| norm 0.2649 (+1.15z)| lr 5.99e-03 | 2017.66 ms | 68.0% bf16 MFU | 259635 tok/s +step 1588/18794 | loss 3.572497 (-1.63z)| norm 0.2639 (+1.11z)| lr 5.99e-03 | 2010.96 ms | 68.2% bf16 MFU | 259689 tok/s +step 1589/18794 | loss 3.642179 (+0.10z)| norm 0.2526 (+0.80z)| lr 5.99e-03 | 2002.44 ms | 68.5% bf16 MFU | 259796 tok/s +step 1590/18794 | loss 3.644708 (+0.19z)| norm 0.2319 (+0.27z)| lr 5.99e-03 | 2006.74 ms | 68.4% bf16 MFU | 259870 tok/s +step 1591/18794 | loss 3.624723 (-0.33z)| norm 0.2292 (+0.20z)| lr 5.99e-03 | 2010.30 ms | 68.3% bf16 MFU | 259916 tok/s +step 1592/18794 | loss 3.604611 (-0.83z)| norm 0.2084 (-0.35z)| lr 5.99e-03 | 2019.11 ms | 68.0% bf16 MFU | 259903 tok/s +step 1593/18794 | loss 3.556041 (-2.01z)| norm 0.1913 (-0.79z)| lr 5.99e-03 | 2012.20 ms | 68.2% bf16 MFU | 259936 tok/s +step 1594/18794 | loss 3.657957 (+0.55z)| norm 0.2019 (-0.51z)| lr 5.99e-03 | 2020.50 ms | 67.9% bf16 MFU | 259913 tok/s +step 1595/18794 | loss 3.593966 (-1.04z)| norm 0.1979 (-0.63z)| lr 5.99e-03 | 2026.56 ms | 67.7% bf16 MFU | 259853 tok/s +step 1596/18794 | loss 3.655678 (+0.52z)| norm 0.2084 (-0.36z)| lr 5.99e-03 | 2004.95 ms | 68.4% bf16 MFU | 259935 tok/s +step 1597/18794 | loss 3.720293 (+2.12z)| norm 0.2140 (-0.21z)| lr 5.99e-03 | 2016.69 ms | 68.0% bf16 MFU | 259937 tok/s +step 1598/18794 | loss 3.655237 (+0.49z)| norm 0.1988 (-0.60z)| lr 5.99e-03 | 2015.38 ms | 68.1% bf16 MFU | 259948 tok/s +step 1599/18794 | loss 3.592141 (-1.07z)| norm 0.2109 (-0.26z)| lr 5.98e-03 | 2001.00 ms | 68.6% bf16 MFU | 260051 tok/s +step 1600/18794 | loss 3.587984 (-1.15z)| norm 0.2582 (+1.01z)| lr 5.98e-03 | 2028.15 ms | 67.7% bf16 MFU | 259974 tok/s +step 1601/18794 | loss 3.595896 (-0.94z)| norm 0.2802 (+1.57z)| lr 5.98e-03 | 2017.63 ms | 68.0% bf16 MFU | 259968 tok/s +step 1602/18794 | loss 3.520631 (-2.70z)| norm 0.2799 (+1.53z)| lr 5.98e-03 | 2001.74 ms | 68.6% bf16 MFU | 260065 tok/s +step 1603/18794 | loss 3.613303 (-0.46z)| norm 0.2316 (+0.22z)| lr 5.98e-03 | 2023.50 ms | 67.8% bf16 MFU | 260017 tok/s +step 1604/18794 | loss 3.660049 (+0.67z)| norm 0.2347 (+0.30z)| lr 5.98e-03 | 2009.58 ms | 68.3% bf16 MFU | 260061 tok/s +step 1605/18794 | loss 3.657040 (+0.60z)| norm 0.2199 (-0.11z)| lr 5.98e-03 | 2003.41 ms | 68.5% bf16 MFU | 260142 tok/s +step 1606/18794 | loss 3.658610 (+0.65z)| norm 0.2152 (-0.23z)| lr 5.98e-03 | 2011.64 ms | 68.2% bf16 MFU | 260167 tok/s +step 1607/18794 | loss 3.611042 (-0.50z)| norm 0.2623 (+1.03z)| lr 5.98e-03 | 2019.41 ms | 68.0% bf16 MFU | 260139 tok/s +step 1608/18794 | loss 3.653299 (+0.56z)| norm 0.2600 (+0.95z)| lr 5.98e-03 | 2014.49 ms | 68.1% bf16 MFU | 260145 tok/s +step 1609/18794 | loss 3.623476 (-0.17z)| norm 0.2188 (-0.16z)| lr 5.98e-03 | 1997.33 ms | 68.7% bf16 MFU | 260263 tok/s +step 1610/18794 | loss 3.638972 (+0.22z)| norm 0.2239 (-0.00z)| lr 5.98e-03 | 2008.77 ms | 68.3% bf16 MFU | 260300 tok/s +step 1611/18794 | loss 3.632576 (+0.07z)| norm 0.2349 (+0.30z)| lr 5.98e-03 | 2018.69 ms | 68.0% bf16 MFU | 260271 tok/s +step 1612/18794 | loss 3.586521 (-1.07z)| norm 0.2250 (+0.03z)| lr 5.98e-03 | 2009.30 ms | 68.3% bf16 MFU | 260304 tok/s +step 1613/18794 | loss 3.655097 (+0.67z)| norm 0.1928 (-0.84z)| lr 5.98e-03 | 2009.76 ms | 68.3% bf16 MFU | 260332 tok/s +step 1614/18794 | loss 3.614950 (-0.34z)| norm 0.1736 (-1.36z)| lr 5.98e-03 | 2004.89 ms | 68.4% bf16 MFU | 260391 tok/s +step 1615/18794 | loss 3.557911 (-1.75z)| norm 0.1702 (-1.43z)| lr 5.98e-03 | 2004.11 ms | 68.5% bf16 MFU | 260451 tok/s +step 1616/18794 | loss 3.613075 (-0.35z)| norm 0.2040 (-0.45z)| lr 5.98e-03 | 2003.03 ms | 68.5% bf16 MFU | 260516 tok/s +step 1617/18794 | loss 3.607892 (-0.48z)| norm 0.2086 (-0.31z)| lr 5.98e-03 | 2016.26 ms | 68.1% bf16 MFU | 260492 tok/s +step 1618/18794 | loss 3.600065 (-0.66z)| norm 0.2354 (+0.46z)| lr 5.98e-03 | 2010.81 ms | 68.2% bf16 MFU | 260504 tok/s +step 1619/18794 | loss 3.593145 (-0.82z)| norm 0.2167 (-0.08z)| lr 5.98e-03 | 2025.39 ms | 67.8% bf16 MFU | 260422 tok/s +step 1620/18794 | loss 3.609629 (-0.39z)| norm 0.1870 (-0.92z)| lr 5.98e-03 | 2005.42 ms | 68.4% bf16 MFU | 260472 tok/s +step 1621/18794 | loss 3.642929 (+0.45z)| norm 0.2200 (+0.01z)| lr 5.98e-03 | 2007.75 ms | 68.4% bf16 MFU | 260505 tok/s +step 1622/18794 | loss 3.554999 (-1.74z)| norm 0.2098 (-0.28z)| lr 5.98e-03 | 2010.39 ms | 68.3% bf16 MFU | 260520 tok/s +step 1623/18794 | loss 3.589030 (-0.86z)| norm 0.1749 (-1.26z)| lr 5.98e-03 | 1993.51 ms | 68.8% bf16 MFU | 260643 tok/s +step 1624/18794 | loss 3.615893 (-0.17z)| norm 0.1716 (-1.33z)| lr 5.98e-03 | 2003.80 ms | 68.5% bf16 MFU | 260694 tok/s +step 1625/18794 | loss 3.578589 (-1.11z)| norm 0.1623 (-1.57z)| lr 5.98e-03 | 1995.34 ms | 68.8% bf16 MFU | 260797 tok/s +step 1626/18794 | loss 3.538691 (-2.07z)| norm 0.1620 (-1.55z)| lr 5.98e-03 | 2000.77 ms | 68.6% bf16 MFU | 260859 tok/s +step 1627/18794 | loss 3.648258 (+0.67z)| norm 0.1587 (-1.61z)| lr 5.98e-03 | 2004.69 ms | 68.5% bf16 MFU | 260893 tok/s +step 1628/18794 | loss 3.577354 (-1.09z)| norm 0.1886 (-0.76z)| lr 5.98e-03 | 1999.93 ms | 68.6% bf16 MFU | 260956 tok/s +step 1629/18794 | loss 3.668831 (+1.18z)| norm 0.2274 (+0.33z)| lr 5.98e-03 | 2007.83 ms | 68.3% bf16 MFU | 260964 tok/s +step 1630/18794 | loss 3.614094 (-0.19z)| norm 0.2669 (+1.41z)| lr 5.98e-03 | 2005.33 ms | 68.4% bf16 MFU | 260988 tok/s +step 1631/18794 | loss 3.597222 (-0.60z)| norm 0.2582 (+1.18z)| lr 5.98e-03 | 1995.17 ms | 68.8% bf16 MFU | 261078 tok/s +step 1632/18794 | loss 3.607013 (-0.34z)| norm 0.2487 (+0.91z)| lr 5.98e-03 | 2002.63 ms | 68.5% bf16 MFU | 261114 tok/s +step 1633/18794 | loss 3.598736 (-0.54z)| norm 0.2681 (+1.42z)| lr 5.98e-03 | 1998.93 ms | 68.7% bf16 MFU | 261172 tok/s +step 1634/18794 | loss 3.605644 (-0.35z)| norm 0.2295 (+0.34z)| lr 5.98e-03 | 1996.59 ms | 68.7% bf16 MFU | 261243 tok/s +step 1635/18794 | loss 3.564698 (-1.36z)| norm 0.1960 (-0.58z)| lr 5.98e-03 | 1997.35 ms | 68.7% bf16 MFU | 261306 tok/s +step 1636/18794 | loss 3.566417 (-1.29z)| norm 0.2209 (+0.11z)| lr 5.98e-03 | 2002.21 ms | 68.5% bf16 MFU | 261333 tok/s +step 1637/18794 | loss 3.529822 (-2.14z)| norm 0.2723 (+1.50z)| lr 5.98e-03 | 2001.77 ms | 68.6% bf16 MFU | 261362 tok/s +step 1638/18794 | loss 3.626352 (+0.25z)| norm 0.2424 (+0.67z)| lr 5.98e-03 | 2004.53 ms | 68.5% bf16 MFU | 261372 tok/s +step 1639/18794 | loss 3.688820 (+1.75z)| norm 0.2316 (+0.36z)| lr 5.98e-03 | 1994.80 ms | 68.8% bf16 MFU | 261444 tok/s +step 1640/18794 | loss 3.575883 (-1.00z)| norm 0.2527 (+0.92z)| lr 5.98e-03 | 1999.59 ms | 68.6% bf16 MFU | 261482 tok/s +step 1641/18794 | loss 3.598550 (-0.43z)| norm 0.2826 (+1.70z)| lr 5.98e-03 | 2008.69 ms | 68.3% bf16 MFU | 261458 tok/s +step 1642/18794 | loss 3.629494 (+0.34z)| norm 0.2885 (+1.81z)| lr 5.98e-03 | 2009.83 ms | 68.3% bf16 MFU | 261429 tok/s +step 1643/18794 | loss 3.617949 (+0.06z)| norm 0.2555 (+0.91z)| lr 5.98e-03 | 2003.71 ms | 68.5% bf16 MFU | 261440 tok/s +step 1644/18794 | loss 3.641725 (+0.65z)| norm 0.2815 (+1.57z)| lr 5.98e-03 | 2006.12 ms | 68.4% bf16 MFU | 261435 tok/s +step 1645/18794 | loss 3.637298 (+0.53z)| norm 0.2219 (-0.00z)| lr 5.98e-03 | 1998.86 ms | 68.7% bf16 MFU | 261478 tok/s +step 1646/18794 | loss 3.558143 (-1.42z)| norm 0.2074 (-0.39z)| lr 5.98e-03 | 2002.28 ms | 68.5% bf16 MFU | 261497 tok/s +step 1647/18794 | loss 3.530886 (-2.03z)| norm 0.2002 (-0.58z)| lr 5.98e-03 | 2009.19 ms | 68.3% bf16 MFU | 261469 tok/s +step 1648/18794 | loss 3.569026 (-1.10z)| norm 0.2077 (-0.39z)| lr 5.98e-03 | 2011.01 ms | 68.2% bf16 MFU | 261431 tok/s +step 1649/18794 | loss 3.609169 (-0.12z)| norm 0.2710 (+1.26z)| lr 5.98e-03 | 2009.14 ms | 68.3% bf16 MFU | 261407 tok/s +step 1650/18794 | loss 3.511319 (-2.42z)| norm 0.2020 (-0.56z)| lr 5.98e-03 | 2002.89 ms | 68.5% bf16 MFU | 261425 tok/s +step 1651/18794 | loss 3.544520 (-1.59z)| norm 0.1593 (-1.65z)| lr 5.98e-03 | 1995.44 ms | 68.8% bf16 MFU | 261491 tok/s +step 1652/18794 | loss 3.584693 (-0.63z)| norm 0.1621 (-1.54z)| lr 5.98e-03 | 2010.48 ms | 68.3% bf16 MFU | 261455 tok/s +step 1653/18794 | loss 3.613777 (+0.06z)| norm 0.1660 (-1.40z)| lr 5.98e-03 | 2003.43 ms | 68.5% bf16 MFU | 261467 tok/s +step 1654/18794 | loss 3.562660 (-1.14z)| norm 0.1901 (-0.78z)| lr 5.98e-03 | 2008.76 ms | 68.3% bf16 MFU | 261444 tok/s +step 1655/18794 | loss 3.579820 (-0.72z)| norm 0.1824 (-0.95z)| lr 5.98e-03 | 2003.52 ms | 68.5% bf16 MFU | 261456 tok/s +step 1656/18794 | loss 3.564181 (-1.07z)| norm 0.1812 (-0.97z)| lr 5.98e-03 | 1990.70 ms | 68.9% bf16 MFU | 261551 tok/s +step 1657/18794 | loss 3.662420 (+1.23z)| norm 0.2173 (-0.01z)| lr 5.98e-03 | 2002.27 ms | 68.5% bf16 MFU | 261566 tok/s +step 1658/18794 | loss 3.659186 (+1.15z)| norm 0.2457 (+0.74z)| lr 5.98e-03 | 2002.86 ms | 68.5% bf16 MFU | 261576 tok/s +step 1659/18794 | loss 3.557744 (-1.21z)| norm 0.2637 (+1.19z)| lr 5.98e-03 | 1999.81 ms | 68.6% bf16 MFU | 261606 tok/s +step 1660/18794 | loss 3.591917 (-0.42z)| norm 0.2135 (-0.14z)| lr 5.98e-03 | 2008.09 ms | 68.3% bf16 MFU | 261580 tok/s +step 1661/18794 | loss 3.666097 (+1.31z)| norm 0.2076 (-0.30z)| lr 5.98e-03 | 1997.63 ms | 68.7% bf16 MFU | 261624 tok/s +step 1662/18794 | loss 3.581721 (-0.65z)| norm 0.1926 (-0.69z)| lr 5.98e-03 | 1994.01 ms | 68.8% bf16 MFU | 261689 tok/s +step 1663/18794 | loss 3.630593 (+0.51z)| norm 0.2051 (-0.36z)| lr 5.98e-03 | 2009.98 ms | 68.3% bf16 MFU | 261647 tok/s +step 1664/18794 | loss 3.558566 (-1.20z)| norm 0.1578 (-1.59z)| lr 5.98e-03 | 1995.06 ms | 68.8% bf16 MFU | 261704 tok/s +step 1665/18794 | loss 3.652407 (+1.01z)| norm 0.1571 (-1.59z)| lr 5.98e-03 | 2001.93 ms | 68.5% bf16 MFU | 261713 tok/s +step 1666/18794 | loss 3.532145 (-1.79z)| norm 0.2241 (+0.14z)| lr 5.98e-03 | 1996.31 ms | 68.7% bf16 MFU | 261759 tok/s +step 1667/18794 | loss 3.578872 (-0.71z)| norm 0.2789 (+1.56z)| lr 5.98e-03 | 2002.72 ms | 68.5% bf16 MFU | 261761 tok/s +step 1668/18794 | loss 3.586705 (-0.52z)| norm 0.2566 (+0.96z)| lr 5.98e-03 | 1994.10 ms | 68.8% bf16 MFU | 261819 tok/s +step 1669/18794 | loss 3.601031 (-0.18z)| norm 0.2246 (+0.06z)| lr 5.98e-03 | 1994.47 ms | 68.8% bf16 MFU | 261871 tok/s +step 1670/18794 | loss 3.628754 (+0.45z)| norm 0.2441 (+0.59z)| lr 5.98e-03 | 1984.32 ms | 69.2% bf16 MFU | 261988 tok/s +step 1671/18794 | loss 3.597780 (-0.28z)| norm 0.2631 (+1.11z)| lr 5.98e-03 | 1993.95 ms | 68.8% bf16 MFU | 262036 tok/s +step 1672/18794 | loss 3.642293 (+0.76z)| norm 0.2755 (+1.42z)| lr 5.98e-03 | 1993.80 ms | 68.8% bf16 MFU | 262082 tok/s +step 1673/18794 | loss 3.564266 (-1.07z)| norm 0.2404 (+0.46z)| lr 5.98e-03 | 1992.18 ms | 68.9% bf16 MFU | 262137 tok/s +step 1674/18794 | loss 3.556828 (-1.23z)| norm 0.1745 (-1.35z)| lr 5.98e-03 | 1998.57 ms | 68.7% bf16 MFU | 262146 tok/s +step 1675/18794 | loss 3.616879 (+0.18z)| norm 0.1987 (-0.66z)| lr 5.98e-03 | 2004.34 ms | 68.5% bf16 MFU | 262118 tok/s +step 1676/18794 | loss 3.587087 (-0.51z)| norm 0.1657 (-1.56z)| lr 5.98e-03 | 1989.50 ms | 69.0% bf16 MFU | 262188 tok/s +step 1677/18794 | loss 3.580691 (-0.65z)| norm 0.1843 (-1.02z)| lr 5.98e-03 | 1992.30 ms | 68.9% bf16 MFU | 262237 tok/s +step 1678/18794 | loss 3.613199 (+0.13z)| norm 0.2188 (-0.02z)| lr 5.98e-03 | 1999.42 ms | 68.6% bf16 MFU | 262236 tok/s +step 1679/18794 | loss 3.591016 (-0.40z)| norm 0.2759 (+1.60z)| lr 5.98e-03 | 1996.50 ms | 68.7% bf16 MFU | 262254 tok/s +step 1680/18794 | loss 3.578418 (-0.70z)| norm 0.2694 (+1.39z)| lr 5.98e-03 | 2002.02 ms | 68.5% bf16 MFU | 262236 tok/s +step 1681/18794 | loss 3.608432 (+0.03z)| norm 0.2617 (+1.15z)| lr 5.98e-03 | 2009.71 ms | 68.3% bf16 MFU | 262168 tok/s +step 1682/18794 | loss 3.682785 (+1.85z)| norm 0.2406 (+0.54z)| lr 5.98e-03 | 1987.02 ms | 69.1% bf16 MFU | 262252 tok/s +step 1683/18794 | loss 3.581607 (-0.60z)| norm 0.2006 (-0.58z)| lr 5.98e-03 | 1993.65 ms | 68.8% bf16 MFU | 262289 tok/s +step 1684/18794 | loss 3.567328 (-0.94z)| norm 0.1757 (-1.28z)| lr 5.98e-03 | 1985.89 ms | 69.1% bf16 MFU | 262374 tok/s +step 1685/18794 | loss 3.642210 (+0.88z)| norm 0.1719 (-1.36z)| lr 5.98e-03 | 1985.37 ms | 69.1% bf16 MFU | 262460 tok/s +step 1686/18794 | loss 3.631628 (+0.66z)| norm 0.1774 (-1.19z)| lr 5.98e-03 | 1994.24 ms | 68.8% bf16 MFU | 262482 tok/s +step 1687/18794 | loss 3.648502 (+1.07z)| norm 0.1767 (-1.18z)| lr 5.98e-03 | 1983.71 ms | 69.2% bf16 MFU | 262572 tok/s +step 1688/18794 | loss 3.580990 (-0.61z)| norm 0.1968 (-0.61z)| lr 5.98e-03 | 1994.51 ms | 68.8% bf16 MFU | 262587 tok/s +step 1689/18794 | loss 3.552374 (-1.29z)| norm 0.1911 (-0.75z)| lr 5.98e-03 | 1994.00 ms | 68.8% bf16 MFU | 262604 tok/s +step 1690/18794 | loss 3.568052 (-0.89z)| norm 0.1954 (-0.62z)| lr 5.98e-03 | 1987.40 ms | 69.1% bf16 MFU | 262664 tok/s +step 1691/18794 | loss 3.583336 (-0.50z)| norm 0.1932 (-0.67z)| lr 5.98e-03 | 1992.69 ms | 68.9% bf16 MFU | 262686 tok/s +step 1692/18794 | loss 3.568269 (-0.86z)| norm 0.2116 (-0.16z)| lr 5.98e-03 | 1985.74 ms | 69.1% bf16 MFU | 262753 tok/s +step 1693/18794 | loss 3.561696 (-1.02z)| norm 0.2228 (+0.15z)| lr 5.98e-03 | 1983.08 ms | 69.2% bf16 MFU | 262835 tok/s +step 1694/18794 | loss 3.624184 (+0.54z)| norm 0.2436 (+0.72z)| lr 5.98e-03 | 1990.02 ms | 69.0% bf16 MFU | 262866 tok/s +step 1695/18794 | loss 3.616541 (+0.34z)| norm 0.2286 (+0.29z)| lr 5.98e-03 | 2000.01 ms | 68.6% bf16 MFU | 262830 tok/s +step 1696/18794 | loss 3.536642 (-1.62z)| norm 0.2131 (-0.15z)| lr 5.98e-03 | 1992.11 ms | 68.9% bf16 MFU | 262847 tok/s +step 1697/18794 | loss 3.584565 (-0.41z)| norm 0.2194 (+0.03z)| lr 5.98e-03 | 1983.57 ms | 69.2% bf16 MFU | 262921 tok/s +step 1698/18794 | loss 3.569571 (-0.78z)| norm 0.1964 (-0.62z)| lr 5.98e-03 | 1986.14 ms | 69.1% bf16 MFU | 262973 tok/s +step 1699/18794 | loss 3.592994 (-0.17z)| norm 0.1962 (-0.61z)| lr 5.98e-03 | 1988.57 ms | 69.0% bf16 MFU | 263007 tok/s +step 1700/18794 | loss 3.543357 (-1.45z)| norm 0.2030 (-0.41z)| lr 5.98e-03 | 1988.15 ms | 69.0% bf16 MFU | 263042 tok/s +step 1701/18794 | loss 3.551680 (-1.21z)| norm 0.2123 (-0.14z)| lr 5.98e-03 | 1985.57 ms | 69.1% bf16 MFU | 263093 tok/s +step 1702/18794 | loss 3.502143 (-2.45z)| norm 0.2290 (+0.36z)| lr 5.98e-03 | 1986.98 ms | 69.1% bf16 MFU | 263131 tok/s +step 1703/18794 | loss 3.570590 (-0.70z)| norm 0.2041 (-0.35z)| lr 5.98e-03 | 1982.60 ms | 69.2% bf16 MFU | 263197 tok/s +step 1704/18794 | loss 3.625677 (+0.72z)| norm 0.1698 (-1.32z)| lr 5.98e-03 | 1980.04 ms | 69.3% bf16 MFU | 263276 tok/s +step 1705/18794 | loss 3.634457 (+0.96z)| norm 0.1737 (-1.18z)| lr 5.98e-03 | 1984.76 ms | 69.1% bf16 MFU | 263320 tok/s +step 1706/18794 | loss 3.531564 (-1.67z)| norm 0.2296 (+0.41z)| lr 5.98e-03 | 1978.21 ms | 69.4% bf16 MFU | 263406 tok/s +step 1707/18794 | loss 3.596519 (+0.01z)| norm 0.2595 (+1.27z)| lr 5.98e-03 | 1979.28 ms | 69.3% bf16 MFU | 263480 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.679956 +step 1708/18794 | loss 3.594724 (-0.02z)| norm 0.3123 (+2.68z)| lr 5.98e-03 | 1994.97 ms | 68.8% bf16 MFU | 263446 tok/s +step 1709/18794 | loss 3.579196 (-0.41z)| norm 0.2376 (+0.60z)| lr 5.98e-03 | 1984.57 ms | 69.1% bf16 MFU | 263483 tok/s +step 1710/18794 | loss 3.564542 (-0.78z)| norm 0.2097 (-0.17z)| lr 5.98e-03 | 1981.30 ms | 69.3% bf16 MFU | 263540 tok/s +step 1711/18794 | loss 3.562314 (-0.82z)| norm 0.2115 (-0.11z)| lr 5.98e-03 | 1982.89 ms | 69.2% bf16 MFU | 263583 tok/s +step 1712/18794 | loss 3.609988 (+0.43z)| norm 0.2671 (+1.41z)| lr 5.98e-03 | 1979.90 ms | 69.3% bf16 MFU | 263644 tok/s +step 1713/18794 | loss 3.583967 (-0.24z)| norm 0.2212 (+0.14z)| lr 5.98e-03 | 1978.01 ms | 69.4% bf16 MFU | 263715 tok/s +step 1714/18794 | loss 3.602457 (+0.25z)| norm 0.2006 (-0.44z)| lr 5.98e-03 | 1978.68 ms | 69.4% bf16 MFU | 263778 tok/s +step 1715/18794 | loss 3.590359 (-0.08z)| norm 0.2112 (-0.16z)| lr 5.98e-03 | 1980.89 ms | 69.3% bf16 MFU | 263822 tok/s +step 1716/18794 | loss 3.596816 (+0.10z)| norm 0.2046 (-0.35z)| lr 5.98e-03 | 2003.25 ms | 68.5% bf16 MFU | 263717 tok/s +step 1717/18794 | loss 3.562478 (-0.81z)| norm 0.2218 (+0.13z)| lr 5.98e-03 | 2040.31 ms | 67.3% bf16 MFU | 263379 tok/s +step 1718/18794 | loss 3.622246 (+0.78z)| norm 0.2081 (-0.24z)| lr 5.98e-03 | 2030.94 ms | 67.6% bf16 MFU | 263118 tok/s +step 1719/18794 | loss 3.575240 (-0.47z)| norm 0.2005 (-0.45z)| lr 5.98e-03 | 2031.35 ms | 67.6% bf16 MFU | 262867 tok/s +step 1720/18794 | loss 3.657867 (+1.70z)| norm 0.2466 (+0.82z)| lr 5.98e-03 | 2038.65 ms | 67.3% bf16 MFU | 262582 tok/s +step 1721/18794 | loss 3.575213 (-0.46z)| norm 0.2439 (+0.74z)| lr 5.98e-03 | 2038.90 ms | 67.3% bf16 MFU | 262310 tok/s +step 1722/18794 | loss 3.609637 (+0.44z)| norm 0.1990 (-0.51z)| lr 5.98e-03 | 2030.91 ms | 67.6% bf16 MFU | 262102 tok/s +step 1723/18794 | loss 3.609290 (+0.43z)| norm 0.2327 (+0.41z)| lr 5.98e-03 | 2040.41 ms | 67.3% bf16 MFU | 261845 tok/s +step 1724/18794 | loss 3.547339 (-1.20z)| norm 0.2337 (+0.43z)| lr 5.98e-03 | 2033.76 ms | 67.5% bf16 MFU | 261642 tok/s +step 1725/18794 | loss 3.669460 (+1.97z)| norm 0.2338 (+0.42z)| lr 5.98e-03 | 2024.72 ms | 67.8% bf16 MFU | 261507 tok/s +step 1726/18794 | loss 3.643070 (+1.26z)| norm 0.2535 (+0.96z)| lr 5.98e-03 | 2032.77 ms | 67.5% bf16 MFU | 261328 tok/s +step 1727/18794 | loss 3.599584 (+0.15z)| norm 0.2459 (+0.72z)| lr 5.98e-03 | 2036.86 ms | 67.4% bf16 MFU | 261132 tok/s +step 1728/18794 | loss 3.578526 (-0.41z)| norm 0.2587 (+1.08z)| lr 5.98e-03 | 2023.01 ms | 67.8% bf16 MFU | 261033 tok/s +step 1729/18794 | loss 3.596095 (+0.07z)| norm 0.2708 (+1.40z)| lr 5.98e-03 | 2035.94 ms | 67.4% bf16 MFU | 260857 tok/s +step 1730/18794 | loss 3.602110 (+0.24z)| norm 0.2219 (+0.00z)| lr 5.98e-03 | 2038.45 ms | 67.3% bf16 MFU | 260674 tok/s +step 1731/18794 | loss 3.572602 (-0.55z)| norm 0.2172 (-0.12z)| lr 5.98e-03 | 2031.60 ms | 67.5% bf16 MFU | 260544 tok/s +step 1732/18794 | loss 3.649307 (+1.48z)| norm 0.2308 (+0.28z)| lr 5.98e-03 | 2033.08 ms | 67.5% bf16 MFU | 260411 tok/s +step 1733/18794 | loss 3.632350 (+1.02z)| norm 0.2208 (+0.00z)| lr 5.98e-03 | 2028.29 ms | 67.7% bf16 MFU | 260315 tok/s +step 1734/18794 | loss 3.595611 (+0.05z)| norm 0.2233 (+0.08z)| lr 5.98e-03 | 2024.77 ms | 67.8% bf16 MFU | 260246 tok/s +step 1735/18794 | loss 3.549889 (-1.15z)| norm 0.1866 (-1.00z)| lr 5.98e-03 | 2036.36 ms | 67.4% bf16 MFU | 260107 tok/s +step 1736/18794 | loss 3.635801 (+1.09z)| norm 0.1658 (-1.58z)| lr 5.98e-03 | 2037.50 ms | 67.4% bf16 MFU | 259967 tok/s +step 1737/18794 | loss 3.590119 (-0.12z)| norm 0.1661 (-1.54z)| lr 5.98e-03 | 2028.05 ms | 67.7% bf16 MFU | 259895 tok/s +step 1738/18794 | loss 3.580868 (-0.36z)| norm 0.1407 (-2.20z)| lr 5.98e-03 | 2037.79 ms | 67.3% bf16 MFU | 259764 tok/s +step 1739/18794 | loss 3.626234 (+0.89z)| norm 0.1807 (-1.04z)| lr 5.98e-03 | 2025.95 ms | 67.7% bf16 MFU | 259715 tok/s +step 1740/18794 | loss 3.600079 (+0.17z)| norm 0.2075 (-0.27z)| lr 5.98e-03 | 2032.39 ms | 67.5% bf16 MFU | 259628 tok/s +step 1741/18794 | loss 3.617167 (+0.63z)| norm 0.2283 (+0.35z)| lr 5.98e-03 | 2034.06 ms | 67.5% bf16 MFU | 259534 tok/s +step 1742/18794 | loss 3.575152 (-0.51z)| norm 0.2567 (+1.20z)| lr 5.98e-03 | 2022.88 ms | 67.8% bf16 MFU | 259516 tok/s +step 1743/18794 | loss 3.665251 (+1.93z)| norm 0.2248 (+0.27z)| lr 5.98e-03 | 2019.49 ms | 68.0% bf16 MFU | 259521 tok/s +step 1744/18794 | loss 3.583761 (-0.26z)| norm 0.2321 (+0.51z)| lr 5.98e-03 | 2031.67 ms | 67.5% bf16 MFU | 259448 tok/s +step 1745/18794 | loss 3.624008 (+0.84z)| norm 0.2222 (+0.21z)| lr 5.98e-03 | 2009.80 ms | 68.3% bf16 MFU | 259519 tok/s +step 1746/18794 | loss 3.572502 (-0.58z)| norm 0.2068 (-0.26z)| lr 5.98e-03 | 2000.22 ms | 68.6% bf16 MFU | 259649 tok/s +step 1747/18794 | loss 3.564315 (-0.82z)| norm 0.1874 (-0.84z)| lr 5.98e-03 | 2018.60 ms | 68.0% bf16 MFU | 259653 tok/s +step 1748/18794 | loss 3.548530 (-1.25z)| norm 0.1705 (-1.33z)| lr 5.98e-03 | 2031.82 ms | 67.5% bf16 MFU | 259572 tok/s +step 1749/18794 | loss 3.596127 (+0.07z)| norm 0.1876 (-0.80z)| lr 5.98e-03 | 2024.88 ms | 67.8% bf16 MFU | 259540 tok/s +step 1750/18794 | loss 3.597276 (+0.08z)| norm 0.1945 (-0.59z)| lr 5.98e-03 | 2024.66 ms | 67.8% bf16 MFU | 259510 tok/s +val loss 3.608206 +HellaSwag: 2691/10042 = 0.267974: 0/1256 +step 1751/18794 | loss 3.580025 (-0.42z)| norm 0.2128 (-0.05z)| lr 5.98e-03 | 2025.05 ms | 67.8% bf16 MFU | 259480 tok/s +step 1752/18794 | loss 3.570585 (-0.69z)| norm 0.2257 (+0.33z)| lr 5.98e-03 | 2029.40 ms | 67.6% bf16 MFU | 259423 tok/s +step 1753/18794 | loss 3.580648 (-0.39z)| norm 0.2387 (+0.72z)| lr 5.98e-03 | 2029.28 ms | 67.6% bf16 MFU | 259370 tok/s +step 1754/18794 | loss 3.593557 (-0.03z)| norm 0.2807 (+1.97z)| lr 5.98e-03 | 2024.72 ms | 67.8% bf16 MFU | 259349 tok/s +step 1755/18794 | loss 3.601133 (+0.18z)| norm 0.2529 (+1.09z)| lr 5.98e-03 | 2033.43 ms | 67.5% bf16 MFU | 259273 tok/s +step 1756/18794 | loss 3.581999 (-0.37z)| norm 0.2455 (+0.85z)| lr 5.98e-03 | 2026.73 ms | 67.7% bf16 MFU | 259244 tok/s +step 1757/18794 | loss 3.572767 (-0.62z)| norm 0.2651 (+1.42z)| lr 5.98e-03 | 2011.37 ms | 68.2% bf16 MFU | 259315 tok/s +step 1758/18794 | loss 3.529370 (-1.86z)| norm 0.2463 (+0.85z)| lr 5.98e-03 | 2016.95 ms | 68.0% bf16 MFU | 259346 tok/s +step 1759/18794 | loss 3.587088 (-0.18z)| norm 0.2103 (-0.24z)| lr 5.98e-03 | 2017.15 ms | 68.0% bf16 MFU | 259374 tok/s +step 1760/18794 | loss 3.606944 (+0.41z)| norm 0.1734 (-1.35z)| lr 5.98e-03 | 2013.32 ms | 68.2% bf16 MFU | 259426 tok/s +step 1761/18794 | loss 3.596020 (+0.11z)| norm 0.1767 (-1.23z)| lr 5.98e-03 | 2019.24 ms | 68.0% bf16 MFU | 259437 tok/s +step 1762/18794 | loss 3.591864 (-0.02z)| norm 0.2097 (-0.24z)| lr 5.98e-03 | 2033.76 ms | 67.5% bf16 MFU | 259355 tok/s +step 1763/18794 | loss 3.562416 (-0.89z)| norm 0.2310 (+0.40z)| lr 5.98e-03 | 2017.44 ms | 68.0% bf16 MFU | 259381 tok/s +step 1764/18794 | loss 3.574936 (-0.52z)| norm 0.1949 (-0.71z)| lr 5.98e-03 | 2018.42 ms | 68.0% bf16 MFU | 259400 tok/s +step 1765/18794 | loss 3.524484 (-2.01z)| norm 0.1758 (-1.32z)| lr 5.98e-03 | 2016.77 ms | 68.0% bf16 MFU | 259428 tok/s +step 1766/18794 | loss 3.558296 (-1.01z)| norm 0.1828 (-1.09z)| lr 5.98e-03 | 2009.50 ms | 68.3% bf16 MFU | 259502 tok/s +step 1767/18794 | loss 3.550987 (-1.21z)| norm 0.1856 (-0.98z)| lr 5.98e-03 | 2017.73 ms | 68.0% bf16 MFU | 259519 tok/s +step 1768/18794 | loss 3.551484 (-1.18z)| norm 0.1794 (-1.15z)| lr 5.98e-03 | 2011.17 ms | 68.2% bf16 MFU | 259577 tok/s +step 1769/18794 | loss 3.569141 (-0.63z)| norm 0.1784 (-1.16z)| lr 5.98e-03 | 2012.80 ms | 68.2% bf16 MFU | 259622 tok/s +step 1770/18794 | loss 3.640980 (+1.53z)| norm 0.1846 (-0.95z)| lr 5.98e-03 | 2025.75 ms | 67.7% bf16 MFU | 259582 tok/s +step 1771/18794 | loss 3.590465 (+0.01z)| norm 0.1808 (-1.05z)| lr 5.98e-03 | 2011.34 ms | 68.2% bf16 MFU | 259636 tok/s +step 1772/18794 | loss 3.563221 (-0.79z)| norm 0.1670 (-1.46z)| lr 5.98e-03 | 2011.55 ms | 68.2% bf16 MFU | 259686 tok/s +step 1773/18794 | loss 3.581701 (-0.24z)| norm 0.1757 (-1.16z)| lr 5.97e-03 | 2002.61 ms | 68.5% bf16 MFU | 259792 tok/s +step 1774/18794 | loss 3.578335 (-0.35z)| norm 0.1854 (-0.86z)| lr 5.97e-03 | 2011.09 ms | 68.2% bf16 MFU | 259837 tok/s +step 1775/18794 | loss 3.586398 (-0.09z)| norm 0.1934 (-0.61z)| lr 5.97e-03 | 2016.44 ms | 68.1% bf16 MFU | 259846 tok/s +step 1776/18794 | loss 3.579555 (-0.30z)| norm 0.1976 (-0.49z)| lr 5.97e-03 | 2008.86 ms | 68.3% bf16 MFU | 259903 tok/s +step 1777/18794 | loss 3.537871 (-1.55z)| norm 0.2071 (-0.19z)| lr 5.97e-03 | 2026.50 ms | 67.7% bf16 MFU | 259843 tok/s +step 1778/18794 | loss 3.596807 (+0.24z)| norm 0.2640 (+1.60z)| lr 5.97e-03 | 2001.05 ms | 68.6% bf16 MFU | 259951 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.110047 +step 1779/18794 | loss 3.626453 (+1.13z)| norm 0.2804 (+2.11z)| lr 5.97e-03 | 2016.32 ms | 68.1% bf16 MFU | 259955 tok/s +mostly skipping update due to grad z-score of 8.144992 +step 1780/18794 | loss 3.605741 (+0.49z)| norm 0.6595 (+8.14z)| lr 5.97e-04 | 2015.76 ms | 68.1% bf16 MFU | 259962 tok/s +mostly skipping update due to grad z-score of 7.390176 +step 1781/18794 | loss 3.602098 (+0.39z)| norm 0.5605 (+7.39z)| lr 5.97e-04 | 2019.08 ms | 68.0% bf16 MFU | 259947 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.809592 +step 1782/18794 | loss 3.651632 (+1.94z)| norm 0.3531 (+2.81z)| lr 5.97e-03 | 2015.51 ms | 68.1% bf16 MFU | 259956 tok/s +mostly skipping update due to grad z-score of 8.759098 +step 1783/18794 | loss 3.735018 (+4.08z)| norm 1.1168 (+8.76z)| lr 5.97e-04 | 2009.73 ms | 68.3% bf16 MFU | 260002 tok/s +mostly skipping update due to grad z-score of 7.901023 +step 1784/18794 | loss 3.695334 (+2.81z)| norm 0.8505 (+7.90z)| lr 5.97e-04 | 2009.84 ms | 68.3% bf16 MFU | 260045 tok/s +reducing beta2 to 0.9 and lr/wd by 0.966 due to grad z-score of 3.623800 +step 1785/18794 | loss 3.705712 (+2.96z)| norm 0.5355 (+3.62z)| lr 5.77e-03 | 2012.59 ms | 68.2% bf16 MFU | 260068 tok/s +reducing beta2 to 0.9 and lr/wd by 0.857 due to grad z-score of 4.085239 +step 1786/18794 | loss 3.640390 (+1.25z)| norm 0.6121 (+4.09z)| lr 5.12e-03 | 2014.43 ms | 68.1% bf16 MFU | 260078 tok/s +reducing beta2 to 0.9 and lr/wd by 0.774 due to grad z-score of 4.522456 +step 1787/18794 | loss 3.696333 (+2.63z)| norm 0.7090 (+4.52z)| lr 4.62e-03 | 2004.83 ms | 68.5% bf16 MFU | 260150 tok/s +step 1788/18794 | loss 3.674104 (+2.00z)| norm 0.4220 (+1.73z)| lr 5.97e-03 | 2021.32 ms | 67.9% bf16 MFU | 260111 tok/s +step 1789/18794 | loss 3.629406 (+0.87z)| norm 0.3404 (+0.94z)| lr 5.97e-03 | 1990.92 ms | 68.9% bf16 MFU | 260273 tok/s +step 1790/18794 | loss 3.613463 (+0.46z)| norm 0.3009 (+0.56z)| lr 5.97e-03 | 1985.11 ms | 69.1% bf16 MFU | 260464 tok/s +step 1791/18794 | loss 3.640049 (+1.10z)| norm 0.4117 (+1.56z)| lr 5.97e-03 | 1982.55 ms | 69.2% bf16 MFU | 260664 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.979831 +step 1792/18794 | loss 3.631608 (+0.87z)| norm 0.5831 (+2.98z)| lr 5.97e-03 | 1983.97 ms | 69.2% bf16 MFU | 260844 tok/s +mostly skipping update due to grad z-score of 5.526645 +step 1793/18794 | loss 3.687505 (+2.18z)| norm 1.0029 (+5.53z)| lr 5.97e-04 | 1977.91 ms | 69.4% bf16 MFU | 261055 tok/s +reducing beta2 to 0.9 and lr/wd by 0.728 due to grad z-score of 4.808120 +step 1794/18794 | loss 3.620434 (+0.56z)| norm 0.8718 (+4.81z)| lr 4.35e-03 | 1979.61 ms | 69.3% bf16 MFU | 261245 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.032115 +step 1795/18794 | loss 3.576971 (-0.49z)| norm 0.5241 (+2.03z)| lr 5.97e-03 | 1986.02 ms | 69.1% bf16 MFU | 261382 tok/s +step 1796/18794 | loss 3.677090 (+1.89z)| norm 0.4297 (+1.29z)| lr 5.97e-03 | 1986.17 ms | 69.1% bf16 MFU | 261511 tok/s +step 1797/18794 | loss 3.641990 (+1.03z)| norm 0.4735 (+1.58z)| lr 5.97e-03 | 1982.73 ms | 69.2% bf16 MFU | 261657 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.066242 +step 1798/18794 | loss 3.607621 (+0.19z)| norm 0.5479 (+2.07z)| lr 5.97e-03 | 1980.92 ms | 69.3% bf16 MFU | 261808 tok/s +step 1799/18794 | loss 3.641909 (+1.00z)| norm 0.4566 (+1.37z)| lr 5.97e-03 | 1980.75 ms | 69.3% bf16 MFU | 261952 tok/s +step 1800/18794 | loss 3.585018 (-0.37z)| norm 0.3130 (+0.32z)| lr 5.97e-03 | 1986.87 ms | 69.1% bf16 MFU | 262048 tok/s +step 1801/18794 | loss 3.552819 (-1.15z)| norm 0.2383 (-0.22z)| lr 5.97e-03 | 1984.81 ms | 69.1% bf16 MFU | 262153 tok/s +step 1802/18794 | loss 3.609848 (+0.21z)| norm 0.1774 (-0.66z)| lr 5.97e-03 | 1986.64 ms | 69.1% bf16 MFU | 262241 tok/s +step 1803/18794 | loss 3.607733 (+0.14z)| norm 0.1540 (-0.83z)| lr 5.97e-03 | 1979.97 ms | 69.3% bf16 MFU | 262368 tok/s +step 1804/18794 | loss 3.648337 (+1.15z)| norm 0.1786 (-0.65z)| lr 5.97e-03 | 1982.78 ms | 69.2% bf16 MFU | 262471 tok/s +step 1805/18794 | loss 3.598965 (-0.07z)| norm 0.1546 (-0.81z)| lr 5.97e-03 | 1985.93 ms | 69.1% bf16 MFU | 262548 tok/s +step 1806/18794 | loss 3.596698 (-0.15z)| norm 0.1503 (-0.84z)| lr 5.97e-03 | 1984.35 ms | 69.2% bf16 MFU | 262631 tok/s +step 1807/18794 | loss 3.589789 (-0.32z)| norm 0.1342 (-0.95z)| lr 5.97e-03 | 1981.85 ms | 69.2% bf16 MFU | 262726 tok/s +step 1808/18794 | loss 3.574664 (-0.69z)| norm 0.1244 (-1.01z)| lr 5.97e-03 | 1981.09 ms | 69.3% bf16 MFU | 262822 tok/s +step 1809/18794 | loss 3.565506 (-0.92z)| norm 0.1535 (-0.80z)| lr 5.97e-03 | 1983.80 ms | 69.2% bf16 MFU | 262896 tok/s +step 1810/18794 | loss 3.603397 (+0.02z)| norm 0.1496 (-0.81z)| lr 5.97e-03 | 1985.47 ms | 69.1% bf16 MFU | 262954 tok/s +step 1811/18794 | loss 3.636375 (+0.84z)| norm 0.1664 (-0.68z)| lr 5.97e-03 | 1986.93 ms | 69.1% bf16 MFU | 263000 tok/s +step 1812/18794 | loss 3.602946 (-0.00z)| norm 0.1568 (-0.74z)| lr 5.97e-03 | 1984.05 ms | 69.2% bf16 MFU | 263062 tok/s +step 1813/18794 | loss 3.556139 (-1.18z)| norm 0.1669 (-0.67z)| lr 5.97e-03 | 1984.25 ms | 69.2% bf16 MFU | 263120 tok/s +step 1814/18794 | loss 3.616179 (+0.33z)| norm 0.1627 (-0.69z)| lr 5.97e-03 | 1984.43 ms | 69.2% bf16 MFU | 263174 tok/s +step 1815/18794 | loss 3.587443 (-0.39z)| norm 0.2056 (-0.39z)| lr 5.97e-03 | 1983.53 ms | 69.2% bf16 MFU | 263232 tok/s +step 1816/18794 | loss 3.563490 (-0.98z)| norm 0.1816 (-0.55z)| lr 5.97e-03 | 1982.13 ms | 69.2% bf16 MFU | 263295 tok/s +step 1817/18794 | loss 3.563403 (-0.98z)| norm 0.1577 (-0.72z)| lr 5.97e-03 | 1985.80 ms | 69.1% bf16 MFU | 263332 tok/s +step 1818/18794 | loss 3.592844 (-0.24z)| norm 0.1780 (-0.57z)| lr 5.97e-03 | 1985.85 ms | 69.1% bf16 MFU | 263366 tok/s +step 1819/18794 | loss 3.548843 (-1.33z)| norm 0.1898 (-0.49z)| lr 5.97e-03 | 1983.08 ms | 69.2% bf16 MFU | 263416 tok/s +step 1820/18794 | loss 3.546731 (-1.35z)| norm 0.1602 (-0.69z)| lr 5.97e-03 | 1983.11 ms | 69.2% bf16 MFU | 263464 tok/s +step 1821/18794 | loss 3.560063 (-1.01z)| norm 0.1890 (-0.48z)| lr 5.97e-03 | 1979.84 ms | 69.3% bf16 MFU | 263532 tok/s +step 1822/18794 | loss 3.567547 (-0.81z)| norm 0.2221 (-0.25z)| lr 5.97e-03 | 1987.15 ms | 69.1% bf16 MFU | 263547 tok/s +step 1823/18794 | loss 3.545100 (-1.34z)| norm 0.2265 (-0.22z)| lr 5.97e-03 | 1985.20 ms | 69.1% bf16 MFU | 263575 tok/s +step 1824/18794 | loss 3.564987 (-0.86z)| norm 0.2342 (-0.17z)| lr 5.97e-03 | 1981.47 ms | 69.3% bf16 MFU | 263626 tok/s +step 1825/18794 | loss 3.579649 (-0.48z)| norm 0.2005 (-0.40z)| lr 5.97e-03 | 1981.62 ms | 69.3% bf16 MFU | 263673 tok/s +step 1826/18794 | loss 3.549877 (-1.20z)| norm 0.1822 (-0.53z)| lr 5.97e-03 | 1980.86 ms | 69.3% bf16 MFU | 263723 tok/s +step 1827/18794 | loss 3.547651 (-1.24z)| norm 0.1841 (-0.51z)| lr 5.97e-03 | 1982.51 ms | 69.2% bf16 MFU | 263760 tok/s +step 1828/18794 | loss 3.496253 (-2.42z)| norm 0.1468 (-0.76z)| lr 5.97e-03 | 1987.71 ms | 69.0% bf16 MFU | 263760 tok/s +step 1829/18794 | loss 3.539308 (-1.36z)| norm 0.1600 (-0.66z)| lr 5.97e-03 | 1982.15 ms | 69.2% bf16 MFU | 263798 tok/s +step 1830/18794 | loss 3.595110 (-0.02z)| norm 0.1639 (-0.63z)| lr 5.97e-03 | 1982.52 ms | 69.2% bf16 MFU | 263830 tok/s +step 1831/18794 | loss 3.558127 (-0.90z)| norm 0.2027 (-0.36z)| lr 5.97e-03 | 1980.94 ms | 69.3% bf16 MFU | 263872 tok/s +step 1832/18794 | loss 3.579283 (-0.38z)| norm 0.2279 (-0.18z)| lr 5.97e-03 | 1986.07 ms | 69.1% bf16 MFU | 263878 tok/s +step 1833/18794 | loss 3.541975 (-1.25z)| norm 0.2178 (-0.25z)| lr 5.97e-03 | 1983.07 ms | 69.2% bf16 MFU | 263903 tok/s +step 1834/18794 | loss 3.567771 (-0.63z)| norm 0.1959 (-0.40z)| lr 5.97e-03 | 1987.65 ms | 69.0% bf16 MFU | 263896 tok/s +step 1835/18794 | loss 3.492294 (-2.36z)| norm 0.1878 (-0.45z)| lr 5.97e-03 | 1979.66 ms | 69.3% bf16 MFU | 263944 tok/s +step 1836/18794 | loss 3.627578 (+0.80z)| norm 0.1736 (-0.54z)| lr 5.97e-03 | 1979.68 ms | 69.3% bf16 MFU | 263988 tok/s +step 1837/18794 | loss 3.546471 (-1.08z)| norm 0.1823 (-0.48z)| lr 5.97e-03 | 1979.97 ms | 69.3% bf16 MFU | 264028 tok/s +step 1838/18794 | loss 3.542356 (-1.16z)| norm 0.1871 (-0.45z)| lr 5.97e-03 | 1984.31 ms | 69.2% bf16 MFU | 264038 tok/s +step 1839/18794 | loss 3.571243 (-0.48z)| norm 0.2036 (-0.34z)| lr 5.97e-03 | 1984.17 ms | 69.2% bf16 MFU | 264048 tok/s +step 1840/18794 | loss 3.545540 (-1.05z)| norm 0.2237 (-0.20z)| lr 5.97e-03 | 1983.77 ms | 69.2% bf16 MFU | 264060 tok/s +step 1841/18794 | loss 3.634357 (+0.98z)| norm 0.2290 (-0.17z)| lr 5.97e-03 | 1981.58 ms | 69.3% bf16 MFU | 264086 tok/s +step 1842/18794 | loss 3.613732 (+0.50z)| norm 0.2487 (-0.04z)| lr 5.97e-03 | 1981.45 ms | 69.3% bf16 MFU | 264111 tok/s +step 1843/18794 | loss 3.556516 (-0.80z)| norm 0.2443 (-0.07z)| lr 5.97e-03 | 1980.44 ms | 69.3% bf16 MFU | 264143 tok/s +step 1844/18794 | loss 3.582358 (-0.20z)| norm 0.2406 (-0.10z)| lr 5.97e-03 | 1982.09 ms | 69.2% bf16 MFU | 264161 tok/s +step 1845/18794 | loss 3.539597 (-1.16z)| norm 0.2480 (-0.05z)| lr 5.97e-03 | 1984.08 ms | 69.2% bf16 MFU | 264165 tok/s +step 1846/18794 | loss 3.570717 (-0.45z)| norm 0.2163 (-0.27z)| lr 5.97e-03 | 1985.99 ms | 69.1% bf16 MFU | 264157 tok/s +step 1847/18794 | loss 3.620034 (+0.68z)| norm 0.1872 (-0.47z)| lr 5.97e-03 | 1984.60 ms | 69.1% bf16 MFU | 264158 tok/s +step 1848/18794 | loss 3.555957 (-0.80z)| norm 0.1942 (-0.42z)| lr 5.97e-03 | 1979.63 ms | 69.3% bf16 MFU | 264192 tok/s +step 1849/18794 | loss 3.563181 (-0.63z)| norm 0.2432 (-0.08z)| lr 5.97e-03 | 1979.46 ms | 69.3% bf16 MFU | 264226 tok/s +step 1850/18794 | loss 3.557425 (-0.75z)| norm 0.2424 (-0.09z)| lr 5.97e-03 | 1979.67 ms | 69.3% bf16 MFU | 264256 tok/s +step 1851/18794 | loss 3.562880 (-0.62z)| norm 0.2353 (-0.14z)| lr 5.97e-03 | 1979.24 ms | 69.3% bf16 MFU | 264288 tok/s +step 1852/18794 | loss 3.577645 (-0.28z)| norm 0.2132 (-0.30z)| lr 5.97e-03 | 1978.95 ms | 69.3% bf16 MFU | 264320 tok/s +step 1853/18794 | loss 3.583278 (-0.15z)| norm 0.2240 (-0.22z)| lr 5.97e-03 | 1979.86 ms | 69.3% bf16 MFU | 264345 tok/s +step 1854/18794 | loss 3.551465 (-0.87z)| norm 0.1938 (-0.43z)| lr 5.97e-03 | 1979.02 ms | 69.3% bf16 MFU | 264374 tok/s +step 1855/18794 | loss 3.583431 (-0.13z)| norm 0.2161 (-0.28z)| lr 5.97e-03 | 1979.66 ms | 69.3% bf16 MFU | 264397 tok/s +step 1856/18794 | loss 3.537051 (-1.18z)| norm 0.1965 (-0.41z)| lr 5.97e-03 | 1979.23 ms | 69.3% bf16 MFU | 264422 tok/s +step 1857/18794 | loss 3.626697 (+0.84z)| norm 0.2242 (-0.21z)| lr 5.97e-03 | 1979.37 ms | 69.3% bf16 MFU | 264444 tok/s +step 1858/18794 | loss 3.539321 (-1.14z)| norm 0.3156 (+0.42z)| lr 5.97e-03 | 1980.14 ms | 69.3% bf16 MFU | 264461 tok/s +step 1859/18794 | loss 3.626248 (+0.82z)| norm 0.3463 (+0.62z)| lr 5.97e-03 | 1978.28 ms | 69.4% bf16 MFU | 264489 tok/s +step 1860/18794 | loss 3.647979 (+1.29z)| norm 0.2269 (-0.20z)| lr 5.97e-03 | 1979.41 ms | 69.3% bf16 MFU | 264508 tok/s +step 1861/18794 | loss 3.573986 (-0.36z)| norm 0.2327 (-0.16z)| lr 5.97e-03 | 1979.33 ms | 69.3% bf16 MFU | 264527 tok/s +step 1862/18794 | loss 3.641829 (+1.14z)| norm 0.2398 (-0.11z)| lr 5.97e-03 | 1978.17 ms | 69.4% bf16 MFU | 264552 tok/s +step 1863/18794 | loss 3.517146 (-1.61z)| norm 0.1903 (-0.46z)| lr 5.97e-03 | 1978.89 ms | 69.3% bf16 MFU | 264572 tok/s +step 1864/18794 | loss 3.534442 (-1.21z)| norm 0.1602 (-0.67z)| lr 5.97e-03 | 1980.28 ms | 69.3% bf16 MFU | 264581 tok/s +step 1865/18794 | loss 3.564314 (-0.57z)| norm 0.1679 (-0.61z)| lr 5.97e-03 | 1979.68 ms | 69.3% bf16 MFU | 264594 tok/s +step 1866/18794 | loss 3.577710 (-0.28z)| norm 0.1731 (-0.57z)| lr 5.97e-03 | 1980.33 ms | 69.3% bf16 MFU | 264601 tok/s +step 1867/18794 | loss 3.570413 (-0.45z)| norm 0.1804 (-0.52z)| lr 5.97e-03 | 1979.89 ms | 69.3% bf16 MFU | 264611 tok/s +step 1868/18794 | loss 3.613708 (+0.50z)| norm 0.1788 (-0.53z)| lr 5.97e-03 | 1978.53 ms | 69.4% bf16 MFU | 264630 tok/s +step 1869/18794 | loss 3.556361 (-0.77z)| norm 0.2089 (-0.32z)| lr 5.97e-03 | 1979.67 ms | 69.3% bf16 MFU | 264641 tok/s +step 1870/18794 | loss 3.623416 (+0.73z)| norm 0.2240 (-0.22z)| lr 5.97e-03 | 1979.39 ms | 69.3% bf16 MFU | 264652 tok/s +step 1871/18794 | loss 3.558183 (-0.72z)| norm 0.2206 (-0.25z)| lr 5.97e-03 | 1982.19 ms | 69.2% bf16 MFU | 264645 tok/s +step 1872/18794 | loss 3.535092 (-1.22z)| norm 0.1963 (-0.42z)| lr 5.97e-03 | 1979.91 ms | 69.3% bf16 MFU | 264653 tok/s +step 1873/18794 | loss 3.551581 (-0.84z)| norm 0.1761 (-0.56z)| lr 5.97e-03 | 1978.81 ms | 69.4% bf16 MFU | 264667 tok/s +step 1874/18794 | loss 3.576071 (-0.30z)| norm 0.1803 (-0.53z)| lr 5.97e-03 | 1981.47 ms | 69.3% bf16 MFU | 264664 tok/s +step 1875/18794 | loss 3.574315 (-0.34z)| norm 0.1870 (-0.48z)| lr 5.97e-03 | 1982.33 ms | 69.2% bf16 MFU | 264655 tok/s +step 1876/18794 | loss 3.516429 (-1.58z)| norm 0.1697 (-0.60z)| lr 5.97e-03 | 1985.14 ms | 69.1% bf16 MFU | 264627 tok/s +step 1877/18794 | loss 3.522930 (-1.43z)| norm 0.1871 (-0.48z)| lr 5.97e-03 | 1982.16 ms | 69.2% bf16 MFU | 264621 tok/s +step 1878/18794 | loss 3.605704 (+0.36z)| norm 0.2229 (-0.24z)| lr 5.97e-03 | 1980.46 ms | 69.3% bf16 MFU | 264627 tok/s +step 1879/18794 | loss 3.568289 (-0.44z)| norm 0.2322 (-0.17z)| lr 5.97e-03 | 1984.57 ms | 69.1% bf16 MFU | 264604 tok/s +step 1880/18794 | loss 3.559933 (-0.61z)| norm 0.2127 (-0.31z)| lr 5.97e-03 | 1979.96 ms | 69.3% bf16 MFU | 264614 tok/s +step 1881/18794 | loss 3.561354 (-0.57z)| norm 0.2351 (-0.15z)| lr 5.97e-03 | 1977.74 ms | 69.4% bf16 MFU | 264638 tok/s +step 1882/18794 | loss 3.602786 (+0.34z)| norm 0.2627 (+0.04z)| lr 5.97e-03 | 1978.14 ms | 69.4% bf16 MFU | 264658 tok/s +step 1883/18794 | loss 3.563124 (-0.51z)| norm 0.2930 (+0.28z)| lr 5.97e-03 | 1977.24 ms | 69.4% bf16 MFU | 264683 tok/s +step 1884/18794 | loss 3.541046 (-1.02z)| norm 0.2848 (+0.22z)| lr 5.97e-03 | 1979.24 ms | 69.3% bf16 MFU | 264694 tok/s +step 1885/18794 | loss 3.567693 (-0.37z)| norm 0.2687 (+0.17z)| lr 5.97e-03 | 1981.13 ms | 69.3% bf16 MFU | 264691 tok/s +step 1886/18794 | loss 3.600252 (+0.45z)| norm 0.3014 (+0.46z)| lr 5.97e-03 | 1978.24 ms | 69.4% bf16 MFU | 264708 tok/s +step 1887/18794 | loss 3.583310 (+0.06z)| norm 0.3192 (+0.65z)| lr 5.97e-03 | 1978.68 ms | 69.4% bf16 MFU | 264721 tok/s +step 1888/18794 | loss 3.543894 (-0.96z)| norm 0.2249 (-0.12z)| lr 5.97e-03 | 1983.73 ms | 69.2% bf16 MFU | 264700 tok/s +step 1889/18794 | loss 3.591897 (+0.34z)| norm 0.2058 (-0.28z)| lr 5.97e-03 | 1978.21 ms | 69.4% bf16 MFU | 264716 tok/s +step 1890/18794 | loss 3.574950 (-0.11z)| norm 0.1909 (-0.40z)| lr 5.97e-03 | 1977.64 ms | 69.4% bf16 MFU | 264736 tok/s +step 1891/18794 | loss 3.545824 (-0.88z)| norm 0.1687 (-0.60z)| lr 5.97e-03 | 1980.02 ms | 69.3% bf16 MFU | 264738 tok/s +step 1892/18794 | loss 3.581777 (+0.12z)| norm 0.1962 (-0.33z)| lr 5.97e-03 | 1977.80 ms | 69.4% bf16 MFU | 264756 tok/s +step 1893/18794 | loss 3.570514 (-0.17z)| norm 0.1984 (-0.28z)| lr 5.97e-03 | 1981.36 ms | 69.3% bf16 MFU | 264749 tok/s +step 1894/18794 | loss 3.591159 (+0.44z)| norm 0.2087 (-0.15z)| lr 5.97e-03 | 1979.07 ms | 69.3% bf16 MFU | 264757 tok/s +step 1895/18794 | loss 3.541409 (-1.00z)| norm 0.1978 (-0.28z)| lr 5.97e-03 | 1978.08 ms | 69.4% bf16 MFU | 264772 tok/s +step 1896/18794 | loss 3.575662 (+0.03z)| norm 0.1640 (-0.77z)| lr 5.97e-03 | 1988.00 ms | 69.0% bf16 MFU | 264719 tok/s +step 1897/18794 | loss 3.586688 (+0.39z)| norm 0.1720 (-0.66z)| lr 5.97e-03 | 1980.60 ms | 69.3% bf16 MFU | 264719 tok/s +step 1898/18794 | loss 3.546189 (-0.85z)| norm 0.1866 (-0.42z)| lr 5.97e-03 | 1988.05 ms | 69.0% bf16 MFU | 264669 tok/s +step 1899/18794 | loss 3.581118 (+0.26z)| norm 0.2317 (+0.64z)| lr 5.97e-03 | 1986.21 ms | 69.1% bf16 MFU | 264634 tok/s +step 1900/18794 | loss 3.583428 (+0.34z)| norm 0.2689 (+1.57z)| lr 5.97e-03 | 1987.15 ms | 69.1% bf16 MFU | 264594 tok/s +step 1901/18794 | loss 3.569409 (-0.12z)| norm 0.2009 (-0.08z)| lr 5.97e-03 | 1984.00 ms | 69.2% bf16 MFU | 264577 tok/s +step 1902/18794 | loss 3.554163 (-0.59z)| norm 0.2182 (+0.33z)| lr 5.97e-03 | 1984.51 ms | 69.2% bf16 MFU | 264558 tok/s +step 1903/18794 | loss 3.602774 (+0.98z)| norm 0.1878 (-0.43z)| lr 5.97e-03 | 1985.80 ms | 69.1% bf16 MFU | 264531 tok/s +step 1904/18794 | loss 3.518379 (-1.73z)| norm 0.1594 (-1.12z)| lr 5.97e-03 | 1980.24 ms | 69.3% bf16 MFU | 264542 tok/s +step 1905/18794 | loss 3.551066 (-0.64z)| norm 0.1853 (-0.49z)| lr 5.97e-03 | 1985.89 ms | 69.1% bf16 MFU | 264516 tok/s +step 1906/18794 | loss 3.525669 (-1.44z)| norm 0.2000 (-0.14z)| lr 5.97e-03 | 1984.27 ms | 69.2% bf16 MFU | 264501 tok/s +step 1907/18794 | loss 3.552215 (-0.57z)| norm 0.2073 (+0.02z)| lr 5.97e-03 | 2024.69 ms | 67.8% bf16 MFU | 264223 tok/s +step 1908/18794 | loss 3.542621 (-0.86z)| norm 0.2131 (+0.16z)| lr 5.97e-03 | 2038.86 ms | 67.3% bf16 MFU | 263869 tok/s +step 1909/18794 | loss 3.638268 (+2.17z)| norm 0.2185 (+0.28z)| lr 5.97e-03 | 2029.10 ms | 67.6% bf16 MFU | 263595 tok/s +step 1910/18794 | loss 3.547804 (-0.69z)| norm 0.2208 (+0.33z)| lr 5.97e-03 | 2032.05 ms | 67.5% bf16 MFU | 263316 tok/s +step 1911/18794 | loss 3.546113 (-0.73z)| norm 0.1854 (-0.62z)| lr 5.97e-03 | 2038.64 ms | 67.3% bf16 MFU | 263009 tok/s +step 1912/18794 | loss 3.519960 (-1.54z)| norm 0.1796 (-0.79z)| lr 5.97e-03 | 2037.59 ms | 67.4% bf16 MFU | 262724 tok/s +step 1913/18794 | loss 3.550071 (-0.57z)| norm 0.2046 (-0.13z)| lr 5.97e-03 | 2032.21 ms | 67.5% bf16 MFU | 262487 tok/s +step 1914/18794 | loss 3.543051 (-0.78z)| norm 0.2220 (+0.33z)| lr 5.96e-03 | 2038.83 ms | 67.3% bf16 MFU | 262220 tok/s +step 1915/18794 | loss 3.559671 (-0.23z)| norm 0.2293 (+0.52z)| lr 5.96e-03 | 2031.80 ms | 67.5% bf16 MFU | 262011 tok/s +step 1916/18794 | loss 3.587609 (+0.68z)| norm 0.2385 (+0.76z)| lr 5.96e-03 | 2033.48 ms | 67.5% bf16 MFU | 261802 tok/s +step 1917/18794 | loss 3.493482 (-2.30z)| norm 0.2055 (-0.16z)| lr 5.96e-03 | 2038.43 ms | 67.3% bf16 MFU | 261572 tok/s +step 1918/18794 | loss 3.555823 (-0.32z)| norm 0.1830 (-0.78z)| lr 5.96e-03 | 2029.98 ms | 67.6% bf16 MFU | 261407 tok/s +step 1919/18794 | loss 3.615153 (+1.53z)| norm 0.2128 (+0.03z)| lr 5.96e-03 | 2038.78 ms | 67.3% bf16 MFU | 261195 tok/s +step 1920/18794 | loss 3.592863 (+0.82z)| norm 0.2467 (+0.95z)| lr 5.96e-03 | 2033.81 ms | 67.5% bf16 MFU | 261024 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.049034 +step 1921/18794 | loss 3.635831 (+2.09z)| norm 0.2886 (+2.05z)| lr 5.96e-03 | 2035.92 ms | 67.4% bf16 MFU | 260849 tok/s +step 1922/18794 | loss 3.564250 (-0.10z)| norm 0.2126 (-0.02z)| lr 5.96e-03 | 2036.37 ms | 67.4% bf16 MFU | 260680 tok/s +step 1923/18794 | loss 3.553569 (-0.43z)| norm 0.1850 (-0.76z)| lr 5.96e-03 | 2035.52 ms | 67.4% bf16 MFU | 260524 tok/s +step 1924/18794 | loss 3.548288 (-0.59z)| norm 0.1432 (-1.84z)| lr 5.96e-03 | 2025.19 ms | 67.8% bf16 MFU | 260442 tok/s +step 1925/18794 | loss 3.595457 (+0.85z)| norm 0.1718 (-1.06z)| lr 5.96e-03 | 2018.54 ms | 68.0% bf16 MFU | 260407 tok/s +step 1926/18794 | loss 3.615913 (+1.44z)| norm 0.1838 (-0.74z)| lr 5.96e-03 | 2018.14 ms | 68.0% bf16 MFU | 260376 tok/s +step 1927/18794 | loss 3.509349 (-1.75z)| norm 0.1816 (-0.80z)| lr 5.96e-03 | 2039.71 ms | 67.3% bf16 MFU | 260209 tok/s +step 1928/18794 | loss 3.507651 (-1.82z)| norm 0.1743 (-1.01z)| lr 5.96e-03 | 2020.57 ms | 67.9% bf16 MFU | 260172 tok/s +step 1929/18794 | loss 3.555638 (-0.38z)| norm 0.1932 (-0.52z)| lr 5.96e-03 | 2033.75 ms | 67.5% bf16 MFU | 260053 tok/s +step 1930/18794 | loss 3.581250 (+0.40z)| norm 0.2377 (+0.68z)| lr 5.96e-03 | 2031.01 ms | 67.6% bf16 MFU | 259958 tok/s +step 1931/18794 | loss 3.559219 (-0.27z)| norm 0.2254 (+0.33z)| lr 5.96e-03 | 2030.99 ms | 67.6% bf16 MFU | 259867 tok/s +step 1932/18794 | loss 3.522050 (-1.37z)| norm 0.2128 (-0.01z)| lr 5.96e-03 | 2016.50 ms | 68.1% bf16 MFU | 259874 tok/s +step 1933/18794 | loss 3.537109 (-0.91z)| norm 0.2425 (+0.79z)| lr 5.96e-03 | 2018.32 ms | 68.0% bf16 MFU | 259868 tok/s +step 1934/18794 | loss 3.566529 (-0.03z)| norm 0.2644 (+1.36z)| lr 5.96e-03 | 2033.41 ms | 67.5% bf16 MFU | 259767 tok/s +step 1935/18794 | loss 3.550617 (-0.54z)| norm 0.2522 (+1.01z)| lr 5.96e-03 | 2016.41 ms | 68.1% bf16 MFU | 259779 tok/s +step 1936/18794 | loss 3.536714 (-0.95z)| norm 0.2269 (+0.32z)| lr 5.96e-03 | 2030.89 ms | 67.6% bf16 MFU | 259698 tok/s +step 1937/18794 | loss 3.561362 (-0.19z)| norm 0.2674 (+1.38z)| lr 5.96e-03 | 2015.97 ms | 68.1% bf16 MFU | 259716 tok/s +step 1938/18794 | loss 3.578662 (+0.34z)| norm 0.2436 (+0.73z)| lr 5.96e-03 | 2015.85 ms | 68.1% bf16 MFU | 259735 tok/s +step 1939/18794 | loss 3.545237 (-0.69z)| norm 0.2094 (-0.20z)| lr 5.96e-03 | 2038.26 ms | 67.3% bf16 MFU | 259609 tok/s +step 1940/18794 | loss 3.544509 (-0.71z)| norm 0.1932 (-0.62z)| lr 5.96e-03 | 2017.23 ms | 68.0% bf16 MFU | 259624 tok/s +step 1941/18794 | loss 3.572107 (+0.17z)| norm 0.1839 (-0.86z)| lr 5.96e-03 | 2031.29 ms | 67.6% bf16 MFU | 259548 tok/s +step 1942/18794 | loss 3.596707 (+0.97z)| norm 0.1853 (-0.80z)| lr 5.96e-03 | 2019.10 ms | 68.0% bf16 MFU | 259554 tok/s +step 1943/18794 | loss 3.546531 (-0.64z)| norm 0.2716 (+1.49z)| lr 5.96e-03 | 2016.46 ms | 68.1% bf16 MFU | 259576 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.123709 +step 1944/18794 | loss 3.568297 (+0.06z)| norm 0.3399 (+3.12z)| lr 5.96e-03 | 2024.17 ms | 67.8% bf16 MFU | 259548 tok/s +step 1945/18794 | loss 3.574845 (+0.26z)| norm 0.2839 (+1.68z)| lr 5.96e-03 | 2015.48 ms | 68.1% bf16 MFU | 259577 tok/s +step 1946/18794 | loss 3.602364 (+1.13z)| norm 0.2411 (+0.60z)| lr 5.96e-03 | 2020.80 ms | 67.9% bf16 MFU | 259571 tok/s +step 1947/18794 | loss 3.554019 (-0.40z)| norm 0.2044 (-0.32z)| lr 5.96e-03 | 2023.27 ms | 67.8% bf16 MFU | 259549 tok/s +step 1948/18794 | loss 3.546768 (-0.63z)| norm 0.1840 (-0.83z)| lr 5.96e-03 | 2029.95 ms | 67.6% bf16 MFU | 259485 tok/s +step 1949/18794 | loss 3.599149 (+1.05z)| norm 0.1839 (-0.82z)| lr 5.96e-03 | 2016.23 ms | 68.1% bf16 MFU | 259512 tok/s +step 1950/18794 | loss 3.538106 (-0.91z)| norm 0.1956 (-0.51z)| lr 5.96e-03 | 2024.69 ms | 67.8% bf16 MFU | 259484 tok/s +step 1951/18794 | loss 3.534572 (-1.01z)| norm 0.1804 (-0.88z)| lr 5.96e-03 | 2032.13 ms | 67.5% bf16 MFU | 259410 tok/s +step 1952/18794 | loss 3.577050 (+0.35z)| norm 0.1908 (-0.61z)| lr 5.96e-03 | 2024.84 ms | 67.8% bf16 MFU | 259386 tok/s +step 1953/18794 | loss 3.558337 (-0.24z)| norm 0.1961 (-0.47z)| lr 5.96e-03 | 2023.03 ms | 67.8% bf16 MFU | 259375 tok/s +step 1954/18794 | loss 3.628237 (+1.93z)| norm 0.1967 (-0.46z)| lr 5.96e-03 | 2017.04 ms | 68.0% bf16 MFU | 259402 tok/s +step 1955/18794 | loss 3.556667 (-0.31z)| norm 0.2254 (+0.25z)| lr 5.96e-03 | 2026.13 ms | 67.7% bf16 MFU | 259370 tok/s +step 1956/18794 | loss 3.496155 (-2.15z)| norm 0.2109 (-0.11z)| lr 5.96e-03 | 2008.81 ms | 68.3% bf16 MFU | 259452 tok/s +step 1957/18794 | loss 3.506035 (-1.81z)| norm 0.1585 (-1.38z)| lr 5.96e-03 | 2027.76 ms | 67.7% bf16 MFU | 259407 tok/s +step 1958/18794 | loss 3.567133 (+0.06z)| norm 0.1758 (-0.95z)| lr 5.96e-03 | 2001.47 ms | 68.6% bf16 MFU | 259534 tok/s +step 1959/18794 | loss 3.572658 (+0.26z)| norm 0.2187 (+0.18z)| lr 5.96e-03 | 2003.22 ms | 68.5% bf16 MFU | 259643 tok/s +step 1960/18794 | loss 3.516813 (-1.50z)| norm 0.2239 (+0.32z)| lr 5.96e-03 | 2017.98 ms | 68.0% bf16 MFU | 259652 tok/s +step 1961/18794 | loss 3.553745 (-0.30z)| norm 0.2113 (-0.01z)| lr 5.96e-03 | 2020.11 ms | 67.9% bf16 MFU | 259646 tok/s +step 1962/18794 | loss 3.511954 (-1.64z)| norm 0.2849 (+1.93z)| lr 5.96e-03 | 2015.27 ms | 68.1% bf16 MFU | 259671 tok/s +step 1963/18794 | loss 3.480197 (-2.61z)| norm 0.2798 (+1.74z)| lr 5.96e-03 | 2022.79 ms | 67.8% bf16 MFU | 259647 tok/s +step 1964/18794 | loss 3.546092 (-0.50z)| norm 0.2618 (+1.25z)| lr 5.96e-03 | 2017.64 ms | 68.0% bf16 MFU | 259658 tok/s +step 1965/18794 | loss 3.559720 (-0.06z)| norm 0.2235 (+0.23z)| lr 5.96e-03 | 2009.36 ms | 68.3% bf16 MFU | 259721 tok/s +step 1966/18794 | loss 3.537137 (-0.77z)| norm 0.1945 (-0.54z)| lr 5.96e-03 | 2022.79 ms | 67.8% bf16 MFU | 259694 tok/s +step 1967/18794 | loss 3.533189 (-0.88z)| norm 0.1656 (-1.30z)| lr 5.96e-03 | 2017.02 ms | 68.0% bf16 MFU | 259706 tok/s +step 1968/18794 | loss 3.553505 (-0.21z)| norm 0.1617 (-1.39z)| lr 5.96e-03 | 2011.77 ms | 68.2% bf16 MFU | 259751 tok/s +step 1969/18794 | loss 3.520136 (-1.28z)| norm 0.1991 (-0.40z)| lr 5.96e-03 | 2019.22 ms | 68.0% bf16 MFU | 259746 tok/s +step 1970/18794 | loss 3.593884 (+1.13z)| norm 0.2103 (-0.11z)| lr 5.96e-03 | 2031.45 ms | 67.6% bf16 MFU | 259663 tok/s +step 1971/18794 | loss 3.578953 (+0.63z)| norm 0.1851 (-0.76z)| lr 5.96e-03 | 2015.16 ms | 68.1% bf16 MFU | 259689 tok/s +step 1972/18794 | loss 3.522963 (-1.19z)| norm 0.1663 (-1.23z)| lr 5.96e-03 | 2019.70 ms | 67.9% bf16 MFU | 259684 tok/s +step 1973/18794 | loss 3.522014 (-1.20z)| norm 0.1656 (-1.25z)| lr 5.96e-03 | 2005.55 ms | 68.4% bf16 MFU | 259770 tok/s +step 1974/18794 | loss 3.494768 (-2.02z)| norm 0.1401 (-1.87z)| lr 5.96e-03 | 2024.13 ms | 67.8% bf16 MFU | 259733 tok/s +step 1975/18794 | loss 3.588440 (+0.95z)| norm 0.1602 (-1.34z)| lr 5.96e-03 | 2001.22 ms | 68.6% bf16 MFU | 259845 tok/s +step 1976/18794 | loss 3.568706 (+0.31z)| norm 0.1675 (-1.15z)| lr 5.96e-03 | 2009.63 ms | 68.3% bf16 MFU | 259897 tok/s +step 1977/18794 | loss 3.553000 (-0.20z)| norm 0.1590 (-1.35z)| lr 5.96e-03 | 2012.98 ms | 68.2% bf16 MFU | 259925 tok/s +step 1978/18794 | loss 3.484413 (-2.33z)| norm 0.1465 (-1.63z)| lr 5.96e-03 | 2002.46 ms | 68.5% bf16 MFU | 260020 tok/s +step 1979/18794 | loss 3.592038 (+1.06z)| norm 0.1838 (-0.69z)| lr 5.96e-03 | 2007.21 ms | 68.4% bf16 MFU | 260079 tok/s +step 1980/18794 | loss 3.510436 (-1.47z)| norm 0.1863 (-0.62z)| lr 5.96e-03 | 2015.40 ms | 68.1% bf16 MFU | 260082 tok/s +step 1981/18794 | loss 3.538408 (-0.60z)| norm 0.1842 (-0.65z)| lr 5.96e-03 | 2011.11 ms | 68.2% bf16 MFU | 260113 tok/s +step 1982/18794 | loss 3.575584 (+0.57z)| norm 0.2145 (+0.11z)| lr 5.96e-03 | 2002.49 ms | 68.5% bf16 MFU | 260198 tok/s +step 1983/18794 | loss 3.487731 (-2.11z)| norm 0.2623 (+1.33z)| lr 5.96e-03 | 2011.24 ms | 68.2% bf16 MFU | 260222 tok/s +step 1984/18794 | loss 3.568104 (+0.34z)| norm 0.2357 (+0.68z)| lr 5.96e-03 | 2017.23 ms | 68.0% bf16 MFU | 260206 tok/s +step 1985/18794 | loss 3.483438 (-2.17z)| norm 0.2348 (+0.67z)| lr 5.96e-03 | 2015.06 ms | 68.1% bf16 MFU | 260205 tok/s +step 1986/18794 | loss 3.556456 (+0.03z)| norm 0.2319 (+0.63z)| lr 5.96e-03 | 2011.09 ms | 68.2% bf16 MFU | 260230 tok/s +step 1987/18794 | loss 3.544494 (-0.32z)| norm 0.2591 (+1.43z)| lr 5.96e-03 | 2022.66 ms | 67.8% bf16 MFU | 260179 tok/s +step 1988/18794 | loss 3.511782 (-1.29z)| norm 0.2351 (+0.76z)| lr 5.96e-03 | 2018.38 ms | 68.0% bf16 MFU | 260158 tok/s +step 1989/18794 | loss 3.578278 (+0.71z)| norm 0.2088 (+0.03z)| lr 5.96e-03 | 2011.69 ms | 68.2% bf16 MFU | 260181 tok/s +step 1990/18794 | loss 3.545776 (-0.26z)| norm 0.2227 (+0.40z)| lr 5.96e-03 | 2025.27 ms | 67.8% bf16 MFU | 260115 tok/s +step 1991/18794 | loss 3.577704 (+0.69z)| norm 0.2335 (+0.69z)| lr 5.96e-03 | 2001.66 ms | 68.6% bf16 MFU | 260206 tok/s +step 1992/18794 | loss 3.544477 (-0.30z)| norm 0.2224 (+0.37z)| lr 5.96e-03 | 2018.00 ms | 68.0% bf16 MFU | 260186 tok/s +step 1993/18794 | loss 3.462476 (-2.65z)| norm 0.1788 (-0.83z)| lr 5.96e-03 | 2002.47 ms | 68.5% bf16 MFU | 260268 tok/s +step 1994/18794 | loss 3.545656 (-0.21z)| norm 0.1721 (-1.00z)| lr 5.96e-03 | 2010.48 ms | 68.3% bf16 MFU | 260293 tok/s +step 1995/18794 | loss 3.554068 (+0.03z)| norm 0.1843 (-0.66z)| lr 5.96e-03 | 2007.79 ms | 68.3% bf16 MFU | 260335 tok/s +step 1996/18794 | loss 3.595684 (+1.24z)| norm 0.2000 (-0.24z)| lr 5.96e-03 | 2017.37 ms | 68.0% bf16 MFU | 260313 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.383577 +step 1997/18794 | loss 3.535682 (-0.50z)| norm 0.2981 (+2.38z)| lr 5.96e-03 | 2008.80 ms | 68.3% bf16 MFU | 260347 tok/s +step 1998/18794 | loss 3.533291 (-0.56z)| norm 0.2564 (+1.23z)| lr 5.96e-03 | 2005.10 ms | 68.4% bf16 MFU | 260403 tok/s +step 1999/18794 | loss 3.561904 (+0.28z)| norm 0.2161 (+0.15z)| lr 5.96e-03 | 2009.67 ms | 68.3% bf16 MFU | 260427 tok/s +step 2000/18794 | loss 3.526166 (-0.75z)| norm 0.2103 (+0.01z)| lr 5.96e-03 | 2006.67 ms | 68.4% bf16 MFU | 260469 tok/s +val loss 3.548141 +HellaSwag: 2649/10042 = 0.263792Swag: 990/1256: 0/1256 +generating: +--- +Writing state to log_gpt3_125M_edu_v4/state_00002000_00001.bin +Pilution diagram by Dr. Farock & colleagues +· Detailed discussion of intensity of combustion of a mixture of gaseous gases +· The undesirable or critical variable in a particular resource for transporting stored CO2 +· The continuous symptoms of impurities +Critical quantity parabnage chart by Carolyn Wallett*, Marilyn D. Boyd* Otto* Rose* Randomized double-sided labeling with updated information from showing trends in terms of stigrated down the road +Level of biomass is important for vaccination overall in follow definition 1 and for children. 1 gives an example of a direct relation to the biomass content. The potential for man shattering off the + main stream of a meal has been leathered along with the +--- +Writing checkpoint at step 2000 +Writing model to log_gpt3_125M_edu_v4/model_00002000.bin +Writing state to log_gpt3_125M_edu_v4/state_00002000_00000.bin +step 2001/18794 | loss 3.477731 (-2.10z)| norm 0.1971 (-0.35z)| lr 5.96e-03 | 2000.15 ms | 68.6% bf16 MFU | 260552 tok/s +step 2002/18794 | loss 3.571108 (+0.58z)| norm 0.1733 (-0.98z)| lr 5.96e-03 | 2020.57 ms | 67.9% bf16 MFU | 260498 tok/s +step 2003/18794 | loss 3.544641 (-0.17z)| norm 0.1658 (-1.17z)| lr 5.96e-03 | 2002.45 ms | 68.5% bf16 MFU | 260565 tok/s +step 2004/18794 | loss 3.543125 (-0.22z)| norm 0.1974 (-0.33z)| lr 5.96e-03 | 2004.77 ms | 68.5% bf16 MFU | 260612 tok/s +step 2005/18794 | loss 3.526731 (-0.69z)| norm 0.1810 (-0.78z)| lr 5.96e-03 | 2001.93 ms | 68.6% bf16 MFU | 260676 tok/s +step 2006/18794 | loss 3.556105 (+0.15z)| norm 0.1648 (-1.20z)| lr 5.96e-03 | 2002.66 ms | 68.5% bf16 MFU | 260732 tok/s +step 2007/18794 | loss 3.588291 (+1.08z)| norm 0.1692 (-1.06z)| lr 5.96e-03 | 2011.26 ms | 68.2% bf16 MFU | 260730 tok/s +step 2008/18794 | loss 3.523941 (-0.78z)| norm 0.1739 (-0.92z)| lr 5.96e-03 | 2001.62 ms | 68.6% bf16 MFU | 260790 tok/s +step 2009/18794 | loss 3.534531 (-0.46z)| norm 0.1443 (-1.67z)| lr 5.96e-03 | 2011.66 ms | 68.2% bf16 MFU | 260781 tok/s +step 2010/18794 | loss 3.541377 (-0.25z)| norm 0.1661 (-1.08z)| lr 5.96e-03 | 2004.84 ms | 68.5% bf16 MFU | 260818 tok/s +step 2011/18794 | loss 3.591032 (+1.21z)| norm 0.1441 (-1.62z)| lr 5.96e-03 | 1994.95 ms | 68.8% bf16 MFU | 260917 tok/s +step 2012/18794 | loss 3.499297 (-1.49z)| norm 0.1845 (-0.58z)| lr 5.96e-03 | 2013.91 ms | 68.1% bf16 MFU | 260888 tok/s +step 2013/18794 | loss 3.482601 (-1.93z)| norm 0.2220 (+0.39z)| lr 5.96e-03 | 1992.36 ms | 68.9% bf16 MFU | 261001 tok/s +step 2014/18794 | loss 3.545643 (-0.11z)| norm 0.2401 (+0.85z)| lr 5.96e-03 | 2007.00 ms | 68.4% bf16 MFU | 261013 tok/s +step 2015/18794 | loss 3.521996 (-0.78z)| norm 0.2075 (+0.02z)| lr 5.96e-03 | 1994.29 ms | 68.8% bf16 MFU | 261107 tok/s +step 2016/18794 | loss 3.520321 (-0.81z)| norm 0.2061 (-0.01z)| lr 5.96e-03 | 1985.95 ms | 69.1% bf16 MFU | 261251 tok/s +step 2017/18794 | loss 3.555725 (+0.20z)| norm 0.2410 (+0.88z)| lr 5.96e-03 | 1987.46 ms | 69.0% bf16 MFU | 261379 tok/s +step 2018/18794 | loss 3.471580 (-2.18z)| norm 0.2399 (+0.84z)| lr 5.96e-03 | 1987.45 ms | 69.0% bf16 MFU | 261500 tok/s +step 2019/18794 | loss 3.562996 (+0.44z)| norm 0.2011 (-0.16z)| lr 5.96e-03 | 2007.20 ms | 68.4% bf16 MFU | 261485 tok/s +step 2020/18794 | loss 3.520350 (-0.78z)| norm 0.1835 (-0.60z)| lr 5.96e-03 | 2008.40 ms | 68.3% bf16 MFU | 261463 tok/s +step 2021/18794 | loss 3.575057 (+0.86z)| norm 0.1839 (-0.58z)| lr 5.96e-03 | 1995.86 ms | 68.8% bf16 MFU | 261524 tok/s +step 2022/18794 | loss 3.508823 (-1.11z)| norm 0.2069 (+0.03z)| lr 5.96e-03 | 2003.88 ms | 68.5% bf16 MFU | 261530 tok/s +step 2023/18794 | loss 3.577528 (+0.94z)| norm 0.2003 (-0.15z)| lr 5.96e-03 | 1997.07 ms | 68.7% bf16 MFU | 261580 tok/s +step 2024/18794 | loss 3.559740 (+0.41z)| norm 0.2012 (-0.14z)| lr 5.96e-03 | 1995.96 ms | 68.8% bf16 MFU | 261634 tok/s +step 2025/18794 | loss 3.525389 (-0.60z)| norm 0.2051 (-0.04z)| lr 5.96e-03 | 2002.29 ms | 68.5% bf16 MFU | 261645 tok/s +step 2026/18794 | loss 3.543218 (-0.05z)| norm 0.2083 (+0.04z)| lr 5.96e-03 | 1999.49 ms | 68.6% bf16 MFU | 261673 tok/s +step 2027/18794 | loss 3.573942 (+0.88z)| norm 0.2022 (-0.13z)| lr 5.96e-03 | 2003.44 ms | 68.5% bf16 MFU | 261674 tok/s +step 2028/18794 | loss 3.518914 (-0.82z)| norm 0.1946 (-0.35z)| lr 5.96e-03 | 2000.65 ms | 68.6% bf16 MFU | 261694 tok/s +step 2029/18794 | loss 3.597538 (+1.58z)| norm 0.1781 (-0.79z)| lr 5.96e-03 | 2000.69 ms | 68.6% bf16 MFU | 261712 tok/s +step 2030/18794 | loss 3.576360 (+0.94z)| norm 0.2022 (-0.13z)| lr 5.96e-03 | 1998.62 ms | 68.7% bf16 MFU | 261742 tok/s +step 2031/18794 | loss 3.503618 (-1.27z)| norm 0.2300 (+0.63z)| lr 5.96e-03 | 2006.53 ms | 68.4% bf16 MFU | 261720 tok/s +step 2032/18794 | loss 3.511934 (-1.01z)| norm 0.2048 (-0.06z)| lr 5.96e-03 | 2010.63 ms | 68.3% bf16 MFU | 261672 tok/s +step 2033/18794 | loss 3.548190 (+0.09z)| norm 0.1908 (-0.42z)| lr 5.96e-03 | 2002.15 ms | 68.5% bf16 MFU | 261681 tok/s +step 2034/18794 | loss 3.550503 (+0.16z)| norm 0.1916 (-0.39z)| lr 5.96e-03 | 2009.46 ms | 68.3% bf16 MFU | 261642 tok/s +step 2035/18794 | loss 3.519836 (-0.76z)| norm 0.1685 (-1.01z)| lr 5.96e-03 | 1987.49 ms | 69.0% bf16 MFU | 261750 tok/s +step 2036/18794 | loss 3.569704 (+0.74z)| norm 0.1377 (-1.81z)| lr 5.96e-03 | 1991.05 ms | 68.9% bf16 MFU | 261829 tok/s +step 2037/18794 | loss 3.544976 (-0.00z)| norm 0.1551 (-1.31z)| lr 5.95e-03 | 2003.39 ms | 68.5% bf16 MFU | 261822 tok/s +step 2038/18794 | loss 3.566073 (+0.64z)| norm 0.1631 (-1.07z)| lr 5.95e-03 | 1992.58 ms | 68.9% bf16 MFU | 261887 tok/s +step 2039/18794 | loss 3.499955 (-1.34z)| norm 0.1822 (-0.54z)| lr 5.95e-03 | 1988.70 ms | 69.0% bf16 MFU | 261975 tok/s +step 2040/18794 | loss 3.510157 (-1.01z)| norm 0.1932 (-0.23z)| lr 5.95e-03 | 2001.95 ms | 68.5% bf16 MFU | 261970 tok/s +step 2041/18794 | loss 3.545473 (+0.05z)| norm 0.2158 (+0.38z)| lr 5.95e-03 | 2001.56 ms | 68.6% bf16 MFU | 261969 tok/s +step 2042/18794 | loss 3.537629 (-0.17z)| norm 0.2327 (+0.83z)| lr 5.95e-03 | 1994.22 ms | 68.8% bf16 MFU | 262015 tok/s +step 2043/18794 | loss 3.581181 (+1.14z)| norm 0.2138 (+0.33z)| lr 5.95e-03 | 2014.37 ms | 68.1% bf16 MFU | 261928 tok/s +step 2044/18794 | loss 3.529950 (-0.40z)| norm 0.1947 (-0.18z)| lr 5.95e-03 | 1992.71 ms | 68.9% bf16 MFU | 261987 tok/s +step 2045/18794 | loss 3.538676 (-0.13z)| norm 0.2291 (+0.91z)| lr 5.95e-03 | 1993.60 ms | 68.8% bf16 MFU | 262037 tok/s +step 2046/18794 | loss 3.501290 (-1.24z)| norm 0.1921 (-0.23z)| lr 5.95e-03 | 1999.54 ms | 68.6% bf16 MFU | 262045 tok/s +step 2047/18794 | loss 3.515496 (-0.79z)| norm 0.2021 (+0.09z)| lr 5.95e-03 | 1985.61 ms | 69.1% bf16 MFU | 262145 tok/s +step 2048/18794 | loss 3.526964 (-0.44z)| norm 0.2167 (+0.53z)| lr 5.95e-03 | 2001.56 ms | 68.6% bf16 MFU | 262135 tok/s +step 2049/18794 | loss 3.476037 (-1.95z)| norm 0.1974 (-0.08z)| lr 5.95e-03 | 2001.20 ms | 68.6% bf16 MFU | 262128 tok/s +step 2050/18794 | loss 3.513606 (-0.79z)| norm 0.1718 (-0.87z)| lr 5.95e-03 | 1994.23 ms | 68.8% bf16 MFU | 262166 tok/s +step 2051/18794 | loss 3.569299 (+0.88z)| norm 0.1851 (-0.46z)| lr 5.95e-03 | 2010.30 ms | 68.3% bf16 MFU | 262098 tok/s +step 2052/18794 | loss 3.568444 (+0.86z)| norm 0.2297 (+0.93z)| lr 5.95e-03 | 2000.30 ms | 68.6% bf16 MFU | 262098 tok/s +step 2053/18794 | loss 3.519753 (-0.60z)| norm 0.2638 (+1.94z)| lr 5.95e-03 | 1990.89 ms | 68.9% bf16 MFU | 262161 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.115133 +step 2054/18794 | loss 3.529827 (-0.28z)| norm 0.2719 (+2.12z)| lr 5.95e-03 | 2007.24 ms | 68.4% bf16 MFU | 262113 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.347523 +step 2055/18794 | loss 3.537058 (-0.04z)| norm 0.2822 (+2.35z)| lr 5.95e-03 | 1993.37 ms | 68.8% bf16 MFU | 262158 tok/s +step 2056/18794 | loss 3.458848 (-2.45z)| norm 0.2119 (+0.29z)| lr 5.95e-03 | 1995.23 ms | 68.8% bf16 MFU | 262188 tok/s +step 2057/18794 | loss 3.499814 (-1.18z)| norm 0.2110 (+0.25z)| lr 5.95e-03 | 2011.02 ms | 68.2% bf16 MFU | 262114 tok/s +step 2058/18794 | loss 3.500784 (-1.12z)| norm 0.1934 (-0.28z)| lr 5.95e-03 | 1978.30 ms | 69.4% bf16 MFU | 262260 tok/s +step 2059/18794 | loss 3.520369 (-0.51z)| norm 0.1851 (-0.51z)| lr 5.95e-03 | 1987.09 ms | 69.1% bf16 MFU | 262339 tok/s +step 2060/18794 | loss 3.556063 (+0.58z)| norm 0.2511 (+1.42z)| lr 5.95e-03 | 1995.14 ms | 68.8% bf16 MFU | 262361 tok/s +step 2061/18794 | loss 3.523637 (-0.41z)| norm 0.2362 (+0.97z)| lr 5.95e-03 | 1995.04 ms | 68.8% bf16 MFU | 262383 tok/s +step 2062/18794 | loss 3.550158 (+0.40z)| norm 0.2255 (+0.70z)| lr 5.95e-03 | 1983.32 ms | 69.2% bf16 MFU | 262481 tok/s +step 2063/18794 | loss 3.523122 (-0.46z)| norm 0.1797 (-0.67z)| lr 5.95e-03 | 2001.86 ms | 68.6% bf16 MFU | 262452 tok/s +step 2064/18794 | loss 3.496937 (-1.26z)| norm 0.1948 (-0.19z)| lr 5.95e-03 | 1995.07 ms | 68.8% bf16 MFU | 262469 tok/s +step 2065/18794 | loss 3.542196 (+0.16z)| norm 0.2291 (+0.89z)| lr 5.95e-03 | 1993.52 ms | 68.8% bf16 MFU | 262495 tok/s +step 2066/18794 | loss 3.593524 (+1.72z)| norm 0.1958 (-0.15z)| lr 5.95e-03 | 1985.74 ms | 69.1% bf16 MFU | 262572 tok/s +step 2067/18794 | loss 3.501961 (-1.08z)| norm 0.1875 (-0.42z)| lr 5.95e-03 | 1986.86 ms | 69.1% bf16 MFU | 262637 tok/s +step 2068/18794 | loss 3.541939 (+0.15z)| norm 0.1906 (-0.34z)| lr 5.95e-03 | 2002.44 ms | 68.5% bf16 MFU | 262597 tok/s +step 2069/18794 | loss 3.530886 (-0.20z)| norm 0.1793 (-0.69z)| lr 5.95e-03 | 2004.49 ms | 68.5% bf16 MFU | 262545 tok/s +step 2070/18794 | loss 3.541113 (+0.14z)| norm 0.1731 (-0.87z)| lr 5.95e-03 | 1983.47 ms | 69.2% bf16 MFU | 262634 tok/s +step 2071/18794 | loss 3.484990 (-1.58z)| norm 0.1511 (-1.54z)| lr 5.95e-03 | 1991.81 ms | 68.9% bf16 MFU | 262663 tok/s +step 2072/18794 | loss 3.527368 (-0.26z)| norm 0.1344 (-2.01z)| lr 5.95e-03 | 1995.82 ms | 68.8% bf16 MFU | 262665 tok/s +step 2073/18794 | loss 3.581009 (+1.37z)| norm 0.1404 (-1.80z)| lr 5.95e-03 | 1994.91 ms | 68.8% bf16 MFU | 262672 tok/s +step 2074/18794 | loss 3.497701 (-1.20z)| norm 0.1430 (-1.73z)| lr 5.95e-03 | 1987.11 ms | 69.1% bf16 MFU | 262731 tok/s +step 2075/18794 | loss 3.489300 (-1.43z)| norm 0.1651 (-1.06z)| lr 5.95e-03 | 1997.15 ms | 68.7% bf16 MFU | 262720 tok/s +step 2076/18794 | loss 3.554204 (+0.59z)| norm 0.2297 (+0.89z)| lr 5.95e-03 | 1984.24 ms | 69.2% bf16 MFU | 262795 tok/s +step 2077/18794 | loss 3.554359 (+0.59z)| norm 0.2511 (+1.51z)| lr 5.95e-03 | 1985.41 ms | 69.1% bf16 MFU | 262859 tok/s +step 2078/18794 | loss 3.481125 (-1.68z)| norm 0.2591 (+1.72z)| lr 5.95e-03 | 1994.09 ms | 68.8% bf16 MFU | 262862 tok/s +step 2079/18794 | loss 3.542214 (+0.23z)| norm 0.2435 (+1.22z)| lr 5.95e-03 | 1992.14 ms | 68.9% bf16 MFU | 262878 tok/s +step 2080/18794 | loss 3.513161 (-0.68z)| norm 0.2113 (+0.24z)| lr 5.95e-03 | 1983.95 ms | 69.2% bf16 MFU | 262947 tok/s +step 2081/18794 | loss 3.517410 (-0.54z)| norm 0.2371 (+1.00z)| lr 5.95e-03 | 1987.42 ms | 69.1% bf16 MFU | 262990 tok/s +step 2082/18794 | loss 3.482151 (-1.61z)| norm 0.2248 (+0.63z)| lr 5.95e-03 | 1979.19 ms | 69.3% bf16 MFU | 263086 tok/s +step 2083/18794 | loss 3.525374 (-0.28z)| norm 0.2321 (+0.86z)| lr 5.95e-03 | 1986.49 ms | 69.1% bf16 MFU | 263128 tok/s +step 2084/18794 | loss 3.469502 (-1.98z)| norm 0.2115 (+0.25z)| lr 5.95e-03 | 1988.64 ms | 69.0% bf16 MFU | 263153 tok/s +step 2085/18794 | loss 3.553759 (+0.63z)| norm 0.1739 (-0.89z)| lr 5.95e-03 | 1978.91 ms | 69.3% bf16 MFU | 263243 tok/s +step 2086/18794 | loss 3.499154 (-1.07z)| norm 0.1593 (-1.30z)| lr 5.95e-03 | 1990.86 ms | 68.9% bf16 MFU | 263248 tok/s +step 2087/18794 | loss 3.562363 (+0.91z)| norm 0.1786 (-0.70z)| lr 5.95e-03 | 1982.41 ms | 69.2% bf16 MFU | 263309 tok/s +step 2088/18794 | loss 3.500301 (-1.03z)| norm 0.1861 (-0.46z)| lr 5.95e-03 | 1986.27 ms | 69.1% bf16 MFU | 263341 tok/s +step 2089/18794 | loss 3.521224 (-0.36z)| norm 0.1802 (-0.63z)| lr 5.95e-03 | 1978.54 ms | 69.4% bf16 MFU | 263424 tok/s +step 2090/18794 | loss 3.476207 (-1.73z)| norm 0.1641 (-1.11z)| lr 5.95e-03 | 1985.72 ms | 69.1% bf16 MFU | 263454 tok/s +step 2091/18794 | loss 3.516501 (-0.47z)| norm 0.1869 (-0.39z)| lr 5.95e-03 | 1977.45 ms | 69.4% bf16 MFU | 263538 tok/s +step 2092/18794 | loss 3.500028 (-0.96z)| norm 0.1696 (-0.91z)| lr 5.95e-03 | 1985.40 ms | 69.1% bf16 MFU | 263565 tok/s +step 2093/18794 | loss 3.562844 (+0.98z)| norm 0.1690 (-0.92z)| lr 5.95e-03 | 1979.98 ms | 69.3% bf16 MFU | 263626 tok/s +step 2094/18794 | loss 3.513818 (-0.57z)| norm 0.1615 (-1.14z)| lr 5.95e-03 | 1979.51 ms | 69.3% bf16 MFU | 263688 tok/s +step 2095/18794 | loss 3.482769 (-1.51z)| norm 0.1640 (-1.06z)| lr 5.95e-03 | 1981.37 ms | 69.3% bf16 MFU | 263734 tok/s +step 2096/18794 | loss 3.501625 (-0.91z)| norm 0.1997 (+0.04z)| lr 5.95e-03 | 1979.68 ms | 69.3% bf16 MFU | 263789 tok/s +step 2097/18794 | loss 3.513865 (-0.51z)| norm 0.1806 (-0.54z)| lr 5.95e-03 | 1986.12 ms | 69.1% bf16 MFU | 263798 tok/s +step 2098/18794 | loss 3.493727 (-1.13z)| norm 0.1969 (+0.00z)| lr 5.95e-03 | 2030.30 ms | 67.6% bf16 MFU | 263520 tok/s +step 2099/18794 | loss 3.477580 (-1.61z)| norm 0.2145 (+0.58z)| lr 5.95e-03 | 2037.30 ms | 67.4% bf16 MFU | 263211 tok/s +step 2100/18794 | loss 3.486742 (-1.29z)| norm 0.2391 (+1.36z)| lr 5.95e-03 | 2033.65 ms | 67.5% bf16 MFU | 262941 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.245192 +step 2101/18794 | loss 3.487107 (-1.29z)| norm 0.2689 (+2.25z)| lr 5.95e-03 | 2035.48 ms | 67.4% bf16 MFU | 262672 tok/s +step 2102/18794 | loss 3.513204 (-0.46z)| norm 0.2387 (+1.27z)| lr 5.95e-03 | 2040.05 ms | 67.3% bf16 MFU | 262389 tok/s +step 2103/18794 | loss 3.509406 (-0.57z)| norm 0.2092 (+0.33z)| lr 5.95e-03 | 2025.02 ms | 67.8% bf16 MFU | 262214 tok/s +step 2104/18794 | loss 3.433298 (-2.82z)| norm 0.2038 (+0.15z)| lr 5.95e-03 | 2022.83 ms | 67.8% bf16 MFU | 262063 tok/s +step 2105/18794 | loss 3.511826 (-0.43z)| norm 0.1980 (-0.03z)| lr 5.95e-03 | 2038.62 ms | 67.3% bf16 MFU | 261819 tok/s +step 2106/18794 | loss 3.486912 (-1.16z)| norm 0.2411 (+1.30z)| lr 5.95e-03 | 2037.34 ms | 67.4% bf16 MFU | 261595 tok/s +step 2107/18794 | loss 3.511976 (-0.39z)| norm 0.2054 (+0.16z)| lr 5.95e-03 | 2031.99 ms | 67.5% bf16 MFU | 261416 tok/s +step 2108/18794 | loss 3.464326 (-1.81z)| norm 0.2155 (+0.47z)| lr 5.95e-03 | 2039.58 ms | 67.3% bf16 MFU | 261198 tok/s +step 2109/18794 | loss 3.409290 (-3.25z)| norm 0.2155 (+0.46z)| lr 5.95e-03 | 2039.03 ms | 67.3% bf16 MFU | 260994 tok/s +step 2110/18794 | loss 3.420513 (-2.78z)| norm 0.2289 (+0.87z)| lr 5.95e-03 | 2038.63 ms | 67.3% bf16 MFU | 260804 tok/s +step 2111/18794 | loss 3.503972 (-0.47z)| norm 0.2442 (+1.35z)| lr 5.95e-03 | 2035.79 ms | 67.4% bf16 MFU | 260640 tok/s +step 2112/18794 | loss 3.527968 (+0.20z)| norm 0.2585 (+1.76z)| lr 5.95e-03 | 2030.92 ms | 67.6% bf16 MFU | 260516 tok/s +step 2113/18794 | loss 3.520509 (-0.02z)| norm 0.2097 (+0.20z)| lr 5.95e-03 | 2039.03 ms | 67.3% bf16 MFU | 260346 tok/s +step 2114/18794 | loss 3.524073 (+0.08z)| norm 0.1982 (-0.16z)| lr 5.95e-03 | 2028.76 ms | 67.6% bf16 MFU | 260250 tok/s +step 2115/18794 | loss 3.457165 (-1.76z)| norm 0.1959 (-0.23z)| lr 5.95e-03 | 2041.03 ms | 67.2% bf16 MFU | 260081 tok/s +step 2116/18794 | loss 3.488592 (-0.88z)| norm 0.1990 (-0.13z)| lr 5.95e-03 | 2039.21 ms | 67.3% bf16 MFU | 259933 tok/s +step 2117/18794 | loss 3.550677 (+0.85z)| norm 0.1899 (-0.41z)| lr 5.95e-03 | 2026.09 ms | 67.7% bf16 MFU | 259874 tok/s +step 2118/18794 | loss 3.462719 (-1.59z)| norm 0.1945 (-0.25z)| lr 5.95e-03 | 2024.54 ms | 67.8% bf16 MFU | 259829 tok/s +step 2119/18794 | loss 3.485255 (-0.94z)| norm 0.1610 (-1.32z)| lr 5.95e-03 | 2023.78 ms | 67.8% bf16 MFU | 259791 tok/s +step 2120/18794 | loss 3.507661 (-0.32z)| norm 0.1744 (-0.88z)| lr 5.95e-03 | 2038.39 ms | 67.3% bf16 MFU | 259662 tok/s +step 2121/18794 | loss 3.554019 (+0.98z)| norm 0.2058 (+0.13z)| lr 5.95e-03 | 2019.52 ms | 68.0% bf16 MFU | 259659 tok/s +step 2122/18794 | loss 3.449870 (-1.88z)| norm 0.2203 (+0.60z)| lr 5.95e-03 | 2008.95 ms | 68.3% bf16 MFU | 259725 tok/s +step 2123/18794 | loss 3.460700 (-1.55z)| norm 0.1999 (-0.06z)| lr 5.95e-03 | 2036.09 ms | 67.4% bf16 MFU | 259613 tok/s +step 2124/18794 | loss 3.492936 (-0.65z)| norm 0.2018 (-0.00z)| lr 5.95e-03 | 2031.90 ms | 67.5% bf16 MFU | 259534 tok/s +step 2125/18794 | loss 3.466844 (-1.34z)| norm 0.1854 (-0.53z)| lr 5.95e-03 | 2028.85 ms | 67.6% bf16 MFU | 259478 tok/s +step 2126/18794 | loss 3.535074 (+0.53z)| norm 0.1830 (-0.60z)| lr 5.95e-03 | 2039.58 ms | 67.3% bf16 MFU | 259357 tok/s +step 2127/18794 | loss 3.461424 (-1.46z)| norm 0.1723 (-0.93z)| lr 5.95e-03 | 2028.28 ms | 67.7% bf16 MFU | 259314 tok/s +step 2128/18794 | loss 3.487353 (-0.74z)| norm 0.1609 (-1.28z)| lr 5.95e-03 | 2028.13 ms | 67.7% bf16 MFU | 259273 tok/s +step 2129/18794 | loss 3.394403 (-3.15z)| norm 0.1989 (-0.07z)| lr 5.95e-03 | 2020.76 ms | 67.9% bf16 MFU | 259282 tok/s +step 2130/18794 | loss 3.510921 (-0.02z)| norm 0.2007 (-0.01z)| lr 5.95e-03 | 2024.72 ms | 67.8% bf16 MFU | 259265 tok/s +step 2131/18794 | loss 3.438031 (-1.94z)| norm 0.1909 (-0.31z)| lr 5.95e-03 | 2028.57 ms | 67.6% bf16 MFU | 259225 tok/s +step 2132/18794 | loss 3.443010 (-1.76z)| norm 0.2188 (+0.58z)| lr 5.95e-03 | 2015.74 ms | 68.1% bf16 MFU | 259268 tok/s +step 2133/18794 | loss 3.503637 (-0.16z)| norm 0.2286 (+0.88z)| lr 5.95e-03 | 2016.30 ms | 68.1% bf16 MFU | 259306 tok/s +step 2134/18794 | loss 3.481415 (-0.73z)| norm 0.2061 (+0.16z)| lr 5.95e-03 | 2026.12 ms | 67.7% bf16 MFU | 259279 tok/s +step 2135/18794 | loss 3.470531 (-1.00z)| norm 0.2167 (+0.48z)| lr 5.95e-03 | 2025.55 ms | 67.8% bf16 MFU | 259257 tok/s +step 2136/18794 | loss 3.451792 (-1.47z)| norm 0.2207 (+0.60z)| lr 5.95e-03 | 2015.63 ms | 68.1% bf16 MFU | 259300 tok/s +step 2137/18794 | loss 3.461414 (-1.19z)| norm 0.2463 (+1.41z)| lr 5.95e-03 | 2001.93 ms | 68.5% bf16 MFU | 259429 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.457960 +step 2138/18794 | loss 3.465145 (-1.07z)| norm 0.2809 (+2.46z)| lr 5.95e-03 | 2010.34 ms | 68.3% bf16 MFU | 259498 tok/s +step 2139/18794 | loss 3.480325 (-0.66z)| norm 0.2580 (+1.67z)| lr 5.95e-03 | 2008.10 ms | 68.3% bf16 MFU | 259577 tok/s +step 2140/18794 | loss 3.521042 (+0.41z)| norm 0.2302 (+0.78z)| lr 5.95e-03 | 2027.17 ms | 67.7% bf16 MFU | 259530 tok/s +step 2141/18794 | loss 3.526895 (+0.57z)| norm 0.2074 (+0.06z)| lr 5.95e-03 | 2021.45 ms | 67.9% bf16 MFU | 259521 tok/s +step 2142/18794 | loss 3.451218 (-1.40z)| norm 0.1932 (-0.39z)| lr 5.95e-03 | 2008.78 ms | 68.3% bf16 MFU | 259595 tok/s +step 2143/18794 | loss 3.464837 (-1.03z)| norm 0.2038 (-0.05z)| lr 5.95e-03 | 2024.70 ms | 67.8% bf16 MFU | 259563 tok/s +step 2144/18794 | loss 3.461576 (-1.09z)| norm 0.2062 (+0.03z)| lr 5.95e-03 | 2022.68 ms | 67.8% bf16 MFU | 259545 tok/s +step 2145/18794 | loss 3.464748 (-0.99z)| norm 0.1986 (-0.20z)| lr 5.95e-03 | 2016.69 ms | 68.0% bf16 MFU | 259566 tok/s +step 2146/18794 | loss 3.500149 (-0.05z)| norm 0.1900 (-0.48z)| lr 5.95e-03 | 2017.32 ms | 68.0% bf16 MFU | 259583 tok/s +step 2147/18794 | loss 3.453150 (-1.27z)| norm 0.1599 (-1.42z)| lr 5.94e-03 | 2010.70 ms | 68.3% bf16 MFU | 259641 tok/s +step 2148/18794 | loss 3.527862 (+0.70z)| norm 0.1600 (-1.38z)| lr 5.94e-03 | 2001.91 ms | 68.6% bf16 MFU | 259754 tok/s +step 2149/18794 | loss 3.498848 (-0.07z)| norm 0.1983 (-0.18z)| lr 5.94e-03 | 2002.39 ms | 68.5% bf16 MFU | 259858 tok/s +step 2150/18794 | loss 3.493486 (-0.21z)| norm 0.1894 (-0.46z)| lr 5.94e-03 | 2016.87 ms | 68.0% bf16 MFU | 259862 tok/s +step 2151/18794 | loss 3.485408 (-0.41z)| norm 0.1681 (-1.13z)| lr 5.94e-03 | 2017.77 ms | 68.0% bf16 MFU | 259861 tok/s +step 2152/18794 | loss 3.478922 (-0.57z)| norm 0.1735 (-0.94z)| lr 5.94e-03 | 2017.75 ms | 68.0% bf16 MFU | 259860 tok/s +step 2153/18794 | loss 3.508851 (+0.25z)| norm 0.2161 (+0.42z)| lr 5.94e-03 | 2020.57 ms | 67.9% bf16 MFU | 259840 tok/s +step 2154/18794 | loss 3.458653 (-1.09z)| norm 0.2589 (+1.80z)| lr 5.94e-03 | 2002.86 ms | 68.5% bf16 MFU | 259937 tok/s +step 2155/18794 | loss 3.481111 (-0.47z)| norm 0.2409 (+1.27z)| lr 5.94e-03 | 2023.31 ms | 67.8% bf16 MFU | 259896 tok/s +step 2156/18794 | loss 3.500657 (+0.05z)| norm 0.1902 (-0.40z)| lr 5.94e-03 | 2008.89 ms | 68.3% bf16 MFU | 259951 tok/s +step 2157/18794 | loss 3.480927 (-0.48z)| norm 0.2271 (+0.81z)| lr 5.94e-03 | 2004.32 ms | 68.5% bf16 MFU | 260032 tok/s +step 2158/18794 | loss 3.511965 (+0.37z)| norm 0.2475 (+1.45z)| lr 5.94e-03 | 2026.27 ms | 67.7% bf16 MFU | 259968 tok/s +step 2159/18794 | loss 3.535089 (+1.00z)| norm 0.2243 (+0.68z)| lr 5.94e-03 | 2010.82 ms | 68.2% bf16 MFU | 260006 tok/s +step 2160/18794 | loss 3.487590 (-0.29z)| norm 0.1382 (-2.06z)| lr 5.94e-03 | 2023.65 ms | 67.8% bf16 MFU | 259960 tok/s +step 2161/18794 | loss 3.447175 (-1.38z)| norm 0.1307 (-2.23z)| lr 5.94e-03 | 2023.90 ms | 67.8% bf16 MFU | 259914 tok/s +step 2162/18794 | loss 3.504947 (+0.23z)| norm 0.1395 (-1.89z)| lr 5.94e-03 | 2017.75 ms | 68.0% bf16 MFU | 259910 tok/s +step 2163/18794 | loss 3.531788 (+0.97z)| norm 0.1885 (-0.37z)| lr 5.94e-03 | 2008.05 ms | 68.3% bf16 MFU | 259969 tok/s +step 2164/18794 | loss 3.501375 (+0.12z)| norm 0.2266 (+0.80z)| lr 5.94e-03 | 2009.27 ms | 68.3% bf16 MFU | 260018 tok/s +step 2165/18794 | loss 3.444641 (-1.42z)| norm 0.2417 (+1.26z)| lr 5.94e-03 | 2008.63 ms | 68.3% bf16 MFU | 260068 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.034126 +step 2166/18794 | loss 3.513834 (+0.54z)| norm 0.2687 (+2.03z)| lr 5.94e-03 | 1996.76 ms | 68.7% bf16 MFU | 260193 tok/s +step 2167/18794 | loss 3.456408 (-1.10z)| norm 0.1633 (-1.14z)| lr 5.94e-03 | 2014.45 ms | 68.1% bf16 MFU | 260196 tok/s +step 2168/18794 | loss 3.488728 (-0.16z)| norm 0.1594 (-1.24z)| lr 5.94e-03 | 2011.77 ms | 68.2% bf16 MFU | 260217 tok/s +step 2169/18794 | loss 3.426785 (-1.89z)| norm 0.1475 (-1.57z)| lr 5.94e-03 | 2010.31 ms | 68.3% bf16 MFU | 260246 tok/s +step 2170/18794 | loss 3.448609 (-1.24z)| norm 0.1595 (-1.21z)| lr 5.94e-03 | 2003.56 ms | 68.5% bf16 MFU | 260318 tok/s +step 2171/18794 | loss 3.554231 (+1.72z)| norm 0.2113 (+0.30z)| lr 5.94e-03 | 2016.85 ms | 68.0% bf16 MFU | 260300 tok/s +step 2172/18794 | loss 3.463601 (-0.81z)| norm 0.2375 (+1.07z)| lr 5.94e-03 | 2001.84 ms | 68.6% bf16 MFU | 260380 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.258474 +step 2173/18794 | loss 3.541991 (+1.44z)| norm 0.2790 (+2.26z)| lr 5.94e-03 | 2017.92 ms | 68.0% bf16 MFU | 260351 tok/s +step 2174/18794 | loss 3.480931 (-0.31z)| norm 0.2442 (+1.20z)| lr 5.94e-03 | 2016.79 ms | 68.0% bf16 MFU | 260332 tok/s +step 2175/18794 | loss 3.449266 (-1.20z)| norm 0.2308 (+0.78z)| lr 5.94e-03 | 2003.81 ms | 68.5% bf16 MFU | 260398 tok/s +step 2176/18794 | loss 3.426282 (-1.82z)| norm 0.2149 (+0.30z)| lr 5.94e-03 | 2008.30 ms | 68.3% bf16 MFU | 260431 tok/s +step 2177/18794 | loss 3.448014 (-1.18z)| norm 0.2312 (+0.81z)| lr 5.94e-03 | 2011.16 ms | 68.2% bf16 MFU | 260444 tok/s +step 2178/18794 | loss 3.430158 (-1.66z)| norm 0.2037 (-0.02z)| lr 5.94e-03 | 2020.91 ms | 67.9% bf16 MFU | 260393 tok/s +step 2179/18794 | loss 3.480014 (-0.23z)| norm 0.2001 (-0.12z)| lr 5.94e-03 | 2004.02 ms | 68.5% bf16 MFU | 260454 tok/s +step 2180/18794 | loss 3.435905 (-1.46z)| norm 0.1882 (-0.48z)| lr 5.94e-03 | 2005.22 ms | 68.4% bf16 MFU | 260505 tok/s +step 2181/18794 | loss 3.475475 (-0.32z)| norm 0.1472 (-1.72z)| lr 5.94e-03 | 2009.03 ms | 68.3% bf16 MFU | 260528 tok/s +step 2182/18794 | loss 3.464767 (-0.62z)| norm 0.1404 (-1.88z)| lr 5.94e-03 | 2003.83 ms | 68.5% bf16 MFU | 260584 tok/s +step 2183/18794 | loss 3.463961 (-0.63z)| norm 0.1821 (-0.59z)| lr 5.94e-03 | 2005.48 ms | 68.4% bf16 MFU | 260626 tok/s +step 2184/18794 | loss 3.461790 (-0.69z)| norm 0.1774 (-0.72z)| lr 5.94e-03 | 2019.28 ms | 68.0% bf16 MFU | 260577 tok/s +step 2185/18794 | loss 3.467282 (-0.52z)| norm 0.1578 (-1.31z)| lr 5.94e-03 | 2010.19 ms | 68.3% bf16 MFU | 260589 tok/s +step 2186/18794 | loss 3.453086 (-0.91z)| norm 0.1517 (-1.49z)| lr 5.94e-03 | 2009.01 ms | 68.3% bf16 MFU | 260608 tok/s +step 2187/18794 | loss 3.462675 (-0.62z)| norm 0.1315 (-2.04z)| lr 5.94e-03 | 2017.68 ms | 68.0% bf16 MFU | 260570 tok/s +step 2188/18794 | loss 3.430086 (-1.55z)| norm 0.1545 (-1.34z)| lr 5.94e-03 | 2015.64 ms | 68.1% bf16 MFU | 260547 tok/s +step 2189/18794 | loss 3.417032 (-1.88z)| norm 0.1727 (-0.80z)| lr 5.94e-03 | 2013.48 ms | 68.2% bf16 MFU | 260539 tok/s +step 2190/18794 | loss 3.528211 (+1.33z)| norm 0.1727 (-0.81z)| lr 5.94e-03 | 2010.61 ms | 68.3% bf16 MFU | 260550 tok/s +step 2191/18794 | loss 3.478789 (-0.09z)| norm 0.1489 (-1.48z)| lr 5.94e-03 | 2007.95 ms | 68.3% bf16 MFU | 260578 tok/s +step 2192/18794 | loss 3.433599 (-1.37z)| norm 0.1700 (-0.87z)| lr 5.94e-03 | 2009.82 ms | 68.3% bf16 MFU | 260592 tok/s +step 2193/18794 | loss 3.474517 (-0.17z)| norm 0.1629 (-1.07z)| lr 5.94e-03 | 2019.44 ms | 68.0% bf16 MFU | 260543 tok/s +step 2194/18794 | loss 3.442742 (-1.09z)| norm 0.1650 (-1.01z)| lr 5.94e-03 | 2001.50 ms | 68.6% bf16 MFU | 260613 tok/s +step 2195/18794 | loss 3.426450 (-1.54z)| norm 0.1764 (-0.68z)| lr 5.94e-03 | 2007.75 ms | 68.4% bf16 MFU | 260639 tok/s +step 2196/18794 | loss 3.519016 (+1.16z)| norm 0.1800 (-0.57z)| lr 5.94e-03 | 2009.76 ms | 68.3% bf16 MFU | 260651 tok/s +step 2197/18794 | loss 3.478526 (-0.01z)| norm 0.1625 (-1.07z)| lr 5.94e-03 | 2005.65 ms | 68.4% bf16 MFU | 260689 tok/s +step 2198/18794 | loss 3.462609 (-0.47z)| norm 0.2052 (+0.17z)| lr 5.94e-03 | 2019.50 ms | 68.0% bf16 MFU | 260635 tok/s +step 2199/18794 | loss 3.428746 (-1.43z)| norm 0.2130 (+0.39z)| lr 5.94e-03 | 2005.73 ms | 68.4% bf16 MFU | 260673 tok/s +step 2200/18794 | loss 3.475625 (-0.07z)| norm 0.2077 (+0.25z)| lr 5.94e-03 | 2005.13 ms | 68.4% bf16 MFU | 260713 tok/s +step 2201/18794 | loss 3.502802 (+0.71z)| norm 0.2202 (+0.64z)| lr 5.94e-03 | 2008.20 ms | 68.3% bf16 MFU | 260731 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.214689 +step 2202/18794 | loss 3.488333 (+0.30z)| norm 0.2747 (+2.21z)| lr 5.94e-03 | 2001.78 ms | 68.6% bf16 MFU | 260790 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.095741 +step 2203/18794 | loss 3.520378 (+1.23z)| norm 0.2728 (+2.10z)| lr 5.94e-03 | 2009.26 ms | 68.3% bf16 MFU | 260797 tok/s +step 2204/18794 | loss 3.441119 (-1.07z)| norm 0.2328 (+0.94z)| lr 5.94e-03 | 2010.35 ms | 68.3% bf16 MFU | 260797 tok/s +step 2205/18794 | loss 3.459105 (-0.54z)| norm 0.2228 (+0.64z)| lr 5.94e-03 | 2001.33 ms | 68.6% bf16 MFU | 260856 tok/s +step 2206/18794 | loss 3.466602 (-0.31z)| norm 0.2274 (+0.78z)| lr 5.94e-03 | 2008.93 ms | 68.3% bf16 MFU | 260862 tok/s +step 2207/18794 | loss 3.478997 (+0.06z)| norm 0.1981 (-0.05z)| lr 5.94e-03 | 2002.14 ms | 68.5% bf16 MFU | 260912 tok/s +step 2208/18794 | loss 3.413286 (-1.82z)| norm 0.1663 (-0.94z)| lr 5.94e-03 | 1992.39 ms | 68.9% bf16 MFU | 261024 tok/s +step 2209/18794 | loss 3.445327 (-0.92z)| norm 0.1811 (-0.51z)| lr 5.94e-03 | 1990.45 ms | 68.9% bf16 MFU | 261143 tok/s +step 2210/18794 | loss 3.481579 (+0.12z)| norm 0.2113 (+0.35z)| lr 5.94e-03 | 2008.73 ms | 68.3% bf16 MFU | 261136 tok/s +step 2211/18794 | loss 3.425172 (-1.51z)| norm 0.2255 (+0.77z)| lr 5.94e-03 | 1993.36 ms | 68.8% bf16 MFU | 261230 tok/s +step 2212/18794 | loss 3.420946 (-1.60z)| norm 0.2382 (+1.15z)| lr 5.94e-03 | 2001.29 ms | 68.6% bf16 MFU | 261267 tok/s +step 2213/18794 | loss 3.421044 (-1.56z)| norm 0.2073 (+0.25z)| lr 5.94e-03 | 2003.33 ms | 68.5% bf16 MFU | 261289 tok/s +step 2214/18794 | loss 3.452934 (-0.62z)| norm 0.1747 (-0.68z)| lr 5.94e-03 | 1994.98 ms | 68.8% bf16 MFU | 261365 tok/s +step 2215/18794 | loss 3.496320 (+0.65z)| norm 0.2219 (+0.67z)| lr 5.94e-03 | 2000.66 ms | 68.6% bf16 MFU | 261399 tok/s +step 2216/18794 | loss 3.424569 (-1.43z)| norm 0.1838 (-0.42z)| lr 5.94e-03 | 1999.62 ms | 68.6% bf16 MFU | 261439 tok/s +step 2217/18794 | loss 3.473137 (+0.01z)| norm 0.1545 (-1.24z)| lr 5.94e-03 | 2009.60 ms | 68.3% bf16 MFU | 261412 tok/s +step 2218/18794 | loss 3.504770 (+0.94z)| norm 0.1617 (-1.02z)| lr 5.94e-03 | 2014.93 ms | 68.1% bf16 MFU | 261351 tok/s +step 2219/18794 | loss 3.433797 (-1.15z)| norm 0.1807 (-0.49z)| lr 5.94e-03 | 1994.23 ms | 68.8% bf16 MFU | 261429 tok/s +step 2220/18794 | loss 3.411915 (-1.76z)| norm 0.1905 (-0.22z)| lr 5.94e-03 | 2002.56 ms | 68.5% bf16 MFU | 261448 tok/s +step 2221/18794 | loss 3.439211 (-0.95z)| norm 0.1895 (-0.24z)| lr 5.94e-03 | 1994.17 ms | 68.8% bf16 MFU | 261521 tok/s +step 2222/18794 | loss 3.485979 (+0.45z)| norm 0.1520 (-1.28z)| lr 5.94e-03 | 1989.05 ms | 69.0% bf16 MFU | 261624 tok/s +step 2223/18794 | loss 3.468935 (-0.07z)| norm 0.1688 (-0.80z)| lr 5.94e-03 | 2007.74 ms | 68.4% bf16 MFU | 261600 tok/s +step 2224/18794 | loss 3.431380 (-1.17z)| norm 0.2276 (+0.86z)| lr 5.94e-03 | 2015.68 ms | 68.1% bf16 MFU | 261525 tok/s +step 2225/18794 | loss 3.443456 (-0.80z)| norm 0.2701 (+1.99z)| lr 5.94e-03 | 1995.33 ms | 68.8% bf16 MFU | 261587 tok/s +step 2226/18794 | loss 3.457075 (-0.38z)| norm 0.2439 (+1.24z)| lr 5.94e-03 | 1992.82 ms | 68.9% bf16 MFU | 261662 tok/s +step 2227/18794 | loss 3.438143 (-0.94z)| norm 0.1912 (-0.21z)| lr 5.94e-03 | 2004.62 ms | 68.5% bf16 MFU | 261656 tok/s +step 2228/18794 | loss 3.481996 (+0.39z)| norm 0.2139 (+0.40z)| lr 5.94e-03 | 1993.77 ms | 68.8% bf16 MFU | 261721 tok/s +step 2229/18794 | loss 3.428809 (-1.26z)| norm 0.2172 (+0.49z)| lr 5.94e-03 | 1997.68 ms | 68.7% bf16 MFU | 261757 tok/s +step 2230/18794 | loss 3.467141 (-0.06z)| norm 0.1877 (-0.32z)| lr 5.94e-03 | 2003.30 ms | 68.5% bf16 MFU | 261755 tok/s +step 2231/18794 | loss 3.396277 (-2.21z)| norm 0.1746 (-0.68z)| lr 5.94e-03 | 2007.29 ms | 68.4% bf16 MFU | 261727 tok/s +step 2232/18794 | loss 3.477360 (+0.25z)| norm 0.2176 (+0.51z)| lr 5.94e-03 | 2010.06 ms | 68.3% bf16 MFU | 261682 tok/s +step 2233/18794 | loss 3.466425 (-0.07z)| norm 0.2304 (+0.86z)| lr 5.94e-03 | 2001.29 ms | 68.6% bf16 MFU | 261697 tok/s +step 2234/18794 | loss 3.463812 (-0.14z)| norm 0.2044 (+0.14z)| lr 5.94e-03 | 2017.66 ms | 68.0% bf16 MFU | 261604 tok/s +step 2235/18794 | loss 3.495530 (+0.82z)| norm 0.2100 (+0.30z)| lr 5.94e-03 | 2007.18 ms | 68.4% bf16 MFU | 261584 tok/s +step 2236/18794 | loss 3.506894 (+1.15z)| norm 0.2294 (+0.83z)| lr 5.94e-03 | 1996.42 ms | 68.7% bf16 MFU | 261636 tok/s +step 2237/18794 | loss 3.453454 (-0.48z)| norm 0.2433 (+1.21z)| lr 5.94e-03 | 1991.66 ms | 68.9% bf16 MFU | 261716 tok/s +step 2238/18794 | loss 3.527461 (+1.73z)| norm 0.2082 (+0.27z)| lr 5.94e-03 | 1995.78 ms | 68.8% bf16 MFU | 261765 tok/s +step 2239/18794 | loss 3.440907 (-0.85z)| norm 0.1912 (-0.19z)| lr 5.94e-03 | 2002.63 ms | 68.5% bf16 MFU | 261767 tok/s +step 2240/18794 | loss 3.470884 (+0.06z)| norm 0.2090 (+0.33z)| lr 5.94e-03 | 2002.43 ms | 68.5% bf16 MFU | 261770 tok/s +step 2241/18794 | loss 3.516747 (+1.46z)| norm 0.1994 (+0.05z)| lr 5.94e-03 | 2005.64 ms | 68.4% bf16 MFU | 261752 tok/s +step 2242/18794 | loss 3.490275 (+0.64z)| norm 0.1581 (-1.12z)| lr 5.94e-03 | 1999.82 ms | 68.6% bf16 MFU | 261773 tok/s +step 2243/18794 | loss 3.445398 (-0.72z)| norm 0.1602 (-1.04z)| lr 5.94e-03 | 1978.68 ms | 69.4% bf16 MFU | 261932 tok/s +step 2244/18794 | loss 3.420821 (-1.44z)| norm 0.1583 (-1.08z)| lr 5.94e-03 | 1986.85 ms | 69.1% bf16 MFU | 262030 tok/s +step 2245/18794 | loss 3.476490 (+0.23z)| norm 0.1520 (-1.23z)| lr 5.94e-03 | 1983.43 ms | 69.2% bf16 MFU | 262145 tok/s +step 2246/18794 | loss 3.435527 (-0.98z)| norm 0.1833 (-0.35z)| lr 5.94e-03 | 2017.47 ms | 68.0% bf16 MFU | 262031 tok/s +step 2247/18794 | loss 3.434299 (-1.01z)| norm 0.2178 (+0.61z)| lr 5.93e-03 | 1993.70 ms | 68.8% bf16 MFU | 262078 tok/s +step 2248/18794 | loss 3.421788 (-1.36z)| norm 0.2042 (+0.21z)| lr 5.93e-03 | 1982.81 ms | 69.2% bf16 MFU | 262195 tok/s +step 2249/18794 | loss 3.481359 (+0.44z)| norm 0.1987 (+0.05z)| lr 5.93e-03 | 1983.36 ms | 69.2% bf16 MFU | 262303 tok/s +step 2250/18794 | loss 3.450913 (-0.47z)| norm 0.1687 (-0.79z)| lr 5.93e-03 | 1988.08 ms | 69.0% bf16 MFU | 262373 tok/s +val loss 3.484648 +HellaSwag: 2632/10042 = 0.262099: 0/1256 +step 2251/18794 | loss 3.405246 (-1.80z)| norm 0.1925 (-0.12z)| lr 5.93e-03 | 2005.69 ms | 68.4% bf16 MFU | 262325 tok/s +step 2252/18794 | loss 3.474108 (+0.26z)| norm 0.1889 (-0.23z)| lr 5.93e-03 | 1994.22 ms | 68.8% bf16 MFU | 262354 tok/s +step 2253/18794 | loss 3.456680 (-0.25z)| norm 0.1572 (-1.11z)| lr 5.93e-03 | 1981.07 ms | 69.3% bf16 MFU | 262468 tok/s +step 2254/18794 | loss 3.436223 (-0.85z)| norm 0.1727 (-0.66z)| lr 5.93e-03 | 2003.95 ms | 68.5% bf16 MFU | 262426 tok/s +step 2255/18794 | loss 3.446019 (-0.55z)| norm 0.1739 (-0.61z)| lr 5.93e-03 | 1979.48 ms | 69.3% bf16 MFU | 262548 tok/s +step 2256/18794 | loss 3.471624 (+0.23z)| norm 0.1914 (-0.10z)| lr 5.93e-03 | 2002.02 ms | 68.5% bf16 MFU | 262515 tok/s +step 2257/18794 | loss 3.450914 (-0.39z)| norm 0.2302 (+1.02z)| lr 5.93e-03 | 1984.22 ms | 69.2% bf16 MFU | 262600 tok/s +step 2258/18794 | loss 3.476251 (+0.39z)| norm 0.2293 (+1.00z)| lr 5.93e-03 | 1982.25 ms | 69.2% bf16 MFU | 262695 tok/s +step 2259/18794 | loss 3.430343 (-0.99z)| norm 0.2032 (+0.25z)| lr 5.93e-03 | 1995.57 ms | 68.8% bf16 MFU | 262697 tok/s +step 2260/18794 | loss 3.469906 (+0.24z)| norm 0.2276 (+0.95z)| lr 5.93e-03 | 2002.69 ms | 68.5% bf16 MFU | 262651 tok/s +step 2261/18794 | loss 3.451100 (-0.35z)| norm 0.1794 (-0.49z)| lr 5.93e-03 | 1992.47 ms | 68.9% bf16 MFU | 262676 tok/s +step 2262/18794 | loss 3.454057 (-0.24z)| norm 0.2155 (+0.57z)| lr 5.93e-03 | 1986.39 ms | 69.1% bf16 MFU | 262739 tok/s +step 2263/18794 | loss 3.450182 (-0.34z)| norm 0.2080 (+0.34z)| lr 5.93e-03 | 1993.42 ms | 68.8% bf16 MFU | 262752 tok/s +step 2264/18794 | loss 3.414447 (-1.47z)| norm 0.1643 (-0.97z)| lr 5.93e-03 | 1987.64 ms | 69.0% bf16 MFU | 262803 tok/s +step 2265/18794 | loss 3.447865 (-0.39z)| norm 0.1684 (-0.82z)| lr 5.93e-03 | 1988.14 ms | 69.0% bf16 MFU | 262849 tok/s +step 2266/18794 | loss 3.506098 (+1.49z)| norm 0.1627 (-0.99z)| lr 5.93e-03 | 1986.53 ms | 69.1% bf16 MFU | 262902 tok/s +step 2267/18794 | loss 3.510060 (+1.58z)| norm 0.1387 (-1.71z)| lr 5.93e-03 | 1982.31 ms | 69.2% bf16 MFU | 262981 tok/s +step 2268/18794 | loss 3.373747 (-2.65z)| norm 0.1564 (-1.16z)| lr 5.93e-03 | 1997.75 ms | 68.7% bf16 MFU | 262954 tok/s +step 2269/18794 | loss 3.434691 (-0.77z)| norm 0.1689 (-0.79z)| lr 5.93e-03 | 1990.15 ms | 69.0% bf16 MFU | 262979 tok/s +step 2270/18794 | loss 3.465133 (+0.17z)| norm 0.2265 (+0.98z)| lr 5.93e-03 | 1983.32 ms | 69.2% bf16 MFU | 263047 tok/s +step 2271/18794 | loss 3.467480 (+0.28z)| norm 0.2391 (+1.35z)| lr 5.93e-03 | 1985.30 ms | 69.1% bf16 MFU | 263099 tok/s +step 2272/18794 | loss 3.512661 (+1.71z)| norm 0.2045 (+0.29z)| lr 5.93e-03 | 1991.98 ms | 68.9% bf16 MFU | 263104 tok/s +step 2273/18794 | loss 3.430786 (-0.90z)| norm 0.1877 (-0.21z)| lr 5.93e-03 | 1986.28 ms | 69.1% bf16 MFU | 263147 tok/s +step 2274/18794 | loss 3.447890 (-0.33z)| norm 0.1960 (+0.08z)| lr 5.93e-03 | 1981.29 ms | 69.3% bf16 MFU | 263220 tok/s +step 2275/18794 | loss 3.445299 (-0.41z)| norm 0.1915 (-0.06z)| lr 5.93e-03 | 1979.66 ms | 69.3% bf16 MFU | 263301 tok/s +step 2276/18794 | loss 3.526670 (+2.20z)| norm 0.2049 (+0.39z)| lr 5.93e-03 | 1992.40 ms | 68.9% bf16 MFU | 263293 tok/s +step 2277/18794 | loss 3.473746 (+0.48z)| norm 0.1971 (+0.15z)| lr 5.93e-03 | 1976.82 ms | 69.4% bf16 MFU | 263389 tok/s +step 2278/18794 | loss 3.467117 (+0.25z)| norm 0.1945 (+0.06z)| lr 5.93e-03 | 1988.75 ms | 69.0% bf16 MFU | 263401 tok/s +step 2279/18794 | loss 3.417232 (-1.34z)| norm 0.1940 (+0.05z)| lr 5.93e-03 | 2005.69 ms | 68.4% bf16 MFU | 263301 tok/s +step 2280/18794 | loss 3.452336 (-0.21z)| norm 0.1881 (-0.15z)| lr 5.93e-03 | 1976.63 ms | 69.4% bf16 MFU | 263398 tok/s +step 2281/18794 | loss 3.436619 (-0.71z)| norm 0.2005 (+0.25z)| lr 5.93e-03 | 1983.44 ms | 69.2% bf16 MFU | 263445 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.423336 +step 2282/18794 | loss 3.433969 (-0.78z)| norm 0.2678 (+2.42z)| lr 5.93e-03 | 1978.90 ms | 69.3% bf16 MFU | 263520 tok/s +step 2283/18794 | loss 3.468244 (+0.32z)| norm 0.2569 (+2.00z)| lr 5.93e-03 | 1978.46 ms | 69.4% bf16 MFU | 263594 tok/s +step 2284/18794 | loss 3.446637 (-0.37z)| norm 0.2215 (+0.84z)| lr 5.93e-03 | 1979.29 ms | 69.3% bf16 MFU | 263658 tok/s +step 2285/18794 | loss 3.467901 (+0.32z)| norm 0.2589 (+1.99z)| lr 5.93e-03 | 1983.26 ms | 69.2% bf16 MFU | 263693 tok/s +step 2286/18794 | loss 3.449313 (-0.28z)| norm 0.2097 (+0.40z)| lr 5.93e-03 | 1983.57 ms | 69.2% bf16 MFU | 263724 tok/s +step 2287/18794 | loss 3.467447 (+0.30z)| norm 0.2171 (+0.63z)| lr 5.93e-03 | 1976.66 ms | 69.4% bf16 MFU | 263800 tok/s +step 2288/18794 | loss 3.421519 (-1.17z)| norm 0.1963 (-0.07z)| lr 5.93e-03 | 2005.15 ms | 68.4% bf16 MFU | 263684 tok/s +step 2289/18794 | loss 3.519300 (+1.92z)| norm 0.1723 (-0.87z)| lr 5.93e-03 | 2033.89 ms | 67.5% bf16 MFU | 263388 tok/s +step 2290/18794 | loss 3.471565 (+0.43z)| norm 0.1877 (-0.36z)| lr 5.93e-03 | 2039.55 ms | 67.3% bf16 MFU | 263072 tok/s +step 2291/18794 | loss 3.470002 (+0.38z)| norm 0.1820 (-0.57z)| lr 5.93e-03 | 2035.89 ms | 67.4% bf16 MFU | 262794 tok/s +step 2292/18794 | loss 3.466767 (+0.26z)| norm 0.1839 (-0.51z)| lr 5.93e-03 | 2033.68 ms | 67.5% bf16 MFU | 262545 tok/s +step 2293/18794 | loss 3.484140 (+0.83z)| norm 0.1699 (-0.99z)| lr 5.93e-03 | 2041.73 ms | 67.2% bf16 MFU | 262257 tok/s +step 2294/18794 | loss 3.497052 (+1.22z)| norm 0.1873 (-0.41z)| lr 5.93e-03 | 2038.27 ms | 67.3% bf16 MFU | 262005 tok/s +step 2295/18794 | loss 3.484184 (+0.79z)| norm 0.1629 (-1.24z)| lr 5.93e-03 | 2029.86 ms | 67.6% bf16 MFU | 261819 tok/s +step 2296/18794 | loss 3.498296 (+1.27z)| norm 0.1570 (-1.42z)| lr 5.93e-03 | 2038.61 ms | 67.3% bf16 MFU | 261587 tok/s +step 2297/18794 | loss 3.479449 (+0.65z)| norm 0.1412 (-1.92z)| lr 5.93e-03 | 2037.90 ms | 67.3% bf16 MFU | 261371 tok/s +step 2298/18794 | loss 3.531839 (+2.29z)| norm 0.1727 (-0.86z)| lr 5.93e-03 | 2032.56 ms | 67.5% bf16 MFU | 261200 tok/s +step 2299/18794 | loss 3.479827 (+0.61z)| norm 0.2081 (+0.32z)| lr 5.93e-03 | 2038.76 ms | 67.3% bf16 MFU | 260998 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.909398 +step 2300/18794 | loss 3.501518 (+1.29z)| norm 0.2906 (+2.91z)| lr 5.93e-03 | 2026.11 ms | 67.7% bf16 MFU | 260886 tok/s +step 2301/18794 | loss 3.524205 (+1.98z)| norm 0.2610 (+1.92z)| lr 5.93e-03 | 2023.57 ms | 67.8% bf16 MFU | 260796 tok/s +step 2302/18794 | loss 3.519466 (+1.80z)| norm 0.2021 (+0.10z)| lr 5.93e-03 | 2033.14 ms | 67.5% bf16 MFU | 260650 tok/s +step 2303/18794 | loss 3.461597 (+0.02z)| norm 0.2081 (+0.32z)| lr 5.93e-03 | 2022.80 ms | 67.8% bf16 MFU | 260577 tok/s +step 2304/18794 | loss 3.465099 (+0.12z)| norm 0.1915 (-0.21z)| lr 5.93e-03 | 2034.34 ms | 67.5% bf16 MFU | 260434 tok/s +step 2305/18794 | loss 3.484737 (+0.73z)| norm 0.1501 (-1.56z)| lr 5.93e-03 | 2030.25 ms | 67.6% bf16 MFU | 260324 tok/s +step 2306/18794 | loss 3.447362 (-0.44z)| norm 0.1631 (-1.10z)| lr 5.93e-03 | 2030.04 ms | 67.6% bf16 MFU | 260221 tok/s +step 2307/18794 | loss 3.448461 (-0.40z)| norm 0.1553 (-1.34z)| lr 5.93e-03 | 2021.16 ms | 67.9% bf16 MFU | 260180 tok/s +step 2308/18794 | loss 3.466222 (+0.15z)| norm 0.1459 (-1.62z)| lr 5.93e-03 | 2036.15 ms | 67.4% bf16 MFU | 260046 tok/s +step 2309/18794 | loss 3.461055 (-0.02z)| norm 0.1364 (-1.88z)| lr 5.93e-03 | 2033.41 ms | 67.5% bf16 MFU | 259935 tok/s +step 2310/18794 | loss 3.469803 (+0.26z)| norm 0.1697 (-0.81z)| lr 5.93e-03 | 2036.99 ms | 67.4% bf16 MFU | 259808 tok/s +step 2311/18794 | loss 3.416150 (-1.45z)| norm 0.1599 (-1.09z)| lr 5.93e-03 | 2029.94 ms | 67.6% bf16 MFU | 259731 tok/s +step 2312/18794 | loss 3.462286 (+0.01z)| norm 0.1559 (-1.20z)| lr 5.93e-03 | 2023.47 ms | 67.8% bf16 MFU | 259700 tok/s +step 2313/18794 | loss 3.401613 (-1.92z)| norm 0.1399 (-1.67z)| lr 5.93e-03 | 2034.95 ms | 67.4% bf16 MFU | 259597 tok/s +step 2314/18794 | loss 3.433110 (-0.91z)| norm 0.1422 (-1.57z)| lr 5.93e-03 | 2020.75 ms | 67.9% bf16 MFU | 259590 tok/s +step 2315/18794 | loss 3.462041 (+0.02z)| norm 0.1692 (-0.71z)| lr 5.93e-03 | 2034.04 ms | 67.5% bf16 MFU | 259498 tok/s +step 2316/18794 | loss 3.521599 (+1.88z)| norm 0.2110 (+0.58z)| lr 5.93e-03 | 2023.25 ms | 67.8% bf16 MFU | 259480 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.065107 +step 2317/18794 | loss 3.442451 (-0.62z)| norm 0.2606 (+2.07z)| lr 5.93e-03 | 2032.37 ms | 67.5% bf16 MFU | 259404 tok/s +step 2318/18794 | loss 3.512461 (+1.59z)| norm 0.2400 (+1.40z)| lr 5.93e-03 | 2017.80 ms | 68.0% bf16 MFU | 259426 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.844632 +step 2319/18794 | loss 3.501513 (+1.22z)| norm 0.2924 (+2.84z)| lr 5.93e-03 | 2024.56 ms | 67.8% bf16 MFU | 259402 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.514225 +step 2320/18794 | loss 3.471729 (+0.27z)| norm 0.2849 (+2.51z)| lr 5.93e-03 | 2009.20 ms | 68.3% bf16 MFU | 259480 tok/s +step 2321/18794 | loss 3.506504 (+1.34z)| norm 0.2488 (+1.46z)| lr 5.93e-03 | 2009.60 ms | 68.3% bf16 MFU | 259550 tok/s +step 2322/18794 | loss 3.526734 (+1.94z)| norm 0.2143 (+0.48z)| lr 5.93e-03 | 2018.61 ms | 68.0% bf16 MFU | 259559 tok/s +step 2323/18794 | loss 3.480662 (+0.50z)| norm 0.2195 (+0.61z)| lr 5.93e-03 | 2029.68 ms | 67.6% bf16 MFU | 259496 tok/s +step 2324/18794 | loss 3.452086 (-0.39z)| norm 0.1796 (-0.51z)| lr 5.93e-03 | 2015.64 ms | 68.1% bf16 MFU | 259527 tok/s +step 2325/18794 | loss 3.491865 (+0.83z)| norm 0.1838 (-0.37z)| lr 5.93e-03 | 2029.52 ms | 67.6% bf16 MFU | 259467 tok/s +step 2326/18794 | loss 3.456527 (-0.27z)| norm 0.2263 (+0.87z)| lr 5.93e-03 | 2024.07 ms | 67.8% bf16 MFU | 259445 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.066772 +step 2327/18794 | loss 3.461487 (-0.12z)| norm 0.2698 (+2.07z)| lr 5.93e-03 | 2014.67 ms | 68.1% bf16 MFU | 259485 tok/s +step 2328/18794 | loss 3.468258 (+0.09z)| norm 0.2476 (+1.41z)| lr 5.93e-03 | 2010.86 ms | 68.2% bf16 MFU | 259547 tok/s +step 2329/18794 | loss 3.497227 (+0.98z)| norm 0.1992 (+0.05z)| lr 5.93e-03 | 2014.12 ms | 68.1% bf16 MFU | 259585 tok/s +step 2330/18794 | loss 3.541318 (+2.28z)| norm 0.2148 (+0.49z)| lr 5.93e-03 | 2022.69 ms | 67.8% bf16 MFU | 259566 tok/s +step 2331/18794 | loss 3.531873 (+1.96z)| norm 0.2204 (+0.63z)| lr 5.93e-03 | 2016.59 ms | 68.1% bf16 MFU | 259587 tok/s +step 2332/18794 | loss 3.456979 (-0.33z)| norm 0.1609 (-1.03z)| lr 5.93e-03 | 2019.64 ms | 67.9% bf16 MFU | 259587 tok/s +step 2333/18794 | loss 3.462764 (-0.15z)| norm 0.1800 (-0.48z)| lr 5.93e-03 | 2039.59 ms | 67.3% bf16 MFU | 259461 tok/s +step 2334/18794 | loss 3.491069 (+0.70z)| norm 0.1596 (-1.04z)| lr 5.93e-03 | 2030.92 ms | 67.6% bf16 MFU | 259395 tok/s +step 2335/18794 | loss 3.465377 (-0.07z)| norm 0.1602 (-1.00z)| lr 5.93e-03 | 2016.98 ms | 68.0% bf16 MFU | 259422 tok/s +step 2336/18794 | loss 3.442937 (-0.75z)| norm 0.1980 (+0.06z)| lr 5.93e-03 | 2016.01 ms | 68.1% bf16 MFU | 259454 tok/s +step 2337/18794 | loss 3.509454 (+1.28z)| norm 0.1954 (+0.01z)| lr 5.93e-03 | 2010.77 ms | 68.2% bf16 MFU | 259519 tok/s +step 2338/18794 | loss 3.465602 (-0.05z)| norm 0.1947 (-0.01z)| lr 5.93e-03 | 2016.46 ms | 68.1% bf16 MFU | 259543 tok/s +step 2339/18794 | loss 3.454873 (-0.39z)| norm 0.2258 (+0.86z)| lr 5.93e-03 | 2033.45 ms | 67.5% bf16 MFU | 259457 tok/s +step 2340/18794 | loss 3.472281 (+0.16z)| norm 0.2216 (+0.73z)| lr 5.92e-03 | 2011.44 ms | 68.2% bf16 MFU | 259517 tok/s +step 2341/18794 | loss 3.456539 (-0.32z)| norm 0.1759 (-0.55z)| lr 5.92e-03 | 2014.79 ms | 68.1% bf16 MFU | 259552 tok/s +step 2342/18794 | loss 3.521928 (+1.71z)| norm 0.1737 (-0.61z)| lr 5.92e-03 | 2023.92 ms | 67.8% bf16 MFU | 259527 tok/s +step 2343/18794 | loss 3.497225 (+0.93z)| norm 0.1877 (-0.23z)| lr 5.92e-03 | 2009.43 ms | 68.3% bf16 MFU | 259596 tok/s +step 2344/18794 | loss 3.540079 (+2.19z)| norm 0.1853 (-0.31z)| lr 5.92e-03 | 2012.81 ms | 68.2% bf16 MFU | 259640 tok/s +step 2345/18794 | loss 3.411256 (-1.71z)| norm 0.1820 (-0.41z)| lr 5.92e-03 | 2017.92 ms | 68.0% bf16 MFU | 259649 tok/s +step 2346/18794 | loss 3.473518 (+0.16z)| norm 0.1808 (-0.45z)| lr 5.92e-03 | 2031.69 ms | 67.5% bf16 MFU | 259569 tok/s +step 2347/18794 | loss 3.491656 (+0.69z)| norm 0.2025 (+0.18z)| lr 5.92e-03 | 2013.88 ms | 68.1% bf16 MFU | 259608 tok/s +step 2348/18794 | loss 3.469544 (+0.00z)| norm 0.2059 (+0.28z)| lr 5.92e-03 | 2017.57 ms | 68.0% bf16 MFU | 259620 tok/s +step 2349/18794 | loss 3.446622 (-0.69z)| norm 0.1874 (-0.25z)| lr 5.92e-03 | 2026.61 ms | 67.7% bf16 MFU | 259574 tok/s +step 2350/18794 | loss 3.494237 (+0.76z)| norm 0.1669 (-0.84z)| lr 5.92e-03 | 2018.01 ms | 68.0% bf16 MFU | 259586 tok/s +step 2351/18794 | loss 3.424130 (-1.41z)| norm 0.1842 (-0.34z)| lr 5.92e-03 | 2013.45 ms | 68.2% bf16 MFU | 259626 tok/s +step 2352/18794 | loss 3.493929 (+0.74z)| norm 0.1977 (+0.05z)| lr 5.92e-03 | 1997.10 ms | 68.7% bf16 MFU | 259771 tok/s +step 2353/18794 | loss 3.462101 (-0.24z)| norm 0.2312 (+0.99z)| lr 5.92e-03 | 2014.37 ms | 68.1% bf16 MFU | 259796 tok/s +step 2354/18794 | loss 3.482899 (+0.39z)| norm 0.1990 (+0.06z)| lr 5.92e-03 | 2007.78 ms | 68.4% bf16 MFU | 259863 tok/s +step 2355/18794 | loss 3.489014 (+0.56z)| norm 0.1872 (-0.29z)| lr 5.92e-03 | 2016.99 ms | 68.0% bf16 MFU | 259867 tok/s +step 2356/18794 | loss 3.438193 (-1.00z)| norm 0.1760 (-0.61z)| lr 5.92e-03 | 2011.04 ms | 68.2% bf16 MFU | 259909 tok/s +step 2357/18794 | loss 3.437520 (-1.01z)| norm 0.2162 (+0.56z)| lr 5.92e-03 | 2018.14 ms | 68.0% bf16 MFU | 259902 tok/s +step 2358/18794 | loss 3.485501 (+0.46z)| norm 0.2077 (+0.32z)| lr 5.92e-03 | 2016.16 ms | 68.1% bf16 MFU | 259910 tok/s +step 2359/18794 | loss 3.465548 (-0.16z)| norm 0.2167 (+0.58z)| lr 5.92e-03 | 2009.64 ms | 68.3% bf16 MFU | 259958 tok/s +step 2360/18794 | loss 3.455541 (-0.47z)| norm 0.2025 (+0.17z)| lr 5.92e-03 | 2009.37 ms | 68.3% bf16 MFU | 260006 tok/s +step 2361/18794 | loss 3.439775 (-0.95z)| norm 0.1714 (-0.73z)| lr 5.92e-03 | 2000.28 ms | 68.6% bf16 MFU | 260112 tok/s +step 2362/18794 | loss 3.458768 (-0.37z)| norm 0.1572 (-1.12z)| lr 5.92e-03 | 2011.06 ms | 68.2% bf16 MFU | 260141 tok/s +step 2363/18794 | loss 3.481808 (+0.34z)| norm 0.2013 (+0.16z)| lr 5.92e-03 | 2015.01 ms | 68.1% bf16 MFU | 260144 tok/s +step 2364/18794 | loss 3.462845 (-0.27z)| norm 0.1794 (-0.48z)| lr 5.92e-03 | 2005.39 ms | 68.4% bf16 MFU | 260208 tok/s +step 2365/18794 | loss 3.423945 (-1.47z)| norm 0.1521 (-1.26z)| lr 5.92e-03 | 2001.20 ms | 68.6% bf16 MFU | 260297 tok/s +step 2366/18794 | loss 3.462348 (-0.26z)| norm 0.1649 (-0.89z)| lr 5.92e-03 | 2007.20 ms | 68.4% bf16 MFU | 260343 tok/s +step 2367/18794 | loss 3.425043 (-1.41z)| norm 0.1622 (-0.99z)| lr 5.92e-03 | 1996.86 ms | 68.7% bf16 MFU | 260453 tok/s +step 2368/18794 | loss 3.456882 (-0.45z)| norm 0.1648 (-0.92z)| lr 5.92e-03 | 2015.89 ms | 68.1% bf16 MFU | 260434 tok/s +step 2369/18794 | loss 3.468609 (-0.08z)| norm 0.1945 (-0.06z)| lr 5.92e-03 | 2001.15 ms | 68.6% bf16 MFU | 260512 tok/s +step 2370/18794 | loss 3.483180 (+0.39z)| norm 0.2135 (+0.51z)| lr 5.92e-03 | 2012.11 ms | 68.2% bf16 MFU | 260515 tok/s +step 2371/18794 | loss 3.458797 (-0.41z)| norm 0.1803 (-0.46z)| lr 5.92e-03 | 1997.21 ms | 68.7% bf16 MFU | 260615 tok/s +step 2372/18794 | loss 3.431814 (-1.27z)| norm 0.1831 (-0.37z)| lr 5.92e-03 | 2011.86 ms | 68.2% bf16 MFU | 260614 tok/s +step 2373/18794 | loss 3.467978 (-0.09z)| norm 0.2365 (+1.19z)| lr 5.92e-03 | 2010.32 ms | 68.3% bf16 MFU | 260623 tok/s +step 2374/18794 | loss 3.495536 (+0.81z)| norm 0.2402 (+1.28z)| lr 5.92e-03 | 2007.77 ms | 68.4% bf16 MFU | 260649 tok/s +step 2375/18794 | loss 3.435101 (-1.20z)| norm 0.2320 (+1.02z)| lr 5.92e-03 | 2010.84 ms | 68.2% bf16 MFU | 260653 tok/s +step 2376/18794 | loss 3.502041 (+1.05z)| norm 0.2090 (+0.35z)| lr 5.92e-03 | 2005.86 ms | 68.4% bf16 MFU | 260689 tok/s +step 2377/18794 | loss 3.480352 (+0.32z)| norm 0.1736 (-0.67z)| lr 5.92e-03 | 2007.63 ms | 68.4% bf16 MFU | 260712 tok/s +step 2378/18794 | loss 3.431272 (-1.31z)| norm 0.1606 (-1.03z)| lr 5.92e-03 | 2012.59 ms | 68.2% bf16 MFU | 260701 tok/s +step 2379/18794 | loss 3.458274 (-0.43z)| norm 0.1374 (-1.66z)| lr 5.92e-03 | 1999.39 ms | 68.6% bf16 MFU | 260778 tok/s +step 2380/18794 | loss 3.469748 (-0.05z)| norm 0.1433 (-1.46z)| lr 5.92e-03 | 1993.97 ms | 68.8% bf16 MFU | 260885 tok/s +step 2381/18794 | loss 3.414010 (-1.91z)| norm 0.1475 (-1.32z)| lr 5.92e-03 | 2009.60 ms | 68.3% bf16 MFU | 260886 tok/s +step 2382/18794 | loss 3.439865 (-1.04z)| norm 0.1677 (-0.74z)| lr 5.92e-03 | 2008.73 ms | 68.3% bf16 MFU | 260892 tok/s +step 2383/18794 | loss 3.508343 (+1.23z)| norm 0.2312 (+1.08z)| lr 5.92e-03 | 2001.24 ms | 68.6% bf16 MFU | 260946 tok/s +step 2384/18794 | loss 3.446193 (-0.84z)| norm 0.2276 (+0.98z)| lr 5.92e-03 | 2008.52 ms | 68.3% bf16 MFU | 260950 tok/s +step 2385/18794 | loss 3.494797 (+0.77z)| norm 0.2456 (+1.50z)| lr 5.92e-03 | 2002.93 ms | 68.5% bf16 MFU | 260991 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.085572 +step 2386/18794 | loss 3.489488 (+0.58z)| norm 0.2679 (+2.09z)| lr 5.92e-03 | 2000.99 ms | 68.6% bf16 MFU | 261042 tok/s +step 2387/18794 | loss 3.441307 (-1.01z)| norm 0.2613 (+1.86z)| lr 5.92e-03 | 1993.86 ms | 68.8% bf16 MFU | 261138 tok/s +step 2388/18794 | loss 3.527448 (+1.80z)| norm 0.2192 (+0.68z)| lr 5.92e-03 | 2002.33 ms | 68.5% bf16 MFU | 261173 tok/s +step 2389/18794 | loss 3.518978 (+1.52z)| norm 0.1814 (-0.37z)| lr 5.92e-03 | 1999.62 ms | 68.6% bf16 MFU | 261224 tok/s +step 2390/18794 | loss 3.548408 (+2.40z)| norm 0.1751 (-0.54z)| lr 5.92e-03 | 2018.28 ms | 68.0% bf16 MFU | 261151 tok/s +step 2391/18794 | loss 3.473375 (-0.01z)| norm 0.1780 (-0.46z)| lr 5.92e-03 | 1999.64 ms | 68.6% bf16 MFU | 261203 tok/s +step 2392/18794 | loss 3.518630 (+1.41z)| norm 0.2097 (+0.41z)| lr 5.92e-03 | 2012.43 ms | 68.2% bf16 MFU | 261169 tok/s +step 2393/18794 | loss 3.440189 (-1.06z)| norm 0.2063 (+0.31z)| lr 5.92e-03 | 1997.88 ms | 68.7% bf16 MFU | 261232 tok/s +step 2394/18794 | loss 3.498012 (+0.77z)| norm 0.1834 (-0.33z)| lr 5.92e-03 | 2009.77 ms | 68.3% bf16 MFU | 261214 tok/s +step 2395/18794 | loss 3.388440 (-2.57z)| norm 0.1802 (-0.42z)| lr 5.92e-03 | 2008.85 ms | 68.3% bf16 MFU | 261202 tok/s +step 2396/18794 | loss 3.457301 (-0.46z)| norm 0.1836 (-0.34z)| lr 5.92e-03 | 2007.88 ms | 68.3% bf16 MFU | 261198 tok/s +step 2397/18794 | loss 3.471625 (-0.02z)| norm 0.1829 (-0.37z)| lr 5.92e-03 | 1997.01 ms | 68.7% bf16 MFU | 261265 tok/s +step 2398/18794 | loss 3.441761 (-0.92z)| norm 0.1783 (-0.50z)| lr 5.92e-03 | 1999.79 ms | 68.6% bf16 MFU | 261310 tok/s +step 2399/18794 | loss 3.463213 (-0.25z)| norm 0.2142 (+0.51z)| lr 5.92e-03 | 2003.06 ms | 68.5% bf16 MFU | 261332 tok/s +step 2400/18794 | loss 3.449061 (-0.67z)| norm 0.1894 (-0.17z)| lr 5.92e-03 | 1998.53 ms | 68.7% bf16 MFU | 261382 tok/s +step 2401/18794 | loss 3.418514 (-1.59z)| norm 0.1716 (-0.68z)| lr 5.92e-03 | 2017.52 ms | 68.0% bf16 MFU | 261306 tok/s +step 2402/18794 | loss 3.417979 (-1.57z)| norm 0.1747 (-0.58z)| lr 5.92e-03 | 2010.55 ms | 68.3% bf16 MFU | 261279 tok/s +step 2403/18794 | loss 3.496134 (+0.84z)| norm 0.1608 (-0.97z)| lr 5.92e-03 | 1998.03 ms | 68.7% bf16 MFU | 261336 tok/s +step 2404/18794 | loss 3.452633 (-0.50z)| norm 0.1768 (-0.49z)| lr 5.92e-03 | 1993.63 ms | 68.8% bf16 MFU | 261418 tok/s +step 2405/18794 | loss 3.419015 (-1.50z)| norm 0.2265 (+0.96z)| lr 5.92e-03 | 2009.48 ms | 68.3% bf16 MFU | 261392 tok/s +reducing beta2 to 0.9 and lr/wd by 0.922 due to grad z-score of 3.794714 +step 2406/18794 | loss 3.471535 (+0.09z)| norm 0.3331 (+3.79z)| lr 5.46e-03 | 1995.93 ms | 68.8% bf16 MFU | 261457 tok/s +step 2407/18794 | loss 3.449905 (-0.57z)| norm 0.2389 (+1.17z)| lr 5.92e-03 | 2000.19 ms | 68.6% bf16 MFU | 261490 tok/s +step 2408/18794 | loss 3.445778 (-0.69z)| norm 0.2053 (+0.23z)| lr 5.92e-03 | 2001.62 ms | 68.6% bf16 MFU | 261512 tok/s +step 2409/18794 | loss 3.415105 (-1.59z)| norm 0.2307 (+0.92z)| lr 5.92e-03 | 1987.82 ms | 69.0% bf16 MFU | 261624 tok/s +step 2410/18794 | loss 3.430894 (-1.10z)| norm 0.2337 (+0.98z)| lr 5.92e-03 | 2015.03 ms | 68.1% bf16 MFU | 261552 tok/s +step 2411/18794 | loss 3.401922 (-1.95z)| norm 0.1896 (-0.27z)| lr 5.92e-03 | 1997.48 ms | 68.7% bf16 MFU | 261598 tok/s +step 2412/18794 | loss 3.448206 (-0.56z)| norm 0.1593 (-1.13z)| lr 5.92e-03 | 2003.30 ms | 68.5% bf16 MFU | 261604 tok/s +step 2413/18794 | loss 3.505363 (+1.12z)| norm 0.2329 (+0.94z)| lr 5.92e-03 | 1995.09 ms | 68.8% bf16 MFU | 261663 tok/s +step 2414/18794 | loss 3.429245 (-1.17z)| norm 0.2013 (+0.02z)| lr 5.92e-03 | 2006.06 ms | 68.4% bf16 MFU | 261648 tok/s +step 2415/18794 | loss 3.433359 (-1.03z)| norm 0.1697 (-0.90z)| lr 5.92e-03 | 2008.38 ms | 68.3% bf16 MFU | 261618 tok/s +step 2416/18794 | loss 3.438069 (-0.87z)| norm 0.2096 (+0.26z)| lr 5.92e-03 | 2002.16 ms | 68.5% bf16 MFU | 261630 tok/s +step 2417/18794 | loss 3.432425 (-1.04z)| norm 0.1977 (-0.07z)| lr 5.92e-03 | 2009.42 ms | 68.3% bf16 MFU | 261594 tok/s +step 2418/18794 | loss 3.377918 (-2.58z)| norm 0.1840 (-0.46z)| lr 5.92e-03 | 2007.78 ms | 68.4% bf16 MFU | 261571 tok/s +step 2419/18794 | loss 3.440835 (-0.71z)| norm 0.1609 (-1.14z)| lr 5.92e-03 | 2002.26 ms | 68.5% bf16 MFU | 261585 tok/s +step 2420/18794 | loss 3.430886 (-0.99z)| norm 0.1637 (-1.05z)| lr 5.92e-03 | 2005.42 ms | 68.4% bf16 MFU | 261577 tok/s +step 2421/18794 | loss 3.440638 (-0.69z)| norm 0.1621 (-1.09z)| lr 5.92e-03 | 1989.27 ms | 69.0% bf16 MFU | 261676 tok/s +step 2422/18794 | loss 3.466260 (+0.09z)| norm 0.1751 (-0.66z)| lr 5.92e-03 | 1986.66 ms | 69.1% bf16 MFU | 261788 tok/s +step 2423/18794 | loss 3.457422 (-0.17z)| norm 0.1753 (-0.64z)| lr 5.92e-03 | 2005.40 ms | 68.4% bf16 MFU | 261770 tok/s +step 2424/18794 | loss 3.452716 (-0.31z)| norm 0.1669 (-0.90z)| lr 5.92e-03 | 2000.74 ms | 68.6% bf16 MFU | 261784 tok/s +step 2425/18794 | loss 3.492944 (+0.90z)| norm 0.1551 (-1.26z)| lr 5.92e-03 | 2011.48 ms | 68.2% bf16 MFU | 261727 tok/s +step 2426/18794 | loss 3.423918 (-1.16z)| norm 0.1627 (-1.00z)| lr 5.91e-03 | 1990.14 ms | 69.0% bf16 MFU | 261813 tok/s +step 2427/18794 | loss 3.399599 (-1.84z)| norm 0.1727 (-0.67z)| lr 5.91e-03 | 1982.09 ms | 69.2% bf16 MFU | 261948 tok/s +step 2428/18794 | loss 3.469557 (+0.22z)| norm 0.1776 (-0.49z)| lr 5.91e-03 | 2009.59 ms | 68.3% bf16 MFU | 261895 tok/s +step 2429/18794 | loss 3.428652 (-0.96z)| norm 0.1749 (-0.57z)| lr 5.91e-03 | 1996.30 ms | 68.7% bf16 MFU | 261932 tok/s +step 2430/18794 | loss 3.468412 (+0.23z)| norm 0.1603 (-1.03z)| lr 5.91e-03 | 2009.75 ms | 68.3% bf16 MFU | 261879 tok/s +step 2431/18794 | loss 3.431239 (-0.88z)| norm 0.1764 (-0.49z)| lr 5.91e-03 | 1994.40 ms | 68.8% bf16 MFU | 261929 tok/s +step 2432/18794 | loss 3.467465 (+0.24z)| norm 0.1518 (-1.29z)| lr 5.91e-03 | 1996.77 ms | 68.7% bf16 MFU | 261961 tok/s +step 2433/18794 | loss 3.444900 (-0.45z)| norm 0.1710 (-0.66z)| lr 5.91e-03 | 1987.66 ms | 69.0% bf16 MFU | 262052 tok/s +step 2434/18794 | loss 3.447731 (-0.35z)| norm 0.2234 (+1.04z)| lr 5.91e-03 | 2002.92 ms | 68.5% bf16 MFU | 262037 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.447569 +step 2435/18794 | loss 3.453766 (-0.16z)| norm 0.2695 (+2.45z)| lr 5.91e-03 | 1990.19 ms | 69.0% bf16 MFU | 262107 tok/s +step 2436/18794 | loss 3.494628 (+1.08z)| norm 0.2293 (+1.15z)| lr 5.91e-03 | 1995.65 ms | 68.8% bf16 MFU | 262137 tok/s +step 2437/18794 | loss 3.471116 (+0.37z)| norm 0.1936 (+0.02z)| lr 5.91e-03 | 2002.49 ms | 68.5% bf16 MFU | 262121 tok/s +step 2438/18794 | loss 3.435521 (-0.73z)| norm 0.1976 (+0.14z)| lr 5.91e-03 | 1992.30 ms | 68.9% bf16 MFU | 262173 tok/s +step 2439/18794 | loss 3.429993 (-0.89z)| norm 0.2499 (+1.77z)| lr 5.91e-03 | 1987.06 ms | 69.1% bf16 MFU | 262257 tok/s +step 2440/18794 | loss 3.425374 (-1.01z)| norm 0.2290 (+1.11z)| lr 5.91e-03 | 2000.64 ms | 68.6% bf16 MFU | 262247 tok/s +step 2441/18794 | loss 3.384584 (-2.19z)| norm 0.1690 (-0.76z)| lr 5.91e-03 | 1997.53 ms | 68.7% bf16 MFU | 262258 tok/s +step 2442/18794 | loss 3.472554 (+0.48z)| norm 0.2044 (+0.33z)| lr 5.91e-03 | 1985.95 ms | 69.1% bf16 MFU | 262345 tok/s +step 2443/18794 | loss 3.467629 (+0.34z)| norm 0.2416 (+1.47z)| lr 5.91e-03 | 1990.48 ms | 68.9% bf16 MFU | 262398 tok/s +step 2444/18794 | loss 3.416967 (-1.22z)| norm 0.2018 (+0.23z)| lr 5.91e-03 | 1994.32 ms | 68.8% bf16 MFU | 262422 tok/s +step 2445/18794 | loss 3.443535 (-0.39z)| norm 0.1615 (-1.01z)| lr 5.91e-03 | 1985.61 ms | 69.1% bf16 MFU | 262504 tok/s +step 2446/18794 | loss 3.459955 (+0.14z)| norm 0.1448 (-1.50z)| lr 5.91e-03 | 1992.52 ms | 68.9% bf16 MFU | 262535 tok/s +step 2447/18794 | loss 3.404349 (-1.60z)| norm 0.1492 (-1.33z)| lr 5.91e-03 | 1995.02 ms | 68.8% bf16 MFU | 262548 tok/s +step 2448/18794 | loss 3.496487 (+1.31z)| norm 0.1697 (-0.70z)| lr 5.91e-03 | 2001.81 ms | 68.6% bf16 MFU | 262516 tok/s +step 2449/18794 | loss 3.429798 (-0.79z)| norm 0.1826 (-0.31z)| lr 5.91e-03 | 1995.34 ms | 68.8% bf16 MFU | 262528 tok/s +step 2450/18794 | loss 3.474784 (+0.64z)| norm 0.1801 (-0.39z)| lr 5.91e-03 | 1994.50 ms | 68.8% bf16 MFU | 262545 tok/s +step 2451/18794 | loss 3.436220 (-0.59z)| norm 0.2000 (+0.20z)| lr 5.91e-03 | 1989.09 ms | 69.0% bf16 MFU | 262597 tok/s +step 2452/18794 | loss 3.436289 (-0.57z)| norm 0.2170 (+0.71z)| lr 5.91e-03 | 1994.04 ms | 68.8% bf16 MFU | 262613 tok/s +step 2453/18794 | loss 3.422696 (-0.99z)| norm 0.1925 (-0.01z)| lr 5.91e-03 | 1979.23 ms | 69.3% bf16 MFU | 262727 tok/s +step 2454/18794 | loss 3.415745 (-1.19z)| norm 0.1690 (-0.72z)| lr 5.91e-03 | 1979.36 ms | 69.3% bf16 MFU | 262835 tok/s +step 2455/18794 | loss 3.482663 (+0.94z)| norm 0.1803 (-0.37z)| lr 5.91e-03 | 1994.38 ms | 68.8% bf16 MFU | 262837 tok/s +step 2456/18794 | loss 3.449626 (-0.11z)| norm 0.1838 (-0.27z)| lr 5.91e-03 | 1999.35 ms | 68.6% bf16 MFU | 262807 tok/s +step 2457/18794 | loss 3.425234 (-0.88z)| norm 0.1813 (-0.34z)| lr 5.91e-03 | 1998.88 ms | 68.7% bf16 MFU | 262781 tok/s +step 2458/18794 | loss 3.465432 (+0.40z)| norm 0.2054 (+0.40z)| lr 5.91e-03 | 1986.85 ms | 69.1% bf16 MFU | 262836 tok/s +step 2459/18794 | loss 3.485820 (+1.04z)| norm 0.2018 (+0.29z)| lr 5.91e-03 | 1988.46 ms | 69.0% bf16 MFU | 262877 tok/s +step 2460/18794 | loss 3.450372 (-0.08z)| norm 0.1992 (+0.21z)| lr 5.91e-03 | 1991.53 ms | 68.9% bf16 MFU | 262896 tok/s +step 2461/18794 | loss 3.486606 (+1.05z)| norm 0.2272 (+1.05z)| lr 5.91e-03 | 1991.86 ms | 68.9% bf16 MFU | 262912 tok/s +step 2462/18794 | loss 3.502874 (+1.53z)| norm 0.2598 (+1.98z)| lr 5.91e-03 | 1985.49 ms | 69.1% bf16 MFU | 262970 tok/s +step 2463/18794 | loss 3.455336 (+0.05z)| norm 0.2411 (+1.39z)| lr 5.91e-03 | 1981.07 ms | 69.3% bf16 MFU | 263054 tok/s +step 2464/18794 | loss 3.465818 (+0.38z)| norm 0.2275 (+0.97z)| lr 5.91e-03 | 1988.72 ms | 69.0% bf16 MFU | 263083 tok/s +step 2465/18794 | loss 3.462204 (+0.26z)| norm 0.2396 (+1.30z)| lr 5.91e-03 | 1994.27 ms | 68.8% bf16 MFU | 263073 tok/s +step 2466/18794 | loss 3.470245 (+0.50z)| norm 0.2347 (+1.13z)| lr 5.91e-03 | 1986.28 ms | 69.1% bf16 MFU | 263117 tok/s +step 2467/18794 | loss 3.461699 (+0.23z)| norm 0.2023 (+0.17z)| lr 5.91e-03 | 1996.10 ms | 68.8% bf16 MFU | 263094 tok/s +step 2468/18794 | loss 3.471992 (+0.54z)| norm 0.1663 (-0.90z)| lr 5.91e-03 | 1981.41 ms | 69.3% bf16 MFU | 263170 tok/s +step 2469/18794 | loss 3.450541 (-0.12z)| norm 0.1726 (-0.70z)| lr 5.91e-03 | 1986.88 ms | 69.1% bf16 MFU | 263205 tok/s +step 2470/18794 | loss 3.422344 (-0.99z)| norm 0.1746 (-0.63z)| lr 5.91e-03 | 1986.46 ms | 69.1% bf16 MFU | 263241 tok/s +step 2471/18794 | loss 3.434903 (-0.59z)| norm 0.2026 (+0.19z)| lr 5.91e-03 | 1984.32 ms | 69.2% bf16 MFU | 263290 tok/s +step 2472/18794 | loss 3.428246 (-0.80z)| norm 0.1956 (-0.02z)| lr 5.91e-03 | 1980.30 ms | 69.3% bf16 MFU | 263363 tok/s +step 2473/18794 | loss 3.461354 (+0.25z)| norm 0.2224 (+0.78z)| lr 5.91e-03 | 1988.82 ms | 69.0% bf16 MFU | 263376 tok/s +step 2474/18794 | loss 3.409080 (-1.37z)| norm 0.2087 (+0.38z)| lr 5.91e-03 | 1987.63 ms | 69.0% bf16 MFU | 263396 tok/s +step 2475/18794 | loss 3.464696 (+0.37z)| norm 0.2300 (+1.02z)| lr 5.91e-03 | 1987.66 ms | 69.0% bf16 MFU | 263415 tok/s +step 2476/18794 | loss 3.429411 (-0.73z)| norm 0.2183 (+0.67z)| lr 5.91e-03 | 1983.05 ms | 69.2% bf16 MFU | 263463 tok/s +step 2477/18794 | loss 3.372844 (-2.42z)| norm 0.1794 (-0.50z)| lr 5.91e-03 | 1987.19 ms | 69.1% bf16 MFU | 263482 tok/s +step 2478/18794 | loss 3.437382 (-0.43z)| norm 0.1542 (-1.25z)| lr 5.91e-03 | 1981.50 ms | 69.3% bf16 MFU | 263537 tok/s +step 2479/18794 | loss 3.389125 (-1.87z)| norm 0.1801 (-0.49z)| lr 5.91e-03 | 2016.73 ms | 68.0% bf16 MFU | 263359 tok/s +step 2480/18794 | loss 3.405768 (-1.33z)| norm 0.2308 (+1.02z)| lr 5.91e-03 | 2042.33 ms | 67.2% bf16 MFU | 263026 tok/s +step 2481/18794 | loss 3.436945 (-0.40z)| norm 0.2639 (+1.98z)| lr 5.91e-03 | 2042.95 ms | 67.2% bf16 MFU | 262707 tok/s +step 2482/18794 | loss 3.473317 (+0.69z)| norm 0.2306 (+0.95z)| lr 5.91e-03 | 2026.78 ms | 67.7% bf16 MFU | 262505 tok/s +step 2483/18794 | loss 3.406332 (-1.31z)| norm 0.1578 (-1.22z)| lr 5.91e-03 | 2035.07 ms | 67.4% bf16 MFU | 262261 tok/s +step 2484/18794 | loss 3.448717 (-0.02z)| norm 0.1717 (-0.79z)| lr 5.91e-03 | 2046.43 ms | 67.1% bf16 MFU | 261958 tok/s +step 2485/18794 | loss 3.441454 (-0.23z)| norm 0.2136 (+0.49z)| lr 5.91e-03 | 2040.47 ms | 67.3% bf16 MFU | 261707 tok/s +step 2486/18794 | loss 3.486212 (+1.15z)| norm 0.2177 (+0.65z)| lr 5.91e-03 | 2018.12 ms | 68.0% bf16 MFU | 261612 tok/s +step 2487/18794 | loss 3.373140 (-2.25z)| norm 0.2334 (+1.16z)| lr 5.91e-03 | 2046.02 ms | 67.1% bf16 MFU | 261343 tok/s +step 2488/18794 | loss 3.369978 (-2.31z)| norm 0.2253 (+0.90z)| lr 5.91e-03 | 2047.55 ms | 67.0% bf16 MFU | 261079 tok/s +step 2489/18794 | loss 3.377654 (-2.04z)| norm 0.2086 (+0.36z)| lr 5.91e-03 | 2029.58 ms | 67.6% bf16 MFU | 260941 tok/s +step 2490/18794 | loss 3.391098 (-1.65z)| norm 0.2144 (+0.54z)| lr 5.91e-03 | 2037.94 ms | 67.3% bf16 MFU | 260757 tok/s +step 2491/18794 | loss 3.454542 (+0.35z)| norm 0.1785 (-0.60z)| lr 5.91e-03 | 2028.05 ms | 67.7% bf16 MFU | 260645 tok/s +step 2492/18794 | loss 3.437969 (-0.15z)| norm 0.1744 (-0.71z)| lr 5.91e-03 | 2030.10 ms | 67.6% bf16 MFU | 260526 tok/s +step 2493/18794 | loss 3.405492 (-1.18z)| norm 0.1878 (-0.29z)| lr 5.91e-03 | 2034.91 ms | 67.4% bf16 MFU | 260382 tok/s +step 2494/18794 | loss 3.413965 (-0.90z)| norm 0.1921 (-0.15z)| lr 5.91e-03 | 2029.80 ms | 67.6% bf16 MFU | 260278 tok/s +step 2495/18794 | loss 3.398517 (-1.41z)| norm 0.1564 (-1.27z)| lr 5.91e-03 | 2030.43 ms | 67.6% bf16 MFU | 260175 tok/s +step 2496/18794 | loss 3.452425 (+0.36z)| norm 0.1666 (-0.94z)| lr 5.91e-03 | 2027.17 ms | 67.7% bf16 MFU | 260097 tok/s +step 2497/18794 | loss 3.443211 (+0.06z)| norm 0.1869 (-0.30z)| lr 5.91e-03 | 2027.79 ms | 67.7% bf16 MFU | 260020 tok/s +step 2498/18794 | loss 3.422517 (-0.61z)| norm 0.2280 (+0.96z)| lr 5.91e-03 | 2021.74 ms | 67.9% bf16 MFU | 259985 tok/s +step 2499/18794 | loss 3.410857 (-0.97z)| norm 0.2456 (+1.49z)| lr 5.91e-03 | 2030.91 ms | 67.6% bf16 MFU | 259894 tok/s +step 2500/18794 | loss 3.370762 (-2.20z)| norm 0.2233 (+0.79z)| lr 5.91e-03 | 2045.42 ms | 67.1% bf16 MFU | 259715 tok/s +val loss 3.438010 +Writing state to log_gpt3_125M_edu_v4/state_00002500_00001.bin +HellaSwag: 2688/10042 = 0.267676 +Writing checkpoint at step 2500 +Writing model to log_gpt3_125M_edu_v4/model_00002500.bin +Writing state to log_gpt3_125M_edu_v4/state_00002500_00000.bin +step 2501/18794 | loss 3.381691 (-1.82z)| norm 0.1580 (-1.21z)| lr 5.91e-03 | 2026.57 ms | 67.7% bf16 MFU | 259665 tok/s +step 2502/18794 | loss 3.386818 (-1.63z)| norm 0.1643 (-1.01z)| lr 5.91e-03 | 2025.66 ms | 67.7% bf16 MFU | 259623 tok/s +step 2503/18794 | loss 3.413141 (-0.79z)| norm 0.1756 (-0.68z)| lr 5.91e-03 | 2032.69 ms | 67.5% bf16 MFU | 259538 tok/s +step 2504/18794 | loss 3.430733 (-0.23z)| norm 0.1887 (-0.28z)| lr 5.91e-03 | 2035.88 ms | 67.4% bf16 MFU | 259437 tok/s +step 2505/18794 | loss 3.416140 (-0.69z)| norm 0.2407 (+1.31z)| lr 5.91e-03 | 2027.42 ms | 67.7% bf16 MFU | 259395 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.403581 +step 2506/18794 | loss 3.472114 (+1.08z)| norm 0.2709 (+2.40z)| lr 5.91e-03 | 2036.64 ms | 67.4% bf16 MFU | 259297 tok/s +step 2507/18794 | loss 3.381441 (-1.74z)| norm 0.2273 (+0.99z)| lr 5.91e-03 | 2049.08 ms | 67.0% bf16 MFU | 259125 tok/s +step 2508/18794 | loss 3.403412 (-1.04z)| norm 0.2333 (+1.17z)| lr 5.90e-03 | 2034.30 ms | 67.5% bf16 MFU | 259055 tok/s +step 2509/18794 | loss 3.396650 (-1.23z)| norm 0.2096 (+0.41z)| lr 5.90e-03 | 2035.94 ms | 67.4% bf16 MFU | 258978 tok/s +step 2510/18794 | loss 3.451632 (+0.45z)| norm 0.2141 (+0.56z)| lr 5.90e-03 | 2032.60 ms | 67.5% bf16 MFU | 258926 tok/s +step 2511/18794 | loss 3.421180 (-0.49z)| norm 0.2347 (+1.22z)| lr 5.90e-03 | 2019.97 ms | 67.9% bf16 MFU | 258958 tok/s +step 2512/18794 | loss 3.430216 (-0.21z)| norm 0.1888 (-0.30z)| lr 5.90e-03 | 2022.76 ms | 67.8% bf16 MFU | 258970 tok/s +step 2513/18794 | loss 3.417958 (-0.57z)| norm 0.1802 (-0.56z)| lr 5.90e-03 | 2021.15 ms | 67.9% bf16 MFU | 258991 tok/s +step 2514/18794 | loss 3.442595 (+0.20z)| norm 0.2293 (+1.05z)| lr 5.90e-03 | 2013.53 ms | 68.2% bf16 MFU | 259061 tok/s +step 2515/18794 | loss 3.456113 (+0.62z)| norm 0.2451 (+1.53z)| lr 5.90e-03 | 2040.91 ms | 67.2% bf16 MFU | 258952 tok/s +step 2516/18794 | loss 3.348763 (-2.63z)| norm 0.1960 (-0.07z)| lr 5.90e-03 | 1988.09 ms | 69.0% bf16 MFU | 259190 tok/s +step 2517/18794 | loss 3.443742 (+0.25z)| norm 0.1809 (-0.56z)| lr 5.90e-03 | 1984.43 ms | 69.2% bf16 MFU | 259441 tok/s +step 2518/18794 | loss 3.406243 (-0.91z)| norm 0.1876 (-0.34z)| lr 5.90e-03 | 2006.56 ms | 68.4% bf16 MFU | 259533 tok/s +step 2519/18794 | loss 3.376390 (-1.78z)| norm 0.1960 (-0.08z)| lr 5.90e-03 | 2028.74 ms | 67.6% bf16 MFU | 259478 tok/s +step 2520/18794 | loss 3.370398 (-1.91z)| norm 0.2287 (+0.98z)| lr 5.90e-03 | 2030.56 ms | 67.6% bf16 MFU | 259414 tok/s +step 2521/18794 | loss 3.439795 (+0.15z)| norm 0.2337 (+1.12z)| lr 5.90e-03 | 2023.74 ms | 67.8% bf16 MFU | 259397 tok/s +step 2522/18794 | loss 3.396320 (-1.11z)| norm 0.2194 (+0.64z)| lr 5.90e-03 | 2027.27 ms | 67.7% bf16 MFU | 259358 tok/s +step 2523/18794 | loss 3.416527 (-0.50z)| norm 0.1995 (-0.03z)| lr 5.90e-03 | 2038.43 ms | 67.3% bf16 MFU | 259250 tok/s +step 2524/18794 | loss 3.404335 (-0.85z)| norm 0.1653 (-1.16z)| lr 5.90e-03 | 2004.04 ms | 68.5% bf16 MFU | 259368 tok/s +step 2525/18794 | loss 3.377434 (-1.62z)| norm 0.1995 (-0.05z)| lr 5.90e-03 | 2024.76 ms | 67.8% bf16 MFU | 259347 tok/s +step 2526/18794 | loss 3.456143 (+0.71z)| norm 0.1609 (-1.34z)| lr 5.90e-03 | 2032.11 ms | 67.5% bf16 MFU | 259279 tok/s +step 2527/18794 | loss 3.410321 (-0.65z)| norm 0.1610 (-1.33z)| lr 5.90e-03 | 2003.43 ms | 68.5% bf16 MFU | 259400 tok/s +step 2528/18794 | loss 3.367820 (-1.86z)| norm 0.1547 (-1.52z)| lr 5.90e-03 | 2025.10 ms | 67.8% bf16 MFU | 259375 tok/s +step 2529/18794 | loss 3.435933 (+0.13z)| norm 0.1620 (-1.26z)| lr 5.90e-03 | 2012.58 ms | 68.2% bf16 MFU | 259431 tok/s +step 2530/18794 | loss 3.491477 (+1.74z)| norm 0.1802 (-0.68z)| lr 5.90e-03 | 2033.19 ms | 67.5% bf16 MFU | 259353 tok/s +step 2531/18794 | loss 3.475399 (+1.25z)| norm 0.2343 (+1.09z)| lr 5.90e-03 | 2019.02 ms | 68.0% bf16 MFU | 259369 tok/s +step 2532/18794 | loss 3.431507 (-0.01z)| norm 0.2263 (+0.81z)| lr 5.90e-03 | 2013.51 ms | 68.2% bf16 MFU | 259420 tok/s +step 2533/18794 | loss 3.428382 (-0.09z)| norm 0.1979 (-0.14z)| lr 5.90e-03 | 2031.35 ms | 67.6% bf16 MFU | 259354 tok/s +step 2534/18794 | loss 3.390722 (-1.16z)| norm 0.2152 (+0.44z)| lr 5.90e-03 | 2006.91 ms | 68.4% bf16 MFU | 259448 tok/s +step 2535/18794 | loss 3.401538 (-0.83z)| norm 0.2586 (+1.92z)| lr 5.90e-03 | 2032.27 ms | 67.5% bf16 MFU | 259375 tok/s +step 2536/18794 | loss 3.404747 (-0.73z)| norm 0.2376 (+1.20z)| lr 5.90e-03 | 2029.40 ms | 67.6% bf16 MFU | 259324 tok/s +step 2537/18794 | loss 3.424930 (-0.12z)| norm 0.1776 (-0.82z)| lr 5.90e-03 | 2016.83 ms | 68.0% bf16 MFU | 259355 tok/s +step 2538/18794 | loss 3.425285 (-0.11z)| norm 0.1727 (-0.97z)| lr 5.90e-03 | 2009.57 ms | 68.3% bf16 MFU | 259432 tok/s +step 2539/18794 | loss 3.384472 (-1.29z)| norm 0.1808 (-0.68z)| lr 5.90e-03 | 2027.80 ms | 67.7% bf16 MFU | 259388 tok/s +step 2540/18794 | loss 3.396071 (-0.94z)| norm 0.2085 (+0.26z)| lr 5.90e-03 | 2018.89 ms | 68.0% bf16 MFU | 259403 tok/s +step 2541/18794 | loss 3.377784 (-1.46z)| norm 0.2125 (+0.39z)| lr 5.90e-03 | 2037.60 ms | 67.3% bf16 MFU | 259298 tok/s +step 2542/18794 | loss 3.426225 (-0.04z)| norm 0.2269 (+0.87z)| lr 5.90e-03 | 2010.27 ms | 68.3% bf16 MFU | 259374 tok/s +step 2543/18794 | loss 3.484671 (+1.65z)| norm 0.2322 (+1.05z)| lr 5.90e-03 | 2012.63 ms | 68.2% bf16 MFU | 259430 tok/s +step 2544/18794 | loss 3.322125 (-2.91z)| norm 0.1803 (-0.71z)| lr 5.90e-03 | 2024.08 ms | 67.8% bf16 MFU | 259410 tok/s +step 2545/18794 | loss 3.400469 (-0.72z)| norm 0.1537 (-1.61z)| lr 5.90e-03 | 2017.18 ms | 68.0% bf16 MFU | 259435 tok/s +step 2546/18794 | loss 3.411306 (-0.41z)| norm 0.1853 (-0.56z)| lr 5.90e-03 | 2013.87 ms | 68.1% bf16 MFU | 259480 tok/s +step 2547/18794 | loss 3.427494 (+0.03z)| norm 0.1849 (-0.59z)| lr 5.90e-03 | 2036.65 ms | 67.4% bf16 MFU | 259377 tok/s +step 2548/18794 | loss 3.430890 (+0.15z)| norm 0.1749 (-0.95z)| lr 5.90e-03 | 2003.73 ms | 68.5% bf16 MFU | 259491 tok/s +step 2549/18794 | loss 3.397908 (-0.78z)| norm 0.1778 (-0.84z)| lr 5.90e-03 | 1999.18 ms | 68.6% bf16 MFU | 259629 tok/s +step 2550/18794 | loss 3.471296 (+1.31z)| norm 0.1826 (-0.68z)| lr 5.90e-03 | 2024.33 ms | 67.8% bf16 MFU | 259598 tok/s +step 2551/18794 | loss 3.376134 (-1.36z)| norm 0.1907 (-0.39z)| lr 5.90e-03 | 2012.09 ms | 68.2% bf16 MFU | 259646 tok/s +step 2552/18794 | loss 3.379457 (-1.25z)| norm 0.1699 (-1.09z)| lr 5.90e-03 | 1992.42 ms | 68.9% bf16 MFU | 259821 tok/s +step 2553/18794 | loss 3.367578 (-1.54z)| norm 0.1666 (-1.19z)| lr 5.90e-03 | 2025.61 ms | 67.7% bf16 MFU | 259771 tok/s +step 2554/18794 | loss 3.474706 (+1.38z)| norm 0.1769 (-0.84z)| lr 5.90e-03 | 2019.02 ms | 68.0% bf16 MFU | 259766 tok/s +step 2555/18794 | loss 3.420415 (-0.09z)| norm 0.1883 (-0.45z)| lr 5.90e-03 | 2019.18 ms | 68.0% bf16 MFU | 259761 tok/s +step 2556/18794 | loss 3.385619 (-1.03z)| norm 0.1682 (-1.14z)| lr 5.90e-03 | 2014.15 ms | 68.1% bf16 MFU | 259788 tok/s +step 2557/18794 | loss 3.354209 (-1.84z)| norm 0.1472 (-1.83z)| lr 5.90e-03 | 2004.95 ms | 68.4% bf16 MFU | 259873 tok/s +step 2558/18794 | loss 3.384366 (-1.00z)| norm 0.1860 (-0.49z)| lr 5.90e-03 | 2006.28 ms | 68.4% bf16 MFU | 259946 tok/s +step 2559/18794 | loss 3.427214 (+0.18z)| norm 0.1797 (-0.70z)| lr 5.90e-03 | 2010.79 ms | 68.2% bf16 MFU | 259985 tok/s +step 2560/18794 | loss 3.402917 (-0.48z)| norm 0.1873 (-0.44z)| lr 5.90e-03 | 2003.58 ms | 68.5% bf16 MFU | 260070 tok/s +step 2561/18794 | loss 3.338217 (-2.21z)| norm 0.2175 (+0.60z)| lr 5.90e-03 | 2016.42 ms | 68.1% bf16 MFU | 260067 tok/s +step 2562/18794 | loss 3.432663 (+0.41z)| norm 0.2379 (+1.32z)| lr 5.90e-03 | 2011.50 ms | 68.2% bf16 MFU | 260096 tok/s +step 2563/18794 | loss 3.500652 (+2.27z)| norm 0.2032 (+0.13z)| lr 5.90e-03 | 2015.97 ms | 68.1% bf16 MFU | 260094 tok/s +step 2564/18794 | loss 3.428175 (+0.28z)| norm 0.1865 (-0.44z)| lr 5.90e-03 | 2021.13 ms | 67.9% bf16 MFU | 260060 tok/s +step 2565/18794 | loss 3.431640 (+0.39z)| norm 0.1876 (-0.39z)| lr 5.90e-03 | 2015.62 ms | 68.1% bf16 MFU | 260062 tok/s +step 2566/18794 | loss 3.442273 (+0.70z)| norm 0.2297 (+1.11z)| lr 5.90e-03 | 2034.67 ms | 67.4% bf16 MFU | 259943 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.000112 +step 2567/18794 | loss 3.436166 (+0.54z)| norm 0.2566 (+2.00z)| lr 5.90e-03 | 2022.57 ms | 67.9% bf16 MFU | 259907 tok/s +step 2568/18794 | loss 3.426847 (+0.29z)| norm 0.2200 (+0.71z)| lr 5.90e-03 | 1989.26 ms | 69.0% bf16 MFU | 260090 tok/s +step 2569/18794 | loss 3.362729 (-1.52z)| norm 0.2426 (+1.47z)| lr 5.90e-03 | 2030.10 ms | 67.6% bf16 MFU | 259998 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.501597 +step 2570/18794 | loss 3.401862 (-0.40z)| norm 0.2756 (+2.50z)| lr 5.90e-03 | 2023.09 ms | 67.8% bf16 MFU | 259956 tok/s +step 2571/18794 | loss 3.370208 (-1.28z)| norm 0.1814 (-0.66z)| lr 5.90e-03 | 2000.02 ms | 68.6% bf16 MFU | 260065 tok/s +step 2572/18794 | loss 3.379457 (-0.99z)| norm 0.2051 (+0.13z)| lr 5.90e-03 | 2018.28 ms | 68.0% bf16 MFU | 260050 tok/s +step 2573/18794 | loss 3.397970 (-0.46z)| norm 0.1848 (-0.54z)| lr 5.90e-03 | 2013.76 ms | 68.1% bf16 MFU | 260065 tok/s +step 2574/18794 | loss 3.415590 (+0.04z)| norm 0.2242 (+0.78z)| lr 5.90e-03 | 2023.75 ms | 67.8% bf16 MFU | 260015 tok/s +step 2575/18794 | loss 3.374725 (-1.10z)| norm 0.2164 (+0.53z)| lr 5.90e-03 | 1982.96 ms | 69.2% bf16 MFU | 260235 tok/s +step 2576/18794 | loss 3.342205 (-1.97z)| norm 0.1689 (-1.05z)| lr 5.90e-03 | 2018.40 ms | 68.0% bf16 MFU | 260211 tok/s +step 2577/18794 | loss 3.340056 (-1.99z)| norm 0.1878 (-0.42z)| lr 5.90e-03 | 2006.00 ms | 68.4% bf16 MFU | 260268 tok/s +step 2578/18794 | loss 3.374018 (-1.03z)| norm 0.1613 (-1.32z)| lr 5.90e-03 | 2016.48 ms | 68.1% bf16 MFU | 260255 tok/s +step 2579/18794 | loss 3.425027 (+0.37z)| norm 0.1736 (-0.90z)| lr 5.90e-03 | 2012.50 ms | 68.2% bf16 MFU | 260268 tok/s +step 2580/18794 | loss 3.385825 (-0.71z)| norm 0.1884 (-0.39z)| lr 5.90e-03 | 2015.38 ms | 68.1% bf16 MFU | 260262 tok/s +step 2581/18794 | loss 3.407279 (-0.11z)| norm 0.1619 (-1.27z)| lr 5.90e-03 | 2009.44 ms | 68.3% bf16 MFU | 260294 tok/s +step 2582/18794 | loss 3.408903 (-0.05z)| norm 0.1934 (-0.18z)| lr 5.90e-03 | 2020.82 ms | 67.9% bf16 MFU | 260252 tok/s +step 2583/18794 | loss 3.402719 (-0.22z)| norm 0.2199 (+0.72z)| lr 5.90e-03 | 2014.56 ms | 68.1% bf16 MFU | 260251 tok/s +step 2584/18794 | loss 3.378704 (-0.87z)| norm 0.2342 (+1.20z)| lr 5.90e-03 | 1982.61 ms | 69.2% bf16 MFU | 260461 tok/s +step 2585/18794 | loss 3.365612 (-1.22z)| norm 0.2011 (+0.05z)| lr 5.90e-03 | 2017.02 ms | 68.0% bf16 MFU | 260435 tok/s +step 2586/18794 | loss 3.374463 (-0.96z)| norm 0.2024 (+0.10z)| lr 5.89e-03 | 2001.36 ms | 68.6% bf16 MFU | 260511 tok/s +step 2587/18794 | loss 3.416571 (+0.23z)| norm 0.2191 (+0.69z)| lr 5.89e-03 | 2006.94 ms | 68.4% bf16 MFU | 260547 tok/s +step 2588/18794 | loss 3.397466 (-0.32z)| norm 0.2487 (+1.70z)| lr 5.89e-03 | 2017.05 ms | 68.0% bf16 MFU | 260517 tok/s +step 2589/18794 | loss 3.409906 (+0.03z)| norm 0.2432 (+1.48z)| lr 5.89e-03 | 1997.39 ms | 68.7% bf16 MFU | 260615 tok/s +step 2590/18794 | loss 3.393707 (-0.44z)| norm 0.2400 (+1.35z)| lr 5.89e-03 | 2016.82 ms | 68.0% bf16 MFU | 260582 tok/s +step 2591/18794 | loss 3.420644 (+0.35z)| norm 0.2087 (+0.28z)| lr 5.89e-03 | 2000.79 ms | 68.6% bf16 MFU | 260655 tok/s +step 2592/18794 | loss 3.414503 (+0.18z)| norm 0.1625 (-1.28z)| lr 5.89e-03 | 1996.08 ms | 68.8% bf16 MFU | 260755 tok/s +step 2593/18794 | loss 3.379664 (-0.83z)| norm 0.2083 (+0.26z)| lr 5.89e-03 | 1999.60 ms | 68.6% bf16 MFU | 260827 tok/s +step 2594/18794 | loss 3.425572 (+0.50z)| norm 0.2140 (+0.45z)| lr 5.89e-03 | 1991.02 ms | 68.9% bf16 MFU | 260952 tok/s +step 2595/18794 | loss 3.446124 (+1.08z)| norm 0.1681 (-1.12z)| lr 5.89e-03 | 2006.35 ms | 68.4% bf16 MFU | 260970 tok/s +step 2596/18794 | loss 3.379664 (-0.83z)| norm 0.1549 (-1.56z)| lr 5.89e-03 | 1988.58 ms | 69.0% bf16 MFU | 261104 tok/s +step 2597/18794 | loss 3.364284 (-1.25z)| norm 0.1527 (-1.60z)| lr 5.89e-03 | 2017.32 ms | 68.0% bf16 MFU | 261044 tok/s +step 2598/18794 | loss 3.400740 (-0.18z)| norm 0.1676 (-1.08z)| lr 5.89e-03 | 2009.47 ms | 68.3% bf16 MFU | 261037 tok/s +step 2599/18794 | loss 3.390006 (-0.49z)| norm 0.2013 (+0.06z)| lr 5.89e-03 | 2009.15 ms | 68.3% bf16 MFU | 261033 tok/s +step 2600/18794 | loss 3.395296 (-0.34z)| norm 0.2013 (+0.07z)| lr 5.89e-03 | 1993.77 ms | 68.8% bf16 MFU | 261129 tok/s +step 2601/18794 | loss 3.359452 (-1.38z)| norm 0.1810 (-0.63z)| lr 5.89e-03 | 1998.71 ms | 68.7% bf16 MFU | 261188 tok/s +step 2602/18794 | loss 3.325585 (-2.28z)| norm 0.2076 (+0.27z)| lr 5.89e-03 | 1997.68 ms | 68.7% bf16 MFU | 261251 tok/s +step 2603/18794 | loss 3.405595 (-0.02z)| norm 0.2348 (+1.18z)| lr 5.89e-03 | 1987.81 ms | 69.0% bf16 MFU | 261376 tok/s +step 2604/18794 | loss 3.398598 (-0.21z)| norm 0.2356 (+1.18z)| lr 5.89e-03 | 1989.57 ms | 69.0% bf16 MFU | 261483 tok/s +step 2605/18794 | loss 3.368244 (-1.05z)| norm 0.1829 (-0.60z)| lr 5.89e-03 | 2000.62 ms | 68.6% bf16 MFU | 261512 tok/s +step 2606/18794 | loss 3.375777 (-0.82z)| norm 0.1608 (-1.35z)| lr 5.89e-03 | 2009.66 ms | 68.3% bf16 MFU | 261481 tok/s +step 2607/18794 | loss 3.365873 (-1.09z)| norm 0.1814 (-0.61z)| lr 5.89e-03 | 2000.24 ms | 68.6% bf16 MFU | 261513 tok/s +step 2608/18794 | loss 3.448185 (+1.23z)| norm 0.1721 (-0.92z)| lr 5.89e-03 | 2002.49 ms | 68.5% bf16 MFU | 261528 tok/s +step 2609/18794 | loss 3.390005 (-0.41z)| norm 0.1534 (-1.54z)| lr 5.89e-03 | 2005.94 ms | 68.4% bf16 MFU | 261520 tok/s +step 2610/18794 | loss 3.413680 (+0.27z)| norm 0.1751 (-0.77z)| lr 5.89e-03 | 2003.68 ms | 68.5% bf16 MFU | 261527 tok/s +step 2611/18794 | loss 3.439508 (+1.00z)| norm 0.1757 (-0.73z)| lr 5.89e-03 | 1992.87 ms | 68.9% bf16 MFU | 261605 tok/s +step 2612/18794 | loss 3.394503 (-0.27z)| norm 0.1625 (-1.18z)| lr 5.89e-03 | 1979.31 ms | 69.3% bf16 MFU | 261769 tok/s +step 2613/18794 | loss 3.412652 (+0.25z)| norm 0.2052 (+0.30z)| lr 5.89e-03 | 2011.42 ms | 68.2% bf16 MFU | 261713 tok/s +step 2614/18794 | loss 3.384273 (-0.55z)| norm 0.2233 (+0.93z)| lr 5.89e-03 | 1997.39 ms | 68.7% bf16 MFU | 261752 tok/s +step 2615/18794 | loss 3.448430 (+1.30z)| norm 0.2482 (+1.80z)| lr 5.89e-03 | 1999.45 ms | 68.6% bf16 MFU | 261775 tok/s +step 2616/18794 | loss 3.435536 (+0.91z)| norm 0.2179 (+0.73z)| lr 5.89e-03 | 1995.63 ms | 68.8% bf16 MFU | 261822 tok/s +step 2617/18794 | loss 3.437019 (+0.95z)| norm 0.2027 (+0.20z)| lr 5.89e-03 | 1991.18 ms | 68.9% bf16 MFU | 261896 tok/s +step 2618/18794 | loss 3.354160 (-1.42z)| norm 0.1820 (-0.52z)| lr 5.89e-03 | 1981.69 ms | 69.3% bf16 MFU | 262030 tok/s +step 2619/18794 | loss 3.424162 (+0.58z)| norm 0.1595 (-1.28z)| lr 5.89e-03 | 1994.49 ms | 68.8% bf16 MFU | 262072 tok/s +step 2620/18794 | loss 3.391134 (-0.38z)| norm 0.1992 (+0.10z)| lr 5.89e-03 | 2001.40 ms | 68.6% bf16 MFU | 262066 tok/s +step 2621/18794 | loss 3.402813 (-0.03z)| norm 0.2022 (+0.22z)| lr 5.89e-03 | 1980.63 ms | 69.3% bf16 MFU | 262198 tok/s +step 2622/18794 | loss 3.385050 (-0.55z)| norm 0.1858 (-0.35z)| lr 5.89e-03 | 1990.26 ms | 69.0% bf16 MFU | 262260 tok/s +step 2623/18794 | loss 3.396967 (-0.19z)| norm 0.1939 (-0.06z)| lr 5.89e-03 | 2002.66 ms | 68.5% bf16 MFU | 262236 tok/s +step 2624/18794 | loss 3.425336 (+0.62z)| norm 0.2203 (+0.85z)| lr 5.89e-03 | 1986.81 ms | 69.1% bf16 MFU | 262319 tok/s +step 2625/18794 | loss 3.371495 (-0.93z)| norm 0.2437 (+1.64z)| lr 5.89e-03 | 1981.61 ms | 69.3% bf16 MFU | 262432 tok/s +step 2626/18794 | loss 3.428139 (+0.72z)| norm 0.2297 (+1.13z)| lr 5.89e-03 | 1985.28 ms | 69.1% bf16 MFU | 262514 tok/s +step 2627/18794 | loss 3.430413 (+0.78z)| norm 0.2118 (+0.49z)| lr 5.89e-03 | 2005.61 ms | 68.4% bf16 MFU | 262459 tok/s +step 2628/18794 | loss 3.398536 (-0.16z)| norm 0.2125 (+0.50z)| lr 5.89e-03 | 1989.98 ms | 69.0% bf16 MFU | 262510 tok/s +step 2629/18794 | loss 3.355555 (-1.39z)| norm 0.2565 (+2.00z)| lr 5.89e-03 | 1978.64 ms | 69.4% bf16 MFU | 262633 tok/s +reducing beta2 to 0.9 and lr/wd by 0.753 due to grad z-score of 4.649765 +step 2630/18794 | loss 3.407778 (+0.16z)| norm 0.3511 (+4.65z)| lr 4.43e-03 | 1997.63 ms | 68.7% bf16 MFU | 262624 tok/s +step 2631/18794 | loss 3.410204 (+0.26z)| norm 0.1578 (-1.31z)| lr 5.89e-03 | 1983.11 ms | 69.2% bf16 MFU | 262711 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.078311 +step 2632/18794 | loss 3.405459 (+0.12z)| norm 0.2693 (+2.08z)| lr 5.89e-03 | 1995.19 ms | 68.8% bf16 MFU | 262715 tok/s +step 2633/18794 | loss 3.427226 (+0.80z)| norm 0.1927 (-0.24z)| lr 5.89e-03 | 1978.47 ms | 69.4% bf16 MFU | 262829 tok/s +step 2634/18794 | loss 3.370055 (-0.96z)| norm 0.1774 (-0.69z)| lr 5.89e-03 | 1991.72 ms | 68.9% bf16 MFU | 262849 tok/s +step 2635/18794 | loss 3.387574 (-0.42z)| norm 0.1433 (-1.69z)| lr 5.89e-03 | 1982.66 ms | 69.2% bf16 MFU | 262928 tok/s +step 2636/18794 | loss 3.396966 (-0.13z)| norm 0.1526 (-1.38z)| lr 5.89e-03 | 1982.96 ms | 69.2% bf16 MFU | 263002 tok/s +step 2637/18794 | loss 3.434905 (+1.04z)| norm 0.1953 (-0.10z)| lr 5.89e-03 | 1991.04 ms | 68.9% bf16 MFU | 263018 tok/s +step 2638/18794 | loss 3.356338 (-1.35z)| norm 0.1983 (-0.01z)| lr 5.89e-03 | 1995.55 ms | 68.8% bf16 MFU | 263003 tok/s +step 2639/18794 | loss 3.363670 (-1.11z)| norm 0.2092 (+0.31z)| lr 5.89e-03 | 1989.18 ms | 69.0% bf16 MFU | 263032 tok/s +step 2640/18794 | loss 3.454740 (+1.62z)| norm 0.2285 (+0.89z)| lr 5.89e-03 | 1991.72 ms | 68.9% bf16 MFU | 263042 tok/s +step 2641/18794 | loss 3.415318 (+0.42z)| norm 0.2397 (+1.21z)| lr 5.89e-03 | 1998.84 ms | 68.7% bf16 MFU | 263004 tok/s +step 2642/18794 | loss 3.328760 (-2.11z)| norm 0.1906 (-0.26z)| lr 5.89e-03 | 1979.93 ms | 69.3% bf16 MFU | 263094 tok/s +step 2643/18794 | loss 3.420257 (+0.63z)| norm 0.1702 (-0.85z)| lr 5.89e-03 | 1991.65 ms | 68.9% bf16 MFU | 263102 tok/s +step 2644/18794 | loss 3.329117 (-2.15z)| norm 0.1779 (-0.62z)| lr 5.89e-03 | 1997.13 ms | 68.7% bf16 MFU | 263073 tok/s +step 2645/18794 | loss 3.357521 (-1.26z)| norm 0.1684 (-0.92z)| lr 5.89e-03 | 1996.28 ms | 68.7% bf16 MFU | 263051 tok/s +step 2646/18794 | loss 3.416048 (+0.51z)| norm 0.1948 (-0.12z)| lr 5.89e-03 | 1991.68 ms | 68.9% bf16 MFU | 263060 tok/s +step 2647/18794 | loss 3.355459 (-1.30z)| norm 0.2108 (+0.36z)| lr 5.89e-03 | 1977.69 ms | 69.4% bf16 MFU | 263162 tok/s +step 2648/18794 | loss 3.379876 (-0.55z)| norm 0.1953 (-0.12z)| lr 5.89e-03 | 1997.96 ms | 68.7% bf16 MFU | 263125 tok/s +step 2649/18794 | loss 3.404817 (+0.20z)| norm 0.1676 (-0.96z)| lr 5.89e-03 | 2002.27 ms | 68.5% bf16 MFU | 263061 tok/s +step 2650/18794 | loss 3.395056 (-0.07z)| norm 0.1553 (-1.31z)| lr 5.89e-03 | 1998.68 ms | 68.7% bf16 MFU | 263024 tok/s +step 2651/18794 | loss 3.438462 (+1.25z)| norm 0.1575 (-1.22z)| lr 5.89e-03 | 1987.69 ms | 69.0% bf16 MFU | 263061 tok/s +step 2652/18794 | loss 3.366890 (-0.95z)| norm 0.1788 (-0.59z)| lr 5.89e-03 | 1987.79 ms | 69.0% bf16 MFU | 263095 tok/s +step 2653/18794 | loss 3.315400 (-2.45z)| norm 0.1605 (-1.14z)| lr 5.89e-03 | 1980.38 ms | 69.3% bf16 MFU | 263178 tok/s +step 2654/18794 | loss 3.405451 (+0.27z)| norm 0.1544 (-1.30z)| lr 5.89e-03 | 1990.26 ms | 69.0% bf16 MFU | 263190 tok/s +step 2655/18794 | loss 3.415860 (+0.59z)| norm 0.1727 (-0.75z)| lr 5.89e-03 | 1988.20 ms | 69.0% bf16 MFU | 263216 tok/s +step 2656/18794 | loss 3.399874 (+0.10z)| norm 0.2379 (+1.16z)| lr 5.89e-03 | 1993.63 ms | 68.8% bf16 MFU | 263204 tok/s +step 2657/18794 | loss 3.454570 (+1.74z)| norm 0.2138 (+0.43z)| lr 5.89e-03 | 2000.00 ms | 68.6% bf16 MFU | 263151 tok/s +step 2658/18794 | loss 3.383031 (-0.45z)| norm 0.1899 (-0.29z)| lr 5.89e-03 | 1978.26 ms | 69.4% bf16 MFU | 263245 tok/s +step 2659/18794 | loss 3.367761 (-0.90z)| norm 0.1767 (-0.68z)| lr 5.89e-03 | 1984.59 ms | 69.1% bf16 MFU | 263291 tok/s +step 2660/18794 | loss 3.403556 (+0.20z)| norm 0.1548 (-1.31z)| lr 5.89e-03 | 1982.45 ms | 69.2% bf16 MFU | 263350 tok/s +step 2661/18794 | loss 3.375927 (-0.67z)| norm 0.1604 (-1.13z)| lr 5.88e-03 | 1981.39 ms | 69.3% bf16 MFU | 263413 tok/s +step 2662/18794 | loss 3.381870 (-0.47z)| norm 0.1853 (-0.38z)| lr 5.88e-03 | 1981.04 ms | 69.3% bf16 MFU | 263475 tok/s +step 2663/18794 | loss 3.377122 (-0.61z)| norm 0.1827 (-0.45z)| lr 5.88e-03 | 1981.98 ms | 69.2% bf16 MFU | 263527 tok/s +step 2664/18794 | loss 3.400488 (+0.17z)| norm 0.1614 (-1.07z)| lr 5.88e-03 | 1984.21 ms | 69.2% bf16 MFU | 263562 tok/s +step 2665/18794 | loss 3.390150 (-0.16z)| norm 0.2193 (+0.63z)| lr 5.88e-03 | 1984.05 ms | 69.2% bf16 MFU | 263597 tok/s +step 2666/18794 | loss 3.403313 (+0.29z)| norm 0.2513 (+1.56z)| lr 5.88e-03 | 1980.07 ms | 69.3% bf16 MFU | 263656 tok/s +step 2667/18794 | loss 3.352434 (-1.40z)| norm 0.2426 (+1.31z)| lr 5.88e-03 | 1984.82 ms | 69.1% bf16 MFU | 263681 tok/s +step 2668/18794 | loss 3.419126 (+0.86z)| norm 0.2248 (+0.79z)| lr 5.88e-03 | 1981.10 ms | 69.3% bf16 MFU | 263729 tok/s +step 2669/18794 | loss 3.356552 (-1.25z)| norm 0.2003 (+0.08z)| lr 5.88e-03 | 1995.36 ms | 68.8% bf16 MFU | 263680 tok/s +step 2670/18794 | loss 3.390885 (-0.09z)| norm 0.1809 (-0.48z)| lr 5.88e-03 | 2038.23 ms | 67.3% bf16 MFU | 263358 tok/s +step 2671/18794 | loss 3.360732 (-1.10z)| norm 0.1758 (-0.63z)| lr 5.88e-03 | 2040.17 ms | 67.3% bf16 MFU | 263039 tok/s +step 2672/18794 | loss 3.392113 (-0.05z)| norm 0.1647 (-0.96z)| lr 5.88e-03 | 2025.00 ms | 67.8% bf16 MFU | 262832 tok/s +step 2673/18794 | loss 3.356551 (-1.23z)| norm 0.1479 (-1.44z)| lr 5.88e-03 | 2039.14 ms | 67.3% bf16 MFU | 262546 tok/s +step 2674/18794 | loss 3.429713 (+1.21z)| norm 0.1617 (-1.00z)| lr 5.88e-03 | 2039.85 ms | 67.3% bf16 MFU | 262270 tok/s +step 2675/18794 | loss 3.360979 (-1.07z)| norm 0.1913 (-0.11z)| lr 5.88e-03 | 2027.49 ms | 67.7% bf16 MFU | 262086 tok/s +step 2676/18794 | loss 3.410189 (+0.55z)| norm 0.2152 (+0.59z)| lr 5.88e-03 | 2025.72 ms | 67.7% bf16 MFU | 261923 tok/s +step 2677/18794 | loss 3.352028 (-1.42z)| norm 0.2230 (+0.82z)| lr 5.88e-03 | 2034.05 ms | 67.5% bf16 MFU | 261714 tok/s +step 2678/18794 | loss 3.386056 (-0.27z)| norm 0.2077 (+0.35z)| lr 5.88e-03 | 2031.88 ms | 67.5% bf16 MFU | 261530 tok/s +step 2679/18794 | loss 3.381227 (-0.42z)| norm 0.2044 (+0.24z)| lr 5.88e-03 | 2019.41 ms | 68.0% bf16 MFU | 261435 tok/s +step 2680/18794 | loss 3.437392 (+1.45z)| norm 0.2078 (+0.34z)| lr 5.88e-03 | 2039.41 ms | 67.3% bf16 MFU | 261217 tok/s +step 2681/18794 | loss 3.401559 (+0.25z)| norm 0.2162 (+0.57z)| lr 5.88e-03 | 2033.36 ms | 67.5% bf16 MFU | 261048 tok/s +step 2682/18794 | loss 3.394855 (+0.03z)| norm 0.1754 (-0.65z)| lr 5.88e-03 | 2039.26 ms | 67.3% bf16 MFU | 260851 tok/s +step 2683/18794 | loss 3.357778 (-1.20z)| norm 0.2194 (+0.67z)| lr 5.88e-03 | 2040.61 ms | 67.3% bf16 MFU | 260655 tok/s +step 2684/18794 | loss 3.310651 (-2.66z)| norm 0.2253 (+0.86z)| lr 5.88e-03 | 2021.21 ms | 67.9% bf16 MFU | 260591 tok/s +step 2685/18794 | loss 3.401007 (+0.25z)| norm 0.1838 (-0.40z)| lr 5.88e-03 | 2026.26 ms | 67.7% bf16 MFU | 260499 tok/s +step 2686/18794 | loss 3.333146 (-1.90z)| norm 0.1724 (-0.73z)| lr 5.88e-03 | 2027.93 ms | 67.7% bf16 MFU | 260401 tok/s +step 2687/18794 | loss 3.367871 (-0.78z)| norm 0.2126 (+0.49z)| lr 5.88e-03 | 2036.31 ms | 67.4% bf16 MFU | 260254 tok/s +step 2688/18794 | loss 3.424900 (+1.02z)| norm 0.2090 (+0.39z)| lr 5.88e-03 | 2013.51 ms | 68.2% bf16 MFU | 260261 tok/s +step 2689/18794 | loss 3.417659 (+0.79z)| norm 0.1978 (+0.07z)| lr 5.88e-03 | 2022.05 ms | 67.9% bf16 MFU | 260212 tok/s +step 2690/18794 | loss 3.376793 (-0.50z)| norm 0.1802 (-0.46z)| lr 5.88e-03 | 2037.75 ms | 67.3% bf16 MFU | 260066 tok/s +step 2691/18794 | loss 3.506165 (+3.37z)| norm 0.2032 (+0.26z)| lr 5.88e-03 | 2033.06 ms | 67.5% bf16 MFU | 259957 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.543984 +step 2692/18794 | loss 3.412156 (+0.56z)| norm 0.2800 (+2.54z)| lr 5.88e-03 | 2031.55 ms | 67.6% bf16 MFU | 259862 tok/s +step 2693/18794 | loss 3.462327 (+2.00z)| norm 0.2611 (+1.92z)| lr 5.88e-03 | 2012.18 ms | 68.2% bf16 MFU | 259897 tok/s +step 2694/18794 | loss 3.439396 (+1.32z)| norm 0.2323 (+1.05z)| lr 5.88e-03 | 2025.72 ms | 67.7% bf16 MFU | 259843 tok/s +step 2695/18794 | loss 3.397449 (+0.11z)| norm 0.2122 (+0.44z)| lr 5.88e-03 | 2026.19 ms | 67.7% bf16 MFU | 259789 tok/s +step 2696/18794 | loss 3.341024 (-1.53z)| norm 0.2077 (+0.30z)| lr 5.88e-03 | 2026.04 ms | 67.7% bf16 MFU | 259738 tok/s +step 2697/18794 | loss 3.371396 (-0.65z)| norm 0.1966 (-0.05z)| lr 5.88e-03 | 2003.21 ms | 68.5% bf16 MFU | 259837 tok/s +step 2698/18794 | loss 3.356777 (-1.06z)| norm 0.1844 (-0.43z)| lr 5.88e-03 | 2026.63 ms | 67.7% bf16 MFU | 259780 tok/s +step 2699/18794 | loss 3.348316 (-1.28z)| norm 0.1837 (-0.44z)| lr 5.88e-03 | 2025.01 ms | 67.8% bf16 MFU | 259737 tok/s +step 2700/18794 | loss 3.374628 (-0.51z)| norm 0.1686 (-0.88z)| lr 5.88e-03 | 2019.72 ms | 67.9% bf16 MFU | 259729 tok/s +step 2701/18794 | loss 3.380237 (-0.36z)| norm 0.1606 (-1.12z)| lr 5.88e-03 | 2024.54 ms | 67.8% bf16 MFU | 259691 tok/s +step 2702/18794 | loss 3.315578 (-2.21z)| norm 0.1804 (-0.51z)| lr 5.88e-03 | 2037.74 ms | 67.3% bf16 MFU | 259571 tok/s +step 2703/18794 | loss 3.372786 (-0.56z)| norm 0.1665 (-0.91z)| lr 5.88e-03 | 2025.63 ms | 67.7% bf16 MFU | 259534 tok/s +step 2704/18794 | loss 3.358872 (-0.94z)| norm 0.1588 (-1.12z)| lr 5.88e-03 | 2019.04 ms | 68.0% bf16 MFU | 259541 tok/s +step 2705/18794 | loss 3.387133 (-0.14z)| norm 0.2200 (+0.71z)| lr 5.88e-03 | 2000.75 ms | 68.6% bf16 MFU | 259666 tok/s +step 2706/18794 | loss 3.354594 (-1.06z)| norm 0.2102 (+0.40z)| lr 5.88e-03 | 2027.32 ms | 67.7% bf16 MFU | 259613 tok/s +step 2707/18794 | loss 3.418401 (+0.74z)| norm 0.2017 (+0.14z)| lr 5.88e-03 | 2033.21 ms | 67.5% bf16 MFU | 259526 tok/s +step 2708/18794 | loss 3.426289 (+0.98z)| norm 0.2150 (+0.53z)| lr 5.88e-03 | 2025.86 ms | 67.7% bf16 MFU | 259489 tok/s +step 2709/18794 | loss 3.350521 (-1.18z)| norm 0.2355 (+1.13z)| lr 5.88e-03 | 2017.19 ms | 68.0% bf16 MFU | 259510 tok/s +step 2710/18794 | loss 3.419365 (+0.79z)| norm 0.1979 (-0.02z)| lr 5.88e-03 | 2029.06 ms | 67.6% bf16 MFU | 259454 tok/s +step 2711/18794 | loss 3.356819 (-0.98z)| norm 0.1642 (-1.04z)| lr 5.88e-03 | 2020.64 ms | 67.9% bf16 MFU | 259455 tok/s +step 2712/18794 | loss 3.404808 (+0.39z)| norm 0.2094 (+0.32z)| lr 5.88e-03 | 2024.86 ms | 67.8% bf16 MFU | 259428 tok/s +step 2713/18794 | loss 3.441800 (+1.43z)| norm 0.2141 (+0.46z)| lr 5.88e-03 | 2018.22 ms | 68.0% bf16 MFU | 259446 tok/s +step 2714/18794 | loss 3.376679 (-0.41z)| norm 0.1843 (-0.44z)| lr 5.88e-03 | 2040.01 ms | 67.3% bf16 MFU | 259324 tok/s +step 2715/18794 | loss 3.377433 (-0.38z)| norm 0.1867 (-0.35z)| lr 5.88e-03 | 2018.81 ms | 68.0% bf16 MFU | 259343 tok/s +step 2716/18794 | loss 3.290469 (-2.75z)| norm 0.1785 (-0.59z)| lr 5.88e-03 | 2039.82 ms | 67.3% bf16 MFU | 259227 tok/s +step 2717/18794 | loss 3.353175 (-0.98z)| norm 0.2119 (+0.44z)| lr 5.88e-03 | 2017.78 ms | 68.0% bf16 MFU | 259257 tok/s +step 2718/18794 | loss 3.365669 (-0.64z)| norm 0.2368 (+1.18z)| lr 5.88e-03 | 2024.96 ms | 67.8% bf16 MFU | 259240 tok/s +step 2719/18794 | loss 3.394836 (+0.19z)| norm 0.2420 (+1.32z)| lr 5.88e-03 | 2009.74 ms | 68.3% bf16 MFU | 259322 tok/s +step 2720/18794 | loss 3.410739 (+0.63z)| norm 0.2607 (+1.84z)| lr 5.88e-03 | 2008.96 ms | 68.3% bf16 MFU | 259404 tok/s +step 2721/18794 | loss 3.416728 (+0.79z)| norm 0.2612 (+1.80z)| lr 5.88e-03 | 2017.73 ms | 68.0% bf16 MFU | 259426 tok/s +step 2722/18794 | loss 3.449080 (+1.66z)| norm 0.2148 (+0.42z)| lr 5.88e-03 | 2009.89 ms | 68.3% bf16 MFU | 259497 tok/s +step 2723/18794 | loss 3.406375 (+0.48z)| norm 0.2400 (+1.15z)| lr 5.88e-03 | 2010.53 ms | 68.3% bf16 MFU | 259561 tok/s +step 2724/18794 | loss 3.383560 (-0.14z)| norm 0.2368 (+1.04z)| lr 5.88e-03 | 2011.44 ms | 68.2% bf16 MFU | 259616 tok/s +step 2725/18794 | loss 3.429132 (+1.10z)| norm 0.2299 (+0.85z)| lr 5.88e-03 | 2026.51 ms | 67.7% bf16 MFU | 259571 tok/s +step 2726/18794 | loss 3.421847 (+0.90z)| norm 0.2438 (+1.25z)| lr 5.88e-03 | 2018.50 ms | 68.0% bf16 MFU | 259579 tok/s +step 2727/18794 | loss 3.392417 (+0.10z)| norm 0.2131 (+0.35z)| lr 5.88e-03 | 2011.31 ms | 68.2% bf16 MFU | 259634 tok/s +step 2728/18794 | loss 3.335622 (-1.45z)| norm 0.1991 (-0.06z)| lr 5.88e-03 | 2016.54 ms | 68.1% bf16 MFU | 259652 tok/s +step 2729/18794 | loss 3.413590 (+0.68z)| norm 0.2058 (+0.15z)| lr 5.88e-03 | 2009.05 ms | 68.3% bf16 MFU | 259717 tok/s +step 2730/18794 | loss 3.457231 (+1.85z)| norm 0.2173 (+0.60z)| lr 5.88e-03 | 2023.91 ms | 67.8% bf16 MFU | 259684 tok/s +step 2731/18794 | loss 3.380812 (-0.22z)| norm 0.2318 (+1.06z)| lr 5.88e-03 | 2009.03 ms | 68.3% bf16 MFU | 259748 tok/s +step 2732/18794 | loss 3.383492 (-0.15z)| norm 0.2186 (+0.66z)| lr 5.87e-03 | 2007.85 ms | 68.3% bf16 MFU | 259816 tok/s +step 2733/18794 | loss 3.416468 (+0.76z)| norm 0.1913 (-0.28z)| lr 5.87e-03 | 2013.93 ms | 68.1% bf16 MFU | 259842 tok/s +step 2734/18794 | loss 3.331789 (-1.53z)| norm 0.1820 (-0.60z)| lr 5.87e-03 | 2006.36 ms | 68.4% bf16 MFU | 259916 tok/s +step 2735/18794 | loss 3.391550 (+0.09z)| norm 0.1806 (-0.67z)| lr 5.87e-03 | 2017.73 ms | 68.0% bf16 MFU | 259912 tok/s +step 2736/18794 | loss 3.382355 (-0.16z)| norm 0.1600 (-1.39z)| lr 5.87e-03 | 2009.61 ms | 68.3% bf16 MFU | 259961 tok/s +step 2737/18794 | loss 3.387733 (-0.00z)| norm 0.1426 (-1.94z)| lr 5.87e-03 | 2021.20 ms | 67.9% bf16 MFU | 259933 tok/s +step 2738/18794 | loss 3.414463 (+0.71z)| norm 0.1599 (-1.33z)| lr 5.87e-03 | 2017.06 ms | 68.0% bf16 MFU | 259932 tok/s +step 2739/18794 | loss 3.386673 (-0.05z)| norm 0.1548 (-1.47z)| lr 5.87e-03 | 2009.25 ms | 68.3% bf16 MFU | 259983 tok/s +step 2740/18794 | loss 3.387175 (-0.02z)| norm 0.1840 (-0.47z)| lr 5.87e-03 | 2024.72 ms | 67.8% bf16 MFU | 259931 tok/s +step 2741/18794 | loss 3.358110 (-0.81z)| norm 0.1768 (-0.70z)| lr 5.87e-03 | 2021.03 ms | 67.9% bf16 MFU | 259905 tok/s +step 2742/18794 | loss 3.348652 (-1.09z)| norm 0.1773 (-0.68z)| lr 5.87e-03 | 2009.52 ms | 68.3% bf16 MFU | 259955 tok/s +step 2743/18794 | loss 3.420463 (+0.92z)| norm 0.2406 (+1.44z)| lr 5.87e-03 | 2014.70 ms | 68.1% bf16 MFU | 259969 tok/s +step 2744/18794 | loss 3.384291 (-0.11z)| norm 0.2329 (+1.15z)| lr 5.87e-03 | 2010.91 ms | 68.2% bf16 MFU | 260006 tok/s +step 2745/18794 | loss 3.453878 (+1.82z)| norm 0.2181 (+0.64z)| lr 5.87e-03 | 2024.12 ms | 67.8% bf16 MFU | 259957 tok/s +step 2746/18794 | loss 3.393648 (+0.14z)| norm 0.2051 (+0.20z)| lr 5.87e-03 | 2000.83 ms | 68.6% bf16 MFU | 260061 tok/s +step 2747/18794 | loss 3.348958 (-1.12z)| norm 0.1649 (-1.13z)| lr 5.87e-03 | 2016.04 ms | 68.1% bf16 MFU | 260061 tok/s +step 2748/18794 | loss 3.398119 (+0.26z)| norm 0.1685 (-0.99z)| lr 5.87e-03 | 2010.93 ms | 68.2% bf16 MFU | 260094 tok/s +step 2749/18794 | loss 3.426958 (+1.06z)| norm 0.2487 (+1.64z)| lr 5.87e-03 | 2009.60 ms | 68.3% bf16 MFU | 260133 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.424246 +step 2750/18794 | loss 3.397898 (+0.24z)| norm 0.3100 (+3.42z)| lr 5.87e-03 | 1995.48 ms | 68.8% bf16 MFU | 260264 tok/s +val loss 3.408185 +HellaSwag: 2736/10042 = 0.272456: 0/1256 +step 2751/18794 | loss 3.368092 (-0.57z)| norm 0.2650 (+1.96z)| lr 5.87e-03 | 2010.50 ms | 68.3% bf16 MFU | 260289 tok/s +step 2752/18794 | loss 3.418825 (+0.84z)| norm 0.2111 (+0.28z)| lr 5.87e-03 | 2012.10 ms | 68.2% bf16 MFU | 260303 tok/s +step 2753/18794 | loss 3.346637 (-1.22z)| norm 0.1952 (-0.23z)| lr 5.87e-03 | 2007.63 ms | 68.4% bf16 MFU | 260345 tok/s +step 2754/18794 | loss 3.349175 (-1.12z)| norm 0.1724 (-0.96z)| lr 5.87e-03 | 2020.41 ms | 67.9% bf16 MFU | 260303 tok/s +step 2755/18794 | loss 3.351206 (-1.04z)| norm 0.1699 (-1.03z)| lr 5.87e-03 | 2018.30 ms | 68.0% bf16 MFU | 260276 tok/s +step 2756/18794 | loss 3.393121 (+0.14z)| norm 0.1752 (-0.85z)| lr 5.87e-03 | 2018.65 ms | 68.0% bf16 MFU | 260248 tok/s +step 2757/18794 | loss 3.372497 (-0.42z)| norm 0.2037 (+0.06z)| lr 5.87e-03 | 1995.53 ms | 68.8% bf16 MFU | 260373 tok/s +step 2758/18794 | loss 3.423479 (+1.03z)| norm 0.1805 (-0.67z)| lr 5.87e-03 | 2011.19 ms | 68.2% bf16 MFU | 260388 tok/s +step 2759/18794 | loss 3.394570 (+0.19z)| norm 0.1806 (-0.67z)| lr 5.87e-03 | 2003.25 ms | 68.5% bf16 MFU | 260455 tok/s +step 2760/18794 | loss 3.327908 (-1.68z)| norm 0.1895 (-0.40z)| lr 5.87e-03 | 2003.37 ms | 68.5% bf16 MFU | 260517 tok/s +step 2761/18794 | loss 3.390616 (+0.09z)| norm 0.1818 (-0.66z)| lr 5.87e-03 | 2014.13 ms | 68.1% bf16 MFU | 260506 tok/s +step 2762/18794 | loss 3.357482 (-0.83z)| norm 0.1960 (-0.21z)| lr 5.87e-03 | 2002.88 ms | 68.5% bf16 MFU | 260570 tok/s +step 2763/18794 | loss 3.399165 (+0.34z)| norm 0.2004 (-0.07z)| lr 5.87e-03 | 1995.99 ms | 68.8% bf16 MFU | 260675 tok/s +step 2764/18794 | loss 3.378615 (-0.24z)| norm 0.1569 (-1.47z)| lr 5.87e-03 | 2004.06 ms | 68.5% bf16 MFU | 260721 tok/s +step 2765/18794 | loss 3.372643 (-0.40z)| norm 0.1722 (-0.96z)| lr 5.87e-03 | 2005.20 ms | 68.4% bf16 MFU | 260759 tok/s +step 2766/18794 | loss 3.342562 (-1.22z)| norm 0.2601 (+1.85z)| lr 5.87e-03 | 2009.98 ms | 68.3% bf16 MFU | 260763 tok/s +step 2767/18794 | loss 3.412653 (+0.72z)| norm 0.2616 (+1.87z)| lr 5.87e-03 | 2018.71 ms | 68.0% bf16 MFU | 260710 tok/s +step 2768/18794 | loss 3.329004 (-1.58z)| norm 0.1931 (-0.29z)| lr 5.87e-03 | 2005.35 ms | 68.4% bf16 MFU | 260747 tok/s +step 2769/18794 | loss 3.367077 (-0.53z)| norm 0.2066 (+0.14z)| lr 5.87e-03 | 2005.04 ms | 68.4% bf16 MFU | 260784 tok/s +step 2770/18794 | loss 3.475118 (+2.39z)| norm 0.1915 (-0.34z)| lr 5.87e-03 | 2006.44 ms | 68.4% bf16 MFU | 260810 tok/s +step 2771/18794 | loss 3.378943 (-0.22z)| norm 0.1954 (-0.23z)| lr 5.87e-03 | 1992.36 ms | 68.9% bf16 MFU | 260927 tok/s +step 2772/18794 | loss 3.368694 (-0.49z)| norm 0.1645 (-1.21z)| lr 5.87e-03 | 1996.59 ms | 68.7% bf16 MFU | 261010 tok/s +step 2773/18794 | loss 3.393060 (+0.16z)| norm 0.2035 (+0.01z)| lr 5.87e-03 | 2012.68 ms | 68.2% bf16 MFU | 260984 tok/s +step 2774/18794 | loss 3.343392 (-1.17z)| norm 0.2352 (+1.02z)| lr 5.87e-03 | 2012.80 ms | 68.2% bf16 MFU | 260959 tok/s +step 2775/18794 | loss 3.381862 (-0.13z)| norm 0.2354 (+1.01z)| lr 5.87e-03 | 2007.26 ms | 68.4% bf16 MFU | 260971 tok/s +step 2776/18794 | loss 3.420190 (+0.91z)| norm 0.2232 (+0.61z)| lr 5.87e-03 | 1994.98 ms | 68.8% bf16 MFU | 261062 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.073477 +step 2777/18794 | loss 3.410128 (+0.62z)| norm 0.2703 (+2.07z)| lr 5.87e-03 | 1993.81 ms | 68.8% bf16 MFU | 261157 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.203586 +step 2778/18794 | loss 3.366686 (-0.55z)| norm 0.2769 (+2.20z)| lr 5.87e-03 | 2003.85 ms | 68.5% bf16 MFU | 261181 tok/s +step 2779/18794 | loss 3.385664 (-0.04z)| norm 0.2297 (+0.74z)| lr 5.87e-03 | 1997.11 ms | 68.7% bf16 MFU | 261248 tok/s +step 2780/18794 | loss 3.374385 (-0.33z)| norm 0.1894 (-0.49z)| lr 5.87e-03 | 2008.87 ms | 68.3% bf16 MFU | 261235 tok/s +step 2781/18794 | loss 3.379514 (-0.18z)| norm 0.1700 (-1.07z)| lr 5.87e-03 | 2003.21 ms | 68.5% bf16 MFU | 261260 tok/s +step 2782/18794 | loss 3.354952 (-0.85z)| norm 0.2138 (+0.26z)| lr 5.87e-03 | 2005.94 ms | 68.4% bf16 MFU | 261265 tok/s +step 2783/18794 | loss 3.395899 (+0.27z)| norm 0.2219 (+0.50z)| lr 5.87e-03 | 1993.45 ms | 68.8% bf16 MFU | 261352 tok/s +step 2784/18794 | loss 3.363677 (-0.65z)| norm 0.2072 (+0.06z)| lr 5.87e-03 | 2002.73 ms | 68.5% bf16 MFU | 261374 tok/s +step 2785/18794 | loss 3.394095 (+0.21z)| norm 0.2163 (+0.33z)| lr 5.87e-03 | 1988.76 ms | 69.0% bf16 MFU | 261486 tok/s +step 2786/18794 | loss 3.365471 (-0.61z)| norm 0.1765 (-0.90z)| lr 5.87e-03 | 2010.06 ms | 68.3% bf16 MFU | 261454 tok/s +step 2787/18794 | loss 3.423934 (+1.03z)| norm 0.1470 (-1.76z)| lr 5.87e-03 | 2001.12 ms | 68.6% bf16 MFU | 261481 tok/s +step 2788/18794 | loss 3.367597 (-0.55z)| norm 0.1853 (-0.59z)| lr 5.87e-03 | 2020.12 ms | 67.9% bf16 MFU | 261383 tok/s +step 2789/18794 | loss 3.393183 (+0.18z)| norm 0.1549 (-1.48z)| lr 5.87e-03 | 2010.66 ms | 68.3% bf16 MFU | 261352 tok/s +step 2790/18794 | loss 3.409921 (+0.65z)| norm 0.1699 (-1.03z)| lr 5.87e-03 | 2013.25 ms | 68.2% bf16 MFU | 261305 tok/s +step 2791/18794 | loss 3.347496 (-1.14z)| norm 0.1933 (-0.33z)| lr 5.87e-03 | 2003.45 ms | 68.5% bf16 MFU | 261325 tok/s +step 2792/18794 | loss 3.390654 (+0.16z)| norm 0.2084 (+0.15z)| lr 5.87e-03 | 2005.22 ms | 68.4% bf16 MFU | 261332 tok/s +step 2793/18794 | loss 3.395947 (+0.35z)| norm 0.1754 (-0.84z)| lr 5.87e-03 | 2003.61 ms | 68.5% bf16 MFU | 261349 tok/s +step 2794/18794 | loss 3.381352 (-0.08z)| norm 0.1940 (-0.25z)| lr 5.87e-03 | 2017.16 ms | 68.0% bf16 MFU | 261277 tok/s +step 2795/18794 | loss 3.340662 (-1.33z)| norm 0.2465 (+1.36z)| lr 5.87e-03 | 1999.59 ms | 68.6% bf16 MFU | 261323 tok/s +step 2796/18794 | loss 3.387837 (+0.12z)| norm 0.2459 (+1.32z)| lr 5.87e-03 | 2008.36 ms | 68.3% bf16 MFU | 261309 tok/s +step 2797/18794 | loss 3.426710 (+1.31z)| norm 0.1978 (-0.16z)| lr 5.87e-03 | 2015.67 ms | 68.1% bf16 MFU | 261249 tok/s +step 2798/18794 | loss 3.390388 (+0.17z)| norm 0.2068 (+0.11z)| lr 5.87e-03 | 1994.66 ms | 68.8% bf16 MFU | 261329 tok/s +step 2799/18794 | loss 3.390512 (+0.17z)| norm 0.2690 (+1.96z)| lr 5.87e-03 | 2006.42 ms | 68.4% bf16 MFU | 261328 tok/s +step 2800/18794 | loss 3.360688 (-0.77z)| norm 0.1685 (-1.07z)| lr 5.86e-03 | 2009.12 ms | 68.3% bf16 MFU | 261309 tok/s +step 2801/18794 | loss 3.387136 (+0.06z)| norm 0.1693 (-1.05z)| lr 5.86e-03 | 2009.51 ms | 68.3% bf16 MFU | 261289 tok/s +step 2802/18794 | loss 3.456040 (+2.18z)| norm 0.2413 (+1.10z)| lr 5.86e-03 | 1996.06 ms | 68.8% bf16 MFU | 261358 tok/s +step 2803/18794 | loss 3.475616 (+2.66z)| norm 0.2150 (+0.30z)| lr 5.86e-03 | 1999.61 ms | 68.6% bf16 MFU | 261399 tok/s +step 2804/18794 | loss 3.329558 (-1.73z)| norm 0.2327 (+0.82z)| lr 5.86e-03 | 1998.55 ms | 68.7% bf16 MFU | 261446 tok/s +step 2805/18794 | loss 3.340209 (-1.38z)| norm 0.2228 (+0.51z)| lr 5.86e-03 | 1995.24 ms | 68.8% bf16 MFU | 261512 tok/s +step 2806/18794 | loss 3.381770 (-0.16z)| norm 0.1836 (-0.67z)| lr 5.86e-03 | 1992.74 ms | 68.9% bf16 MFU | 261592 tok/s +step 2807/18794 | loss 3.423983 (+1.09z)| norm 0.1655 (-1.20z)| lr 5.86e-03 | 1995.83 ms | 68.8% bf16 MFU | 261647 tok/s +step 2808/18794 | loss 3.345149 (-1.22z)| norm 0.1787 (-0.79z)| lr 5.86e-03 | 1996.29 ms | 68.7% bf16 MFU | 261696 tok/s +step 2809/18794 | loss 3.360812 (-0.76z)| norm 0.1644 (-1.20z)| lr 5.86e-03 | 2003.33 ms | 68.5% bf16 MFU | 261696 tok/s +step 2810/18794 | loss 3.418001 (+0.94z)| norm 0.1806 (-0.70z)| lr 5.86e-03 | 1989.15 ms | 69.0% bf16 MFU | 261790 tok/s +step 2811/18794 | loss 3.380873 (-0.17z)| norm 0.1554 (-1.45z)| lr 5.86e-03 | 1995.01 ms | 68.8% bf16 MFU | 261841 tok/s +step 2812/18794 | loss 3.380900 (-0.17z)| norm 0.1571 (-1.37z)| lr 5.86e-03 | 1996.37 ms | 68.7% bf16 MFU | 261880 tok/s +step 2813/18794 | loss 3.379523 (-0.19z)| norm 0.2022 (-0.03z)| lr 5.86e-03 | 1995.91 ms | 68.8% bf16 MFU | 261920 tok/s +step 2814/18794 | loss 3.314087 (-2.11z)| norm 0.2044 (+0.03z)| lr 5.86e-03 | 2001.94 ms | 68.5% bf16 MFU | 261918 tok/s +step 2815/18794 | loss 3.418571 (+0.97z)| norm 0.2296 (+0.76z)| lr 5.86e-03 | 2000.11 ms | 68.6% bf16 MFU | 261929 tok/s +step 2816/18794 | loss 3.386804 (+0.01z)| norm 0.2451 (+1.19z)| lr 5.86e-03 | 1995.57 ms | 68.8% bf16 MFU | 261969 tok/s +step 2817/18794 | loss 3.374826 (-0.37z)| norm 0.2343 (+0.87z)| lr 5.86e-03 | 2010.45 ms | 68.3% bf16 MFU | 261909 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.282243 +step 2818/18794 | loss 3.402033 (+0.46z)| norm 0.2848 (+2.28z)| lr 5.86e-03 | 1993.80 ms | 68.8% bf16 MFU | 261962 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.487141 +step 2819/18794 | loss 3.363766 (-0.71z)| norm 0.2948 (+2.49z)| lr 5.86e-03 | 1996.67 ms | 68.7% bf16 MFU | 261993 tok/s +step 2820/18794 | loss 3.386482 (-0.00z)| norm 0.1911 (-0.40z)| lr 5.86e-03 | 1987.06 ms | 69.1% bf16 MFU | 262086 tok/s +step 2821/18794 | loss 3.323906 (-1.89z)| norm 0.1924 (-0.35z)| lr 5.86e-03 | 1988.40 ms | 69.0% bf16 MFU | 262165 tok/s +step 2822/18794 | loss 3.395137 (+0.31z)| norm 0.2100 (+0.16z)| lr 5.86e-03 | 1994.89 ms | 68.8% bf16 MFU | 262198 tok/s +step 2823/18794 | loss 3.381223 (-0.11z)| norm 0.2193 (+0.43z)| lr 5.86e-03 | 1985.49 ms | 69.1% bf16 MFU | 262291 tok/s +step 2824/18794 | loss 3.386282 (+0.04z)| norm 0.1716 (-0.92z)| lr 5.86e-03 | 1994.06 ms | 68.8% bf16 MFU | 262322 tok/s +step 2825/18794 | loss 3.360170 (-0.76z)| norm 0.1507 (-1.49z)| lr 5.86e-03 | 1996.08 ms | 68.8% bf16 MFU | 262339 tok/s +step 2826/18794 | loss 3.381555 (-0.07z)| norm 0.1920 (-0.29z)| lr 5.86e-03 | 1986.10 ms | 69.1% bf16 MFU | 262421 tok/s +step 2827/18794 | loss 3.384946 (+0.04z)| norm 0.2182 (+0.45z)| lr 5.86e-03 | 2002.05 ms | 68.5% bf16 MFU | 262394 tok/s +step 2828/18794 | loss 3.356338 (-0.88z)| norm 0.1955 (-0.20z)| lr 5.86e-03 | 1995.83 ms | 68.8% bf16 MFU | 262409 tok/s +step 2829/18794 | loss 3.390939 (+0.23z)| norm 0.1904 (-0.34z)| lr 5.86e-03 | 1995.39 ms | 68.8% bf16 MFU | 262426 tok/s +step 2830/18794 | loss 3.305242 (-2.46z)| norm 0.1712 (-0.87z)| lr 5.86e-03 | 1988.58 ms | 69.0% bf16 MFU | 262487 tok/s +step 2831/18794 | loss 3.355241 (-0.85z)| norm 0.2106 (+0.26z)| lr 5.86e-03 | 1992.95 ms | 68.9% bf16 MFU | 262516 tok/s +step 2832/18794 | loss 3.297442 (-2.57z)| norm 0.1977 (-0.10z)| lr 5.86e-03 | 1979.60 ms | 69.3% bf16 MFU | 262633 tok/s +step 2833/18794 | loss 3.345228 (-1.08z)| norm 0.1968 (-0.13z)| lr 5.86e-03 | 1996.13 ms | 68.7% bf16 MFU | 262634 tok/s +step 2834/18794 | loss 3.331434 (-1.50z)| norm 0.1795 (-0.63z)| lr 5.86e-03 | 2000.18 ms | 68.6% bf16 MFU | 262608 tok/s +step 2835/18794 | loss 3.404222 (+0.73z)| norm 0.2014 (-0.00z)| lr 5.86e-03 | 1990.68 ms | 68.9% bf16 MFU | 262646 tok/s +step 2836/18794 | loss 3.402039 (+0.65z)| norm 0.2160 (+0.40z)| lr 5.86e-03 | 1987.85 ms | 69.0% bf16 MFU | 262701 tok/s +step 2837/18794 | loss 3.410995 (+0.92z)| norm 0.1993 (-0.10z)| lr 5.86e-03 | 1997.21 ms | 68.7% bf16 MFU | 262692 tok/s +step 2838/18794 | loss 3.381486 (+0.03z)| norm 0.1810 (-0.65z)| lr 5.86e-03 | 1995.62 ms | 68.8% bf16 MFU | 262693 tok/s +step 2839/18794 | loss 3.392449 (+0.36z)| norm 0.1909 (-0.37z)| lr 5.86e-03 | 2002.60 ms | 68.5% bf16 MFU | 262648 tok/s +step 2840/18794 | loss 3.348748 (-0.96z)| norm 0.2004 (-0.09z)| lr 5.86e-03 | 1994.53 ms | 68.8% bf16 MFU | 262659 tok/s +step 2841/18794 | loss 3.383865 (+0.10z)| norm 0.2208 (+0.51z)| lr 5.86e-03 | 1984.15 ms | 69.2% bf16 MFU | 262738 tok/s +step 2842/18794 | loss 3.389659 (+0.27z)| norm 0.2199 (+0.47z)| lr 5.86e-03 | 2007.62 ms | 68.4% bf16 MFU | 262659 tok/s +step 2843/18794 | loss 3.391301 (+0.33z)| norm 0.1685 (-1.05z)| lr 5.86e-03 | 1987.21 ms | 69.1% bf16 MFU | 262717 tok/s +step 2844/18794 | loss 3.327316 (-1.61z)| norm 0.2067 (+0.10z)| lr 5.86e-03 | 1980.26 ms | 69.3% bf16 MFU | 262819 tok/s +step 2845/18794 | loss 3.439822 (+1.84z)| norm 0.2179 (+0.44z)| lr 5.86e-03 | 1978.87 ms | 69.3% bf16 MFU | 262925 tok/s +step 2846/18794 | loss 3.383356 (+0.11z)| norm 0.1736 (-0.88z)| lr 5.86e-03 | 1986.71 ms | 69.1% bf16 MFU | 262974 tok/s +step 2847/18794 | loss 3.302874 (-2.30z)| norm 0.2093 (+0.18z)| lr 5.86e-03 | 1978.85 ms | 69.3% bf16 MFU | 263073 tok/s +step 2848/18794 | loss 3.302392 (-2.23z)| norm 0.2025 (-0.04z)| lr 5.86e-03 | 1983.58 ms | 69.2% bf16 MFU | 263135 tok/s +step 2849/18794 | loss 3.401794 (+0.70z)| norm 0.1801 (-0.70z)| lr 5.86e-03 | 1993.88 ms | 68.8% bf16 MFU | 263125 tok/s +step 2850/18794 | loss 3.428688 (+1.47z)| norm 0.1843 (-0.56z)| lr 5.86e-03 | 1983.86 ms | 69.2% bf16 MFU | 263183 tok/s +step 2851/18794 | loss 3.430174 (+1.48z)| norm 0.1928 (-0.27z)| lr 5.86e-03 | 1986.34 ms | 69.1% bf16 MFU | 263221 tok/s +step 2852/18794 | loss 3.363487 (-0.44z)| norm 0.2167 (+0.51z)| lr 5.86e-03 | 1980.56 ms | 69.3% bf16 MFU | 263296 tok/s +step 2853/18794 | loss 3.368018 (-0.31z)| norm 0.2632 (+1.99z)| lr 5.86e-03 | 1984.98 ms | 69.1% bf16 MFU | 263338 tok/s +step 2854/18794 | loss 3.390928 (+0.35z)| norm 0.1993 (-0.09z)| lr 5.86e-03 | 1979.69 ms | 69.3% bf16 MFU | 263412 tok/s +step 2855/18794 | loss 3.351894 (-0.80z)| norm 0.1844 (-0.58z)| lr 5.86e-03 | 1977.84 ms | 69.4% bf16 MFU | 263496 tok/s +step 2856/18794 | loss 3.386915 (+0.23z)| norm 0.1954 (-0.23z)| lr 5.86e-03 | 1980.75 ms | 69.3% bf16 MFU | 263556 tok/s +step 2857/18794 | loss 3.361462 (-0.52z)| norm 0.1568 (-1.46z)| lr 5.86e-03 | 1979.66 ms | 69.3% bf16 MFU | 263620 tok/s +step 2858/18794 | loss 3.446470 (+1.96z)| norm 0.1576 (-1.42z)| lr 5.86e-03 | 1979.55 ms | 69.3% bf16 MFU | 263681 tok/s +step 2859/18794 | loss 3.394696 (+0.45z)| norm 0.1848 (-0.55z)| lr 5.86e-03 | 1979.25 ms | 69.3% bf16 MFU | 263742 tok/s +step 2860/18794 | loss 3.392278 (+0.36z)| norm 0.1874 (-0.46z)| lr 5.86e-03 | 2017.09 ms | 68.0% bf16 MFU | 263551 tok/s +step 2861/18794 | loss 3.378669 (-0.03z)| norm 0.2168 (+0.47z)| lr 5.86e-03 | 2035.19 ms | 67.4% bf16 MFU | 263254 tok/s +step 2862/18794 | loss 3.395882 (+0.46z)| norm 0.2374 (+1.12z)| lr 5.86e-03 | 2033.01 ms | 67.5% bf16 MFU | 262986 tok/s +step 2863/18794 | loss 3.441921 (+1.78z)| norm 0.2636 (+1.90z)| lr 5.86e-03 | 2041.72 ms | 67.2% bf16 MFU | 262676 tok/s +step 2864/18794 | loss 3.379095 (-0.04z)| norm 0.2397 (+1.13z)| lr 5.86e-03 | 2033.42 ms | 67.5% bf16 MFU | 262434 tok/s +step 2865/18794 | loss 3.353705 (-0.77z)| norm 0.2019 (-0.07z)| lr 5.86e-03 | 2034.68 ms | 67.4% bf16 MFU | 262196 tok/s +step 2866/18794 | loss 3.403835 (+0.66z)| norm 0.1776 (-0.83z)| lr 5.85e-03 | 2041.83 ms | 67.2% bf16 MFU | 261925 tok/s +step 2867/18794 | loss 3.376101 (-0.13z)| norm 0.1619 (-1.31z)| lr 5.85e-03 | 2035.11 ms | 67.4% bf16 MFU | 261709 tok/s +step 2868/18794 | loss 3.331832 (-1.43z)| norm 0.1467 (-1.76z)| lr 5.85e-03 | 2033.67 ms | 67.5% bf16 MFU | 261514 tok/s +step 2869/18794 | loss 3.409978 (+0.84z)| norm 0.1546 (-1.48z)| lr 5.85e-03 | 2032.93 ms | 67.5% bf16 MFU | 261333 tok/s +step 2870/18794 | loss 3.360001 (-0.60z)| norm 0.1672 (-1.07z)| lr 5.85e-03 | 2040.50 ms | 67.3% bf16 MFU | 261114 tok/s +step 2871/18794 | loss 3.422291 (+1.26z)| norm 0.1939 (-0.23z)| lr 5.85e-03 | 2025.83 ms | 67.7% bf16 MFU | 260998 tok/s +step 2872/18794 | loss 3.274285 (-3.01z)| norm 0.2301 (+0.89z)| lr 5.85e-03 | 2041.58 ms | 67.2% bf16 MFU | 260788 tok/s +step 2873/18794 | loss 3.409708 (+0.86z)| norm 0.2218 (+0.62z)| lr 5.85e-03 | 2040.53 ms | 67.3% bf16 MFU | 260596 tok/s +step 2874/18794 | loss 3.352533 (-0.78z)| norm 0.2586 (+1.75z)| lr 5.85e-03 | 2033.49 ms | 67.5% bf16 MFU | 260457 tok/s +step 2875/18794 | loss 3.404048 (+0.69z)| norm 0.2591 (+1.74z)| lr 5.85e-03 | 2033.65 ms | 67.5% bf16 MFU | 260325 tok/s +step 2876/18794 | loss 3.373477 (-0.17z)| norm 0.2081 (+0.18z)| lr 5.85e-03 | 2025.28 ms | 67.8% bf16 MFU | 260252 tok/s +step 2877/18794 | loss 3.406539 (+0.78z)| norm 0.2108 (+0.29z)| lr 5.85e-03 | 2033.60 ms | 67.5% bf16 MFU | 260130 tok/s +step 2878/18794 | loss 3.355647 (-0.68z)| norm 0.1813 (-0.63z)| lr 5.85e-03 | 2027.49 ms | 67.7% bf16 MFU | 260053 tok/s +step 2879/18794 | loss 3.360740 (-0.52z)| norm 0.1944 (-0.20z)| lr 5.85e-03 | 2026.54 ms | 67.7% bf16 MFU | 259986 tok/s +step 2880/18794 | loss 3.299709 (-2.20z)| norm 0.2186 (+0.58z)| lr 5.85e-03 | 2041.57 ms | 67.2% bf16 MFU | 259827 tok/s +step 2881/18794 | loss 3.344238 (-0.94z)| norm 0.2303 (+0.94z)| lr 5.85e-03 | 2027.26 ms | 67.7% bf16 MFU | 259767 tok/s +step 2882/18794 | loss 3.310213 (-1.84z)| norm 0.2131 (+0.38z)| lr 5.85e-03 | 2026.80 ms | 67.7% bf16 MFU | 259712 tok/s +step 2883/18794 | loss 3.406255 (+0.78z)| norm 0.2613 (+1.91z)| lr 5.85e-03 | 2043.02 ms | 67.2% bf16 MFU | 259558 tok/s +step 2884/18794 | loss 3.409867 (+0.87z)| norm 0.2560 (+1.70z)| lr 5.85e-03 | 2040.24 ms | 67.3% bf16 MFU | 259429 tok/s +step 2885/18794 | loss 3.348848 (-0.78z)| norm 0.1924 (-0.30z)| lr 5.85e-03 | 2016.46 ms | 68.1% bf16 MFU | 259457 tok/s +step 2886/18794 | loss 3.335996 (-1.12z)| norm 0.2162 (+0.44z)| lr 5.85e-03 | 2033.89 ms | 67.5% bf16 MFU | 259373 tok/s +step 2887/18794 | loss 3.371562 (-0.14z)| norm 0.1924 (-0.33z)| lr 5.85e-03 | 2032.93 ms | 67.5% bf16 MFU | 259300 tok/s +step 2888/18794 | loss 3.398717 (+0.59z)| norm 0.2413 (+1.21z)| lr 5.85e-03 | 2034.64 ms | 67.4% bf16 MFU | 259219 tok/s +step 2889/18794 | loss 3.378426 (+0.04z)| norm 0.1805 (-0.74z)| lr 5.85e-03 | 2040.91 ms | 67.2% bf16 MFU | 259102 tok/s +step 2890/18794 | loss 3.361390 (-0.41z)| norm 0.1781 (-0.83z)| lr 5.85e-03 | 2017.65 ms | 68.0% bf16 MFU | 259140 tok/s +step 2891/18794 | loss 3.400977 (+0.66z)| norm 0.1823 (-0.69z)| lr 5.85e-03 | 2034.53 ms | 67.5% bf16 MFU | 259067 tok/s +step 2892/18794 | loss 3.367652 (-0.25z)| norm 0.1850 (-0.59z)| lr 5.85e-03 | 2026.44 ms | 67.7% bf16 MFU | 259050 tok/s +step 2893/18794 | loss 3.352001 (-0.67z)| norm 0.1872 (-0.53z)| lr 5.85e-03 | 2008.17 ms | 68.3% bf16 MFU | 259152 tok/s +step 2894/18794 | loss 3.374967 (-0.04z)| norm 0.2216 (+0.57z)| lr 5.85e-03 | 2026.22 ms | 67.7% bf16 MFU | 259132 tok/s +step 2895/18794 | loss 3.382313 (+0.15z)| norm 0.2555 (+1.66z)| lr 5.85e-03 | 2033.26 ms | 67.5% bf16 MFU | 259068 tok/s +step 2896/18794 | loss 3.381609 (+0.14z)| norm 0.2455 (+1.34z)| lr 5.85e-03 | 2033.84 ms | 67.5% bf16 MFU | 259003 tok/s +step 2897/18794 | loss 3.387446 (+0.31z)| norm 0.2037 (-0.01z)| lr 5.85e-03 | 2017.39 ms | 68.0% bf16 MFU | 259048 tok/s +step 2898/18794 | loss 3.357314 (-0.52z)| norm 0.1794 (-0.77z)| lr 5.85e-03 | 2017.50 ms | 68.0% bf16 MFU | 259089 tok/s +step 2899/18794 | loss 3.322144 (-1.46z)| norm 0.1603 (-1.37z)| lr 5.85e-03 | 2017.97 ms | 68.0% bf16 MFU | 259125 tok/s +step 2900/18794 | loss 3.410026 (+0.94z)| norm 0.1889 (-0.45z)| lr 5.85e-03 | 2017.69 ms | 68.0% bf16 MFU | 259161 tok/s +step 2901/18794 | loss 3.373369 (-0.06z)| norm 0.1941 (-0.29z)| lr 5.85e-03 | 2002.96 ms | 68.5% bf16 MFU | 259291 tok/s +step 2902/18794 | loss 3.323874 (-1.40z)| norm 0.1611 (-1.35z)| lr 5.85e-03 | 2025.92 ms | 67.7% bf16 MFU | 259266 tok/s +step 2903/18794 | loss 3.321034 (-1.48z)| norm 0.1705 (-1.02z)| lr 5.85e-03 | 2009.98 ms | 68.3% bf16 MFU | 259344 tok/s +step 2904/18794 | loss 3.395306 (+0.63z)| norm 0.2056 (+0.14z)| lr 5.85e-03 | 2034.69 ms | 67.4% bf16 MFU | 259261 tok/s +step 2905/18794 | loss 3.328140 (-1.29z)| norm 0.1846 (-0.54z)| lr 5.85e-03 | 2026.31 ms | 67.7% bf16 MFU | 259235 tok/s +step 2906/18794 | loss 3.380099 (+0.20z)| norm 0.1790 (-0.72z)| lr 5.85e-03 | 2025.81 ms | 67.7% bf16 MFU | 259213 tok/s +step 2907/18794 | loss 3.347362 (-0.73z)| norm 0.1772 (-0.79z)| lr 5.85e-03 | 2040.87 ms | 67.2% bf16 MFU | 259097 tok/s +step 2908/18794 | loss 3.340091 (-0.93z)| norm 0.1787 (-0.74z)| lr 5.85e-03 | 2026.75 ms | 67.7% bf16 MFU | 259077 tok/s +step 2909/18794 | loss 3.344227 (-0.81z)| norm 0.1860 (-0.51z)| lr 5.85e-03 | 2017.59 ms | 68.0% bf16 MFU | 259116 tok/s +step 2910/18794 | loss 3.391413 (+0.56z)| norm 0.1947 (-0.23z)| lr 5.85e-03 | 2025.36 ms | 67.8% bf16 MFU | 259103 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.579801 +step 2911/18794 | loss 3.397459 (+0.73z)| norm 0.2823 (+2.58z)| lr 5.85e-03 | 2018.43 ms | 68.0% bf16 MFU | 259135 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.142713 +step 2912/18794 | loss 3.385136 (+0.38z)| norm 0.3052 (+3.14z)| lr 5.85e-03 | 2002.19 ms | 68.5% bf16 MFU | 259271 tok/s +step 2913/18794 | loss 3.334844 (-1.06z)| norm 0.1879 (-0.50z)| lr 5.85e-03 | 2025.39 ms | 67.8% bf16 MFU | 259251 tok/s +step 2914/18794 | loss 3.383715 (+0.33z)| norm 0.2152 (+0.34z)| lr 5.85e-03 | 2004.18 ms | 68.5% bf16 MFU | 259368 tok/s +step 2915/18794 | loss 3.341222 (-0.89z)| norm 0.2108 (+0.21z)| lr 5.85e-03 | 1994.49 ms | 68.8% bf16 MFU | 259543 tok/s +step 2916/18794 | loss 3.374838 (+0.10z)| norm 0.1763 (-0.85z)| lr 5.85e-03 | 2016.33 ms | 68.1% bf16 MFU | 259567 tok/s +step 2917/18794 | loss 3.349695 (-0.63z)| norm 0.1704 (-1.01z)| lr 5.85e-03 | 2033.24 ms | 67.5% bf16 MFU | 259482 tok/s +step 2918/18794 | loss 3.375059 (+0.12z)| norm 0.1894 (-0.40z)| lr 5.85e-03 | 2017.66 ms | 68.0% bf16 MFU | 259500 tok/s +step 2919/18794 | loss 3.353218 (-0.52z)| norm 0.1811 (-0.66z)| lr 5.85e-03 | 2034.58 ms | 67.4% bf16 MFU | 259409 tok/s +step 2920/18794 | loss 3.396522 (+0.75z)| norm 0.1583 (-1.41z)| lr 5.85e-03 | 2017.78 ms | 68.0% bf16 MFU | 259431 tok/s +step 2921/18794 | loss 3.368282 (-0.09z)| norm 0.1476 (-1.73z)| lr 5.85e-03 | 2042.61 ms | 67.2% bf16 MFU | 259293 tok/s +step 2922/18794 | loss 3.370324 (-0.03z)| norm 0.1605 (-1.28z)| lr 5.85e-03 | 2025.53 ms | 67.8% bf16 MFU | 259270 tok/s +step 2923/18794 | loss 3.343313 (-0.82z)| norm 0.1519 (-1.52z)| lr 5.85e-03 | 2002.13 ms | 68.5% bf16 MFU | 259400 tok/s +step 2924/18794 | loss 3.359707 (-0.32z)| norm 0.1589 (-1.28z)| lr 5.85e-03 | 2004.07 ms | 68.5% bf16 MFU | 259510 tok/s +step 2925/18794 | loss 3.357969 (-0.37z)| norm 0.1519 (-1.51z)| lr 5.85e-03 | 2023.99 ms | 67.8% bf16 MFU | 259487 tok/s +step 2926/18794 | loss 3.375847 (+0.16z)| norm 0.1530 (-1.45z)| lr 5.85e-03 | 2025.57 ms | 67.8% bf16 MFU | 259454 tok/s +step 2927/18794 | loss 3.314602 (-1.62z)| norm 0.2207 (+0.72z)| lr 5.85e-03 | 2016.93 ms | 68.0% bf16 MFU | 259479 tok/s +step 2928/18794 | loss 3.397507 (+0.80z)| norm 0.2286 (+0.96z)| lr 5.85e-03 | 2017.80 ms | 68.0% bf16 MFU | 259496 tok/s +step 2929/18794 | loss 3.350680 (-0.56z)| norm 0.2484 (+1.55z)| lr 5.85e-03 | 2024.56 ms | 67.8% bf16 MFU | 259470 tok/s +step 2930/18794 | loss 3.364136 (-0.19z)| norm 0.2384 (+1.21z)| lr 5.84e-03 | 2016.37 ms | 68.1% bf16 MFU | 259497 tok/s +step 2931/18794 | loss 3.338162 (-0.95z)| norm 0.2055 (+0.18z)| lr 5.84e-03 | 2016.22 ms | 68.1% bf16 MFU | 259524 tok/s +step 2932/18794 | loss 3.360022 (-0.33z)| norm 0.1950 (-0.15z)| lr 5.84e-03 | 2025.41 ms | 67.8% bf16 MFU | 259490 tok/s +step 2933/18794 | loss 3.430866 (+1.77z)| norm 0.1977 (-0.06z)| lr 5.84e-03 | 2016.69 ms | 68.0% bf16 MFU | 259515 tok/s +step 2934/18794 | loss 3.386540 (+0.43z)| norm 0.2303 (+0.94z)| lr 5.84e-03 | 2017.23 ms | 68.0% bf16 MFU | 259534 tok/s +reducing beta2 to 0.9 and lr/wd by 0.866 due to grad z-score of 4.041031 +step 2935/18794 | loss 3.324711 (-1.40z)| norm 0.3429 (+4.04z)| lr 5.06e-03 | 2018.50 ms | 68.0% bf16 MFU | 259544 tok/s +step 2936/18794 | loss 3.412515 (+1.22z)| norm 0.2653 (+1.78z)| lr 5.84e-03 | 2025.76 ms | 67.7% bf16 MFU | 259508 tok/s +step 2937/18794 | loss 3.340080 (-0.92z)| norm 0.2292 (+0.75z)| lr 5.84e-03 | 2026.26 ms | 67.7% bf16 MFU | 259470 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.214732 +step 2938/18794 | loss 3.337784 (-0.97z)| norm 0.2842 (+2.21z)| lr 5.84e-03 | 2025.67 ms | 67.7% bf16 MFU | 259437 tok/s +step 2939/18794 | loss 3.325737 (-1.30z)| norm 0.1860 (-0.48z)| lr 5.84e-03 | 1993.15 ms | 68.9% bf16 MFU | 259618 tok/s +step 2940/18794 | loss 3.347463 (-0.66z)| norm 0.1937 (-0.26z)| lr 5.84e-03 | 2026.21 ms | 67.7% bf16 MFU | 259574 tok/s +step 2941/18794 | loss 3.394096 (+0.72z)| norm 0.1942 (-0.24z)| lr 5.84e-03 | 2009.21 ms | 68.3% bf16 MFU | 259643 tok/s +step 2942/18794 | loss 3.366736 (-0.08z)| norm 0.1599 (-1.16z)| lr 5.84e-03 | 2010.40 ms | 68.3% bf16 MFU | 259700 tok/s +step 2943/18794 | loss 3.302808 (-1.92z)| norm 0.1748 (-0.76z)| lr 5.84e-03 | 2018.25 ms | 68.0% bf16 MFU | 259704 tok/s +step 2944/18794 | loss 3.431536 (+1.78z)| norm 0.1681 (-0.93z)| lr 5.84e-03 | 2017.39 ms | 68.0% bf16 MFU | 259713 tok/s +step 2945/18794 | loss 3.346063 (-0.67z)| norm 0.1703 (-0.85z)| lr 5.84e-03 | 2010.56 ms | 68.3% bf16 MFU | 259765 tok/s +step 2946/18794 | loss 3.434685 (+1.89z)| norm 0.1907 (-0.30z)| lr 5.84e-03 | 2018.10 ms | 68.0% bf16 MFU | 259767 tok/s +step 2947/18794 | loss 3.310474 (-1.71z)| norm 0.1938 (-0.22z)| lr 5.84e-03 | 2009.49 ms | 68.3% bf16 MFU | 259824 tok/s +step 2948/18794 | loss 3.329879 (-1.17z)| norm 0.1422 (-1.58z)| lr 5.84e-03 | 2010.40 ms | 68.3% bf16 MFU | 259872 tok/s +step 2949/18794 | loss 3.381248 (+0.35z)| norm 0.1558 (-1.20z)| lr 5.84e-03 | 2025.33 ms | 67.8% bf16 MFU | 259822 tok/s +step 2950/18794 | loss 3.355247 (-0.40z)| norm 0.1935 (-0.20z)| lr 5.84e-03 | 2026.52 ms | 67.7% bf16 MFU | 259766 tok/s +step 2951/18794 | loss 3.346022 (-0.67z)| norm 0.2013 (+0.01z)| lr 5.84e-03 | 2009.83 ms | 68.3% bf16 MFU | 259821 tok/s +step 2952/18794 | loss 3.371404 (+0.10z)| norm 0.1983 (-0.07z)| lr 5.84e-03 | 2009.91 ms | 68.3% bf16 MFU | 259873 tok/s +step 2953/18794 | loss 3.408672 (+1.22z)| norm 0.1657 (-0.92z)| lr 5.84e-03 | 2008.71 ms | 68.3% bf16 MFU | 259929 tok/s +step 2954/18794 | loss 3.394328 (+0.78z)| norm 0.1567 (-1.15z)| lr 5.84e-03 | 1992.02 ms | 68.9% bf16 MFU | 260093 tok/s +step 2955/18794 | loss 3.402380 (+1.01z)| norm 0.2226 (+0.61z)| lr 5.84e-03 | 2011.87 ms | 68.2% bf16 MFU | 260118 tok/s +step 2956/18794 | loss 3.317822 (-1.50z)| norm 0.2105 (+0.28z)| lr 5.84e-03 | 2008.65 ms | 68.3% bf16 MFU | 260163 tok/s +step 2957/18794 | loss 3.346658 (-0.64z)| norm 0.1698 (-0.81z)| lr 5.84e-03 | 2002.10 ms | 68.5% bf16 MFU | 260248 tok/s +step 2958/18794 | loss 3.364983 (-0.07z)| norm 0.2118 (+0.30z)| lr 5.84e-03 | 2016.48 ms | 68.1% bf16 MFU | 260236 tok/s +step 2959/18794 | loss 3.306385 (-1.81z)| norm 0.1837 (-0.46z)| lr 5.84e-03 | 2017.31 ms | 68.0% bf16 MFU | 260219 tok/s +step 2960/18794 | loss 3.377861 (+0.35z)| norm 0.2250 (+0.65z)| lr 5.84e-03 | 1986.70 ms | 69.1% bf16 MFU | 260403 tok/s +step 2961/18794 | loss 3.366896 (+0.02z)| norm 0.1978 (-0.08z)| lr 5.84e-03 | 2017.58 ms | 68.0% bf16 MFU | 260375 tok/s +step 2962/18794 | loss 3.439961 (+2.18z)| norm 0.1814 (-0.51z)| lr 5.84e-03 | 2002.52 ms | 68.5% bf16 MFU | 260447 tok/s +step 2963/18794 | loss 3.378934 (+0.40z)| norm 0.1443 (-1.49z)| lr 5.84e-03 | 2008.15 ms | 68.3% bf16 MFU | 260479 tok/s +step 2964/18794 | loss 3.318386 (-1.41z)| norm 0.1606 (-1.03z)| lr 5.84e-03 | 1989.67 ms | 69.0% bf16 MFU | 260630 tok/s +step 2965/18794 | loss 3.362620 (-0.08z)| norm 0.1731 (-0.68z)| lr 5.84e-03 | 1995.06 ms | 68.8% bf16 MFU | 260738 tok/s +step 2966/18794 | loss 3.373633 (+0.26z)| norm 0.1946 (-0.10z)| lr 5.84e-03 | 1993.07 ms | 68.9% bf16 MFU | 260854 tok/s +step 2967/18794 | loss 3.333143 (-0.95z)| norm 0.2159 (+0.47z)| lr 5.84e-03 | 1992.97 ms | 68.9% bf16 MFU | 260965 tok/s +step 2968/18794 | loss 3.368293 (+0.10z)| norm 0.2163 (+0.47z)| lr 5.84e-03 | 2026.10 ms | 67.7% bf16 MFU | 260855 tok/s +step 2969/18794 | loss 3.345726 (-0.57z)| norm 0.2245 (+0.68z)| lr 5.84e-03 | 1994.29 ms | 68.8% bf16 MFU | 260957 tok/s +step 2970/18794 | loss 3.355309 (-0.28z)| norm 0.1991 (-0.04z)| lr 5.84e-03 | 2001.88 ms | 68.6% bf16 MFU | 261004 tok/s +step 2971/18794 | loss 3.358971 (-0.15z)| norm 0.1763 (-0.67z)| lr 5.84e-03 | 1995.77 ms | 68.8% bf16 MFU | 261089 tok/s +step 2972/18794 | loss 3.358990 (-0.18z)| norm 0.1944 (-0.15z)| lr 5.84e-03 | 1994.86 ms | 68.8% bf16 MFU | 261175 tok/s +step 2973/18794 | loss 3.366875 (+0.09z)| norm 0.2172 (+0.48z)| lr 5.84e-03 | 1986.66 ms | 69.1% bf16 MFU | 261312 tok/s +step 2974/18794 | loss 3.368706 (+0.14z)| norm 0.2668 (+1.86z)| lr 5.84e-03 | 1995.32 ms | 68.8% bf16 MFU | 261384 tok/s +step 2975/18794 | loss 3.391604 (+0.90z)| norm 0.2316 (+0.90z)| lr 5.84e-03 | 2009.46 ms | 68.3% bf16 MFU | 261360 tok/s +step 2976/18794 | loss 3.377441 (+0.43z)| norm 0.2132 (+0.38z)| lr 5.84e-03 | 1994.60 ms | 68.8% bf16 MFU | 261435 tok/s +step 2977/18794 | loss 3.317889 (-1.49z)| norm 0.1860 (-0.38z)| lr 5.84e-03 | 2002.38 ms | 68.5% bf16 MFU | 261455 tok/s +step 2978/18794 | loss 3.377922 (+0.47z)| norm 0.1525 (-1.30z)| lr 5.84e-03 | 2002.11 ms | 68.5% bf16 MFU | 261476 tok/s +step 2979/18794 | loss 3.334029 (-0.95z)| norm 0.1706 (-0.79z)| lr 5.84e-03 | 2010.28 ms | 68.3% bf16 MFU | 261442 tok/s +step 2980/18794 | loss 3.339627 (-0.80z)| norm 0.1996 (+0.02z)| lr 5.84e-03 | 1986.48 ms | 69.1% bf16 MFU | 261566 tok/s +step 2981/18794 | loss 3.350324 (-0.45z)| norm 0.2091 (+0.29z)| lr 5.84e-03 | 2000.77 ms | 68.6% bf16 MFU | 261590 tok/s +step 2982/18794 | loss 3.415049 (+1.68z)| norm 0.2061 (+0.21z)| lr 5.84e-03 | 2004.09 ms | 68.5% bf16 MFU | 261591 tok/s +step 2983/18794 | loss 3.387391 (+0.76z)| norm 0.2075 (+0.27z)| lr 5.84e-03 | 2002.16 ms | 68.5% bf16 MFU | 261605 tok/s +step 2984/18794 | loss 3.381620 (+0.59z)| norm 0.2266 (+0.83z)| lr 5.84e-03 | 2003.52 ms | 68.5% bf16 MFU | 261609 tok/s +step 2985/18794 | loss 3.397484 (+1.10z)| norm 0.2268 (+0.83z)| lr 5.84e-03 | 1985.67 ms | 69.1% bf16 MFU | 261730 tok/s +step 2986/18794 | loss 3.348921 (-0.54z)| norm 0.1878 (-0.28z)| lr 5.84e-03 | 2015.67 ms | 68.1% bf16 MFU | 261649 tok/s +step 2987/18794 | loss 3.345736 (-0.64z)| norm 0.1632 (-0.97z)| lr 5.84e-03 | 2003.02 ms | 68.5% bf16 MFU | 261654 tok/s +step 2988/18794 | loss 3.320524 (-1.46z)| norm 0.1949 (-0.06z)| lr 5.84e-03 | 1986.90 ms | 69.1% bf16 MFU | 261765 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.475790 +step 2989/18794 | loss 3.353076 (-0.36z)| norm 0.2870 (+2.48z)| lr 5.84e-03 | 1994.00 ms | 68.8% bf16 MFU | 261823 tok/s +step 2990/18794 | loss 3.472486 (+3.41z)| norm 0.2108 (+0.35z)| lr 5.84e-03 | 1994.90 ms | 68.8% bf16 MFU | 261873 tok/s +step 2991/18794 | loss 3.386675 (+0.70z)| norm 0.1733 (-0.69z)| lr 5.84e-03 | 1994.29 ms | 68.8% bf16 MFU | 261924 tok/s +step 2992/18794 | loss 3.352067 (-0.39z)| norm 0.2157 (+0.48z)| lr 5.83e-03 | 2017.68 ms | 68.0% bf16 MFU | 261820 tok/s +step 2993/18794 | loss 3.362736 (-0.06z)| norm 0.1983 (-0.01z)| lr 5.83e-03 | 1984.23 ms | 69.2% bf16 MFU | 261940 tok/s +step 2994/18794 | loss 3.404083 (+1.24z)| norm 0.1618 (-1.01z)| lr 5.83e-03 | 1996.48 ms | 68.7% bf16 MFU | 261974 tok/s +step 2995/18794 | loss 3.410760 (+1.43z)| norm 0.1727 (-0.69z)| lr 5.83e-03 | 1984.02 ms | 69.2% bf16 MFU | 262088 tok/s +step 2996/18794 | loss 3.335142 (-0.92z)| norm 0.2004 (+0.10z)| lr 5.83e-03 | 2017.75 ms | 68.0% bf16 MFU | 261975 tok/s +step 2997/18794 | loss 3.353894 (-0.33z)| norm 0.2516 (+1.52z)| lr 5.83e-03 | 2009.63 ms | 68.3% bf16 MFU | 261921 tok/s +step 2998/18794 | loss 3.350631 (-0.43z)| norm 0.2345 (+1.02z)| lr 5.83e-03 | 2002.69 ms | 68.5% bf16 MFU | 261914 tok/s +step 2999/18794 | loss 3.365347 (+0.02z)| norm 0.1732 (-0.69z)| lr 5.83e-03 | 1994.23 ms | 68.8% bf16 MFU | 261964 tok/s +step 3000/18794 | loss 3.331673 (-1.02z)| norm 0.2236 (+0.71z)| lr 5.83e-03 | 2009.84 ms | 68.3% bf16 MFU | 261909 tok/s +val loss 3.379425 +HellaSwag: 2733/10042 = 0.272157Swag: 990/1256: 0/1256 +generating: +--- +Writing state to log_gpt3_125M_edu_v4/state_00003000_00001.bin +|and more rows get been done| +Now that we're able to read the keyboard, let's have an easy-to-follow step to do purserkline? +Barl Dallowby (016) Introduces Page Navigation: +Page Navigation Intervention (479) +When reading putting presses on traditional typewriter, the keyboard is painted a bit more transparent and easy to view all of the press controls you write. This goes to enlarge the bit of informatio +n that's typed in to your cursor at any time step (600) and the bit can be any number of times you go to cursor position, where you can see handwritten notes of the presses' positions in your hand ri +ght below the type +--- +Writing checkpoint at step 3000 +Writing model to log_gpt3_125M_edu_v4/model_00003000.bin +Writing state to log_gpt3_125M_edu_v4/state_00003000_00000.bin +Deleting checkpoint at step 500 +step 3001/18794 | loss 3.368004 (+0.13z)| norm 0.2426 (+1.21z)| lr 5.83e-03 | 1977.38 ms | 69.4% bf16 MFU | 262070 tok/s +step 3002/18794 | loss 3.305692 (-1.82z)| norm 0.2667 (+1.83z)| lr 5.83e-03 | 1994.59 ms | 68.8% bf16 MFU | 262110 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.164456 +step 3003/18794 | loss 3.351795 (-0.39z)| norm 0.3231 (+3.16z)| lr 5.83e-03 | 2001.82 ms | 68.6% bf16 MFU | 262099 tok/s +step 3004/18794 | loss 3.378469 (+0.47z)| norm 0.2799 (+1.98z)| lr 5.83e-03 | 2001.80 ms | 68.6% bf16 MFU | 262090 tok/s +step 3005/18794 | loss 3.382602 (+0.58z)| norm 0.1915 (-0.27z)| lr 5.83e-03 | 2012.24 ms | 68.2% bf16 MFU | 262013 tok/s +step 3006/18794 | loss 3.322795 (-1.30z)| norm 0.2284 (+0.66z)| lr 5.83e-03 | 1980.93 ms | 69.3% bf16 MFU | 262146 tok/s +step 3007/18794 | loss 3.334418 (-0.92z)| norm 0.2076 (+0.12z)| lr 5.83e-03 | 1982.34 ms | 69.2% bf16 MFU | 262262 tok/s +step 3008/18794 | loss 3.403244 (+1.22z)| norm 0.1553 (-1.20z)| lr 5.83e-03 | 1985.66 ms | 69.1% bf16 MFU | 262351 tok/s +step 3009/18794 | loss 3.407173 (+1.32z)| norm 0.2107 (+0.20z)| lr 5.83e-03 | 1986.38 ms | 69.1% bf16 MFU | 262430 tok/s +step 3010/18794 | loss 3.445262 (+2.43z)| norm 0.2111 (+0.20z)| lr 5.83e-03 | 1994.45 ms | 68.8% bf16 MFU | 262453 tok/s +step 3011/18794 | loss 3.362361 (-0.08z)| norm 0.1914 (-0.28z)| lr 5.83e-03 | 2001.79 ms | 68.6% bf16 MFU | 262425 tok/s +step 3012/18794 | loss 3.317090 (-1.43z)| norm 0.1791 (-0.59z)| lr 5.83e-03 | 1985.36 ms | 69.1% bf16 MFU | 262508 tok/s +step 3013/18794 | loss 3.496465 (+3.68z)| norm 0.2096 (+0.23z)| lr 5.83e-03 | 2002.34 ms | 68.5% bf16 MFU | 262474 tok/s +step 3014/18794 | loss 3.406555 (+1.13z)| norm 0.2379 (+0.98z)| lr 5.83e-03 | 1978.13 ms | 69.4% bf16 MFU | 262603 tok/s +step 3015/18794 | loss 3.317371 (-1.36z)| norm 0.1686 (-0.86z)| lr 5.83e-03 | 2001.72 ms | 68.6% bf16 MFU | 262569 tok/s +step 3016/18794 | loss 3.306262 (-1.63z)| norm 0.2043 (+0.08z)| lr 5.83e-03 | 1995.04 ms | 68.8% bf16 MFU | 262580 tok/s +step 3017/18794 | loss 3.320134 (-1.23z)| norm 0.2019 (+0.01z)| lr 5.83e-03 | 2000.76 ms | 68.6% bf16 MFU | 262553 tok/s +step 3018/18794 | loss 3.314190 (-1.36z)| norm 0.1745 (-0.72z)| lr 5.83e-03 | 1995.98 ms | 68.8% bf16 MFU | 262559 tok/s +step 3019/18794 | loss 3.360830 (-0.10z)| norm 0.1840 (-0.47z)| lr 5.83e-03 | 1985.91 ms | 69.1% bf16 MFU | 262631 tok/s +step 3020/18794 | loss 3.392642 (+0.76z)| norm 0.1892 (-0.34z)| lr 5.83e-03 | 1995.21 ms | 68.8% bf16 MFU | 262639 tok/s +step 3021/18794 | loss 3.376657 (+0.33z)| norm 0.1851 (-0.46z)| lr 5.83e-03 | 1985.88 ms | 69.1% bf16 MFU | 262707 tok/s +step 3022/18794 | loss 3.383920 (+0.52z)| norm 0.1841 (-0.50z)| lr 5.83e-03 | 1986.77 ms | 69.1% bf16 MFU | 262766 tok/s +step 3023/18794 | loss 3.418135 (+1.42z)| norm 0.1869 (-0.44z)| lr 5.83e-03 | 1993.79 ms | 68.8% bf16 MFU | 262776 tok/s +step 3024/18794 | loss 3.359954 (-0.15z)| norm 0.1528 (-1.37z)| lr 5.83e-03 | 1985.22 ms | 69.1% bf16 MFU | 262842 tok/s +step 3025/18794 | loss 3.366066 (+0.01z)| norm 0.1670 (-0.99z)| lr 5.83e-03 | 1994.98 ms | 68.8% bf16 MFU | 262840 tok/s +step 3026/18794 | loss 3.364028 (-0.04z)| norm 0.1576 (-1.26z)| lr 5.83e-03 | 1986.11 ms | 69.1% bf16 MFU | 262897 tok/s +step 3027/18794 | loss 3.390794 (+0.67z)| norm 0.1510 (-1.41z)| lr 5.83e-03 | 2000.83 ms | 68.6% bf16 MFU | 262854 tok/s +step 3028/18794 | loss 3.314991 (-1.36z)| norm 0.1701 (-0.87z)| lr 5.83e-03 | 1986.10 ms | 69.1% bf16 MFU | 262910 tok/s +step 3029/18794 | loss 3.371141 (+0.15z)| norm 0.2506 (+1.34z)| lr 5.83e-03 | 1978.47 ms | 69.4% bf16 MFU | 263014 tok/s +step 3030/18794 | loss 3.318607 (-1.24z)| norm 0.2653 (+1.72z)| lr 5.83e-03 | 1994.03 ms | 68.8% bf16 MFU | 263010 tok/s +step 3031/18794 | loss 3.299903 (-1.71z)| norm 0.2340 (+0.86z)| lr 5.83e-03 | 1995.56 ms | 68.8% bf16 MFU | 262996 tok/s +step 3032/18794 | loss 3.372667 (+0.21z)| norm 0.2416 (+1.05z)| lr 5.83e-03 | 1985.07 ms | 69.1% bf16 MFU | 263052 tok/s +step 3033/18794 | loss 3.346084 (-0.48z)| norm 0.2226 (+0.53z)| lr 5.83e-03 | 1986.19 ms | 69.1% bf16 MFU | 263098 tok/s +step 3034/18794 | loss 3.307778 (-1.47z)| norm 0.1640 (-1.02z)| lr 5.83e-03 | 1979.11 ms | 69.3% bf16 MFU | 263188 tok/s +step 3035/18794 | loss 3.363943 (+0.01z)| norm 0.1679 (-0.94z)| lr 5.83e-03 | 1986.96 ms | 69.1% bf16 MFU | 263222 tok/s +step 3036/18794 | loss 3.328003 (-0.93z)| norm 0.1928 (-0.20z)| lr 5.83e-03 | 1980.21 ms | 69.3% bf16 MFU | 263299 tok/s +step 3037/18794 | loss 3.341896 (-0.56z)| norm 0.2665 (+1.92z)| lr 5.83e-03 | 1979.37 ms | 69.3% bf16 MFU | 263378 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.089506 +step 3038/18794 | loss 3.300592 (-1.64z)| norm 0.2716 (+2.09z)| lr 5.83e-03 | 1986.69 ms | 69.1% bf16 MFU | 263404 tok/s +step 3039/18794 | loss 3.336339 (-0.70z)| norm 0.2041 (+0.11z)| lr 5.83e-03 | 1985.72 ms | 69.1% bf16 MFU | 263435 tok/s +step 3040/18794 | loss 3.349703 (-0.34z)| norm 0.1771 (-0.67z)| lr 5.83e-03 | 1981.76 ms | 69.2% bf16 MFU | 263491 tok/s +step 3041/18794 | loss 3.369623 (+0.20z)| norm 0.1662 (-0.97z)| lr 5.83e-03 | 1985.62 ms | 69.1% bf16 MFU | 263519 tok/s +step 3042/18794 | loss 3.401237 (+1.02z)| norm 0.1723 (-0.80z)| lr 5.83e-03 | 1983.12 ms | 69.2% bf16 MFU | 263562 tok/s +step 3043/18794 | loss 3.346452 (-0.45z)| norm 0.1629 (-1.07z)| lr 5.83e-03 | 1992.91 ms | 68.9% bf16 MFU | 263537 tok/s +step 3044/18794 | loss 3.343128 (-0.52z)| norm 0.1610 (-1.12z)| lr 5.83e-03 | 1980.12 ms | 69.3% bf16 MFU | 263599 tok/s +step 3045/18794 | loss 3.330458 (-0.86z)| norm 0.1638 (-1.03z)| lr 5.83e-03 | 1980.19 ms | 69.3% bf16 MFU | 263658 tok/s +step 3046/18794 | loss 3.393177 (+0.87z)| norm 0.1570 (-1.21z)| lr 5.83e-03 | 1979.81 ms | 69.3% bf16 MFU | 263716 tok/s +step 3047/18794 | loss 3.337141 (-0.69z)| norm 0.1730 (-0.74z)| lr 5.83e-03 | 1978.77 ms | 69.4% bf16 MFU | 263778 tok/s +step 3048/18794 | loss 3.347158 (-0.42z)| norm 0.1930 (-0.19z)| lr 5.83e-03 | 1979.54 ms | 69.3% bf16 MFU | 263831 tok/s +reducing beta2 to 0.9 and lr/wd by 0.986 due to grad z-score of 3.549740 +step 3049/18794 | loss 3.360171 (-0.05z)| norm 0.3313 (+3.55z)| lr 5.74e-03 | 1978.94 ms | 69.3% bf16 MFU | 263887 tok/s +reducing beta2 to 0.9 and lr/wd by 0.972 due to grad z-score of 3.599222 +step 3050/18794 | loss 3.309602 (-1.43z)| norm 0.3441 (+3.60z)| lr 5.66e-03 | 1979.16 ms | 69.3% bf16 MFU | 263937 tok/s +step 3051/18794 | loss 3.246022 (-3.02z)| norm 0.2383 (+0.89z)| lr 5.83e-03 | 2035.89 ms | 67.4% bf16 MFU | 263617 tok/s +step 3052/18794 | loss 3.316291 (-1.14z)| norm 0.2215 (+0.46z)| lr 5.82e-03 | 2032.06 ms | 67.5% bf16 MFU | 263336 tok/s +step 3053/18794 | loss 3.341962 (-0.46z)| norm 0.1938 (-0.25z)| lr 5.82e-03 | 2032.77 ms | 67.5% bf16 MFU | 263065 tok/s +step 3054/18794 | loss 3.273949 (-2.18z)| norm 0.2086 (+0.11z)| lr 5.82e-03 | 2032.15 ms | 67.5% bf16 MFU | 262812 tok/s +step 3055/18794 | loss 3.315748 (-1.07z)| norm 0.1704 (-0.85z)| lr 5.82e-03 | 2032.63 ms | 67.5% bf16 MFU | 262568 tok/s +step 3056/18794 | loss 3.334541 (-0.59z)| norm 0.1235 (-1.99z)| lr 5.82e-03 | 2032.95 ms | 67.5% bf16 MFU | 262334 tok/s +step 3057/18794 | loss 3.345689 (-0.30z)| norm 0.1514 (-1.28z)| lr 5.82e-03 | 2032.96 ms | 67.5% bf16 MFU | 262112 tok/s +step 3058/18794 | loss 3.302789 (-1.39z)| norm 0.1951 (-0.18z)| lr 5.82e-03 | 2035.29 ms | 67.4% bf16 MFU | 261887 tok/s +step 3059/18794 | loss 3.353828 (-0.09z)| norm 0.2076 (+0.12z)| lr 5.82e-03 | 2040.17 ms | 67.3% bf16 MFU | 261642 tok/s +step 3060/18794 | loss 3.284948 (-1.82z)| norm 0.2040 (+0.04z)| lr 5.82e-03 | 2022.69 ms | 67.8% bf16 MFU | 261520 tok/s +step 3061/18794 | loss 3.361708 (+0.14z)| norm 0.2145 (+0.30z)| lr 5.82e-03 | 2038.83 ms | 67.3% bf16 MFU | 261301 tok/s +step 3062/18794 | loss 3.283718 (-1.83z)| norm 0.1919 (-0.27z)| lr 5.82e-03 | 2037.90 ms | 67.3% bf16 MFU | 261100 tok/s +step 3063/18794 | loss 3.295144 (-1.50z)| norm 0.1648 (-0.96z)| lr 5.82e-03 | 2037.69 ms | 67.3% bf16 MFU | 260909 tok/s +step 3064/18794 | loss 3.281677 (-1.81z)| norm 0.2188 (+0.39z)| lr 5.82e-03 | 2038.71 ms | 67.3% bf16 MFU | 260722 tok/s +step 3065/18794 | loss 3.352922 (-0.01z)| norm 0.2155 (+0.29z)| lr 5.82e-03 | 2025.40 ms | 67.8% bf16 MFU | 260629 tok/s +step 3066/18794 | loss 3.351314 (-0.04z)| norm 0.1817 (-0.56z)| lr 5.82e-03 | 2030.32 ms | 67.6% bf16 MFU | 260509 tok/s +step 3067/18794 | loss 3.326892 (-0.66z)| norm 0.1536 (-1.25z)| lr 5.82e-03 | 2036.38 ms | 67.4% bf16 MFU | 260356 tok/s +step 3068/18794 | loss 3.294600 (-1.44z)| norm 0.1680 (-0.87z)| lr 5.82e-03 | 2025.01 ms | 67.8% bf16 MFU | 260284 tok/s +step 3069/18794 | loss 3.280703 (-1.74z)| norm 0.1742 (-0.70z)| lr 5.82e-03 | 2033.91 ms | 67.5% bf16 MFU | 260158 tok/s +step 3070/18794 | loss 3.343565 (-0.19z)| norm 0.2084 (+0.15z)| lr 5.82e-03 | 2018.79 ms | 68.0% bf16 MFU | 260136 tok/s +step 3071/18794 | loss 3.297230 (-1.31z)| norm 0.2180 (+0.38z)| lr 5.82e-03 | 2024.68 ms | 67.8% bf16 MFU | 260076 tok/s +step 3072/18794 | loss 3.302337 (-1.16z)| norm 0.1939 (-0.22z)| lr 5.82e-03 | 2032.07 ms | 67.5% bf16 MFU | 259973 tok/s +step 3073/18794 | loss 3.319471 (-0.73z)| norm 0.1587 (-1.08z)| lr 5.82e-03 | 2019.47 ms | 68.0% bf16 MFU | 259955 tok/s +step 3074/18794 | loss 3.318522 (-0.74z)| norm 0.1984 (-0.08z)| lr 5.82e-03 | 2026.08 ms | 67.7% bf16 MFU | 259896 tok/s +step 3075/18794 | loss 3.344281 (-0.11z)| norm 0.2169 (+0.39z)| lr 5.82e-03 | 2033.63 ms | 67.5% bf16 MFU | 259791 tok/s +step 3076/18794 | loss 3.309968 (-0.92z)| norm 0.2078 (+0.17z)| lr 5.82e-03 | 2023.37 ms | 67.8% bf16 MFU | 259758 tok/s +step 3077/18794 | loss 3.332354 (-0.39z)| norm 0.2047 (+0.08z)| lr 5.82e-03 | 1995.23 ms | 68.8% bf16 MFU | 259908 tok/s +step 3078/18794 | loss 3.350963 (+0.07z)| norm 0.2371 (+0.89z)| lr 5.82e-03 | 2016.88 ms | 68.0% bf16 MFU | 259910 tok/s +step 3079/18794 | loss 3.397886 (+1.19z)| norm 0.2630 (+1.51z)| lr 5.82e-03 | 2026.57 ms | 67.7% bf16 MFU | 259850 tok/s +step 3080/18794 | loss 3.361888 (+0.31z)| norm 0.2307 (+0.68z)| lr 5.82e-03 | 2017.97 ms | 68.0% bf16 MFU | 259848 tok/s +step 3081/18794 | loss 3.290299 (-1.39z)| norm 0.1947 (-0.22z)| lr 5.82e-03 | 2040.03 ms | 67.3% bf16 MFU | 259706 tok/s +step 3082/18794 | loss 3.318511 (-0.70z)| norm 0.1956 (-0.19z)| lr 5.82e-03 | 2025.01 ms | 67.8% bf16 MFU | 259666 tok/s +step 3083/18794 | loss 3.287872 (-1.41z)| norm 0.1866 (-0.41z)| lr 5.82e-03 | 2033.86 ms | 67.5% bf16 MFU | 259572 tok/s +step 3084/18794 | loss 3.264950 (-1.90z)| norm 0.2400 (+0.92z)| lr 5.82e-03 | 2022.31 ms | 67.9% bf16 MFU | 259556 tok/s +step 3085/18794 | loss 3.344159 (-0.01z)| norm 0.2500 (+1.16z)| lr 5.82e-03 | 2018.33 ms | 68.0% bf16 MFU | 259566 tok/s +step 3086/18794 | loss 3.309187 (-0.83z)| norm 0.1845 (-0.47z)| lr 5.82e-03 | 2020.54 ms | 67.9% bf16 MFU | 259562 tok/s +step 3087/18794 | loss 3.315891 (-0.67z)| norm 0.1845 (-0.48z)| lr 5.82e-03 | 2028.69 ms | 67.6% bf16 MFU | 259505 tok/s +step 3088/18794 | loss 3.337308 (-0.16z)| norm 0.2126 (+0.22z)| lr 5.82e-03 | 2034.83 ms | 67.4% bf16 MFU | 259413 tok/s +step 3089/18794 | loss 3.323009 (-0.49z)| norm 0.2047 (+0.04z)| lr 5.82e-03 | 2018.49 ms | 68.0% bf16 MFU | 259429 tok/s +step 3090/18794 | loss 3.363798 (+0.52z)| norm 0.2173 (+0.36z)| lr 5.82e-03 | 2026.52 ms | 67.7% bf16 MFU | 259394 tok/s +step 3091/18794 | loss 3.274857 (-1.65z)| norm 0.2041 (+0.02z)| lr 5.82e-03 | 2035.95 ms | 67.4% bf16 MFU | 259300 tok/s +step 3092/18794 | loss 3.315511 (-0.64z)| norm 0.1971 (-0.16z)| lr 5.82e-03 | 2024.91 ms | 67.8% bf16 MFU | 259281 tok/s +step 3093/18794 | loss 3.279263 (-1.49z)| norm 0.2045 (+0.03z)| lr 5.82e-03 | 2032.68 ms | 67.5% bf16 MFU | 259213 tok/s +step 3094/18794 | loss 3.314209 (-0.62z)| norm 0.1938 (-0.25z)| lr 5.82e-03 | 2026.30 ms | 67.7% bf16 MFU | 259190 tok/s +step 3095/18794 | loss 3.313348 (-0.63z)| norm 0.1948 (-0.23z)| lr 5.82e-03 | 2018.80 ms | 68.0% bf16 MFU | 259215 tok/s +step 3096/18794 | loss 3.237582 (-2.43z)| norm 0.1686 (-0.90z)| lr 5.82e-03 | 2027.70 ms | 67.7% bf16 MFU | 259183 tok/s +step 3097/18794 | loss 3.275422 (-1.47z)| norm 0.1923 (-0.28z)| lr 5.82e-03 | 2028.51 ms | 67.7% bf16 MFU | 259146 tok/s +step 3098/18794 | loss 3.350505 (+0.33z)| norm 0.1847 (-0.46z)| lr 5.82e-03 | 2016.89 ms | 68.0% bf16 MFU | 259187 tok/s +step 3099/18794 | loss 3.234155 (-2.37z)| norm 0.1988 (-0.10z)| lr 5.82e-03 | 2025.98 ms | 67.7% bf16 MFU | 259166 tok/s +step 3100/18794 | loss 3.261873 (-1.68z)| norm 0.1716 (-0.80z)| lr 5.82e-03 | 2021.38 ms | 67.9% bf16 MFU | 259177 tok/s +step 3101/18794 | loss 3.262962 (-1.61z)| norm 0.1897 (-0.31z)| lr 5.82e-03 | 2025.36 ms | 67.8% bf16 MFU | 259161 tok/s +step 3102/18794 | loss 3.312649 (-0.48z)| norm 0.2564 (+1.45z)| lr 5.82e-03 | 2008.16 ms | 68.3% bf16 MFU | 259257 tok/s +step 3103/18794 | loss 3.329217 (-0.10z)| norm 0.2444 (+1.21z)| lr 5.82e-03 | 2027.57 ms | 67.7% bf16 MFU | 259223 tok/s +step 3104/18794 | loss 3.321032 (-0.27z)| norm 0.2145 (+0.41z)| lr 5.82e-03 | 2010.74 ms | 68.2% bf16 MFU | 259299 tok/s +step 3105/18794 | loss 3.332004 (-0.01z)| norm 0.2349 (+0.97z)| lr 5.82e-03 | 2008.38 ms | 68.3% bf16 MFU | 259386 tok/s +step 3106/18794 | loss 3.241768 (-2.04z)| norm 0.2249 (+0.69z)| lr 5.82e-03 | 2011.54 ms | 68.2% bf16 MFU | 259449 tok/s +step 3107/18794 | loss 3.248645 (-1.83z)| norm 0.2273 (+0.75z)| lr 5.82e-03 | 2014.24 ms | 68.1% bf16 MFU | 259491 tok/s +step 3108/18794 | loss 3.297783 (-0.72z)| norm 0.2064 (+0.15z)| lr 5.82e-03 | 2020.58 ms | 67.9% bf16 MFU | 259490 tok/s +step 3109/18794 | loss 3.365840 (+0.83z)| norm 0.2156 (+0.41z)| lr 5.82e-03 | 2003.12 ms | 68.5% bf16 MFU | 259603 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.746831 +step 3110/18794 | loss 3.320846 (-0.17z)| norm 0.3030 (+2.75z)| lr 5.81e-03 | 2010.10 ms | 68.3% bf16 MFU | 259664 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.062978 +step 3111/18794 | loss 3.341726 (+0.33z)| norm 0.3216 (+3.06z)| lr 5.81e-03 | 2009.89 ms | 68.3% bf16 MFU | 259723 tok/s +step 3112/18794 | loss 3.334907 (+0.16z)| norm 0.2010 (-0.07z)| lr 5.81e-03 | 2003.34 ms | 68.5% bf16 MFU | 259822 tok/s +step 3113/18794 | loss 3.268732 (-1.46z)| norm 0.1768 (-0.69z)| lr 5.81e-03 | 2009.48 ms | 68.3% bf16 MFU | 259877 tok/s +step 3114/18794 | loss 3.383293 (+1.49z)| norm 0.2036 (+0.02z)| lr 5.81e-03 | 2040.01 ms | 67.3% bf16 MFU | 259733 tok/s +step 3115/18794 | loss 3.348832 (+0.59z)| norm 0.2090 (+0.15z)| lr 5.81e-03 | 2003.34 ms | 68.5% bf16 MFU | 259832 tok/s +step 3116/18794 | loss 3.303351 (-0.58z)| norm 0.1767 (-0.69z)| lr 5.81e-03 | 2010.26 ms | 68.3% bf16 MFU | 259880 tok/s +step 3117/18794 | loss 3.366556 (+1.03z)| norm 0.2068 (+0.10z)| lr 5.81e-03 | 2011.35 ms | 68.2% bf16 MFU | 259920 tok/s +step 3118/18794 | loss 3.360313 (+0.86z)| norm 0.2073 (+0.10z)| lr 5.81e-03 | 2009.82 ms | 68.3% bf16 MFU | 259967 tok/s +step 3119/18794 | loss 3.349446 (+0.58z)| norm 0.2301 (+0.68z)| lr 5.81e-03 | 2015.36 ms | 68.1% bf16 MFU | 259976 tok/s +step 3120/18794 | loss 3.332958 (+0.18z)| norm 0.2437 (+1.02z)| lr 5.81e-03 | 2017.85 ms | 68.0% bf16 MFU | 259968 tok/s +step 3121/18794 | loss 3.308299 (-0.45z)| norm 0.1923 (-0.32z)| lr 5.81e-03 | 2014.45 ms | 68.1% bf16 MFU | 259983 tok/s +step 3122/18794 | loss 3.261735 (-1.63z)| norm 0.1797 (-0.64z)| lr 5.81e-03 | 2015.89 ms | 68.1% bf16 MFU | 259988 tok/s +step 3123/18794 | loss 3.297800 (-0.68z)| norm 0.2027 (-0.05z)| lr 5.81e-03 | 1996.50 ms | 68.7% bf16 MFU | 260119 tok/s +step 3124/18794 | loss 3.319935 (-0.07z)| norm 0.2188 (+0.35z)| lr 5.81e-03 | 2002.28 ms | 68.5% bf16 MFU | 260205 tok/s +step 3125/18794 | loss 3.298311 (-0.64z)| norm 0.2119 (+0.16z)| lr 5.81e-03 | 2034.53 ms | 67.5% bf16 MFU | 260079 tok/s +step 3126/18794 | loss 3.317894 (-0.10z)| norm 0.1802 (-0.69z)| lr 5.81e-03 | 2010.41 ms | 68.3% bf16 MFU | 260115 tok/s +step 3127/18794 | loss 3.342211 (+0.59z)| norm 0.1565 (-1.32z)| lr 5.81e-03 | 2018.41 ms | 68.0% bf16 MFU | 260097 tok/s +step 3128/18794 | loss 3.319829 (-0.03z)| norm 0.1420 (-1.68z)| lr 5.81e-03 | 2016.98 ms | 68.0% bf16 MFU | 260089 tok/s +step 3129/18794 | loss 3.283553 (-1.02z)| norm 0.1615 (-1.14z)| lr 5.81e-03 | 2025.78 ms | 67.7% bf16 MFU | 260025 tok/s +step 3130/18794 | loss 3.347193 (+0.75z)| norm 0.1597 (-1.17z)| lr 5.81e-03 | 2012.84 ms | 68.2% bf16 MFU | 260047 tok/s +step 3131/18794 | loss 3.355583 (+0.96z)| norm 0.2100 (+0.17z)| lr 5.81e-03 | 2001.09 ms | 68.6% bf16 MFU | 260145 tok/s +step 3132/18794 | loss 3.300625 (-0.55z)| norm 0.2466 (+1.14z)| lr 5.81e-03 | 2004.09 ms | 68.5% bf16 MFU | 260218 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.106465 +step 3133/18794 | loss 3.380670 (+1.67z)| norm 0.2854 (+2.11z)| lr 5.81e-03 | 2006.10 ms | 68.4% bf16 MFU | 260274 tok/s +step 3134/18794 | loss 3.298626 (-0.61z)| norm 0.2170 (+0.32z)| lr 5.81e-03 | 2000.07 ms | 68.6% bf16 MFU | 260367 tok/s +step 3135/18794 | loss 3.268543 (-1.41z)| norm 0.1813 (-0.62z)| lr 5.81e-03 | 2015.62 ms | 68.1% bf16 MFU | 260355 tok/s +step 3136/18794 | loss 3.352375 (+0.90z)| norm 0.1963 (-0.23z)| lr 5.81e-03 | 2023.02 ms | 67.8% bf16 MFU | 260295 tok/s +step 3137/18794 | loss 3.355410 (+0.97z)| norm 0.1878 (-0.43z)| lr 5.81e-03 | 2006.13 ms | 68.4% bf16 MFU | 260347 tok/s +step 3138/18794 | loss 3.322859 (+0.07z)| norm 0.1784 (-0.67z)| lr 5.81e-03 | 2019.07 ms | 68.0% bf16 MFU | 260313 tok/s +step 3139/18794 | loss 3.263339 (-1.52z)| norm 0.1873 (-0.42z)| lr 5.81e-03 | 2009.54 ms | 68.3% bf16 MFU | 260343 tok/s +step 3140/18794 | loss 3.291261 (-0.75z)| norm 0.2127 (+0.25z)| lr 5.81e-03 | 1999.04 ms | 68.6% bf16 MFU | 260439 tok/s +step 3141/18794 | loss 3.309607 (-0.24z)| norm 0.2054 (+0.04z)| lr 5.81e-03 | 2006.80 ms | 68.4% bf16 MFU | 260480 tok/s +step 3142/18794 | loss 3.335282 (+0.50z)| norm 0.1835 (-0.55z)| lr 5.81e-03 | 2011.59 ms | 68.2% bf16 MFU | 260487 tok/s +step 3143/18794 | loss 3.272739 (-1.24z)| norm 0.1949 (-0.26z)| lr 5.81e-03 | 2006.86 ms | 68.4% bf16 MFU | 260525 tok/s +step 3144/18794 | loss 3.355596 (+1.08z)| norm 0.2013 (-0.09z)| lr 5.81e-03 | 1994.00 ms | 68.8% bf16 MFU | 260646 tok/s +step 3145/18794 | loss 3.319970 (+0.09z)| norm 0.1890 (-0.44z)| lr 5.81e-03 | 2019.13 ms | 68.0% bf16 MFU | 260597 tok/s +step 3146/18794 | loss 3.303323 (-0.36z)| norm 0.1785 (-0.74z)| lr 5.81e-03 | 2011.10 ms | 68.2% bf16 MFU | 260602 tok/s +step 3147/18794 | loss 3.323781 (+0.23z)| norm 0.1402 (-1.77z)| lr 5.81e-03 | 2003.83 ms | 68.5% bf16 MFU | 260654 tok/s +step 3148/18794 | loss 3.268794 (-1.32z)| norm 0.1669 (-1.03z)| lr 5.81e-03 | 2013.24 ms | 68.2% bf16 MFU | 260642 tok/s +step 3149/18794 | loss 3.349402 (+0.99z)| norm 0.1699 (-0.96z)| lr 5.81e-03 | 1998.39 ms | 68.7% bf16 MFU | 260728 tok/s +step 3150/18794 | loss 3.280942 (-0.96z)| norm 0.2213 (+0.62z)| lr 5.81e-03 | 2012.38 ms | 68.2% bf16 MFU | 260718 tok/s +step 3151/18794 | loss 3.324728 (+0.27z)| norm 0.2616 (+1.87z)| lr 5.81e-03 | 1988.30 ms | 69.0% bf16 MFU | 260866 tok/s +step 3152/18794 | loss 3.300136 (-0.44z)| norm 0.2306 (+0.90z)| lr 5.81e-03 | 2002.74 ms | 68.5% bf16 MFU | 260912 tok/s +step 3153/18794 | loss 3.372201 (+1.63z)| norm 0.1968 (-0.16z)| lr 5.81e-03 | 2006.15 ms | 68.4% bf16 MFU | 260934 tok/s +step 3154/18794 | loss 3.305348 (-0.30z)| norm 0.1579 (-1.36z)| lr 5.81e-03 | 2019.03 ms | 68.0% bf16 MFU | 260871 tok/s +step 3155/18794 | loss 3.229935 (-2.39z)| norm 0.1885 (-0.41z)| lr 5.81e-03 | 2010.15 ms | 68.3% bf16 MFU | 260868 tok/s +step 3156/18794 | loss 3.302210 (-0.35z)| norm 0.2171 (+0.46z)| lr 5.81e-03 | 2002.28 ms | 68.5% bf16 MFU | 260917 tok/s +step 3157/18794 | loss 3.340658 (+0.73z)| norm 0.1795 (-0.76z)| lr 5.81e-03 | 2007.96 ms | 68.3% bf16 MFU | 260926 tok/s +step 3158/18794 | loss 3.259682 (-1.52z)| norm 0.2126 (+0.31z)| lr 5.81e-03 | 2002.37 ms | 68.5% bf16 MFU | 260972 tok/s +step 3159/18794 | loss 3.289888 (-0.66z)| norm 0.2622 (+1.87z)| lr 5.81e-03 | 2008.31 ms | 68.3% bf16 MFU | 260976 tok/s +step 3160/18794 | loss 3.337713 (+0.66z)| norm 0.2598 (+1.75z)| lr 5.81e-03 | 2017.22 ms | 68.0% bf16 MFU | 260923 tok/s +step 3161/18794 | loss 3.243055 (-1.93z)| norm 0.2385 (+1.06z)| lr 5.81e-03 | 2011.04 ms | 68.2% bf16 MFU | 260912 tok/s +step 3162/18794 | loss 3.298974 (-0.39z)| norm 0.2125 (+0.24z)| lr 5.81e-03 | 2010.65 ms | 68.3% bf16 MFU | 260904 tok/s +step 3163/18794 | loss 3.314147 (+0.02z)| norm 0.2181 (+0.41z)| lr 5.81e-03 | 2004.12 ms | 68.5% bf16 MFU | 260939 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.918976 +step 3164/18794 | loss 3.312774 (-0.02z)| norm 0.3029 (+2.92z)| lr 5.81e-03 | 2010.90 ms | 68.2% bf16 MFU | 260928 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.333117 +step 3165/18794 | loss 3.412882 (+2.67z)| norm 0.2864 (+2.33z)| lr 5.81e-03 | 2001.92 ms | 68.6% bf16 MFU | 260976 tok/s +step 3166/18794 | loss 3.399574 (+2.25z)| norm 0.1820 (-0.72z)| lr 5.81e-03 | 1995.22 ms | 68.8% bf16 MFU | 261066 tok/s +step 3167/18794 | loss 3.326859 (+0.32z)| norm 0.1646 (-1.24z)| lr 5.80e-03 | 2003.46 ms | 68.5% bf16 MFU | 261097 tok/s +step 3168/18794 | loss 3.252515 (-1.61z)| norm 0.1644 (-1.25z)| lr 5.80e-03 | 1995.17 ms | 68.8% bf16 MFU | 261181 tok/s +step 3169/18794 | loss 3.306381 (-0.21z)| norm 0.1799 (-0.79z)| lr 5.80e-03 | 2003.16 ms | 68.5% bf16 MFU | 261209 tok/s +step 3170/18794 | loss 3.317089 (+0.08z)| norm 0.1687 (-1.11z)| lr 5.80e-03 | 2005.26 ms | 68.4% bf16 MFU | 261221 tok/s +step 3171/18794 | loss 3.207217 (-2.69z)| norm 0.1724 (-0.98z)| lr 5.80e-03 | 2006.36 ms | 68.4% bf16 MFU | 261226 tok/s +step 3172/18794 | loss 3.269184 (-1.11z)| norm 0.1776 (-0.82z)| lr 5.80e-03 | 2010.66 ms | 68.3% bf16 MFU | 261202 tok/s +step 3173/18794 | loss 3.328083 (+0.38z)| norm 0.1510 (-1.58z)| lr 5.80e-03 | 1995.66 ms | 68.8% bf16 MFU | 261278 tok/s +step 3174/18794 | loss 3.325653 (+0.32z)| norm 0.1940 (-0.34z)| lr 5.80e-03 | 1995.01 ms | 68.8% bf16 MFU | 261354 tok/s +step 3175/18794 | loss 3.304140 (-0.22z)| norm 0.2112 (+0.16z)| lr 5.80e-03 | 2005.00 ms | 68.4% bf16 MFU | 261361 tok/s +step 3176/18794 | loss 3.358792 (+1.14z)| norm 0.1696 (-1.03z)| lr 5.80e-03 | 2001.75 ms | 68.6% bf16 MFU | 261388 tok/s +step 3177/18794 | loss 3.323580 (+0.26z)| norm 0.2288 (+0.67z)| lr 5.80e-03 | 2002.95 ms | 68.5% bf16 MFU | 261407 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.827287 +step 3178/18794 | loss 3.302240 (-0.26z)| norm 0.3085 (+2.83z)| lr 5.80e-03 | 1993.48 ms | 68.8% bf16 MFU | 261487 tok/s +step 3179/18794 | loss 3.270871 (-1.04z)| norm 0.2281 (+0.62z)| lr 5.80e-03 | 1998.16 ms | 68.7% bf16 MFU | 261532 tok/s +step 3180/18794 | loss 3.264791 (-1.17z)| norm 0.1772 (-0.79z)| lr 5.80e-03 | 2003.73 ms | 68.5% bf16 MFU | 261538 tok/s +step 3181/18794 | loss 3.333854 (+0.59z)| norm 0.2417 (+1.00z)| lr 5.80e-03 | 1994.68 ms | 68.8% bf16 MFU | 261603 tok/s +step 3182/18794 | loss 3.338532 (+0.71z)| norm 0.2051 (-0.02z)| lr 5.80e-03 | 2003.43 ms | 68.5% bf16 MFU | 261608 tok/s +step 3183/18794 | loss 3.361806 (+1.28z)| norm 0.1950 (-0.31z)| lr 5.80e-03 | 1994.40 ms | 68.8% bf16 MFU | 261671 tok/s +step 3184/18794 | loss 3.292441 (-0.50z)| norm 0.1750 (-0.85z)| lr 5.80e-03 | 1997.40 ms | 68.7% bf16 MFU | 261712 tok/s +step 3185/18794 | loss 3.293087 (-0.48z)| norm 0.1985 (-0.18z)| lr 5.80e-03 | 2007.19 ms | 68.4% bf16 MFU | 261687 tok/s +step 3186/18794 | loss 3.294212 (-0.44z)| norm 0.2276 (+0.63z)| lr 5.80e-03 | 1995.07 ms | 68.8% bf16 MFU | 261742 tok/s +step 3187/18794 | loss 3.314333 (+0.08z)| norm 0.1845 (-0.58z)| lr 5.80e-03 | 2007.82 ms | 68.3% bf16 MFU | 261711 tok/s +step 3188/18794 | loss 3.312308 (+0.03z)| norm 0.1782 (-0.75z)| lr 5.80e-03 | 1992.73 ms | 68.9% bf16 MFU | 261780 tok/s +step 3189/18794 | loss 3.322389 (+0.29z)| norm 0.2225 (+0.49z)| lr 5.80e-03 | 1995.65 ms | 68.8% bf16 MFU | 261827 tok/s +step 3190/18794 | loss 3.276299 (-0.88z)| norm 0.2420 (+1.02z)| lr 5.80e-03 | 2003.10 ms | 68.5% bf16 MFU | 261823 tok/s +step 3191/18794 | loss 3.321149 (+0.27z)| norm 0.2495 (+1.21z)| lr 5.80e-03 | 2003.02 ms | 68.5% bf16 MFU | 261819 tok/s +step 3192/18794 | loss 3.298872 (-0.30z)| norm 0.2341 (+0.77z)| lr 5.80e-03 | 2002.06 ms | 68.5% bf16 MFU | 261822 tok/s +step 3193/18794 | loss 3.335409 (+0.63z)| norm 0.2000 (-0.17z)| lr 5.80e-03 | 1995.01 ms | 68.8% bf16 MFU | 261871 tok/s +step 3194/18794 | loss 3.273880 (-0.95z)| norm 0.1684 (-1.03z)| lr 5.80e-03 | 1995.61 ms | 68.8% bf16 MFU | 261913 tok/s +step 3195/18794 | loss 3.337780 (+0.69z)| norm 0.1686 (-1.01z)| lr 5.80e-03 | 2000.75 ms | 68.6% bf16 MFU | 261920 tok/s +step 3196/18794 | loss 3.314827 (+0.08z)| norm 0.1734 (-0.88z)| lr 5.80e-03 | 1999.74 ms | 68.6% bf16 MFU | 261933 tok/s +step 3197/18794 | loss 3.215910 (-2.44z)| norm 0.1712 (-0.93z)| lr 5.80e-03 | 1983.57 ms | 69.2% bf16 MFU | 262052 tok/s +step 3198/18794 | loss 3.306784 (-0.10z)| norm 0.2081 (+0.07z)| lr 5.80e-03 | 2002.36 ms | 68.5% bf16 MFU | 262041 tok/s +step 3199/18794 | loss 3.288822 (-0.59z)| norm 0.2005 (-0.14z)| lr 5.80e-03 | 1989.52 ms | 69.0% bf16 MFU | 262115 tok/s +step 3200/18794 | loss 3.315795 (+0.11z)| norm 0.1839 (-0.60z)| lr 5.80e-03 | 1994.43 ms | 68.8% bf16 MFU | 262153 tok/s +step 3201/18794 | loss 3.303703 (-0.23z)| norm 0.1486 (-1.54z)| lr 5.80e-03 | 1991.90 ms | 68.9% bf16 MFU | 262206 tok/s +step 3202/18794 | loss 3.273377 (-1.02z)| norm 0.1694 (-0.96z)| lr 5.80e-03 | 1996.79 ms | 68.7% bf16 MFU | 262224 tok/s +step 3203/18794 | loss 3.290109 (-0.57z)| norm 0.1544 (-1.34z)| lr 5.80e-03 | 1986.23 ms | 69.1% bf16 MFU | 262311 tok/s +step 3204/18794 | loss 3.369680 (+1.52z)| norm 0.1684 (-0.94z)| lr 5.80e-03 | 1993.71 ms | 68.8% bf16 MFU | 262344 tok/s +step 3205/18794 | loss 3.332704 (+0.55z)| norm 0.2123 (+0.25z)| lr 5.80e-03 | 2013.93 ms | 68.1% bf16 MFU | 262243 tok/s +step 3206/18794 | loss 3.333802 (+0.56z)| norm 0.2222 (+0.52z)| lr 5.80e-03 | 1985.56 ms | 69.1% bf16 MFU | 262334 tok/s +step 3207/18794 | loss 3.267543 (-1.22z)| norm 0.1801 (-0.61z)| lr 5.80e-03 | 1994.84 ms | 68.8% bf16 MFU | 262358 tok/s +step 3208/18794 | loss 3.262625 (-1.33z)| norm 0.1638 (-1.03z)| lr 5.80e-03 | 1994.68 ms | 68.8% bf16 MFU | 262382 tok/s +step 3209/18794 | loss 3.252326 (-1.57z)| norm 0.1917 (-0.27z)| lr 5.80e-03 | 1993.03 ms | 68.9% bf16 MFU | 262416 tok/s +step 3210/18794 | loss 3.292229 (-0.50z)| norm 0.1854 (-0.43z)| lr 5.80e-03 | 2002.29 ms | 68.5% bf16 MFU | 262388 tok/s +step 3211/18794 | loss 3.271655 (-1.03z)| norm 0.2241 (+0.72z)| lr 5.80e-03 | 1978.57 ms | 69.4% bf16 MFU | 262517 tok/s +step 3212/18794 | loss 3.282957 (-0.72z)| norm 0.2282 (+0.84z)| lr 5.80e-03 | 1991.38 ms | 68.9% bf16 MFU | 262555 tok/s +step 3213/18794 | loss 3.337207 (+0.70z)| norm 0.2299 (+0.87z)| lr 5.80e-03 | 1988.94 ms | 69.0% bf16 MFU | 262608 tok/s +step 3214/18794 | loss 3.328408 (+0.49z)| norm 0.2564 (+1.62z)| lr 5.80e-03 | 1994.94 ms | 68.8% bf16 MFU | 262618 tok/s +step 3215/18794 | loss 3.308573 (-0.03z)| norm 0.1903 (-0.31z)| lr 5.80e-03 | 1988.39 ms | 69.0% bf16 MFU | 262671 tok/s +step 3216/18794 | loss 3.270540 (-1.05z)| norm 0.2213 (+0.58z)| lr 5.80e-03 | 1999.48 ms | 68.6% bf16 MFU | 262648 tok/s +reducing beta2 to 0.9 and lr/wd by 0.871 due to grad z-score of 4.019747 +step 3217/18794 | loss 3.341275 (+0.88z)| norm 0.3532 (+4.02z)| lr 5.05e-03 | 1992.01 ms | 68.9% bf16 MFU | 262675 tok/s +step 3218/18794 | loss 3.341943 (+0.90z)| norm 0.2110 (+0.22z)| lr 5.80e-03 | 2008.65 ms | 68.3% bf16 MFU | 262592 tok/s +step 3219/18794 | loss 3.284329 (-0.66z)| norm 0.2103 (+0.21z)| lr 5.80e-03 | 1981.82 ms | 69.2% bf16 MFU | 262690 tok/s +step 3220/18794 | loss 3.334260 (+0.71z)| norm 0.2246 (+0.60z)| lr 5.80e-03 | 1996.93 ms | 68.7% bf16 MFU | 262683 tok/s +step 3221/18794 | loss 3.251868 (-1.52z)| norm 0.1966 (-0.16z)| lr 5.80e-03 | 1988.76 ms | 69.0% bf16 MFU | 262730 tok/s +step 3222/18794 | loss 3.359981 (+1.39z)| norm 0.2284 (+0.69z)| lr 5.80e-03 | 1991.12 ms | 68.9% bf16 MFU | 262759 tok/s +step 3223/18794 | loss 3.315571 (+0.18z)| norm 0.2430 (+1.06z)| lr 5.79e-03 | 1987.87 ms | 69.0% bf16 MFU | 262808 tok/s +step 3224/18794 | loss 3.364260 (+1.48z)| norm 0.1853 (-0.47z)| lr 5.79e-03 | 1990.27 ms | 69.0% bf16 MFU | 262839 tok/s +step 3225/18794 | loss 3.362828 (+1.41z)| norm 0.2144 (+0.31z)| lr 5.79e-03 | 1987.86 ms | 69.0% bf16 MFU | 262884 tok/s +step 3226/18794 | loss 3.283402 (-0.70z)| norm 0.2383 (+0.92z)| lr 5.79e-03 | 2003.27 ms | 68.5% bf16 MFU | 262826 tok/s +step 3227/18794 | loss 3.275537 (-0.89z)| norm 0.1939 (-0.27z)| lr 5.79e-03 | 1988.55 ms | 69.0% bf16 MFU | 262867 tok/s +step 3228/18794 | loss 3.357058 (+1.26z)| norm 0.2074 (+0.08z)| lr 5.79e-03 | 1988.26 ms | 69.0% bf16 MFU | 262909 tok/s +step 3229/18794 | loss 3.299807 (-0.26z)| norm 0.1704 (-0.94z)| lr 5.79e-03 | 1980.93 ms | 69.3% bf16 MFU | 262996 tok/s +step 3230/18794 | loss 3.255952 (-1.39z)| norm 0.1839 (-0.58z)| lr 5.79e-03 | 1997.23 ms | 68.7% bf16 MFU | 262972 tok/s +step 3231/18794 | loss 3.263456 (-1.16z)| norm 0.1959 (-0.24z)| lr 5.79e-03 | 1980.18 ms | 69.3% bf16 MFU | 263062 tok/s +step 3232/18794 | loss 3.349318 (+1.08z)| norm 0.1765 (-0.76z)| lr 5.79e-03 | 1991.40 ms | 68.9% bf16 MFU | 263073 tok/s +step 3233/18794 | loss 3.294528 (-0.34z)| norm 0.2231 (+0.55z)| lr 5.79e-03 | 1981.24 ms | 69.3% bf16 MFU | 263150 tok/s +step 3234/18794 | loss 3.289593 (-0.47z)| norm 0.2067 (+0.10z)| lr 5.79e-03 | 1994.70 ms | 68.8% bf16 MFU | 263135 tok/s +step 3235/18794 | loss 3.262024 (-1.20z)| norm 0.1756 (-0.78z)| lr 5.79e-03 | 1986.67 ms | 69.1% bf16 MFU | 263173 tok/s +step 3236/18794 | loss 3.316030 (+0.25z)| norm 0.1914 (-0.34z)| lr 5.79e-03 | 1986.73 ms | 69.1% bf16 MFU | 263209 tok/s +step 3237/18794 | loss 3.263362 (-1.14z)| norm 0.1787 (-0.69z)| lr 5.79e-03 | 1994.21 ms | 68.8% bf16 MFU | 263194 tok/s +step 3238/18794 | loss 3.347928 (+1.11z)| norm 0.1776 (-0.72z)| lr 5.79e-03 | 1980.10 ms | 69.3% bf16 MFU | 263273 tok/s +step 3239/18794 | loss 3.286418 (-0.54z)| norm 0.1642 (-1.09z)| lr 5.79e-03 | 1983.94 ms | 69.2% bf16 MFU | 263323 tok/s +step 3240/18794 | loss 3.258819 (-1.26z)| norm 0.1493 (-1.47z)| lr 5.79e-03 | 1986.92 ms | 69.1% bf16 MFU | 263350 tok/s +step 3241/18794 | loss 3.281826 (-0.64z)| norm 0.1574 (-1.22z)| lr 5.79e-03 | 2006.15 ms | 68.4% bf16 MFU | 263250 tok/s +step 3242/18794 | loss 3.329320 (+0.63z)| norm 0.1407 (-1.65z)| lr 5.79e-03 | 2029.64 ms | 67.6% bf16 MFU | 263003 tok/s +step 3243/18794 | loss 3.357253 (+1.34z)| norm 0.1719 (-0.79z)| lr 5.79e-03 | 2040.26 ms | 67.3% bf16 MFU | 262701 tok/s +step 3244/18794 | loss 3.386114 (+2.07z)| norm 0.1635 (-1.00z)| lr 5.79e-03 | 2042.06 ms | 67.2% bf16 MFU | 262404 tok/s +step 3245/18794 | loss 3.373737 (+1.71z)| norm 0.1890 (-0.32z)| lr 5.79e-03 | 2030.23 ms | 67.6% bf16 MFU | 262195 tok/s +step 3246/18794 | loss 3.365822 (+1.47z)| norm 0.2027 (+0.05z)| lr 5.79e-03 | 2043.81 ms | 67.1% bf16 MFU | 261912 tok/s +step 3247/18794 | loss 3.326138 (+0.46z)| norm 0.1847 (-0.46z)| lr 5.79e-03 | 2041.87 ms | 67.2% bf16 MFU | 261655 tok/s +step 3248/18794 | loss 3.357395 (+1.23z)| norm 0.1592 (-1.15z)| lr 5.79e-03 | 2039.60 ms | 67.3% bf16 MFU | 261425 tok/s +step 3249/18794 | loss 3.341400 (+0.83z)| norm 0.1953 (-0.17z)| lr 5.79e-03 | 2036.44 ms | 67.4% bf16 MFU | 261226 tok/s +step 3250/18794 | loss 3.295569 (-0.34z)| norm 0.1888 (-0.34z)| lr 5.79e-03 | 2036.00 ms | 67.4% bf16 MFU | 261040 tok/s +val loss 3.350554 +HellaSwag: 2745/10042 = 0.273352: 0/1256 +step 3251/18794 | loss 3.375182 (+1.65z)| norm 0.2005 (-0.01z)| lr 5.79e-03 | 2043.06 ms | 67.2% bf16 MFU | 260819 tok/s +step 3252/18794 | loss 3.295190 (-0.36z)| norm 0.2374 (+1.02z)| lr 5.79e-03 | 2032.12 ms | 67.5% bf16 MFU | 260678 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.327703 +step 3253/18794 | loss 3.325956 (+0.43z)| norm 0.2879 (+2.33z)| lr 5.79e-03 | 2033.78 ms | 67.5% bf16 MFU | 260534 tok/s +step 3254/18794 | loss 3.319170 (+0.25z)| norm 0.2712 (+1.83z)| lr 5.79e-03 | 2043.84 ms | 67.1% bf16 MFU | 260333 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.066824 +step 3255/18794 | loss 3.394611 (+2.12z)| norm 0.2828 (+2.07z)| lr 5.79e-03 | 2043.16 ms | 67.2% bf16 MFU | 260147 tok/s +step 3256/18794 | loss 3.361612 (+1.26z)| norm 0.2293 (+0.66z)| lr 5.79e-03 | 2050.64 ms | 66.9% bf16 MFU | 259923 tok/s +step 3257/18794 | loss 3.378831 (+1.67z)| norm 0.2034 (-0.02z)| lr 5.79e-03 | 2041.49 ms | 67.2% bf16 MFU | 259768 tok/s +step 3258/18794 | loss 3.332300 (+0.50z)| norm 0.1806 (-0.61z)| lr 5.79e-03 | 2017.15 ms | 68.0% bf16 MFU | 259775 tok/s +step 3259/18794 | loss 3.352436 (+0.98z)| norm 0.2132 (+0.26z)| lr 5.79e-03 | 2042.24 ms | 67.2% bf16 MFU | 259622 tok/s +step 3260/18794 | loss 3.343218 (+0.75z)| norm 0.1989 (-0.10z)| lr 5.79e-03 | 2012.34 ms | 68.2% bf16 MFU | 259668 tok/s +step 3261/18794 | loss 3.314274 (+0.01z)| norm 0.1704 (-0.84z)| lr 5.79e-03 | 2029.11 ms | 67.6% bf16 MFU | 259604 tok/s +step 3262/18794 | loss 3.324847 (+0.27z)| norm 0.1813 (-0.55z)| lr 5.79e-03 | 2035.11 ms | 67.4% bf16 MFU | 259505 tok/s +step 3263/18794 | loss 3.318690 (+0.11z)| norm 0.1594 (-1.11z)| lr 5.79e-03 | 2037.40 ms | 67.4% bf16 MFU | 259396 tok/s +step 3264/18794 | loss 3.368276 (+1.34z)| norm 0.1543 (-1.24z)| lr 5.79e-03 | 2027.16 ms | 67.7% bf16 MFU | 259358 tok/s +step 3265/18794 | loss 3.315329 (+0.04z)| norm 0.1657 (-0.92z)| lr 5.79e-03 | 2035.92 ms | 67.4% bf16 MFU | 259266 tok/s +step 3266/18794 | loss 3.320552 (+0.20z)| norm 0.1661 (-0.90z)| lr 5.79e-03 | 2016.27 ms | 68.1% bf16 MFU | 259304 tok/s +step 3267/18794 | loss 3.352087 (+1.03z)| norm 0.1438 (-1.51z)| lr 5.79e-03 | 2041.50 ms | 67.2% bf16 MFU | 259180 tok/s +step 3268/18794 | loss 3.333759 (+0.53z)| norm 0.2145 (+0.45z)| lr 5.79e-03 | 2039.65 ms | 67.3% bf16 MFU | 259073 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.416745 +step 3269/18794 | loss 3.313110 (-0.03z)| norm 0.3301 (+3.42z)| lr 5.79e-03 | 2031.77 ms | 67.5% bf16 MFU | 259022 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.511979 +step 3270/18794 | loss 3.308154 (-0.16z)| norm 0.2999 (+2.51z)| lr 5.79e-03 | 2019.98 ms | 67.9% bf16 MFU | 259048 tok/s +step 3271/18794 | loss 3.293561 (-0.60z)| norm 0.1845 (-0.43z)| lr 5.79e-03 | 2025.24 ms | 67.8% bf16 MFU | 259040 tok/s +step 3272/18794 | loss 3.308806 (-0.18z)| norm 0.1879 (-0.35z)| lr 5.79e-03 | 2036.22 ms | 67.4% bf16 MFU | 258962 tok/s +step 3273/18794 | loss 3.319556 (+0.12z)| norm 0.1942 (-0.20z)| lr 5.79e-03 | 2036.69 ms | 67.4% bf16 MFU | 258885 tok/s +step 3274/18794 | loss 3.307149 (-0.22z)| norm 0.2087 (+0.17z)| lr 5.79e-03 | 2011.75 ms | 68.2% bf16 MFU | 258971 tok/s +step 3275/18794 | loss 3.285054 (-0.84z)| norm 0.1828 (-0.49z)| lr 5.79e-03 | 2003.37 ms | 68.5% bf16 MFU | 259108 tok/s +step 3276/18794 | loss 3.330007 (+0.44z)| norm 0.1610 (-1.05z)| lr 5.79e-03 | 2027.96 ms | 67.7% bf16 MFU | 259079 tok/s +step 3277/18794 | loss 3.333348 (+0.53z)| norm 0.1867 (-0.38z)| lr 5.78e-03 | 2028.24 ms | 67.7% bf16 MFU | 259050 tok/s +step 3278/18794 | loss 3.307547 (-0.20z)| norm 0.2128 (+0.33z)| lr 5.78e-03 | 2023.44 ms | 67.8% bf16 MFU | 259052 tok/s +step 3279/18794 | loss 3.292535 (-0.64z)| norm 0.2282 (+0.74z)| lr 5.78e-03 | 2043.42 ms | 67.2% bf16 MFU | 258928 tok/s +step 3280/18794 | loss 3.310773 (-0.13z)| norm 0.2078 (+0.19z)| lr 5.78e-03 | 2021.79 ms | 67.9% bf16 MFU | 258948 tok/s +step 3281/18794 | loss 3.346057 (+0.88z)| norm 0.1879 (-0.33z)| lr 5.78e-03 | 2025.18 ms | 67.8% bf16 MFU | 258945 tok/s +step 3282/18794 | loss 3.337009 (+0.62z)| norm 0.2287 (+0.76z)| lr 5.78e-03 | 2018.26 ms | 68.0% bf16 MFU | 258986 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.907984 +step 3283/18794 | loss 3.353423 (+1.09z)| norm 0.3146 (+2.91z)| lr 5.78e-03 | 2024.44 ms | 67.8% bf16 MFU | 258986 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.129939 +step 3284/18794 | loss 3.371597 (+1.57z)| norm 0.2872 (+2.13z)| lr 5.78e-03 | 2024.80 ms | 67.8% bf16 MFU | 258983 tok/s +step 3285/18794 | loss 3.371321 (+1.53z)| norm 0.2279 (+0.63z)| lr 5.78e-03 | 2034.08 ms | 67.5% bf16 MFU | 258922 tok/s +step 3286/18794 | loss 3.332744 (+0.43z)| norm 0.1662 (-0.91z)| lr 5.78e-03 | 2023.96 ms | 67.8% bf16 MFU | 258928 tok/s +step 3287/18794 | loss 3.332775 (+0.43z)| norm 0.2351 (+0.81z)| lr 5.78e-03 | 2048.03 ms | 67.0% bf16 MFU | 258781 tok/s +step 3288/18794 | loss 3.358712 (+1.14z)| norm 0.2573 (+1.33z)| lr 5.78e-03 | 2015.07 ms | 68.1% bf16 MFU | 258851 tok/s +step 3289/18794 | loss 3.267561 (-1.38z)| norm 0.1991 (-0.11z)| lr 5.78e-03 | 2014.64 ms | 68.1% bf16 MFU | 258921 tok/s +step 3290/18794 | loss 3.338828 (+0.58z)| norm 0.1944 (-0.22z)| lr 5.78e-03 | 2023.45 ms | 67.8% bf16 MFU | 258930 tok/s +step 3291/18794 | loss 3.285354 (-0.90z)| norm 0.2020 (-0.01z)| lr 5.78e-03 | 2014.76 ms | 68.1% bf16 MFU | 258994 tok/s +step 3292/18794 | loss 3.349204 (+0.86z)| norm 0.1398 (-1.54z)| lr 5.78e-03 | 2033.51 ms | 67.5% bf16 MFU | 258936 tok/s +step 3293/18794 | loss 3.308280 (-0.27z)| norm 0.1597 (-1.03z)| lr 5.78e-03 | 2027.80 ms | 67.7% bf16 MFU | 258917 tok/s +step 3294/18794 | loss 3.291864 (-0.73z)| norm 0.1723 (-0.72z)| lr 5.78e-03 | 2040.46 ms | 67.3% bf16 MFU | 258818 tok/s +step 3295/18794 | loss 3.311075 (-0.19z)| norm 0.1688 (-0.81z)| lr 5.78e-03 | 2030.31 ms | 67.6% bf16 MFU | 258789 tok/s +step 3296/18794 | loss 3.280830 (-1.02z)| norm 0.1813 (-0.50z)| lr 5.78e-03 | 2017.11 ms | 68.0% bf16 MFU | 258845 tok/s +step 3297/18794 | loss 3.386867 (+1.92z)| norm 0.1964 (-0.13z)| lr 5.78e-03 | 2043.11 ms | 67.2% bf16 MFU | 258734 tok/s +step 3298/18794 | loss 3.316441 (-0.08z)| norm 0.2248 (+0.57z)| lr 5.78e-03 | 2011.26 ms | 68.2% bf16 MFU | 258831 tok/s +step 3299/18794 | loss 3.314803 (-0.14z)| norm 0.2385 (+0.90z)| lr 5.78e-03 | 2016.13 ms | 68.1% bf16 MFU | 258892 tok/s +step 3300/18794 | loss 3.388842 (+1.91z)| norm 0.1766 (-0.63z)| lr 5.78e-03 | 2027.37 ms | 67.7% bf16 MFU | 258877 tok/s +step 3301/18794 | loss 3.284402 (-0.99z)| norm 0.2074 (+0.12z)| lr 5.78e-03 | 2014.60 ms | 68.1% bf16 MFU | 258946 tok/s +step 3302/18794 | loss 3.368515 (+1.32z)| norm 0.2400 (+0.92z)| lr 5.78e-03 | 2013.01 ms | 68.2% bf16 MFU | 259021 tok/s +step 3303/18794 | loss 3.358420 (+1.02z)| norm 0.1946 (-0.23z)| lr 5.78e-03 | 2015.71 ms | 68.1% bf16 MFU | 259075 tok/s +step 3304/18794 | loss 3.282089 (-1.08z)| norm 0.1868 (-0.43z)| lr 5.78e-03 | 2033.90 ms | 67.5% bf16 MFU | 259010 tok/s +step 3305/18794 | loss 3.338249 (+0.48z)| norm 0.1814 (-0.56z)| lr 5.78e-03 | 2019.33 ms | 68.0% bf16 MFU | 259041 tok/s +step 3306/18794 | loss 3.312799 (-0.22z)| norm 0.1842 (-0.48z)| lr 5.78e-03 | 2002.35 ms | 68.5% bf16 MFU | 259181 tok/s +step 3307/18794 | loss 3.334402 (+0.37z)| norm 0.1802 (-0.58z)| lr 5.78e-03 | 2015.83 ms | 68.1% bf16 MFU | 259226 tok/s +step 3308/18794 | loss 3.329272 (+0.21z)| norm 0.1793 (-0.61z)| lr 5.78e-03 | 2025.28 ms | 67.8% bf16 MFU | 259208 tok/s +step 3309/18794 | loss 3.332663 (+0.29z)| norm 0.1711 (-0.81z)| lr 5.78e-03 | 2020.17 ms | 67.9% bf16 MFU | 259224 tok/s +step 3310/18794 | loss 3.368088 (+1.29z)| norm 0.1439 (-1.47z)| lr 5.78e-03 | 2023.60 ms | 67.8% bf16 MFU | 259217 tok/s +step 3311/18794 | loss 3.249811 (-2.11z)| norm 0.1762 (-0.65z)| lr 5.78e-03 | 2011.28 ms | 68.2% bf16 MFU | 259290 tok/s +step 3312/18794 | loss 3.329938 (+0.18z)| norm 0.1845 (-0.43z)| lr 5.78e-03 | 2017.90 ms | 68.0% bf16 MFU | 259317 tok/s +step 3313/18794 | loss 3.281083 (-1.21z)| norm 0.1847 (-0.42z)| lr 5.78e-03 | 2003.93 ms | 68.5% bf16 MFU | 259432 tok/s +step 3314/18794 | loss 3.306866 (-0.46z)| norm 0.2232 (+0.56z)| lr 5.78e-03 | 2005.59 ms | 68.4% bf16 MFU | 259531 tok/s +step 3315/18794 | loss 3.328140 (+0.14z)| norm 0.2712 (+1.72z)| lr 5.78e-03 | 2014.66 ms | 68.1% bf16 MFU | 259567 tok/s +step 3316/18794 | loss 3.287662 (-1.03z)| norm 0.2352 (+0.82z)| lr 5.78e-03 | 2017.72 ms | 68.0% bf16 MFU | 259580 tok/s +step 3317/18794 | loss 3.313059 (-0.29z)| norm 0.1768 (-0.63z)| lr 5.78e-03 | 2013.73 ms | 68.1% bf16 MFU | 259619 tok/s +step 3318/18794 | loss 3.374444 (+1.46z)| norm 0.1934 (-0.18z)| lr 5.78e-03 | 2029.52 ms | 67.6% bf16 MFU | 259555 tok/s +step 3319/18794 | loss 3.313125 (-0.31z)| norm 0.1889 (-0.29z)| lr 5.78e-03 | 1992.30 ms | 68.9% bf16 MFU | 259735 tok/s +step 3320/18794 | loss 3.350252 (+0.75z)| norm 0.2458 (+1.21z)| lr 5.78e-03 | 2006.92 ms | 68.4% bf16 MFU | 259810 tok/s +step 3321/18794 | loss 3.341719 (+0.49z)| norm 0.2502 (+1.31z)| lr 5.78e-03 | 2023.76 ms | 67.8% bf16 MFU | 259773 tok/s +step 3322/18794 | loss 3.304472 (-0.58z)| norm 0.2568 (+1.46z)| lr 5.78e-03 | 2007.46 ms | 68.4% bf16 MFU | 259843 tok/s +step 3323/18794 | loss 3.407147 (+2.34z)| norm 0.2465 (+1.19z)| lr 5.78e-03 | 2016.05 ms | 68.1% bf16 MFU | 259854 tok/s +step 3324/18794 | loss 3.324873 (+0.00z)| norm 0.2527 (+1.32z)| lr 5.78e-03 | 2025.96 ms | 67.7% bf16 MFU | 259800 tok/s +step 3325/18794 | loss 3.322024 (-0.07z)| norm 0.2098 (+0.21z)| lr 5.78e-03 | 2019.08 ms | 68.0% bf16 MFU | 259793 tok/s +step 3326/18794 | loss 3.275208 (-1.42z)| norm 0.2268 (+0.66z)| lr 5.78e-03 | 2016.71 ms | 68.0% bf16 MFU | 259802 tok/s +step 3327/18794 | loss 3.334287 (+0.27z)| norm 0.1968 (-0.12z)| lr 5.78e-03 | 2027.51 ms | 67.7% bf16 MFU | 259742 tok/s +step 3328/18794 | loss 3.342282 (+0.51z)| norm 0.1911 (-0.26z)| lr 5.78e-03 | 2012.48 ms | 68.2% bf16 MFU | 259780 tok/s +step 3329/18794 | loss 3.348693 (+0.69z)| norm 0.2372 (+0.91z)| lr 5.78e-03 | 2018.08 ms | 68.0% bf16 MFU | 259781 tok/s +step 3330/18794 | loss 3.299084 (-0.79z)| norm 0.2060 (+0.10z)| lr 5.77e-03 | 1986.73 ms | 69.1% bf16 MFU | 259987 tok/s +step 3331/18794 | loss 3.300627 (-0.77z)| norm 0.1632 (-1.00z)| lr 5.77e-03 | 1985.02 ms | 69.1% bf16 MFU | 260194 tok/s +step 3332/18794 | loss 3.295144 (-0.92z)| norm 0.1660 (-0.92z)| lr 5.77e-03 | 2020.19 ms | 67.9% bf16 MFU | 260160 tok/s +step 3333/18794 | loss 3.319728 (-0.18z)| norm 0.1657 (-0.91z)| lr 5.77e-03 | 2008.69 ms | 68.3% bf16 MFU | 260203 tok/s +step 3334/18794 | loss 3.369696 (+1.30z)| norm 0.1998 (-0.03z)| lr 5.77e-03 | 2002.18 ms | 68.5% bf16 MFU | 260286 tok/s +step 3335/18794 | loss 3.327908 (+0.02z)| norm 0.2719 (+1.77z)| lr 5.77e-03 | 2001.63 ms | 68.6% bf16 MFU | 260368 tok/s +step 3336/18794 | loss 3.290308 (-1.12z)| norm 0.2463 (+1.10z)| lr 5.77e-03 | 2029.26 ms | 67.6% bf16 MFU | 260268 tok/s +step 3337/18794 | loss 3.330049 (+0.08z)| norm 0.2369 (+0.85z)| lr 5.77e-03 | 2007.24 ms | 68.4% bf16 MFU | 260314 tok/s +step 3338/18794 | loss 3.271710 (-1.70z)| norm 0.2317 (+0.70z)| lr 5.77e-03 | 2013.06 ms | 68.2% bf16 MFU | 260321 tok/s +step 3339/18794 | loss 3.309511 (-0.55z)| norm 0.1925 (-0.29z)| lr 5.77e-03 | 1994.45 ms | 68.8% bf16 MFU | 260448 tok/s +step 3340/18794 | loss 3.331728 (+0.12z)| norm 0.1971 (-0.19z)| lr 5.77e-03 | 2016.31 ms | 68.1% bf16 MFU | 260427 tok/s +step 3341/18794 | loss 3.338667 (+0.33z)| norm 0.2124 (+0.19z)| lr 5.77e-03 | 2018.35 ms | 68.0% bf16 MFU | 260394 tok/s +step 3342/18794 | loss 3.300130 (-0.89z)| norm 0.1925 (-0.34z)| lr 5.77e-03 | 2010.82 ms | 68.2% bf16 MFU | 260411 tok/s +step 3343/18794 | loss 3.399970 (+2.23z)| norm 0.1776 (-0.73z)| lr 5.77e-03 | 2001.56 ms | 68.6% bf16 MFU | 260487 tok/s +step 3344/18794 | loss 3.306878 (-0.66z)| norm 0.2115 (+0.14z)| lr 5.77e-03 | 1995.94 ms | 68.8% bf16 MFU | 260597 tok/s +step 3345/18794 | loss 3.306703 (-0.65z)| norm 0.1698 (-0.94z)| lr 5.77e-03 | 2002.02 ms | 68.5% bf16 MFU | 260661 tok/s +step 3346/18794 | loss 3.340330 (+0.44z)| norm 0.1902 (-0.41z)| lr 5.77e-03 | 2021.54 ms | 67.9% bf16 MFU | 260595 tok/s +step 3347/18794 | loss 3.283325 (-1.37z)| norm 0.1576 (-1.24z)| lr 5.77e-03 | 1996.91 ms | 68.7% bf16 MFU | 260693 tok/s +step 3348/18794 | loss 3.341170 (+0.48z)| norm 0.1868 (-0.50z)| lr 5.77e-03 | 2021.54 ms | 67.9% bf16 MFU | 260626 tok/s +step 3349/18794 | loss 3.348197 (+0.70z)| norm 0.2254 (+0.50z)| lr 5.77e-03 | 2006.29 ms | 68.4% bf16 MFU | 260661 tok/s +step 3350/18794 | loss 3.353426 (+0.85z)| norm 0.1887 (-0.46z)| lr 5.77e-03 | 1994.29 ms | 68.8% bf16 MFU | 260772 tok/s +step 3351/18794 | loss 3.299732 (-0.85z)| norm 0.1743 (-0.82z)| lr 5.77e-03 | 1996.14 ms | 68.7% bf16 MFU | 260866 tok/s +step 3352/18794 | loss 3.318036 (-0.27z)| norm 0.1586 (-1.20z)| lr 5.77e-03 | 2012.39 ms | 68.2% bf16 MFU | 260850 tok/s +step 3353/18794 | loss 3.408153 (+2.54z)| norm 0.2090 (+0.13z)| lr 5.77e-03 | 2013.75 ms | 68.1% bf16 MFU | 260825 tok/s +step 3354/18794 | loss 3.312699 (-0.45z)| norm 0.2166 (+0.35z)| lr 5.77e-03 | 2008.94 ms | 68.3% bf16 MFU | 260832 tok/s +step 3355/18794 | loss 3.428398 (+3.08z)| norm 0.2052 (+0.06z)| lr 5.77e-03 | 2006.99 ms | 68.4% bf16 MFU | 260852 tok/s +step 3356/18794 | loss 3.295382 (-0.96z)| norm 0.1761 (-0.72z)| lr 5.77e-03 | 2004.29 ms | 68.5% bf16 MFU | 260889 tok/s +step 3357/18794 | loss 3.358558 (+0.98z)| norm 0.1747 (-0.75z)| lr 5.77e-03 | 2011.91 ms | 68.2% bf16 MFU | 260874 tok/s +step 3358/18794 | loss 3.332862 (+0.19z)| norm 0.2035 (+0.03z)| lr 5.77e-03 | 2010.25 ms | 68.3% bf16 MFU | 260871 tok/s +step 3359/18794 | loss 3.299732 (-0.81z)| norm 0.2324 (+0.82z)| lr 5.77e-03 | 1992.78 ms | 68.9% bf16 MFU | 260982 tok/s +step 3360/18794 | loss 3.302582 (-0.71z)| norm 0.2740 (+1.91z)| lr 5.77e-03 | 1995.36 ms | 68.8% bf16 MFU | 261070 tok/s +step 3361/18794 | loss 3.319477 (-0.19z)| norm 0.2021 (-0.04z)| lr 5.77e-03 | 2012.94 ms | 68.2% bf16 MFU | 261040 tok/s +step 3362/18794 | loss 3.269783 (-1.68z)| norm 0.1651 (-1.03z)| lr 5.77e-03 | 2020.55 ms | 67.9% bf16 MFU | 260962 tok/s +step 3363/18794 | loss 3.343903 (+0.56z)| norm 0.1665 (-1.00z)| lr 5.77e-03 | 1983.24 ms | 69.2% bf16 MFU | 261132 tok/s +step 3364/18794 | loss 3.291849 (-1.00z)| norm 0.1935 (-0.28z)| lr 5.77e-03 | 2019.17 ms | 68.0% bf16 MFU | 261058 tok/s +step 3365/18794 | loss 3.313920 (-0.33z)| norm 0.1833 (-0.57z)| lr 5.77e-03 | 2005.94 ms | 68.4% bf16 MFU | 261073 tok/s +step 3366/18794 | loss 3.344990 (+0.61z)| norm 0.1769 (-0.75z)| lr 5.77e-03 | 1993.66 ms | 68.8% bf16 MFU | 261169 tok/s +step 3367/18794 | loss 3.305401 (-0.58z)| norm 0.2295 (+0.68z)| lr 5.77e-03 | 2002.31 ms | 68.5% bf16 MFU | 261202 tok/s +step 3368/18794 | loss 3.331342 (+0.21z)| norm 0.2446 (+1.08z)| lr 5.77e-03 | 2025.12 ms | 67.8% bf16 MFU | 261087 tok/s +step 3369/18794 | loss 3.337021 (+0.38z)| norm 0.2001 (-0.12z)| lr 5.77e-03 | 2004.33 ms | 68.5% bf16 MFU | 261111 tok/s +step 3370/18794 | loss 3.325512 (+0.02z)| norm 0.1439 (-1.77z)| lr 5.77e-03 | 2015.84 ms | 68.1% bf16 MFU | 261060 tok/s +step 3371/18794 | loss 3.358809 (+1.01z)| norm 0.2335 (+0.92z)| lr 5.77e-03 | 2003.15 ms | 68.5% bf16 MFU | 261094 tok/s +step 3372/18794 | loss 3.297499 (-0.84z)| norm 0.1868 (-0.49z)| lr 5.77e-03 | 2006.23 ms | 68.4% bf16 MFU | 261105 tok/s +step 3373/18794 | loss 3.311671 (-0.41z)| norm 0.2073 (+0.13z)| lr 5.77e-03 | 2011.82 ms | 68.2% bf16 MFU | 261080 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.466267 +step 3374/18794 | loss 3.292473 (-0.99z)| norm 0.2884 (+2.47z)| lr 5.77e-03 | 1984.21 ms | 69.2% bf16 MFU | 261238 tok/s +step 3375/18794 | loss 3.348782 (+0.70z)| norm 0.2613 (+1.63z)| lr 5.77e-03 | 1994.51 ms | 68.8% bf16 MFU | 261319 tok/s +step 3376/18794 | loss 3.307481 (-0.55z)| norm 0.2311 (+0.75z)| lr 5.77e-03 | 1979.77 ms | 69.3% bf16 MFU | 261494 tok/s +step 3377/18794 | loss 3.312780 (-0.38z)| norm 0.2117 (+0.18z)| lr 5.77e-03 | 2000.88 ms | 68.6% bf16 MFU | 261521 tok/s +step 3378/18794 | loss 3.375434 (+1.48z)| norm 0.1940 (-0.33z)| lr 5.77e-03 | 2015.00 ms | 68.1% bf16 MFU | 261455 tok/s +step 3379/18794 | loss 3.357208 (+0.92z)| norm 0.2441 (+1.11z)| lr 5.77e-03 | 2001.81 ms | 68.6% bf16 MFU | 261477 tok/s +step 3380/18794 | loss 3.273307 (-1.57z)| norm 0.2064 (+0.02z)| lr 5.77e-03 | 1986.81 ms | 69.1% bf16 MFU | 261598 tok/s +step 3381/18794 | loss 3.344671 (+0.55z)| norm 0.1864 (-0.55z)| lr 5.77e-03 | 2001.98 ms | 68.5% bf16 MFU | 261612 tok/s +step 3382/18794 | loss 3.304694 (-0.63z)| norm 0.1822 (-0.66z)| lr 5.76e-03 | 2003.86 ms | 68.5% bf16 MFU | 261613 tok/s +step 3383/18794 | loss 3.321505 (-0.12z)| norm 0.2467 (+1.28z)| lr 5.76e-03 | 1987.81 ms | 69.0% bf16 MFU | 261720 tok/s +step 3384/18794 | loss 3.290235 (-1.03z)| norm 0.2461 (+1.30z)| lr 5.76e-03 | 1986.80 ms | 69.1% bf16 MFU | 261828 tok/s +step 3385/18794 | loss 3.321979 (-0.07z)| norm 0.2423 (+1.17z)| lr 5.76e-03 | 1996.31 ms | 68.7% bf16 MFU | 261868 tok/s +step 3386/18794 | loss 3.267488 (-1.67z)| norm 0.1890 (-0.48z)| lr 5.76e-03 | 2001.47 ms | 68.6% bf16 MFU | 261873 tok/s +step 3387/18794 | loss 3.337757 (+0.42z)| norm 0.2062 (+0.07z)| lr 5.76e-03 | 1992.11 ms | 68.9% bf16 MFU | 261938 tok/s +step 3388/18794 | loss 3.286500 (-1.08z)| norm 0.1912 (-0.39z)| lr 5.76e-03 | 1979.26 ms | 69.3% bf16 MFU | 262086 tok/s +step 3389/18794 | loss 3.314993 (-0.25z)| norm 0.1532 (-1.55z)| lr 5.76e-03 | 2004.60 ms | 68.5% bf16 MFU | 262059 tok/s +step 3390/18794 | loss 3.414549 (+2.63z)| norm 0.1790 (-0.74z)| lr 5.76e-03 | 2001.13 ms | 68.6% bf16 MFU | 262055 tok/s +step 3391/18794 | loss 3.324148 (-0.01z)| norm 0.1478 (-1.67z)| lr 5.76e-03 | 1982.42 ms | 69.2% bf16 MFU | 262176 tok/s +step 3392/18794 | loss 3.268318 (-1.61z)| norm 0.1416 (-1.86z)| lr 5.76e-03 | 1994.72 ms | 68.8% bf16 MFU | 262209 tok/s +step 3393/18794 | loss 3.314530 (-0.27z)| norm 0.1635 (-1.19z)| lr 5.76e-03 | 1997.78 ms | 68.7% bf16 MFU | 262220 tok/s +step 3394/18794 | loss 3.307975 (-0.47z)| norm 0.1718 (-0.94z)| lr 5.76e-03 | 1988.07 ms | 69.0% bf16 MFU | 262295 tok/s +step 3395/18794 | loss 3.289915 (-0.98z)| norm 0.1483 (-1.64z)| lr 5.76e-03 | 1994.49 ms | 68.8% bf16 MFU | 262324 tok/s +step 3396/18794 | loss 3.326108 (+0.05z)| norm 0.2061 (+0.11z)| lr 5.76e-03 | 1995.30 ms | 68.8% bf16 MFU | 262346 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.254276 +step 3397/18794 | loss 3.330106 (+0.19z)| norm 0.2790 (+2.25z)| lr 5.76e-03 | 1989.67 ms | 69.0% bf16 MFU | 262404 tok/s +reducing beta2 to 0.9 and lr/wd by 0.872 due to grad z-score of 4.012993 +step 3398/18794 | loss 3.348960 (+0.74z)| norm 0.3516 (+4.01z)| lr 5.03e-03 | 2008.54 ms | 68.3% bf16 MFU | 262335 tok/s +step 3399/18794 | loss 3.334248 (+0.30z)| norm 0.1968 (-0.20z)| lr 5.76e-03 | 1989.95 ms | 69.0% bf16 MFU | 262392 tok/s +step 3400/18794 | loss 3.281450 (-1.25z)| norm 0.2379 (+0.91z)| lr 5.76e-03 | 1995.07 ms | 68.8% bf16 MFU | 262412 tok/s +reducing beta2 to 0.9 and lr/wd by 0.960 due to grad z-score of 3.644756 +step 3401/18794 | loss 3.383346 (+1.76z)| norm 0.3493 (+3.64z)| lr 5.53e-03 | 1997.86 ms | 68.7% bf16 MFU | 262412 tok/s +step 3402/18794 | loss 3.261996 (-1.80z)| norm 0.2626 (+1.42z)| lr 5.76e-03 | 1992.25 ms | 68.9% bf16 MFU | 262450 tok/s +step 3403/18794 | loss 3.319215 (-0.10z)| norm 0.2077 (+0.03z)| lr 5.76e-03 | 1989.36 ms | 69.0% bf16 MFU | 262505 tok/s +step 3404/18794 | loss 3.298049 (-0.74z)| norm 0.2264 (+0.49z)| lr 5.76e-03 | 1987.18 ms | 69.1% bf16 MFU | 262571 tok/s +step 3405/18794 | loss 3.308724 (-0.41z)| norm 0.1823 (-0.62z)| lr 5.76e-03 | 1992.67 ms | 68.9% bf16 MFU | 262598 tok/s +step 3406/18794 | loss 3.322811 (+0.01z)| norm 0.1651 (-1.05z)| lr 5.76e-03 | 1985.75 ms | 69.1% bf16 MFU | 262669 tok/s +step 3407/18794 | loss 3.347510 (+0.74z)| norm 0.1648 (-1.05z)| lr 5.76e-03 | 1998.19 ms | 68.7% bf16 MFU | 262655 tok/s +step 3408/18794 | loss 3.330670 (+0.23z)| norm 0.1886 (-0.45z)| lr 5.76e-03 | 1983.98 ms | 69.2% bf16 MFU | 262735 tok/s +step 3409/18794 | loss 3.328421 (+0.17z)| norm 0.1609 (-1.14z)| lr 5.76e-03 | 1986.05 ms | 69.1% bf16 MFU | 262798 tok/s +step 3410/18794 | loss 3.271103 (-1.50z)| norm 0.1604 (-1.17z)| lr 5.76e-03 | 1983.64 ms | 69.2% bf16 MFU | 262873 tok/s +step 3411/18794 | loss 3.266182 (-1.67z)| norm 0.1699 (-0.92z)| lr 5.76e-03 | 1988.75 ms | 69.0% bf16 MFU | 262911 tok/s +step 3412/18794 | loss 3.308325 (-0.40z)| norm 0.1803 (-0.66z)| lr 5.76e-03 | 1995.53 ms | 68.8% bf16 MFU | 262902 tok/s +step 3413/18794 | loss 3.283813 (-1.14z)| norm 0.1765 (-0.75z)| lr 5.76e-03 | 1978.72 ms | 69.4% bf16 MFU | 263005 tok/s +step 3414/18794 | loss 3.340775 (+0.56z)| norm 0.2273 (+0.52z)| lr 5.76e-03 | 1979.83 ms | 69.3% bf16 MFU | 263096 tok/s +step 3415/18794 | loss 3.310516 (-0.34z)| norm 0.2179 (+0.30z)| lr 5.76e-03 | 2001.11 ms | 68.6% bf16 MFU | 263041 tok/s +step 3416/18794 | loss 3.253271 (-2.01z)| norm 0.1990 (-0.17z)| lr 5.76e-03 | 1999.79 ms | 68.6% bf16 MFU | 262997 tok/s +step 3417/18794 | loss 3.215614 (-2.95z)| norm 0.2077 (+0.04z)| lr 5.76e-03 | 1982.37 ms | 69.2% bf16 MFU | 263071 tok/s +step 3418/18794 | loss 3.260052 (-1.67z)| norm 0.2076 (+0.04z)| lr 5.76e-03 | 1980.37 ms | 69.3% bf16 MFU | 263155 tok/s +step 3419/18794 | loss 3.345156 (+0.71z)| norm 0.1725 (-0.85z)| lr 5.76e-03 | 1992.20 ms | 68.9% bf16 MFU | 263156 tok/s +step 3420/18794 | loss 3.285157 (-0.95z)| norm 0.1545 (-1.29z)| lr 5.76e-03 | 1992.42 ms | 68.9% bf16 MFU | 263155 tok/s +step 3421/18794 | loss 3.323843 (+0.14z)| norm 0.1493 (-1.39z)| lr 5.76e-03 | 1990.45 ms | 68.9% bf16 MFU | 263167 tok/s +step 3422/18794 | loss 3.342508 (+0.65z)| norm 0.1716 (-0.81z)| lr 5.76e-03 | 1982.51 ms | 69.2% bf16 MFU | 263232 tok/s +step 3423/18794 | loss 3.274462 (-1.25z)| norm 0.1770 (-0.65z)| lr 5.76e-03 | 1980.92 ms | 69.3% bf16 MFU | 263304 tok/s +step 3424/18794 | loss 3.329958 (+0.34z)| norm 0.1735 (-0.73z)| lr 5.76e-03 | 1991.35 ms | 68.9% bf16 MFU | 263302 tok/s +step 3425/18794 | loss 3.299728 (-0.52z)| norm 0.1591 (-1.08z)| lr 5.76e-03 | 1982.53 ms | 69.2% bf16 MFU | 263360 tok/s +step 3426/18794 | loss 3.270470 (-1.35z)| norm 0.1674 (-0.85z)| lr 5.76e-03 | 1990.52 ms | 68.9% bf16 MFU | 263362 tok/s +step 3427/18794 | loss 3.272389 (-1.27z)| norm 0.2159 (+0.39z)| lr 5.76e-03 | 1991.96 ms | 68.9% bf16 MFU | 263354 tok/s +step 3428/18794 | loss 3.323498 (+0.19z)| norm 0.1939 (-0.18z)| lr 5.76e-03 | 1979.03 ms | 69.3% bf16 MFU | 263432 tok/s +step 3429/18794 | loss 3.304176 (-0.35z)| norm 0.1843 (-0.41z)| lr 5.76e-03 | 1987.39 ms | 69.1% bf16 MFU | 263451 tok/s +step 3430/18794 | loss 3.287843 (-0.81z)| norm 0.2049 (+0.12z)| lr 5.76e-03 | 1979.34 ms | 69.3% bf16 MFU | 263522 tok/s +step 3431/18794 | loss 3.310804 (-0.16z)| norm 0.1644 (-0.92z)| lr 5.76e-03 | 1980.49 ms | 69.3% bf16 MFU | 263583 tok/s +step 3432/18794 | loss 3.344055 (+0.77z)| norm 0.2418 (+1.05z)| lr 5.76e-03 | 2014.80 ms | 68.1% bf16 MFU | 263414 tok/s +step 3433/18794 | loss 3.356339 (+1.10z)| norm 0.2739 (+1.82z)| lr 5.75e-03 | 2041.81 ms | 67.2% bf16 MFU | 263082 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.231888 +step 3434/18794 | loss 3.326413 (+0.27z)| norm 0.2935 (+2.23z)| lr 5.75e-03 | 2044.94 ms | 67.1% bf16 MFU | 262747 tok/s +step 3435/18794 | loss 3.333357 (+0.47z)| norm 0.1764 (-0.64z)| lr 5.75e-03 | 2031.44 ms | 67.6% bf16 MFU | 262514 tok/s +step 3436/18794 | loss 3.314493 (-0.08z)| norm 0.1901 (-0.29z)| lr 5.75e-03 | 2037.70 ms | 67.3% bf16 MFU | 262253 tok/s +step 3437/18794 | loss 3.312593 (-0.13z)| norm 0.1845 (-0.42z)| lr 5.75e-03 | 2036.10 ms | 67.4% bf16 MFU | 262016 tok/s +step 3438/18794 | loss 3.326861 (+0.27z)| norm 0.1666 (-0.85z)| lr 5.75e-03 | 2041.82 ms | 67.2% bf16 MFU | 261753 tok/s +step 3439/18794 | loss 3.325080 (+0.21z)| norm 0.1750 (-0.63z)| lr 5.75e-03 | 2035.26 ms | 67.4% bf16 MFU | 261546 tok/s +step 3440/18794 | loss 3.307632 (-0.29z)| norm 0.1667 (-0.83z)| lr 5.75e-03 | 2022.64 ms | 67.8% bf16 MFU | 261429 tok/s +step 3441/18794 | loss 3.303734 (-0.39z)| norm 0.1986 (-0.03z)| lr 5.75e-03 | 2043.78 ms | 67.1% bf16 MFU | 261184 tok/s +step 3442/18794 | loss 3.328969 (+0.33z)| norm 0.2114 (+0.29z)| lr 5.75e-03 | 2046.01 ms | 67.1% bf16 MFU | 260937 tok/s +step 3443/18794 | loss 3.328963 (+0.36z)| norm 0.2104 (+0.26z)| lr 5.75e-03 | 2036.48 ms | 67.4% bf16 MFU | 260763 tok/s +step 3444/18794 | loss 3.334856 (+0.53z)| norm 0.1632 (-0.91z)| lr 5.75e-03 | 2036.28 ms | 67.4% bf16 MFU | 260598 tok/s +step 3445/18794 | loss 3.309088 (-0.24z)| norm 0.1627 (-0.92z)| lr 5.75e-03 | 2041.42 ms | 67.2% bf16 MFU | 260410 tok/s +step 3446/18794 | loss 3.339426 (+0.67z)| norm 0.2063 (+0.16z)| lr 5.75e-03 | 2041.23 ms | 67.2% bf16 MFU | 260232 tok/s +step 3447/18794 | loss 3.344175 (+0.79z)| norm 0.2378 (+0.93z)| lr 5.75e-03 | 2050.18 ms | 66.9% bf16 MFU | 260007 tok/s +step 3448/18794 | loss 3.299142 (-0.54z)| norm 0.2339 (+0.82z)| lr 5.75e-03 | 2025.66 ms | 67.7% bf16 MFU | 259947 tok/s +step 3449/18794 | loss 3.311710 (-0.15z)| norm 0.1599 (-1.01z)| lr 5.75e-03 | 2033.01 ms | 67.5% bf16 MFU | 259844 tok/s +step 3450/18794 | loss 3.294496 (-0.66z)| norm 0.1728 (-0.68z)| lr 5.75e-03 | 2033.51 ms | 67.5% bf16 MFU | 259743 tok/s +step 3451/18794 | loss 3.387092 (+2.06z)| norm 0.1973 (-0.08z)| lr 5.75e-03 | 2036.36 ms | 67.4% bf16 MFU | 259629 tok/s +step 3452/18794 | loss 3.215446 (-2.85z)| norm 0.1959 (-0.13z)| lr 5.75e-03 | 2026.61 ms | 67.7% bf16 MFU | 259583 tok/s +step 3453/18794 | loss 3.289333 (-0.75z)| norm 0.1746 (-0.65z)| lr 5.75e-03 | 2040.11 ms | 67.3% bf16 MFU | 259453 tok/s +step 3454/18794 | loss 3.310965 (-0.12z)| norm 0.2110 (+0.26z)| lr 5.75e-03 | 2021.24 ms | 67.9% bf16 MFU | 259450 tok/s +step 3455/18794 | loss 3.331130 (+0.53z)| norm 0.2077 (+0.18z)| lr 5.75e-03 | 2026.96 ms | 67.7% bf16 MFU | 259410 tok/s +step 3456/18794 | loss 3.250344 (-1.92z)| norm 0.2247 (+0.59z)| lr 5.75e-03 | 2025.29 ms | 67.8% bf16 MFU | 259383 tok/s +step 3457/18794 | loss 3.291142 (-0.66z)| norm 0.1874 (-0.34z)| lr 5.75e-03 | 2039.85 ms | 67.3% bf16 MFU | 259265 tok/s +step 3458/18794 | loss 3.305310 (-0.22z)| norm 0.1765 (-0.61z)| lr 5.75e-03 | 2039.02 ms | 67.3% bf16 MFU | 259159 tok/s +step 3459/18794 | loss 3.338219 (+0.77z)| norm 0.1712 (-0.73z)| lr 5.75e-03 | 2029.70 ms | 67.6% bf16 MFU | 259116 tok/s +step 3460/18794 | loss 3.316887 (+0.12z)| norm 0.1788 (-0.52z)| lr 5.75e-03 | 2040.05 ms | 67.3% bf16 MFU | 259010 tok/s +step 3461/18794 | loss 3.328875 (+0.48z)| norm 0.1945 (-0.12z)| lr 5.75e-03 | 2020.97 ms | 67.9% bf16 MFU | 259031 tok/s +step 3462/18794 | loss 3.309242 (-0.13z)| norm 0.1982 (-0.03z)| lr 5.75e-03 | 2033.09 ms | 67.5% bf16 MFU | 258973 tok/s +step 3463/18794 | loss 3.286705 (-0.81z)| norm 0.1567 (-1.09z)| lr 5.75e-03 | 2046.48 ms | 67.1% bf16 MFU | 258834 tok/s +step 3464/18794 | loss 3.251954 (-1.85z)| norm 0.1768 (-0.57z)| lr 5.75e-03 | 2030.66 ms | 67.6% bf16 MFU | 258802 tok/s +step 3465/18794 | loss 3.348723 (+1.08z)| norm 0.1776 (-0.55z)| lr 5.75e-03 | 2032.88 ms | 67.5% bf16 MFU | 258757 tok/s +step 3466/18794 | loss 3.359006 (+1.38z)| norm 0.1849 (-0.37z)| lr 5.75e-03 | 2012.88 ms | 68.2% bf16 MFU | 258842 tok/s +step 3467/18794 | loss 3.368168 (+1.62z)| norm 0.2062 (+0.18z)| lr 5.75e-03 | 2036.18 ms | 67.4% bf16 MFU | 258774 tok/s +step 3468/18794 | loss 3.351698 (+1.12z)| norm 0.2644 (+1.65z)| lr 5.75e-03 | 2035.82 ms | 67.4% bf16 MFU | 258712 tok/s +step 3469/18794 | loss 3.322449 (+0.26z)| norm 0.2421 (+1.07z)| lr 5.75e-03 | 2002.25 ms | 68.5% bf16 MFU | 258869 tok/s +step 3470/18794 | loss 3.312262 (-0.04z)| norm 0.2117 (+0.29z)| lr 5.75e-03 | 2032.97 ms | 67.5% bf16 MFU | 258820 tok/s +step 3471/18794 | loss 3.356598 (+1.28z)| norm 0.2138 (+0.35z)| lr 5.75e-03 | 2029.31 ms | 67.6% bf16 MFU | 258797 tok/s +step 3472/18794 | loss 3.337607 (+0.70z)| norm 0.2083 (+0.20z)| lr 5.75e-03 | 2034.07 ms | 67.5% bf16 MFU | 258745 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.162982 +step 3473/18794 | loss 3.317561 (+0.10z)| norm 0.3327 (+3.16z)| lr 5.75e-03 | 2030.82 ms | 67.6% bf16 MFU | 258716 tok/s +step 3474/18794 | loss 3.334845 (+0.60z)| norm 0.2655 (+1.56z)| lr 5.75e-03 | 2011.61 ms | 68.2% bf16 MFU | 258812 tok/s +step 3475/18794 | loss 3.358519 (+1.30z)| norm 0.2115 (+0.26z)| lr 5.75e-03 | 2023.52 ms | 67.8% bf16 MFU | 258826 tok/s +step 3476/18794 | loss 3.298420 (-0.47z)| norm 0.2533 (+1.28z)| lr 5.75e-03 | 2030.91 ms | 67.6% bf16 MFU | 258792 tok/s +step 3477/18794 | loss 3.285787 (-0.84z)| norm 0.2093 (+0.20z)| lr 5.75e-03 | 2018.76 ms | 68.0% bf16 MFU | 258838 tok/s +step 3478/18794 | loss 3.292577 (-0.62z)| norm 0.1665 (-0.84z)| lr 5.75e-03 | 2023.70 ms | 67.8% bf16 MFU | 258850 tok/s +step 3479/18794 | loss 3.361657 (+1.44z)| norm 0.1774 (-0.56z)| lr 5.75e-03 | 2026.22 ms | 67.7% bf16 MFU | 258845 tok/s +step 3480/18794 | loss 3.322354 (+0.25z)| norm 0.1478 (-1.26z)| lr 5.75e-03 | 2028.62 ms | 67.6% bf16 MFU | 258825 tok/s +step 3481/18794 | loss 3.313829 (+0.01z)| norm 0.1609 (-0.93z)| lr 5.75e-03 | 2029.09 ms | 67.6% bf16 MFU | 258803 tok/s +step 3482/18794 | loss 3.310730 (-0.09z)| norm 0.2324 (+0.79z)| lr 5.75e-03 | 2022.24 ms | 67.9% bf16 MFU | 258826 tok/s +step 3483/18794 | loss 3.287961 (-0.76z)| norm 0.2605 (+1.46z)| lr 5.74e-03 | 2023.57 ms | 67.8% bf16 MFU | 258839 tok/s +step 3484/18794 | loss 3.267382 (-1.36z)| norm 0.2372 (+0.90z)| lr 5.74e-03 | 2034.13 ms | 67.5% bf16 MFU | 258785 tok/s +step 3485/18794 | loss 3.317127 (+0.12z)| norm 0.1883 (-0.27z)| lr 5.74e-03 | 2027.08 ms | 67.7% bf16 MFU | 258777 tok/s +step 3486/18794 | loss 3.267477 (-1.36z)| norm 0.1932 (-0.15z)| lr 5.74e-03 | 2029.43 ms | 67.6% bf16 MFU | 258756 tok/s +step 3487/18794 | loss 3.376175 (+1.85z)| norm 0.2078 (+0.20z)| lr 5.74e-03 | 2017.79 ms | 68.0% bf16 MFU | 258809 tok/s +step 3488/18794 | loss 3.351066 (+1.08z)| norm 0.2167 (+0.41z)| lr 5.74e-03 | 2022.32 ms | 67.9% bf16 MFU | 258832 tok/s +step 3489/18794 | loss 3.360440 (+1.33z)| norm 0.2233 (+0.56z)| lr 5.74e-03 | 2016.34 ms | 68.1% bf16 MFU | 258891 tok/s +step 3490/18794 | loss 3.261942 (-1.53z)| norm 0.2364 (+0.86z)| lr 5.74e-03 | 2019.97 ms | 67.9% bf16 MFU | 258924 tok/s +step 3491/18794 | loss 3.355953 (+1.27z)| norm 0.2053 (+0.09z)| lr 5.74e-03 | 2042.28 ms | 67.2% bf16 MFU | 258814 tok/s +step 3492/18794 | loss 3.272957 (-1.21z)| norm 0.2199 (+0.44z)| lr 5.74e-03 | 2013.58 ms | 68.2% bf16 MFU | 258892 tok/s +step 3493/18794 | loss 3.280582 (-0.96z)| norm 0.2019 (-0.02z)| lr 5.74e-03 | 2008.90 ms | 68.3% bf16 MFU | 258996 tok/s +step 3494/18794 | loss 3.291196 (-0.64z)| norm 0.2160 (+0.32z)| lr 5.74e-03 | 2003.85 ms | 68.5% bf16 MFU | 259128 tok/s +step 3495/18794 | loss 3.261424 (-1.50z)| norm 0.2048 (+0.03z)| lr 5.74e-03 | 2013.18 ms | 68.2% bf16 MFU | 259193 tok/s +step 3496/18794 | loss 3.276639 (-1.04z)| norm 0.2153 (+0.29z)| lr 5.74e-03 | 2014.55 ms | 68.1% bf16 MFU | 259246 tok/s +step 3497/18794 | loss 3.276751 (-1.01z)| norm 0.2514 (+1.22z)| lr 5.74e-03 | 2016.69 ms | 68.0% bf16 MFU | 259283 tok/s +step 3498/18794 | loss 3.365705 (+1.56z)| norm 0.2292 (+0.74z)| lr 5.74e-03 | 2025.94 ms | 67.7% bf16 MFU | 259258 tok/s +step 3499/18794 | loss 3.274698 (-1.05z)| norm 0.2002 (-0.06z)| lr 5.74e-03 | 1994.27 ms | 68.8% bf16 MFU | 259440 tok/s +step 3500/18794 | loss 3.337293 (+0.74z)| norm 0.2235 (+0.59z)| lr 5.74e-03 | 2032.28 ms | 67.5% bf16 MFU | 259367 tok/s +val loss 3.323812 +Writing state to log_gpt3_125M_edu_v4/state_00003500_00001.bin +HellaSwag: 2783/10042 = 0.277136 +Writing checkpoint at step 3500 +Writing model to log_gpt3_125M_edu_v4/model_00003500.bin +Writing state to log_gpt3_125M_edu_v4/state_00003500_00000.bin +Deleting checkpoint at step 1000 +step 3501/18794 | loss 3.386911 (+2.17z)| norm 0.1869 (-0.41z)| lr 5.74e-03 | 2017.77 ms | 68.0% bf16 MFU | 259390 tok/s +step 3502/18794 | loss 3.252284 (-1.70z)| norm 0.1539 (-1.38z)| lr 5.74e-03 | 2030.32 ms | 67.6% bf16 MFU | 259332 tok/s +step 3503/18794 | loss 3.282351 (-0.83z)| norm 0.1816 (-0.53z)| lr 5.74e-03 | 1995.78 ms | 68.8% bf16 MFU | 259501 tok/s +step 3504/18794 | loss 3.319689 (+0.23z)| norm 0.1745 (-0.73z)| lr 5.74e-03 | 2020.31 ms | 67.9% bf16 MFU | 259501 tok/s +step 3505/18794 | loss 3.293049 (-0.52z)| norm 0.1587 (-1.20z)| lr 5.74e-03 | 2016.02 ms | 68.1% bf16 MFU | 259529 tok/s +step 3506/18794 | loss 3.267193 (-1.24z)| norm 0.1484 (-1.49z)| lr 5.74e-03 | 2020.18 ms | 67.9% bf16 MFU | 259529 tok/s +step 3507/18794 | loss 3.318640 (+0.23z)| norm 0.1872 (-0.34z)| lr 5.74e-03 | 2034.95 ms | 67.4% bf16 MFU | 259434 tok/s +step 3508/18794 | loss 3.301156 (-0.26z)| norm 0.2342 (+1.05z)| lr 5.74e-03 | 2006.52 ms | 68.4% bf16 MFU | 259527 tok/s +step 3509/18794 | loss 3.262460 (-1.33z)| norm 0.2424 (+1.27z)| lr 5.74e-03 | 2024.22 ms | 67.8% bf16 MFU | 259501 tok/s +step 3510/18794 | loss 3.311308 (+0.04z)| norm 0.2116 (+0.34z)| lr 5.74e-03 | 2030.79 ms | 67.6% bf16 MFU | 259435 tok/s +step 3511/18794 | loss 3.297198 (-0.37z)| norm 0.1720 (-0.85z)| lr 5.74e-03 | 2012.79 ms | 68.2% bf16 MFU | 259487 tok/s +step 3512/18794 | loss 3.309508 (-0.02z)| norm 0.1579 (-1.26z)| lr 5.74e-03 | 2008.51 ms | 68.3% bf16 MFU | 259564 tok/s +step 3513/18794 | loss 3.263630 (-1.32z)| norm 0.2080 (+0.23z)| lr 5.74e-03 | 2027.92 ms | 67.7% bf16 MFU | 259513 tok/s +step 3514/18794 | loss 3.295022 (-0.42z)| norm 0.2447 (+1.31z)| lr 5.74e-03 | 2025.85 ms | 67.7% bf16 MFU | 259477 tok/s +step 3515/18794 | loss 3.308400 (-0.03z)| norm 0.2577 (+1.67z)| lr 5.74e-03 | 1994.29 ms | 68.8% bf16 MFU | 259648 tok/s +step 3516/18794 | loss 3.257286 (-1.50z)| norm 0.2138 (+0.37z)| lr 5.74e-03 | 1989.56 ms | 69.0% bf16 MFU | 259841 tok/s +step 3517/18794 | loss 3.266739 (-1.28z)| norm 0.1513 (-1.43z)| lr 5.74e-03 | 1982.17 ms | 69.2% bf16 MFU | 260074 tok/s +step 3518/18794 | loss 3.268870 (-1.23z)| norm 0.2143 (+0.40z)| lr 5.74e-03 | 2008.93 ms | 68.3% bf16 MFU | 260120 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.168260 +step 3519/18794 | loss 3.298383 (-0.34z)| norm 0.2779 (+2.17z)| lr 5.74e-03 | 2009.48 ms | 68.3% bf16 MFU | 260159 tok/s +step 3520/18794 | loss 3.253653 (-1.64z)| norm 0.2282 (+0.74z)| lr 5.74e-03 | 2006.14 ms | 68.4% bf16 MFU | 260218 tok/s +step 3521/18794 | loss 3.238014 (-2.04z)| norm 0.1692 (-0.96z)| lr 5.74e-03 | 2019.73 ms | 67.9% bf16 MFU | 260186 tok/s +step 3522/18794 | loss 3.331535 (+0.67z)| norm 0.1524 (-1.43z)| lr 5.74e-03 | 1992.70 ms | 68.9% bf16 MFU | 260332 tok/s +step 3523/18794 | loss 3.242776 (-1.87z)| norm 0.1860 (-0.47z)| lr 5.74e-03 | 1990.15 ms | 69.0% bf16 MFU | 260488 tok/s +step 3524/18794 | loss 3.298814 (-0.26z)| norm 0.1819 (-0.59z)| lr 5.74e-03 | 2033.23 ms | 67.5% bf16 MFU | 260356 tok/s +step 3525/18794 | loss 3.305407 (-0.07z)| norm 0.1722 (-0.88z)| lr 5.74e-03 | 2007.03 ms | 68.4% bf16 MFU | 260400 tok/s +step 3526/18794 | loss 3.328910 (+0.59z)| norm 0.1579 (-1.29z)| lr 5.74e-03 | 2020.13 ms | 67.9% bf16 MFU | 260356 tok/s +step 3527/18794 | loss 3.314698 (+0.17z)| norm 0.1784 (-0.68z)| lr 5.74e-03 | 2031.32 ms | 67.6% bf16 MFU | 260244 tok/s +step 3528/18794 | loss 3.346574 (+1.08z)| norm 0.1769 (-0.72z)| lr 5.74e-03 | 2010.62 ms | 68.3% bf16 MFU | 260270 tok/s +step 3529/18794 | loss 3.246370 (-1.76z)| norm 0.1483 (-1.51z)| lr 5.74e-03 | 2001.87 ms | 68.6% bf16 MFU | 260351 tok/s +step 3530/18794 | loss 3.279232 (-0.83z)| norm 0.1877 (-0.39z)| lr 5.74e-03 | 2024.34 ms | 67.8% bf16 MFU | 260283 tok/s +step 3531/18794 | loss 3.267265 (-1.14z)| norm 0.2269 (+0.70z)| lr 5.74e-03 | 2025.08 ms | 67.8% bf16 MFU | 260214 tok/s +step 3532/18794 | loss 3.318226 (+0.29z)| norm 0.2211 (+0.55z)| lr 5.73e-03 | 2005.80 ms | 68.4% bf16 MFU | 260272 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.446120 +step 3533/18794 | loss 3.265393 (-1.17z)| norm 0.2886 (+2.45z)| lr 5.73e-03 | 2019.13 ms | 68.0% bf16 MFU | 260242 tok/s +step 3534/18794 | loss 3.302445 (-0.12z)| norm 0.2455 (+1.28z)| lr 5.73e-03 | 2007.29 ms | 68.4% bf16 MFU | 260289 tok/s +step 3535/18794 | loss 3.288550 (-0.50z)| norm 0.2005 (-0.04z)| lr 5.73e-03 | 2005.80 ms | 68.4% bf16 MFU | 260344 tok/s +step 3536/18794 | loss 3.363923 (+1.60z)| norm 0.2059 (+0.11z)| lr 5.73e-03 | 2005.08 ms | 68.4% bf16 MFU | 260401 tok/s +step 3537/18794 | loss 3.340314 (+0.93z)| norm 0.1762 (-0.75z)| lr 5.73e-03 | 2013.27 ms | 68.2% bf16 MFU | 260402 tok/s +step 3538/18794 | loss 3.302342 (-0.12z)| norm 0.1933 (-0.26z)| lr 5.73e-03 | 1987.91 ms | 69.0% bf16 MFU | 260568 tok/s +step 3539/18794 | loss 3.304446 (-0.06z)| norm 0.1629 (-1.14z)| lr 5.73e-03 | 2009.27 ms | 68.3% bf16 MFU | 260587 tok/s +step 3540/18794 | loss 3.248279 (-1.59z)| norm 0.1551 (-1.36z)| lr 5.73e-03 | 1999.97 ms | 68.6% bf16 MFU | 260665 tok/s +step 3541/18794 | loss 3.338640 (+0.89z)| norm 0.1716 (-0.87z)| lr 5.73e-03 | 2016.20 ms | 68.1% bf16 MFU | 260634 tok/s +step 3542/18794 | loss 3.328092 (+0.60z)| norm 0.1923 (-0.27z)| lr 5.73e-03 | 1999.36 ms | 68.6% bf16 MFU | 260713 tok/s +step 3543/18794 | loss 3.313974 (+0.22z)| norm 0.1998 (-0.05z)| lr 5.73e-03 | 2007.69 ms | 68.4% bf16 MFU | 260735 tok/s +step 3544/18794 | loss 3.273529 (-0.88z)| norm 0.2065 (+0.14z)| lr 5.73e-03 | 2012.73 ms | 68.2% bf16 MFU | 260722 tok/s +step 3545/18794 | loss 3.386694 (+2.16z)| norm 0.1985 (-0.11z)| lr 5.73e-03 | 1998.45 ms | 68.7% bf16 MFU | 260803 tok/s +step 3546/18794 | loss 3.297192 (-0.23z)| norm 0.1836 (-0.54z)| lr 5.73e-03 | 2015.49 ms | 68.1% bf16 MFU | 260770 tok/s +step 3547/18794 | loss 3.281069 (-0.65z)| norm 0.1576 (-1.27z)| lr 5.73e-03 | 2001.88 ms | 68.6% bf16 MFU | 260826 tok/s +step 3548/18794 | loss 3.289207 (-0.43z)| norm 0.1646 (-1.05z)| lr 5.73e-03 | 2018.88 ms | 68.0% bf16 MFU | 260769 tok/s +step 3549/18794 | loss 3.295316 (-0.26z)| norm 0.1533 (-1.37z)| lr 5.73e-03 | 1988.34 ms | 69.0% bf16 MFU | 260915 tok/s +step 3550/18794 | loss 3.306471 (+0.04z)| norm 0.1691 (-0.91z)| lr 5.73e-03 | 2008.47 ms | 68.3% bf16 MFU | 260921 tok/s +step 3551/18794 | loss 3.329324 (+0.69z)| norm 0.1800 (-0.59z)| lr 5.73e-03 | 1986.88 ms | 69.1% bf16 MFU | 261069 tok/s +step 3552/18794 | loss 3.273500 (-0.90z)| norm 0.1964 (-0.11z)| lr 5.73e-03 | 2006.67 ms | 68.4% bf16 MFU | 261079 tok/s +step 3553/18794 | loss 3.268606 (-1.03z)| norm 0.1624 (-1.09z)| lr 5.73e-03 | 1999.94 ms | 68.6% bf16 MFU | 261133 tok/s +step 3554/18794 | loss 3.311882 (+0.20z)| norm 0.1684 (-0.90z)| lr 5.73e-03 | 2005.56 ms | 68.4% bf16 MFU | 261147 tok/s +step 3555/18794 | loss 3.311709 (+0.20z)| norm 0.1818 (-0.51z)| lr 5.73e-03 | 1980.88 ms | 69.3% bf16 MFU | 261323 tok/s +step 3556/18794 | loss 3.336689 (+0.89z)| norm 0.1634 (-1.02z)| lr 5.73e-03 | 1991.67 ms | 68.9% bf16 MFU | 261419 tok/s +step 3557/18794 | loss 3.271749 (-0.96z)| norm 0.1873 (-0.33z)| lr 5.73e-03 | 2004.70 ms | 68.5% bf16 MFU | 261425 tok/s +step 3558/18794 | loss 3.315133 (+0.28z)| norm 0.2089 (+0.28z)| lr 5.73e-03 | 2013.20 ms | 68.2% bf16 MFU | 261375 tok/s +step 3559/18794 | loss 3.309263 (+0.12z)| norm 0.2160 (+0.47z)| lr 5.73e-03 | 1998.36 ms | 68.7% bf16 MFU | 261424 tok/s +step 3560/18794 | loss 3.292293 (-0.36z)| norm 0.2443 (+1.26z)| lr 5.73e-03 | 2002.34 ms | 68.5% bf16 MFU | 261445 tok/s +step 3561/18794 | loss 3.340996 (+1.03z)| norm 0.2534 (+1.49z)| lr 5.73e-03 | 1992.06 ms | 68.9% bf16 MFU | 261532 tok/s +step 3562/18794 | loss 3.319186 (+0.40z)| norm 0.2600 (+1.64z)| lr 5.73e-03 | 2005.93 ms | 68.4% bf16 MFU | 261524 tok/s +step 3563/18794 | loss 3.344202 (+1.09z)| norm 0.2598 (+1.59z)| lr 5.73e-03 | 1995.95 ms | 68.8% bf16 MFU | 261581 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.039039 +step 3564/18794 | loss 3.232042 (-2.06z)| norm 0.3184 (+3.04z)| lr 5.73e-03 | 1983.20 ms | 69.2% bf16 MFU | 261721 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.451921 +step 3565/18794 | loss 3.304177 (-0.02z)| norm 0.3002 (+2.45z)| lr 5.73e-03 | 1988.64 ms | 69.0% bf16 MFU | 261817 tok/s +step 3566/18794 | loss 3.312697 (+0.23z)| norm 0.2220 (+0.43z)| lr 5.73e-03 | 2016.03 ms | 68.1% bf16 MFU | 261729 tok/s +step 3567/18794 | loss 3.294621 (-0.27z)| norm 0.1895 (-0.41z)| lr 5.73e-03 | 1992.86 ms | 68.9% bf16 MFU | 261796 tok/s +step 3568/18794 | loss 3.322915 (+0.57z)| norm 0.1874 (-0.45z)| lr 5.73e-03 | 2002.45 ms | 68.5% bf16 MFU | 261798 tok/s +step 3569/18794 | loss 3.361479 (+1.67z)| norm 0.1965 (-0.20z)| lr 5.73e-03 | 1998.43 ms | 68.7% bf16 MFU | 261825 tok/s +step 3570/18794 | loss 3.318936 (+0.43z)| norm 0.2592 (+1.42z)| lr 5.73e-03 | 1993.20 ms | 68.9% bf16 MFU | 261886 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.343475 +step 3571/18794 | loss 3.251853 (-1.48z)| norm 0.2984 (+2.34z)| lr 5.73e-03 | 1997.55 ms | 68.7% bf16 MFU | 261915 tok/s +step 3572/18794 | loss 3.291234 (-0.33z)| norm 0.1814 (-0.60z)| lr 5.73e-03 | 1995.45 ms | 68.8% bf16 MFU | 261956 tok/s +step 3573/18794 | loss 3.330497 (+0.81z)| norm 0.1991 (-0.12z)| lr 5.73e-03 | 1998.81 ms | 68.7% bf16 MFU | 261974 tok/s +step 3574/18794 | loss 3.325436 (+0.67z)| norm 0.2049 (+0.05z)| lr 5.73e-03 | 1984.25 ms | 69.2% bf16 MFU | 262086 tok/s +step 3575/18794 | loss 3.358041 (+1.62z)| norm 0.1856 (-0.47z)| lr 5.73e-03 | 2004.03 ms | 68.5% bf16 MFU | 262063 tok/s +step 3576/18794 | loss 3.346283 (+1.25z)| norm 0.2546 (+1.39z)| lr 5.73e-03 | 2002.61 ms | 68.5% bf16 MFU | 262050 tok/s +step 3577/18794 | loss 3.285706 (-0.50z)| norm 0.2263 (+0.62z)| lr 5.73e-03 | 1996.60 ms | 68.7% bf16 MFU | 262077 tok/s +step 3578/18794 | loss 3.349134 (+1.31z)| norm 0.1596 (-1.16z)| lr 5.73e-03 | 1998.40 ms | 68.7% bf16 MFU | 262090 tok/s +step 3579/18794 | loss 3.236582 (-1.88z)| norm 0.1753 (-0.74z)| lr 5.73e-03 | 1989.57 ms | 69.0% bf16 MFU | 262162 tok/s +step 3580/18794 | loss 3.272708 (-0.83z)| norm 0.2201 (+0.44z)| lr 5.72e-03 | 1981.09 ms | 69.3% bf16 MFU | 262286 tok/s +step 3581/18794 | loss 3.280875 (-0.59z)| norm 0.2646 (+1.61z)| lr 5.72e-03 | 1991.56 ms | 68.9% bf16 MFU | 262334 tok/s +step 3582/18794 | loss 3.219450 (-2.25z)| norm 0.2201 (+0.42z)| lr 5.72e-03 | 1995.74 ms | 68.8% bf16 MFU | 262353 tok/s +step 3583/18794 | loss 3.285105 (-0.43z)| norm 0.1434 (-1.61z)| lr 5.72e-03 | 1987.14 ms | 69.1% bf16 MFU | 262427 tok/s +step 3584/18794 | loss 3.292450 (-0.23z)| norm 0.1831 (-0.54z)| lr 5.72e-03 | 1979.91 ms | 69.3% bf16 MFU | 262546 tok/s +step 3585/18794 | loss 3.290740 (-0.27z)| norm 0.1912 (-0.32z)| lr 5.72e-03 | 1995.36 ms | 68.8% bf16 MFU | 262556 tok/s +step 3586/18794 | loss 3.292282 (-0.24z)| norm 0.2200 (+0.45z)| lr 5.72e-03 | 2001.74 ms | 68.6% bf16 MFU | 262524 tok/s +step 3587/18794 | loss 3.318081 (+0.51z)| norm 0.1932 (-0.27z)| lr 5.72e-03 | 1998.62 ms | 68.7% bf16 MFU | 262514 tok/s +step 3588/18794 | loss 3.310551 (+0.31z)| norm 0.1604 (-1.13z)| lr 5.72e-03 | 1985.14 ms | 69.1% bf16 MFU | 262594 tok/s +step 3589/18794 | loss 3.290819 (-0.24z)| norm 0.1550 (-1.25z)| lr 5.72e-03 | 1990.98 ms | 68.9% bf16 MFU | 262631 tok/s +step 3590/18794 | loss 3.325080 (+0.75z)| norm 0.2148 (+0.35z)| lr 5.72e-03 | 2002.71 ms | 68.5% bf16 MFU | 262589 tok/s +step 3591/18794 | loss 3.248825 (-1.47z)| norm 0.2051 (+0.09z)| lr 5.72e-03 | 1978.75 ms | 69.4% bf16 MFU | 262707 tok/s +step 3592/18794 | loss 3.247762 (-1.48z)| norm 0.2002 (-0.03z)| lr 5.72e-03 | 1986.53 ms | 69.1% bf16 MFU | 262768 tok/s +step 3593/18794 | loss 3.275370 (-0.67z)| norm 0.2187 (+0.46z)| lr 5.72e-03 | 1989.99 ms | 69.0% bf16 MFU | 262803 tok/s +step 3594/18794 | loss 3.275134 (-0.67z)| norm 0.2118 (+0.27z)| lr 5.72e-03 | 1995.20 ms | 68.8% bf16 MFU | 262801 tok/s +step 3595/18794 | loss 3.284126 (-0.42z)| norm 0.2072 (+0.15z)| lr 5.72e-03 | 1982.73 ms | 69.2% bf16 MFU | 262883 tok/s +step 3596/18794 | loss 3.364736 (+1.89z)| norm 0.2132 (+0.31z)| lr 5.72e-03 | 1984.68 ms | 69.1% bf16 MFU | 262947 tok/s +step 3597/18794 | loss 3.200678 (-2.72z)| norm 0.2044 (+0.09z)| lr 5.72e-03 | 1990.14 ms | 69.0% bf16 MFU | 262972 tok/s +step 3598/18794 | loss 3.268491 (-0.82z)| norm 0.1999 (-0.03z)| lr 5.72e-03 | 1981.61 ms | 69.3% bf16 MFU | 263052 tok/s +step 3599/18794 | loss 3.383977 (+2.34z)| norm 0.2157 (+0.40z)| lr 5.72e-03 | 1994.42 ms | 68.8% bf16 MFU | 263043 tok/s +step 3600/18794 | loss 3.338644 (+1.10z)| norm 0.2303 (+0.79z)| lr 5.72e-03 | 1995.00 ms | 68.8% bf16 MFU | 263031 tok/s +step 3601/18794 | loss 3.265743 (-0.90z)| norm 0.2576 (+1.49z)| lr 5.72e-03 | 1987.86 ms | 69.0% bf16 MFU | 263067 tok/s +step 3602/18794 | loss 3.340011 (+1.18z)| norm 0.1792 (-0.61z)| lr 5.72e-03 | 1987.80 ms | 69.0% bf16 MFU | 263101 tok/s +step 3603/18794 | loss 3.242150 (-1.56z)| norm 0.1640 (-1.01z)| lr 5.72e-03 | 1992.44 ms | 68.9% bf16 MFU | 263103 tok/s +step 3604/18794 | loss 3.288478 (-0.26z)| norm 0.1959 (-0.16z)| lr 5.72e-03 | 1980.38 ms | 69.3% bf16 MFU | 263185 tok/s +step 3605/18794 | loss 3.303141 (+0.15z)| norm 0.1832 (-0.51z)| lr 5.72e-03 | 1994.08 ms | 68.8% bf16 MFU | 263172 tok/s +step 3606/18794 | loss 3.224061 (-2.01z)| norm 0.2059 (+0.08z)| lr 5.72e-03 | 1983.29 ms | 69.2% bf16 MFU | 263231 tok/s +step 3607/18794 | loss 3.373396 (+2.04z)| norm 0.2301 (+0.73z)| lr 5.72e-03 | 2001.96 ms | 68.5% bf16 MFU | 263164 tok/s +step 3608/18794 | loss 3.298610 (+0.02z)| norm 0.1515 (-1.38z)| lr 5.72e-03 | 1993.12 ms | 68.9% bf16 MFU | 263158 tok/s +step 3609/18794 | loss 3.287921 (-0.28z)| norm 0.1988 (-0.09z)| lr 5.72e-03 | 1984.49 ms | 69.2% bf16 MFU | 263210 tok/s +step 3610/18794 | loss 3.273588 (-0.65z)| norm 0.2752 (+1.94z)| lr 5.72e-03 | 1979.28 ms | 69.3% bf16 MFU | 263294 tok/s +step 3611/18794 | loss 3.343951 (+1.23z)| norm 0.2031 (+0.00z)| lr 5.72e-03 | 1982.02 ms | 69.2% bf16 MFU | 263355 tok/s +step 3612/18794 | loss 3.269754 (-0.75z)| norm 0.1736 (-0.80z)| lr 5.72e-03 | 1980.69 ms | 69.3% bf16 MFU | 263422 tok/s +step 3613/18794 | loss 3.285043 (-0.35z)| norm 0.1537 (-1.30z)| lr 5.72e-03 | 1979.69 ms | 69.3% bf16 MFU | 263493 tok/s +step 3614/18794 | loss 3.269179 (-0.76z)| norm 0.1708 (-0.83z)| lr 5.72e-03 | 1989.46 ms | 69.0% bf16 MFU | 263495 tok/s +step 3615/18794 | loss 3.257518 (-1.06z)| norm 0.2179 (+0.45z)| lr 5.72e-03 | 1982.71 ms | 69.2% bf16 MFU | 263542 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.244565 +step 3616/18794 | loss 3.239558 (-1.52z)| norm 0.2873 (+2.24z)| lr 5.72e-03 | 1979.95 ms | 69.3% bf16 MFU | 263604 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.938208 +step 3617/18794 | loss 3.398130 (+2.56z)| norm 0.3194 (+2.94z)| lr 5.72e-03 | 1990.11 ms | 69.0% bf16 MFU | 263597 tok/s +reducing beta2 to 0.9 and lr/wd by 0.920 due to grad z-score of 3.805261 +step 3618/18794 | loss 3.310508 (+0.30z)| norm 0.3672 (+3.81z)| lr 5.26e-03 | 1985.48 ms | 69.1% bf16 MFU | 263620 tok/s +step 3619/18794 | loss 3.313203 (+0.37z)| norm 0.1994 (-0.12z)| lr 5.72e-03 | 1985.35 ms | 69.1% bf16 MFU | 263643 tok/s +step 3620/18794 | loss 3.316011 (+0.43z)| norm 0.2075 (+0.08z)| lr 5.72e-03 | 1979.71 ms | 69.3% bf16 MFU | 263702 tok/s +step 3621/18794 | loss 3.290858 (-0.24z)| norm 0.2403 (+0.84z)| lr 5.72e-03 | 1981.89 ms | 69.2% bf16 MFU | 263744 tok/s +step 3622/18794 | loss 3.335775 (+0.93z)| norm 0.2558 (+1.19z)| lr 5.72e-03 | 1987.81 ms | 69.0% bf16 MFU | 263744 tok/s +step 3623/18794 | loss 3.357403 (+1.47z)| norm 0.2411 (+0.82z)| lr 5.72e-03 | 2040.32 ms | 67.3% bf16 MFU | 263405 tok/s +step 3624/18794 | loss 3.318653 (+0.45z)| norm 0.1763 (-0.72z)| lr 5.72e-03 | 2035.14 ms | 67.4% bf16 MFU | 263116 tok/s +step 3625/18794 | loss 3.263195 (-0.99z)| norm 0.2035 (-0.08z)| lr 5.72e-03 | 2041.63 ms | 67.2% bf16 MFU | 262800 tok/s +step 3626/18794 | loss 3.293610 (-0.18z)| norm 0.2863 (+1.85z)| lr 5.72e-03 | 2028.79 ms | 67.6% bf16 MFU | 262581 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.077898 +step 3627/18794 | loss 3.317883 (+0.45z)| norm 0.2989 (+2.08z)| lr 5.71e-03 | 2037.45 ms | 67.4% bf16 MFU | 262318 tok/s +step 3628/18794 | loss 3.265041 (-0.91z)| norm 0.2102 (+0.01z)| lr 5.71e-03 | 2033.22 ms | 67.5% bf16 MFU | 262095 tok/s +step 3629/18794 | loss 3.323436 (+0.60z)| norm 0.1593 (-1.18z)| lr 5.71e-03 | 2030.76 ms | 67.6% bf16 MFU | 261899 tok/s +step 3630/18794 | loss 3.330214 (+0.76z)| norm 0.1463 (-1.46z)| lr 5.71e-03 | 2023.16 ms | 67.8% bf16 MFU | 261762 tok/s +step 3631/18794 | loss 3.289450 (-0.32z)| norm 0.1599 (-1.12z)| lr 5.71e-03 | 2026.88 ms | 67.7% bf16 MFU | 261607 tok/s +step 3632/18794 | loss 3.250330 (-1.32z)| norm 0.1656 (-0.98z)| lr 5.71e-03 | 2028.28 ms | 67.7% bf16 MFU | 261451 tok/s +step 3633/18794 | loss 3.260530 (-1.05z)| norm 0.1725 (-0.80z)| lr 5.71e-03 | 2020.07 ms | 67.9% bf16 MFU | 261355 tok/s +step 3634/18794 | loss 3.269594 (-0.80z)| norm 0.1765 (-0.69z)| lr 5.71e-03 | 2019.09 ms | 68.0% bf16 MFU | 261271 tok/s +step 3635/18794 | loss 3.286353 (-0.37z)| norm 0.1786 (-0.64z)| lr 5.71e-03 | 2036.71 ms | 67.4% bf16 MFU | 261078 tok/s +step 3636/18794 | loss 3.310983 (+0.30z)| norm 0.2263 (+0.46z)| lr 5.71e-03 | 2035.13 ms | 67.4% bf16 MFU | 260905 tok/s +step 3637/18794 | loss 3.364254 (+1.68z)| norm 0.2565 (+1.14z)| lr 5.71e-03 | 2025.80 ms | 67.7% bf16 MFU | 260800 tok/s +step 3638/18794 | loss 3.262479 (-0.97z)| norm 0.2408 (+0.76z)| lr 5.71e-03 | 2033.99 ms | 67.5% bf16 MFU | 260648 tok/s +step 3639/18794 | loss 3.328113 (+0.74z)| norm 0.1405 (-1.53z)| lr 5.71e-03 | 2018.76 ms | 68.0% bf16 MFU | 260601 tok/s +step 3640/18794 | loss 3.280414 (-0.52z)| norm 0.1982 (-0.22z)| lr 5.71e-03 | 2010.76 ms | 68.2% bf16 MFU | 260608 tok/s +step 3641/18794 | loss 3.302759 (+0.08z)| norm 0.2613 (+1.21z)| lr 5.71e-03 | 2026.59 ms | 67.7% bf16 MFU | 260513 tok/s +step 3642/18794 | loss 3.279713 (-0.52z)| norm 0.2795 (+1.59z)| lr 5.71e-03 | 2019.08 ms | 68.0% bf16 MFU | 260471 tok/s +step 3643/18794 | loss 3.295015 (-0.11z)| norm 0.2749 (+1.45z)| lr 5.71e-03 | 2022.14 ms | 67.9% bf16 MFU | 260411 tok/s +step 3644/18794 | loss 3.284471 (-0.39z)| norm 0.2023 (-0.18z)| lr 5.71e-03 | 2025.86 ms | 67.7% bf16 MFU | 260330 tok/s +step 3645/18794 | loss 3.280159 (-0.49z)| norm 0.2151 (+0.10z)| lr 5.71e-03 | 2003.63 ms | 68.5% bf16 MFU | 260397 tok/s +step 3646/18794 | loss 3.356539 (+1.55z)| norm 0.1966 (-0.32z)| lr 5.71e-03 | 2012.22 ms | 68.2% bf16 MFU | 260405 tok/s +step 3647/18794 | loss 3.279046 (-0.53z)| norm 0.1997 (-0.26z)| lr 5.71e-03 | 2009.74 ms | 68.3% bf16 MFU | 260428 tok/s +step 3648/18794 | loss 3.325607 (+0.71z)| norm 0.2147 (+0.07z)| lr 5.71e-03 | 2010.40 ms | 68.3% bf16 MFU | 260446 tok/s +step 3649/18794 | loss 3.339644 (+1.07z)| norm 0.2221 (+0.23z)| lr 5.71e-03 | 2012.31 ms | 68.2% bf16 MFU | 260451 tok/s +step 3650/18794 | loss 3.316114 (+0.44z)| norm 0.1915 (-0.48z)| lr 5.71e-03 | 2020.92 ms | 67.9% bf16 MFU | 260400 tok/s +step 3651/18794 | loss 3.324999 (+0.68z)| norm 0.2024 (-0.24z)| lr 5.71e-03 | 2015.51 ms | 68.1% bf16 MFU | 260386 tok/s +step 3652/18794 | loss 3.317678 (+0.47z)| norm 0.1584 (-1.24z)| lr 5.71e-03 | 2018.93 ms | 68.0% bf16 MFU | 260351 tok/s +step 3653/18794 | loss 3.314215 (+0.37z)| norm 0.2299 (+0.39z)| lr 5.71e-03 | 2011.17 ms | 68.2% bf16 MFU | 260368 tok/s +step 3654/18794 | loss 3.308289 (+0.21z)| norm 0.2378 (+0.56z)| lr 5.71e-03 | 2003.80 ms | 68.5% bf16 MFU | 260432 tok/s +step 3655/18794 | loss 3.323301 (+0.61z)| norm 0.2384 (+0.56z)| lr 5.71e-03 | 2013.18 ms | 68.2% bf16 MFU | 260432 tok/s +step 3656/18794 | loss 3.319312 (+0.51z)| norm 0.2041 (-0.24z)| lr 5.71e-03 | 2024.38 ms | 67.8% bf16 MFU | 260360 tok/s +step 3657/18794 | loss 3.275061 (-0.68z)| norm 0.2111 (-0.09z)| lr 5.71e-03 | 2019.79 ms | 67.9% bf16 MFU | 260321 tok/s +step 3658/18794 | loss 3.318328 (+0.48z)| norm 0.1767 (-0.88z)| lr 5.71e-03 | 2008.23 ms | 68.3% bf16 MFU | 260358 tok/s +step 3659/18794 | loss 3.222094 (-2.04z)| norm 0.1461 (-1.56z)| lr 5.71e-03 | 2004.51 ms | 68.5% bf16 MFU | 260418 tok/s +step 3660/18794 | loss 3.277333 (-0.58z)| norm 0.2194 (+0.13z)| lr 5.71e-03 | 2026.75 ms | 67.7% bf16 MFU | 260331 tok/s +step 3661/18794 | loss 3.306185 (+0.19z)| norm 0.2778 (+1.47z)| lr 5.71e-03 | 2020.26 ms | 67.9% bf16 MFU | 260290 tok/s +step 3662/18794 | loss 3.321041 (+0.58z)| norm 0.2101 (-0.08z)| lr 5.71e-03 | 2017.88 ms | 68.0% bf16 MFU | 260267 tok/s +step 3663/18794 | loss 3.305275 (+0.17z)| norm 0.2137 (+0.02z)| lr 5.71e-03 | 2013.25 ms | 68.2% bf16 MFU | 260274 tok/s +step 3664/18794 | loss 3.284562 (-0.40z)| norm 0.2410 (+0.69z)| lr 5.71e-03 | 2004.44 ms | 68.5% bf16 MFU | 260339 tok/s +step 3665/18794 | loss 3.320872 (+0.58z)| norm 0.2008 (-0.25z)| lr 5.71e-03 | 2002.80 ms | 68.5% bf16 MFU | 260411 tok/s +step 3666/18794 | loss 3.274814 (-0.65z)| norm 0.2520 (+0.99z)| lr 5.71e-03 | 2009.61 ms | 68.3% bf16 MFU | 260435 tok/s +step 3667/18794 | loss 3.292523 (-0.18z)| norm 0.2576 (+1.10z)| lr 5.71e-03 | 2006.35 ms | 68.4% bf16 MFU | 260479 tok/s +step 3668/18794 | loss 3.271039 (-0.74z)| norm 0.2084 (-0.09z)| lr 5.71e-03 | 2005.82 ms | 68.4% bf16 MFU | 260524 tok/s +step 3669/18794 | loss 3.307057 (+0.25z)| norm 0.2013 (-0.27z)| lr 5.71e-03 | 2011.22 ms | 68.2% bf16 MFU | 260532 tok/s +step 3670/18794 | loss 3.279639 (-0.49z)| norm 0.2003 (-0.28z)| lr 5.71e-03 | 2015.70 ms | 68.1% bf16 MFU | 260510 tok/s +step 3671/18794 | loss 3.312005 (+0.38z)| norm 0.1857 (-0.62z)| lr 5.71e-03 | 2019.11 ms | 68.0% bf16 MFU | 260468 tok/s +step 3672/18794 | loss 3.310711 (+0.34z)| norm 0.1644 (-1.14z)| lr 5.71e-03 | 1998.39 ms | 68.7% bf16 MFU | 260562 tok/s +step 3673/18794 | loss 3.330704 (+0.89z)| norm 0.1477 (-1.53z)| lr 5.71e-03 | 2005.72 ms | 68.4% bf16 MFU | 260604 tok/s +step 3674/18794 | loss 3.291516 (-0.18z)| norm 0.1792 (-0.75z)| lr 5.70e-03 | 2022.57 ms | 67.9% bf16 MFU | 260535 tok/s +step 3675/18794 | loss 3.288828 (-0.24z)| norm 0.1916 (-0.45z)| lr 5.70e-03 | 2003.68 ms | 68.5% bf16 MFU | 260591 tok/s +step 3676/18794 | loss 3.282409 (-0.40z)| norm 0.2517 (+1.03z)| lr 5.70e-03 | 2018.49 ms | 68.0% bf16 MFU | 260549 tok/s +step 3677/18794 | loss 3.388930 (+2.48z)| norm 0.1934 (-0.39z)| lr 5.70e-03 | 2011.06 ms | 68.2% bf16 MFU | 260556 tok/s +step 3678/18794 | loss 3.304398 (+0.19z)| norm 0.1609 (-1.19z)| lr 5.70e-03 | 1994.50 ms | 68.8% bf16 MFU | 260672 tok/s +step 3679/18794 | loss 3.327480 (+0.81z)| norm 0.1490 (-1.46z)| lr 5.70e-03 | 2011.89 ms | 68.2% bf16 MFU | 260668 tok/s +step 3680/18794 | loss 3.304658 (+0.17z)| norm 0.1465 (-1.49z)| lr 5.70e-03 | 2019.72 ms | 67.9% bf16 MFU | 260614 tok/s +step 3681/18794 | loss 3.345197 (+1.27z)| norm 0.2024 (-0.13z)| lr 5.70e-03 | 2003.70 ms | 68.5% bf16 MFU | 260666 tok/s +step 3682/18794 | loss 3.278016 (-0.62z)| norm 0.1961 (-0.28z)| lr 5.70e-03 | 2004.50 ms | 68.5% bf16 MFU | 260711 tok/s +step 3683/18794 | loss 3.277758 (-0.62z)| norm 0.2076 (-0.01z)| lr 5.70e-03 | 1996.72 ms | 68.7% bf16 MFU | 260804 tok/s +step 3684/18794 | loss 3.321265 (+0.60z)| norm 0.2541 (+1.11z)| lr 5.70e-03 | 2017.49 ms | 68.0% bf16 MFU | 260757 tok/s +step 3685/18794 | loss 3.303857 (+0.11z)| norm 0.2571 (+1.16z)| lr 5.70e-03 | 1996.58 ms | 68.7% bf16 MFU | 260849 tok/s +step 3686/18794 | loss 3.341455 (+1.15z)| norm 0.2474 (+0.91z)| lr 5.70e-03 | 2011.38 ms | 68.2% bf16 MFU | 260840 tok/s +step 3687/18794 | loss 3.276187 (-0.67z)| norm 0.2466 (+0.88z)| lr 5.70e-03 | 1996.60 ms | 68.7% bf16 MFU | 260927 tok/s +step 3688/18794 | loss 3.345358 (+1.25z)| norm 0.2773 (+1.58z)| lr 5.70e-03 | 2012.03 ms | 68.2% bf16 MFU | 260910 tok/s +step 3689/18794 | loss 3.354167 (+1.46z)| norm 0.2108 (-0.03z)| lr 5.70e-03 | 2011.11 ms | 68.2% bf16 MFU | 260899 tok/s +step 3690/18794 | loss 3.336259 (+0.96z)| norm 0.2038 (-0.20z)| lr 5.70e-03 | 2000.57 ms | 68.6% bf16 MFU | 260957 tok/s +step 3691/18794 | loss 3.248744 (-1.45z)| norm 0.2074 (-0.11z)| lr 5.70e-03 | 2013.09 ms | 68.2% bf16 MFU | 260932 tok/s +step 3692/18794 | loss 3.317063 (+0.42z)| norm 0.1977 (-0.35z)| lr 5.70e-03 | 2000.80 ms | 68.6% bf16 MFU | 260987 tok/s +step 3693/18794 | loss 3.287709 (-0.40z)| norm 0.1625 (-1.18z)| lr 5.70e-03 | 2003.75 ms | 68.5% bf16 MFU | 261020 tok/s +step 3694/18794 | loss 3.326021 (+0.65z)| norm 0.1595 (-1.23z)| lr 5.70e-03 | 1989.55 ms | 69.0% bf16 MFU | 261145 tok/s +step 3695/18794 | loss 3.416476 (+2.99z)| norm 0.2518 (+0.96z)| lr 5.70e-03 | 2010.43 ms | 68.3% bf16 MFU | 261127 tok/s +step 3696/18794 | loss 3.337047 (+0.90z)| norm 0.2870 (+1.75z)| lr 5.70e-03 | 2012.37 ms | 68.2% bf16 MFU | 261098 tok/s +step 3697/18794 | loss 3.333249 (+0.79z)| norm 0.2811 (+1.58z)| lr 5.70e-03 | 1998.30 ms | 68.7% bf16 MFU | 261161 tok/s +step 3698/18794 | loss 3.296507 (-0.25z)| norm 0.2008 (-0.28z)| lr 5.70e-03 | 2010.91 ms | 68.2% bf16 MFU | 261139 tok/s +step 3699/18794 | loss 3.314558 (+0.29z)| norm 0.1573 (-1.26z)| lr 5.70e-03 | 1987.30 ms | 69.1% bf16 MFU | 261273 tok/s +step 3700/18794 | loss 3.250421 (-1.52z)| norm 0.1603 (-1.17z)| lr 5.70e-03 | 1998.39 ms | 68.7% bf16 MFU | 261327 tok/s +step 3701/18794 | loss 3.281651 (-0.64z)| norm 0.1896 (-0.49z)| lr 5.70e-03 | 1996.67 ms | 68.7% bf16 MFU | 261390 tok/s +step 3702/18794 | loss 3.263562 (-1.13z)| norm 0.2466 (+0.80z)| lr 5.70e-03 | 2002.29 ms | 68.5% bf16 MFU | 261413 tok/s +step 3703/18794 | loss 3.354532 (+1.45z)| norm 0.2590 (+1.07z)| lr 5.70e-03 | 1995.62 ms | 68.8% bf16 MFU | 261478 tok/s +step 3704/18794 | loss 3.254956 (-1.39z)| norm 0.2276 (+0.34z)| lr 5.70e-03 | 1992.77 ms | 68.9% bf16 MFU | 261559 tok/s +step 3705/18794 | loss 3.295476 (-0.24z)| norm 0.2277 (+0.33z)| lr 5.70e-03 | 1997.22 ms | 68.7% bf16 MFU | 261606 tok/s +step 3706/18794 | loss 3.354030 (+1.42z)| norm 0.2100 (-0.08z)| lr 5.70e-03 | 2011.35 ms | 68.2% bf16 MFU | 261559 tok/s +step 3707/18794 | loss 3.278353 (-0.76z)| norm 0.2370 (+0.54z)| lr 5.70e-03 | 2004.12 ms | 68.5% bf16 MFU | 261561 tok/s +step 3708/18794 | loss 3.281708 (-0.65z)| norm 0.2742 (+1.37z)| lr 5.70e-03 | 2012.47 ms | 68.2% bf16 MFU | 261509 tok/s +step 3709/18794 | loss 3.299797 (-0.13z)| norm 0.2303 (+0.35z)| lr 5.70e-03 | 1999.63 ms | 68.6% bf16 MFU | 261544 tok/s +step 3710/18794 | loss 3.343157 (+1.13z)| norm 0.2268 (+0.29z)| lr 5.70e-03 | 2000.16 ms | 68.6% bf16 MFU | 261573 tok/s +step 3711/18794 | loss 3.321726 (+0.51z)| norm 0.1853 (-0.67z)| lr 5.70e-03 | 1996.42 ms | 68.7% bf16 MFU | 261625 tok/s +step 3712/18794 | loss 3.283720 (-0.62z)| norm 0.1711 (-1.00z)| lr 5.70e-03 | 1990.79 ms | 68.9% bf16 MFU | 261711 tok/s +step 3713/18794 | loss 3.319131 (+0.42z)| norm 0.1901 (-0.57z)| lr 5.70e-03 | 1996.87 ms | 68.7% bf16 MFU | 261753 tok/s +step 3714/18794 | loss 3.357741 (+1.53z)| norm 0.1956 (-0.45z)| lr 5.70e-03 | 2005.12 ms | 68.4% bf16 MFU | 261740 tok/s +step 3715/18794 | loss 3.345301 (+1.14z)| norm 0.1741 (-0.94z)| lr 5.70e-03 | 1997.62 ms | 68.7% bf16 MFU | 261775 tok/s +step 3716/18794 | loss 3.252510 (-1.62z)| norm 0.1854 (-0.66z)| lr 5.70e-03 | 1995.91 ms | 68.8% bf16 MFU | 261821 tok/s +step 3717/18794 | loss 3.269680 (-1.11z)| norm 0.1947 (-0.43z)| lr 5.70e-03 | 1995.31 ms | 68.8% bf16 MFU | 261868 tok/s +step 3718/18794 | loss 3.310892 (+0.16z)| norm 0.2126 (+0.05z)| lr 5.70e-03 | 2016.45 ms | 68.1% bf16 MFU | 261775 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.011218 +step 3719/18794 | loss 3.263858 (-1.26z)| norm 0.2895 (+2.01z)| lr 5.70e-03 | 1990.61 ms | 68.9% bf16 MFU | 261855 tok/s +step 3720/18794 | loss 3.334552 (+0.89z)| norm 0.2152 (+0.09z)| lr 5.69e-03 | 1987.77 ms | 69.0% bf16 MFU | 261950 tok/s +step 3721/18794 | loss 3.319344 (+0.42z)| norm 0.1759 (-0.90z)| lr 5.69e-03 | 1999.63 ms | 68.6% bf16 MFU | 261962 tok/s +step 3722/18794 | loss 3.263603 (-1.25z)| norm 0.2379 (+0.70z)| lr 5.69e-03 | 2002.55 ms | 68.5% bf16 MFU | 261955 tok/s +step 3723/18794 | loss 3.266523 (-1.14z)| norm 0.2038 (-0.17z)| lr 5.69e-03 | 2011.03 ms | 68.2% bf16 MFU | 261892 tok/s +step 3724/18794 | loss 3.324932 (+0.63z)| norm 0.1993 (-0.30z)| lr 5.69e-03 | 2001.72 ms | 68.6% bf16 MFU | 261893 tok/s +step 3725/18794 | loss 3.284798 (-0.60z)| norm 0.1936 (-0.44z)| lr 5.69e-03 | 1996.92 ms | 68.7% bf16 MFU | 261926 tok/s +step 3726/18794 | loss 3.291701 (-0.39z)| norm 0.1668 (-1.12z)| lr 5.69e-03 | 1988.28 ms | 69.0% bf16 MFU | 262014 tok/s +step 3727/18794 | loss 3.302047 (-0.06z)| norm 0.1866 (-0.58z)| lr 5.69e-03 | 1995.28 ms | 68.8% bf16 MFU | 262052 tok/s +step 3728/18794 | loss 3.337893 (+1.02z)| norm 0.1650 (-1.15z)| lr 5.69e-03 | 2016.48 ms | 68.1% bf16 MFU | 261949 tok/s +step 3729/18794 | loss 3.336920 (+0.98z)| norm 0.2144 (+0.17z)| lr 5.69e-03 | 1984.09 ms | 69.2% bf16 MFU | 262064 tok/s +step 3730/18794 | loss 3.313115 (+0.25z)| norm 0.2538 (+1.21z)| lr 5.69e-03 | 1980.03 ms | 69.3% bf16 MFU | 262200 tok/s +step 3731/18794 | loss 3.282614 (-0.68z)| norm 0.2779 (+1.83z)| lr 5.69e-03 | 2008.94 ms | 68.3% bf16 MFU | 262139 tok/s +step 3732/18794 | loss 3.300563 (-0.15z)| norm 0.2471 (+0.97z)| lr 5.69e-03 | 2007.86 ms | 68.3% bf16 MFU | 262088 tok/s +step 3733/18794 | loss 3.321619 (+0.49z)| norm 0.2405 (+0.78z)| lr 5.69e-03 | 1989.37 ms | 69.0% bf16 MFU | 262161 tok/s +reducing beta2 to 0.9 and lr/wd by 0.903 due to grad z-score of 3.877316 +step 3734/18794 | loss 3.294279 (-0.37z)| norm 0.3676 (+3.88z)| lr 5.14e-03 | 2000.19 ms | 68.6% bf16 MFU | 262159 tok/s +step 3735/18794 | loss 3.327984 (+0.68z)| norm 0.2093 (-0.13z)| lr 5.69e-03 | 1980.96 ms | 69.3% bf16 MFU | 262284 tok/s +step 3736/18794 | loss 3.298398 (-0.25z)| norm 0.2797 (+1.62z)| lr 5.69e-03 | 1996.74 ms | 68.7% bf16 MFU | 262298 tok/s +step 3737/18794 | loss 3.269500 (-1.15z)| norm 0.2583 (+1.09z)| lr 5.69e-03 | 1986.36 ms | 69.1% bf16 MFU | 262381 tok/s +step 3738/18794 | loss 3.287208 (-0.59z)| norm 0.1828 (-0.78z)| lr 5.69e-03 | 1993.77 ms | 68.8% bf16 MFU | 262410 tok/s +step 3739/18794 | loss 3.274906 (-0.97z)| norm 0.2616 (+1.16z)| lr 5.69e-03 | 1988.00 ms | 69.0% bf16 MFU | 262476 tok/s +step 3740/18794 | loss 3.266150 (-1.24z)| norm 0.2225 (+0.17z)| lr 5.69e-03 | 1998.18 ms | 68.7% bf16 MFU | 262471 tok/s +step 3741/18794 | loss 3.369933 (+2.01z)| norm 0.1871 (-0.71z)| lr 5.69e-03 | 2000.06 ms | 68.6% bf16 MFU | 262454 tok/s +step 3742/18794 | loss 3.261766 (-1.36z)| norm 0.2029 (-0.29z)| lr 5.69e-03 | 1983.16 ms | 69.2% bf16 MFU | 262550 tok/s +step 3743/18794 | loss 3.230325 (-2.26z)| norm 0.2012 (-0.32z)| lr 5.69e-03 | 1988.54 ms | 69.0% bf16 MFU | 262605 tok/s +step 3744/18794 | loss 3.341718 (+1.09z)| norm 0.1783 (-0.91z)| lr 5.69e-03 | 1996.72 ms | 68.7% bf16 MFU | 262604 tok/s +step 3745/18794 | loss 3.240885 (-1.90z)| norm 0.1832 (-0.77z)| lr 5.69e-03 | 1995.57 ms | 68.8% bf16 MFU | 262610 tok/s +step 3746/18794 | loss 3.320259 (+0.47z)| norm 0.1843 (-0.74z)| lr 5.69e-03 | 1991.23 ms | 68.9% bf16 MFU | 262644 tok/s +step 3747/18794 | loss 3.252625 (-1.54z)| norm 0.1733 (-1.01z)| lr 5.69e-03 | 2002.93 ms | 68.5% bf16 MFU | 262600 tok/s +step 3748/18794 | loss 3.316713 (+0.37z)| norm 0.2698 (+1.44z)| lr 5.69e-03 | 1991.97 ms | 68.9% bf16 MFU | 262630 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.120754 +step 3749/18794 | loss 3.280454 (-0.70z)| norm 0.2993 (+2.12z)| lr 5.69e-03 | 1982.52 ms | 69.2% bf16 MFU | 262721 tok/s +step 3750/18794 | loss 3.325439 (+0.64z)| norm 0.1736 (-0.99z)| lr 5.69e-03 | 1995.20 ms | 68.8% bf16 MFU | 262724 tok/s +val loss 3.309433 +HellaSwag: 2788/10042 = 0.277634: 0/1256 +step 3751/18794 | loss 3.311132 (+0.22z)| norm 0.2016 (-0.30z)| lr 5.69e-03 | 1998.03 ms | 68.7% bf16 MFU | 262708 tok/s +step 3752/18794 | loss 3.319467 (+0.47z)| norm 0.2352 (+0.52z)| lr 5.69e-03 | 1996.10 ms | 68.8% bf16 MFU | 262705 tok/s +step 3753/18794 | loss 3.313395 (+0.29z)| norm 0.2494 (+0.87z)| lr 5.69e-03 | 1995.97 ms | 68.8% bf16 MFU | 262704 tok/s +step 3754/18794 | loss 3.284297 (-0.57z)| norm 0.1820 (-0.79z)| lr 5.69e-03 | 1995.53 ms | 68.8% bf16 MFU | 262705 tok/s +step 3755/18794 | loss 3.293642 (-0.28z)| norm 0.2049 (-0.22z)| lr 5.69e-03 | 1990.60 ms | 68.9% bf16 MFU | 262739 tok/s +step 3756/18794 | loss 3.298242 (-0.14z)| norm 0.2068 (-0.17z)| lr 5.69e-03 | 1987.71 ms | 69.0% bf16 MFU | 262790 tok/s +step 3757/18794 | loss 3.336902 (+0.99z)| norm 0.2020 (-0.29z)| lr 5.69e-03 | 1984.16 ms | 69.2% bf16 MFU | 262862 tok/s +step 3758/18794 | loss 3.303185 (-0.01z)| norm 0.2229 (+0.22z)| lr 5.69e-03 | 1997.75 ms | 68.7% bf16 MFU | 262841 tok/s +step 3759/18794 | loss 3.254420 (-1.50z)| norm 0.2615 (+1.16z)| lr 5.69e-03 | 1988.63 ms | 69.0% bf16 MFU | 262881 tok/s +step 3760/18794 | loss 3.303433 (-0.02z)| norm 0.2366 (+0.53z)| lr 5.69e-03 | 1986.60 ms | 69.1% bf16 MFU | 262933 tok/s +step 3761/18794 | loss 3.232757 (-2.11z)| norm 0.1986 (-0.41z)| lr 5.69e-03 | 1996.55 ms | 68.7% bf16 MFU | 262916 tok/s +step 3762/18794 | loss 3.320037 (+0.50z)| norm 0.2433 (+0.72z)| lr 5.69e-03 | 1994.72 ms | 68.8% bf16 MFU | 262912 tok/s +step 3763/18794 | loss 3.352658 (+1.45z)| norm 0.2142 (-0.02z)| lr 5.69e-03 | 2004.93 ms | 68.4% bf16 MFU | 262842 tok/s +step 3764/18794 | loss 3.319173 (+0.45z)| norm 0.1657 (-1.22z)| lr 5.69e-03 | 1994.95 ms | 68.8% bf16 MFU | 262840 tok/s +step 3765/18794 | loss 3.300921 (-0.09z)| norm 0.1724 (-1.04z)| lr 5.68e-03 | 1981.40 ms | 69.3% bf16 MFU | 262928 tok/s +step 3766/18794 | loss 3.303865 (-0.01z)| norm 0.1397 (-1.81z)| lr 5.68e-03 | 1983.13 ms | 69.2% bf16 MFU | 263000 tok/s +step 3767/18794 | loss 3.324804 (+0.60z)| norm 0.1987 (-0.34z)| lr 5.68e-03 | 1988.40 ms | 69.0% bf16 MFU | 263034 tok/s +step 3768/18794 | loss 3.303970 (-0.03z)| norm 0.2190 (+0.17z)| lr 5.68e-03 | 1987.73 ms | 69.0% bf16 MFU | 263070 tok/s +step 3769/18794 | loss 3.295885 (-0.26z)| norm 0.2070 (-0.14z)| lr 5.68e-03 | 1989.02 ms | 69.0% bf16 MFU | 263096 tok/s +step 3770/18794 | loss 3.351840 (+1.37z)| norm 0.1572 (-1.35z)| lr 5.68e-03 | 1988.02 ms | 69.0% bf16 MFU | 263128 tok/s +step 3771/18794 | loss 3.289873 (-0.45z)| norm 0.1615 (-1.24z)| lr 5.68e-03 | 1989.62 ms | 69.0% bf16 MFU | 263147 tok/s +step 3772/18794 | loss 3.395816 (+2.57z)| norm 0.1976 (-0.36z)| lr 5.68e-03 | 1982.51 ms | 69.2% bf16 MFU | 263213 tok/s +step 3773/18794 | loss 3.398428 (+2.54z)| norm 0.2351 (+0.55z)| lr 5.68e-03 | 1984.62 ms | 69.1% bf16 MFU | 263261 tok/s +step 3774/18794 | loss 3.277059 (-0.82z)| norm 0.2732 (+1.47z)| lr 5.68e-03 | 1991.66 ms | 68.9% bf16 MFU | 263260 tok/s +step 3775/18794 | loss 3.324808 (+0.49z)| norm 0.2538 (+0.97z)| lr 5.68e-03 | 1988.59 ms | 69.0% bf16 MFU | 263279 tok/s +step 3776/18794 | loss 3.266920 (-1.10z)| norm 0.2043 (-0.24z)| lr 5.68e-03 | 1997.16 ms | 68.7% bf16 MFU | 263241 tok/s +step 3777/18794 | loss 3.287657 (-0.51z)| norm 0.2575 (+1.06z)| lr 5.68e-03 | 1980.53 ms | 69.3% bf16 MFU | 263315 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.010476 +step 3778/18794 | loss 3.299853 (-0.17z)| norm 0.2983 (+2.01z)| lr 5.68e-03 | 1982.40 ms | 69.2% bf16 MFU | 263373 tok/s +step 3779/18794 | loss 3.329911 (+0.68z)| norm 0.2125 (-0.10z)| lr 5.68e-03 | 1992.80 ms | 68.9% bf16 MFU | 263359 tok/s +step 3780/18794 | loss 3.247045 (-1.62z)| norm 0.1826 (-0.86z)| lr 5.68e-03 | 1985.90 ms | 69.1% bf16 MFU | 263391 tok/s +step 3781/18794 | loss 3.302735 (-0.06z)| norm 0.1825 (-0.86z)| lr 5.68e-03 | 1983.63 ms | 69.2% bf16 MFU | 263437 tok/s +step 3782/18794 | loss 3.282230 (-0.63z)| norm 0.1932 (-0.59z)| lr 5.68e-03 | 1982.81 ms | 69.2% bf16 MFU | 263486 tok/s +step 3783/18794 | loss 3.350358 (+1.25z)| norm 0.2037 (-0.33z)| lr 5.68e-03 | 1980.00 ms | 69.3% bf16 MFU | 263551 tok/s +step 3784/18794 | loss 3.298473 (-0.19z)| norm 0.1745 (-1.03z)| lr 5.68e-03 | 1980.70 ms | 69.3% bf16 MFU | 263609 tok/s +step 3785/18794 | loss 3.343654 (+1.05z)| norm 0.2006 (-0.37z)| lr 5.68e-03 | 1987.90 ms | 69.0% bf16 MFU | 263615 tok/s +step 3786/18794 | loss 3.316124 (+0.30z)| norm 0.1956 (-0.48z)| lr 5.68e-03 | 1987.95 ms | 69.0% bf16 MFU | 263621 tok/s +step 3787/18794 | loss 3.354650 (+1.34z)| norm 0.2039 (-0.26z)| lr 5.68e-03 | 1987.57 ms | 69.0% bf16 MFU | 263629 tok/s +step 3788/18794 | loss 3.302892 (-0.08z)| norm 0.2075 (-0.16z)| lr 5.68e-03 | 1984.70 ms | 69.1% bf16 MFU | 263656 tok/s +step 3789/18794 | loss 3.315180 (+0.27z)| norm 0.2024 (-0.29z)| lr 5.68e-03 | 1987.79 ms | 69.0% bf16 MFU | 263661 tok/s +step 3790/18794 | loss 3.285309 (-0.55z)| norm 0.1372 (-1.90z)| lr 5.68e-03 | 1981.24 ms | 69.3% bf16 MFU | 263709 tok/s +step 3791/18794 | loss 3.304019 (-0.04z)| norm 0.1446 (-1.68z)| lr 5.68e-03 | 1983.53 ms | 69.2% bf16 MFU | 263740 tok/s +step 3792/18794 | loss 3.287942 (-0.49z)| norm 0.1701 (-1.03z)| lr 5.68e-03 | 1981.24 ms | 69.3% bf16 MFU | 263784 tok/s +step 3793/18794 | loss 3.362617 (+1.60z)| norm 0.1784 (-0.84z)| lr 5.68e-03 | 1988.63 ms | 69.0% bf16 MFU | 263777 tok/s +step 3794/18794 | loss 3.302757 (-0.08z)| norm 0.1982 (-0.36z)| lr 5.68e-03 | 1988.16 ms | 69.0% bf16 MFU | 263773 tok/s +step 3795/18794 | loss 3.294922 (-0.28z)| norm 0.1526 (-1.46z)| lr 5.68e-03 | 1991.70 ms | 68.9% bf16 MFU | 263746 tok/s +step 3796/18794 | loss 3.268709 (-1.04z)| norm 0.1718 (-0.97z)| lr 5.68e-03 | 1987.21 ms | 69.1% bf16 MFU | 263751 tok/s +step 3797/18794 | loss 3.251514 (-1.52z)| norm 0.2217 (+0.30z)| lr 5.68e-03 | 1980.10 ms | 69.3% bf16 MFU | 263802 tok/s +step 3798/18794 | loss 3.274382 (-0.83z)| norm 0.1623 (-1.19z)| lr 5.68e-03 | 1980.90 ms | 69.3% bf16 MFU | 263846 tok/s +step 3799/18794 | loss 3.279137 (-0.68z)| norm 0.2631 (+1.33z)| lr 5.68e-03 | 1988.17 ms | 69.0% bf16 MFU | 263838 tok/s +reducing beta2 to 0.9 and lr/wd by 0.814 due to grad z-score of 4.299570 +step 3800/18794 | loss 3.312548 (+0.28z)| norm 0.4001 (+4.30z)| lr 4.62e-03 | 1984.42 ms | 69.2% bf16 MFU | 263857 tok/s +step 3801/18794 | loss 3.324091 (+0.61z)| norm 0.1951 (-0.41z)| lr 5.68e-03 | 1984.58 ms | 69.1% bf16 MFU | 263873 tok/s +step 3802/18794 | loss 3.310486 (+0.20z)| norm 0.2634 (+1.15z)| lr 5.68e-03 | 1984.54 ms | 69.2% bf16 MFU | 263888 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.476195 +step 3803/18794 | loss 3.356820 (+1.58z)| norm 0.3249 (+2.48z)| lr 5.68e-03 | 1983.21 ms | 69.2% bf16 MFU | 263912 tok/s +step 3804/18794 | loss 3.277202 (-0.81z)| norm 0.2788 (+1.42z)| lr 5.68e-03 | 1981.97 ms | 69.2% bf16 MFU | 263943 tok/s +step 3805/18794 | loss 3.309918 (+0.17z)| norm 0.2071 (-0.16z)| lr 5.68e-03 | 1987.74 ms | 69.0% bf16 MFU | 263934 tok/s +step 3806/18794 | loss 3.331193 (+0.82z)| norm 0.2149 (+0.01z)| lr 5.68e-03 | 1979.70 ms | 69.3% bf16 MFU | 263979 tok/s +step 3807/18794 | loss 3.275010 (-0.88z)| norm 0.2684 (+1.18z)| lr 5.68e-03 | 1979.71 ms | 69.3% bf16 MFU | 264021 tok/s +step 3808/18794 | loss 3.361465 (+1.70z)| norm 0.2323 (+0.40z)| lr 5.68e-03 | 1979.71 ms | 69.3% bf16 MFU | 264062 tok/s +step 3809/18794 | loss 3.283904 (-0.62z)| norm 0.2553 (+0.90z)| lr 5.67e-03 | 1988.35 ms | 69.0% bf16 MFU | 264043 tok/s +step 3810/18794 | loss 3.264257 (-1.19z)| norm 0.3040 (+1.92z)| lr 5.67e-03 | 1983.40 ms | 69.2% bf16 MFU | 264058 tok/s +step 3811/18794 | loss 3.278287 (-0.75z)| norm 0.2894 (+1.57z)| lr 5.67e-03 | 1983.97 ms | 69.2% bf16 MFU | 264068 tok/s +step 3812/18794 | loss 3.231546 (-2.09z)| norm 0.2113 (-0.11z)| lr 5.67e-03 | 1981.88 ms | 69.2% bf16 MFU | 264091 tok/s +step 3813/18794 | loss 3.269269 (-0.97z)| norm 0.1829 (-0.72z)| lr 5.67e-03 | 2010.76 ms | 68.2% bf16 MFU | 263924 tok/s +step 3814/18794 | loss 3.284418 (-0.51z)| norm 0.1527 (-1.35z)| lr 5.67e-03 | 2044.56 ms | 67.1% bf16 MFU | 263549 tok/s +step 3815/18794 | loss 3.267606 (-0.99z)| norm 0.1826 (-0.72z)| lr 5.67e-03 | 2043.51 ms | 67.2% bf16 MFU | 263200 tok/s +step 3816/18794 | loss 3.301456 (+0.00z)| norm 0.2212 (+0.10z)| lr 5.67e-03 | 2036.59 ms | 67.4% bf16 MFU | 262912 tok/s +step 3817/18794 | loss 3.287385 (-0.43z)| norm 0.1763 (-0.86z)| lr 5.67e-03 | 2044.19 ms | 67.1% bf16 MFU | 262590 tok/s +step 3818/18794 | loss 3.284422 (-0.51z)| norm 0.1609 (-1.17z)| lr 5.67e-03 | 2044.23 ms | 67.1% bf16 MFU | 262284 tok/s +step 3819/18794 | loss 3.280916 (-0.62z)| norm 0.1766 (-0.82z)| lr 5.67e-03 | 2042.70 ms | 67.2% bf16 MFU | 262003 tok/s +step 3820/18794 | loss 3.234862 (-1.95z)| norm 0.2341 (+0.41z)| lr 5.67e-03 | 2035.57 ms | 67.4% bf16 MFU | 261781 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.157750 +step 3821/18794 | loss 3.317103 (+0.49z)| norm 0.3193 (+2.16z)| lr 5.67e-03 | 2042.61 ms | 67.2% bf16 MFU | 261526 tok/s +step 3822/18794 | loss 3.294978 (-0.17z)| norm 0.2532 (+0.77z)| lr 5.67e-03 | 2034.50 ms | 67.5% bf16 MFU | 261334 tok/s +step 3823/18794 | loss 3.322954 (+0.65z)| norm 0.1759 (-0.84z)| lr 5.67e-03 | 2042.54 ms | 67.2% bf16 MFU | 261102 tok/s +step 3824/18794 | loss 3.292990 (-0.24z)| norm 0.1829 (-0.69z)| lr 5.67e-03 | 2041.33 ms | 67.2% bf16 MFU | 260889 tok/s +step 3825/18794 | loss 3.264403 (-1.09z)| norm 0.1825 (-0.69z)| lr 5.67e-03 | 2027.44 ms | 67.7% bf16 MFU | 260774 tok/s +step 3826/18794 | loss 3.306761 (+0.17z)| norm 0.1947 (-0.45z)| lr 5.67e-03 | 2026.29 ms | 67.7% bf16 MFU | 260672 tok/s +step 3827/18794 | loss 3.267784 (-0.98z)| norm 0.1799 (-0.75z)| lr 5.67e-03 | 2041.85 ms | 67.2% bf16 MFU | 260477 tok/s +step 3828/18794 | loss 3.301184 (+0.03z)| norm 0.1898 (-0.56z)| lr 5.67e-03 | 2034.27 ms | 67.5% bf16 MFU | 260340 tok/s +step 3829/18794 | loss 3.323259 (+0.69z)| norm 0.1919 (-0.51z)| lr 5.67e-03 | 2034.30 ms | 67.5% bf16 MFU | 260209 tok/s +step 3830/18794 | loss 3.249972 (-1.47z)| norm 0.1940 (-0.45z)| lr 5.67e-03 | 2035.67 ms | 67.4% bf16 MFU | 260076 tok/s +step 3831/18794 | loss 3.315899 (+0.48z)| norm 0.1960 (-0.39z)| lr 5.67e-03 | 2027.71 ms | 67.7% bf16 MFU | 260000 tok/s +step 3832/18794 | loss 3.281173 (-0.55z)| norm 0.2399 (+0.53z)| lr 5.67e-03 | 2034.45 ms | 67.5% bf16 MFU | 259886 tok/s +step 3833/18794 | loss 3.307312 (+0.23z)| norm 0.2477 (+0.70z)| lr 5.67e-03 | 2043.46 ms | 67.2% bf16 MFU | 259720 tok/s +step 3834/18794 | loss 3.288114 (-0.34z)| norm 0.2368 (+0.52z)| lr 5.67e-03 | 2027.23 ms | 67.7% bf16 MFU | 259665 tok/s +step 3835/18794 | loss 3.256776 (-1.24z)| norm 0.1846 (-0.64z)| lr 5.67e-03 | 2018.33 ms | 68.0% bf16 MFU | 259670 tok/s +step 3836/18794 | loss 3.309711 (+0.32z)| norm 0.2302 (+0.39z)| lr 5.67e-03 | 2042.97 ms | 67.2% bf16 MFU | 259518 tok/s +step 3837/18794 | loss 3.226701 (-2.08z)| norm 0.2769 (+1.43z)| lr 5.67e-03 | 2041.80 ms | 67.2% bf16 MFU | 259381 tok/s +step 3838/18794 | loss 3.277632 (-0.60z)| norm 0.2238 (+0.24z)| lr 5.67e-03 | 2034.61 ms | 67.4% bf16 MFU | 259296 tok/s +step 3839/18794 | loss 3.294668 (-0.11z)| norm 0.2448 (+0.71z)| lr 5.67e-03 | 2041.15 ms | 67.2% bf16 MFU | 259174 tok/s +step 3840/18794 | loss 3.286813 (-0.35z)| norm 0.2467 (+0.75z)| lr 5.67e-03 | 2033.48 ms | 67.5% bf16 MFU | 259107 tok/s +step 3841/18794 | loss 3.231893 (-1.92z)| norm 0.1942 (-0.43z)| lr 5.67e-03 | 2042.96 ms | 67.2% bf16 MFU | 258983 tok/s +step 3842/18794 | loss 3.329187 (+0.91z)| norm 0.1943 (-0.43z)| lr 5.67e-03 | 2027.58 ms | 67.7% bf16 MFU | 258963 tok/s +step 3843/18794 | loss 3.315537 (+0.50z)| norm 0.2262 (+0.28z)| lr 5.67e-03 | 2034.47 ms | 67.5% bf16 MFU | 258900 tok/s +step 3844/18794 | loss 3.268887 (-0.88z)| norm 0.2230 (+0.20z)| lr 5.67e-03 | 2027.42 ms | 67.7% bf16 MFU | 258885 tok/s +step 3845/18794 | loss 3.349953 (+1.53z)| norm 0.1744 (-0.89z)| lr 5.67e-03 | 2034.41 ms | 67.5% bf16 MFU | 258826 tok/s +step 3846/18794 | loss 3.266753 (-0.96z)| norm 0.2133 (-0.02z)| lr 5.67e-03 | 2018.24 ms | 68.0% bf16 MFU | 258873 tok/s +step 3847/18794 | loss 3.262752 (-1.09z)| norm 0.2373 (+0.51z)| lr 5.67e-03 | 2034.25 ms | 67.5% bf16 MFU | 258816 tok/s +step 3848/18794 | loss 3.265122 (-1.00z)| norm 0.1866 (-0.62z)| lr 5.67e-03 | 2036.39 ms | 67.4% bf16 MFU | 258749 tok/s +step 3849/18794 | loss 3.366492 (+1.99z)| norm 0.1962 (-0.39z)| lr 5.67e-03 | 2033.72 ms | 67.5% bf16 MFU | 258701 tok/s +step 3850/18794 | loss 3.334992 (+1.05z)| norm 0.2455 (+0.74z)| lr 5.67e-03 | 2012.85 ms | 68.2% bf16 MFU | 258789 tok/s +step 3851/18794 | loss 3.329609 (+0.89z)| norm 0.2166 (+0.06z)| lr 5.67e-03 | 2034.21 ms | 67.5% bf16 MFU | 258737 tok/s +step 3852/18794 | loss 3.305434 (+0.18z)| norm 0.2049 (-0.20z)| lr 5.67e-03 | 2027.41 ms | 67.7% bf16 MFU | 258730 tok/s +step 3853/18794 | loss 3.267534 (-0.92z)| norm 0.1721 (-0.94z)| lr 5.66e-03 | 2035.02 ms | 67.4% bf16 MFU | 258675 tok/s +step 3854/18794 | loss 3.325811 (+0.78z)| norm 0.2077 (-0.13z)| lr 5.66e-03 | 2027.10 ms | 67.7% bf16 MFU | 258673 tok/s +step 3855/18794 | loss 3.276775 (-0.65z)| norm 0.1766 (-0.84z)| lr 5.66e-03 | 2019.24 ms | 68.0% bf16 MFU | 258722 tok/s +step 3856/18794 | loss 3.309553 (+0.30z)| norm 0.1739 (-0.89z)| lr 5.66e-03 | 2018.49 ms | 68.0% bf16 MFU | 258773 tok/s +step 3857/18794 | loss 3.354063 (+1.58z)| norm 0.2139 (+0.03z)| lr 5.66e-03 | 2013.15 ms | 68.2% bf16 MFU | 258856 tok/s +step 3858/18794 | loss 3.258121 (-1.17z)| norm 0.2507 (+0.87z)| lr 5.66e-03 | 2019.33 ms | 68.0% bf16 MFU | 258895 tok/s +step 3859/18794 | loss 3.262182 (-1.06z)| norm 0.2482 (+0.82z)| lr 5.66e-03 | 2042.54 ms | 67.2% bf16 MFU | 258784 tok/s +step 3860/18794 | loss 3.217046 (-2.27z)| norm 0.1715 (-0.93z)| lr 5.66e-03 | 2019.62 ms | 67.9% bf16 MFU | 258825 tok/s +step 3861/18794 | loss 3.266866 (-0.90z)| norm 0.2330 (+0.47z)| lr 5.66e-03 | 2034.58 ms | 67.4% bf16 MFU | 258768 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.299188 +step 3862/18794 | loss 3.270308 (-0.79z)| norm 0.3157 (+2.30z)| lr 5.66e-03 | 2027.16 ms | 67.7% bf16 MFU | 258761 tok/s +step 3863/18794 | loss 3.259525 (-1.07z)| norm 0.2533 (+0.89z)| lr 5.66e-03 | 2042.75 ms | 67.2% bf16 MFU | 258656 tok/s +step 3864/18794 | loss 3.250164 (-1.31z)| norm 0.1876 (-0.59z)| lr 5.66e-03 | 2020.10 ms | 67.9% bf16 MFU | 258700 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.374491 +step 3865/18794 | loss 3.279793 (-0.46z)| norm 0.3238 (+2.37z)| lr 5.66e-03 | 2034.44 ms | 67.5% bf16 MFU | 258650 tok/s +step 3866/18794 | loss 3.301383 (+0.15z)| norm 0.2540 (+0.83z)| lr 5.66e-03 | 2017.63 ms | 68.0% bf16 MFU | 258711 tok/s +step 3867/18794 | loss 3.263946 (-0.89z)| norm 0.1881 (-0.62z)| lr 5.66e-03 | 2027.43 ms | 67.7% bf16 MFU | 258705 tok/s +step 3868/18794 | loss 3.321599 (+0.73z)| norm 0.2660 (+1.08z)| lr 5.66e-03 | 2027.36 ms | 67.7% bf16 MFU | 258700 tok/s +step 3869/18794 | loss 3.274578 (-0.59z)| norm 0.2489 (+0.69z)| lr 5.66e-03 | 2034.10 ms | 67.5% bf16 MFU | 258653 tok/s +step 3870/18794 | loss 3.312164 (+0.49z)| norm 0.1684 (-1.07z)| lr 5.66e-03 | 2026.19 ms | 67.7% bf16 MFU | 258658 tok/s +step 3871/18794 | loss 3.297321 (+0.06z)| norm 0.2187 (+0.02z)| lr 5.66e-03 | 2026.56 ms | 67.7% bf16 MFU | 258660 tok/s +step 3872/18794 | loss 3.308486 (+0.42z)| norm 0.2269 (+0.19z)| lr 5.66e-03 | 2035.19 ms | 67.4% bf16 MFU | 258608 tok/s +step 3873/18794 | loss 3.228531 (-1.97z)| norm 0.2076 (-0.23z)| lr 5.66e-03 | 2035.65 ms | 67.4% bf16 MFU | 258555 tok/s +step 3874/18794 | loss 3.253570 (-1.18z)| norm 0.2179 (+0.01z)| lr 5.66e-03 | 2020.11 ms | 67.9% bf16 MFU | 258604 tok/s +step 3875/18794 | loss 3.329546 (+1.13z)| norm 0.1834 (-0.74z)| lr 5.66e-03 | 2019.74 ms | 67.9% bf16 MFU | 258653 tok/s +step 3876/18794 | loss 3.323323 (+0.93z)| norm 0.1994 (-0.38z)| lr 5.66e-03 | 2028.18 ms | 67.7% bf16 MFU | 258645 tok/s +step 3877/18794 | loss 3.296438 (+0.10z)| norm 0.2400 (+0.53z)| lr 5.66e-03 | 2020.91 ms | 67.9% bf16 MFU | 258685 tok/s +step 3878/18794 | loss 3.281895 (-0.33z)| norm 0.2047 (-0.25z)| lr 5.66e-03 | 2011.06 ms | 68.2% bf16 MFU | 258786 tok/s +step 3879/18794 | loss 3.317754 (+0.77z)| norm 0.2113 (-0.10z)| lr 5.66e-03 | 2034.79 ms | 67.4% bf16 MFU | 258729 tok/s +step 3880/18794 | loss 3.280346 (-0.39z)| norm 0.1952 (-0.46z)| lr 5.66e-03 | 2019.72 ms | 67.9% bf16 MFU | 258772 tok/s +step 3881/18794 | loss 3.215201 (-2.31z)| norm 0.1643 (-1.16z)| lr 5.66e-03 | 2012.14 ms | 68.2% bf16 MFU | 258862 tok/s +step 3882/18794 | loss 3.276183 (-0.48z)| norm 0.1745 (-0.92z)| lr 5.66e-03 | 2018.84 ms | 68.0% bf16 MFU | 258904 tok/s +step 3883/18794 | loss 3.262709 (-0.87z)| norm 0.1862 (-0.65z)| lr 5.66e-03 | 2011.12 ms | 68.2% bf16 MFU | 258993 tok/s +step 3884/18794 | loss 3.306273 (+0.45z)| norm 0.2480 (+0.73z)| lr 5.66e-03 | 2027.31 ms | 67.7% bf16 MFU | 258974 tok/s +step 3885/18794 | loss 3.352316 (+1.84z)| norm 0.2115 (-0.10z)| lr 5.66e-03 | 2018.90 ms | 68.0% bf16 MFU | 259010 tok/s +step 3886/18794 | loss 3.363087 (+2.11z)| norm 0.1748 (-0.92z)| lr 5.66e-03 | 1994.60 ms | 68.8% bf16 MFU | 259202 tok/s +step 3887/18794 | loss 3.341792 (+1.49z)| norm 0.1817 (-0.76z)| lr 5.66e-03 | 2018.64 ms | 68.0% bf16 MFU | 259228 tok/s +step 3888/18794 | loss 3.262689 (-0.85z)| norm 0.1745 (-0.91z)| lr 5.66e-03 | 2020.55 ms | 67.9% bf16 MFU | 259241 tok/s +step 3889/18794 | loss 3.280283 (-0.32z)| norm 0.1960 (-0.43z)| lr 5.66e-03 | 2026.83 ms | 67.7% bf16 MFU | 259212 tok/s +step 3890/18794 | loss 3.300845 (+0.29z)| norm 0.2770 (+1.37z)| lr 5.66e-03 | 2027.27 ms | 67.7% bf16 MFU | 259183 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.346498 +step 3891/18794 | loss 3.308681 (+0.52z)| norm 0.3238 (+2.35z)| lr 5.66e-03 | 2011.96 ms | 68.2% bf16 MFU | 259253 tok/s +step 3892/18794 | loss 3.334076 (+1.25z)| norm 0.2616 (+0.94z)| lr 5.66e-03 | 1995.68 ms | 68.8% bf16 MFU | 259426 tok/s +step 3893/18794 | loss 3.314804 (+0.71z)| norm 0.1749 (-0.98z)| lr 5.66e-03 | 2019.39 ms | 68.0% bf16 MFU | 259436 tok/s +step 3894/18794 | loss 3.278587 (-0.37z)| norm 0.2660 (+1.02z)| lr 5.66e-03 | 2027.40 ms | 67.7% bf16 MFU | 259394 tok/s +step 3895/18794 | loss 3.307994 (+0.51z)| norm 0.2252 (+0.10z)| lr 5.66e-03 | 2026.79 ms | 67.7% bf16 MFU | 259358 tok/s +step 3896/18794 | loss 3.262842 (-0.85z)| norm 0.1864 (-0.77z)| lr 5.65e-03 | 2018.71 ms | 68.0% bf16 MFU | 259376 tok/s +step 3897/18794 | loss 3.327487 (+1.07z)| norm 0.1672 (-1.18z)| lr 5.65e-03 | 2018.83 ms | 68.0% bf16 MFU | 259392 tok/s +step 3898/18794 | loss 3.305907 (+0.42z)| norm 0.2139 (-0.15z)| lr 5.65e-03 | 2005.06 ms | 68.4% bf16 MFU | 259497 tok/s +step 3899/18794 | loss 3.311374 (+0.57z)| norm 0.2581 (+0.84z)| lr 5.65e-03 | 2019.32 ms | 68.0% bf16 MFU | 259504 tok/s +step 3900/18794 | loss 3.226516 (-1.92z)| norm 0.2816 (+1.51z)| lr 5.65e-03 | 2027.74 ms | 67.7% bf16 MFU | 259456 tok/s +step 3901/18794 | loss 3.292454 (+0.04z)| norm 0.2506 (+0.75z)| lr 5.65e-03 | 2003.49 ms | 68.5% bf16 MFU | 259568 tok/s +step 3902/18794 | loss 3.289604 (-0.04z)| norm 0.2442 (+0.60z)| lr 5.65e-03 | 2010.76 ms | 68.2% bf16 MFU | 259627 tok/s +step 3903/18794 | loss 3.252544 (-1.13z)| norm 0.2587 (+0.99z)| lr 5.65e-03 | 1996.07 ms | 68.8% bf16 MFU | 259778 tok/s +step 3904/18794 | loss 3.255319 (-1.03z)| norm 0.2175 (-0.03z)| lr 5.65e-03 | 1996.17 ms | 68.7% bf16 MFU | 259922 tok/s +step 3905/18794 | loss 3.278871 (-0.32z)| norm 0.2033 (-0.39z)| lr 5.65e-03 | 2019.66 ms | 67.9% bf16 MFU | 259905 tok/s +step 3906/18794 | loss 3.269298 (-0.59z)| norm 0.1808 (-0.94z)| lr 5.65e-03 | 2018.98 ms | 68.0% bf16 MFU | 259894 tok/s +step 3907/18794 | loss 3.310910 (+0.65z)| norm 0.1942 (-0.59z)| lr 5.65e-03 | 2004.19 ms | 68.5% bf16 MFU | 259979 tok/s +step 3908/18794 | loss 3.300538 (+0.37z)| norm 0.1931 (-0.61z)| lr 5.65e-03 | 2004.95 ms | 68.4% bf16 MFU | 260055 tok/s +step 3909/18794 | loss 3.244069 (-1.35z)| norm 0.1609 (-1.40z)| lr 5.65e-03 | 2011.10 ms | 68.2% bf16 MFU | 260087 tok/s +step 3910/18794 | loss 3.284206 (-0.13z)| norm 0.1635 (-1.32z)| lr 5.65e-03 | 2001.82 ms | 68.6% bf16 MFU | 260178 tok/s +step 3911/18794 | loss 3.283554 (-0.15z)| norm 0.1908 (-0.60z)| lr 5.65e-03 | 2011.89 ms | 68.2% bf16 MFU | 260199 tok/s +step 3912/18794 | loss 3.303023 (+0.43z)| norm 0.1502 (-1.62z)| lr 5.65e-03 | 2020.04 ms | 67.9% bf16 MFU | 260166 tok/s +step 3913/18794 | loss 3.393919 (+3.07z)| norm 0.1969 (-0.42z)| lr 5.65e-03 | 2034.55 ms | 67.5% bf16 MFU | 260042 tok/s +step 3914/18794 | loss 3.343669 (+1.55z)| norm 0.2381 (+0.63z)| lr 5.65e-03 | 1996.09 ms | 68.8% bf16 MFU | 260173 tok/s +step 3915/18794 | loss 3.341648 (+1.45z)| norm 0.1996 (-0.38z)| lr 5.65e-03 | 2012.00 ms | 68.2% bf16 MFU | 260193 tok/s +step 3916/18794 | loss 3.350116 (+1.66z)| norm 0.1813 (-0.85z)| lr 5.65e-03 | 2011.57 ms | 68.2% bf16 MFU | 260216 tok/s +step 3917/18794 | loss 3.298791 (+0.19z)| norm 0.2065 (-0.20z)| lr 5.65e-03 | 2003.57 ms | 68.5% bf16 MFU | 260289 tok/s +step 3918/18794 | loss 3.324601 (+0.91z)| norm 0.1884 (-0.69z)| lr 5.65e-03 | 2012.18 ms | 68.2% bf16 MFU | 260302 tok/s +step 3919/18794 | loss 3.293011 (+0.00z)| norm 0.1959 (-0.50z)| lr 5.65e-03 | 2011.92 ms | 68.2% bf16 MFU | 260316 tok/s +step 3920/18794 | loss 3.234735 (-1.66z)| norm 0.2489 (+0.91z)| lr 5.65e-03 | 2005.26 ms | 68.4% bf16 MFU | 260373 tok/s +step 3921/18794 | loss 3.229500 (-1.76z)| norm 0.2470 (+0.91z)| lr 5.65e-03 | 1993.52 ms | 68.8% bf16 MFU | 260505 tok/s +step 3922/18794 | loss 3.378611 (+2.35z)| norm 0.2186 (+0.14z)| lr 5.65e-03 | 2004.31 ms | 68.5% bf16 MFU | 260558 tok/s +step 3923/18794 | loss 3.268885 (-0.64z)| norm 0.2140 (-0.00z)| lr 5.65e-03 | 2002.90 ms | 68.5% bf16 MFU | 260619 tok/s +step 3924/18794 | loss 3.220388 (-1.92z)| norm 0.1936 (-0.57z)| lr 5.65e-03 | 2026.90 ms | 67.7% bf16 MFU | 260521 tok/s +step 3925/18794 | loss 3.284779 (-0.19z)| norm 0.1785 (-0.99z)| lr 5.65e-03 | 1996.16 ms | 68.7% bf16 MFU | 260627 tok/s +step 3926/18794 | loss 3.304555 (+0.35z)| norm 0.2231 (+0.24z)| lr 5.65e-03 | 2018.58 ms | 68.0% bf16 MFU | 260583 tok/s +step 3927/18794 | loss 3.274291 (-0.47z)| norm 0.1967 (-0.50z)| lr 5.65e-03 | 2027.47 ms | 67.7% bf16 MFU | 260483 tok/s +step 3928/18794 | loss 3.306435 (+0.39z)| norm 0.1901 (-0.68z)| lr 5.65e-03 | 1985.16 ms | 69.1% bf16 MFU | 260664 tok/s +step 3929/18794 | loss 3.256657 (-0.93z)| norm 0.2028 (-0.33z)| lr 5.65e-03 | 2003.40 ms | 68.5% bf16 MFU | 260716 tok/s +step 3930/18794 | loss 3.309469 (+0.48z)| norm 0.2185 (+0.10z)| lr 5.65e-03 | 2003.29 ms | 68.5% bf16 MFU | 260766 tok/s +step 3931/18794 | loss 3.260117 (-0.84z)| norm 0.2538 (+1.07z)| lr 5.65e-03 | 2003.87 ms | 68.5% bf16 MFU | 260809 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.669381 +step 3932/18794 | loss 3.268110 (-0.62z)| norm 0.3152 (+2.67z)| lr 5.65e-03 | 1988.87 ms | 69.0% bf16 MFU | 260949 tok/s +step 3933/18794 | loss 3.244486 (-1.24z)| norm 0.2470 (+0.83z)| lr 5.65e-03 | 1989.72 ms | 69.0% bf16 MFU | 261077 tok/s +step 3934/18794 | loss 3.291691 (+0.03z)| norm 0.2145 (-0.04z)| lr 5.65e-03 | 1988.70 ms | 69.0% bf16 MFU | 261205 tok/s +step 3935/18794 | loss 3.255739 (-0.93z)| norm 0.2156 (-0.02z)| lr 5.65e-03 | 2019.11 ms | 68.0% bf16 MFU | 261128 tok/s +step 3936/18794 | loss 3.270782 (-0.52z)| norm 0.1982 (-0.48z)| lr 5.65e-03 | 1997.46 ms | 68.7% bf16 MFU | 261195 tok/s +step 3937/18794 | loss 3.282027 (-0.24z)| norm 0.1780 (-1.01z)| lr 5.65e-03 | 1987.01 ms | 69.1% bf16 MFU | 261328 tok/s +step 3938/18794 | loss 3.294746 (+0.11z)| norm 0.2330 (+0.49z)| lr 5.65e-03 | 2019.15 ms | 68.0% bf16 MFU | 261245 tok/s +step 3939/18794 | loss 3.292874 (+0.06z)| norm 0.1920 (-0.62z)| lr 5.64e-03 | 1995.80 ms | 68.8% bf16 MFU | 261317 tok/s +step 3940/18794 | loss 3.315798 (+0.67z)| norm 0.1672 (-1.27z)| lr 5.64e-03 | 2003.68 ms | 68.5% bf16 MFU | 261334 tok/s +step 3941/18794 | loss 3.277876 (-0.38z)| norm 0.1308 (-2.19z)| lr 5.64e-03 | 2002.99 ms | 68.5% bf16 MFU | 261355 tok/s +step 3942/18794 | loss 3.283843 (-0.20z)| norm 0.1667 (-1.22z)| lr 5.64e-03 | 1997.08 ms | 68.7% bf16 MFU | 261414 tok/s +step 3943/18794 | loss 3.302934 (+0.33z)| norm 0.1584 (-1.41z)| lr 5.64e-03 | 2012.14 ms | 68.2% bf16 MFU | 261371 tok/s +step 3944/18794 | loss 3.245555 (-1.25z)| norm 0.2461 (+0.88z)| lr 5.64e-03 | 1996.50 ms | 68.7% bf16 MFU | 261433 tok/s +step 3945/18794 | loss 3.302100 (+0.33z)| norm 0.2779 (+1.67z)| lr 5.64e-03 | 1996.31 ms | 68.7% bf16 MFU | 261493 tok/s +step 3946/18794 | loss 3.290851 (+0.01z)| norm 0.2105 (-0.08z)| lr 5.64e-03 | 1996.64 ms | 68.7% bf16 MFU | 261547 tok/s +step 3947/18794 | loss 3.257251 (-0.93z)| norm 0.1540 (-1.50z)| lr 5.64e-03 | 2002.99 ms | 68.5% bf16 MFU | 261558 tok/s +step 3948/18794 | loss 3.353657 (+1.72z)| norm 0.1984 (-0.37z)| lr 5.64e-03 | 1996.04 ms | 68.8% bf16 MFU | 261613 tok/s +step 3949/18794 | loss 3.285995 (-0.13z)| norm 0.2290 (+0.41z)| lr 5.64e-03 | 1997.56 ms | 68.7% bf16 MFU | 261656 tok/s +step 3950/18794 | loss 3.250073 (-1.12z)| norm 0.2047 (-0.21z)| lr 5.64e-03 | 2011.05 ms | 68.2% bf16 MFU | 261608 tok/s +step 3951/18794 | loss 3.299273 (+0.28z)| norm 0.1690 (-1.10z)| lr 5.64e-03 | 1987.97 ms | 69.0% bf16 MFU | 261714 tok/s +step 3952/18794 | loss 3.256862 (-0.91z)| norm 0.1925 (-0.50z)| lr 5.64e-03 | 2005.70 ms | 68.4% bf16 MFU | 261698 tok/s +step 3953/18794 | loss 3.282712 (-0.18z)| norm 0.2109 (-0.04z)| lr 5.64e-03 | 2012.97 ms | 68.2% bf16 MFU | 261636 tok/s +step 3954/18794 | loss 3.205947 (-2.28z)| norm 0.1920 (-0.52z)| lr 5.64e-03 | 1988.30 ms | 69.0% bf16 MFU | 261739 tok/s +step 3955/18794 | loss 3.311342 (+0.64z)| norm 0.1499 (-1.58z)| lr 5.64e-03 | 1988.73 ms | 69.0% bf16 MFU | 261833 tok/s +step 3956/18794 | loss 3.264883 (-0.64z)| norm 0.1767 (-0.90z)| lr 5.64e-03 | 1996.03 ms | 68.8% bf16 MFU | 261875 tok/s +step 3957/18794 | loss 3.286054 (-0.03z)| norm 0.1917 (-0.51z)| lr 5.64e-03 | 2003.99 ms | 68.5% bf16 MFU | 261862 tok/s +step 3958/18794 | loss 3.294530 (+0.20z)| norm 0.1915 (-0.50z)| lr 5.64e-03 | 2010.88 ms | 68.2% bf16 MFU | 261805 tok/s +step 3959/18794 | loss 3.271001 (-0.47z)| norm 0.2135 (+0.07z)| lr 5.64e-03 | 1988.29 ms | 69.0% bf16 MFU | 261899 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.079440 +step 3960/18794 | loss 3.288854 (+0.02z)| norm 0.2947 (+2.08z)| lr 5.64e-03 | 1995.33 ms | 68.8% bf16 MFU | 261942 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.657079 +step 3961/18794 | loss 3.254553 (-0.97z)| norm 0.3223 (+2.66z)| lr 5.64e-03 | 1994.81 ms | 68.8% bf16 MFU | 261986 tok/s +step 3962/18794 | loss 3.278325 (-0.29z)| norm 0.1955 (-0.41z)| lr 5.64e-03 | 1995.71 ms | 68.8% bf16 MFU | 262023 tok/s +step 3963/18794 | loss 3.273361 (-0.43z)| norm 0.1898 (-0.54z)| lr 5.64e-03 | 1986.80 ms | 69.1% bf16 MFU | 262116 tok/s +step 3964/18794 | loss 3.287967 (-0.02z)| norm 0.1638 (-1.18z)| lr 5.64e-03 | 2018.88 ms | 68.0% bf16 MFU | 261995 tok/s +step 3965/18794 | loss 3.252693 (-1.04z)| norm 0.1775 (-0.83z)| lr 5.64e-03 | 1995.60 ms | 68.8% bf16 MFU | 262031 tok/s +step 3966/18794 | loss 3.283736 (-0.13z)| norm 0.1461 (-1.61z)| lr 5.64e-03 | 1995.80 ms | 68.8% bf16 MFU | 262064 tok/s +step 3967/18794 | loss 3.249077 (-1.13z)| norm 0.1904 (-0.47z)| lr 5.64e-03 | 1986.90 ms | 69.1% bf16 MFU | 262154 tok/s +step 3968/18794 | loss 3.252096 (-1.02z)| norm 0.1726 (-0.91z)| lr 5.64e-03 | 1983.40 ms | 69.2% bf16 MFU | 262264 tok/s +step 3969/18794 | loss 3.309073 (+0.62z)| norm 0.1602 (-1.21z)| lr 5.64e-03 | 1996.68 ms | 68.7% bf16 MFU | 262279 tok/s +step 3970/18794 | loss 3.271042 (-0.47z)| norm 0.2096 (+0.07z)| lr 5.64e-03 | 2004.24 ms | 68.5% bf16 MFU | 262245 tok/s +step 3971/18794 | loss 3.303397 (+0.46z)| norm 0.1826 (-0.63z)| lr 5.64e-03 | 1988.41 ms | 69.0% bf16 MFU | 262316 tok/s +step 3972/18794 | loss 3.251606 (-1.02z)| norm 0.1896 (-0.43z)| lr 5.64e-03 | 1989.20 ms | 69.0% bf16 MFU | 262379 tok/s +step 3973/18794 | loss 3.304555 (+0.49z)| norm 0.2148 (+0.22z)| lr 5.64e-03 | 2011.42 ms | 68.2% bf16 MFU | 262293 tok/s +step 3974/18794 | loss 3.271836 (-0.47z)| norm 0.2525 (+1.19z)| lr 5.64e-03 | 1987.61 ms | 69.0% bf16 MFU | 262367 tok/s +step 3975/18794 | loss 3.275560 (-0.35z)| norm 0.2588 (+1.32z)| lr 5.64e-03 | 1996.04 ms | 68.8% bf16 MFU | 262382 tok/s +step 3976/18794 | loss 3.285923 (-0.03z)| norm 0.2342 (+0.68z)| lr 5.64e-03 | 1986.07 ms | 69.1% bf16 MFU | 262462 tok/s +step 3977/18794 | loss 3.248173 (-1.13z)| norm 0.1965 (-0.28z)| lr 5.64e-03 | 1987.82 ms | 69.0% bf16 MFU | 262526 tok/s +step 3978/18794 | loss 3.254192 (-0.94z)| norm 0.1992 (-0.21z)| lr 5.64e-03 | 1979.56 ms | 69.3% bf16 MFU | 262642 tok/s +step 3979/18794 | loss 3.264760 (-0.61z)| norm 0.1920 (-0.39z)| lr 5.64e-03 | 1987.90 ms | 69.0% bf16 MFU | 262697 tok/s +step 3980/18794 | loss 3.313604 (+0.81z)| norm 0.2032 (-0.10z)| lr 5.64e-03 | 1995.22 ms | 68.8% bf16 MFU | 262701 tok/s +step 3981/18794 | loss 3.271310 (-0.45z)| norm 0.1836 (-0.62z)| lr 5.63e-03 | 1984.83 ms | 69.1% bf16 MFU | 262773 tok/s +step 3982/18794 | loss 3.305381 (+0.56z)| norm 0.2700 (+1.58z)| lr 5.63e-03 | 1983.46 ms | 69.2% bf16 MFU | 262851 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.208511 +step 3983/18794 | loss 3.251159 (-1.06z)| norm 0.3418 (+3.21z)| lr 5.63e-03 | 1986.47 ms | 69.1% bf16 MFU | 262905 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.670013 +step 3984/18794 | loss 3.220119 (-1.93z)| norm 0.3241 (+2.67z)| lr 5.63e-03 | 1990.21 ms | 69.0% bf16 MFU | 262932 tok/s +step 3985/18794 | loss 3.301134 (+0.47z)| norm 0.2351 (+0.57z)| lr 5.63e-03 | 1985.09 ms | 69.1% bf16 MFU | 262991 tok/s +step 3986/18794 | loss 3.285412 (+0.03z)| norm 0.2114 (+0.00z)| lr 5.63e-03 | 1979.16 ms | 69.3% bf16 MFU | 263086 tok/s +step 3987/18794 | loss 3.245183 (-1.19z)| norm 0.2033 (-0.19z)| lr 5.63e-03 | 1995.59 ms | 68.8% bf16 MFU | 263068 tok/s +step 3988/18794 | loss 3.258903 (-0.76z)| norm 0.1778 (-0.80z)| lr 5.63e-03 | 1982.93 ms | 69.2% bf16 MFU | 263135 tok/s +step 3989/18794 | loss 3.221076 (-1.88z)| norm 0.2121 (+0.01z)| lr 5.63e-03 | 1983.83 ms | 69.2% bf16 MFU | 263192 tok/s +step 3990/18794 | loss 3.290311 (+0.23z)| norm 0.2125 (+0.04z)| lr 5.63e-03 | 1982.82 ms | 69.2% bf16 MFU | 263253 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.180088 +step 3991/18794 | loss 3.264780 (-0.54z)| norm 0.3005 (+2.18z)| lr 5.63e-03 | 1986.55 ms | 69.1% bf16 MFU | 263286 tok/s +step 3992/18794 | loss 3.261366 (-0.63z)| norm 0.2354 (+0.61z)| lr 5.63e-03 | 1987.81 ms | 69.0% bf16 MFU | 263310 tok/s +step 3993/18794 | loss 3.238521 (-1.30z)| norm 0.1556 (-1.33z)| lr 5.63e-03 | 1989.46 ms | 69.0% bf16 MFU | 263321 tok/s +step 3994/18794 | loss 3.340235 (+1.78z)| norm 0.2147 (+0.12z)| lr 5.63e-03 | 1987.38 ms | 69.1% bf16 MFU | 263345 tok/s +step 3995/18794 | loss 3.279995 (-0.04z)| norm 0.1786 (-0.75z)| lr 5.63e-03 | 1987.22 ms | 69.1% bf16 MFU | 263370 tok/s +step 3996/18794 | loss 3.326922 (+1.36z)| norm 0.2319 (+0.54z)| lr 5.63e-03 | 1995.21 ms | 68.8% bf16 MFU | 263340 tok/s +step 3997/18794 | loss 3.253809 (-0.83z)| norm 0.2856 (+1.81z)| lr 5.63e-03 | 1993.80 ms | 68.8% bf16 MFU | 263321 tok/s +step 3998/18794 | loss 3.219349 (-1.82z)| norm 0.2291 (+0.43z)| lr 5.63e-03 | 1979.04 ms | 69.3% bf16 MFU | 263401 tok/s +step 3999/18794 | loss 3.230508 (-1.45z)| norm 0.1740 (-0.88z)| lr 5.63e-03 | 1979.05 ms | 69.3% bf16 MFU | 263477 tok/s +step 4000/18794 | loss 3.251266 (-0.85z)| norm 0.2068 (-0.07z)| lr 5.63e-03 | 1979.04 ms | 69.3% bf16 MFU | 263549 tok/s +val loss 3.294871 +HellaSwag: 2819/10042 = 0.280721Swag: 990/1256: 0/1256 +generating: +--- +Writing state to log_gpt3_125M_edu_v4/state_00004000_00001.bin +The name "Columbus" is two different palaeontologists all over the world — we learn that. While they are colloquially called "quulz" and "qui" (Geologicrock Artifacts, Computers and Physical Geology) +, the prehistoric peoples known as the new civilizations called Atlantis were primarily Homo sapiens, pushed to extinction, and were replaced by God. But thanks to humanoid technologies with upgraded + intelligence and advanced technology introduced in the last few hundred years — and time — advanced painting has taken innovation off the ground. +The hypothesis that the Earth is the center of the Milky Way, also called the world gorilla, as in "our" 2003 movie "Hollywood Tonight." ( +--- +Writing checkpoint at step 4000 +Writing model to log_gpt3_125M_edu_v4/model_00004000.bin +Writing state to log_gpt3_125M_edu_v4/state_00004000_00000.bin +Deleting checkpoint at step 1500 +step 4001/18794 | loss 3.271231 (-0.25z)| norm 0.2059 (-0.08z)| lr 5.63e-03 | 1980.56 ms | 69.3% bf16 MFU | 263607 tok/s +step 4002/18794 | loss 3.246632 (-0.97z)| norm 0.1909 (-0.44z)| lr 5.63e-03 | 1989.01 ms | 69.0% bf16 MFU | 263607 tok/s +step 4003/18794 | loss 3.284498 (+0.15z)| norm 0.2057 (-0.06z)| lr 5.63e-03 | 1984.37 ms | 69.2% bf16 MFU | 263637 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.427761 +step 4004/18794 | loss 3.301184 (+0.64z)| norm 0.3090 (+2.43z)| lr 5.63e-03 | 2034.50 ms | 67.5% bf16 MFU | 263340 tok/s +step 4005/18794 | loss 3.242667 (-1.10z)| norm 0.2910 (+1.93z)| lr 5.63e-03 | 2034.11 ms | 67.5% bf16 MFU | 263060 tok/s +step 4006/18794 | loss 3.274721 (-0.14z)| norm 0.2067 (-0.08z)| lr 5.63e-03 | 2043.54 ms | 67.2% bf16 MFU | 262735 tok/s +step 4007/18794 | loss 3.315251 (+1.06z)| norm 0.1950 (-0.36z)| lr 5.63e-03 | 2041.74 ms | 67.2% bf16 MFU | 262438 tok/s +step 4008/18794 | loss 3.266440 (-0.38z)| norm 0.2150 (+0.11z)| lr 5.63e-03 | 2032.48 ms | 67.5% bf16 MFU | 262213 tok/s +step 4009/18794 | loss 3.256328 (-0.69z)| norm 0.2204 (+0.23z)| lr 5.63e-03 | 2038.16 ms | 67.3% bf16 MFU | 261965 tok/s +step 4010/18794 | loss 3.244098 (-1.04z)| norm 0.2135 (+0.05z)| lr 5.63e-03 | 2031.85 ms | 67.5% bf16 MFU | 261768 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.321863 +step 4011/18794 | loss 3.273547 (-0.16z)| norm 0.3111 (+2.32z)| lr 5.63e-03 | 2038.90 ms | 67.3% bf16 MFU | 261537 tok/s +step 4012/18794 | loss 3.253342 (-0.74z)| norm 0.2217 (+0.20z)| lr 5.63e-03 | 2041.62 ms | 67.2% bf16 MFU | 261300 tok/s +step 4013/18794 | loss 3.285504 (+0.26z)| norm 0.1647 (-1.14z)| lr 5.63e-03 | 2037.70 ms | 67.3% bf16 MFU | 261100 tok/s +step 4014/18794 | loss 3.331378 (+1.73z)| norm 0.2454 (+0.76z)| lr 5.63e-03 | 2037.51 ms | 67.4% bf16 MFU | 260911 tok/s +step 4015/18794 | loss 3.266444 (-0.32z)| norm 0.2431 (+0.70z)| lr 5.63e-03 | 2036.65 ms | 67.4% bf16 MFU | 260736 tok/s +step 4016/18794 | loss 3.247933 (-0.92z)| norm 0.1827 (-0.73z)| lr 5.63e-03 | 1986.22 ms | 69.1% bf16 MFU | 260898 tok/s +step 4017/18794 | loss 3.301960 (+0.89z)| norm 0.1804 (-0.77z)| lr 5.63e-03 | 2021.91 ms | 67.9% bf16 MFU | 260818 tok/s +step 4018/18794 | loss 3.293197 (+0.61z)| norm 0.2029 (-0.25z)| lr 5.63e-03 | 2025.61 ms | 67.7% bf16 MFU | 260718 tok/s +step 4019/18794 | loss 3.301717 (+0.90z)| norm 0.2370 (+0.55z)| lr 5.63e-03 | 2039.29 ms | 67.3% bf16 MFU | 260537 tok/s +step 4020/18794 | loss 3.316722 (+1.37z)| norm 0.1662 (-1.10z)| lr 5.63e-03 | 2028.21 ms | 67.7% bf16 MFU | 260435 tok/s +step 4021/18794 | loss 3.266352 (-0.34z)| norm 0.2539 (+0.96z)| lr 5.63e-03 | 2031.57 ms | 67.5% bf16 MFU | 260317 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.336594 +step 4022/18794 | loss 3.300995 (+0.92z)| norm 0.3164 (+2.34z)| lr 5.62e-03 | 2028.83 ms | 67.6% bf16 MFU | 260222 tok/s +step 4023/18794 | loss 3.337747 (+2.18z)| norm 0.2230 (+0.20z)| lr 5.62e-03 | 2033.65 ms | 67.5% bf16 MFU | 260101 tok/s +step 4024/18794 | loss 3.270357 (-0.23z)| norm 0.1888 (-0.58z)| lr 5.62e-03 | 2020.67 ms | 67.9% bf16 MFU | 260069 tok/s +step 4025/18794 | loss 3.279035 (+0.08z)| norm 0.2762 (+1.39z)| lr 5.62e-03 | 2025.39 ms | 67.8% bf16 MFU | 260009 tok/s +step 4026/18794 | loss 3.301851 (+0.91z)| norm 0.2146 (-0.01z)| lr 5.62e-03 | 2014.63 ms | 68.1% bf16 MFU | 260020 tok/s +step 4027/18794 | loss 3.276292 (-0.01z)| norm 0.1810 (-0.77z)| lr 5.62e-03 | 2025.66 ms | 67.7% bf16 MFU | 259961 tok/s +step 4028/18794 | loss 3.245192 (-1.12z)| norm 0.2127 (-0.05z)| lr 5.62e-03 | 2019.11 ms | 68.0% bf16 MFU | 259946 tok/s +step 4029/18794 | loss 3.271141 (-0.18z)| norm 0.1897 (-0.57z)| lr 5.62e-03 | 2025.19 ms | 67.8% bf16 MFU | 259893 tok/s +step 4030/18794 | loss 3.237728 (-1.37z)| norm 0.1489 (-1.46z)| lr 5.62e-03 | 2025.65 ms | 67.7% bf16 MFU | 259839 tok/s +step 4031/18794 | loss 3.273211 (-0.09z)| norm 0.2080 (-0.13z)| lr 5.62e-03 | 2031.54 ms | 67.6% bf16 MFU | 259751 tok/s +step 4032/18794 | loss 3.313244 (+1.33z)| norm 0.2226 (+0.23z)| lr 5.62e-03 | 2027.04 ms | 67.7% bf16 MFU | 259696 tok/s +step 4033/18794 | loss 3.271253 (-0.18z)| norm 0.2388 (+0.60z)| lr 5.62e-03 | 2027.12 ms | 67.7% bf16 MFU | 259643 tok/s +step 4034/18794 | loss 3.290543 (+0.51z)| norm 0.2126 (-0.00z)| lr 5.62e-03 | 2018.54 ms | 68.0% bf16 MFU | 259647 tok/s +step 4035/18794 | loss 3.285900 (+0.33z)| norm 0.2121 (-0.01z)| lr 5.62e-03 | 2031.00 ms | 67.6% bf16 MFU | 259572 tok/s +step 4036/18794 | loss 3.331871 (+1.94z)| norm 0.2290 (+0.37z)| lr 5.62e-03 | 2020.42 ms | 67.9% bf16 MFU | 259568 tok/s +step 4037/18794 | loss 3.238507 (-1.35z)| norm 0.2501 (+0.84z)| lr 5.62e-03 | 2018.70 ms | 68.0% bf16 MFU | 259576 tok/s +step 4038/18794 | loss 3.260682 (-0.56z)| norm 0.2288 (+0.35z)| lr 5.62e-03 | 2034.93 ms | 67.4% bf16 MFU | 259479 tok/s +step 4039/18794 | loss 3.284175 (+0.27z)| norm 0.1807 (-0.76z)| lr 5.62e-03 | 2026.21 ms | 67.7% bf16 MFU | 259443 tok/s +step 4040/18794 | loss 3.327822 (+1.80z)| norm 0.1676 (-1.06z)| lr 5.62e-03 | 2033.03 ms | 67.5% bf16 MFU | 259365 tok/s +step 4041/18794 | loss 3.311305 (+1.20z)| norm 0.1938 (-0.48z)| lr 5.62e-03 | 2016.53 ms | 68.1% bf16 MFU | 259396 tok/s +step 4042/18794 | loss 3.293329 (+0.57z)| norm 0.1847 (-0.70z)| lr 5.62e-03 | 2031.32 ms | 67.6% bf16 MFU | 259332 tok/s +step 4043/18794 | loss 3.239299 (-1.28z)| norm 0.2137 (-0.03z)| lr 5.62e-03 | 2017.25 ms | 68.0% bf16 MFU | 259360 tok/s +step 4044/18794 | loss 3.249386 (-0.93z)| norm 0.1753 (-0.92z)| lr 5.62e-03 | 2007.18 ms | 68.4% bf16 MFU | 259453 tok/s +step 4045/18794 | loss 3.279905 (+0.13z)| norm 0.1961 (-0.42z)| lr 5.62e-03 | 2020.10 ms | 67.9% bf16 MFU | 259457 tok/s +step 4046/18794 | loss 3.275687 (-0.01z)| norm 0.1687 (-1.06z)| lr 5.62e-03 | 2023.16 ms | 67.8% bf16 MFU | 259441 tok/s +step 4047/18794 | loss 3.248158 (-0.96z)| norm 0.1564 (-1.35z)| lr 5.62e-03 | 2016.92 ms | 68.0% bf16 MFU | 259466 tok/s +step 4048/18794 | loss 3.265616 (-0.34z)| norm 0.1667 (-1.09z)| lr 5.62e-03 | 2023.44 ms | 67.8% bf16 MFU | 259448 tok/s +step 4049/18794 | loss 3.356409 (+2.79z)| norm 0.1724 (-0.94z)| lr 5.62e-03 | 2021.76 ms | 67.9% bf16 MFU | 259442 tok/s +step 4050/18794 | loss 3.306129 (+1.03z)| norm 0.1620 (-1.17z)| lr 5.62e-03 | 2019.45 ms | 68.0% bf16 MFU | 259451 tok/s +step 4051/18794 | loss 3.261184 (-0.51z)| norm 0.1871 (-0.58z)| lr 5.62e-03 | 2016.60 ms | 68.1% bf16 MFU | 259478 tok/s +step 4052/18794 | loss 3.233915 (-1.43z)| norm 0.1886 (-0.55z)| lr 5.62e-03 | 2020.26 ms | 67.9% bf16 MFU | 259479 tok/s +step 4053/18794 | loss 3.267286 (-0.28z)| norm 0.1762 (-0.83z)| lr 5.62e-03 | 2025.83 ms | 67.7% bf16 MFU | 259446 tok/s +step 4054/18794 | loss 3.274854 (-0.05z)| norm 0.1643 (-1.10z)| lr 5.62e-03 | 2015.07 ms | 68.1% bf16 MFU | 259482 tok/s +step 4055/18794 | loss 3.302316 (+0.93z)| norm 0.1582 (-1.25z)| lr 5.62e-03 | 2026.20 ms | 67.7% bf16 MFU | 259446 tok/s +step 4056/18794 | loss 3.287322 (+0.39z)| norm 0.1761 (-0.82z)| lr 5.62e-03 | 2012.16 ms | 68.2% bf16 MFU | 259502 tok/s +step 4057/18794 | loss 3.232874 (-1.51z)| norm 0.1703 (-0.95z)| lr 5.62e-03 | 1999.84 ms | 68.6% bf16 MFU | 259635 tok/s +step 4058/18794 | loss 3.335273 (+2.04z)| norm 0.2036 (-0.18z)| lr 5.62e-03 | 2014.47 ms | 68.1% bf16 MFU | 259666 tok/s +step 4059/18794 | loss 3.287772 (+0.39z)| norm 0.2316 (+0.47z)| lr 5.62e-03 | 2015.79 ms | 68.1% bf16 MFU | 259687 tok/s +step 4060/18794 | loss 3.358599 (+2.70z)| norm 0.2454 (+0.82z)| lr 5.62e-03 | 2012.25 ms | 68.2% bf16 MFU | 259731 tok/s +step 4061/18794 | loss 3.274523 (-0.09z)| norm 0.2796 (+1.68z)| lr 5.62e-03 | 2018.93 ms | 68.0% bf16 MFU | 259728 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.944437 +step 4062/18794 | loss 3.267864 (-0.31z)| norm 0.3389 (+2.94z)| lr 5.62e-03 | 2025.85 ms | 67.7% bf16 MFU | 259682 tok/s +step 4063/18794 | loss 3.294597 (+0.57z)| norm 0.2139 (+0.04z)| lr 5.61e-03 | 2017.86 ms | 68.0% bf16 MFU | 259689 tok/s +step 4064/18794 | loss 3.257109 (-0.66z)| norm 0.2233 (+0.25z)| lr 5.61e-03 | 2018.35 ms | 68.0% bf16 MFU | 259693 tok/s +step 4065/18794 | loss 3.295628 (+0.60z)| norm 0.2511 (+0.88z)| lr 5.61e-03 | 2017.71 ms | 68.0% bf16 MFU | 259700 tok/s +step 4066/18794 | loss 3.244490 (-1.08z)| norm 0.2479 (+0.79z)| lr 5.61e-03 | 2034.90 ms | 67.4% bf16 MFU | 259597 tok/s +step 4067/18794 | loss 3.290363 (+0.43z)| norm 0.2380 (+0.54z)| lr 5.61e-03 | 2003.17 ms | 68.5% bf16 MFU | 259704 tok/s +step 4068/18794 | loss 3.312928 (+1.15z)| norm 0.2423 (+0.63z)| lr 5.61e-03 | 2016.49 ms | 68.1% bf16 MFU | 259719 tok/s +step 4069/18794 | loss 3.312485 (+1.13z)| norm 0.2573 (+0.96z)| lr 5.61e-03 | 2017.88 ms | 68.0% bf16 MFU | 259724 tok/s +step 4070/18794 | loss 3.278671 (+0.01z)| norm 0.1988 (-0.42z)| lr 5.61e-03 | 2017.42 ms | 68.0% bf16 MFU | 259732 tok/s +step 4071/18794 | loss 3.248880 (-0.95z)| norm 0.1835 (-0.78z)| lr 5.61e-03 | 2006.26 ms | 68.4% bf16 MFU | 259812 tok/s +step 4072/18794 | loss 3.264816 (-0.43z)| norm 0.1637 (-1.24z)| lr 5.61e-03 | 2019.34 ms | 68.0% bf16 MFU | 259803 tok/s +step 4073/18794 | loss 3.320805 (+1.41z)| norm 0.1888 (-0.64z)| lr 5.61e-03 | 2012.79 ms | 68.2% bf16 MFU | 259836 tok/s +step 4074/18794 | loss 3.286050 (+0.26z)| norm 0.1985 (-0.40z)| lr 5.61e-03 | 2014.43 ms | 68.1% bf16 MFU | 259858 tok/s +step 4075/18794 | loss 3.272621 (-0.18z)| norm 0.1794 (-0.83z)| lr 5.61e-03 | 2016.01 ms | 68.1% bf16 MFU | 259868 tok/s +step 4076/18794 | loss 3.245248 (-1.06z)| norm 0.1712 (-1.01z)| lr 5.61e-03 | 2025.99 ms | 67.7% bf16 MFU | 259814 tok/s +step 4077/18794 | loss 3.302153 (+0.78z)| norm 0.2158 (+0.04z)| lr 5.61e-03 | 2005.45 ms | 68.4% bf16 MFU | 259895 tok/s +step 4078/18794 | loss 3.296206 (+0.57z)| norm 0.2367 (+0.52z)| lr 5.61e-03 | 2009.39 ms | 68.3% bf16 MFU | 259946 tok/s +step 4079/18794 | loss 3.203317 (-2.38z)| norm 0.2137 (-0.03z)| lr 5.61e-03 | 1992.52 ms | 68.9% bf16 MFU | 260105 tok/s +step 4080/18794 | loss 3.254789 (-0.72z)| norm 0.1732 (-0.97z)| lr 5.61e-03 | 2004.01 ms | 68.5% bf16 MFU | 260181 tok/s +step 4081/18794 | loss 3.252466 (-0.79z)| norm 0.2098 (-0.12z)| lr 5.61e-03 | 2020.44 ms | 67.9% bf16 MFU | 260146 tok/s +step 4082/18794 | loss 3.277615 (+0.02z)| norm 0.2309 (+0.39z)| lr 5.61e-03 | 2010.02 ms | 68.3% bf16 MFU | 260181 tok/s +step 4083/18794 | loss 3.251700 (-0.81z)| norm 0.2169 (+0.09z)| lr 5.61e-03 | 2017.34 ms | 68.0% bf16 MFU | 260166 tok/s +step 4084/18794 | loss 3.315462 (+1.21z)| norm 0.1850 (-0.69z)| lr 5.61e-03 | 2020.18 ms | 67.9% bf16 MFU | 260134 tok/s +step 4085/18794 | loss 3.266359 (-0.36z)| norm 0.2042 (-0.19z)| lr 5.61e-03 | 2026.55 ms | 67.7% bf16 MFU | 260063 tok/s +step 4086/18794 | loss 3.302382 (+0.79z)| norm 0.1633 (-1.23z)| lr 5.61e-03 | 2021.57 ms | 67.9% bf16 MFU | 260027 tok/s +step 4087/18794 | loss 3.225075 (-1.68z)| norm 0.1432 (-1.70z)| lr 5.61e-03 | 2008.19 ms | 68.3% bf16 MFU | 260080 tok/s +step 4088/18794 | loss 3.325674 (+1.50z)| norm 0.1712 (-0.99z)| lr 5.61e-03 | 2017.69 ms | 68.0% bf16 MFU | 260068 tok/s +step 4089/18794 | loss 3.323510 (+1.41z)| norm 0.1831 (-0.68z)| lr 5.61e-03 | 2003.43 ms | 68.5% bf16 MFU | 260149 tok/s +step 4090/18794 | loss 3.280177 (+0.03z)| norm 0.1764 (-0.84z)| lr 5.61e-03 | 2002.79 ms | 68.5% bf16 MFU | 260231 tok/s +step 4091/18794 | loss 3.248333 (-0.98z)| norm 0.1708 (-0.97z)| lr 5.61e-03 | 2009.50 ms | 68.3% bf16 MFU | 260264 tok/s +step 4092/18794 | loss 3.280759 (+0.05z)| norm 0.1780 (-0.77z)| lr 5.61e-03 | 2003.72 ms | 68.5% bf16 MFU | 260334 tok/s +step 4093/18794 | loss 3.284390 (+0.15z)| norm 0.1815 (-0.69z)| lr 5.61e-03 | 2010.14 ms | 68.3% bf16 MFU | 260358 tok/s +step 4094/18794 | loss 3.278605 (-0.01z)| norm 0.1547 (-1.35z)| lr 5.61e-03 | 2012.15 ms | 68.2% bf16 MFU | 260369 tok/s +step 4095/18794 | loss 3.300699 (+0.70z)| norm 0.2078 (+0.00z)| lr 5.61e-03 | 2010.69 ms | 68.3% bf16 MFU | 260388 tok/s +step 4096/18794 | loss 3.266290 (-0.41z)| norm 0.1552 (-1.32z)| lr 5.61e-03 | 2017.64 ms | 68.0% bf16 MFU | 260361 tok/s +step 4097/18794 | loss 3.307046 (+0.92z)| norm 0.2508 (+1.14z)| lr 5.61e-03 | 2005.13 ms | 68.4% bf16 MFU | 260416 tok/s +step 4098/18794 | loss 3.275445 (-0.14z)| norm 0.2177 (+0.29z)| lr 5.61e-03 | 2017.35 ms | 68.0% bf16 MFU | 260390 tok/s +step 4099/18794 | loss 3.206692 (-2.41z)| norm 0.2212 (+0.37z)| lr 5.61e-03 | 2003.21 ms | 68.5% bf16 MFU | 260457 tok/s +step 4100/18794 | loss 3.288848 (+0.30z)| norm 0.2513 (+1.13z)| lr 5.61e-03 | 2010.62 ms | 68.3% bf16 MFU | 260472 tok/s +step 4101/18794 | loss 3.309191 (+0.96z)| norm 0.2474 (+1.02z)| lr 5.61e-03 | 2025.49 ms | 67.8% bf16 MFU | 260391 tok/s +step 4102/18794 | loss 3.254076 (-0.87z)| norm 0.2481 (+1.01z)| lr 5.61e-03 | 2004.02 ms | 68.5% bf16 MFU | 260452 tok/s +step 4103/18794 | loss 3.294192 (+0.46z)| norm 0.2473 (+0.98z)| lr 5.61e-03 | 2017.98 ms | 68.0% bf16 MFU | 260420 tok/s +step 4104/18794 | loss 3.297586 (+0.57z)| norm 0.2256 (+0.46z)| lr 5.60e-03 | 2002.59 ms | 68.5% bf16 MFU | 260489 tok/s +step 4105/18794 | loss 3.238388 (-1.39z)| norm 0.1832 (-0.64z)| lr 5.60e-03 | 2014.74 ms | 68.1% bf16 MFU | 260476 tok/s +step 4106/18794 | loss 3.251325 (-0.95z)| norm 0.2833 (+2.00z)| lr 5.60e-03 | 2009.93 ms | 68.3% bf16 MFU | 260495 tok/s +reducing beta2 to 0.9 and lr/wd by 0.889 due to grad z-score of 3.936120 +step 4107/18794 | loss 3.287191 (+0.24z)| norm 0.3718 (+3.94z)| lr 4.98e-03 | 2000.62 ms | 68.6% bf16 MFU | 260573 tok/s +step 4108/18794 | loss 3.328656 (+1.58z)| norm 0.2314 (+0.53z)| lr 5.60e-03 | 2011.51 ms | 68.2% bf16 MFU | 260576 tok/s +step 4109/18794 | loss 3.289387 (+0.28z)| norm 0.1870 (-0.54z)| lr 5.60e-03 | 2003.53 ms | 68.5% bf16 MFU | 260632 tok/s +step 4110/18794 | loss 3.265738 (-0.51z)| norm 0.1784 (-0.74z)| lr 5.60e-03 | 2000.64 ms | 68.6% bf16 MFU | 260703 tok/s +step 4111/18794 | loss 3.301183 (+0.66z)| norm 0.1857 (-0.55z)| lr 5.60e-03 | 1997.37 ms | 68.7% bf16 MFU | 260792 tok/s +step 4112/18794 | loss 3.260946 (-0.68z)| norm 0.2312 (+0.58z)| lr 5.60e-03 | 2018.05 ms | 68.0% bf16 MFU | 260743 tok/s +step 4113/18794 | loss 3.298518 (+0.56z)| norm 0.2313 (+0.57z)| lr 5.60e-03 | 2013.52 ms | 68.2% bf16 MFU | 260725 tok/s +step 4114/18794 | loss 3.305094 (+0.80z)| norm 0.2545 (+1.14z)| lr 5.60e-03 | 2002.29 ms | 68.5% bf16 MFU | 260781 tok/s +step 4115/18794 | loss 3.228140 (-1.74z)| norm 0.2222 (+0.35z)| lr 5.60e-03 | 2010.23 ms | 68.3% bf16 MFU | 260782 tok/s +step 4116/18794 | loss 3.241314 (-1.30z)| norm 0.2357 (+0.67z)| lr 5.60e-03 | 2013.46 ms | 68.2% bf16 MFU | 260763 tok/s +step 4117/18794 | loss 3.271219 (-0.30z)| norm 0.2125 (+0.08z)| lr 5.60e-03 | 2003.48 ms | 68.5% bf16 MFU | 260809 tok/s +step 4118/18794 | loss 3.283489 (+0.10z)| norm 0.1896 (-0.49z)| lr 5.60e-03 | 2001.94 ms | 68.5% bf16 MFU | 260863 tok/s +step 4119/18794 | loss 3.262243 (-0.58z)| norm 0.1884 (-0.51z)| lr 5.60e-03 | 2009.86 ms | 68.3% bf16 MFU | 260863 tok/s +step 4120/18794 | loss 3.276909 (-0.09z)| norm 0.1724 (-0.91z)| lr 5.60e-03 | 1995.70 ms | 68.8% bf16 MFU | 260955 tok/s +step 4121/18794 | loss 3.295999 (+0.54z)| norm 0.1649 (-1.07z)| lr 5.60e-03 | 2002.89 ms | 68.5% bf16 MFU | 260996 tok/s +step 4122/18794 | loss 3.300774 (+0.70z)| norm 0.2420 (+0.91z)| lr 5.60e-03 | 2000.43 ms | 68.6% bf16 MFU | 261050 tok/s +step 4123/18794 | loss 3.222141 (-1.88z)| norm 0.2119 (+0.13z)| lr 5.60e-03 | 2005.53 ms | 68.4% bf16 MFU | 261069 tok/s +step 4124/18794 | loss 3.257734 (-0.69z)| norm 0.1740 (-0.85z)| lr 5.60e-03 | 2007.14 ms | 68.4% bf16 MFU | 261076 tok/s +step 4125/18794 | loss 3.271519 (-0.23z)| norm 0.1898 (-0.42z)| lr 5.60e-03 | 2014.59 ms | 68.1% bf16 MFU | 261034 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.352632 +step 4126/18794 | loss 3.219070 (-1.91z)| norm 0.2987 (+2.35z)| lr 5.60e-03 | 1995.91 ms | 68.8% bf16 MFU | 261117 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.369315 +step 4127/18794 | loss 3.243423 (-1.10z)| norm 0.3031 (+2.37z)| lr 5.60e-03 | 2008.98 ms | 68.3% bf16 MFU | 261109 tok/s +step 4128/18794 | loss 3.313744 (+1.16z)| norm 0.2068 (-0.03z)| lr 5.60e-03 | 2001.96 ms | 68.5% bf16 MFU | 261148 tok/s +step 4129/18794 | loss 3.195599 (-2.56z)| norm 0.1805 (-0.68z)| lr 5.60e-03 | 1995.14 ms | 68.8% bf16 MFU | 261230 tok/s +step 4130/18794 | loss 3.319670 (+1.30z)| norm 0.2527 (+1.10z)| lr 5.60e-03 | 2008.53 ms | 68.3% bf16 MFU | 261220 tok/s +step 4131/18794 | loss 3.260816 (-0.54z)| norm 0.2625 (+1.32z)| lr 5.60e-03 | 2010.00 ms | 68.3% bf16 MFU | 261201 tok/s +step 4132/18794 | loss 3.239724 (-1.17z)| norm 0.2058 (-0.08z)| lr 5.60e-03 | 2002.89 ms | 68.5% bf16 MFU | 261229 tok/s +step 4133/18794 | loss 3.213848 (-1.93z)| norm 0.2362 (+0.67z)| lr 5.60e-03 | 1997.96 ms | 68.7% bf16 MFU | 261288 tok/s +step 4134/18794 | loss 3.281799 (+0.16z)| norm 0.2471 (+0.93z)| lr 5.60e-03 | 1992.01 ms | 68.9% bf16 MFU | 261384 tok/s +step 4135/18794 | loss 3.258651 (-0.54z)| norm 0.1996 (-0.24z)| lr 5.60e-03 | 2002.86 ms | 68.5% bf16 MFU | 261403 tok/s +step 4136/18794 | loss 3.289441 (+0.42z)| norm 0.2080 (-0.03z)| lr 5.60e-03 | 1997.36 ms | 68.7% bf16 MFU | 261457 tok/s +step 4137/18794 | loss 3.300345 (+0.74z)| norm 0.2507 (+1.03z)| lr 5.60e-03 | 2003.14 ms | 68.5% bf16 MFU | 261471 tok/s +step 4138/18794 | loss 3.245228 (-0.97z)| norm 0.2429 (+0.83z)| lr 5.60e-03 | 2005.60 ms | 68.4% bf16 MFU | 261468 tok/s +step 4139/18794 | loss 3.263172 (-0.40z)| norm 0.2099 (+0.01z)| lr 5.60e-03 | 2009.64 ms | 68.3% bf16 MFU | 261439 tok/s +step 4140/18794 | loss 3.285603 (+0.31z)| norm 0.2039 (-0.15z)| lr 5.60e-03 | 1988.49 ms | 69.0% bf16 MFU | 261550 tok/s +step 4141/18794 | loss 3.290877 (+0.49z)| norm 0.1977 (-0.31z)| lr 5.60e-03 | 2000.61 ms | 68.6% bf16 MFU | 261576 tok/s +step 4142/18794 | loss 3.250951 (-0.76z)| norm 0.1904 (-0.49z)| lr 5.60e-03 | 1991.83 ms | 68.9% bf16 MFU | 261658 tok/s +step 4143/18794 | loss 3.269881 (-0.18z)| norm 0.1799 (-0.74z)| lr 5.60e-03 | 1990.17 ms | 69.0% bf16 MFU | 261747 tok/s +step 4144/18794 | loss 3.300788 (+0.79z)| norm 0.1785 (-0.78z)| lr 5.59e-03 | 2004.89 ms | 68.4% bf16 MFU | 261735 tok/s +step 4145/18794 | loss 3.312455 (+1.15z)| norm 0.2036 (-0.16z)| lr 5.59e-03 | 2004.47 ms | 68.5% bf16 MFU | 261726 tok/s +step 4146/18794 | loss 3.230585 (-1.41z)| norm 0.2384 (+0.69z)| lr 5.59e-03 | 2006.28 ms | 68.4% bf16 MFU | 261706 tok/s +step 4147/18794 | loss 3.314266 (+1.18z)| norm 0.2681 (+1.40z)| lr 5.59e-03 | 2002.74 ms | 68.5% bf16 MFU | 261710 tok/s +step 4148/18794 | loss 3.151794 (-3.59z)| norm 0.2513 (+0.97z)| lr 5.59e-03 | 1993.12 ms | 68.9% bf16 MFU | 261777 tok/s +step 4149/18794 | loss 3.310813 (+1.07z)| norm 0.2204 (+0.18z)| lr 5.59e-03 | 2003.30 ms | 68.5% bf16 MFU | 261774 tok/s +step 4150/18794 | loss 3.280494 (+0.18z)| norm 0.2547 (+1.03z)| lr 5.59e-03 | 1987.47 ms | 69.0% bf16 MFU | 261875 tok/s +step 4151/18794 | loss 3.317404 (+1.25z)| norm 0.2251 (+0.27z)| lr 5.59e-03 | 1987.96 ms | 69.0% bf16 MFU | 261968 tok/s +step 4152/18794 | loss 3.223701 (-1.52z)| norm 0.1761 (-0.96z)| lr 5.59e-03 | 2010.72 ms | 68.3% bf16 MFU | 261907 tok/s +step 4153/18794 | loss 3.286628 (+0.34z)| norm 0.2348 (+0.51z)| lr 5.59e-03 | 1996.86 ms | 68.7% bf16 MFU | 261939 tok/s +step 4154/18794 | loss 3.292742 (+0.51z)| norm 0.2290 (+0.35z)| lr 5.59e-03 | 1994.95 ms | 68.8% bf16 MFU | 261983 tok/s +step 4155/18794 | loss 3.258910 (-0.48z)| norm 0.1669 (-1.24z)| lr 5.59e-03 | 1991.88 ms | 68.9% bf16 MFU | 262044 tok/s +step 4156/18794 | loss 3.316332 (+1.21z)| norm 0.2151 (-0.02z)| lr 5.59e-03 | 1994.33 ms | 68.8% bf16 MFU | 262086 tok/s +step 4157/18794 | loss 3.304453 (+0.84z)| norm 0.2077 (-0.22z)| lr 5.59e-03 | 1994.04 ms | 68.8% bf16 MFU | 262128 tok/s +step 4158/18794 | loss 3.274045 (-0.04z)| norm 0.1705 (-1.16z)| lr 5.59e-03 | 1990.67 ms | 68.9% bf16 MFU | 262191 tok/s +step 4159/18794 | loss 3.279283 (+0.12z)| norm 0.1976 (-0.46z)| lr 5.59e-03 | 1998.39 ms | 68.7% bf16 MFU | 262199 tok/s +step 4160/18794 | loss 3.308386 (+1.04z)| norm 0.2013 (-0.35z)| lr 5.59e-03 | 2010.81 ms | 68.2% bf16 MFU | 262126 tok/s +step 4161/18794 | loss 3.223260 (-1.55z)| norm 0.2002 (-0.37z)| lr 5.59e-03 | 1995.60 ms | 68.8% bf16 MFU | 262155 tok/s +step 4162/18794 | loss 3.261044 (-0.40z)| norm 0.1588 (-1.46z)| lr 5.59e-03 | 2000.18 ms | 68.6% bf16 MFU | 262154 tok/s +step 4163/18794 | loss 3.273286 (-0.02z)| norm 0.1567 (-1.49z)| lr 5.59e-03 | 2002.72 ms | 68.5% bf16 MFU | 262135 tok/s +step 4164/18794 | loss 3.258836 (-0.46z)| norm 0.1524 (-1.56z)| lr 5.59e-03 | 2000.25 ms | 68.6% bf16 MFU | 262134 tok/s +step 4165/18794 | loss 3.289847 (+0.49z)| norm 0.1637 (-1.24z)| lr 5.59e-03 | 1995.65 ms | 68.8% bf16 MFU | 262163 tok/s +step 4166/18794 | loss 3.268497 (-0.17z)| norm 0.1771 (-0.87z)| lr 5.59e-03 | 1993.05 ms | 68.9% bf16 MFU | 262208 tok/s +step 4167/18794 | loss 3.271823 (-0.07z)| norm 0.2064 (-0.08z)| lr 5.59e-03 | 1998.27 ms | 68.7% bf16 MFU | 262216 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.268527 +step 4168/18794 | loss 3.254560 (-0.58z)| norm 0.2971 (+2.27z)| lr 5.59e-03 | 1994.93 ms | 68.8% bf16 MFU | 262246 tok/s +reducing beta2 to 0.9 and lr/wd by 0.746 due to grad z-score of 4.690890 +step 4169/18794 | loss 3.293221 (+0.62z)| norm 0.4144 (+4.69z)| lr 4.17e-03 | 2011.68 ms | 68.2% bf16 MFU | 262165 tok/s +step 4170/18794 | loss 3.236752 (-1.11z)| norm 0.2090 (-0.06z)| lr 5.59e-03 | 2000.67 ms | 68.6% bf16 MFU | 262159 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.384481 +step 4171/18794 | loss 3.297602 (+0.75z)| norm 0.3687 (+3.38z)| lr 5.59e-03 | 1986.95 ms | 69.1% bf16 MFU | 262244 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.076768 +step 4172/18794 | loss 3.293963 (+0.63z)| norm 0.3630 (+3.08z)| lr 5.59e-03 | 1996.67 ms | 68.7% bf16 MFU | 262261 tok/s +step 4173/18794 | loss 3.277455 (+0.13z)| norm 0.2019 (-0.29z)| lr 5.59e-03 | 1995.50 ms | 68.8% bf16 MFU | 262285 tok/s +step 4174/18794 | loss 3.300548 (+0.84z)| norm 0.2535 (+0.78z)| lr 5.59e-03 | 1980.06 ms | 69.3% bf16 MFU | 262410 tok/s +step 4175/18794 | loss 3.291200 (+0.55z)| norm 0.1981 (-0.38z)| lr 5.59e-03 | 1995.70 ms | 68.8% bf16 MFU | 262425 tok/s +step 4176/18794 | loss 3.339664 (+1.99z)| norm 0.2188 (+0.04z)| lr 5.59e-03 | 1987.59 ms | 69.0% bf16 MFU | 262493 tok/s +step 4177/18794 | loss 3.263996 (-0.31z)| norm 0.2318 (+0.31z)| lr 5.59e-03 | 1995.63 ms | 68.8% bf16 MFU | 262504 tok/s +step 4178/18794 | loss 3.220090 (-1.61z)| norm 0.1730 (-0.90z)| lr 5.59e-03 | 1979.11 ms | 69.3% bf16 MFU | 262624 tok/s +step 4179/18794 | loss 3.236507 (-1.14z)| norm 0.2001 (-0.34z)| lr 5.59e-03 | 1987.02 ms | 69.1% bf16 MFU | 262686 tok/s +step 4180/18794 | loss 3.326007 (+1.57z)| norm 0.1783 (-0.79z)| lr 5.59e-03 | 1989.93 ms | 69.0% bf16 MFU | 262725 tok/s +step 4181/18794 | loss 3.286599 (+0.36z)| norm 0.2264 (+0.21z)| lr 5.59e-03 | 1986.77 ms | 69.1% bf16 MFU | 262783 tok/s +step 4182/18794 | loss 3.276757 (+0.06z)| norm 0.2355 (+0.40z)| lr 5.59e-03 | 1990.56 ms | 68.9% bf16 MFU | 262813 tok/s +step 4183/18794 | loss 3.293372 (+0.56z)| norm 0.2350 (+0.38z)| lr 5.59e-03 | 1984.82 ms | 69.1% bf16 MFU | 262880 tok/s +step 4184/18794 | loss 3.249160 (-0.77z)| norm 0.2509 (+0.70z)| lr 5.58e-03 | 1985.88 ms | 69.1% bf16 MFU | 262937 tok/s +step 4185/18794 | loss 3.284624 (+0.31z)| norm 0.3067 (+1.81z)| lr 5.58e-03 | 1984.56 ms | 69.1% bf16 MFU | 262999 tok/s +step 4186/18794 | loss 3.326146 (+1.56z)| norm 0.2594 (+0.83z)| lr 5.58e-03 | 1987.11 ms | 69.1% bf16 MFU | 263041 tok/s +step 4187/18794 | loss 3.277694 (+0.07z)| norm 0.2315 (+0.24z)| lr 5.58e-03 | 1981.62 ms | 69.3% bf16 MFU | 263118 tok/s +step 4188/18794 | loss 3.293687 (+0.58z)| norm 0.2261 (+0.11z)| lr 5.58e-03 | 1988.14 ms | 69.0% bf16 MFU | 263147 tok/s +step 4189/18794 | loss 3.296683 (+0.69z)| norm 0.2102 (-0.23z)| lr 5.58e-03 | 1998.20 ms | 68.7% bf16 MFU | 263109 tok/s +step 4190/18794 | loss 3.269403 (-0.16z)| norm 0.1986 (-0.48z)| lr 5.58e-03 | 1980.90 ms | 69.3% bf16 MFU | 263187 tok/s +step 4191/18794 | loss 3.317669 (+1.32z)| norm 0.1770 (-0.93z)| lr 5.58e-03 | 1978.85 ms | 69.3% bf16 MFU | 263275 tok/s +step 4192/18794 | loss 3.319099 (+1.34z)| norm 0.2257 (+0.09z)| lr 5.58e-03 | 1979.20 ms | 69.3% bf16 MFU | 263356 tok/s +step 4193/18794 | loss 3.286583 (+0.33z)| norm 0.2662 (+0.92z)| lr 5.58e-03 | 1986.09 ms | 69.1% bf16 MFU | 263387 tok/s +step 4194/18794 | loss 3.254157 (-0.66z)| norm 0.2162 (-0.15z)| lr 5.58e-03 | 1994.84 ms | 68.8% bf16 MFU | 263359 tok/s +step 4195/18794 | loss 3.272226 (-0.09z)| norm 0.2053 (-0.38z)| lr 5.58e-03 | 2040.25 ms | 67.3% bf16 MFU | 263040 tok/s +step 4196/18794 | loss 3.306021 (+0.93z)| norm 0.1812 (-0.91z)| lr 5.58e-03 | 2037.85 ms | 67.3% bf16 MFU | 262752 tok/s +step 4197/18794 | loss 3.300350 (+0.76z)| norm 0.2611 (+0.81z)| lr 5.58e-03 | 2033.79 ms | 67.5% bf16 MFU | 262503 tok/s +step 4198/18794 | loss 3.266778 (-0.27z)| norm 0.2590 (+0.75z)| lr 5.58e-03 | 2041.21 ms | 67.2% bf16 MFU | 262221 tok/s +step 4199/18794 | loss 3.259750 (-0.51z)| norm 0.1791 (-0.95z)| lr 5.58e-03 | 2038.66 ms | 67.3% bf16 MFU | 261968 tok/s +step 4200/18794 | loss 3.199312 (-2.32z)| norm 0.1740 (-1.04z)| lr 5.58e-03 | 2029.74 ms | 67.6% bf16 MFU | 261785 tok/s +step 4201/18794 | loss 3.237341 (-1.13z)| norm 0.2670 (+0.93z)| lr 5.58e-03 | 2034.10 ms | 67.5% bf16 MFU | 261583 tok/s +step 4202/18794 | loss 3.301934 (+0.83z)| norm 0.2543 (+0.66z)| lr 5.58e-03 | 2031.44 ms | 67.6% bf16 MFU | 261409 tok/s +step 4203/18794 | loss 3.303874 (+0.88z)| norm 0.1684 (-1.13z)| lr 5.58e-03 | 2026.68 ms | 67.7% bf16 MFU | 261273 tok/s +step 4204/18794 | loss 3.265365 (-0.28z)| norm 0.2270 (+0.10z)| lr 5.58e-03 | 2043.14 ms | 67.2% bf16 MFU | 261040 tok/s +step 4205/18794 | loss 3.279393 (+0.13z)| norm 0.2167 (-0.12z)| lr 5.58e-03 | 2041.31 ms | 67.2% bf16 MFU | 260830 tok/s +step 4206/18794 | loss 3.272484 (-0.09z)| norm 0.1581 (-1.34z)| lr 5.58e-03 | 2026.68 ms | 67.7% bf16 MFU | 260723 tok/s +step 4207/18794 | loss 3.180590 (-2.77z)| norm 0.2180 (-0.04z)| lr 5.58e-03 | 2041.16 ms | 67.2% bf16 MFU | 260530 tok/s +step 4208/18794 | loss 3.344621 (+2.06z)| norm 0.2752 (+1.22z)| lr 5.58e-03 | 2038.19 ms | 67.3% bf16 MFU | 260365 tok/s +step 4209/18794 | loss 3.276228 (+0.06z)| norm 0.2214 (+0.02z)| lr 5.58e-03 | 2032.96 ms | 67.5% bf16 MFU | 260241 tok/s +step 4210/18794 | loss 3.228290 (-1.33z)| norm 0.2643 (+0.95z)| lr 5.58e-03 | 2034.76 ms | 67.4% bf16 MFU | 260112 tok/s +step 4211/18794 | loss 3.227034 (-1.34z)| norm 0.2145 (-0.16z)| lr 5.58e-03 | 2036.89 ms | 67.4% bf16 MFU | 259977 tok/s +step 4212/18794 | loss 3.224880 (-1.37z)| norm 0.1869 (-0.76z)| lr 5.58e-03 | 2040.01 ms | 67.3% bf16 MFU | 259828 tok/s +step 4213/18794 | loss 3.233783 (-1.10z)| norm 0.1987 (-0.49z)| lr 5.58e-03 | 2024.75 ms | 67.8% bf16 MFU | 259783 tok/s +step 4214/18794 | loss 3.217055 (-1.54z)| norm 0.2014 (-0.42z)| lr 5.58e-03 | 2032.17 ms | 67.5% bf16 MFU | 259694 tok/s +step 4215/18794 | loss 3.237275 (-0.97z)| norm 0.2395 (+0.42z)| lr 5.58e-03 | 2037.41 ms | 67.4% bf16 MFU | 259576 tok/s +step 4216/18794 | loss 3.281167 (+0.27z)| norm 0.2258 (+0.12z)| lr 5.58e-03 | 2035.00 ms | 67.4% bf16 MFU | 259479 tok/s +step 4217/18794 | loss 3.250604 (-0.60z)| norm 0.1868 (-0.74z)| lr 5.58e-03 | 2033.33 ms | 67.5% bf16 MFU | 259397 tok/s +step 4218/18794 | loss 3.268541 (-0.08z)| norm 0.1804 (-0.88z)| lr 5.58e-03 | 2018.49 ms | 68.0% bf16 MFU | 259414 tok/s +step 4219/18794 | loss 3.251261 (-0.57z)| norm 0.1670 (-1.16z)| lr 5.58e-03 | 2018.17 ms | 68.0% bf16 MFU | 259433 tok/s +step 4220/18794 | loss 3.289176 (+0.51z)| norm 0.2145 (-0.13z)| lr 5.58e-03 | 2032.88 ms | 67.5% bf16 MFU | 259357 tok/s +step 4221/18794 | loss 3.270380 (-0.02z)| norm 0.2066 (-0.31z)| lr 5.58e-03 | 2027.84 ms | 67.7% bf16 MFU | 259316 tok/s +step 4222/18794 | loss 3.227076 (-1.23z)| norm 0.1972 (-0.51z)| lr 5.58e-03 | 2040.78 ms | 67.2% bf16 MFU | 259195 tok/s +step 4223/18794 | loss 3.210361 (-1.69z)| norm 0.2699 (+1.09z)| lr 5.57e-03 | 2034.32 ms | 67.5% bf16 MFU | 259122 tok/s +step 4224/18794 | loss 3.254458 (-0.45z)| norm 0.2475 (+0.58z)| lr 5.57e-03 | 2025.83 ms | 67.7% bf16 MFU | 259106 tok/s +step 4225/18794 | loss 3.239262 (-0.86z)| norm 0.2389 (+0.38z)| lr 5.57e-03 | 2029.09 ms | 67.6% bf16 MFU | 259070 tok/s +step 4226/18794 | loss 3.240273 (-0.85z)| norm 0.2170 (-0.10z)| lr 5.57e-03 | 2025.31 ms | 67.8% bf16 MFU | 259060 tok/s +step 4227/18794 | loss 3.277795 (+0.21z)| norm 0.1765 (-1.00z)| lr 5.57e-03 | 2025.99 ms | 67.7% bf16 MFU | 259046 tok/s +step 4228/18794 | loss 3.223448 (-1.31z)| norm 0.1939 (-0.59z)| lr 5.57e-03 | 2006.29 ms | 68.4% bf16 MFU | 259160 tok/s +step 4229/18794 | loss 3.321197 (+1.45z)| norm 0.2226 (+0.05z)| lr 5.57e-03 | 2025.06 ms | 67.8% bf16 MFU | 259147 tok/s +step 4230/18794 | loss 3.292819 (+0.65z)| norm 0.2491 (+0.66z)| lr 5.57e-03 | 2033.21 ms | 67.5% bf16 MFU | 259082 tok/s +step 4231/18794 | loss 3.264744 (-0.17z)| norm 0.2401 (+0.46z)| lr 5.57e-03 | 2037.49 ms | 67.4% bf16 MFU | 258994 tok/s +step 4232/18794 | loss 3.238275 (-0.94z)| norm 0.2186 (-0.04z)| lr 5.57e-03 | 2020.41 ms | 67.9% bf16 MFU | 259019 tok/s +step 4233/18794 | loss 3.285607 (+0.42z)| norm 0.1972 (-0.52z)| lr 5.57e-03 | 2025.14 ms | 67.8% bf16 MFU | 259013 tok/s +step 4234/18794 | loss 3.276923 (+0.17z)| norm 0.2123 (-0.17z)| lr 5.57e-03 | 2012.01 ms | 68.2% bf16 MFU | 259091 tok/s +step 4235/18794 | loss 3.239525 (-0.92z)| norm 0.2355 (+0.36z)| lr 5.57e-03 | 2021.73 ms | 67.9% bf16 MFU | 259103 tok/s +step 4236/18794 | loss 3.205178 (-1.87z)| norm 0.2604 (+0.92z)| lr 5.57e-03 | 2011.45 ms | 68.2% bf16 MFU | 259180 tok/s +step 4237/18794 | loss 3.290365 (+0.58z)| norm 0.2575 (+0.86z)| lr 5.57e-03 | 2018.22 ms | 68.0% bf16 MFU | 259210 tok/s +step 4238/18794 | loss 3.226555 (-1.24z)| norm 0.2202 (-0.00z)| lr 5.57e-03 | 2032.41 ms | 67.5% bf16 MFU | 259148 tok/s +step 4239/18794 | loss 3.230324 (-1.12z)| norm 0.1704 (-1.13z)| lr 5.57e-03 | 2004.12 ms | 68.5% bf16 MFU | 259271 tok/s +step 4240/18794 | loss 3.208254 (-1.70z)| norm 0.1694 (-1.14z)| lr 5.57e-03 | 2024.31 ms | 67.8% bf16 MFU | 259257 tok/s +step 4241/18794 | loss 3.234118 (-0.96z)| norm 0.2659 (+1.04z)| lr 5.57e-03 | 2023.94 ms | 67.8% bf16 MFU | 259246 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.217159 +step 4242/18794 | loss 3.260274 (-0.23z)| norm 0.3215 (+2.22z)| lr 5.57e-03 | 2020.66 ms | 67.9% bf16 MFU | 259257 tok/s +step 4243/18794 | loss 3.286684 (+0.51z)| norm 0.2884 (+1.45z)| lr 5.57e-03 | 2018.60 ms | 68.0% bf16 MFU | 259281 tok/s +step 4244/18794 | loss 3.279759 (+0.32z)| norm 0.2469 (+0.52z)| lr 5.57e-03 | 2026.36 ms | 67.7% bf16 MFU | 259253 tok/s +step 4245/18794 | loss 3.204115 (-1.76z)| norm 0.1889 (-0.75z)| lr 5.57e-03 | 2030.62 ms | 67.6% bf16 MFU | 259200 tok/s +step 4246/18794 | loss 3.227724 (-1.10z)| norm 0.2342 (+0.25z)| lr 5.57e-03 | 2014.74 ms | 68.1% bf16 MFU | 259252 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.157799 +step 4247/18794 | loss 3.287279 (+0.57z)| norm 0.3233 (+2.16z)| lr 5.57e-03 | 2023.76 ms | 67.8% bf16 MFU | 259242 tok/s +step 4248/18794 | loss 3.218750 (-1.44z)| norm 0.3010 (+1.64z)| lr 5.57e-03 | 2020.41 ms | 67.9% bf16 MFU | 259255 tok/s +step 4249/18794 | loss 3.262620 (-0.13z)| norm 0.1907 (-0.70z)| lr 5.57e-03 | 2030.41 ms | 67.6% bf16 MFU | 259203 tok/s +step 4250/18794 | loss 3.202039 (-1.87z)| norm 0.2276 (+0.09z)| lr 5.57e-03 | 2018.87 ms | 68.0% bf16 MFU | 259228 tok/s +val loss 3.283053 +HellaSwag: 2800/10042 = 0.278829: 0/1256 +step 4251/18794 | loss 3.212576 (-1.53z)| norm 0.2126 (-0.23z)| lr 5.57e-03 | 2030.87 ms | 67.6% bf16 MFU | 259174 tok/s +step 4252/18794 | loss 3.264932 (-0.02z)| norm 0.1772 (-0.99z)| lr 5.57e-03 | 2018.77 ms | 68.0% bf16 MFU | 259201 tok/s +step 4253/18794 | loss 3.278326 (+0.37z)| norm 0.1834 (-0.84z)| lr 5.57e-03 | 2010.75 ms | 68.2% bf16 MFU | 259278 tok/s +step 4254/18794 | loss 3.267379 (+0.06z)| norm 0.1958 (-0.57z)| lr 5.57e-03 | 2026.63 ms | 67.7% bf16 MFU | 259249 tok/s +step 4255/18794 | loss 3.316454 (+1.47z)| norm 0.1729 (-1.06z)| lr 5.57e-03 | 2033.04 ms | 67.5% bf16 MFU | 259181 tok/s +step 4256/18794 | loss 3.259034 (-0.19z)| norm 0.1446 (-1.62z)| lr 5.57e-03 | 2011.38 ms | 68.2% bf16 MFU | 259255 tok/s +step 4257/18794 | loss 3.241529 (-0.69z)| norm 0.1802 (-0.87z)| lr 5.57e-03 | 2026.83 ms | 67.7% bf16 MFU | 259226 tok/s +step 4258/18794 | loss 3.262391 (-0.07z)| norm 0.1608 (-1.27z)| lr 5.57e-03 | 2014.16 ms | 68.1% bf16 MFU | 259279 tok/s +step 4259/18794 | loss 3.267210 (+0.08z)| norm 0.1931 (-0.59z)| lr 5.57e-03 | 2014.53 ms | 68.1% bf16 MFU | 259328 tok/s +step 4260/18794 | loss 3.274834 (+0.32z)| norm 0.2188 (-0.06z)| lr 5.57e-03 | 2004.35 ms | 68.5% bf16 MFU | 259441 tok/s +step 4261/18794 | loss 3.240238 (-0.72z)| norm 0.2623 (+0.83z)| lr 5.56e-03 | 2018.84 ms | 68.0% bf16 MFU | 259453 tok/s +step 4262/18794 | loss 3.202881 (-1.80z)| norm 0.2601 (+0.77z)| lr 5.56e-03 | 2004.27 ms | 68.5% bf16 MFU | 259560 tok/s +step 4263/18794 | loss 3.241854 (-0.64z)| norm 0.2062 (-0.37z)| lr 5.56e-03 | 2023.48 ms | 67.8% bf16 MFU | 259537 tok/s +step 4264/18794 | loss 3.296558 (+0.96z)| norm 0.2074 (-0.36z)| lr 5.56e-03 | 2022.83 ms | 67.8% bf16 MFU | 259519 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.094417 +step 4265/18794 | loss 3.233791 (-0.86z)| norm 0.3252 (+2.09z)| lr 5.56e-03 | 2013.20 ms | 68.2% bf16 MFU | 259565 tok/s +step 4266/18794 | loss 3.202659 (-1.73z)| norm 0.3227 (+1.98z)| lr 5.56e-03 | 2013.95 ms | 68.1% bf16 MFU | 259603 tok/s +step 4267/18794 | loss 3.290192 (+0.79z)| norm 0.1770 (-1.04z)| lr 5.56e-03 | 2010.39 ms | 68.3% bf16 MFU | 259662 tok/s +step 4268/18794 | loss 3.246033 (-0.48z)| norm 0.2482 (+0.45z)| lr 5.56e-03 | 2026.00 ms | 67.7% bf16 MFU | 259618 tok/s +step 4269/18794 | loss 3.194154 (-1.91z)| norm 0.3147 (+1.99z)| lr 5.56e-03 | 2018.19 ms | 68.0% bf16 MFU | 259626 tok/s +step 4270/18794 | loss 3.272145 (+0.29z)| norm 0.3153 (+1.94z)| lr 5.56e-03 | 2008.71 ms | 68.3% bf16 MFU | 259695 tok/s +step 4271/18794 | loss 3.308974 (+1.32z)| norm 0.2257 (+0.01z)| lr 5.56e-03 | 2010.31 ms | 68.3% bf16 MFU | 259751 tok/s +step 4272/18794 | loss 3.251668 (-0.29z)| norm 0.2358 (+0.29z)| lr 5.56e-03 | 2014.87 ms | 68.1% bf16 MFU | 259774 tok/s +step 4273/18794 | loss 3.194189 (-1.86z)| norm 0.2407 (+0.40z)| lr 5.56e-03 | 2019.00 ms | 68.0% bf16 MFU | 259769 tok/s +step 4274/18794 | loss 3.181562 (-2.14z)| norm 0.2173 (-0.16z)| lr 5.56e-03 | 2025.89 ms | 67.7% bf16 MFU | 259720 tok/s +step 4275/18794 | loss 3.242861 (-0.45z)| norm 0.2902 (+1.57z)| lr 5.56e-03 | 2005.72 ms | 68.4% bf16 MFU | 259804 tok/s +step 4276/18794 | loss 3.290913 (+0.90z)| norm 0.3014 (+1.79z)| lr 5.56e-03 | 2010.97 ms | 68.2% bf16 MFU | 259849 tok/s +step 4277/18794 | loss 3.207326 (-1.41z)| norm 0.2119 (-0.32z)| lr 5.56e-03 | 2018.81 ms | 68.0% bf16 MFU | 259842 tok/s +step 4278/18794 | loss 3.264954 (+0.18z)| norm 0.1745 (-1.21z)| lr 5.56e-03 | 2017.67 ms | 68.0% bf16 MFU | 259842 tok/s +step 4279/18794 | loss 3.224129 (-0.96z)| norm 0.2200 (-0.14z)| lr 5.56e-03 | 2013.71 ms | 68.1% bf16 MFU | 259868 tok/s +step 4280/18794 | loss 3.277325 (+0.55z)| norm 0.2341 (+0.19z)| lr 5.56e-03 | 2004.74 ms | 68.5% bf16 MFU | 259951 tok/s +step 4281/18794 | loss 3.269828 (+0.34z)| norm 0.2389 (+0.30z)| lr 5.56e-03 | 2003.07 ms | 68.5% bf16 MFU | 260040 tok/s +step 4282/18794 | loss 3.248398 (-0.26z)| norm 0.1828 (-1.02z)| lr 5.56e-03 | 2002.94 ms | 68.5% bf16 MFU | 260126 tok/s +step 4283/18794 | loss 3.229758 (-0.78z)| norm 0.2351 (+0.22z)| lr 5.56e-03 | 2012.16 ms | 68.2% bf16 MFU | 260148 tok/s +step 4284/18794 | loss 3.244071 (-0.37z)| norm 0.2520 (+0.62z)| lr 5.56e-03 | 2007.46 ms | 68.4% bf16 MFU | 260199 tok/s +step 4285/18794 | loss 3.226511 (-0.85z)| norm 0.1566 (-1.62z)| lr 5.56e-03 | 2007.11 ms | 68.4% bf16 MFU | 260250 tok/s +step 4286/18794 | loss 3.234085 (-0.62z)| norm 0.2720 (+1.13z)| lr 5.56e-03 | 2003.03 ms | 68.5% bf16 MFU | 260325 tok/s +step 4287/18794 | loss 3.249875 (-0.15z)| norm 0.2760 (+1.21z)| lr 5.56e-03 | 2009.45 ms | 68.3% bf16 MFU | 260354 tok/s +step 4288/18794 | loss 3.260993 (+0.18z)| norm 0.1892 (-0.83z)| lr 5.56e-03 | 2016.84 ms | 68.0% bf16 MFU | 260334 tok/s +step 4289/18794 | loss 3.253775 (-0.02z)| norm 0.2382 (+0.31z)| lr 5.56e-03 | 2016.61 ms | 68.1% bf16 MFU | 260317 tok/s +step 4290/18794 | loss 3.258701 (+0.13z)| norm 0.2613 (+0.84z)| lr 5.56e-03 | 2017.48 ms | 68.0% bf16 MFU | 260295 tok/s +step 4291/18794 | loss 3.245896 (-0.23z)| norm 0.2399 (+0.32z)| lr 5.56e-03 | 1996.07 ms | 68.8% bf16 MFU | 260413 tok/s +step 4292/18794 | loss 3.285788 (+0.99z)| norm 0.1958 (-0.71z)| lr 5.56e-03 | 2009.14 ms | 68.3% bf16 MFU | 260440 tok/s +step 4293/18794 | loss 3.223565 (-0.89z)| norm 0.2067 (-0.44z)| lr 5.56e-03 | 2004.83 ms | 68.5% bf16 MFU | 260493 tok/s +step 4294/18794 | loss 3.237619 (-0.45z)| norm 0.2403 (+0.35z)| lr 5.56e-03 | 2010.68 ms | 68.3% bf16 MFU | 260506 tok/s +step 4295/18794 | loss 3.252177 (-0.00z)| norm 0.1967 (-0.68z)| lr 5.56e-03 | 1999.21 ms | 68.6% bf16 MFU | 260593 tok/s +step 4296/18794 | loss 3.219093 (-0.99z)| norm 0.1953 (-0.72z)| lr 5.56e-03 | 2010.64 ms | 68.3% bf16 MFU | 260601 tok/s +step 4297/18794 | loss 3.240502 (-0.32z)| norm 0.2750 (+1.17z)| lr 5.56e-03 | 2011.11 ms | 68.2% bf16 MFU | 260606 tok/s +step 4298/18794 | loss 3.250813 (+0.01z)| norm 0.2925 (+1.56z)| lr 5.56e-03 | 1992.99 ms | 68.9% bf16 MFU | 260729 tok/s +step 4299/18794 | loss 3.228076 (-0.69z)| norm 0.2634 (+0.86z)| lr 5.56e-03 | 2013.14 ms | 68.2% bf16 MFU | 260714 tok/s +step 4300/18794 | loss 3.277745 (+0.84z)| norm 0.2181 (-0.22z)| lr 5.55e-03 | 2006.32 ms | 68.4% bf16 MFU | 260745 tok/s +step 4301/18794 | loss 3.215637 (-1.10z)| norm 0.2339 (+0.16z)| lr 5.55e-03 | 2011.64 ms | 68.2% bf16 MFU | 260739 tok/s +step 4302/18794 | loss 3.285471 (+1.10z)| norm 0.2595 (+0.77z)| lr 5.55e-03 | 2002.29 ms | 68.5% bf16 MFU | 260794 tok/s +step 4303/18794 | loss 3.225828 (-0.77z)| norm 0.2340 (+0.15z)| lr 5.55e-03 | 1992.41 ms | 68.9% bf16 MFU | 260911 tok/s +step 4304/18794 | loss 3.276809 (+0.85z)| norm 0.2331 (+0.13z)| lr 5.55e-03 | 2014.38 ms | 68.1% bf16 MFU | 260879 tok/s +step 4305/18794 | loss 3.213017 (-1.15z)| norm 0.1947 (-0.79z)| lr 5.55e-03 | 2007.06 ms | 68.4% bf16 MFU | 260897 tok/s +step 4306/18794 | loss 3.278201 (+0.91z)| norm 0.2263 (-0.05z)| lr 5.55e-03 | 2006.58 ms | 68.4% bf16 MFU | 260916 tok/s +step 4307/18794 | loss 3.256143 (+0.19z)| norm 0.1851 (-1.03z)| lr 5.55e-03 | 2005.30 ms | 68.4% bf16 MFU | 260943 tok/s +step 4308/18794 | loss 3.261778 (+0.43z)| norm 0.1779 (-1.18z)| lr 5.55e-03 | 2001.01 ms | 68.6% bf16 MFU | 260996 tok/s +step 4309/18794 | loss 3.203033 (-1.54z)| norm 0.1729 (-1.28z)| lr 5.55e-03 | 1997.24 ms | 68.7% bf16 MFU | 261072 tok/s +step 4310/18794 | loss 3.254724 (+0.20z)| norm 0.1804 (-1.08z)| lr 5.55e-03 | 2000.21 ms | 68.6% bf16 MFU | 261124 tok/s +step 4311/18794 | loss 3.176016 (-2.39z)| norm 0.1783 (-1.11z)| lr 5.55e-03 | 1994.28 ms | 68.8% bf16 MFU | 261212 tok/s +step 4312/18794 | loss 3.249310 (+0.02z)| norm 0.2718 (+1.08z)| lr 5.55e-03 | 2002.50 ms | 68.5% bf16 MFU | 261243 tok/s +reducing beta2 to 0.9 and lr/wd by 0.750 due to grad z-score of 4.665746 +step 4313/18794 | loss 3.278575 (+0.97z)| norm 0.4511 (+4.67z)| lr 4.16e-03 | 1992.73 ms | 68.9% bf16 MFU | 261336 tok/s +step 4314/18794 | loss 3.259620 (+0.34z)| norm 0.1547 (-1.53z)| lr 5.55e-03 | 2019.37 ms | 68.0% bf16 MFU | 261250 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.242538 +step 4315/18794 | loss 3.204919 (-1.45z)| norm 0.3949 (+3.24z)| lr 5.55e-03 | 2013.25 ms | 68.2% bf16 MFU | 261209 tok/s +step 4316/18794 | loss 3.327363 (+2.49z)| norm 0.2660 (+0.70z)| lr 5.55e-03 | 1993.99 ms | 68.8% bf16 MFU | 261295 tok/s +step 4317/18794 | loss 3.257115 (+0.24z)| norm 0.1967 (-0.66z)| lr 5.55e-03 | 2003.76 ms | 68.5% bf16 MFU | 261313 tok/s +step 4318/18794 | loss 3.259023 (+0.30z)| norm 0.2691 (+0.75z)| lr 5.55e-03 | 2004.02 ms | 68.5% bf16 MFU | 261328 tok/s +step 4319/18794 | loss 3.242591 (-0.22z)| norm 0.1911 (-0.79z)| lr 5.55e-03 | 2010.76 ms | 68.2% bf16 MFU | 261299 tok/s +step 4320/18794 | loss 3.226312 (-0.72z)| norm 0.2184 (-0.26z)| lr 5.55e-03 | 1990.26 ms | 69.0% bf16 MFU | 261405 tok/s +step 4321/18794 | loss 3.238132 (-0.33z)| norm 0.2328 (+0.02z)| lr 5.55e-03 | 2003.21 ms | 68.5% bf16 MFU | 261421 tok/s +step 4322/18794 | loss 3.231956 (-0.54z)| norm 0.2399 (+0.16z)| lr 5.55e-03 | 1995.80 ms | 68.8% bf16 MFU | 261485 tok/s +step 4323/18794 | loss 3.305477 (+1.79z)| norm 0.2635 (+0.63z)| lr 5.55e-03 | 1992.77 ms | 68.9% bf16 MFU | 261565 tok/s +step 4324/18794 | loss 3.173972 (-2.32z)| norm 0.2793 (+0.93z)| lr 5.55e-03 | 1992.18 ms | 68.9% bf16 MFU | 261646 tok/s +step 4325/18794 | loss 3.254025 (+0.16z)| norm 0.2357 (+0.07z)| lr 5.55e-03 | 1993.80 ms | 68.8% bf16 MFU | 261711 tok/s +step 4326/18794 | loss 3.169816 (-2.37z)| norm 0.2143 (-0.35z)| lr 5.55e-03 | 1994.03 ms | 68.8% bf16 MFU | 261772 tok/s +step 4327/18794 | loss 3.174930 (-2.14z)| norm 0.2156 (-0.34z)| lr 5.55e-03 | 2001.09 ms | 68.6% bf16 MFU | 261784 tok/s +step 4328/18794 | loss 3.285174 (+1.11z)| norm 0.2031 (-0.59z)| lr 5.55e-03 | 2002.80 ms | 68.5% bf16 MFU | 261783 tok/s +step 4329/18794 | loss 3.244439 (-0.08z)| norm 0.2045 (-0.56z)| lr 5.55e-03 | 1994.89 ms | 68.8% bf16 MFU | 261835 tok/s +step 4330/18794 | loss 3.245448 (-0.03z)| norm 0.1859 (-0.91z)| lr 5.55e-03 | 2011.83 ms | 68.2% bf16 MFU | 261773 tok/s +step 4331/18794 | loss 3.212187 (-1.03z)| norm 0.1732 (-1.14z)| lr 5.55e-03 | 2009.46 ms | 68.3% bf16 MFU | 261730 tok/s +step 4332/18794 | loss 3.239035 (-0.21z)| norm 0.2383 (+0.14z)| lr 5.55e-03 | 1990.20 ms | 69.0% bf16 MFU | 261815 tok/s +step 4333/18794 | loss 3.233888 (-0.35z)| norm 0.2051 (-0.52z)| lr 5.55e-03 | 1991.19 ms | 68.9% bf16 MFU | 261890 tok/s +step 4334/18794 | loss 3.227311 (-0.54z)| norm 0.1789 (-1.02z)| lr 5.55e-03 | 1997.66 ms | 68.7% bf16 MFU | 261918 tok/s +step 4335/18794 | loss 3.283113 (+1.15z)| norm 0.2213 (-0.19z)| lr 5.55e-03 | 1997.84 ms | 68.7% bf16 MFU | 261943 tok/s +step 4336/18794 | loss 3.258884 (+0.40z)| norm 0.1939 (-0.71z)| lr 5.55e-03 | 2000.80 ms | 68.6% bf16 MFU | 261948 tok/s +step 4337/18794 | loss 3.212275 (-1.02z)| norm 0.2453 (+0.30z)| lr 5.55e-03 | 1995.73 ms | 68.8% bf16 MFU | 261986 tok/s +step 4338/18794 | loss 3.188886 (-1.70z)| norm 0.2692 (+0.75z)| lr 5.54e-03 | 1979.85 ms | 69.3% bf16 MFU | 262127 tok/s +step 4339/18794 | loss 3.241774 (-0.10z)| norm 0.2432 (+0.23z)| lr 5.54e-03 | 1996.09 ms | 68.8% bf16 MFU | 262154 tok/s +step 4340/18794 | loss 3.273757 (+0.86z)| norm 0.1890 (-0.84z)| lr 5.54e-03 | 1995.18 ms | 68.8% bf16 MFU | 262185 tok/s +step 4341/18794 | loss 3.261914 (+0.49z)| norm 0.2365 (+0.10z)| lr 5.54e-03 | 1992.44 ms | 68.9% bf16 MFU | 262233 tok/s +step 4342/18794 | loss 3.258310 (+0.38z)| norm 0.2350 (+0.09z)| lr 5.54e-03 | 2000.21 ms | 68.6% bf16 MFU | 262227 tok/s +step 4343/18794 | loss 3.212408 (-1.00z)| norm 0.2101 (-0.40z)| lr 5.54e-03 | 1996.25 ms | 68.7% bf16 MFU | 262247 tok/s +step 4344/18794 | loss 3.196464 (-1.46z)| norm 0.1803 (-0.98z)| lr 5.54e-03 | 1989.78 ms | 69.0% bf16 MFU | 262309 tok/s +step 4345/18794 | loss 3.261925 (+0.53z)| norm 0.1857 (-0.87z)| lr 5.54e-03 | 1999.94 ms | 68.6% bf16 MFU | 262302 tok/s +step 4346/18794 | loss 3.250669 (+0.17z)| norm 0.1749 (-1.07z)| lr 5.54e-03 | 1994.74 ms | 68.8% bf16 MFU | 262328 tok/s +step 4347/18794 | loss 3.274282 (+0.91z)| norm 0.1910 (-0.74z)| lr 5.54e-03 | 1988.75 ms | 69.0% bf16 MFU | 262393 tok/s +step 4348/18794 | loss 3.213732 (-0.96z)| norm 0.2165 (-0.20z)| lr 5.54e-03 | 1999.64 ms | 68.6% bf16 MFU | 262383 tok/s +step 4349/18794 | loss 3.250448 (+0.18z)| norm 0.1961 (-0.62z)| lr 5.54e-03 | 1998.07 ms | 68.7% bf16 MFU | 262384 tok/s +step 4350/18794 | loss 3.202824 (-1.29z)| norm 0.2247 (-0.03z)| lr 5.54e-03 | 2001.50 ms | 68.6% bf16 MFU | 262362 tok/s +step 4351/18794 | loss 3.226699 (-0.56z)| norm 0.2148 (-0.24z)| lr 5.54e-03 | 1989.39 ms | 69.0% bf16 MFU | 262421 tok/s +step 4352/18794 | loss 3.301602 (+1.72z)| norm 0.2064 (-0.42z)| lr 5.54e-03 | 1990.38 ms | 68.9% bf16 MFU | 262470 tok/s +step 4353/18794 | loss 3.223890 (-0.64z)| norm 0.1767 (-1.03z)| lr 5.54e-03 | 2010.74 ms | 68.2% bf16 MFU | 262384 tok/s +step 4354/18794 | loss 3.251673 (+0.22z)| norm 0.1814 (-0.93z)| lr 5.54e-03 | 1991.25 ms | 68.9% bf16 MFU | 262430 tok/s +step 4355/18794 | loss 3.187148 (-1.74z)| norm 0.2304 (+0.07z)| lr 5.54e-03 | 2001.39 ms | 68.6% bf16 MFU | 262406 tok/s +step 4356/18794 | loss 3.212117 (-0.95z)| norm 0.2025 (-0.53z)| lr 5.54e-03 | 1986.95 ms | 69.1% bf16 MFU | 262479 tok/s +step 4357/18794 | loss 3.232961 (-0.30z)| norm 0.1828 (-0.94z)| lr 5.54e-03 | 1993.76 ms | 68.8% bf16 MFU | 262504 tok/s +step 4358/18794 | loss 3.194263 (-1.46z)| norm 0.2304 (+0.05z)| lr 5.54e-03 | 1988.18 ms | 69.0% bf16 MFU | 262563 tok/s +step 4359/18794 | loss 3.185393 (-1.69z)| norm 0.1914 (-0.78z)| lr 5.54e-03 | 1994.74 ms | 68.8% bf16 MFU | 262577 tok/s +step 4360/18794 | loss 3.281255 (+1.21z)| norm 0.1738 (-1.14z)| lr 5.54e-03 | 1986.81 ms | 69.1% bf16 MFU | 262642 tok/s +step 4361/18794 | loss 3.230031 (-0.34z)| norm 0.1982 (-0.61z)| lr 5.54e-03 | 1988.80 ms | 69.0% bf16 MFU | 262691 tok/s +step 4362/18794 | loss 3.201429 (-1.20z)| norm 0.2262 (-0.01z)| lr 5.54e-03 | 1988.94 ms | 69.0% bf16 MFU | 262737 tok/s +step 4363/18794 | loss 3.253995 (+0.38z)| norm 0.2070 (-0.42z)| lr 5.54e-03 | 1993.99 ms | 68.8% bf16 MFU | 262747 tok/s +step 4364/18794 | loss 3.230488 (-0.31z)| norm 0.2708 (+0.92z)| lr 5.54e-03 | 1986.26 ms | 69.1% bf16 MFU | 262807 tok/s +step 4365/18794 | loss 3.240257 (-0.01z)| norm 0.2971 (+1.49z)| lr 5.54e-03 | 1981.33 ms | 69.3% bf16 MFU | 262897 tok/s +step 4366/18794 | loss 3.276289 (+1.07z)| norm 0.2701 (+0.94z)| lr 5.54e-03 | 1990.49 ms | 68.9% bf16 MFU | 262922 tok/s +step 4367/18794 | loss 3.185141 (-1.69z)| norm 0.2710 (+0.94z)| lr 5.54e-03 | 1986.94 ms | 69.1% bf16 MFU | 262970 tok/s +step 4368/18794 | loss 3.208939 (-0.94z)| norm 0.2783 (+1.09z)| lr 5.54e-03 | 1984.25 ms | 69.2% bf16 MFU | 263032 tok/s +step 4369/18794 | loss 3.292628 (+1.57z)| norm 0.2868 (+1.29z)| lr 5.54e-03 | 1987.06 ms | 69.1% bf16 MFU | 263073 tok/s +step 4370/18794 | loss 3.204883 (-1.07z)| norm 0.2350 (+0.18z)| lr 5.54e-03 | 1986.60 ms | 69.1% bf16 MFU | 263115 tok/s +step 4371/18794 | loss 3.200639 (-1.19z)| norm 0.1706 (-1.23z)| lr 5.54e-03 | 1989.42 ms | 69.0% bf16 MFU | 263136 tok/s +step 4372/18794 | loss 3.180022 (-1.77z)| norm 0.2344 (+0.18z)| lr 5.54e-03 | 1983.33 ms | 69.2% bf16 MFU | 263197 tok/s +step 4373/18794 | loss 3.255199 (+0.49z)| norm 0.2895 (+1.37z)| lr 5.54e-03 | 1987.37 ms | 69.1% bf16 MFU | 263228 tok/s +step 4374/18794 | loss 3.276916 (+1.13z)| norm 0.2407 (+0.30z)| lr 5.54e-03 | 1983.72 ms | 69.2% bf16 MFU | 263281 tok/s +step 4375/18794 | loss 3.239524 (-0.02z)| norm 0.2057 (-0.45z)| lr 5.53e-03 | 1984.08 ms | 69.2% bf16 MFU | 263329 tok/s +step 4376/18794 | loss 3.256110 (+0.51z)| norm 0.2431 (+0.39z)| lr 5.53e-03 | 1998.13 ms | 68.7% bf16 MFU | 263282 tok/s +step 4377/18794 | loss 3.268047 (+0.87z)| norm 0.1981 (-0.61z)| lr 5.53e-03 | 1981.41 ms | 69.3% bf16 MFU | 263348 tok/s +step 4378/18794 | loss 3.266516 (+0.82z)| norm 0.2036 (-0.50z)| lr 5.53e-03 | 1991.89 ms | 68.9% bf16 MFU | 263341 tok/s +step 4379/18794 | loss 3.230929 (-0.30z)| norm 0.1875 (-0.85z)| lr 5.53e-03 | 1979.60 ms | 69.3% bf16 MFU | 263417 tok/s +step 4380/18794 | loss 3.233494 (-0.20z)| norm 0.2662 (+0.90z)| lr 5.53e-03 | 1980.27 ms | 69.3% bf16 MFU | 263484 tok/s +step 4381/18794 | loss 3.227954 (-0.36z)| norm 0.2954 (+1.52z)| lr 5.53e-03 | 1984.27 ms | 69.2% bf16 MFU | 263521 tok/s +step 4382/18794 | loss 3.236188 (-0.10z)| norm 0.2507 (+0.53z)| lr 5.53e-03 | 1980.90 ms | 69.3% bf16 MFU | 263578 tok/s +step 4383/18794 | loss 3.255744 (+0.51z)| norm 0.2105 (-0.36z)| lr 5.53e-03 | 1980.14 ms | 69.3% bf16 MFU | 263638 tok/s +step 4384/18794 | loss 3.295980 (+1.73z)| norm 0.1708 (-1.21z)| lr 5.53e-03 | 1979.97 ms | 69.3% bf16 MFU | 263696 tok/s +step 4385/18794 | loss 3.174933 (-1.97z)| norm 0.2084 (-0.40z)| lr 5.53e-03 | 2011.57 ms | 68.2% bf16 MFU | 263543 tok/s +step 4386/18794 | loss 3.233242 (-0.20z)| norm 0.2631 (+0.82z)| lr 5.53e-03 | 2038.22 ms | 67.3% bf16 MFU | 263227 tok/s +step 4387/18794 | loss 3.239991 (+0.01z)| norm 0.2897 (+1.40z)| lr 5.53e-03 | 2037.26 ms | 67.4% bf16 MFU | 262933 tok/s +step 4388/18794 | loss 3.281030 (+1.25z)| norm 0.2500 (+0.51z)| lr 5.53e-03 | 2033.79 ms | 67.5% bf16 MFU | 262676 tok/s +step 4389/18794 | loss 3.310969 (+2.09z)| norm 0.2188 (-0.18z)| lr 5.53e-03 | 2045.83 ms | 67.1% bf16 MFU | 262356 tok/s +step 4390/18794 | loss 3.252746 (+0.37z)| norm 0.2208 (-0.13z)| lr 5.53e-03 | 2037.83 ms | 67.3% bf16 MFU | 262102 tok/s +step 4391/18794 | loss 3.303102 (+1.82z)| norm 0.2346 (+0.18z)| lr 5.53e-03 | 2033.82 ms | 67.5% bf16 MFU | 261886 tok/s +step 4392/18794 | loss 3.211139 (-0.85z)| norm 0.2689 (+0.93z)| lr 5.53e-03 | 2041.73 ms | 67.2% bf16 MFU | 261631 tok/s +step 4393/18794 | loss 3.276285 (+1.04z)| norm 0.3075 (+1.74z)| lr 5.53e-03 | 2039.22 ms | 67.3% bf16 MFU | 261405 tok/s +step 4394/18794 | loss 3.266327 (+0.74z)| norm 0.2689 (+0.88z)| lr 5.53e-03 | 2036.93 ms | 67.4% bf16 MFU | 261204 tok/s +step 4395/18794 | loss 3.282026 (+1.18z)| norm 0.2629 (+0.74z)| lr 5.53e-03 | 2024.02 ms | 67.8% bf16 MFU | 261095 tok/s +step 4396/18794 | loss 3.222491 (-0.54z)| norm 0.2371 (+0.17z)| lr 5.53e-03 | 2043.82 ms | 67.1% bf16 MFU | 260867 tok/s +step 4397/18794 | loss 3.232165 (-0.26z)| norm 0.2176 (-0.25z)| lr 5.53e-03 | 2033.41 ms | 67.5% bf16 MFU | 260715 tok/s +step 4398/18794 | loss 3.287369 (+1.31z)| norm 0.2462 (+0.39z)| lr 5.53e-03 | 2027.60 ms | 67.7% bf16 MFU | 260608 tok/s +step 4399/18794 | loss 3.230122 (-0.33z)| norm 0.2843 (+1.23z)| lr 5.53e-03 | 2043.50 ms | 67.2% bf16 MFU | 260406 tok/s +step 4400/18794 | loss 3.242291 (+0.03z)| norm 0.2649 (+0.79z)| lr 5.53e-03 | 2047.98 ms | 67.0% bf16 MFU | 260186 tok/s +step 4401/18794 | loss 3.266447 (+0.71z)| norm 0.1934 (-0.78z)| lr 5.53e-03 | 2031.45 ms | 67.6% bf16 MFU | 260081 tok/s +step 4402/18794 | loss 3.203568 (-1.08z)| norm 0.2455 (+0.37z)| lr 5.53e-03 | 2028.39 ms | 67.7% bf16 MFU | 260001 tok/s +step 4403/18794 | loss 3.244408 (+0.10z)| norm 0.2446 (+0.35z)| lr 5.53e-03 | 2043.93 ms | 67.1% bf16 MFU | 259826 tok/s +step 4404/18794 | loss 3.190400 (-1.43z)| norm 0.1763 (-1.13z)| lr 5.53e-03 | 2023.63 ms | 67.8% bf16 MFU | 259789 tok/s +step 4405/18794 | loss 3.270906 (+0.87z)| norm 0.2486 (+0.44z)| lr 5.53e-03 | 2037.72 ms | 67.3% bf16 MFU | 259664 tok/s +step 4406/18794 | loss 3.272640 (+0.92z)| norm 0.2737 (+0.97z)| lr 5.53e-03 | 2026.92 ms | 67.7% bf16 MFU | 259614 tok/s +step 4407/18794 | loss 3.267139 (+0.76z)| norm 0.2373 (+0.17z)| lr 5.53e-03 | 2029.09 ms | 67.6% bf16 MFU | 259553 tok/s +step 4408/18794 | loss 3.235113 (-0.16z)| norm 0.1711 (-1.28z)| lr 5.53e-03 | 2031.42 ms | 67.6% bf16 MFU | 259479 tok/s +step 4409/18794 | loss 3.212909 (-0.80z)| norm 0.2427 (+0.27z)| lr 5.53e-03 | 2026.09 ms | 67.7% bf16 MFU | 259444 tok/s +step 4410/18794 | loss 3.242755 (+0.06z)| norm 0.2553 (+0.54z)| lr 5.53e-03 | 2020.38 ms | 67.9% bf16 MFU | 259447 tok/s +step 4411/18794 | loss 3.304722 (+1.82z)| norm 0.1952 (-0.80z)| lr 5.53e-03 | 2039.11 ms | 67.3% bf16 MFU | 259330 tok/s +step 4412/18794 | loss 3.268790 (+0.77z)| norm 0.2167 (-0.31z)| lr 5.53e-03 | 2019.28 ms | 68.0% bf16 MFU | 259346 tok/s +step 4413/18794 | loss 3.291050 (+1.40z)| norm 0.2847 (+1.41z)| lr 5.52e-03 | 2022.02 ms | 67.9% bf16 MFU | 259343 tok/s +step 4414/18794 | loss 3.256906 (+0.42z)| norm 0.2530 (+0.59z)| lr 5.52e-03 | 2043.95 ms | 67.1% bf16 MFU | 259201 tok/s +step 4415/18794 | loss 3.245342 (+0.08z)| norm 0.1897 (-1.08z)| lr 5.52e-03 | 2006.77 ms | 68.4% bf16 MFU | 259304 tok/s +step 4416/18794 | loss 3.292584 (+1.49z)| norm 0.1893 (-1.07z)| lr 5.52e-03 | 2034.93 ms | 67.4% bf16 MFU | 259221 tok/s +step 4417/18794 | loss 3.224820 (-0.50z)| norm 0.2700 (+1.19z)| lr 5.52e-03 | 2021.07 ms | 67.9% bf16 MFU | 259230 tok/s +step 4418/18794 | loss 3.210007 (-0.92z)| norm 0.2011 (-0.74z)| lr 5.52e-03 | 2042.79 ms | 67.2% bf16 MFU | 259102 tok/s +step 4419/18794 | loss 3.233414 (-0.23z)| norm 0.1619 (-1.82z)| lr 5.52e-03 | 2020.72 ms | 67.9% bf16 MFU | 259119 tok/s +step 4420/18794 | loss 3.313006 (+2.04z)| norm 0.2208 (-0.17z)| lr 5.52e-03 | 2029.78 ms | 67.6% bf16 MFU | 259078 tok/s +step 4421/18794 | loss 3.275080 (+0.93z)| norm 0.1820 (-1.23z)| lr 5.52e-03 | 2032.25 ms | 67.5% bf16 MFU | 259024 tok/s +step 4422/18794 | loss 3.291878 (+1.38z)| norm 0.1946 (-0.87z)| lr 5.52e-03 | 2033.15 ms | 67.5% bf16 MFU | 258966 tok/s +step 4423/18794 | loss 3.239271 (-0.09z)| norm 0.2329 (+0.20z)| lr 5.52e-03 | 1989.26 ms | 69.0% bf16 MFU | 259196 tok/s +step 4424/18794 | loss 3.268461 (+0.74z)| norm 0.2785 (+1.47z)| lr 5.52e-03 | 2019.30 ms | 68.0% bf16 MFU | 259218 tok/s +step 4425/18794 | loss 3.240838 (-0.07z)| norm 0.2161 (-0.26z)| lr 5.52e-03 | 2028.75 ms | 67.6% bf16 MFU | 259178 tok/s +step 4426/18794 | loss 3.285734 (+1.23z)| norm 0.2348 (+0.25z)| lr 5.52e-03 | 2033.15 ms | 67.5% bf16 MFU | 259113 tok/s +reducing beta2 to 0.9 and lr/wd by 0.994 due to grad z-score of 3.520957 +step 4427/18794 | loss 3.321633 (+2.25z)| norm 0.3622 (+3.52z)| lr 5.49e-03 | 2017.12 ms | 68.0% bf16 MFU | 259153 tok/s +step 4428/18794 | loss 3.326595 (+2.34z)| norm 0.2835 (+1.44z)| lr 5.52e-03 | 2032.47 ms | 67.5% bf16 MFU | 259093 tok/s +step 4429/18794 | loss 3.235704 (-0.31z)| norm 0.2537 (+0.65z)| lr 5.52e-03 | 2036.12 ms | 67.4% bf16 MFU | 259013 tok/s +step 4430/18794 | loss 3.294175 (+1.37z)| norm 0.2521 (+0.60z)| lr 5.52e-03 | 2018.90 ms | 68.0% bf16 MFU | 259047 tok/s +step 4431/18794 | loss 3.253281 (+0.18z)| norm 0.2575 (+0.72z)| lr 5.52e-03 | 2026.81 ms | 67.7% bf16 MFU | 259029 tok/s +step 4432/18794 | loss 3.232519 (-0.42z)| norm 0.2201 (-0.25z)| lr 5.52e-03 | 2031.95 ms | 67.5% bf16 MFU | 258978 tok/s +step 4433/18794 | loss 3.280224 (+0.94z)| norm 0.1873 (-1.10z)| lr 5.52e-03 | 2005.11 ms | 68.4% bf16 MFU | 259103 tok/s +step 4434/18794 | loss 3.265392 (+0.50z)| norm 0.2617 (+0.82z)| lr 5.52e-03 | 2005.18 ms | 68.4% bf16 MFU | 259221 tok/s +step 4435/18794 | loss 3.259550 (+0.34z)| norm 0.1887 (-1.08z)| lr 5.52e-03 | 2035.32 ms | 67.4% bf16 MFU | 259140 tok/s +step 4436/18794 | loss 3.249092 (+0.04z)| norm 0.2118 (-0.48z)| lr 5.52e-03 | 2014.33 ms | 68.1% bf16 MFU | 259197 tok/s +step 4437/18794 | loss 3.285225 (+1.07z)| norm 0.2276 (-0.07z)| lr 5.52e-03 | 2030.58 ms | 67.6% bf16 MFU | 259147 tok/s +step 4438/18794 | loss 3.264398 (+0.45z)| norm 0.2146 (-0.39z)| lr 5.52e-03 | 2027.24 ms | 67.7% bf16 MFU | 259121 tok/s +step 4439/18794 | loss 3.261173 (+0.35z)| norm 0.2523 (+0.60z)| lr 5.52e-03 | 2017.40 ms | 68.0% bf16 MFU | 259159 tok/s +step 4440/18794 | loss 3.243844 (-0.15z)| norm 0.2865 (+1.46z)| lr 5.52e-03 | 2011.12 ms | 68.2% bf16 MFU | 259235 tok/s +step 4441/18794 | loss 3.205686 (-1.25z)| norm 0.2307 (+0.00z)| lr 5.52e-03 | 2032.79 ms | 67.5% bf16 MFU | 259169 tok/s +step 4442/18794 | loss 3.323618 (+2.13z)| norm 0.1874 (-1.11z)| lr 5.52e-03 | 2011.83 ms | 68.2% bf16 MFU | 259241 tok/s +step 4443/18794 | loss 3.238531 (-0.31z)| norm 0.2866 (+1.43z)| lr 5.52e-03 | 2030.35 ms | 67.6% bf16 MFU | 259190 tok/s +step 4444/18794 | loss 3.306814 (+1.62z)| norm 0.2471 (+0.40z)| lr 5.52e-03 | 2017.37 ms | 68.0% bf16 MFU | 259225 tok/s +step 4445/18794 | loss 3.260815 (+0.30z)| norm 0.2028 (-0.75z)| lr 5.52e-03 | 2011.15 ms | 68.2% bf16 MFU | 259298 tok/s +step 4446/18794 | loss 3.284043 (+0.95z)| norm 0.2257 (-0.17z)| lr 5.52e-03 | 2016.73 ms | 68.0% bf16 MFU | 259332 tok/s +step 4447/18794 | loss 3.258900 (+0.24z)| norm 0.2449 (+0.32z)| lr 5.52e-03 | 1990.04 ms | 69.0% bf16 MFU | 259538 tok/s +step 4448/18794 | loss 3.234065 (-0.48z)| norm 0.2218 (-0.29z)| lr 5.52e-03 | 2013.53 ms | 68.2% bf16 MFU | 259580 tok/s +step 4449/18794 | loss 3.254409 (+0.10z)| norm 0.2646 (+0.82z)| lr 5.51e-03 | 2010.57 ms | 68.3% bf16 MFU | 259640 tok/s +step 4450/18794 | loss 3.240888 (-0.30z)| norm 0.2495 (+0.42z)| lr 5.51e-03 | 2007.68 ms | 68.4% bf16 MFU | 259715 tok/s +step 4451/18794 | loss 3.240792 (-0.31z)| norm 0.2687 (+0.91z)| lr 5.51e-03 | 2034.59 ms | 67.4% bf16 MFU | 259613 tok/s +step 4452/18794 | loss 3.294986 (+1.27z)| norm 0.2287 (-0.15z)| lr 5.51e-03 | 2023.68 ms | 67.8% bf16 MFU | 259587 tok/s +step 4453/18794 | loss 3.281155 (+0.85z)| norm 0.2718 (+0.97z)| lr 5.51e-03 | 2029.63 ms | 67.6% bf16 MFU | 259523 tok/s +step 4454/18794 | loss 3.224505 (-0.79z)| norm 0.2301 (-0.16z)| lr 5.51e-03 | 2004.21 ms | 68.5% bf16 MFU | 259627 tok/s +step 4455/18794 | loss 3.246902 (-0.16z)| norm 0.2285 (-0.20z)| lr 5.51e-03 | 2020.59 ms | 67.9% bf16 MFU | 259619 tok/s +step 4456/18794 | loss 3.258800 (+0.18z)| norm 0.2315 (-0.13z)| lr 5.51e-03 | 2026.23 ms | 67.7% bf16 MFU | 259575 tok/s +step 4457/18794 | loss 3.228617 (-0.71z)| norm 0.2691 (+0.87z)| lr 5.51e-03 | 2017.95 ms | 68.0% bf16 MFU | 259587 tok/s +step 4458/18794 | loss 3.265477 (+0.37z)| norm 0.2273 (-0.27z)| lr 5.51e-03 | 2011.83 ms | 68.2% bf16 MFU | 259638 tok/s +step 4459/18794 | loss 3.257230 (+0.10z)| norm 0.1885 (-1.32z)| lr 5.51e-03 | 2005.49 ms | 68.4% bf16 MFU | 259727 tok/s +step 4460/18794 | loss 3.251322 (-0.07z)| norm 0.2616 (+0.66z)| lr 5.51e-03 | 2005.51 ms | 68.4% bf16 MFU | 259812 tok/s +step 4461/18794 | loss 3.285245 (+0.96z)| norm 0.2623 (+0.66z)| lr 5.51e-03 | 2010.06 ms | 68.3% bf16 MFU | 259863 tok/s +step 4462/18794 | loss 3.249258 (-0.17z)| norm 0.2368 (-0.05z)| lr 5.51e-03 | 2005.18 ms | 68.4% bf16 MFU | 259943 tok/s +step 4463/18794 | loss 3.350843 (+2.85z)| norm 0.1802 (-1.60z)| lr 5.51e-03 | 2002.68 ms | 68.5% bf16 MFU | 260036 tok/s +step 4464/18794 | loss 3.282066 (+0.77z)| norm 0.1992 (-1.05z)| lr 5.51e-03 | 2017.01 ms | 68.0% bf16 MFU | 260031 tok/s +step 4465/18794 | loss 3.257318 (+0.03z)| norm 0.2221 (-0.41z)| lr 5.51e-03 | 2014.11 ms | 68.1% bf16 MFU | 260045 tok/s +step 4466/18794 | loss 3.273946 (+0.53z)| norm 0.1734 (-1.72z)| lr 5.51e-03 | 2041.30 ms | 67.2% bf16 MFU | 259884 tok/s +step 4467/18794 | loss 3.229997 (-0.82z)| norm 0.2112 (-0.66z)| lr 5.51e-03 | 1999.30 ms | 68.6% bf16 MFU | 260002 tok/s +step 4468/18794 | loss 3.284607 (+0.83z)| norm 0.1907 (-1.20z)| lr 5.51e-03 | 2001.44 ms | 68.6% bf16 MFU | 260100 tok/s +step 4469/18794 | loss 3.252716 (-0.14z)| norm 0.2116 (-0.61z)| lr 5.51e-03 | 2018.22 ms | 68.0% bf16 MFU | 260083 tok/s +step 4470/18794 | loss 3.256252 (-0.05z)| norm 0.2271 (-0.18z)| lr 5.51e-03 | 2024.52 ms | 67.8% bf16 MFU | 260028 tok/s +step 4471/18794 | loss 3.305001 (+1.46z)| norm 0.2259 (-0.23z)| lr 5.51e-03 | 2022.21 ms | 67.9% bf16 MFU | 259990 tok/s +step 4472/18794 | loss 3.236614 (-0.74z)| norm 0.1820 (-1.44z)| lr 5.51e-03 | 2002.08 ms | 68.5% bf16 MFU | 260084 tok/s +step 4473/18794 | loss 3.200344 (-1.87z)| norm 0.1870 (-1.28z)| lr 5.51e-03 | 2005.09 ms | 68.4% bf16 MFU | 260153 tok/s +step 4474/18794 | loss 3.266335 (+0.25z)| norm 0.2685 (+0.99z)| lr 5.51e-03 | 2016.42 ms | 68.1% bf16 MFU | 260146 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.413889 +step 4475/18794 | loss 3.256735 (-0.07z)| norm 0.3645 (+3.41z)| lr 5.51e-03 | 2017.49 ms | 68.0% bf16 MFU | 260132 tok/s +step 4476/18794 | loss 3.247209 (-0.37z)| norm 0.2330 (-0.04z)| lr 5.51e-03 | 2013.20 ms | 68.2% bf16 MFU | 260147 tok/s +step 4477/18794 | loss 3.224216 (-1.09z)| norm 0.1984 (-0.95z)| lr 5.51e-03 | 2016.05 ms | 68.1% bf16 MFU | 260143 tok/s +step 4478/18794 | loss 3.258173 (-0.00z)| norm 0.2734 (+1.00z)| lr 5.51e-03 | 2018.66 ms | 68.0% bf16 MFU | 260122 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.253225 +step 4479/18794 | loss 3.278797 (+0.64z)| norm 0.3672 (+3.25z)| lr 5.51e-03 | 2019.05 ms | 68.0% bf16 MFU | 260099 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.823358 +step 4480/18794 | loss 3.231657 (-0.86z)| norm 0.3554 (+2.82z)| lr 5.51e-03 | 2005.39 ms | 68.4% bf16 MFU | 260166 tok/s +step 4481/18794 | loss 3.197466 (-1.92z)| norm 0.2647 (+0.66z)| lr 5.51e-03 | 2012.59 ms | 68.2% bf16 MFU | 260183 tok/s +step 4482/18794 | loss 3.232991 (-0.80z)| norm 0.2138 (-0.56z)| lr 5.51e-03 | 1998.94 ms | 68.7% bf16 MFU | 260288 tok/s +step 4483/18794 | loss 3.220092 (-1.19z)| norm 0.1979 (-0.94z)| lr 5.51e-03 | 2011.13 ms | 68.2% bf16 MFU | 260308 tok/s +step 4484/18794 | loss 3.313773 (+1.73z)| norm 0.1668 (-1.69z)| lr 5.51e-03 | 2018.77 ms | 68.0% bf16 MFU | 260278 tok/s +step 4485/18794 | loss 3.245984 (-0.41z)| norm 0.1843 (-1.25z)| lr 5.51e-03 | 1998.76 ms | 68.7% bf16 MFU | 260379 tok/s +step 4486/18794 | loss 3.232432 (-0.85z)| norm 0.2217 (-0.35z)| lr 5.50e-03 | 1999.51 ms | 68.6% bf16 MFU | 260471 tok/s +step 4487/18794 | loss 3.230965 (-0.89z)| norm 0.2626 (+0.64z)| lr 5.50e-03 | 1994.60 ms | 68.8% bf16 MFU | 260590 tok/s +step 4488/18794 | loss 3.289156 (+0.97z)| norm 0.2381 (+0.05z)| lr 5.50e-03 | 2016.42 ms | 68.1% bf16 MFU | 260561 tok/s +step 4489/18794 | loss 3.290058 (+1.01z)| norm 0.2960 (+1.42z)| lr 5.50e-03 | 2004.70 ms | 68.5% bf16 MFU | 260609 tok/s +step 4490/18794 | loss 3.257865 (-0.03z)| norm 0.2455 (+0.20z)| lr 5.50e-03 | 2014.48 ms | 68.1% bf16 MFU | 260592 tok/s +step 4491/18794 | loss 3.259986 (+0.06z)| norm 0.2088 (-0.67z)| lr 5.50e-03 | 1998.41 ms | 68.7% bf16 MFU | 260680 tok/s +step 4492/18794 | loss 3.259800 (+0.03z)| norm 0.2128 (-0.56z)| lr 5.50e-03 | 2003.27 ms | 68.5% bf16 MFU | 260732 tok/s +step 4493/18794 | loss 3.257130 (-0.05z)| norm 0.2390 (+0.09z)| lr 5.50e-03 | 2020.78 ms | 67.9% bf16 MFU | 260668 tok/s +step 4494/18794 | loss 3.216094 (-1.37z)| norm 0.2374 (+0.06z)| lr 5.50e-03 | 2003.66 ms | 68.5% bf16 MFU | 260718 tok/s +step 4495/18794 | loss 3.304356 (+1.50z)| norm 0.1920 (-1.03z)| lr 5.50e-03 | 2004.89 ms | 68.4% bf16 MFU | 260757 tok/s +step 4496/18794 | loss 3.264148 (+0.18z)| norm 0.1731 (-1.46z)| lr 5.50e-03 | 2011.78 ms | 68.2% bf16 MFU | 260749 tok/s +step 4497/18794 | loss 3.241902 (-0.55z)| norm 0.1793 (-1.29z)| lr 5.50e-03 | 2011.47 ms | 68.2% bf16 MFU | 260744 tok/s +step 4498/18794 | loss 3.325187 (+2.12z)| norm 0.1914 (-0.98z)| lr 5.50e-03 | 2020.93 ms | 67.9% bf16 MFU | 260679 tok/s +step 4499/18794 | loss 3.215075 (-1.41z)| norm 0.2129 (-0.46z)| lr 5.50e-03 | 2020.86 ms | 67.9% bf16 MFU | 260617 tok/s +step 4500/18794 | loss 3.238189 (-0.66z)| norm 0.2703 (+0.91z)| lr 5.50e-03 | 2022.65 ms | 67.8% bf16 MFU | 260546 tok/s +val loss 3.277654 +HellaSwag: 2815/10042 = 0.280323Swag: 990/1256: 0/1256 +Writing checkpoint at step 4500 +Writing model to log_gpt3_125M_edu_v4/model_00004500.bin +Writing state to log_gpt3_125M_edu_v4/state_00004500_00001.bin +Writing state to log_gpt3_125M_edu_v4/state_00004500_00000.bin +step 4501/18794 | loss 3.257780 (-0.04z)| norm 0.3170 (+1.96z)| lr 5.50e-03 | 1994.57 ms | 68.8% bf16 MFU | 260662 tok/s +step 4502/18794 | loss 3.271945 (+0.40z)| norm 0.2893 (+1.29z)| lr 5.50e-03 | 2006.64 ms | 68.4% bf16 MFU | 260693 tok/s +step 4503/18794 | loss 3.246552 (-0.42z)| norm 0.2338 (+0.00z)| lr 5.50e-03 | 1999.51 ms | 68.6% bf16 MFU | 260768 tok/s +step 4504/18794 | loss 3.269610 (+0.31z)| norm 0.2288 (-0.13z)| lr 5.50e-03 | 1990.80 ms | 68.9% bf16 MFU | 260898 tok/s +step 4505/18794 | loss 3.253297 (-0.23z)| norm 0.2926 (+1.35z)| lr 5.50e-03 | 1999.68 ms | 68.6% bf16 MFU | 260962 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.776754 +step 4506/18794 | loss 3.277760 (+0.58z)| norm 0.3591 (+2.78z)| lr 5.50e-03 | 2006.13 ms | 68.4% bf16 MFU | 260981 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.509751 +step 4507/18794 | loss 3.271279 (+0.37z)| norm 0.3521 (+2.51z)| lr 5.50e-03 | 1990.69 ms | 68.9% bf16 MFU | 261101 tok/s +step 4508/18794 | loss 3.263650 (+0.10z)| norm 0.2135 (-0.52z)| lr 5.50e-03 | 2015.36 ms | 68.1% bf16 MFU | 261053 tok/s +step 4509/18794 | loss 3.242629 (-0.62z)| norm 0.2009 (-0.78z)| lr 5.50e-03 | 1984.49 ms | 69.2% bf16 MFU | 261210 tok/s +step 4510/18794 | loss 3.307429 (+1.53z)| norm 0.1674 (-1.48z)| lr 5.50e-03 | 1993.36 ms | 68.8% bf16 MFU | 261300 tok/s +step 4511/18794 | loss 3.218770 (-1.40z)| norm 0.1769 (-1.27z)| lr 5.50e-03 | 2023.14 ms | 67.8% bf16 MFU | 261192 tok/s +step 4512/18794 | loss 3.236013 (-0.81z)| norm 0.1794 (-1.20z)| lr 5.50e-03 | 1995.83 ms | 68.8% bf16 MFU | 261267 tok/s +step 4513/18794 | loss 3.262833 (+0.09z)| norm 0.2287 (-0.13z)| lr 5.50e-03 | 2009.33 ms | 68.3% bf16 MFU | 261250 tok/s +step 4514/18794 | loss 3.268283 (+0.27z)| norm 0.1927 (-0.89z)| lr 5.50e-03 | 2000.97 ms | 68.6% bf16 MFU | 261289 tok/s +step 4515/18794 | loss 3.291624 (+1.03z)| norm 0.2251 (-0.20z)| lr 5.50e-03 | 2000.88 ms | 68.6% bf16 MFU | 261326 tok/s +step 4516/18794 | loss 3.241759 (-0.62z)| norm 0.2462 (+0.24z)| lr 5.50e-03 | 1986.08 ms | 69.1% bf16 MFU | 261458 tok/s +step 4517/18794 | loss 3.247310 (-0.44z)| norm 0.2360 (+0.03z)| lr 5.50e-03 | 2000.15 ms | 68.6% bf16 MFU | 261492 tok/s +step 4518/18794 | loss 3.244292 (-0.56z)| norm 0.2239 (-0.24z)| lr 5.50e-03 | 1983.75 ms | 69.2% bf16 MFU | 261632 tok/s +step 4519/18794 | loss 3.350788 (+2.92z)| norm 0.2030 (-0.71z)| lr 5.50e-03 | 2016.15 ms | 68.1% bf16 MFU | 261552 tok/s +step 4520/18794 | loss 3.255273 (-0.20z)| norm 0.2306 (-0.11z)| lr 5.50e-03 | 1995.04 ms | 68.8% bf16 MFU | 261614 tok/s +step 4521/18794 | loss 3.278091 (+0.56z)| norm 0.2903 (+1.18z)| lr 5.50e-03 | 1984.25 ms | 69.2% bf16 MFU | 261745 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.022939 +step 4522/18794 | loss 3.231753 (-0.96z)| norm 0.3313 (+2.02z)| lr 5.49e-03 | 1983.53 ms | 69.2% bf16 MFU | 261874 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.301637 +step 4523/18794 | loss 3.301632 (+1.33z)| norm 0.3483 (+2.30z)| lr 5.49e-03 | 2007.53 ms | 68.4% bf16 MFU | 261838 tok/s +step 4524/18794 | loss 3.280778 (+0.64z)| norm 0.2203 (-0.38z)| lr 5.49e-03 | 2010.82 ms | 68.2% bf16 MFU | 261783 tok/s +step 4525/18794 | loss 3.240184 (-0.70z)| norm 0.2275 (-0.23z)| lr 5.49e-03 | 2002.38 ms | 68.5% bf16 MFU | 261785 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.559391 +step 4526/18794 | loss 3.274053 (+0.42z)| norm 0.3651 (+2.56z)| lr 5.49e-03 | 2014.78 ms | 68.1% bf16 MFU | 261707 tok/s +step 4527/18794 | loss 3.254036 (-0.22z)| norm 0.3000 (+1.27z)| lr 5.49e-03 | 1994.95 ms | 68.8% bf16 MFU | 261762 tok/s +step 4528/18794 | loss 3.248923 (-0.38z)| norm 0.2277 (-0.23z)| lr 5.49e-03 | 2000.40 ms | 68.6% bf16 MFU | 261779 tok/s +step 4529/18794 | loss 3.274017 (+0.48z)| norm 0.2639 (+0.53z)| lr 5.49e-03 | 1993.24 ms | 68.8% bf16 MFU | 261841 tok/s +step 4530/18794 | loss 3.252064 (-0.27z)| norm 0.2356 (-0.06z)| lr 5.49e-03 | 1992.80 ms | 68.9% bf16 MFU | 261904 tok/s +step 4531/18794 | loss 3.316237 (+1.91z)| norm 0.2109 (-0.57z)| lr 5.49e-03 | 2011.25 ms | 68.2% bf16 MFU | 261843 tok/s +step 4532/18794 | loss 3.249262 (-0.39z)| norm 0.2419 (+0.08z)| lr 5.49e-03 | 2000.88 ms | 68.6% bf16 MFU | 261852 tok/s +step 4533/18794 | loss 3.319938 (+1.99z)| norm 0.2984 (+1.24z)| lr 5.49e-03 | 1981.35 ms | 69.3% bf16 MFU | 261990 tok/s +step 4534/18794 | loss 3.262365 (+0.05z)| norm 0.3170 (+1.59z)| lr 5.49e-03 | 2010.32 ms | 68.3% bf16 MFU | 261930 tok/s +step 4535/18794 | loss 3.227134 (-1.13z)| norm 0.1939 (-0.96z)| lr 5.49e-03 | 1980.15 ms | 69.3% bf16 MFU | 262072 tok/s +step 4536/18794 | loss 3.291625 (+1.02z)| norm 0.2658 (+0.52z)| lr 5.49e-03 | 1997.15 ms | 68.7% bf16 MFU | 262095 tok/s +step 4537/18794 | loss 3.252898 (-0.27z)| norm 0.3318 (+1.84z)| lr 5.49e-03 | 1981.90 ms | 69.2% bf16 MFU | 262217 tok/s +step 4538/18794 | loss 3.238535 (-0.74z)| norm 0.2438 (+0.04z)| lr 5.49e-03 | 1996.10 ms | 68.8% bf16 MFU | 262239 tok/s +step 4539/18794 | loss 3.235652 (-0.82z)| norm 0.1740 (-1.36z)| lr 5.49e-03 | 1984.90 ms | 69.1% bf16 MFU | 262334 tok/s +step 4540/18794 | loss 3.223109 (-1.22z)| norm 0.1773 (-1.27z)| lr 5.49e-03 | 1992.26 ms | 68.9% bf16 MFU | 262375 tok/s +step 4541/18794 | loss 3.224926 (-1.18z)| norm 0.1893 (-1.01z)| lr 5.49e-03 | 1982.86 ms | 69.2% bf16 MFU | 262477 tok/s +step 4542/18794 | loss 3.243423 (-0.55z)| norm 0.1729 (-1.33z)| lr 5.49e-03 | 2000.35 ms | 68.6% bf16 MFU | 262458 tok/s +step 4543/18794 | loss 3.221322 (-1.29z)| norm 0.1779 (-1.21z)| lr 5.49e-03 | 1993.40 ms | 68.8% bf16 MFU | 262486 tok/s +step 4544/18794 | loss 3.264761 (+0.20z)| norm 0.1999 (-0.76z)| lr 5.49e-03 | 1983.98 ms | 69.2% bf16 MFU | 262574 tok/s +step 4545/18794 | loss 3.258033 (-0.03z)| norm 0.1852 (-1.04z)| lr 5.49e-03 | 1998.37 ms | 68.7% bf16 MFU | 262564 tok/s +step 4546/18794 | loss 3.222984 (-1.21z)| norm 0.1933 (-0.87z)| lr 5.49e-03 | 1991.70 ms | 68.9% bf16 MFU | 262597 tok/s +step 4547/18794 | loss 3.296723 (+1.30z)| norm 0.2970 (+1.16z)| lr 5.49e-03 | 1984.75 ms | 69.1% bf16 MFU | 262675 tok/s +step 4548/18794 | loss 3.275642 (+0.57z)| norm 0.3321 (+1.80z)| lr 5.49e-03 | 1979.85 ms | 69.3% bf16 MFU | 262782 tok/s +step 4549/18794 | loss 3.291223 (+1.08z)| norm 0.3216 (+1.56z)| lr 5.49e-03 | 1979.08 ms | 69.3% bf16 MFU | 262889 tok/s +step 4550/18794 | loss 3.281448 (+0.73z)| norm 0.2643 (+0.47z)| lr 5.49e-03 | 1983.71 ms | 69.2% bf16 MFU | 262959 tok/s +step 4551/18794 | loss 3.263021 (+0.10z)| norm 0.2678 (+0.53z)| lr 5.49e-03 | 1986.47 ms | 69.1% bf16 MFU | 263008 tok/s +step 4552/18794 | loss 3.244012 (-0.53z)| norm 0.2733 (+0.63z)| lr 5.49e-03 | 1988.08 ms | 69.0% bf16 MFU | 263043 tok/s +step 4553/18794 | loss 3.246112 (-0.45z)| norm 0.2313 (-0.16z)| lr 5.49e-03 | 1984.20 ms | 69.2% bf16 MFU | 263102 tok/s +step 4554/18794 | loss 3.240557 (-0.65z)| norm 0.2169 (-0.43z)| lr 5.49e-03 | 1988.03 ms | 69.0% bf16 MFU | 263134 tok/s +step 4555/18794 | loss 3.253299 (-0.21z)| norm 0.3093 (+1.30z)| lr 5.49e-03 | 1988.73 ms | 69.0% bf16 MFU | 263158 tok/s +step 4556/18794 | loss 3.255877 (-0.12z)| norm 0.2655 (+0.47z)| lr 5.49e-03 | 1979.01 ms | 69.3% bf16 MFU | 263247 tok/s +step 4557/18794 | loss 3.233149 (-0.90z)| norm 0.2808 (+0.75z)| lr 5.49e-03 | 1986.08 ms | 69.1% bf16 MFU | 263283 tok/s +step 4558/18794 | loss 3.259890 (+0.02z)| norm 0.2148 (-0.49z)| lr 5.48e-03 | 1982.01 ms | 69.2% bf16 MFU | 263345 tok/s +step 4559/18794 | loss 3.253646 (-0.20z)| norm 0.2178 (-0.44z)| lr 5.48e-03 | 1980.01 ms | 69.3% bf16 MFU | 263418 tok/s +step 4560/18794 | loss 3.274261 (+0.51z)| norm 0.2718 (+0.58z)| lr 5.48e-03 | 1984.38 ms | 69.2% bf16 MFU | 263457 tok/s +step 4561/18794 | loss 3.165175 (-3.06z)| norm 0.2783 (+0.70z)| lr 5.48e-03 | 1988.28 ms | 69.0% bf16 MFU | 263469 tok/s +step 4562/18794 | loss 3.192654 (-2.09z)| norm 0.1926 (-0.91z)| lr 5.48e-03 | 1996.40 ms | 68.7% bf16 MFU | 263426 tok/s +step 4563/18794 | loss 3.242781 (-0.47z)| norm 0.1935 (-0.90z)| lr 5.48e-03 | 1978.80 ms | 69.4% bf16 MFU | 263502 tok/s +step 4564/18794 | loss 3.206043 (-1.66z)| norm 0.2201 (-0.40z)| lr 5.48e-03 | 1979.91 ms | 69.3% bf16 MFU | 263567 tok/s +step 4565/18794 | loss 3.311426 (+1.80z)| norm 0.2554 (+0.26z)| lr 5.48e-03 | 1991.92 ms | 68.9% bf16 MFU | 263549 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.099074 +step 4566/18794 | loss 3.312814 (+1.80z)| norm 0.3562 (+2.10z)| lr 5.48e-03 | 1979.16 ms | 69.3% bf16 MFU | 263617 tok/s +step 4567/18794 | loss 3.235103 (-0.71z)| norm 0.3363 (+1.68z)| lr 5.48e-03 | 1982.66 ms | 69.2% bf16 MFU | 263658 tok/s +step 4568/18794 | loss 3.281698 (+0.80z)| norm 0.2501 (+0.09z)| lr 5.48e-03 | 1985.34 ms | 69.1% bf16 MFU | 263679 tok/s +step 4569/18794 | loss 3.256001 (-0.03z)| norm 0.1978 (-0.87z)| lr 5.48e-03 | 1984.56 ms | 69.1% bf16 MFU | 263704 tok/s +step 4570/18794 | loss 3.202879 (-1.70z)| norm 0.2423 (-0.06z)| lr 5.48e-03 | 1980.89 ms | 69.3% bf16 MFU | 263753 tok/s +step 4571/18794 | loss 3.288522 (+1.04z)| norm 0.2358 (-0.18z)| lr 5.48e-03 | 1995.42 ms | 68.8% bf16 MFU | 263703 tok/s +step 4572/18794 | loss 3.216100 (-1.27z)| norm 0.2256 (-0.38z)| lr 5.48e-03 | 1979.61 ms | 69.3% bf16 MFU | 263760 tok/s +step 4573/18794 | loss 3.270938 (+0.46z)| norm 0.2156 (-0.57z)| lr 5.48e-03 | 1980.87 ms | 69.3% bf16 MFU | 263805 tok/s +step 4574/18794 | loss 3.229132 (-0.88z)| norm 0.2140 (-0.59z)| lr 5.48e-03 | 1985.14 ms | 69.1% bf16 MFU | 263820 tok/s +step 4575/18794 | loss 3.186400 (-2.18z)| norm 0.1852 (-1.11z)| lr 5.48e-03 | 1979.42 ms | 69.3% bf16 MFU | 263873 tok/s +step 4576/18794 | loss 3.180711 (-2.28z)| norm 0.2052 (-0.73z)| lr 5.48e-03 | 2042.38 ms | 67.2% bf16 MFU | 263514 tok/s +step 4577/18794 | loss 3.228867 (-0.80z)| norm 0.2573 (+0.25z)| lr 5.48e-03 | 2041.93 ms | 67.2% bf16 MFU | 263177 tok/s +step 4578/18794 | loss 3.175917 (-2.34z)| norm 0.2338 (-0.19z)| lr 5.48e-03 | 2042.00 ms | 67.2% bf16 MFU | 262856 tok/s +step 4579/18794 | loss 3.279750 (+0.76z)| norm 0.3143 (+1.38z)| lr 5.48e-03 | 2041.46 ms | 67.2% bf16 MFU | 262554 tok/s +step 4580/18794 | loss 3.214294 (-1.18z)| norm 0.3247 (+1.61z)| lr 5.48e-03 | 2041.52 ms | 67.2% bf16 MFU | 262267 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.350622 +step 4581/18794 | loss 3.190532 (-1.88z)| norm 0.3671 (+2.35z)| lr 5.48e-03 | 2041.90 ms | 67.2% bf16 MFU | 261992 tok/s +step 4582/18794 | loss 3.163325 (-2.57z)| norm 0.3246 (+1.50z)| lr 5.48e-03 | 2040.91 ms | 67.2% bf16 MFU | 261736 tok/s +step 4583/18794 | loss 3.267231 (+0.39z)| norm 0.2061 (-0.74z)| lr 5.48e-03 | 2041.80 ms | 67.2% bf16 MFU | 261488 tok/s +step 4584/18794 | loss 3.217533 (-1.02z)| norm 0.2489 (+0.06z)| lr 5.48e-03 | 2025.25 ms | 67.8% bf16 MFU | 261358 tok/s +step 4585/18794 | loss 3.213416 (-1.13z)| norm 0.2528 (+0.12z)| lr 5.48e-03 | 2033.22 ms | 67.5% bf16 MFU | 261183 tok/s +step 4586/18794 | loss 3.197360 (-1.56z)| norm 0.2264 (-0.39z)| lr 5.48e-03 | 2041.21 ms | 67.2% bf16 MFU | 260966 tok/s +step 4587/18794 | loss 3.211698 (-1.14z)| norm 0.2226 (-0.46z)| lr 5.48e-03 | 2040.38 ms | 67.3% bf16 MFU | 260766 tok/s +step 4588/18794 | loss 3.192403 (-1.65z)| norm 0.2549 (+0.16z)| lr 5.48e-03 | 2025.00 ms | 67.8% bf16 MFU | 260673 tok/s +step 4589/18794 | loss 3.272814 (+0.62z)| norm 0.2278 (-0.35z)| lr 5.48e-03 | 2041.81 ms | 67.2% bf16 MFU | 260478 tok/s +step 4590/18794 | loss 3.131254 (-3.17z)| norm 0.2425 (-0.06z)| lr 5.48e-03 | 2041.50 ms | 67.2% bf16 MFU | 260295 tok/s +step 4591/18794 | loss 3.183637 (-1.72z)| norm 0.2314 (-0.28z)| lr 5.48e-03 | 2018.18 ms | 68.0% bf16 MFU | 260269 tok/s +step 4592/18794 | loss 3.236425 (-0.32z)| norm 0.2536 (+0.14z)| lr 5.48e-03 | 2040.47 ms | 67.3% bf16 MFU | 260103 tok/s +step 4593/18794 | loss 3.226845 (-0.56z)| norm 0.1929 (-1.02z)| lr 5.48e-03 | 2033.30 ms | 67.5% bf16 MFU | 259991 tok/s +step 4594/18794 | loss 3.223476 (-0.66z)| norm 0.2407 (-0.10z)| lr 5.47e-03 | 2041.52 ms | 67.2% bf16 MFU | 259832 tok/s +step 4595/18794 | loss 3.147517 (-2.56z)| norm 0.2052 (-0.79z)| lr 5.47e-03 | 2041.66 ms | 67.2% bf16 MFU | 259680 tok/s +step 4596/18794 | loss 3.261530 (+0.38z)| norm 0.2131 (-0.65z)| lr 5.47e-03 | 2041.33 ms | 67.2% bf16 MFU | 259538 tok/s +step 4597/18794 | loss 3.219295 (-0.70z)| norm 0.1979 (-0.95z)| lr 5.47e-03 | 2041.69 ms | 67.2% bf16 MFU | 259400 tok/s +step 4598/18794 | loss 3.196875 (-1.26z)| norm 0.2090 (-0.74z)| lr 5.47e-03 | 2032.79 ms | 67.5% bf16 MFU | 259326 tok/s +step 4599/18794 | loss 3.209168 (-0.94z)| norm 0.2218 (-0.49z)| lr 5.47e-03 | 2034.23 ms | 67.5% bf16 MFU | 259246 tok/s +step 4600/18794 | loss 3.206755 (-0.99z)| norm 0.2185 (-0.55z)| lr 5.47e-03 | 2041.59 ms | 67.2% bf16 MFU | 259124 tok/s +step 4601/18794 | loss 3.287157 (+1.08z)| norm 0.2162 (-0.58z)| lr 5.47e-03 | 2041.49 ms | 67.2% bf16 MFU | 259009 tok/s +step 4602/18794 | loss 3.200816 (-1.12z)| norm 0.2522 (+0.14z)| lr 5.47e-03 | 2034.94 ms | 67.4% bf16 MFU | 258941 tok/s +step 4603/18794 | loss 3.172435 (-1.80z)| norm 0.2850 (+0.78z)| lr 5.47e-03 | 2018.17 ms | 68.0% bf16 MFU | 258983 tok/s +step 4604/18794 | loss 3.170127 (-1.81z)| norm 0.2037 (-0.82z)| lr 5.47e-03 | 2025.76 ms | 67.7% bf16 MFU | 258974 tok/s +step 4605/18794 | loss 3.226402 (-0.40z)| norm 0.1734 (-1.39z)| lr 5.47e-03 | 2034.76 ms | 67.4% bf16 MFU | 258909 tok/s +step 4606/18794 | loss 3.132194 (-2.62z)| norm 0.2013 (-0.83z)| lr 5.47e-03 | 2017.61 ms | 68.0% bf16 MFU | 258956 tok/s +step 4607/18794 | loss 3.217834 (-0.54z)| norm 0.2597 (+0.38z)| lr 5.47e-03 | 2042.81 ms | 67.2% bf16 MFU | 258841 tok/s +step 4608/18794 | loss 3.251395 (+0.27z)| norm 0.2823 (+0.83z)| lr 5.47e-03 | 2033.94 ms | 67.5% bf16 MFU | 258787 tok/s +step 4609/18794 | loss 3.202711 (-0.89z)| norm 0.2411 (-0.03z)| lr 5.47e-03 | 2025.93 ms | 67.7% bf16 MFU | 258787 tok/s +step 4610/18794 | loss 3.263438 (+0.58z)| norm 0.2505 (+0.15z)| lr 5.47e-03 | 2034.79 ms | 67.4% bf16 MFU | 258731 tok/s +step 4611/18794 | loss 3.262899 (+0.56z)| norm 0.3011 (+1.19z)| lr 5.47e-03 | 2025.96 ms | 67.7% bf16 MFU | 258734 tok/s +step 4612/18794 | loss 3.223154 (-0.40z)| norm 0.2648 (+0.41z)| lr 5.47e-03 | 2034.75 ms | 67.4% bf16 MFU | 258680 tok/s +step 4613/18794 | loss 3.244585 (+0.12z)| norm 0.2166 (-0.61z)| lr 5.47e-03 | 2032.73 ms | 67.5% bf16 MFU | 258642 tok/s +step 4614/18794 | loss 3.232456 (-0.17z)| norm 0.2216 (-0.51z)| lr 5.47e-03 | 2034.12 ms | 67.5% bf16 MFU | 258598 tok/s +step 4615/18794 | loss 3.136224 (-2.42z)| norm 0.2612 (+0.32z)| lr 5.47e-03 | 2041.07 ms | 67.2% bf16 MFU | 258511 tok/s +step 4616/18794 | loss 3.238221 (+0.01z)| norm 0.2790 (+0.69z)| lr 5.47e-03 | 2026.75 ms | 67.7% bf16 MFU | 258520 tok/s +step 4617/18794 | loss 3.249595 (+0.29z)| norm 0.1989 (-1.00z)| lr 5.47e-03 | 2011.28 ms | 68.2% bf16 MFU | 258628 tok/s +step 4618/18794 | loss 3.152324 (-1.98z)| norm 0.1880 (-1.21z)| lr 5.47e-03 | 2032.66 ms | 67.5% bf16 MFU | 258593 tok/s +step 4619/18794 | loss 3.227423 (-0.20z)| norm 0.1938 (-1.09z)| lr 5.47e-03 | 2018.86 ms | 68.0% bf16 MFU | 258648 tok/s +step 4620/18794 | loss 3.198514 (-0.88z)| norm 0.2734 (+0.57z)| lr 5.47e-03 | 2025.73 ms | 67.7% bf16 MFU | 258656 tok/s +step 4621/18794 | loss 3.218894 (-0.38z)| norm 0.3293 (+1.72z)| lr 5.47e-03 | 2032.93 ms | 67.5% bf16 MFU | 258618 tok/s +step 4622/18794 | loss 3.170750 (-1.52z)| norm 0.3436 (+2.00z)| lr 5.47e-03 | 2017.79 ms | 68.0% bf16 MFU | 258679 tok/s +step 4623/18794 | loss 3.195394 (-0.91z)| norm 0.2862 (+0.85z)| lr 5.47e-03 | 2011.04 ms | 68.2% bf16 MFU | 258780 tok/s +step 4624/18794 | loss 3.215307 (-0.41z)| norm 0.2202 (-0.54z)| lr 5.47e-03 | 2011.67 ms | 68.2% bf16 MFU | 258872 tok/s +step 4625/18794 | loss 3.225791 (-0.15z)| norm 0.2077 (-0.80z)| lr 5.47e-03 | 2041.92 ms | 67.2% bf16 MFU | 258767 tok/s +step 4626/18794 | loss 3.293106 (+1.48z)| norm 0.2580 (+0.29z)| lr 5.47e-03 | 2019.08 ms | 68.0% bf16 MFU | 258812 tok/s +step 4627/18794 | loss 3.199964 (-0.77z)| norm 0.2057 (-0.83z)| lr 5.47e-03 | 2033.23 ms | 67.5% bf16 MFU | 258764 tok/s +step 4628/18794 | loss 3.238588 (+0.17z)| norm 0.2125 (-0.67z)| lr 5.47e-03 | 2026.09 ms | 67.7% bf16 MFU | 258764 tok/s +step 4629/18794 | loss 3.159263 (-1.71z)| norm 0.2374 (-0.13z)| lr 5.46e-03 | 2009.95 ms | 68.3% bf16 MFU | 258869 tok/s +step 4630/18794 | loss 3.156008 (-1.74z)| norm 0.2229 (-0.44z)| lr 5.46e-03 | 2026.35 ms | 67.7% bf16 MFU | 258862 tok/s +step 4631/18794 | loss 3.224600 (-0.09z)| norm 0.2300 (-0.29z)| lr 5.46e-03 | 2025.49 ms | 67.8% bf16 MFU | 258861 tok/s +step 4632/18794 | loss 3.226663 (-0.04z)| norm 0.2686 (+0.54z)| lr 5.46e-03 | 2033.28 ms | 67.5% bf16 MFU | 258811 tok/s +step 4633/18794 | loss 3.171073 (-1.37z)| norm 0.2241 (-0.41z)| lr 5.46e-03 | 2025.97 ms | 67.7% bf16 MFU | 258809 tok/s +step 4634/18794 | loss 3.188963 (-0.91z)| norm 0.1681 (-1.60z)| lr 5.46e-03 | 2011.35 ms | 68.2% bf16 MFU | 258902 tok/s +step 4635/18794 | loss 3.225921 (-0.00z)| norm 0.2089 (-0.71z)| lr 5.46e-03 | 2025.95 ms | 67.7% bf16 MFU | 258896 tok/s +step 4636/18794 | loss 3.190491 (-0.86z)| norm 0.2008 (-0.88z)| lr 5.46e-03 | 2025.57 ms | 67.7% bf16 MFU | 258893 tok/s +step 4637/18794 | loss 3.199623 (-0.62z)| norm 0.1835 (-1.24z)| lr 5.46e-03 | 2024.76 ms | 67.8% bf16 MFU | 258895 tok/s +step 4638/18794 | loss 3.100076 (-2.93z)| norm 0.1777 (-1.34z)| lr 5.46e-03 | 2002.74 ms | 68.5% bf16 MFU | 259040 tok/s +step 4639/18794 | loss 3.221261 (-0.04z)| norm 0.1530 (-1.86z)| lr 5.46e-03 | 2009.91 ms | 68.3% bf16 MFU | 259130 tok/s +step 4640/18794 | loss 3.195302 (-0.65z)| norm 0.1645 (-1.60z)| lr 5.46e-03 | 2018.01 ms | 68.0% bf16 MFU | 259164 tok/s +step 4641/18794 | loss 3.106980 (-2.63z)| norm 0.1979 (-0.88z)| lr 5.46e-03 | 2041.14 ms | 67.2% bf16 MFU | 259049 tok/s +step 4642/18794 | loss 3.201241 (-0.45z)| norm 0.2342 (-0.11z)| lr 5.46e-03 | 2002.03 ms | 68.5% bf16 MFU | 259190 tok/s +step 4643/18794 | loss 3.231776 (+0.25z)| norm 0.2425 (+0.06z)| lr 5.46e-03 | 2025.19 ms | 67.8% bf16 MFU | 259175 tok/s +step 4644/18794 | loss 3.202099 (-0.42z)| norm 0.2695 (+0.64z)| lr 5.46e-03 | 2010.89 ms | 68.2% bf16 MFU | 259253 tok/s +step 4645/18794 | loss 3.116768 (-2.30z)| norm 0.2686 (+0.61z)| lr 5.46e-03 | 2008.90 ms | 68.3% bf16 MFU | 259339 tok/s +step 4646/18794 | loss 3.208066 (-0.24z)| norm 0.1967 (-1.00z)| lr 5.46e-03 | 2010.25 ms | 68.3% bf16 MFU | 259413 tok/s +step 4647/18794 | loss 3.216121 (-0.04z)| norm 0.2287 (-0.27z)| lr 5.46e-03 | 2017.37 ms | 68.0% bf16 MFU | 259436 tok/s +step 4648/18794 | loss 3.178476 (-0.89z)| norm 0.3068 (+1.51z)| lr 5.46e-03 | 2008.84 ms | 68.3% bf16 MFU | 259514 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.253343 +step 4649/18794 | loss 3.203866 (-0.29z)| norm 0.3406 (+2.25z)| lr 5.46e-03 | 2008.53 ms | 68.3% bf16 MFU | 259590 tok/s +step 4650/18794 | loss 3.152455 (-1.46z)| norm 0.2559 (+0.35z)| lr 5.46e-03 | 2010.36 ms | 68.3% bf16 MFU | 259650 tok/s +step 4651/18794 | loss 3.240311 (+0.60z)| norm 0.1849 (-1.23z)| lr 5.46e-03 | 2025.85 ms | 67.7% bf16 MFU | 259607 tok/s +step 4652/18794 | loss 3.193888 (-0.48z)| norm 0.2563 (+0.38z)| lr 5.46e-03 | 2016.66 ms | 68.0% bf16 MFU | 259626 tok/s +step 4653/18794 | loss 3.199368 (-0.34z)| norm 0.2706 (+0.69z)| lr 5.46e-03 | 2033.13 ms | 67.5% bf16 MFU | 259538 tok/s +step 4654/18794 | loss 3.201348 (-0.28z)| norm 0.2306 (-0.21z)| lr 5.46e-03 | 1987.74 ms | 69.0% bf16 MFU | 259749 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.977270 +step 4655/18794 | loss 3.249593 (+0.85z)| norm 0.3780 (+2.98z)| lr 5.46e-03 | 2026.61 ms | 67.7% bf16 MFU | 259697 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.697842 +step 4656/18794 | loss 3.243344 (+0.71z)| norm 0.3708 (+2.70z)| lr 5.46e-03 | 2034.61 ms | 67.4% bf16 MFU | 259596 tok/s +step 4657/18794 | loss 3.148712 (-1.49z)| norm 0.2048 (-0.76z)| lr 5.46e-03 | 1996.26 ms | 68.7% bf16 MFU | 259748 tok/s +step 4658/18794 | loss 3.242397 (+0.71z)| norm 0.2901 (+1.01z)| lr 5.46e-03 | 2010.21 ms | 68.3% bf16 MFU | 259802 tok/s +step 4659/18794 | loss 3.250713 (+0.91z)| norm 0.3104 (+1.40z)| lr 5.46e-03 | 2026.12 ms | 67.7% bf16 MFU | 259750 tok/s +step 4660/18794 | loss 3.266010 (+1.27z)| norm 0.1883 (-1.10z)| lr 5.46e-03 | 1994.14 ms | 68.8% bf16 MFU | 259908 tok/s +step 4661/18794 | loss 3.216374 (+0.09z)| norm 0.2067 (-0.71z)| lr 5.46e-03 | 2010.15 ms | 68.3% bf16 MFU | 259953 tok/s +step 4662/18794 | loss 3.154747 (-1.35z)| norm 0.2142 (-0.56z)| lr 5.46e-03 | 1986.76 ms | 69.1% bf16 MFU | 260150 tok/s +step 4663/18794 | loss 3.245697 (+0.79z)| norm 0.2247 (-0.35z)| lr 5.46e-03 | 2017.78 ms | 68.0% bf16 MFU | 260134 tok/s +step 4664/18794 | loss 3.202907 (-0.22z)| norm 0.2265 (-0.31z)| lr 5.45e-03 | 2017.32 ms | 68.0% bf16 MFU | 260122 tok/s +step 4665/18794 | loss 3.155555 (-1.32z)| norm 0.1995 (-0.86z)| lr 5.45e-03 | 2000.19 ms | 68.6% bf16 MFU | 260222 tok/s +step 4666/18794 | loss 3.231181 (+0.53z)| norm 0.2242 (-0.33z)| lr 5.45e-03 | 2033.90 ms | 67.5% bf16 MFU | 260100 tok/s +step 4667/18794 | loss 3.139088 (-1.70z)| norm 0.2086 (-0.65z)| lr 5.45e-03 | 2017.91 ms | 68.0% bf16 MFU | 260086 tok/s +step 4668/18794 | loss 3.202332 (-0.14z)| norm 0.1975 (-0.88z)| lr 5.45e-03 | 2017.78 ms | 68.0% bf16 MFU | 260073 tok/s +step 4669/18794 | loss 3.288142 (+1.95z)| norm 0.2224 (-0.35z)| lr 5.45e-03 | 2002.96 ms | 68.5% bf16 MFU | 260157 tok/s +step 4670/18794 | loss 3.196540 (-0.29z)| norm 0.2353 (-0.06z)| lr 5.45e-03 | 2009.64 ms | 68.3% bf16 MFU | 260194 tok/s +step 4671/18794 | loss 3.126853 (-1.95z)| norm 0.1820 (-1.20z)| lr 5.45e-03 | 2002.93 ms | 68.5% bf16 MFU | 260272 tok/s +step 4672/18794 | loss 3.187016 (-0.47z)| norm 0.2687 (+0.66z)| lr 5.45e-03 | 2026.31 ms | 67.7% bf16 MFU | 260196 tok/s +step 4673/18794 | loss 3.193791 (-0.29z)| norm 0.2579 (+0.42z)| lr 5.45e-03 | 2025.98 ms | 67.7% bf16 MFU | 260125 tok/s +step 4674/18794 | loss 3.192929 (-0.31z)| norm 0.2234 (-0.33z)| lr 5.45e-03 | 2018.88 ms | 68.0% bf16 MFU | 260103 tok/s +step 4675/18794 | loss 3.238673 (+0.81z)| norm 0.1881 (-1.09z)| lr 5.45e-03 | 2002.18 ms | 68.5% bf16 MFU | 260191 tok/s +step 4676/18794 | loss 3.249798 (+1.07z)| norm 0.2028 (-0.77z)| lr 5.45e-03 | 2001.80 ms | 68.6% bf16 MFU | 260277 tok/s +step 4677/18794 | loss 3.214486 (+0.20z)| norm 0.2194 (-0.41z)| lr 5.45e-03 | 2011.27 ms | 68.2% bf16 MFU | 260297 tok/s +step 4678/18794 | loss 3.143536 (-1.53z)| norm 0.1950 (-0.92z)| lr 5.45e-03 | 2002.11 ms | 68.5% bf16 MFU | 260375 tok/s +step 4679/18794 | loss 3.254564 (+1.20z)| norm 0.2020 (-0.75z)| lr 5.45e-03 | 2003.68 ms | 68.5% bf16 MFU | 260440 tok/s +step 4680/18794 | loss 3.162308 (-1.05z)| norm 0.2168 (-0.42z)| lr 5.45e-03 | 2019.07 ms | 68.0% bf16 MFU | 260401 tok/s +step 4681/18794 | loss 3.181627 (-0.58z)| norm 0.2659 (+0.72z)| lr 5.45e-03 | 1986.76 ms | 69.1% bf16 MFU | 260576 tok/s +step 4682/18794 | loss 3.229070 (+0.57z)| norm 0.2459 (+0.28z)| lr 5.45e-03 | 2011.93 ms | 68.2% bf16 MFU | 260576 tok/s +step 4683/18794 | loss 3.152643 (-1.28z)| norm 0.1819 (-1.21z)| lr 5.45e-03 | 2019.03 ms | 68.0% bf16 MFU | 260531 tok/s +step 4684/18794 | loss 3.226682 (+0.54z)| norm 0.2346 (+0.03z)| lr 5.45e-03 | 2001.81 ms | 68.6% bf16 MFU | 260600 tok/s +step 4685/18794 | loss 3.171318 (-0.81z)| norm 0.2880 (+1.26z)| lr 5.45e-03 | 2010.76 ms | 68.2% bf16 MFU | 260607 tok/s +step 4686/18794 | loss 3.117542 (-2.07z)| norm 0.2166 (-0.40z)| lr 5.45e-03 | 2010.94 ms | 68.2% bf16 MFU | 260613 tok/s +step 4687/18794 | loss 3.198896 (-0.11z)| norm 0.2281 (-0.13z)| lr 5.45e-03 | 1993.68 ms | 68.8% bf16 MFU | 260731 tok/s +step 4688/18794 | loss 3.178565 (-0.59z)| norm 0.2338 (+0.00z)| lr 5.45e-03 | 2002.16 ms | 68.5% bf16 MFU | 260787 tok/s +step 4689/18794 | loss 3.221976 (+0.47z)| norm 0.2533 (+0.45z)| lr 5.45e-03 | 1985.95 ms | 69.1% bf16 MFU | 260948 tok/s +step 4690/18794 | loss 3.191071 (-0.30z)| norm 0.2536 (+0.46z)| lr 5.45e-03 | 2003.48 ms | 68.5% bf16 MFU | 260985 tok/s +step 4691/18794 | loss 3.229045 (+0.62z)| norm 0.2351 (+0.03z)| lr 5.45e-03 | 1986.10 ms | 69.1% bf16 MFU | 261134 tok/s +step 4692/18794 | loss 3.155232 (-1.17z)| norm 0.2286 (-0.12z)| lr 5.45e-03 | 1994.86 ms | 68.8% bf16 MFU | 261219 tok/s +step 4693/18794 | loss 3.197646 (-0.13z)| norm 0.2220 (-0.28z)| lr 5.45e-03 | 2018.71 ms | 68.0% bf16 MFU | 261144 tok/s +step 4694/18794 | loss 3.182243 (-0.49z)| norm 0.2443 (+0.24z)| lr 5.45e-03 | 2004.69 ms | 68.5% bf16 MFU | 261163 tok/s +step 4695/18794 | loss 3.208947 (+0.15z)| norm 0.2000 (-0.79z)| lr 5.45e-03 | 1994.49 ms | 68.8% bf16 MFU | 261248 tok/s +step 4696/18794 | loss 3.211392 (+0.22z)| norm 0.1935 (-0.94z)| lr 5.45e-03 | 1994.94 ms | 68.8% bf16 MFU | 261326 tok/s +step 4697/18794 | loss 3.186373 (-0.39z)| norm 0.2131 (-0.49z)| lr 5.45e-03 | 2009.89 ms | 68.3% bf16 MFU | 261303 tok/s +step 4698/18794 | loss 3.185528 (-0.41z)| norm 0.2210 (-0.30z)| lr 5.44e-03 | 2004.02 ms | 68.5% bf16 MFU | 261318 tok/s +step 4699/18794 | loss 3.185139 (-0.42z)| norm 0.2489 (+0.34z)| lr 5.44e-03 | 1994.60 ms | 68.8% bf16 MFU | 261395 tok/s +step 4700/18794 | loss 3.254726 (+1.30z)| norm 0.3105 (+1.73z)| lr 5.44e-03 | 1985.94 ms | 69.1% bf16 MFU | 261525 tok/s +step 4701/18794 | loss 3.213407 (+0.30z)| norm 0.2909 (+1.25z)| lr 5.44e-03 | 2002.47 ms | 68.5% bf16 MFU | 261540 tok/s +step 4702/18794 | loss 3.157692 (-1.10z)| norm 0.2283 (-0.17z)| lr 5.44e-03 | 2002.52 ms | 68.5% bf16 MFU | 261554 tok/s +step 4703/18794 | loss 3.198729 (-0.07z)| norm 0.2010 (-0.78z)| lr 5.44e-03 | 2019.21 ms | 68.0% bf16 MFU | 261459 tok/s +step 4704/18794 | loss 3.173315 (-0.71z)| norm 0.1764 (-1.33z)| lr 5.44e-03 | 1985.91 ms | 69.1% bf16 MFU | 261586 tok/s +step 4705/18794 | loss 3.264798 (+1.58z)| norm 0.2505 (+0.35z)| lr 5.44e-03 | 2001.40 ms | 68.6% bf16 MFU | 261605 tok/s +step 4706/18794 | loss 3.149624 (-1.32z)| norm 0.2873 (+1.17z)| lr 5.44e-03 | 2003.96 ms | 68.5% bf16 MFU | 261606 tok/s +step 4707/18794 | loss 3.165354 (-0.91z)| norm 0.2226 (-0.31z)| lr 5.44e-03 | 1978.87 ms | 69.3% bf16 MFU | 261773 tok/s +step 4708/18794 | loss 3.178517 (-0.56z)| norm 0.2649 (+0.67z)| lr 5.44e-03 | 1986.80 ms | 69.1% bf16 MFU | 261878 tok/s +reducing beta2 to 0.9 and lr/wd by 0.953 due to grad z-score of 3.671510 +step 4709/18794 | loss 3.126772 (-1.82z)| norm 0.4093 (+3.67z)| lr 5.19e-03 | 2002.52 ms | 68.5% bf16 MFU | 261875 tok/s +step 4710/18794 | loss 3.140568 (-1.45z)| norm 0.3330 (+1.98z)| lr 5.44e-03 | 2011.10 ms | 68.2% bf16 MFU | 261816 tok/s +step 4711/18794 | loss 3.241810 (+1.09z)| norm 0.2042 (-0.70z)| lr 5.44e-03 | 1996.13 ms | 68.7% bf16 MFU | 261858 tok/s +step 4712/18794 | loss 3.235014 (+0.91z)| norm 0.2946 (+1.20z)| lr 5.44e-03 | 2004.27 ms | 68.5% bf16 MFU | 261844 tok/s +step 4713/18794 | loss 3.205143 (+0.18z)| norm 0.2390 (+0.03z)| lr 5.44e-03 | 1985.70 ms | 69.1% bf16 MFU | 261954 tok/s +step 4714/18794 | loss 3.235148 (+0.93z)| norm 0.1839 (-1.12z)| lr 5.44e-03 | 1987.70 ms | 69.0% bf16 MFU | 262044 tok/s +step 4715/18794 | loss 3.182462 (-0.41z)| norm 0.2080 (-0.60z)| lr 5.44e-03 | 1996.85 ms | 68.7% bf16 MFU | 262070 tok/s +step 4716/18794 | loss 3.233534 (+0.89z)| norm 0.1831 (-1.10z)| lr 5.44e-03 | 2003.75 ms | 68.5% bf16 MFU | 262049 tok/s +step 4717/18794 | loss 3.207154 (+0.23z)| norm 0.1781 (-1.19z)| lr 5.44e-03 | 1994.91 ms | 68.8% bf16 MFU | 262087 tok/s +step 4718/18794 | loss 3.150229 (-1.23z)| norm 0.2015 (-0.71z)| lr 5.44e-03 | 2004.04 ms | 68.5% bf16 MFU | 262064 tok/s +step 4719/18794 | loss 3.160729 (-0.95z)| norm 0.2256 (-0.22z)| lr 5.44e-03 | 1980.57 ms | 69.3% bf16 MFU | 262196 tok/s +step 4720/18794 | loss 3.227908 (+0.77z)| norm 0.2287 (-0.15z)| lr 5.44e-03 | 1987.43 ms | 69.1% bf16 MFU | 262276 tok/s +step 4721/18794 | loss 3.157865 (-1.00z)| norm 0.2050 (-0.63z)| lr 5.44e-03 | 1995.91 ms | 68.8% bf16 MFU | 262297 tok/s +step 4722/18794 | loss 3.191329 (-0.16z)| norm 0.2261 (-0.16z)| lr 5.44e-03 | 2011.58 ms | 68.2% bf16 MFU | 262214 tok/s +step 4723/18794 | loss 3.169153 (-0.71z)| norm 0.2525 (+0.43z)| lr 5.44e-03 | 2003.02 ms | 68.5% bf16 MFU | 262190 tok/s +step 4724/18794 | loss 3.257362 (+1.51z)| norm 0.2056 (-0.60z)| lr 5.44e-03 | 1987.05 ms | 69.1% bf16 MFU | 262274 tok/s +step 4725/18794 | loss 3.249766 (+1.30z)| norm 0.2494 (+0.36z)| lr 5.44e-03 | 2011.20 ms | 68.2% bf16 MFU | 262194 tok/s +step 4726/18794 | loss 3.236217 (+1.00z)| norm 0.2117 (-0.46z)| lr 5.44e-03 | 1980.54 ms | 69.3% bf16 MFU | 262320 tok/s +step 4727/18794 | loss 3.166997 (-0.77z)| norm 0.1969 (-0.79z)| lr 5.44e-03 | 1986.35 ms | 69.1% bf16 MFU | 262402 tok/s +step 4728/18794 | loss 3.182902 (-0.35z)| norm 0.1853 (-1.03z)| lr 5.44e-03 | 1979.03 ms | 69.3% bf16 MFU | 262528 tok/s +step 4729/18794 | loss 3.186074 (-0.27z)| norm 0.1933 (-0.84z)| lr 5.44e-03 | 2003.20 ms | 68.5% bf16 MFU | 262488 tok/s +step 4730/18794 | loss 3.225806 (+0.74z)| norm 0.1949 (-0.80z)| lr 5.44e-03 | 2004.15 ms | 68.5% bf16 MFU | 262443 tok/s +step 4731/18794 | loss 3.247963 (+1.30z)| norm 0.1996 (-0.69z)| lr 5.44e-03 | 1987.19 ms | 69.1% bf16 MFU | 262513 tok/s +step 4732/18794 | loss 3.211737 (+0.37z)| norm 0.2418 (+0.23z)| lr 5.44e-03 | 1987.36 ms | 69.1% bf16 MFU | 262578 tok/s +step 4733/18794 | loss 3.193443 (-0.11z)| norm 0.2723 (+0.88z)| lr 5.43e-03 | 1982.27 ms | 69.2% bf16 MFU | 262673 tok/s +step 4734/18794 | loss 3.202797 (+0.13z)| norm 0.2722 (+0.86z)| lr 5.43e-03 | 1995.30 ms | 68.8% bf16 MFU | 262678 tok/s +step 4735/18794 | loss 3.217008 (+0.50z)| norm 0.2515 (+0.40z)| lr 5.43e-03 | 1987.08 ms | 69.1% bf16 MFU | 262736 tok/s +step 4736/18794 | loss 3.213851 (+0.41z)| norm 0.2596 (+0.56z)| lr 5.43e-03 | 1995.61 ms | 68.8% bf16 MFU | 262735 tok/s +step 4737/18794 | loss 3.247409 (+1.26z)| norm 0.2646 (+0.66z)| lr 5.43e-03 | 1987.87 ms | 69.0% bf16 MFU | 262786 tok/s +step 4738/18794 | loss 3.228617 (+0.77z)| norm 0.2787 (+0.95z)| lr 5.43e-03 | 1987.10 ms | 69.1% bf16 MFU | 262839 tok/s +step 4739/18794 | loss 3.219956 (+0.54z)| norm 0.2591 (+0.50z)| lr 5.43e-03 | 1987.61 ms | 69.0% bf16 MFU | 262886 tok/s +step 4740/18794 | loss 3.222656 (+0.60z)| norm 0.2342 (-0.07z)| lr 5.43e-03 | 1994.95 ms | 68.8% bf16 MFU | 262882 tok/s +step 4741/18794 | loss 3.397271 (+4.69z)| norm 0.2710 (+0.75z)| lr 5.43e-03 | 1982.28 ms | 69.2% bf16 MFU | 262962 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.347109 +step 4742/18794 | loss 3.120249 (-1.93z)| norm 0.3458 (+2.35z)| lr 5.43e-03 | 1981.02 ms | 69.3% bf16 MFU | 263047 tok/s +step 4743/18794 | loss 3.212584 (+0.25z)| norm 0.3017 (+1.35z)| lr 5.43e-03 | 1995.09 ms | 68.8% bf16 MFU | 263034 tok/s +step 4744/18794 | loss 3.203658 (+0.04z)| norm 0.2551 (+0.34z)| lr 5.43e-03 | 1981.45 ms | 69.3% bf16 MFU | 263112 tok/s +step 4745/18794 | loss 3.123373 (-1.87z)| norm 0.2801 (+0.88z)| lr 5.43e-03 | 1981.67 ms | 69.3% bf16 MFU | 263185 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.508417 +step 4746/18794 | loss 3.194138 (-0.18z)| norm 0.3597 (+2.51z)| lr 5.43e-03 | 1986.61 ms | 69.1% bf16 MFU | 263221 tok/s +step 4747/18794 | loss 3.176629 (-0.59z)| norm 0.3090 (+1.40z)| lr 5.43e-03 | 1980.17 ms | 69.3% bf16 MFU | 263299 tok/s +step 4748/18794 | loss 3.147699 (-1.26z)| norm 0.1757 (-1.37z)| lr 5.43e-03 | 1979.06 ms | 69.3% bf16 MFU | 263380 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.135047 +step 4749/18794 | loss 3.217010 (+0.37z)| norm 0.3427 (+2.14z)| lr 5.43e-03 | 1989.86 ms | 69.0% bf16 MFU | 263385 tok/s +step 4750/18794 | loss 3.219622 (+0.42z)| norm 0.2418 (+0.02z)| lr 5.43e-03 | 1983.24 ms | 69.2% bf16 MFU | 263433 tok/s +val loss 3.258681 +HellaSwag: 2831/10042 = 0.281916: 0/1256 +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.506829 +step 4751/18794 | loss 3.215382 (+0.33z)| norm 0.3652 (+2.51z)| lr 5.43e-03 | 1988.79 ms | 69.0% bf16 MFU | 263443 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.377912 +step 4752/18794 | loss 3.168858 (-0.77z)| norm 0.3633 (+2.38z)| lr 5.43e-03 | 1980.66 ms | 69.3% bf16 MFU | 263506 tok/s +step 4753/18794 | loss 3.264122 (+1.46z)| norm 0.1911 (-1.02z)| lr 5.43e-03 | 1980.32 ms | 69.3% bf16 MFU | 263568 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.003219 +step 4754/18794 | loss 3.116436 (-1.95z)| norm 0.3473 (+2.00z)| lr 5.43e-03 | 1980.93 ms | 69.3% bf16 MFU | 263623 tok/s +step 4755/18794 | loss 3.251129 (+1.15z)| norm 0.2752 (+0.65z)| lr 5.43e-03 | 1979.69 ms | 69.3% bf16 MFU | 263683 tok/s +step 4756/18794 | loss 3.190953 (-0.22z)| norm 0.2076 (-0.70z)| lr 5.43e-03 | 1988.01 ms | 69.0% bf16 MFU | 263686 tok/s +step 4757/18794 | loss 3.228743 (+0.63z)| norm 0.2614 (+0.41z)| lr 5.43e-03 | 1980.99 ms | 69.3% bf16 MFU | 263734 tok/s +step 4758/18794 | loss 3.273472 (+1.65z)| norm 0.2518 (+0.22z)| lr 5.43e-03 | 1988.52 ms | 69.0% bf16 MFU | 263730 tok/s +step 4759/18794 | loss 3.218572 (+0.40z)| norm 0.1627 (-1.60z)| lr 5.43e-03 | 1989.30 ms | 69.0% bf16 MFU | 263722 tok/s +step 4760/18794 | loss 3.237796 (+0.85z)| norm 0.2227 (-0.37z)| lr 5.43e-03 | 1987.67 ms | 69.0% bf16 MFU | 263724 tok/s +step 4761/18794 | loss 3.184922 (-0.37z)| norm 0.1982 (-0.88z)| lr 5.43e-03 | 1979.84 ms | 69.3% bf16 MFU | 263778 tok/s +step 4762/18794 | loss 3.204681 (+0.08z)| norm 0.3069 (+1.36z)| lr 5.43e-03 | 1980.50 ms | 69.3% bf16 MFU | 263826 tok/s +step 4763/18794 | loss 3.292041 (+2.08z)| norm 0.3071 (+1.34z)| lr 5.43e-03 | 1980.57 ms | 69.3% bf16 MFU | 263870 tok/s +step 4764/18794 | loss 3.231863 (+0.68z)| norm 0.2453 (+0.06z)| lr 5.43e-03 | 1980.52 ms | 69.3% bf16 MFU | 263913 tok/s +step 4765/18794 | loss 3.181976 (-0.47z)| norm 0.2445 (+0.04z)| lr 5.43e-03 | 1982.82 ms | 69.2% bf16 MFU | 263938 tok/s +step 4766/18794 | loss 3.227801 (+0.59z)| norm 0.2400 (-0.06z)| lr 5.43e-03 | 1999.33 ms | 68.6% bf16 MFU | 263853 tok/s +step 4767/18794 | loss 3.255372 (+1.20z)| norm 0.2015 (-0.85z)| lr 5.42e-03 | 2040.77 ms | 67.2% bf16 MFU | 263505 tok/s +step 4768/18794 | loss 3.215779 (+0.28z)| norm 0.1909 (-1.06z)| lr 5.42e-03 | 2030.88 ms | 67.6% bf16 MFU | 263238 tok/s +step 4769/18794 | loss 3.220295 (+0.41z)| norm 0.2117 (-0.63z)| lr 5.42e-03 | 2030.97 ms | 67.6% bf16 MFU | 262983 tok/s +step 4770/18794 | loss 3.257505 (+1.27z)| norm 0.2258 (-0.34z)| lr 5.42e-03 | 2047.40 ms | 67.0% bf16 MFU | 262638 tok/s +step 4771/18794 | loss 3.319917 (+2.63z)| norm 0.3088 (+1.33z)| lr 5.42e-03 | 2033.08 ms | 67.5% bf16 MFU | 262400 tok/s +step 4772/18794 | loss 3.236965 (+0.71z)| norm 0.2643 (+0.42z)| lr 5.42e-03 | 2039.07 ms | 67.3% bf16 MFU | 262136 tok/s +step 4773/18794 | loss 3.224087 (+0.41z)| norm 0.2059 (-0.76z)| lr 5.42e-03 | 2033.80 ms | 67.5% bf16 MFU | 261919 tok/s +step 4774/18794 | loss 3.266884 (+1.36z)| norm 0.2673 (+0.49z)| lr 5.42e-03 | 2036.40 ms | 67.4% bf16 MFU | 261696 tok/s +step 4775/18794 | loss 3.297778 (+2.01z)| norm 0.2108 (-0.68z)| lr 5.42e-03 | 2027.94 ms | 67.7% bf16 MFU | 261537 tok/s +step 4776/18794 | loss 3.220804 (+0.30z)| norm 0.2508 (+0.13z)| lr 5.42e-03 | 2037.70 ms | 67.3% bf16 MFU | 261325 tok/s +step 4777/18794 | loss 3.248223 (+0.91z)| norm 0.2664 (+0.45z)| lr 5.42e-03 | 2045.20 ms | 67.1% bf16 MFU | 261077 tok/s +step 4778/18794 | loss 3.264483 (+1.25z)| norm 0.1930 (-1.07z)| lr 5.42e-03 | 2036.82 ms | 67.4% bf16 MFU | 260893 tok/s +step 4779/18794 | loss 3.242044 (+0.75z)| norm 0.1932 (-1.06z)| lr 5.42e-03 | 2036.32 ms | 67.4% bf16 MFU | 260722 tok/s +step 4780/18794 | loss 3.281577 (+1.60z)| norm 0.2407 (-0.09z)| lr 5.42e-03 | 2029.68 ms | 67.6% bf16 MFU | 260601 tok/s +step 4781/18794 | loss 3.223635 (+0.30z)| norm 0.1980 (-0.95z)| lr 5.42e-03 | 2039.01 ms | 67.3% bf16 MFU | 260428 tok/s +step 4782/18794 | loss 3.278425 (+1.49z)| norm 0.2636 (+0.39z)| lr 5.42e-03 | 2042.97 ms | 67.2% bf16 MFU | 260238 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.221931 +step 4783/18794 | loss 3.220819 (+0.21z)| norm 0.4113 (+3.22z)| lr 5.42e-03 | 2028.80 ms | 67.6% bf16 MFU | 260147 tok/s +step 4784/18794 | loss 3.210291 (-0.02z)| norm 0.2761 (+0.57z)| lr 5.42e-03 | 2037.36 ms | 67.4% bf16 MFU | 260006 tok/s +step 4785/18794 | loss 3.237613 (+0.57z)| norm 0.2180 (-0.56z)| lr 5.42e-03 | 2048.11 ms | 67.0% bf16 MFU | 259805 tok/s +step 4786/18794 | loss 3.256033 (+0.97z)| norm 0.2645 (+0.35z)| lr 5.42e-03 | 2030.83 ms | 67.6% bf16 MFU | 259723 tok/s +step 4787/18794 | loss 3.240539 (+0.61z)| norm 0.2279 (-0.37z)| lr 5.42e-03 | 2039.32 ms | 67.3% bf16 MFU | 259592 tok/s +step 4788/18794 | loss 3.243339 (+0.66z)| norm 0.1993 (-0.92z)| lr 5.42e-03 | 2040.02 ms | 67.3% bf16 MFU | 259462 tok/s +step 4789/18794 | loss 3.217036 (+0.06z)| norm 0.2130 (-0.65z)| lr 5.42e-03 | 2033.46 ms | 67.5% bf16 MFU | 259380 tok/s +step 4790/18794 | loss 3.237259 (+0.51z)| norm 0.1900 (-1.07z)| lr 5.42e-03 | 2023.25 ms | 67.8% bf16 MFU | 259368 tok/s +step 4791/18794 | loss 3.258893 (+0.99z)| norm 0.2118 (-0.65z)| lr 5.42e-03 | 2024.94 ms | 67.8% bf16 MFU | 259345 tok/s +step 4792/18794 | loss 3.212897 (-0.07z)| norm 0.1747 (-1.34z)| lr 5.42e-03 | 2020.42 ms | 67.9% bf16 MFU | 259353 tok/s +step 4793/18794 | loss 3.237571 (+0.49z)| norm 0.1880 (-1.08z)| lr 5.42e-03 | 2027.73 ms | 67.7% bf16 MFU | 259313 tok/s +step 4794/18794 | loss 3.255071 (+0.87z)| norm 0.2039 (-0.76z)| lr 5.42e-03 | 2033.55 ms | 67.5% bf16 MFU | 259238 tok/s +step 4795/18794 | loss 3.219731 (+0.06z)| norm 0.2380 (-0.12z)| lr 5.42e-03 | 2030.65 ms | 67.6% bf16 MFU | 259186 tok/s +step 4796/18794 | loss 3.223012 (+0.13z)| norm 0.2140 (-0.58z)| lr 5.42e-03 | 2021.01 ms | 67.9% bf16 MFU | 259197 tok/s +step 4797/18794 | loss 3.285002 (+1.51z)| norm 0.1603 (-1.58z)| lr 5.42e-03 | 2028.13 ms | 67.7% bf16 MFU | 259163 tok/s +step 4798/18794 | loss 3.227953 (+0.21z)| norm 0.1895 (-1.02z)| lr 5.42e-03 | 2028.66 ms | 67.6% bf16 MFU | 259127 tok/s +step 4799/18794 | loss 3.232268 (+0.30z)| norm 0.2263 (-0.32z)| lr 5.42e-03 | 2029.11 ms | 67.6% bf16 MFU | 259090 tok/s +step 4800/18794 | loss 3.246359 (+0.62z)| norm 0.2192 (-0.44z)| lr 5.42e-03 | 2020.69 ms | 67.9% bf16 MFU | 259108 tok/s +step 4801/18794 | loss 3.263866 (+1.01z)| norm 0.2229 (-0.36z)| lr 5.41e-03 | 2010.91 ms | 68.2% bf16 MFU | 259189 tok/s +step 4802/18794 | loss 3.242942 (+0.52z)| norm 0.2476 (+0.10z)| lr 5.41e-03 | 2037.73 ms | 67.3% bf16 MFU | 259094 tok/s +step 4803/18794 | loss 3.219859 (-0.01z)| norm 0.2303 (-0.23z)| lr 5.41e-03 | 2005.97 ms | 68.4% bf16 MFU | 259207 tok/s +step 4804/18794 | loss 3.225449 (+0.10z)| norm 0.2292 (-0.26z)| lr 5.41e-03 | 2031.66 ms | 67.5% bf16 MFU | 259150 tok/s +step 4805/18794 | loss 3.199148 (-0.49z)| norm 0.2785 (+0.68z)| lr 5.41e-03 | 2025.41 ms | 67.8% bf16 MFU | 259135 tok/s +step 4806/18794 | loss 3.195372 (-0.60z)| norm 0.3021 (+1.12z)| lr 5.41e-03 | 2031.94 ms | 67.5% bf16 MFU | 259080 tok/s +step 4807/18794 | loss 3.295190 (+1.70z)| norm 0.2496 (+0.11z)| lr 5.41e-03 | 2030.87 ms | 67.6% bf16 MFU | 259034 tok/s +step 4808/18794 | loss 3.263294 (+0.94z)| norm 0.1836 (-1.13z)| lr 5.41e-03 | 2047.51 ms | 67.0% bf16 MFU | 258885 tok/s +step 4809/18794 | loss 3.272344 (+1.14z)| norm 0.2150 (-0.52z)| lr 5.41e-03 | 2014.08 ms | 68.1% bf16 MFU | 258956 tok/s +step 4810/18794 | loss 3.207203 (-0.43z)| norm 0.2061 (-0.68z)| lr 5.41e-03 | 2013.04 ms | 68.2% bf16 MFU | 259031 tok/s +step 4811/18794 | loss 3.230907 (+0.14z)| norm 0.2523 (+0.25z)| lr 5.41e-03 | 2029.46 ms | 67.6% bf16 MFU | 258996 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.169200 +step 4812/18794 | loss 3.254407 (+0.71z)| norm 0.3487 (+2.17z)| lr 5.41e-03 | 2034.30 ms | 67.5% bf16 MFU | 258933 tok/s +step 4813/18794 | loss 3.201523 (-0.57z)| norm 0.3267 (+1.69z)| lr 5.41e-03 | 2020.35 ms | 67.9% bf16 MFU | 258961 tok/s +step 4814/18794 | loss 3.257669 (+0.78z)| norm 0.2475 (+0.11z)| lr 5.41e-03 | 2035.13 ms | 67.4% bf16 MFU | 258894 tok/s +step 4815/18794 | loss 3.216081 (-0.23z)| norm 0.2282 (-0.28z)| lr 5.41e-03 | 2024.88 ms | 67.8% bf16 MFU | 258895 tok/s +step 4816/18794 | loss 3.189598 (-0.86z)| norm 0.2538 (+0.22z)| lr 5.41e-03 | 2025.60 ms | 67.7% bf16 MFU | 258892 tok/s +step 4817/18794 | loss 3.242815 (+0.41z)| norm 0.1775 (-1.32z)| lr 5.41e-03 | 2021.08 ms | 67.9% bf16 MFU | 258918 tok/s +step 4818/18794 | loss 3.277245 (+1.23z)| norm 0.2962 (+1.05z)| lr 5.41e-03 | 2017.16 ms | 68.0% bf16 MFU | 258968 tok/s +step 4819/18794 | loss 3.291199 (+1.53z)| norm 0.3147 (+1.39z)| lr 5.41e-03 | 2039.84 ms | 67.3% bf16 MFU | 258871 tok/s +step 4820/18794 | loss 3.219392 (-0.21z)| norm 0.2042 (-0.80z)| lr 5.41e-03 | 2027.45 ms | 67.7% bf16 MFU | 258857 tok/s +step 4821/18794 | loss 3.188631 (-0.98z)| norm 0.2503 (+0.10z)| lr 5.41e-03 | 2025.72 ms | 67.7% bf16 MFU | 258855 tok/s +step 4822/18794 | loss 3.236623 (+0.19z)| norm 0.2836 (+0.75z)| lr 5.41e-03 | 2005.70 ms | 68.4% bf16 MFU | 258982 tok/s +step 4823/18794 | loss 3.274521 (+1.11z)| norm 0.3123 (+1.30z)| lr 5.41e-03 | 2023.19 ms | 67.8% bf16 MFU | 258990 tok/s +step 4824/18794 | loss 3.222731 (-0.17z)| norm 0.2495 (+0.06z)| lr 5.41e-03 | 2026.09 ms | 67.7% bf16 MFU | 258979 tok/s +step 4825/18794 | loss 3.278240 (+1.20z)| norm 0.2122 (-0.67z)| lr 5.41e-03 | 2016.20 ms | 68.1% bf16 MFU | 259032 tok/s +step 4826/18794 | loss 3.218472 (-0.28z)| norm 0.2528 (+0.12z)| lr 5.41e-03 | 2010.49 ms | 68.3% bf16 MFU | 259119 tok/s +step 4827/18794 | loss 3.250574 (+0.50z)| norm 0.2265 (-0.41z)| lr 5.41e-03 | 2011.08 ms | 68.2% bf16 MFU | 259198 tok/s +step 4828/18794 | loss 3.337167 (+2.56z)| norm 0.2130 (-0.68z)| lr 5.41e-03 | 2039.60 ms | 67.3% bf16 MFU | 259091 tok/s +step 4829/18794 | loss 3.303372 (+1.69z)| norm 0.2180 (-0.59z)| lr 5.41e-03 | 2015.20 ms | 68.1% bf16 MFU | 259145 tok/s +step 4830/18794 | loss 3.279262 (+1.09z)| norm 0.2315 (-0.33z)| lr 5.41e-03 | 2037.56 ms | 67.4% bf16 MFU | 259053 tok/s +step 4831/18794 | loss 3.267348 (+0.80z)| norm 0.1924 (-1.11z)| lr 5.41e-03 | 2020.33 ms | 67.9% bf16 MFU | 259076 tok/s +step 4832/18794 | loss 3.258649 (+0.58z)| norm 0.1805 (-1.33z)| lr 5.41e-03 | 2005.61 ms | 68.4% bf16 MFU | 259192 tok/s +step 4833/18794 | loss 3.235544 (+0.02z)| norm 0.1992 (-0.94z)| lr 5.41e-03 | 2013.54 ms | 68.2% bf16 MFU | 259252 tok/s +step 4834/18794 | loss 3.228049 (-0.17z)| norm 0.1967 (-0.97z)| lr 5.40e-03 | 2018.57 ms | 68.0% bf16 MFU | 259276 tok/s +step 4835/18794 | loss 3.239115 (+0.09z)| norm 0.2055 (-0.78z)| lr 5.40e-03 | 2023.29 ms | 67.8% bf16 MFU | 259268 tok/s +step 4836/18794 | loss 3.262712 (+0.65z)| norm 0.3057 (+1.17z)| lr 5.40e-03 | 2010.84 ms | 68.2% bf16 MFU | 259341 tok/s +step 4837/18794 | loss 3.273632 (+0.90z)| norm 0.2278 (-0.34z)| lr 5.40e-03 | 2013.65 ms | 68.2% bf16 MFU | 259393 tok/s +step 4838/18794 | loss 3.286938 (+1.20z)| norm 0.1756 (-1.33z)| lr 5.40e-03 | 2020.93 ms | 67.9% bf16 MFU | 259395 tok/s +step 4839/18794 | loss 3.271943 (+0.83z)| norm 0.1882 (-1.07z)| lr 5.40e-03 | 2031.47 ms | 67.6% bf16 MFU | 259329 tok/s +step 4840/18794 | loss 3.225749 (-0.27z)| norm 0.1967 (-0.90z)| lr 5.40e-03 | 2010.10 ms | 68.3% bf16 MFU | 259404 tok/s +step 4841/18794 | loss 3.249892 (+0.36z)| norm 0.2128 (-0.57z)| lr 5.40e-03 | 2024.37 ms | 67.8% bf16 MFU | 259383 tok/s +step 4842/18794 | loss 3.220796 (-0.43z)| norm 0.2327 (-0.17z)| lr 5.40e-03 | 2005.40 ms | 68.4% bf16 MFU | 259486 tok/s +step 4843/18794 | loss 3.268535 (+0.84z)| norm 0.2653 (+0.48z)| lr 5.40e-03 | 1989.63 ms | 69.0% bf16 MFU | 259687 tok/s +step 4844/18794 | loss 3.236213 (-0.04z)| norm 0.1947 (-0.90z)| lr 5.40e-03 | 2026.35 ms | 67.7% bf16 MFU | 259639 tok/s +step 4845/18794 | loss 3.251031 (+0.34z)| norm 0.2421 (+0.04z)| lr 5.40e-03 | 2013.43 ms | 68.2% bf16 MFU | 259677 tok/s +step 4846/18794 | loss 3.261168 (+0.62z)| norm 0.2894 (+1.01z)| lr 5.40e-03 | 2013.93 ms | 68.1% bf16 MFU | 259710 tok/s +step 4847/18794 | loss 3.201515 (-1.11z)| norm 0.2423 (+0.07z)| lr 5.40e-03 | 2008.78 ms | 68.3% bf16 MFU | 259774 tok/s +step 4848/18794 | loss 3.231235 (-0.28z)| norm 0.2212 (-0.37z)| lr 5.40e-03 | 2001.29 ms | 68.6% bf16 MFU | 259884 tok/s +step 4849/18794 | loss 3.222286 (-0.55z)| norm 0.2424 (+0.09z)| lr 5.40e-03 | 1996.41 ms | 68.7% bf16 MFU | 260021 tok/s +step 4850/18794 | loss 3.222795 (-0.54z)| norm 0.2110 (-0.57z)| lr 5.40e-03 | 2013.24 ms | 68.2% bf16 MFU | 260041 tok/s +step 4851/18794 | loss 3.232078 (-0.27z)| norm 0.1735 (-1.35z)| lr 5.40e-03 | 2012.05 ms | 68.2% bf16 MFU | 260068 tok/s +step 4852/18794 | loss 3.245070 (+0.10z)| norm 0.1864 (-1.07z)| lr 5.40e-03 | 2011.83 ms | 68.2% bf16 MFU | 260094 tok/s +step 4853/18794 | loss 3.227272 (-0.43z)| norm 0.2046 (-0.67z)| lr 5.40e-03 | 2029.90 ms | 67.6% bf16 MFU | 260004 tok/s +step 4854/18794 | loss 3.235395 (-0.24z)| norm 0.2201 (-0.30z)| lr 5.40e-03 | 2028.05 ms | 67.7% bf16 MFU | 259929 tok/s +step 4855/18794 | loss 3.291321 (+1.60z)| norm 0.2159 (-0.39z)| lr 5.40e-03 | 2000.54 ms | 68.6% bf16 MFU | 260037 tok/s +step 4856/18794 | loss 3.258200 (+0.49z)| norm 0.1860 (-1.07z)| lr 5.40e-03 | 2000.17 ms | 68.6% bf16 MFU | 260141 tok/s +step 4857/18794 | loss 3.241177 (-0.08z)| norm 0.2272 (-0.11z)| lr 5.40e-03 | 2016.05 ms | 68.1% bf16 MFU | 260137 tok/s +reducing beta2 to 0.9 and lr/wd by 0.946 due to grad z-score of 3.698463 +step 4858/18794 | loss 3.284535 (+1.36z)| norm 0.4048 (+3.70z)| lr 5.11e-03 | 2001.11 ms | 68.6% bf16 MFU | 260230 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.970734 +step 4859/18794 | loss 3.289102 (+1.48z)| norm 0.3781 (+2.97z)| lr 5.40e-03 | 2012.89 ms | 68.2% bf16 MFU | 260242 tok/s +step 4860/18794 | loss 3.249099 (+0.15z)| norm 0.3166 (+1.65z)| lr 5.40e-03 | 2008.57 ms | 68.3% bf16 MFU | 260281 tok/s +step 4861/18794 | loss 3.225608 (-0.66z)| norm 0.2564 (+0.40z)| lr 5.40e-03 | 2010.38 ms | 68.3% bf16 MFU | 260306 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.624544 +step 4862/18794 | loss 3.265005 (+0.66z)| norm 0.3683 (+2.62z)| lr 5.40e-03 | 2007.59 ms | 68.4% bf16 MFU | 260348 tok/s +step 4863/18794 | loss 3.249612 (+0.15z)| norm 0.2147 (-0.45z)| lr 5.40e-03 | 2006.72 ms | 68.4% bf16 MFU | 260394 tok/s +step 4864/18794 | loss 3.286879 (+1.40z)| norm 0.2162 (-0.41z)| lr 5.40e-03 | 2015.21 ms | 68.1% bf16 MFU | 260383 tok/s +step 4865/18794 | loss 3.249977 (+0.12z)| norm 0.2957 (+1.18z)| lr 5.40e-03 | 2009.56 ms | 68.3% bf16 MFU | 260409 tok/s +step 4866/18794 | loss 3.247522 (+0.03z)| norm 0.2638 (+0.53z)| lr 5.40e-03 | 2028.23 ms | 67.7% bf16 MFU | 260313 tok/s +step 4867/18794 | loss 3.186104 (-2.05z)| norm 0.1708 (-1.32z)| lr 5.40e-03 | 2015.78 ms | 68.1% bf16 MFU | 260302 tok/s +step 4868/18794 | loss 3.210573 (-1.21z)| norm 0.1764 (-1.20z)| lr 5.39e-03 | 1989.04 ms | 69.0% bf16 MFU | 260466 tok/s +step 4869/18794 | loss 3.262553 (+0.56z)| norm 0.2252 (-0.24z)| lr 5.39e-03 | 2013.09 ms | 68.2% bf16 MFU | 260465 tok/s +step 4870/18794 | loss 3.233989 (-0.41z)| norm 0.2307 (-0.13z)| lr 5.39e-03 | 1997.91 ms | 68.7% bf16 MFU | 260563 tok/s +step 4871/18794 | loss 3.215803 (-1.03z)| norm 0.1942 (-0.84z)| lr 5.39e-03 | 1992.98 ms | 68.9% bf16 MFU | 260688 tok/s +step 4872/18794 | loss 3.199513 (-1.57z)| norm 0.2898 (+1.07z)| lr 5.39e-03 | 2018.31 ms | 68.0% bf16 MFU | 260642 tok/s +step 4873/18794 | loss 3.252833 (+0.28z)| norm 0.2399 (+0.07z)| lr 5.39e-03 | 2022.19 ms | 67.9% bf16 MFU | 260573 tok/s +step 4874/18794 | loss 3.235744 (-0.31z)| norm 0.2049 (-0.62z)| lr 5.39e-03 | 2006.19 ms | 68.4% bf16 MFU | 260611 tok/s +step 4875/18794 | loss 3.204314 (-1.39z)| norm 0.1837 (-1.04z)| lr 5.39e-03 | 1999.83 ms | 68.6% bf16 MFU | 260689 tok/s +step 4876/18794 | loss 3.213358 (-1.07z)| norm 0.1997 (-0.71z)| lr 5.39e-03 | 1999.45 ms | 68.6% bf16 MFU | 260765 tok/s +step 4877/18794 | loss 3.242303 (-0.04z)| norm 0.2462 (+0.23z)| lr 5.39e-03 | 2003.56 ms | 68.5% bf16 MFU | 260811 tok/s +step 4878/18794 | loss 3.275554 (+1.12z)| norm 0.2287 (-0.13z)| lr 5.39e-03 | 2005.18 ms | 68.4% bf16 MFU | 260844 tok/s +step 4879/18794 | loss 3.202571 (-1.41z)| norm 0.2615 (+0.51z)| lr 5.39e-03 | 2004.82 ms | 68.5% bf16 MFU | 260877 tok/s +step 4880/18794 | loss 3.189826 (-1.81z)| norm 0.2064 (-0.58z)| lr 5.39e-03 | 1987.50 ms | 69.0% bf16 MFU | 261023 tok/s +step 4881/18794 | loss 3.263402 (+0.71z)| norm 0.2396 (+0.07z)| lr 5.39e-03 | 2012.57 ms | 68.2% bf16 MFU | 260997 tok/s +step 4882/18794 | loss 3.253152 (+0.37z)| norm 0.2822 (+0.92z)| lr 5.39e-03 | 1999.23 ms | 68.6% bf16 MFU | 261059 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.029886 +step 4883/18794 | loss 3.268843 (+0.90z)| norm 0.3323 (+2.03z)| lr 5.39e-03 | 2003.22 ms | 68.5% bf16 MFU | 261093 tok/s +step 4884/18794 | loss 3.246854 (+0.12z)| norm 0.2566 (+0.45z)| lr 5.39e-03 | 2017.12 ms | 68.0% bf16 MFU | 261034 tok/s +step 4885/18794 | loss 3.239194 (-0.15z)| norm 0.1943 (-0.85z)| lr 5.39e-03 | 1997.89 ms | 68.7% bf16 MFU | 261103 tok/s +step 4886/18794 | loss 3.201502 (-1.43z)| norm 0.1943 (-0.84z)| lr 5.39e-03 | 2002.75 ms | 68.5% bf16 MFU | 261137 tok/s +step 4887/18794 | loss 3.244306 (+0.05z)| norm 0.1898 (-0.92z)| lr 5.39e-03 | 2000.66 ms | 68.6% bf16 MFU | 261183 tok/s +step 4888/18794 | loss 3.303049 (+2.02z)| norm 0.2437 (+0.20z)| lr 5.39e-03 | 2014.46 ms | 68.1% bf16 MFU | 261137 tok/s +step 4889/18794 | loss 3.224683 (-0.64z)| norm 0.2253 (-0.19z)| lr 5.39e-03 | 1987.97 ms | 69.0% bf16 MFU | 261267 tok/s +step 4890/18794 | loss 3.227699 (-0.53z)| norm 0.1661 (-1.42z)| lr 5.39e-03 | 1994.56 ms | 68.8% bf16 MFU | 261346 tok/s +step 4891/18794 | loss 3.277630 (+1.15z)| norm 0.2478 (+0.27z)| lr 5.39e-03 | 2025.36 ms | 67.8% bf16 MFU | 261222 tok/s +step 4892/18794 | loss 3.249698 (+0.19z)| norm 0.2856 (+1.04z)| lr 5.39e-03 | 1997.00 ms | 68.7% bf16 MFU | 261288 tok/s +step 4893/18794 | loss 3.233394 (-0.36z)| norm 0.2188 (-0.36z)| lr 5.39e-03 | 2005.62 ms | 68.4% bf16 MFU | 261294 tok/s +step 4894/18794 | loss 3.275728 (+1.06z)| norm 0.2458 (+0.20z)| lr 5.39e-03 | 2003.13 ms | 68.5% bf16 MFU | 261316 tok/s +step 4895/18794 | loss 3.228344 (-0.54z)| norm 0.2034 (-0.68z)| lr 5.39e-03 | 1986.48 ms | 69.1% bf16 MFU | 261447 tok/s +step 4896/18794 | loss 3.249669 (+0.17z)| norm 0.1875 (-1.01z)| lr 5.39e-03 | 2011.72 ms | 68.2% bf16 MFU | 261405 tok/s +step 4897/18794 | loss 3.225073 (-0.64z)| norm 0.2003 (-0.76z)| lr 5.39e-03 | 1999.40 ms | 68.6% bf16 MFU | 261446 tok/s +step 4898/18794 | loss 3.270970 (+0.90z)| norm 0.2102 (-0.55z)| lr 5.39e-03 | 2003.19 ms | 68.5% bf16 MFU | 261460 tok/s +step 4899/18794 | loss 3.221206 (-0.78z)| norm 0.2113 (-0.53z)| lr 5.39e-03 | 1980.79 ms | 69.3% bf16 MFU | 261621 tok/s +step 4900/18794 | loss 3.288436 (+1.47z)| norm 0.1711 (-1.35z)| lr 5.39e-03 | 2006.58 ms | 68.4% bf16 MFU | 261604 tok/s +step 4901/18794 | loss 3.214560 (-0.99z)| norm 0.1871 (-1.00z)| lr 5.38e-03 | 2009.52 ms | 68.3% bf16 MFU | 261569 tok/s +step 4902/18794 | loss 3.233963 (-0.34z)| norm 0.2042 (-0.64z)| lr 5.38e-03 | 2008.29 ms | 68.3% bf16 MFU | 261544 tok/s +step 4903/18794 | loss 3.273118 (+0.95z)| norm 0.2574 (+0.46z)| lr 5.38e-03 | 2004.62 ms | 68.5% bf16 MFU | 261544 tok/s +step 4904/18794 | loss 3.252001 (+0.24z)| norm 0.2207 (-0.30z)| lr 5.38e-03 | 1980.94 ms | 69.3% bf16 MFU | 261700 tok/s +step 4905/18794 | loss 3.251395 (+0.20z)| norm 0.1886 (-0.95z)| lr 5.38e-03 | 1995.69 ms | 68.8% bf16 MFU | 261750 tok/s +step 4906/18794 | loss 3.251178 (+0.18z)| norm 0.2140 (-0.41z)| lr 5.38e-03 | 2006.70 ms | 68.4% bf16 MFU | 261726 tok/s +step 4907/18794 | loss 3.219379 (-0.90z)| norm 0.2059 (-0.57z)| lr 5.38e-03 | 1986.12 ms | 69.1% bf16 MFU | 261839 tok/s +step 4908/18794 | loss 3.237024 (-0.27z)| norm 0.2320 (-0.03z)| lr 5.38e-03 | 2004.66 ms | 68.5% bf16 MFU | 261824 tok/s +step 4909/18794 | loss 3.232573 (-0.42z)| norm 0.1987 (-0.72z)| lr 5.38e-03 | 1991.14 ms | 68.9% bf16 MFU | 261898 tok/s +step 4910/18794 | loss 3.194542 (-1.73z)| norm 0.1809 (-1.09z)| lr 5.38e-03 | 1996.25 ms | 68.7% bf16 MFU | 261935 tok/s +step 4911/18794 | loss 3.235823 (-0.30z)| norm 0.1821 (-1.04z)| lr 5.38e-03 | 2001.77 ms | 68.6% bf16 MFU | 261934 tok/s +step 4912/18794 | loss 3.280784 (+1.24z)| norm 0.2799 (+1.03z)| lr 5.38e-03 | 1986.69 ms | 69.1% bf16 MFU | 262032 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.170053 +step 4913/18794 | loss 3.251736 (+0.23z)| norm 0.3860 (+3.17z)| lr 5.38e-03 | 1993.17 ms | 68.9% bf16 MFU | 262083 tok/s +step 4914/18794 | loss 3.220533 (-0.84z)| norm 0.2997 (+1.37z)| lr 5.38e-03 | 1998.09 ms | 68.7% bf16 MFU | 262098 tok/s +step 4915/18794 | loss 3.218411 (-0.92z)| norm 0.2085 (-0.49z)| lr 5.38e-03 | 1981.48 ms | 69.3% bf16 MFU | 262223 tok/s +step 4916/18794 | loss 3.196044 (-1.71z)| norm 0.2753 (+0.86z)| lr 5.38e-03 | 1995.95 ms | 68.8% bf16 MFU | 262246 tok/s +step 4917/18794 | loss 3.252440 (+0.26z)| norm 0.2401 (+0.14z)| lr 5.38e-03 | 1990.79 ms | 68.9% bf16 MFU | 262301 tok/s +step 4918/18794 | loss 3.235548 (-0.32z)| norm 0.2243 (-0.17z)| lr 5.38e-03 | 1985.36 ms | 69.1% bf16 MFU | 262390 tok/s +step 4919/18794 | loss 3.252767 (+0.30z)| norm 0.2181 (-0.28z)| lr 5.38e-03 | 1990.87 ms | 68.9% bf16 MFU | 262438 tok/s +step 4920/18794 | loss 3.189787 (-1.90z)| norm 0.2116 (-0.42z)| lr 5.38e-03 | 1989.89 ms | 69.0% bf16 MFU | 262490 tok/s +step 4921/18794 | loss 3.247687 (+0.11z)| norm 0.2513 (+0.41z)| lr 5.38e-03 | 1987.93 ms | 69.0% bf16 MFU | 262552 tok/s +step 4922/18794 | loss 3.239249 (-0.19z)| norm 0.3212 (+1.85z)| lr 5.38e-03 | 1988.39 ms | 69.0% bf16 MFU | 262608 tok/s +step 4923/18794 | loss 3.228104 (-0.57z)| norm 0.2613 (+0.62z)| lr 5.38e-03 | 2007.05 ms | 68.4% bf16 MFU | 262539 tok/s +step 4924/18794 | loss 3.185583 (-2.05z)| norm 0.2651 (+0.70z)| lr 5.38e-03 | 1992.89 ms | 68.9% bf16 MFU | 262566 tok/s +step 4925/18794 | loss 3.244888 (+0.05z)| norm 0.3236 (+1.87z)| lr 5.38e-03 | 1982.62 ms | 69.2% bf16 MFU | 262660 tok/s +step 4926/18794 | loss 3.167219 (-2.60z)| norm 0.2514 (+0.38z)| lr 5.38e-03 | 1991.57 ms | 68.9% bf16 MFU | 262689 tok/s +step 4927/18794 | loss 3.237714 (-0.17z)| norm 0.2132 (-0.40z)| lr 5.38e-03 | 1992.80 ms | 68.9% bf16 MFU | 262709 tok/s +step 4928/18794 | loss 3.292076 (+1.78z)| norm 0.1923 (-0.83z)| lr 5.38e-03 | 1992.22 ms | 68.9% bf16 MFU | 262732 tok/s +step 4929/18794 | loss 3.217627 (-0.87z)| norm 0.2429 (+0.21z)| lr 5.38e-03 | 1999.81 ms | 68.6% bf16 MFU | 262704 tok/s +step 4930/18794 | loss 3.255032 (+0.51z)| norm 0.2361 (+0.07z)| lr 5.38e-03 | 1997.94 ms | 68.7% bf16 MFU | 262690 tok/s +step 4931/18794 | loss 3.248986 (+0.30z)| norm 0.2603 (+0.55z)| lr 5.38e-03 | 1988.39 ms | 69.0% bf16 MFU | 262739 tok/s +step 4932/18794 | loss 3.218343 (-0.82z)| norm 0.2296 (-0.09z)| lr 5.38e-03 | 1985.20 ms | 69.1% bf16 MFU | 262807 tok/s +step 4933/18794 | loss 3.218958 (-0.79z)| norm 0.1853 (-1.00z)| lr 5.38e-03 | 1986.36 ms | 69.1% bf16 MFU | 262864 tok/s +step 4934/18794 | loss 3.289039 (+1.74z)| norm 0.2001 (-0.70z)| lr 5.37e-03 | 2001.46 ms | 68.6% bf16 MFU | 262818 tok/s +step 4935/18794 | loss 3.274471 (+1.19z)| norm 0.2447 (+0.21z)| lr 5.37e-03 | 2002.72 ms | 68.5% bf16 MFU | 262767 tok/s +step 4936/18794 | loss 3.186120 (-1.93z)| norm 0.2539 (+0.42z)| lr 5.37e-03 | 1981.30 ms | 69.3% bf16 MFU | 262859 tok/s +step 4937/18794 | loss 3.250885 (+0.37z)| norm 0.2422 (+0.17z)| lr 5.37e-03 | 1989.56 ms | 69.0% bf16 MFU | 262892 tok/s +step 4938/18794 | loss 3.257046 (+0.61z)| norm 0.2229 (-0.24z)| lr 5.37e-03 | 1982.22 ms | 69.2% bf16 MFU | 262972 tok/s +step 4939/18794 | loss 3.268899 (+1.04z)| norm 0.2247 (-0.21z)| lr 5.37e-03 | 1993.16 ms | 68.9% bf16 MFU | 262976 tok/s +step 4940/18794 | loss 3.271190 (+1.10z)| norm 0.1956 (-0.83z)| lr 5.37e-03 | 1993.40 ms | 68.8% bf16 MFU | 262978 tok/s +step 4941/18794 | loss 3.236210 (-0.15z)| norm 0.1861 (-1.02z)| lr 5.37e-03 | 1979.64 ms | 69.3% bf16 MFU | 263071 tok/s +step 4942/18794 | loss 3.192738 (-1.69z)| norm 0.2208 (-0.29z)| lr 5.37e-03 | 1985.29 ms | 69.1% bf16 MFU | 263122 tok/s +step 4943/18794 | loss 3.196677 (-1.51z)| norm 0.2323 (-0.04z)| lr 5.37e-03 | 1980.71 ms | 69.3% bf16 MFU | 263200 tok/s +step 4944/18794 | loss 3.325743 (+2.89z)| norm 0.2625 (+0.59z)| lr 5.37e-03 | 1983.03 ms | 69.2% bf16 MFU | 263260 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.608803 +step 4945/18794 | loss 3.284297 (+1.46z)| norm 0.3640 (+2.61z)| lr 5.37e-03 | 1985.47 ms | 69.1% bf16 MFU | 263300 tok/s +reducing beta2 to 0.9 and lr/wd by 0.943 due to grad z-score of 3.709874 +step 4946/18794 | loss 3.220064 (-0.68z)| norm 0.4324 (+3.71z)| lr 5.07e-03 | 1981.37 ms | 69.3% bf16 MFU | 263365 tok/s +step 4947/18794 | loss 3.307853 (+2.19z)| norm 0.2521 (+0.28z)| lr 5.37e-03 | 1997.16 ms | 68.7% bf16 MFU | 263323 tok/s +step 4948/18794 | loss 3.177364 (-2.05z)| norm 0.2254 (-0.23z)| lr 5.37e-03 | 1986.52 ms | 69.1% bf16 MFU | 263353 tok/s +step 4949/18794 | loss 3.267457 (+0.85z)| norm 0.3003 (+1.17z)| lr 5.37e-03 | 1977.92 ms | 69.4% bf16 MFU | 263439 tok/s +step 4950/18794 | loss 3.230930 (-0.33z)| norm 0.2378 (-0.01z)| lr 5.37e-03 | 1980.38 ms | 69.3% bf16 MFU | 263504 tok/s +step 4951/18794 | loss 3.234200 (-0.23z)| norm 0.2214 (-0.33z)| lr 5.37e-03 | 1979.56 ms | 69.3% bf16 MFU | 263571 tok/s +step 4952/18794 | loss 3.274620 (+1.06z)| norm 0.2448 (+0.10z)| lr 5.37e-03 | 1980.63 ms | 69.3% bf16 MFU | 263628 tok/s +step 4953/18794 | loss 3.255579 (+0.44z)| norm 0.2190 (-0.39z)| lr 5.37e-03 | 1980.45 ms | 69.3% bf16 MFU | 263683 tok/s +step 4954/18794 | loss 3.220837 (-0.67z)| norm 0.2306 (-0.17z)| lr 5.37e-03 | 1982.50 ms | 69.2% bf16 MFU | 263722 tok/s +step 4955/18794 | loss 3.234685 (-0.21z)| norm 0.2430 (+0.06z)| lr 5.37e-03 | 1978.64 ms | 69.4% bf16 MFU | 263784 tok/s +step 4956/18794 | loss 3.253942 (+0.42z)| norm 0.2039 (-0.70z)| lr 5.37e-03 | 1980.63 ms | 69.3% bf16 MFU | 263831 tok/s +step 4957/18794 | loss 3.226817 (-0.46z)| norm 0.2107 (-0.56z)| lr 5.37e-03 | 1979.06 ms | 69.3% bf16 MFU | 263885 tok/s +step 4958/18794 | loss 3.259952 (+0.63z)| norm 0.2271 (-0.23z)| lr 5.37e-03 | 1983.36 ms | 69.2% bf16 MFU | 263908 tok/s +step 4959/18794 | loss 3.169681 (-2.25z)| norm 0.1934 (-0.91z)| lr 5.37e-03 | 1985.50 ms | 69.1% bf16 MFU | 263915 tok/s +step 4960/18794 | loss 3.259884 (+0.65z)| norm 0.1760 (-1.25z)| lr 5.37e-03 | 1987.11 ms | 69.1% bf16 MFU | 263912 tok/s +step 4961/18794 | loss 3.252950 (+0.42z)| norm 0.1744 (-1.26z)| lr 5.37e-03 | 1986.66 ms | 69.1% bf16 MFU | 263912 tok/s +step 4962/18794 | loss 3.273168 (+1.06z)| norm 0.1756 (-1.24z)| lr 5.37e-03 | 1982.14 ms | 69.2% bf16 MFU | 263941 tok/s +step 4963/18794 | loss 3.196981 (-1.35z)| norm 0.1564 (-1.62z)| lr 5.37e-03 | 1981.45 ms | 69.3% bf16 MFU | 263974 tok/s +step 4964/18794 | loss 3.266625 (+0.88z)| norm 0.1984 (-0.71z)| lr 5.37e-03 | 1980.00 ms | 69.3% bf16 MFU | 264015 tok/s +step 4965/18794 | loss 3.190192 (-1.54z)| norm 0.2120 (-0.40z)| lr 5.37e-03 | 1979.81 ms | 69.3% bf16 MFU | 264055 tok/s +step 4966/18794 | loss 3.228061 (-0.33z)| norm 0.2176 (-0.27z)| lr 5.37e-03 | 1980.72 ms | 69.3% bf16 MFU | 264087 tok/s +step 4967/18794 | loss 3.248627 (+0.31z)| norm 0.2151 (-0.34z)| lr 5.36e-03 | 1980.59 ms | 69.3% bf16 MFU | 264118 tok/s +step 4968/18794 | loss 3.199076 (-1.28z)| norm 0.1959 (-0.77z)| lr 5.36e-03 | 1978.23 ms | 69.4% bf16 MFU | 264164 tok/s +step 4969/18794 | loss 3.282709 (+1.38z)| norm 0.1885 (-0.92z)| lr 5.36e-03 | 1979.94 ms | 69.3% bf16 MFU | 264196 tok/s +step 4970/18794 | loss 3.237607 (-0.05z)| norm 0.1489 (-1.74z)| lr 5.36e-03 | 1997.03 ms | 68.7% bf16 MFU | 264113 tok/s +step 4971/18794 | loss 3.282978 (+1.36z)| norm 0.2060 (-0.51z)| lr 5.36e-03 | 1988.29 ms | 69.0% bf16 MFU | 264091 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.236799 +step 4972/18794 | loss 3.219553 (-0.65z)| norm 0.3355 (+2.24z)| lr 5.36e-03 | 1985.18 ms | 69.1% bf16 MFU | 264092 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.317663 +step 4973/18794 | loss 3.259578 (+0.62z)| norm 0.3974 (+3.32z)| lr 5.36e-03 | 1982.18 ms | 69.2% bf16 MFU | 264112 tok/s +step 4974/18794 | loss 3.257882 (+0.55z)| norm 0.2336 (+0.03z)| lr 5.36e-03 | 1980.29 ms | 69.3% bf16 MFU | 264144 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.391501 +step 4975/18794 | loss 3.236632 (-0.13z)| norm 0.3560 (+2.39z)| lr 5.36e-03 | 1980.83 ms | 69.3% bf16 MFU | 264171 tok/s +step 4976/18794 | loss 3.235303 (-0.18z)| norm 0.2863 (+1.01z)| lr 5.36e-03 | 1980.49 ms | 69.3% bf16 MFU | 264199 tok/s +step 4977/18794 | loss 3.233905 (-0.22z)| norm 0.2312 (-0.06z)| lr 5.36e-03 | 1979.53 ms | 69.3% bf16 MFU | 264232 tok/s +step 4978/18794 | loss 3.209278 (-0.99z)| norm 0.2784 (+0.84z)| lr 5.36e-03 | 1980.20 ms | 69.3% bf16 MFU | 264258 tok/s +step 4979/18794 | loss 3.236961 (-0.12z)| norm 0.2245 (-0.20z)| lr 5.36e-03 | 1979.87 ms | 69.3% bf16 MFU | 264286 tok/s +step 4980/18794 | loss 3.288885 (+1.53z)| norm 0.1706 (-1.23z)| lr 5.36e-03 | 1978.24 ms | 69.4% bf16 MFU | 264323 tok/s +step 4981/18794 | loss 3.237805 (-0.11z)| norm 0.2251 (-0.17z)| lr 5.36e-03 | 1979.73 ms | 69.3% bf16 MFU | 264348 tok/s +step 4982/18794 | loss 3.228481 (-0.41z)| norm 0.2077 (-0.50z)| lr 5.36e-03 | 1980.23 ms | 69.3% bf16 MFU | 264369 tok/s +step 4983/18794 | loss 3.237730 (-0.10z)| norm 0.2093 (-0.45z)| lr 5.36e-03 | 1979.85 ms | 69.3% bf16 MFU | 264391 tok/s +step 4984/18794 | loss 3.208131 (-1.04z)| norm 0.2028 (-0.57z)| lr 5.36e-03 | 1980.70 ms | 69.3% bf16 MFU | 264406 tok/s +step 4985/18794 | loss 3.199826 (-1.29z)| norm 0.1915 (-0.79z)| lr 5.36e-03 | 1979.92 ms | 69.3% bf16 MFU | 264426 tok/s +step 4986/18794 | loss 3.184903 (-1.75z)| norm 0.1863 (-0.89z)| lr 5.36e-03 | 1979.79 ms | 69.3% bf16 MFU | 264446 tok/s +step 4987/18794 | loss 3.142790 (-2.92z)| norm 0.1920 (-0.78z)| lr 5.36e-03 | 1979.75 ms | 69.3% bf16 MFU | 264465 tok/s +step 4988/18794 | loss 3.317542 (+2.37z)| norm 0.2027 (-0.56z)| lr 5.36e-03 | 1978.88 ms | 69.3% bf16 MFU | 264489 tok/s +step 4989/18794 | loss 3.166782 (-2.11z)| norm 0.2419 (+0.21z)| lr 5.36e-03 | 1982.05 ms | 69.2% bf16 MFU | 264490 tok/s +step 4990/18794 | loss 3.281296 (+1.24z)| norm 0.2113 (-0.40z)| lr 5.36e-03 | 1980.39 ms | 69.3% bf16 MFU | 264503 tok/s +step 4991/18794 | loss 3.208438 (-0.87z)| norm 0.2097 (-0.43z)| lr 5.36e-03 | 1979.64 ms | 69.3% bf16 MFU | 264520 tok/s +step 4992/18794 | loss 3.334978 (+2.71z)| norm 0.2788 (+0.94z)| lr 5.36e-03 | 1982.39 ms | 69.2% bf16 MFU | 264517 tok/s +step 4993/18794 | loss 3.196100 (-1.19z)| norm 0.2887 (+1.12z)| lr 5.36e-03 | 1983.25 ms | 69.2% bf16 MFU | 264509 tok/s +step 4994/18794 | loss 3.178700 (-1.64z)| norm 0.2302 (-0.03z)| lr 5.36e-03 | 1986.78 ms | 69.1% bf16 MFU | 264478 tok/s +step 4995/18794 | loss 3.250806 (+0.36z)| norm 0.2223 (-0.19z)| lr 5.36e-03 | 1985.08 ms | 69.1% bf16 MFU | 264460 tok/s +step 4996/18794 | loss 3.164251 (-1.99z)| norm 0.2287 (-0.07z)| lr 5.36e-03 | 1984.90 ms | 69.1% bf16 MFU | 264444 tok/s +step 4997/18794 | loss 3.214076 (-0.62z)| norm 0.2067 (-0.51z)| lr 5.36e-03 | 1981.46 ms | 69.3% bf16 MFU | 264452 tok/s +step 4998/18794 | loss 3.222824 (-0.37z)| norm 0.2086 (-0.47z)| lr 5.36e-03 | 1980.59 ms | 69.3% bf16 MFU | 264465 tok/s +step 4999/18794 | loss 3.204742 (-0.86z)| norm 0.1763 (-1.10z)| lr 5.35e-03 | 1978.97 ms | 69.3% bf16 MFU | 264488 tok/s +step 5000/18794 | loss 3.275227 (+1.07z)| norm 0.1988 (-0.67z)| lr 5.35e-03 | 1981.06 ms | 69.3% bf16 MFU | 264496 tok/s +val loss 3.242186 +HellaSwag: 2818/10042 = 0.280621Swag: 990/1256: 0/1256 +generating: +--- +Writing state to log_gpt3_125M_edu_v4/state_00005000_00001.bin +a) Is isolated taxa of more than 50 types of organisms in the area. They are characterized by a narrow band, these common intrabromon organisms. +b) A number of species share common environments but are thereby conserved and not considered radio-restricted for native species, but species occurhomerozooidslive in the sedges lichenie. | +c) Distinguishing differences in polyphyletic species are found stratigastrata. It bridges the gap between primitive sedges and extant sedges, or the main axis of the plant-cladar system impacts (Add +in & damochet 2013; Fazal, 2003). +4). Beside the +--- +Writing checkpoint at step 5000 +Writing model to log_gpt3_125M_edu_v4/model_00005000.bin +Writing state to log_gpt3_125M_edu_v4/state_00005000_00000.bin +Deleting checkpoint at step 2500 +step 5001/18794 | loss 3.254761 (+0.50z)| norm 0.2433 (+0.21z)| lr 5.35e-03 | 1978.17 ms | 69.4% bf16 MFU | 264523 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.163512 +step 5002/18794 | loss 3.245795 (+0.25z)| norm 0.4020 (+3.16z)| lr 5.35e-03 | 1982.34 ms | 69.2% bf16 MFU | 264521 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.106078 +step 5003/18794 | loss 3.245253 (+0.24z)| norm 0.3496 (+2.11z)| lr 5.35e-03 | 1980.79 ms | 69.3% bf16 MFU | 264529 tok/s +step 5004/18794 | loss 3.263069 (+0.73z)| norm 0.1853 (-0.93z)| lr 5.35e-03 | 1981.68 ms | 69.3% bf16 MFU | 264531 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.648654 +step 5005/18794 | loss 3.232710 (-0.10z)| norm 0.3858 (+2.65z)| lr 5.35e-03 | 1980.55 ms | 69.3% bf16 MFU | 264540 tok/s +step 5006/18794 | loss 3.226680 (-0.26z)| norm 0.3255 (+1.53z)| lr 5.35e-03 | 1983.00 ms | 69.2% bf16 MFU | 264533 tok/s +step 5007/18794 | loss 3.278969 (+1.16z)| norm 0.2211 (-0.31z)| lr 5.35e-03 | 1978.34 ms | 69.4% bf16 MFU | 264557 tok/s +step 5008/18794 | loss 3.208239 (-0.77z)| norm 0.3367 (+1.69z)| lr 5.35e-03 | 1988.09 ms | 69.0% bf16 MFU | 264515 tok/s +step 5009/18794 | loss 3.230443 (-0.16z)| norm 0.2010 (-0.68z)| lr 5.35e-03 | 1979.23 ms | 69.3% bf16 MFU | 264534 tok/s +step 5010/18794 | loss 3.215519 (-0.58z)| norm 0.2459 (+0.09z)| lr 5.35e-03 | 1980.01 ms | 69.3% bf16 MFU | 264547 tok/s +step 5011/18794 | loss 3.231545 (-0.14z)| norm 0.2505 (+0.17z)| lr 5.35e-03 | 1982.69 ms | 69.2% bf16 MFU | 264541 tok/s +step 5012/18794 | loss 3.249708 (+0.37z)| norm 0.2699 (+0.51z)| lr 5.35e-03 | 1979.43 ms | 69.3% bf16 MFU | 264557 tok/s +step 5013/18794 | loss 3.254538 (+0.50z)| norm 0.2991 (+1.07z)| lr 5.35e-03 | 1979.27 ms | 69.3% bf16 MFU | 264574 tok/s +step 5014/18794 | loss 3.240298 (+0.11z)| norm 0.2436 (+0.07z)| lr 5.35e-03 | 1983.97 ms | 69.2% bf16 MFU | 264558 tok/s +step 5015/18794 | loss 3.212873 (-0.65z)| norm 0.2203 (-0.36z)| lr 5.35e-03 | 1977.70 ms | 69.4% bf16 MFU | 264585 tok/s +step 5016/18794 | loss 3.171203 (-1.77z)| norm 0.2386 (-0.01z)| lr 5.35e-03 | 1980.41 ms | 69.3% bf16 MFU | 264593 tok/s +step 5017/18794 | loss 3.255919 (+0.54z)| norm 0.1937 (-0.82z)| lr 5.35e-03 | 1980.05 ms | 69.3% bf16 MFU | 264603 tok/s +step 5018/18794 | loss 3.241408 (+0.14z)| norm 0.1677 (-1.28z)| lr 5.35e-03 | 1979.14 ms | 69.3% bf16 MFU | 264618 tok/s +step 5019/18794 | loss 3.253671 (+0.48z)| norm 0.2246 (-0.25z)| lr 5.35e-03 | 1980.15 ms | 69.3% bf16 MFU | 264625 tok/s +step 5020/18794 | loss 3.261548 (+0.68z)| norm 0.2086 (-0.54z)| lr 5.35e-03 | 1979.59 ms | 69.3% bf16 MFU | 264637 tok/s +step 5021/18794 | loss 3.185865 (-1.37z)| norm 0.1878 (-0.90z)| lr 5.35e-03 | 1978.14 ms | 69.4% bf16 MFU | 264657 tok/s +step 5022/18794 | loss 3.203209 (-0.89z)| norm 0.2304 (-0.12z)| lr 5.35e-03 | 1979.78 ms | 69.3% bf16 MFU | 264665 tok/s +step 5023/18794 | loss 3.232878 (-0.09z)| norm 0.1756 (-1.09z)| lr 5.35e-03 | 1978.92 ms | 69.3% bf16 MFU | 264679 tok/s +step 5024/18794 | loss 3.164171 (-1.92z)| norm 0.2027 (-0.59z)| lr 5.35e-03 | 1979.98 ms | 69.3% bf16 MFU | 264684 tok/s +step 5025/18794 | loss 3.191812 (-1.16z)| norm 0.2657 (+0.57z)| lr 5.35e-03 | 1978.92 ms | 69.3% bf16 MFU | 264697 tok/s +step 5026/18794 | loss 3.237536 (+0.04z)| norm 0.2713 (+0.67z)| lr 5.35e-03 | 1980.08 ms | 69.3% bf16 MFU | 264701 tok/s +step 5027/18794 | loss 3.219257 (-0.45z)| norm 0.2375 (+0.04z)| lr 5.35e-03 | 1979.26 ms | 69.3% bf16 MFU | 264711 tok/s +step 5028/18794 | loss 3.201349 (-0.92z)| norm 0.2066 (-0.53z)| lr 5.35e-03 | 1980.11 ms | 69.3% bf16 MFU | 264714 tok/s +step 5029/18794 | loss 3.220023 (-0.41z)| norm 0.1866 (-0.88z)| lr 5.35e-03 | 1978.83 ms | 69.4% bf16 MFU | 264726 tok/s +step 5030/18794 | loss 3.249929 (+0.41z)| norm 0.2200 (-0.27z)| lr 5.35e-03 | 1979.17 ms | 69.3% bf16 MFU | 264735 tok/s +step 5031/18794 | loss 3.182910 (-1.39z)| norm 0.2086 (-0.47z)| lr 5.34e-03 | 1978.86 ms | 69.3% bf16 MFU | 264745 tok/s +step 5032/18794 | loss 3.209340 (-0.67z)| norm 0.1991 (-0.63z)| lr 5.34e-03 | 1990.52 ms | 68.9% bf16 MFU | 264677 tok/s +step 5033/18794 | loss 3.269917 (+0.95z)| norm 0.3068 (+1.30z)| lr 5.34e-03 | 1985.19 ms | 69.1% bf16 MFU | 264649 tok/s +step 5034/18794 | loss 3.214957 (-0.51z)| norm 0.3205 (+1.51z)| lr 5.34e-03 | 1985.47 ms | 69.1% bf16 MFU | 264619 tok/s +step 5035/18794 | loss 3.275900 (+1.14z)| norm 0.2453 (+0.16z)| lr 5.34e-03 | 1982.36 ms | 69.2% bf16 MFU | 264612 tok/s +step 5036/18794 | loss 3.228274 (-0.17z)| norm 0.2602 (+0.43z)| lr 5.34e-03 | 1982.52 ms | 69.2% bf16 MFU | 264604 tok/s +step 5037/18794 | loss 3.229149 (-0.14z)| norm 0.2422 (+0.11z)| lr 5.34e-03 | 1980.62 ms | 69.3% bf16 MFU | 264609 tok/s +step 5038/18794 | loss 3.208803 (-0.68z)| norm 0.2391 (+0.05z)| lr 5.34e-03 | 1979.86 ms | 69.3% bf16 MFU | 264620 tok/s +step 5039/18794 | loss 3.185466 (-1.29z)| norm 0.2134 (-0.41z)| lr 5.34e-03 | 1979.90 ms | 69.3% bf16 MFU | 264629 tok/s +step 5040/18794 | loss 3.201639 (-0.83z)| norm 0.2006 (-0.64z)| lr 5.34e-03 | 1980.03 ms | 69.3% bf16 MFU | 264637 tok/s +step 5041/18794 | loss 3.200435 (-0.85z)| norm 0.1770 (-1.06z)| lr 5.34e-03 | 1979.37 ms | 69.3% bf16 MFU | 264649 tok/s +step 5042/18794 | loss 3.223140 (-0.24z)| norm 0.2121 (-0.43z)| lr 5.34e-03 | 1980.54 ms | 69.3% bf16 MFU | 264652 tok/s +step 5043/18794 | loss 3.307481 (+2.01z)| norm 0.2840 (+0.84z)| lr 5.34e-03 | 1979.61 ms | 69.3% bf16 MFU | 264662 tok/s +step 5044/18794 | loss 3.256669 (+0.67z)| norm 0.2027 (-0.60z)| lr 5.34e-03 | 1981.02 ms | 69.3% bf16 MFU | 264662 tok/s +step 5045/18794 | loss 3.213213 (-0.52z)| norm 0.2411 (+0.11z)| lr 5.34e-03 | 1978.70 ms | 69.4% bf16 MFU | 264677 tok/s +step 5046/18794 | loss 3.213988 (-0.50z)| norm 0.2057 (-0.53z)| lr 5.34e-03 | 1980.28 ms | 69.3% bf16 MFU | 264681 tok/s +step 5047/18794 | loss 3.194876 (-1.02z)| norm 0.2466 (+0.28z)| lr 5.34e-03 | 1979.88 ms | 69.3% bf16 MFU | 264687 tok/s +step 5048/18794 | loss 3.234171 (+0.09z)| norm 0.2464 (+0.27z)| lr 5.34e-03 | 1978.34 ms | 69.4% bf16 MFU | 264703 tok/s +step 5049/18794 | loss 3.193449 (-1.07z)| norm 0.2326 (+0.01z)| lr 5.34e-03 | 1980.25 ms | 69.3% bf16 MFU | 264706 tok/s +step 5050/18794 | loss 3.272513 (+1.20z)| norm 0.2076 (-0.48z)| lr 5.34e-03 | 1980.49 ms | 69.3% bf16 MFU | 264707 tok/s +step 5051/18794 | loss 3.289313 (+1.64z)| norm 0.2454 (+0.26z)| lr 5.34e-03 | 1979.50 ms | 69.3% bf16 MFU | 264715 tok/s +step 5052/18794 | loss 3.225889 (-0.14z)| norm 0.2508 (+0.37z)| lr 5.34e-03 | 1979.89 ms | 69.3% bf16 MFU | 264719 tok/s +step 5053/18794 | loss 3.243701 (+0.37z)| norm 0.2199 (-0.24z)| lr 5.34e-03 | 1981.92 ms | 69.2% bf16 MFU | 264710 tok/s +step 5054/18794 | loss 3.182713 (-1.35z)| norm 0.2079 (-0.47z)| lr 5.34e-03 | 1981.43 ms | 69.3% bf16 MFU | 264705 tok/s +step 5055/18794 | loss 3.266880 (+1.02z)| norm 0.2053 (-0.52z)| lr 5.34e-03 | 1979.69 ms | 69.3% bf16 MFU | 264711 tok/s +step 5056/18794 | loss 3.239393 (+0.25z)| norm 0.2013 (-0.60z)| lr 5.34e-03 | 1981.23 ms | 69.3% bf16 MFU | 264707 tok/s +step 5057/18794 | loss 3.252137 (+0.60z)| norm 0.2662 (+0.67z)| lr 5.34e-03 | 1981.64 ms | 69.3% bf16 MFU | 264700 tok/s +step 5058/18794 | loss 3.228526 (-0.06z)| norm 0.2389 (+0.13z)| lr 5.34e-03 | 1984.57 ms | 69.1% bf16 MFU | 264674 tok/s +step 5059/18794 | loss 3.205561 (-0.72z)| norm 0.1832 (-0.96z)| lr 5.34e-03 | 1986.77 ms | 69.1% bf16 MFU | 264635 tok/s +step 5060/18794 | loss 3.258302 (+0.79z)| norm 0.1899 (-0.83z)| lr 5.34e-03 | 1981.62 ms | 69.3% bf16 MFU | 264632 tok/s +step 5061/18794 | loss 3.232534 (+0.06z)| norm 0.2302 (-0.05z)| lr 5.34e-03 | 1981.37 ms | 69.3% bf16 MFU | 264631 tok/s +step 5062/18794 | loss 3.192214 (-1.08z)| norm 0.2459 (+0.25z)| lr 5.34e-03 | 1981.36 ms | 69.3% bf16 MFU | 264630 tok/s +step 5063/18794 | loss 3.214461 (-0.45z)| norm 0.2543 (+0.40z)| lr 5.33e-03 | 1982.86 ms | 69.2% bf16 MFU | 264619 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.378094 +step 5064/18794 | loss 3.260021 (+0.87z)| norm 0.3567 (+2.38z)| lr 5.33e-03 | 1981.25 ms | 69.3% bf16 MFU | 264619 tok/s +step 5065/18794 | loss 3.268905 (+1.11z)| norm 0.2693 (+0.64z)| lr 5.33e-03 | 1980.59 ms | 69.3% bf16 MFU | 264624 tok/s +step 5066/18794 | loss 3.206800 (-0.68z)| norm 0.2339 (-0.06z)| lr 5.33e-03 | 1980.28 ms | 69.3% bf16 MFU | 264630 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.851803 +step 5067/18794 | loss 3.222897 (-0.21z)| norm 0.3894 (+2.85z)| lr 5.33e-03 | 1978.13 ms | 69.4% bf16 MFU | 264651 tok/s +step 5068/18794 | loss 3.201501 (-0.83z)| norm 0.2666 (+0.52z)| lr 5.33e-03 | 1980.37 ms | 69.3% bf16 MFU | 264656 tok/s +step 5069/18794 | loss 3.237510 (+0.23z)| norm 0.1809 (-1.10z)| lr 5.33e-03 | 1979.56 ms | 69.3% bf16 MFU | 264665 tok/s +step 5070/18794 | loss 3.266816 (+1.07z)| norm 0.2607 (+0.39z)| lr 5.33e-03 | 1979.63 ms | 69.3% bf16 MFU | 264674 tok/s +step 5071/18794 | loss 3.237644 (+0.24z)| norm 0.2623 (+0.41z)| lr 5.33e-03 | 1979.26 ms | 69.3% bf16 MFU | 264685 tok/s +step 5072/18794 | loss 3.231973 (+0.07z)| norm 0.2066 (-0.64z)| lr 5.33e-03 | 1981.28 ms | 69.3% bf16 MFU | 264682 tok/s +step 5073/18794 | loss 3.215289 (-0.41z)| norm 0.1842 (-1.08z)| lr 5.33e-03 | 1979.91 ms | 69.3% bf16 MFU | 264688 tok/s +step 5074/18794 | loss 3.216568 (-0.37z)| norm 0.1802 (-1.15z)| lr 5.33e-03 | 1979.52 ms | 69.3% bf16 MFU | 264696 tok/s +step 5075/18794 | loss 3.312626 (+2.39z)| norm 0.2119 (-0.49z)| lr 5.33e-03 | 1980.50 ms | 69.3% bf16 MFU | 264698 tok/s +step 5076/18794 | loss 3.249816 (+0.57z)| norm 0.2453 (+0.22z)| lr 5.33e-03 | 1981.38 ms | 69.3% bf16 MFU | 264693 tok/s +step 5077/18794 | loss 3.251827 (+0.63z)| norm 0.2890 (+1.11z)| lr 5.33e-03 | 1980.00 ms | 69.3% bf16 MFU | 264698 tok/s +step 5078/18794 | loss 3.184053 (-1.30z)| norm 0.2483 (+0.27z)| lr 5.33e-03 | 1986.01 ms | 69.1% bf16 MFU | 264663 tok/s +step 5079/18794 | loss 3.228173 (-0.04z)| norm 0.2096 (-0.53z)| lr 5.33e-03 | 1982.15 ms | 69.2% bf16 MFU | 264655 tok/s +step 5080/18794 | loss 3.219650 (-0.27z)| norm 0.1784 (-1.19z)| lr 5.33e-03 | 1987.26 ms | 69.1% bf16 MFU | 264613 tok/s +step 5081/18794 | loss 3.196524 (-0.92z)| norm 0.2179 (-0.36z)| lr 5.33e-03 | 1982.62 ms | 69.2% bf16 MFU | 264605 tok/s +step 5082/18794 | loss 3.269131 (+1.15z)| norm 0.2389 (+0.07z)| lr 5.33e-03 | 1985.34 ms | 69.1% bf16 MFU | 264578 tok/s +step 5083/18794 | loss 3.237106 (+0.23z)| norm 0.1644 (-1.46z)| lr 5.33e-03 | 1982.75 ms | 69.2% bf16 MFU | 264571 tok/s +step 5084/18794 | loss 3.257365 (+0.80z)| norm 0.2202 (-0.31z)| lr 5.33e-03 | 1980.66 ms | 69.3% bf16 MFU | 264577 tok/s +step 5085/18794 | loss 3.223926 (-0.16z)| norm 0.2590 (+0.48z)| lr 5.33e-03 | 1979.44 ms | 69.3% bf16 MFU | 264592 tok/s +step 5086/18794 | loss 3.159795 (-1.97z)| norm 0.1922 (-0.91z)| lr 5.33e-03 | 1978.85 ms | 69.3% bf16 MFU | 264610 tok/s +step 5087/18794 | loss 3.217652 (-0.37z)| norm 0.2474 (+0.23z)| lr 5.33e-03 | 1979.27 ms | 69.3% bf16 MFU | 264624 tok/s +step 5088/18794 | loss 3.244654 (+0.46z)| norm 0.2321 (-0.10z)| lr 5.33e-03 | 1980.01 ms | 69.3% bf16 MFU | 264632 tok/s +step 5089/18794 | loss 3.208687 (-0.65z)| norm 0.2386 (+0.04z)| lr 5.33e-03 | 1978.40 ms | 69.4% bf16 MFU | 264651 tok/s +step 5090/18794 | loss 3.222666 (-0.21z)| norm 0.2392 (+0.05z)| lr 5.33e-03 | 1979.97 ms | 69.3% bf16 MFU | 264658 tok/s +step 5091/18794 | loss 3.231246 (+0.05z)| norm 0.2378 (+0.01z)| lr 5.33e-03 | 1982.09 ms | 69.2% bf16 MFU | 264651 tok/s +step 5092/18794 | loss 3.213488 (-0.49z)| norm 0.2366 (-0.01z)| lr 5.33e-03 | 1980.68 ms | 69.3% bf16 MFU | 264653 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.539187 +step 5093/18794 | loss 3.228009 (-0.02z)| norm 0.3610 (+2.54z)| lr 5.33e-03 | 1981.27 ms | 69.3% bf16 MFU | 264652 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.076506 +step 5094/18794 | loss 3.197074 (-1.06z)| norm 0.3419 (+2.08z)| lr 5.33e-03 | 1981.05 ms | 69.3% bf16 MFU | 264652 tok/s +step 5095/18794 | loss 3.232636 (+0.13z)| norm 0.2296 (-0.18z)| lr 5.32e-03 | 1980.00 ms | 69.3% bf16 MFU | 264659 tok/s +step 5096/18794 | loss 3.291821 (+2.08z)| norm 0.2278 (-0.22z)| lr 5.32e-03 | 1982.50 ms | 69.2% bf16 MFU | 264649 tok/s +step 5097/18794 | loss 3.220800 (-0.31z)| norm 0.3354 (+1.89z)| lr 5.32e-03 | 1981.04 ms | 69.3% bf16 MFU | 264649 tok/s +step 5098/18794 | loss 3.244923 (+0.49z)| norm 0.3137 (+1.43z)| lr 5.32e-03 | 1979.53 ms | 69.3% bf16 MFU | 264659 tok/s +step 5099/18794 | loss 3.243003 (+0.42z)| norm 0.1755 (-1.29z)| lr 5.32e-03 | 1984.42 ms | 69.2% bf16 MFU | 264636 tok/s +step 5100/18794 | loss 3.191377 (-1.29z)| norm 0.3425 (+1.93z)| lr 5.32e-03 | 1986.94 ms | 69.1% bf16 MFU | 264598 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.060607 +step 5101/18794 | loss 3.250727 (+0.71z)| norm 0.3526 (+2.06z)| lr 5.32e-03 | 1984.74 ms | 69.1% bf16 MFU | 264576 tok/s +step 5102/18794 | loss 3.270193 (+1.35z)| norm 0.2082 (-0.66z)| lr 5.32e-03 | 1978.78 ms | 69.4% bf16 MFU | 264595 tok/s +step 5103/18794 | loss 3.218807 (-0.36z)| norm 0.1938 (-0.93z)| lr 5.32e-03 | 1980.67 ms | 69.3% bf16 MFU | 264600 tok/s +step 5104/18794 | loss 3.225924 (-0.11z)| norm 0.2273 (-0.27z)| lr 5.32e-03 | 1978.25 ms | 69.4% bf16 MFU | 264621 tok/s +step 5105/18794 | loss 3.207403 (-0.73z)| norm 0.1995 (-0.83z)| lr 5.32e-03 | 1979.49 ms | 69.3% bf16 MFU | 264633 tok/s +step 5106/18794 | loss 3.239794 (+0.36z)| norm 0.2006 (-0.79z)| lr 5.32e-03 | 1978.99 ms | 69.3% bf16 MFU | 264648 tok/s +step 5107/18794 | loss 3.167957 (-2.01z)| norm 0.2144 (-0.49z)| lr 5.32e-03 | 1979.40 ms | 69.3% bf16 MFU | 264659 tok/s +step 5108/18794 | loss 3.217144 (-0.37z)| norm 0.2025 (-0.73z)| lr 5.32e-03 | 1978.78 ms | 69.4% bf16 MFU | 264674 tok/s +step 5109/18794 | loss 3.221843 (-0.21z)| norm 0.1927 (-0.95z)| lr 5.32e-03 | 1977.98 ms | 69.4% bf16 MFU | 264693 tok/s +step 5110/18794 | loss 3.285512 (+1.87z)| norm 0.2490 (+0.28z)| lr 5.32e-03 | 1978.22 ms | 69.4% bf16 MFU | 264710 tok/s +step 5111/18794 | loss 3.216121 (-0.41z)| norm 0.2376 (+0.04z)| lr 5.32e-03 | 1979.55 ms | 69.3% bf16 MFU | 264717 tok/s +step 5112/18794 | loss 3.269239 (+1.32z)| norm 0.2265 (-0.20z)| lr 5.32e-03 | 1978.98 ms | 69.3% bf16 MFU | 264728 tok/s +step 5113/18794 | loss 3.285197 (+1.81z)| norm 0.2650 (+0.66z)| lr 5.32e-03 | 1979.14 ms | 69.3% bf16 MFU | 264737 tok/s +step 5114/18794 | loss 3.269872 (+1.29z)| norm 0.2714 (+0.79z)| lr 5.32e-03 | 1981.42 ms | 69.3% bf16 MFU | 264730 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.459295 +step 5115/18794 | loss 3.276208 (+1.46z)| norm 0.3518 (+2.46z)| lr 5.32e-03 | 1978.10 ms | 69.4% bf16 MFU | 264746 tok/s +step 5116/18794 | loss 3.223182 (-0.24z)| norm 0.2138 (-0.48z)| lr 5.32e-03 | 1979.51 ms | 69.3% bf16 MFU | 264752 tok/s +step 5117/18794 | loss 3.250889 (+0.66z)| norm 0.2660 (+0.62z)| lr 5.32e-03 | 1979.03 ms | 69.3% bf16 MFU | 264760 tok/s +step 5118/18794 | loss 3.254036 (+0.75z)| norm 0.2541 (+0.35z)| lr 5.32e-03 | 1978.91 ms | 69.3% bf16 MFU | 264769 tok/s +step 5119/18794 | loss 3.276291 (+1.45z)| norm 0.2110 (-0.58z)| lr 5.32e-03 | 1978.55 ms | 69.4% bf16 MFU | 264780 tok/s +step 5120/18794 | loss 3.193651 (-1.16z)| norm 0.2463 (+0.17z)| lr 5.32e-03 | 1979.29 ms | 69.3% bf16 MFU | 264785 tok/s +step 5121/18794 | loss 3.184928 (-1.44z)| norm 0.3018 (+1.35z)| lr 5.32e-03 | 1980.51 ms | 69.3% bf16 MFU | 264782 tok/s +step 5122/18794 | loss 3.169579 (-1.89z)| norm 0.1985 (-0.87z)| lr 5.32e-03 | 1978.92 ms | 69.3% bf16 MFU | 264790 tok/s +step 5123/18794 | loss 3.214344 (-0.48z)| norm 0.1926 (-1.01z)| lr 5.32e-03 | 1978.18 ms | 69.4% bf16 MFU | 264802 tok/s +step 5124/18794 | loss 3.265763 (+1.12z)| norm 0.2202 (-0.42z)| lr 5.32e-03 | 1979.81 ms | 69.3% bf16 MFU | 264803 tok/s +step 5125/18794 | loss 3.242972 (+0.38z)| norm 0.1901 (-1.05z)| lr 5.32e-03 | 1981.40 ms | 69.3% bf16 MFU | 264793 tok/s +step 5126/18794 | loss 3.224244 (-0.22z)| norm 0.2015 (-0.79z)| lr 5.32e-03 | 1981.76 ms | 69.2% bf16 MFU | 264781 tok/s +step 5127/18794 | loss 3.209915 (-0.67z)| norm 0.2547 (+0.36z)| lr 5.31e-03 | 1982.44 ms | 69.2% bf16 MFU | 264765 tok/s +reducing beta2 to 0.9 and lr/wd by 0.849 due to grad z-score of 4.122335 +step 5128/18794 | loss 3.297077 (+2.05z)| norm 0.4506 (+4.12z)| lr 4.51e-03 | 1986.77 ms | 69.1% bf16 MFU | 264722 tok/s +step 5129/18794 | loss 3.224910 (-0.22z)| norm 0.2074 (-0.66z)| lr 5.31e-03 | 1982.97 ms | 69.2% bf16 MFU | 264705 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.323031 +step 5130/18794 | loss 3.186580 (-1.40z)| norm 0.3634 (+2.32z)| lr 5.31e-03 | 1993.56 ms | 68.8% bf16 MFU | 264619 tok/s +step 5131/18794 | loss 3.217624 (-0.44z)| norm 0.3403 (+1.83z)| lr 5.31e-03 | 1980.12 ms | 69.3% bf16 MFU | 264627 tok/s +step 5132/18794 | loss 3.239882 (+0.25z)| norm 0.1749 (-1.29z)| lr 5.31e-03 | 1978.64 ms | 69.4% bf16 MFU | 264645 tok/s +step 5133/18794 | loss 3.195126 (-1.14z)| norm 0.2461 (+0.07z)| lr 5.31e-03 | 1979.95 ms | 69.3% bf16 MFU | 264652 tok/s +step 5134/18794 | loss 3.286190 (+1.70z)| norm 0.2347 (-0.14z)| lr 5.31e-03 | 1980.13 ms | 69.3% bf16 MFU | 264658 tok/s +step 5135/18794 | loss 3.262136 (+0.95z)| norm 0.2569 (+0.29z)| lr 5.31e-03 | 1979.63 ms | 69.3% bf16 MFU | 264668 tok/s +step 5136/18794 | loss 3.178645 (-1.63z)| norm 0.2388 (-0.06z)| lr 5.31e-03 | 1979.46 ms | 69.3% bf16 MFU | 264677 tok/s +step 5137/18794 | loss 3.219359 (-0.37z)| norm 0.2354 (-0.12z)| lr 5.31e-03 | 1978.16 ms | 69.4% bf16 MFU | 264695 tok/s +step 5138/18794 | loss 3.245259 (+0.42z)| norm 0.2923 (+0.95z)| lr 5.31e-03 | 1979.41 ms | 69.3% bf16 MFU | 264704 tok/s +step 5139/18794 | loss 3.171636 (-1.84z)| norm 0.3044 (+1.16z)| lr 5.31e-03 | 1979.37 ms | 69.3% bf16 MFU | 264713 tok/s +step 5140/18794 | loss 3.219968 (-0.36z)| norm 0.1891 (-1.02z)| lr 5.31e-03 | 1980.36 ms | 69.3% bf16 MFU | 264714 tok/s +step 5141/18794 | loss 3.164482 (-2.03z)| norm 0.1644 (-1.48z)| lr 5.31e-03 | 1979.82 ms | 69.3% bf16 MFU | 264719 tok/s +step 5142/18794 | loss 3.248803 (+0.53z)| norm 0.1970 (-0.86z)| lr 5.31e-03 | 1990.79 ms | 68.9% bf16 MFU | 264651 tok/s +step 5143/18794 | loss 3.231202 (+0.01z)| norm 0.2637 (+0.40z)| lr 5.31e-03 | 1984.37 ms | 69.2% bf16 MFU | 264629 tok/s +step 5144/18794 | loss 3.252035 (+0.67z)| norm 0.2399 (-0.06z)| lr 5.31e-03 | 1982.77 ms | 69.2% bf16 MFU | 264619 tok/s +step 5145/18794 | loss 3.286265 (+1.69z)| norm 0.2459 (+0.06z)| lr 5.31e-03 | 1991.37 ms | 68.9% bf16 MFU | 264552 tok/s +step 5146/18794 | loss 3.259534 (+0.85z)| norm 0.2266 (-0.31z)| lr 5.31e-03 | 1981.97 ms | 69.2% bf16 MFU | 264551 tok/s +step 5147/18794 | loss 3.273391 (+1.25z)| norm 0.1976 (-0.85z)| lr 5.31e-03 | 1986.44 ms | 69.1% bf16 MFU | 264520 tok/s +step 5148/18794 | loss 3.183638 (-1.48z)| norm 0.2168 (-0.48z)| lr 5.31e-03 | 2043.15 ms | 67.2% bf16 MFU | 264124 tok/s +step 5149/18794 | loss 3.255566 (+0.70z)| norm 0.2201 (-0.42z)| lr 5.31e-03 | 2041.40 ms | 67.2% bf16 MFU | 263759 tok/s +step 5150/18794 | loss 3.194933 (-1.13z)| norm 0.2205 (-0.41z)| lr 5.31e-03 | 2033.80 ms | 67.5% bf16 MFU | 263461 tok/s +step 5151/18794 | loss 3.200635 (-0.94z)| norm 0.2742 (+0.59z)| lr 5.31e-03 | 2041.79 ms | 67.2% bf16 MFU | 263127 tok/s +step 5152/18794 | loss 3.224896 (-0.19z)| norm 0.2824 (+0.74z)| lr 5.31e-03 | 2041.19 ms | 67.2% bf16 MFU | 262813 tok/s +step 5153/18794 | loss 3.230140 (-0.03z)| norm 0.3497 (+1.94z)| lr 5.31e-03 | 2042.40 ms | 67.2% bf16 MFU | 262507 tok/s +step 5154/18794 | loss 3.116760 (-3.35z)| norm 0.2828 (+0.70z)| lr 5.31e-03 | 2033.89 ms | 67.5% bf16 MFU | 262271 tok/s +step 5155/18794 | loss 3.156541 (-2.11z)| norm 0.2734 (+0.51z)| lr 5.31e-03 | 2043.09 ms | 67.2% bf16 MFU | 261988 tok/s +step 5156/18794 | loss 3.202141 (-0.77z)| norm 0.3021 (+1.02z)| lr 5.31e-03 | 2041.34 ms | 67.2% bf16 MFU | 261731 tok/s +step 5157/18794 | loss 3.170926 (-1.63z)| norm 0.2292 (-0.32z)| lr 5.31e-03 | 2041.67 ms | 67.2% bf16 MFU | 261484 tok/s +step 5158/18794 | loss 3.205154 (-0.65z)| norm 0.2642 (+0.32z)| lr 5.30e-03 | 2042.00 ms | 67.2% bf16 MFU | 261247 tok/s +step 5159/18794 | loss 3.141614 (-2.37z)| norm 0.2742 (+0.50z)| lr 5.30e-03 | 2040.08 ms | 67.3% bf16 MFU | 261034 tok/s +step 5160/18794 | loss 3.201960 (-0.68z)| norm 0.2155 (-0.60z)| lr 5.30e-03 | 2041.74 ms | 67.2% bf16 MFU | 260822 tok/s +step 5161/18794 | loss 3.232007 (+0.15z)| norm 0.2409 (-0.13z)| lr 5.30e-03 | 2033.67 ms | 67.5% bf16 MFU | 260671 tok/s +step 5162/18794 | loss 3.165256 (-1.68z)| norm 0.2038 (-0.81z)| lr 5.30e-03 | 2042.51 ms | 67.2% bf16 MFU | 260472 tok/s +step 5163/18794 | loss 3.180272 (-1.25z)| norm 0.2644 (+0.31z)| lr 5.30e-03 | 2034.11 ms | 67.5% bf16 MFU | 260336 tok/s +step 5164/18794 | loss 3.224234 (-0.04z)| norm 0.2175 (-0.54z)| lr 5.30e-03 | 2026.41 ms | 67.7% bf16 MFU | 260255 tok/s +step 5165/18794 | loss 3.204240 (-0.57z)| norm 0.1863 (-1.11z)| lr 5.30e-03 | 2041.97 ms | 67.2% bf16 MFU | 260080 tok/s +step 5166/18794 | loss 3.149878 (-2.02z)| norm 0.2587 (+0.25z)| lr 5.30e-03 | 2017.51 ms | 68.0% bf16 MFU | 260070 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.539868 +step 5167/18794 | loss 3.244208 (+0.53z)| norm 0.3799 (+2.54z)| lr 5.30e-03 | 2042.11 ms | 67.2% bf16 MFU | 259903 tok/s +step 5168/18794 | loss 3.261903 (+0.99z)| norm 0.2897 (+0.83z)| lr 5.30e-03 | 2041.99 ms | 67.2% bf16 MFU | 259746 tok/s +step 5169/18794 | loss 3.199365 (-0.68z)| norm 0.1872 (-1.11z)| lr 5.30e-03 | 2011.38 ms | 68.2% bf16 MFU | 259791 tok/s +step 5170/18794 | loss 3.258861 (+0.92z)| norm 0.1885 (-1.06z)| lr 5.30e-03 | 2034.44 ms | 67.5% bf16 MFU | 259687 tok/s +step 5171/18794 | loss 3.176599 (-1.27z)| norm 0.2630 (+0.34z)| lr 5.30e-03 | 2042.09 ms | 67.2% bf16 MFU | 259540 tok/s +step 5172/18794 | loss 3.109679 (-2.90z)| norm 0.2691 (+0.44z)| lr 5.30e-03 | 2024.57 ms | 67.8% bf16 MFU | 259511 tok/s +step 5173/18794 | loss 3.168018 (-1.38z)| norm 0.2090 (-0.70z)| lr 5.30e-03 | 2042.02 ms | 67.2% bf16 MFU | 259373 tok/s +step 5174/18794 | loss 3.149219 (-1.81z)| norm 0.1822 (-1.20z)| lr 5.30e-03 | 2025.30 ms | 67.8% bf16 MFU | 259348 tok/s +step 5175/18794 | loss 3.200444 (-0.52z)| norm 0.2712 (+0.47z)| lr 5.30e-03 | 2024.88 ms | 67.8% bf16 MFU | 259326 tok/s +step 5176/18794 | loss 3.179832 (-1.02z)| norm 0.2536 (+0.13z)| lr 5.30e-03 | 2033.60 ms | 67.5% bf16 MFU | 259251 tok/s +step 5177/18794 | loss 3.200599 (-0.48z)| norm 0.2022 (-0.82z)| lr 5.30e-03 | 2041.54 ms | 67.2% bf16 MFU | 259129 tok/s +step 5178/18794 | loss 3.219070 (-0.02z)| norm 0.2498 (+0.08z)| lr 5.30e-03 | 2034.75 ms | 67.4% bf16 MFU | 259056 tok/s +step 5179/18794 | loss 3.155611 (-1.61z)| norm 0.2959 (+0.93z)| lr 5.30e-03 | 2042.32 ms | 67.2% bf16 MFU | 258938 tok/s +step 5180/18794 | loss 3.153326 (-1.63z)| norm 0.2061 (-0.77z)| lr 5.30e-03 | 2041.56 ms | 67.2% bf16 MFU | 258832 tok/s +step 5181/18794 | loss 3.175323 (-1.07z)| norm 0.3048 (+1.08z)| lr 5.30e-03 | 2034.33 ms | 67.5% bf16 MFU | 258776 tok/s +step 5182/18794 | loss 3.158088 (-1.46z)| norm 0.3468 (+1.82z)| lr 5.30e-03 | 2024.81 ms | 67.8% bf16 MFU | 258784 tok/s +step 5183/18794 | loss 3.177861 (-0.95z)| norm 0.2525 (+0.05z)| lr 5.30e-03 | 2026.17 ms | 67.7% bf16 MFU | 258783 tok/s +step 5184/18794 | loss 3.195371 (-0.51z)| norm 0.2131 (-0.69z)| lr 5.30e-03 | 2034.73 ms | 67.4% bf16 MFU | 258727 tok/s +step 5185/18794 | loss 3.176819 (-0.95z)| norm 0.1936 (-1.03z)| lr 5.30e-03 | 2025.18 ms | 67.8% bf16 MFU | 258735 tok/s +step 5186/18794 | loss 3.223098 (+0.18z)| norm 0.2429 (-0.12z)| lr 5.30e-03 | 2025.81 ms | 67.7% bf16 MFU | 258738 tok/s +step 5187/18794 | loss 3.228882 (+0.32z)| norm 0.2014 (-0.89z)| lr 5.30e-03 | 2034.01 ms | 67.5% bf16 MFU | 258690 tok/s +step 5188/18794 | loss 3.153678 (-1.52z)| norm 0.2301 (-0.35z)| lr 5.30e-03 | 2041.97 ms | 67.2% bf16 MFU | 258593 tok/s +step 5189/18794 | loss 3.178686 (-0.89z)| norm 0.2533 (+0.08z)| lr 5.29e-03 | 2034.40 ms | 67.5% bf16 MFU | 258549 tok/s +step 5190/18794 | loss 3.174750 (-0.97z)| norm 0.2108 (-0.71z)| lr 5.29e-03 | 2017.83 ms | 68.0% bf16 MFU | 258613 tok/s +step 5191/18794 | loss 3.196882 (-0.42z)| norm 0.2186 (-0.56z)| lr 5.29e-03 | 2018.44 ms | 68.0% bf16 MFU | 258670 tok/s +step 5192/18794 | loss 3.168464 (-1.10z)| norm 0.1848 (-1.17z)| lr 5.29e-03 | 2034.16 ms | 67.5% bf16 MFU | 258623 tok/s +step 5193/18794 | loss 3.159993 (-1.28z)| norm 0.1869 (-1.12z)| lr 5.29e-03 | 2026.39 ms | 67.7% bf16 MFU | 258628 tok/s +step 5194/18794 | loss 3.175766 (-0.89z)| norm 0.1697 (-1.42z)| lr 5.29e-03 | 2032.36 ms | 67.5% bf16 MFU | 258596 tok/s +step 5195/18794 | loss 3.199935 (-0.30z)| norm 0.1945 (-0.94z)| lr 5.29e-03 | 2025.65 ms | 67.7% bf16 MFU | 258607 tok/s +step 5196/18794 | loss 3.153661 (-1.40z)| norm 0.2788 (+0.64z)| lr 5.29e-03 | 2010.69 ms | 68.3% bf16 MFU | 258714 tok/s +step 5197/18794 | loss 3.162215 (-1.17z)| norm 0.2653 (+0.40z)| lr 5.29e-03 | 2016.25 ms | 68.1% bf16 MFU | 258780 tok/s +step 5198/18794 | loss 3.171903 (-0.91z)| norm 0.2288 (-0.28z)| lr 5.29e-03 | 2024.45 ms | 67.8% bf16 MFU | 258790 tok/s +step 5199/18794 | loss 3.233159 (+0.57z)| norm 0.2518 (+0.15z)| lr 5.29e-03 | 2016.97 ms | 68.0% bf16 MFU | 258847 tok/s +step 5200/18794 | loss 3.208973 (-0.02z)| norm 0.2307 (-0.24z)| lr 5.29e-03 | 2024.75 ms | 67.8% bf16 MFU | 258852 tok/s +step 5201/18794 | loss 3.155880 (-1.28z)| norm 0.1827 (-1.18z)| lr 5.29e-03 | 2009.56 ms | 68.3% bf16 MFU | 258954 tok/s +step 5202/18794 | loss 3.231081 (+0.55z)| norm 0.2150 (-0.53z)| lr 5.29e-03 | 2018.69 ms | 68.0% bf16 MFU | 258992 tok/s +step 5203/18794 | loss 3.184701 (-0.57z)| norm 0.2448 (+0.06z)| lr 5.29e-03 | 2032.94 ms | 67.5% bf16 MFU | 258938 tok/s +step 5204/18794 | loss 3.204536 (-0.08z)| norm 0.2389 (-0.06z)| lr 5.29e-03 | 2025.11 ms | 67.8% bf16 MFU | 258935 tok/s +reducing beta2 to 0.9 and lr/wd by 0.976 due to grad z-score of 3.585263 +step 5205/18794 | loss 3.161197 (-1.12z)| norm 0.4333 (+3.59z)| lr 5.16e-03 | 2024.29 ms | 67.8% bf16 MFU | 258938 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.019581 +step 5206/18794 | loss 3.128067 (-1.86z)| norm 0.4129 (+3.02z)| lr 5.29e-03 | 2016.66 ms | 68.0% bf16 MFU | 258991 tok/s +step 5207/18794 | loss 3.185482 (-0.50z)| norm 0.2307 (-0.29z)| lr 5.29e-03 | 1993.86 ms | 68.8% bf16 MFU | 259189 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.812488 +step 5208/18794 | loss 3.253876 (+1.12z)| norm 0.4096 (+2.81z)| lr 5.29e-03 | 2010.76 ms | 68.2% bf16 MFU | 259266 tok/s +step 5209/18794 | loss 3.157065 (-1.16z)| norm 0.3588 (+1.87z)| lr 5.29e-03 | 2033.74 ms | 67.5% bf16 MFU | 259193 tok/s +step 5210/18794 | loss 3.238140 (+0.78z)| norm 0.2473 (-0.05z)| lr 5.29e-03 | 2025.51 ms | 67.8% bf16 MFU | 259175 tok/s +step 5211/18794 | loss 3.154875 (-1.20z)| norm 0.2395 (-0.19z)| lr 5.29e-03 | 2025.70 ms | 67.7% bf16 MFU | 259157 tok/s +step 5212/18794 | loss 3.167588 (-0.88z)| norm 0.2922 (+0.71z)| lr 5.29e-03 | 2033.21 ms | 67.5% bf16 MFU | 259093 tok/s +step 5213/18794 | loss 3.178257 (-0.61z)| norm 0.2495 (-0.02z)| lr 5.29e-03 | 2010.02 ms | 68.3% bf16 MFU | 259180 tok/s +step 5214/18794 | loss 3.212159 (+0.24z)| norm 0.2679 (+0.30z)| lr 5.29e-03 | 2009.86 ms | 68.3% bf16 MFU | 259264 tok/s +step 5215/18794 | loss 3.207680 (+0.15z)| norm 0.2596 (+0.17z)| lr 5.29e-03 | 2009.51 ms | 68.3% bf16 MFU | 259346 tok/s +step 5216/18794 | loss 3.253893 (+1.29z)| norm 0.2586 (+0.14z)| lr 5.29e-03 | 2025.52 ms | 67.8% bf16 MFU | 259320 tok/s +step 5217/18794 | loss 3.154583 (-1.16z)| norm 0.2185 (-0.55z)| lr 5.29e-03 | 2009.29 ms | 68.3% bf16 MFU | 259401 tok/s +step 5218/18794 | loss 3.199343 (-0.03z)| norm 0.1877 (-1.07z)| lr 5.29e-03 | 2026.78 ms | 67.7% bf16 MFU | 259365 tok/s +step 5219/18794 | loss 3.164982 (-0.88z)| norm 0.2050 (-0.77z)| lr 5.29e-03 | 2026.25 ms | 67.7% bf16 MFU | 259334 tok/s +step 5220/18794 | loss 3.169197 (-0.77z)| norm 0.3238 (+1.27z)| lr 5.28e-03 | 2026.11 ms | 67.7% bf16 MFU | 259306 tok/s +step 5221/18794 | loss 3.152063 (-1.19z)| norm 0.2510 (+0.03z)| lr 5.28e-03 | 2033.43 ms | 67.5% bf16 MFU | 259232 tok/s +step 5222/18794 | loss 3.111791 (-2.15z)| norm 0.1789 (-1.21z)| lr 5.28e-03 | 2008.77 ms | 68.3% bf16 MFU | 259321 tok/s +step 5223/18794 | loss 3.197735 (-0.01z)| norm 0.2074 (-0.72z)| lr 5.28e-03 | 2025.78 ms | 67.7% bf16 MFU | 259295 tok/s +step 5224/18794 | loss 3.224573 (+0.67z)| norm 0.2009 (-0.83z)| lr 5.28e-03 | 2010.51 ms | 68.3% bf16 MFU | 259369 tok/s +step 5225/18794 | loss 3.213381 (+0.40z)| norm 0.1791 (-1.20z)| lr 5.28e-03 | 2018.49 ms | 68.0% bf16 MFU | 259388 tok/s +step 5226/18794 | loss 3.137907 (-1.47z)| norm 0.2358 (-0.24z)| lr 5.28e-03 | 2002.14 ms | 68.5% bf16 MFU | 259511 tok/s +reducing beta2 to 0.9 and lr/wd by 0.948 due to grad z-score of 3.693609 +step 5227/18794 | loss 3.253618 (+1.41z)| norm 0.4818 (+3.69z)| lr 5.01e-03 | 2033.35 ms | 67.5% bf16 MFU | 259428 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.487226 +step 5228/18794 | loss 3.146528 (-1.25z)| norm 0.4028 (+2.49z)| lr 5.28e-03 | 2010.77 ms | 68.2% bf16 MFU | 259494 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.936327 +step 5229/18794 | loss 3.186852 (-0.21z)| norm 0.4404 (+2.94z)| lr 5.28e-03 | 2033.86 ms | 67.5% bf16 MFU | 259408 tok/s +step 5230/18794 | loss 3.261526 (+1.65z)| norm 0.3208 (+1.07z)| lr 5.28e-03 | 2009.74 ms | 68.3% bf16 MFU | 259481 tok/s +step 5231/18794 | loss 3.216166 (+0.51z)| norm 0.2563 (+0.06z)| lr 5.28e-03 | 2016.51 ms | 68.1% bf16 MFU | 259507 tok/s +step 5232/18794 | loss 3.197793 (+0.06z)| norm 0.2380 (-0.24z)| lr 5.28e-03 | 2025.66 ms | 67.7% bf16 MFU | 259473 tok/s +step 5233/18794 | loss 3.154690 (-1.02z)| norm 0.2939 (+0.65z)| lr 5.28e-03 | 2018.72 ms | 68.0% bf16 MFU | 259485 tok/s +step 5234/18794 | loss 3.221277 (+0.69z)| norm 0.3227 (+1.10z)| lr 5.28e-03 | 2018.14 ms | 68.0% bf16 MFU | 259500 tok/s +step 5235/18794 | loss 3.241912 (+1.24z)| norm 0.1826 (-1.13z)| lr 5.28e-03 | 2000.47 ms | 68.6% bf16 MFU | 259629 tok/s +step 5236/18794 | loss 3.159617 (-0.89z)| norm 0.3030 (+0.78z)| lr 5.28e-03 | 2032.78 ms | 67.5% bf16 MFU | 259544 tok/s +step 5237/18794 | loss 3.207500 (+0.35z)| norm 0.3543 (+1.55z)| lr 5.28e-03 | 2001.11 ms | 68.6% bf16 MFU | 259666 tok/s +step 5238/18794 | loss 3.250500 (+1.47z)| norm 0.1888 (-1.02z)| lr 5.28e-03 | 2009.34 ms | 68.3% bf16 MFU | 259729 tok/s +step 5239/18794 | loss 3.225903 (+0.81z)| norm 0.1788 (-1.15z)| lr 5.28e-03 | 2001.76 ms | 68.6% bf16 MFU | 259838 tok/s +step 5240/18794 | loss 3.264016 (+1.76z)| norm 0.2063 (-0.73z)| lr 5.28e-03 | 2008.96 ms | 68.3% bf16 MFU | 259895 tok/s +step 5241/18794 | loss 3.228031 (+0.83z)| norm 0.2653 (+0.17z)| lr 5.28e-03 | 2008.41 ms | 68.3% bf16 MFU | 259953 tok/s +step 5242/18794 | loss 3.177684 (-0.44z)| norm 0.2850 (+0.47z)| lr 5.28e-03 | 2002.20 ms | 68.5% bf16 MFU | 260048 tok/s +step 5243/18794 | loss 3.211789 (+0.44z)| norm 0.2399 (-0.24z)| lr 5.28e-03 | 2009.86 ms | 68.3% bf16 MFU | 260088 tok/s +step 5244/18794 | loss 3.177084 (-0.44z)| norm 0.2754 (+0.32z)| lr 5.28e-03 | 2002.84 ms | 68.5% bf16 MFU | 260173 tok/s +step 5245/18794 | loss 3.231149 (+1.01z)| norm 0.3436 (+1.37z)| lr 5.28e-03 | 2016.94 ms | 68.0% bf16 MFU | 260161 tok/s +step 5246/18794 | loss 3.133352 (-1.58z)| norm 0.2536 (-0.05z)| lr 5.28e-03 | 2010.17 ms | 68.3% bf16 MFU | 260194 tok/s +step 5247/18794 | loss 3.108416 (-2.20z)| norm 0.1980 (-0.91z)| lr 5.28e-03 | 2017.40 ms | 68.0% bf16 MFU | 260178 tok/s +step 5248/18794 | loss 3.231536 (+1.08z)| norm 0.2205 (-0.56z)| lr 5.28e-03 | 2003.08 ms | 68.5% bf16 MFU | 260257 tok/s +step 5249/18794 | loss 3.245687 (+1.47z)| norm 0.2119 (-0.70z)| lr 5.28e-03 | 2008.86 ms | 68.3% bf16 MFU | 260293 tok/s +step 5250/18794 | loss 3.191232 (+0.01z)| norm 0.1917 (-1.00z)| lr 5.28e-03 | 2010.55 ms | 68.3% bf16 MFU | 260317 tok/s +val loss 3.231664 +HellaSwag: 2866/10042 = 0.285401: 0/1256 +step 5251/18794 | loss 3.197918 (+0.19z)| norm 0.1760 (-1.23z)| lr 5.27e-03 | 2010.38 ms | 68.3% bf16 MFU | 260341 tok/s +step 5252/18794 | loss 3.166411 (-0.64z)| norm 0.1902 (-0.99z)| lr 5.27e-03 | 2034.82 ms | 67.4% bf16 MFU | 260206 tok/s +step 5253/18794 | loss 3.129225 (-1.60z)| norm 0.1764 (-1.18z)| lr 5.27e-03 | 1996.76 ms | 68.7% bf16 MFU | 260325 tok/s +step 5254/18794 | loss 3.173842 (-0.43z)| norm 0.2653 (+0.20z)| lr 5.27e-03 | 1994.26 ms | 68.8% bf16 MFU | 260453 tok/s +step 5255/18794 | loss 3.174544 (-0.42z)| norm 0.2081 (-0.67z)| lr 5.27e-03 | 2003.31 ms | 68.5% bf16 MFU | 260516 tok/s +step 5256/18794 | loss 3.262036 (+1.91z)| norm 0.1960 (-0.84z)| lr 5.27e-03 | 2018.07 ms | 68.0% bf16 MFU | 260480 tok/s +step 5257/18794 | loss 3.218686 (+0.74z)| norm 0.2433 (-0.12z)| lr 5.27e-03 | 2017.61 ms | 68.0% bf16 MFU | 260449 tok/s +step 5258/18794 | loss 3.168101 (-0.60z)| norm 0.1646 (-1.31z)| lr 5.27e-03 | 2026.05 ms | 67.7% bf16 MFU | 260365 tok/s +step 5259/18794 | loss 3.149965 (-1.09z)| norm 0.2124 (-0.56z)| lr 5.27e-03 | 2011.49 ms | 68.2% bf16 MFU | 260379 tok/s +step 5260/18794 | loss 3.210035 (+0.51z)| norm 0.2087 (-0.62z)| lr 5.27e-03 | 2011.95 ms | 68.2% bf16 MFU | 260390 tok/s +step 5261/18794 | loss 3.276660 (+2.24z)| norm 0.2054 (-0.66z)| lr 5.27e-03 | 2011.66 ms | 68.2% bf16 MFU | 260401 tok/s +step 5262/18794 | loss 3.178349 (-0.34z)| norm 0.2394 (-0.15z)| lr 5.27e-03 | 1994.81 ms | 68.8% bf16 MFU | 260523 tok/s +step 5263/18794 | loss 3.142153 (-1.27z)| norm 0.3293 (+1.21z)| lr 5.27e-03 | 1997.13 ms | 68.7% bf16 MFU | 260623 tok/s +step 5264/18794 | loss 3.121251 (-1.77z)| norm 0.3732 (+1.83z)| lr 5.27e-03 | 2002.49 ms | 68.5% bf16 MFU | 260682 tok/s +step 5265/18794 | loss 3.161174 (-0.73z)| norm 0.2393 (-0.19z)| lr 5.27e-03 | 2011.68 ms | 68.2% bf16 MFU | 260679 tok/s +step 5266/18794 | loss 3.193501 (+0.09z)| norm 0.2629 (+0.17z)| lr 5.27e-03 | 1994.00 ms | 68.8% bf16 MFU | 260792 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.702332 +step 5267/18794 | loss 3.204844 (+0.40z)| norm 0.4355 (+2.70z)| lr 5.27e-03 | 2011.29 ms | 68.2% bf16 MFU | 260786 tok/s +step 5268/18794 | loss 3.120572 (-1.77z)| norm 0.2414 (-0.15z)| lr 5.27e-03 | 1997.22 ms | 68.7% bf16 MFU | 260872 tok/s +step 5269/18794 | loss 3.203566 (+0.40z)| norm 0.2735 (+0.31z)| lr 5.27e-03 | 2009.50 ms | 68.3% bf16 MFU | 260874 tok/s +step 5270/18794 | loss 3.176450 (-0.29z)| norm 0.3380 (+1.24z)| lr 5.27e-03 | 2011.08 ms | 68.2% bf16 MFU | 260865 tok/s +step 5271/18794 | loss 3.223460 (+0.94z)| norm 0.1942 (-0.87z)| lr 5.27e-03 | 2004.08 ms | 68.5% bf16 MFU | 260902 tok/s +step 5272/18794 | loss 3.210869 (+0.59z)| norm 0.2516 (-0.03z)| lr 5.27e-03 | 1987.71 ms | 69.0% bf16 MFU | 261045 tok/s +step 5273/18794 | loss 3.265306 (+2.00z)| norm 0.2411 (-0.19z)| lr 5.27e-03 | 2003.60 ms | 68.5% bf16 MFU | 261077 tok/s +step 5274/18794 | loss 3.180804 (-0.25z)| norm 0.1936 (-0.89z)| lr 5.27e-03 | 2010.26 ms | 68.3% bf16 MFU | 261063 tok/s +step 5275/18794 | loss 3.176411 (-0.36z)| norm 0.2622 (+0.13z)| lr 5.27e-03 | 2002.62 ms | 68.5% bf16 MFU | 261100 tok/s +step 5276/18794 | loss 3.256674 (+1.73z)| norm 0.2924 (+0.56z)| lr 5.27e-03 | 1986.92 ms | 69.1% bf16 MFU | 261239 tok/s +step 5277/18794 | loss 3.209892 (+0.50z)| norm 0.2896 (+0.51z)| lr 5.27e-03 | 1986.93 ms | 69.1% bf16 MFU | 261370 tok/s +step 5278/18794 | loss 3.160277 (-0.78z)| norm 0.2478 (-0.11z)| lr 5.27e-03 | 1995.47 ms | 68.8% bf16 MFU | 261439 tok/s +step 5279/18794 | loss 3.148778 (-1.08z)| norm 0.1951 (-0.87z)| lr 5.27e-03 | 1994.43 ms | 68.8% bf16 MFU | 261510 tok/s +step 5280/18794 | loss 3.188096 (-0.06z)| norm 0.1977 (-0.83z)| lr 5.27e-03 | 2009.97 ms | 68.3% bf16 MFU | 261477 tok/s +step 5281/18794 | loss 3.198116 (+0.19z)| norm 0.2363 (-0.25z)| lr 5.27e-03 | 2001.27 ms | 68.6% bf16 MFU | 261502 tok/s +step 5282/18794 | loss 3.209198 (+0.47z)| norm 0.3043 (+0.77z)| lr 5.26e-03 | 1986.33 ms | 69.1% bf16 MFU | 261624 tok/s +step 5283/18794 | loss 3.084615 (-2.68z)| norm 0.2873 (+0.51z)| lr 5.26e-03 | 2003.58 ms | 68.5% bf16 MFU | 261627 tok/s +step 5284/18794 | loss 3.157564 (-0.82z)| norm 0.2437 (-0.14z)| lr 5.26e-03 | 2011.35 ms | 68.2% bf16 MFU | 261579 tok/s +step 5285/18794 | loss 3.187449 (-0.06z)| norm 0.1821 (-1.06z)| lr 5.26e-03 | 2009.01 ms | 68.3% bf16 MFU | 261548 tok/s +step 5286/18794 | loss 3.206527 (+0.42z)| norm 0.1815 (-1.05z)| lr 5.26e-03 | 1987.13 ms | 69.1% bf16 MFU | 261663 tok/s +step 5287/18794 | loss 3.251995 (+1.56z)| norm 0.1826 (-1.03z)| lr 5.26e-03 | 1994.75 ms | 68.8% bf16 MFU | 261722 tok/s +step 5288/18794 | loss 3.139887 (-1.26z)| norm 0.2099 (-0.62z)| lr 5.26e-03 | 1987.90 ms | 69.0% bf16 MFU | 261823 tok/s +step 5289/18794 | loss 3.215708 (+0.64z)| norm 0.3292 (+1.11z)| lr 5.26e-03 | 1996.20 ms | 68.7% bf16 MFU | 261864 tok/s +step 5290/18794 | loss 3.126411 (-1.57z)| norm 0.3339 (+1.16z)| lr 5.26e-03 | 1994.69 ms | 68.8% bf16 MFU | 261912 tok/s +step 5291/18794 | loss 3.157188 (-0.80z)| norm 0.2486 (-0.09z)| lr 5.26e-03 | 2002.26 ms | 68.5% bf16 MFU | 261909 tok/s +step 5292/18794 | loss 3.186990 (-0.06z)| norm 0.2061 (-0.71z)| lr 5.26e-03 | 2004.00 ms | 68.5% bf16 MFU | 261895 tok/s +step 5293/18794 | loss 3.129979 (-1.46z)| norm 0.2607 (+0.08z)| lr 5.26e-03 | 1996.91 ms | 68.7% bf16 MFU | 261928 tok/s +step 5294/18794 | loss 3.218623 (+0.71z)| norm 0.2117 (-0.66z)| lr 5.26e-03 | 1979.18 ms | 69.3% bf16 MFU | 262076 tok/s +step 5295/18794 | loss 3.274034 (+2.00z)| norm 0.1917 (-0.95z)| lr 5.26e-03 | 1993.93 ms | 68.8% bf16 MFU | 262120 tok/s +step 5296/18794 | loss 3.180964 (-0.24z)| norm 0.2457 (-0.15z)| lr 5.26e-03 | 1990.66 ms | 68.9% bf16 MFU | 262182 tok/s +step 5297/18794 | loss 3.160154 (-0.74z)| norm 0.2464 (-0.13z)| lr 5.26e-03 | 1987.77 ms | 69.0% bf16 MFU | 262261 tok/s +step 5298/18794 | loss 3.157815 (-0.79z)| norm 0.2039 (-0.76z)| lr 5.26e-03 | 1985.63 ms | 69.1% bf16 MFU | 262350 tok/s +step 5299/18794 | loss 3.106916 (-1.95z)| norm 0.2864 (+0.46z)| lr 5.26e-03 | 1986.63 ms | 69.1% bf16 MFU | 262428 tok/s +step 5300/18794 | loss 3.146579 (-0.99z)| norm 0.2331 (-0.33z)| lr 5.26e-03 | 1987.68 ms | 69.0% bf16 MFU | 262495 tok/s +step 5301/18794 | loss 3.165662 (-0.55z)| norm 0.1758 (-1.17z)| lr 5.26e-03 | 1979.16 ms | 69.3% bf16 MFU | 262615 tok/s +step 5302/18794 | loss 3.211396 (+0.54z)| norm 0.2616 (+0.08z)| lr 5.26e-03 | 1995.57 ms | 68.8% bf16 MFU | 262621 tok/s +step 5303/18794 | loss 3.217004 (+0.66z)| norm 0.3345 (+1.14z)| lr 5.26e-03 | 1985.22 ms | 69.1% bf16 MFU | 262695 tok/s +step 5304/18794 | loss 3.251448 (+1.45z)| norm 0.2403 (-0.24z)| lr 5.26e-03 | 2001.21 ms | 68.6% bf16 MFU | 262659 tok/s +step 5305/18794 | loss 3.203745 (+0.33z)| norm 0.1687 (-1.29z)| lr 5.26e-03 | 1986.28 ms | 69.1% bf16 MFU | 262724 tok/s +step 5306/18794 | loss 3.225064 (+0.81z)| norm 0.2042 (-0.74z)| lr 5.26e-03 | 1996.78 ms | 68.7% bf16 MFU | 262716 tok/s +step 5307/18794 | loss 3.203399 (+0.29z)| norm 0.1869 (-1.00z)| lr 5.26e-03 | 1988.00 ms | 69.0% bf16 MFU | 262767 tok/s +step 5308/18794 | loss 3.280965 (+2.10z)| norm 0.2349 (-0.24z)| lr 5.26e-03 | 1995.77 ms | 68.8% bf16 MFU | 262763 tok/s +step 5309/18794 | loss 3.153347 (-0.88z)| norm 0.2010 (-0.76z)| lr 5.26e-03 | 1994.69 ms | 68.8% bf16 MFU | 262767 tok/s +step 5310/18794 | loss 3.193001 (+0.05z)| norm 0.2157 (-0.52z)| lr 5.26e-03 | 1987.82 ms | 69.0% bf16 MFU | 262816 tok/s +step 5311/18794 | loss 3.163715 (-0.64z)| norm 0.3164 (+1.08z)| lr 5.26e-03 | 2002.67 ms | 68.5% bf16 MFU | 262765 tok/s +step 5312/18794 | loss 3.135122 (-1.29z)| norm 0.3441 (+1.50z)| lr 5.25e-03 | 1987.80 ms | 69.0% bf16 MFU | 262815 tok/s +step 5313/18794 | loss 3.144060 (-1.07z)| norm 0.1940 (-0.86z)| lr 5.25e-03 | 1988.14 ms | 69.0% bf16 MFU | 262859 tok/s +step 5314/18794 | loss 3.138356 (-1.18z)| norm 0.2157 (-0.51z)| lr 5.25e-03 | 1994.72 ms | 68.8% bf16 MFU | 262858 tok/s +step 5315/18794 | loss 3.234781 (+1.04z)| norm 0.2355 (-0.20z)| lr 5.25e-03 | 1987.18 ms | 69.1% bf16 MFU | 262907 tok/s +step 5316/18794 | loss 3.183144 (-0.14z)| norm 0.2127 (-0.55z)| lr 5.25e-03 | 1985.95 ms | 69.1% bf16 MFU | 262962 tok/s +step 5317/18794 | loss 3.251290 (+1.42z)| norm 0.2738 (+0.40z)| lr 5.25e-03 | 1981.64 ms | 69.3% bf16 MFU | 263042 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.764996 +step 5318/18794 | loss 3.178502 (-0.26z)| norm 0.4328 (+2.76z)| lr 5.25e-03 | 1988.34 ms | 69.0% bf16 MFU | 263074 tok/s +step 5319/18794 | loss 3.167490 (-0.51z)| norm 0.2268 (-0.36z)| lr 5.25e-03 | 1989.56 ms | 69.0% bf16 MFU | 263096 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.586766 +step 5320/18794 | loss 3.087100 (-2.29z)| norm 0.4270 (+2.59z)| lr 5.25e-03 | 1994.78 ms | 68.8% bf16 MFU | 263083 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.892339 +step 5321/18794 | loss 3.260398 (+1.57z)| norm 0.4586 (+2.89z)| lr 5.25e-03 | 1994.69 ms | 68.8% bf16 MFU | 263071 tok/s +step 5322/18794 | loss 3.211034 (+0.45z)| norm 0.2425 (-0.17z)| lr 5.25e-03 | 1979.65 ms | 69.3% bf16 MFU | 263159 tok/s +step 5323/18794 | loss 3.147445 (-0.97z)| norm 0.3636 (+1.51z)| lr 5.25e-03 | 1988.54 ms | 69.0% bf16 MFU | 263184 tok/s +step 5324/18794 | loss 3.204272 (+0.31z)| norm 0.3196 (+0.88z)| lr 5.25e-03 | 1984.85 ms | 69.1% bf16 MFU | 263232 tok/s +step 5325/18794 | loss 3.241282 (+1.14z)| norm 0.1872 (-0.99z)| lr 5.25e-03 | 1982.46 ms | 69.2% bf16 MFU | 263294 tok/s +step 5326/18794 | loss 3.239727 (+1.08z)| norm 0.2440 (-0.19z)| lr 5.25e-03 | 1990.58 ms | 68.9% bf16 MFU | 263298 tok/s +step 5327/18794 | loss 3.153590 (-0.84z)| norm 0.2104 (-0.66z)| lr 5.25e-03 | 1981.29 ms | 69.3% bf16 MFU | 263364 tok/s +step 5328/18794 | loss 3.156871 (-0.77z)| norm 0.2219 (-0.47z)| lr 5.25e-03 | 1979.74 ms | 69.3% bf16 MFU | 263438 tok/s +step 5329/18794 | loss 3.198564 (+0.17z)| norm 0.2992 (+0.75z)| lr 5.25e-03 | 1987.41 ms | 69.1% bf16 MFU | 263456 tok/s +step 5330/18794 | loss 3.135818 (-1.23z)| norm 0.2292 (-0.34z)| lr 5.25e-03 | 1978.63 ms | 69.4% bf16 MFU | 263532 tok/s +step 5331/18794 | loss 3.191012 (+0.04z)| norm 0.2466 (-0.06z)| lr 5.25e-03 | 1979.50 ms | 69.3% bf16 MFU | 263598 tok/s +step 5332/18794 | loss 3.143637 (-1.03z)| norm 0.3629 (+1.74z)| lr 5.25e-03 | 1980.44 ms | 69.3% bf16 MFU | 263655 tok/s +step 5333/18794 | loss 3.213964 (+0.56z)| norm 0.2482 (-0.05z)| lr 5.25e-03 | 1979.07 ms | 69.3% bf16 MFU | 263718 tok/s +step 5334/18794 | loss 3.211426 (+0.50z)| norm 0.2756 (+0.39z)| lr 5.25e-03 | 1986.69 ms | 69.1% bf16 MFU | 263727 tok/s +step 5335/18794 | loss 3.190292 (+0.03z)| norm 0.3361 (+1.32z)| lr 5.25e-03 | 1982.58 ms | 69.2% bf16 MFU | 263763 tok/s +step 5336/18794 | loss 3.217586 (+0.65z)| norm 0.2107 (-0.64z)| lr 5.25e-03 | 1984.22 ms | 69.2% bf16 MFU | 263786 tok/s +step 5337/18794 | loss 3.156510 (-0.74z)| norm 0.2125 (-0.60z)| lr 5.25e-03 | 1980.36 ms | 69.3% bf16 MFU | 263834 tok/s +step 5338/18794 | loss 3.195051 (+0.16z)| norm 0.2654 (+0.23z)| lr 5.25e-03 | 2000.84 ms | 68.6% bf16 MFU | 263744 tok/s +step 5339/18794 | loss 3.219148 (+0.72z)| norm 0.3066 (+0.87z)| lr 5.25e-03 | 2040.59 ms | 67.3% bf16 MFU | 263404 tok/s +step 5340/18794 | loss 3.229996 (+0.99z)| norm 0.2437 (-0.14z)| lr 5.25e-03 | 2032.88 ms | 67.5% bf16 MFU | 263129 tok/s +step 5341/18794 | loss 3.257860 (+1.62z)| norm 0.2604 (+0.13z)| lr 5.25e-03 | 2030.63 ms | 67.6% bf16 MFU | 262882 tok/s +step 5342/18794 | loss 3.213065 (+0.57z)| norm 0.3083 (+0.89z)| lr 5.25e-03 | 2041.42 ms | 67.2% bf16 MFU | 262579 tok/s +step 5343/18794 | loss 3.182106 (-0.14z)| norm 0.2649 (+0.19z)| lr 5.24e-03 | 2038.46 ms | 67.3% bf16 MFU | 262310 tok/s +step 5344/18794 | loss 3.181237 (-0.16z)| norm 0.2072 (-0.72z)| lr 5.24e-03 | 2024.84 ms | 67.8% bf16 MFU | 262141 tok/s +step 5345/18794 | loss 3.208154 (+0.47z)| norm 0.2099 (-0.66z)| lr 5.24e-03 | 2039.20 ms | 67.3% bf16 MFU | 261889 tok/s +step 5346/18794 | loss 3.195650 (+0.16z)| norm 0.2111 (-0.63z)| lr 5.24e-03 | 2040.88 ms | 67.2% bf16 MFU | 261639 tok/s +step 5347/18794 | loss 3.207429 (+0.42z)| norm 0.2454 (-0.09z)| lr 5.24e-03 | 2025.29 ms | 67.8% bf16 MFU | 261501 tok/s +step 5348/18794 | loss 3.229058 (+0.94z)| norm 0.1994 (-0.83z)| lr 5.24e-03 | 2037.95 ms | 67.3% bf16 MFU | 261289 tok/s +step 5349/18794 | loss 3.221374 (+0.77z)| norm 0.2163 (-0.55z)| lr 5.24e-03 | 2041.88 ms | 67.2% bf16 MFU | 261063 tok/s +step 5350/18794 | loss 3.198580 (+0.22z)| norm 0.2401 (-0.18z)| lr 5.24e-03 | 2039.55 ms | 67.3% bf16 MFU | 260863 tok/s +step 5351/18794 | loss 3.160261 (-0.69z)| norm 0.1751 (-1.23z)| lr 5.24e-03 | 2039.97 ms | 67.3% bf16 MFU | 260670 tok/s +step 5352/18794 | loss 3.209577 (+0.48z)| norm 0.1806 (-1.14z)| lr 5.24e-03 | 2025.93 ms | 67.7% bf16 MFU | 260576 tok/s +step 5353/18794 | loss 3.224786 (+0.83z)| norm 0.2439 (-0.13z)| lr 5.24e-03 | 2041.08 ms | 67.2% bf16 MFU | 260390 tok/s +step 5354/18794 | loss 3.212292 (+0.52z)| norm 0.3026 (+0.82z)| lr 5.24e-03 | 2031.37 ms | 67.6% bf16 MFU | 260276 tok/s +step 5355/18794 | loss 3.226329 (+0.84z)| norm 0.3475 (+1.51z)| lr 5.24e-03 | 2018.27 ms | 68.0% bf16 MFU | 260250 tok/s +step 5356/18794 | loss 3.204310 (+0.33z)| norm 0.2453 (-0.14z)| lr 5.24e-03 | 2018.13 ms | 68.0% bf16 MFU | 260227 tok/s +step 5357/18794 | loss 3.168860 (-0.53z)| norm 0.1738 (-1.28z)| lr 5.24e-03 | 2039.93 ms | 67.3% bf16 MFU | 260067 tok/s +step 5358/18794 | loss 3.194039 (+0.09z)| norm 0.2332 (-0.34z)| lr 5.24e-03 | 2022.14 ms | 67.9% bf16 MFU | 260027 tok/s +step 5359/18794 | loss 3.226481 (+0.86z)| norm 0.2437 (-0.17z)| lr 5.24e-03 | 2039.41 ms | 67.3% bf16 MFU | 259880 tok/s +step 5360/18794 | loss 3.233630 (+1.03z)| norm 0.2138 (-0.66z)| lr 5.24e-03 | 2033.85 ms | 67.5% bf16 MFU | 259775 tok/s +step 5361/18794 | loss 3.121130 (-1.70z)| norm 0.2017 (-0.85z)| lr 5.24e-03 | 2032.36 ms | 67.5% bf16 MFU | 259684 tok/s +step 5362/18794 | loss 3.160238 (-0.73z)| norm 0.2008 (-0.86z)| lr 5.24e-03 | 2030.19 ms | 67.6% bf16 MFU | 259612 tok/s +step 5363/18794 | loss 3.181792 (-0.21z)| norm 0.2328 (-0.33z)| lr 5.24e-03 | 2041.35 ms | 67.2% bf16 MFU | 259474 tok/s +step 5364/18794 | loss 3.184118 (-0.17z)| norm 0.2478 (-0.07z)| lr 5.24e-03 | 2036.02 ms | 67.4% bf16 MFU | 259375 tok/s +step 5365/18794 | loss 3.204955 (+0.34z)| norm 0.2331 (-0.31z)| lr 5.24e-03 | 2039.74 ms | 67.3% bf16 MFU | 259258 tok/s +step 5366/18794 | loss 3.157526 (-0.84z)| norm 0.1834 (-1.11z)| lr 5.24e-03 | 2029.11 ms | 67.6% bf16 MFU | 259214 tok/s +step 5367/18794 | loss 3.169450 (-0.53z)| norm 0.2180 (-0.53z)| lr 5.24e-03 | 2025.80 ms | 67.7% bf16 MFU | 259194 tok/s +step 5368/18794 | loss 3.154501 (-0.92z)| norm 0.2735 (+0.42z)| lr 5.24e-03 | 2018.25 ms | 68.0% bf16 MFU | 259223 tok/s +step 5369/18794 | loss 3.207818 (+0.43z)| norm 0.2102 (-0.66z)| lr 5.24e-03 | 2036.64 ms | 67.4% bf16 MFU | 259133 tok/s +step 5370/18794 | loss 3.086213 (-2.55z)| norm 0.1983 (-0.85z)| lr 5.24e-03 | 2022.32 ms | 67.9% bf16 MFU | 259139 tok/s +step 5371/18794 | loss 3.212013 (+0.54z)| norm 0.2965 (+0.84z)| lr 5.24e-03 | 2018.04 ms | 68.0% bf16 MFU | 259172 tok/s +step 5372/18794 | loss 3.206846 (+0.42z)| norm 0.2705 (+0.39z)| lr 5.24e-03 | 2015.36 ms | 68.1% bf16 MFU | 259221 tok/s +step 5373/18794 | loss 3.146488 (-1.05z)| norm 0.2130 (-0.61z)| lr 5.23e-03 | 2018.04 ms | 68.0% bf16 MFU | 259250 tok/s +step 5374/18794 | loss 3.166050 (-0.56z)| norm 0.2141 (-0.59z)| lr 5.23e-03 | 2033.50 ms | 67.5% bf16 MFU | 259179 tok/s +step 5375/18794 | loss 3.071042 (-2.78z)| norm 0.1838 (-1.10z)| lr 5.23e-03 | 2022.17 ms | 67.9% bf16 MFU | 259183 tok/s +step 5376/18794 | loss 3.151281 (-0.85z)| norm 0.2345 (-0.21z)| lr 5.23e-03 | 2025.60 ms | 67.7% bf16 MFU | 259166 tok/s +step 5377/18794 | loss 3.150711 (-0.85z)| norm 0.2498 (+0.06z)| lr 5.23e-03 | 2026.05 ms | 67.7% bf16 MFU | 259146 tok/s +step 5378/18794 | loss 3.182552 (-0.09z)| norm 0.2214 (-0.43z)| lr 5.23e-03 | 2018.79 ms | 68.0% bf16 MFU | 259174 tok/s +step 5379/18794 | loss 3.194500 (+0.19z)| norm 0.2797 (+0.57z)| lr 5.23e-03 | 2015.62 ms | 68.1% bf16 MFU | 259221 tok/s +step 5380/18794 | loss 3.251830 (+1.54z)| norm 0.2800 (+0.56z)| lr 5.23e-03 | 2009.98 ms | 68.3% bf16 MFU | 259302 tok/s +step 5381/18794 | loss 3.209589 (+0.53z)| norm 0.2174 (-0.53z)| lr 5.23e-03 | 2018.75 ms | 68.0% bf16 MFU | 259322 tok/s +step 5382/18794 | loss 3.220418 (+0.79z)| norm 0.2627 (+0.27z)| lr 5.23e-03 | 2020.32 ms | 67.9% bf16 MFU | 259332 tok/s +step 5383/18794 | loss 3.226075 (+0.91z)| norm 0.3172 (+1.22z)| lr 5.23e-03 | 2029.37 ms | 67.6% bf16 MFU | 259282 tok/s +step 5384/18794 | loss 3.243025 (+1.30z)| norm 0.3457 (+1.67z)| lr 5.23e-03 | 2026.91 ms | 67.7% bf16 MFU | 259252 tok/s +step 5385/18794 | loss 3.140627 (-1.17z)| norm 0.2804 (+0.54z)| lr 5.23e-03 | 2031.16 ms | 67.6% bf16 MFU | 259195 tok/s +step 5386/18794 | loss 3.176472 (-0.30z)| norm 0.1842 (-1.13z)| lr 5.23e-03 | 2017.04 ms | 68.0% bf16 MFU | 259232 tok/s +step 5387/18794 | loss 3.172487 (-0.38z)| norm 0.2206 (-0.51z)| lr 5.23e-03 | 2019.03 ms | 68.0% bf16 MFU | 259254 tok/s +step 5388/18794 | loss 3.169235 (-0.47z)| norm 0.2419 (-0.14z)| lr 5.23e-03 | 2031.35 ms | 67.6% bf16 MFU | 259196 tok/s +step 5389/18794 | loss 3.188526 (+0.01z)| norm 0.2085 (-0.71z)| lr 5.23e-03 | 2032.80 ms | 67.5% bf16 MFU | 259132 tok/s +step 5390/18794 | loss 3.198345 (+0.24z)| norm 0.1699 (-1.36z)| lr 5.23e-03 | 2010.23 ms | 68.3% bf16 MFU | 259216 tok/s +step 5391/18794 | loss 3.193043 (+0.10z)| norm 0.1810 (-1.15z)| lr 5.23e-03 | 2017.28 ms | 68.0% bf16 MFU | 259250 tok/s +step 5392/18794 | loss 3.198545 (+0.23z)| norm 0.2216 (-0.44z)| lr 5.23e-03 | 2017.02 ms | 68.0% bf16 MFU | 259284 tok/s +step 5393/18794 | loss 3.170425 (-0.49z)| norm 0.2655 (+0.33z)| lr 5.23e-03 | 2033.65 ms | 67.5% bf16 MFU | 259210 tok/s +step 5394/18794 | loss 3.184147 (-0.13z)| norm 0.3090 (+1.07z)| lr 5.23e-03 | 2009.98 ms | 68.3% bf16 MFU | 259292 tok/s +step 5395/18794 | loss 3.175045 (-0.35z)| norm 0.3109 (+1.08z)| lr 5.23e-03 | 2012.76 ms | 68.2% bf16 MFU | 259351 tok/s +step 5396/18794 | loss 3.170147 (-0.47z)| norm 0.2524 (+0.06z)| lr 5.23e-03 | 2008.84 ms | 68.3% bf16 MFU | 259433 tok/s +step 5397/18794 | loss 3.266849 (+1.97z)| norm 0.2843 (+0.61z)| lr 5.23e-03 | 2021.23 ms | 67.9% bf16 MFU | 259431 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.454843 +step 5398/18794 | loss 3.224316 (+0.87z)| norm 0.3965 (+2.45z)| lr 5.23e-03 | 2021.90 ms | 67.9% bf16 MFU | 259425 tok/s +step 5399/18794 | loss 3.193406 (+0.07z)| norm 0.3369 (+1.43z)| lr 5.23e-03 | 2021.74 ms | 67.9% bf16 MFU | 259420 tok/s +step 5400/18794 | loss 3.238958 (+1.23z)| norm 0.2441 (-0.13z)| lr 5.23e-03 | 2020.18 ms | 67.9% bf16 MFU | 259425 tok/s +step 5401/18794 | loss 3.192035 (-0.00z)| norm 0.2282 (-0.41z)| lr 5.23e-03 | 2013.54 ms | 68.2% bf16 MFU | 259473 tok/s +step 5402/18794 | loss 3.208464 (+0.43z)| norm 0.2213 (-0.52z)| lr 5.23e-03 | 2011.48 ms | 68.2% bf16 MFU | 259532 tok/s +step 5403/18794 | loss 3.164846 (-0.70z)| norm 0.2347 (-0.28z)| lr 5.22e-03 | 2028.37 ms | 67.7% bf16 MFU | 259479 tok/s +step 5404/18794 | loss 3.167332 (-0.62z)| norm 0.2613 (+0.17z)| lr 5.22e-03 | 2001.27 ms | 68.6% bf16 MFU | 259604 tok/s +step 5405/18794 | loss 3.188653 (-0.05z)| norm 0.2523 (+0.00z)| lr 5.22e-03 | 2008.76 ms | 68.3% bf16 MFU | 259674 tok/s +step 5406/18794 | loss 3.171761 (-0.48z)| norm 0.1990 (-0.91z)| lr 5.22e-03 | 2003.90 ms | 68.5% bf16 MFU | 259772 tok/s +step 5407/18794 | loss 3.178906 (-0.29z)| norm 0.2260 (-0.46z)| lr 5.22e-03 | 2000.97 ms | 68.6% bf16 MFU | 259884 tok/s +step 5408/18794 | loss 3.158899 (-0.81z)| norm 0.3199 (+1.14z)| lr 5.22e-03 | 2013.33 ms | 68.2% bf16 MFU | 259910 tok/s +step 5409/18794 | loss 3.165087 (-0.64z)| norm 0.2085 (-0.77z)| lr 5.22e-03 | 2015.64 ms | 68.1% bf16 MFU | 259920 tok/s +step 5410/18794 | loss 3.208753 (+0.54z)| norm 0.2676 (+0.23z)| lr 5.22e-03 | 2015.54 ms | 68.1% bf16 MFU | 259930 tok/s +step 5411/18794 | loss 3.191690 (+0.07z)| norm 0.3116 (+1.00z)| lr 5.22e-03 | 2015.39 ms | 68.1% bf16 MFU | 259941 tok/s +step 5412/18794 | loss 3.153418 (-0.98z)| norm 0.2099 (-0.74z)| lr 5.22e-03 | 2032.06 ms | 67.5% bf16 MFU | 259844 tok/s +step 5413/18794 | loss 3.185054 (-0.13z)| norm 0.1806 (-1.25z)| lr 5.22e-03 | 2017.75 ms | 68.0% bf16 MFU | 259844 tok/s +step 5414/18794 | loss 3.149639 (-1.12z)| norm 0.1733 (-1.36z)| lr 5.22e-03 | 2013.78 ms | 68.1% bf16 MFU | 259869 tok/s +step 5415/18794 | loss 3.157444 (-0.88z)| norm 0.1937 (-0.99z)| lr 5.22e-03 | 2006.77 ms | 68.4% bf16 MFU | 259939 tok/s +step 5416/18794 | loss 3.234489 (+1.24z)| norm 0.2136 (-0.65z)| lr 5.22e-03 | 2016.37 ms | 68.1% bf16 MFU | 259943 tok/s +step 5417/18794 | loss 3.192763 (+0.11z)| norm 0.2881 (+0.62z)| lr 5.22e-03 | 2017.54 ms | 68.0% bf16 MFU | 259939 tok/s +step 5418/18794 | loss 3.168764 (-0.56z)| norm 0.3424 (+1.63z)| lr 5.22e-03 | 2018.15 ms | 68.0% bf16 MFU | 259931 tok/s +step 5419/18794 | loss 3.203069 (+0.39z)| norm 0.3632 (+1.94z)| lr 5.22e-03 | 2011.10 ms | 68.2% bf16 MFU | 259969 tok/s +step 5420/18794 | loss 3.197856 (+0.22z)| norm 0.2689 (+0.34z)| lr 5.22e-03 | 2003.04 ms | 68.5% bf16 MFU | 260058 tok/s +step 5421/18794 | loss 3.182375 (-0.21z)| norm 0.2233 (-0.50z)| lr 5.22e-03 | 2013.12 ms | 68.2% bf16 MFU | 260077 tok/s +step 5422/18794 | loss 3.187299 (-0.06z)| norm 0.1888 (-1.16z)| lr 5.22e-03 | 2001.59 ms | 68.6% bf16 MFU | 260170 tok/s +step 5423/18794 | loss 3.262310 (+2.12z)| norm 0.2688 (+0.45z)| lr 5.22e-03 | 2004.56 ms | 68.5% bf16 MFU | 260239 tok/s +step 5424/18794 | loss 3.137468 (-1.53z)| norm 0.3409 (+1.89z)| lr 5.22e-03 | 1998.22 ms | 68.7% bf16 MFU | 260346 tok/s +step 5425/18794 | loss 3.207729 (+0.54z)| norm 0.2507 (+0.06z)| lr 5.22e-03 | 2007.31 ms | 68.4% bf16 MFU | 260388 tok/s +step 5426/18794 | loss 3.273070 (+2.41z)| norm 0.2369 (-0.22z)| lr 5.22e-03 | 2009.25 ms | 68.3% bf16 MFU | 260415 tok/s +step 5427/18794 | loss 3.220388 (+0.87z)| norm 0.2693 (+0.43z)| lr 5.22e-03 | 2009.29 ms | 68.3% bf16 MFU | 260441 tok/s +step 5428/18794 | loss 3.202783 (+0.35z)| norm 0.2527 (+0.09z)| lr 5.22e-03 | 2001.19 ms | 68.6% bf16 MFU | 260519 tok/s +step 5429/18794 | loss 3.160403 (-0.87z)| norm 0.2708 (+0.46z)| lr 5.22e-03 | 2025.21 ms | 67.8% bf16 MFU | 260437 tok/s +step 5430/18794 | loss 3.188788 (-0.07z)| norm 0.2197 (-0.58z)| lr 5.22e-03 | 1995.14 ms | 68.8% bf16 MFU | 260554 tok/s +step 5431/18794 | loss 3.212091 (+0.61z)| norm 0.2082 (-0.80z)| lr 5.22e-03 | 1991.31 ms | 68.9% bf16 MFU | 260691 tok/s +step 5432/18794 | loss 3.230832 (+1.14z)| norm 0.1932 (-1.10z)| lr 5.22e-03 | 2010.09 ms | 68.3% bf16 MFU | 260698 tok/s +step 5433/18794 | loss 3.217013 (+0.73z)| norm 0.1612 (-1.72z)| lr 5.21e-03 | 2008.58 ms | 68.3% bf16 MFU | 260714 tok/s +step 5434/18794 | loss 3.256621 (+1.85z)| norm 0.1943 (-1.02z)| lr 5.21e-03 | 2015.39 ms | 68.1% bf16 MFU | 260685 tok/s +step 5435/18794 | loss 3.266992 (+2.09z)| norm 0.2276 (-0.32z)| lr 5.21e-03 | 2018.54 ms | 68.0% bf16 MFU | 260638 tok/s +step 5436/18794 | loss 3.242183 (+1.37z)| norm 0.2813 (+0.77z)| lr 5.21e-03 | 2010.07 ms | 68.3% bf16 MFU | 260647 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.517400 +step 5437/18794 | loss 3.183827 (-0.29z)| norm 0.3707 (+2.52z)| lr 5.21e-03 | 1988.93 ms | 69.0% bf16 MFU | 260795 tok/s +step 5438/18794 | loss 3.246414 (+1.45z)| norm 0.2753 (+0.60z)| lr 5.21e-03 | 1999.76 ms | 68.6% bf16 MFU | 260864 tok/s +step 5439/18794 | loss 3.178509 (-0.43z)| norm 0.1954 (-0.99z)| lr 5.21e-03 | 2007.79 ms | 68.3% bf16 MFU | 260877 tok/s +step 5440/18794 | loss 3.152957 (-1.13z)| norm 0.2407 (-0.08z)| lr 5.21e-03 | 2006.99 ms | 68.4% bf16 MFU | 260895 tok/s +step 5441/18794 | loss 3.219431 (+0.75z)| norm 0.1910 (-1.06z)| lr 5.21e-03 | 2016.91 ms | 68.0% bf16 MFU | 260848 tok/s +step 5442/18794 | loss 3.225867 (+0.93z)| norm 0.1906 (-1.05z)| lr 5.21e-03 | 2007.60 ms | 68.4% bf16 MFU | 260863 tok/s +step 5443/18794 | loss 3.203202 (+0.28z)| norm 0.2067 (-0.71z)| lr 5.21e-03 | 2010.86 ms | 68.2% bf16 MFU | 260856 tok/s +step 5444/18794 | loss 3.145186 (-1.34z)| norm 0.2762 (+0.67z)| lr 5.21e-03 | 2002.74 ms | 68.5% bf16 MFU | 260902 tok/s +step 5445/18794 | loss 3.174301 (-0.51z)| norm 0.2887 (+0.91z)| lr 5.21e-03 | 2018.13 ms | 68.0% bf16 MFU | 260847 tok/s +step 5446/18794 | loss 3.184727 (-0.21z)| norm 0.2088 (-0.69z)| lr 5.21e-03 | 2012.28 ms | 68.2% bf16 MFU | 260832 tok/s +step 5447/18794 | loss 3.230347 (+1.05z)| norm 0.2363 (-0.14z)| lr 5.21e-03 | 2004.54 ms | 68.5% bf16 MFU | 260868 tok/s +step 5448/18794 | loss 3.166263 (-0.72z)| norm 0.2432 (-0.01z)| lr 5.21e-03 | 2000.58 ms | 68.6% bf16 MFU | 260928 tok/s +step 5449/18794 | loss 3.215088 (+0.65z)| norm 0.2538 (+0.19z)| lr 5.21e-03 | 2008.62 ms | 68.3% bf16 MFU | 260932 tok/s +step 5450/18794 | loss 3.209076 (+0.48z)| norm 0.3240 (+1.57z)| lr 5.21e-03 | 2000.80 ms | 68.6% bf16 MFU | 260988 tok/s +step 5451/18794 | loss 3.165247 (-0.75z)| norm 0.2479 (+0.04z)| lr 5.21e-03 | 2003.08 ms | 68.5% bf16 MFU | 261025 tok/s +step 5452/18794 | loss 3.122224 (-1.90z)| norm 0.2564 (+0.20z)| lr 5.21e-03 | 2001.11 ms | 68.6% bf16 MFU | 261074 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.900653 +step 5453/18794 | loss 3.224714 (+0.92z)| norm 0.3978 (+2.90z)| lr 5.21e-03 | 2010.93 ms | 68.2% bf16 MFU | 261056 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.093458 +step 5454/18794 | loss 3.212202 (+0.58z)| norm 0.3586 (+2.09z)| lr 5.21e-03 | 2011.00 ms | 68.2% bf16 MFU | 261039 tok/s +step 5455/18794 | loss 3.229928 (+1.06z)| norm 0.2170 (-0.59z)| lr 5.21e-03 | 2006.28 ms | 68.4% bf16 MFU | 261053 tok/s +step 5456/18794 | loss 3.112680 (-2.09z)| norm 0.2118 (-0.68z)| lr 5.21e-03 | 2006.63 ms | 68.4% bf16 MFU | 261064 tok/s +step 5457/18794 | loss 3.168541 (-0.59z)| norm 0.1988 (-0.94z)| lr 5.21e-03 | 2017.81 ms | 68.0% bf16 MFU | 261003 tok/s +step 5458/18794 | loss 3.167171 (-0.61z)| norm 0.2050 (-0.81z)| lr 5.21e-03 | 2001.10 ms | 68.6% bf16 MFU | 261052 tok/s +step 5459/18794 | loss 3.233772 (+1.17z)| norm 0.2187 (-0.54z)| lr 5.21e-03 | 2004.56 ms | 68.5% bf16 MFU | 261077 tok/s +step 5460/18794 | loss 3.188752 (-0.03z)| norm 0.1645 (-1.57z)| lr 5.21e-03 | 1997.24 ms | 68.7% bf16 MFU | 261149 tok/s +step 5461/18794 | loss 3.175781 (-0.40z)| norm 0.1886 (-1.10z)| lr 5.21e-03 | 2000.53 ms | 68.6% bf16 MFU | 261195 tok/s +step 5462/18794 | loss 3.216565 (+0.71z)| norm 0.2365 (-0.19z)| lr 5.20e-03 | 2002.84 ms | 68.5% bf16 MFU | 261224 tok/s +step 5463/18794 | loss 3.144961 (-1.24z)| norm 0.2607 (+0.27z)| lr 5.20e-03 | 2019.25 ms | 68.0% bf16 MFU | 261145 tok/s +step 5464/18794 | loss 3.192629 (+0.06z)| norm 0.2399 (-0.13z)| lr 5.20e-03 | 2010.55 ms | 68.3% bf16 MFU | 261126 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.433318 +step 5465/18794 | loss 3.164423 (-0.70z)| norm 0.3791 (+2.43z)| lr 5.20e-03 | 2014.50 ms | 68.1% bf16 MFU | 261082 tok/s +step 5466/18794 | loss 3.204191 (+0.37z)| norm 0.2795 (+0.57z)| lr 5.20e-03 | 2004.61 ms | 68.5% bf16 MFU | 261105 tok/s +step 5467/18794 | loss 3.154272 (-0.98z)| norm 0.2083 (-0.76z)| lr 5.20e-03 | 1995.60 ms | 68.8% bf16 MFU | 261186 tok/s +step 5468/18794 | loss 3.184573 (-0.17z)| norm 0.3119 (+1.16z)| lr 5.20e-03 | 2007.35 ms | 68.4% bf16 MFU | 261186 tok/s +step 5469/18794 | loss 3.243578 (+1.42z)| norm 0.3211 (+1.30z)| lr 5.20e-03 | 2001.57 ms | 68.6% bf16 MFU | 261224 tok/s +step 5470/18794 | loss 3.160705 (-0.88z)| norm 0.2596 (+0.16z)| lr 5.20e-03 | 1996.17 ms | 68.7% bf16 MFU | 261295 tok/s +step 5471/18794 | loss 3.258045 (+1.82z)| norm 0.2386 (-0.22z)| lr 5.20e-03 | 1999.39 ms | 68.6% bf16 MFU | 261341 tok/s +step 5472/18794 | loss 3.196578 (+0.12z)| norm 0.2242 (-0.48z)| lr 5.20e-03 | 1997.50 ms | 68.7% bf16 MFU | 261398 tok/s +step 5473/18794 | loss 3.195903 (+0.09z)| norm 0.1956 (-1.01z)| lr 5.20e-03 | 1998.68 ms | 68.7% bf16 MFU | 261444 tok/s +step 5474/18794 | loss 3.171349 (-0.60z)| norm 0.1891 (-1.12z)| lr 5.20e-03 | 2003.75 ms | 68.5% bf16 MFU | 261454 tok/s +step 5475/18794 | loss 3.226928 (+0.97z)| norm 0.1952 (-1.01z)| lr 5.20e-03 | 2004.35 ms | 68.5% bf16 MFU | 261460 tok/s +step 5476/18794 | loss 3.191067 (-0.11z)| norm 0.3068 (+1.04z)| lr 5.20e-03 | 1995.57 ms | 68.8% bf16 MFU | 261524 tok/s +step 5477/18794 | loss 3.222721 (+0.82z)| norm 0.2741 (+0.43z)| lr 5.20e-03 | 2001.52 ms | 68.6% bf16 MFU | 261545 tok/s +step 5478/18794 | loss 3.220844 (+0.75z)| norm 0.1957 (-1.00z)| lr 5.20e-03 | 2000.56 ms | 68.6% bf16 MFU | 261571 tok/s +step 5479/18794 | loss 3.188110 (-0.23z)| norm 0.2510 (+0.02z)| lr 5.20e-03 | 2003.06 ms | 68.5% bf16 MFU | 261580 tok/s +step 5480/18794 | loss 3.144964 (-1.50z)| norm 0.2724 (+0.41z)| lr 5.20e-03 | 1995.26 ms | 68.8% bf16 MFU | 261639 tok/s +step 5481/18794 | loss 3.151391 (-1.28z)| norm 0.2199 (-0.56z)| lr 5.20e-03 | 2007.34 ms | 68.4% bf16 MFU | 261616 tok/s +step 5482/18794 | loss 3.166509 (-0.81z)| norm 0.1996 (-0.91z)| lr 5.20e-03 | 1988.81 ms | 69.0% bf16 MFU | 261716 tok/s +step 5483/18794 | loss 3.187895 (-0.16z)| norm 0.2534 (+0.08z)| lr 5.20e-03 | 1998.91 ms | 68.7% bf16 MFU | 261745 tok/s +step 5484/18794 | loss 3.225046 (+0.97z)| norm 0.2515 (+0.07z)| lr 5.20e-03 | 1999.22 ms | 68.6% bf16 MFU | 261770 tok/s +step 5485/18794 | loss 3.175904 (-0.53z)| norm 0.2303 (-0.32z)| lr 5.20e-03 | 2004.62 ms | 68.5% bf16 MFU | 261759 tok/s +step 5486/18794 | loss 3.248118 (+1.63z)| norm 0.2202 (-0.52z)| lr 5.20e-03 | 2010.69 ms | 68.3% bf16 MFU | 261708 tok/s +step 5487/18794 | loss 3.215767 (+0.64z)| norm 0.2208 (-0.51z)| lr 5.20e-03 | 1996.45 ms | 68.7% bf16 MFU | 261753 tok/s +step 5488/18794 | loss 3.157906 (-1.10z)| norm 0.1824 (-1.21z)| lr 5.20e-03 | 1988.02 ms | 69.0% bf16 MFU | 261852 tok/s +step 5489/18794 | loss 3.188568 (-0.17z)| norm 0.1962 (-0.95z)| lr 5.20e-03 | 2002.97 ms | 68.5% bf16 MFU | 261847 tok/s +step 5490/18794 | loss 3.215787 (+0.64z)| norm 0.2105 (-0.70z)| lr 5.20e-03 | 2002.08 ms | 68.5% bf16 MFU | 261848 tok/s +step 5491/18794 | loss 3.158001 (-1.08z)| norm 0.1998 (-0.90z)| lr 5.20e-03 | 2010.40 ms | 68.3% bf16 MFU | 261795 tok/s +step 5492/18794 | loss 3.240167 (+1.35z)| norm 0.2463 (-0.03z)| lr 5.19e-03 | 1994.89 ms | 68.8% bf16 MFU | 261846 tok/s +step 5493/18794 | loss 3.229962 (+1.03z)| norm 0.2836 (+0.67z)| lr 5.19e-03 | 1991.81 ms | 68.9% bf16 MFU | 261915 tok/s +step 5494/18794 | loss 3.219147 (+0.70z)| norm 0.2621 (+0.28z)| lr 5.19e-03 | 1988.39 ms | 69.0% bf16 MFU | 262003 tok/s +step 5495/18794 | loss 3.202859 (+0.21z)| norm 0.2106 (-0.69z)| lr 5.19e-03 | 1996.46 ms | 68.7% bf16 MFU | 262033 tok/s +step 5496/18794 | loss 3.180930 (-0.44z)| norm 0.2923 (+0.86z)| lr 5.19e-03 | 1995.23 ms | 68.8% bf16 MFU | 262070 tok/s +step 5497/18794 | loss 3.227149 (+0.95z)| norm 0.3170 (+1.32z)| lr 5.19e-03 | 1995.34 ms | 68.8% bf16 MFU | 262104 tok/s +step 5498/18794 | loss 3.148190 (-1.39z)| norm 0.2514 (+0.11z)| lr 5.19e-03 | 1987.55 ms | 69.0% bf16 MFU | 262188 tok/s +step 5499/18794 | loss 3.146019 (-1.43z)| norm 0.2312 (-0.27z)| lr 5.19e-03 | 1996.10 ms | 68.8% bf16 MFU | 262212 tok/s +step 5500/18794 | loss 3.160891 (-0.97z)| norm 0.2639 (+0.38z)| lr 5.19e-03 | 1993.30 ms | 68.8% bf16 MFU | 262253 tok/s +val loss 3.224534 +HellaSwag: 2896/10042 = 0.288389Swag: 990/1256: 0/1256 +Writing checkpoint at step 5500 +Writing model to log_gpt3_125M_edu_v4/model_00005500.bin +Writing state to log_gpt3_125M_edu_v4/state_00005500_00001.bin +Writing state to log_gpt3_125M_edu_v4/state_00005500_00000.bin +Deleting checkpoint at step 3000 +step 5501/18794 | loss 3.182244 (-0.33z)| norm 0.2501 (+0.10z)| lr 5.19e-03 | 1987.39 ms | 69.1% bf16 MFU | 262330 tok/s +step 5502/18794 | loss 3.158623 (-1.01z)| norm 0.2309 (-0.29z)| lr 5.19e-03 | 1990.86 ms | 68.9% bf16 MFU | 262381 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.528245 +step 5503/18794 | loss 3.170825 (-0.66z)| norm 0.3768 (+2.53z)| lr 5.19e-03 | 2001.40 ms | 68.6% bf16 MFU | 262360 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.000300 +step 5504/18794 | loss 3.250460 (+1.66z)| norm 0.3527 (+2.00z)| lr 5.19e-03 | 1986.02 ms | 69.1% bf16 MFU | 262442 tok/s +step 5505/18794 | loss 3.191114 (-0.08z)| norm 0.1890 (-1.10z)| lr 5.19e-03 | 1988.01 ms | 69.0% bf16 MFU | 262506 tok/s +step 5506/18794 | loss 3.211617 (+0.51z)| norm 0.2571 (+0.18z)| lr 5.19e-03 | 2000.19 ms | 68.6% bf16 MFU | 262486 tok/s +step 5507/18794 | loss 3.213868 (+0.57z)| norm 0.2123 (-0.67z)| lr 5.19e-03 | 1988.97 ms | 69.0% bf16 MFU | 262542 tok/s +step 5508/18794 | loss 3.218164 (+0.68z)| norm 0.2436 (-0.06z)| lr 5.19e-03 | 1984.97 ms | 69.1% bf16 MFU | 262621 tok/s +step 5509/18794 | loss 3.140376 (-1.59z)| norm 0.1948 (-0.99z)| lr 5.19e-03 | 1989.94 ms | 69.0% bf16 MFU | 262664 tok/s +step 5510/18794 | loss 3.223670 (+0.83z)| norm 0.2374 (-0.17z)| lr 5.19e-03 | 1979.34 ms | 69.3% bf16 MFU | 262774 tok/s +step 5511/18794 | loss 3.146763 (-1.38z)| norm 0.3323 (+1.63z)| lr 5.19e-03 | 1986.89 ms | 69.1% bf16 MFU | 262829 tok/s +step 5512/18794 | loss 3.210064 (+0.43z)| norm 0.3129 (+1.24z)| lr 5.19e-03 | 1986.97 ms | 69.1% bf16 MFU | 262881 tok/s +step 5513/18794 | loss 3.196097 (+0.02z)| norm 0.1821 (-1.24z)| lr 5.19e-03 | 1989.08 ms | 69.0% bf16 MFU | 262916 tok/s +step 5514/18794 | loss 3.161394 (-0.99z)| norm 0.2171 (-0.59z)| lr 5.19e-03 | 1989.97 ms | 69.0% bf16 MFU | 262944 tok/s +step 5515/18794 | loss 3.266404 (+2.00z)| norm 0.2324 (-0.31z)| lr 5.19e-03 | 1987.77 ms | 69.0% bf16 MFU | 262984 tok/s +step 5516/18794 | loss 3.138985 (-1.61z)| norm 0.3117 (+1.19z)| lr 5.19e-03 | 1985.73 ms | 69.1% bf16 MFU | 263037 tok/s +step 5517/18794 | loss 3.205469 (+0.28z)| norm 0.2543 (+0.10z)| lr 5.19e-03 | 1982.21 ms | 69.2% bf16 MFU | 263110 tok/s +step 5518/18794 | loss 3.193489 (-0.07z)| norm 0.1931 (-1.05z)| lr 5.19e-03 | 1987.39 ms | 69.1% bf16 MFU | 263144 tok/s +step 5519/18794 | loss 3.193313 (-0.07z)| norm 0.2001 (-0.91z)| lr 5.19e-03 | 1980.46 ms | 69.3% bf16 MFU | 263224 tok/s +step 5520/18794 | loss 3.169862 (-0.73z)| norm 0.1941 (-1.00z)| lr 5.19e-03 | 1988.92 ms | 69.0% bf16 MFU | 263243 tok/s +step 5521/18794 | loss 3.166076 (-0.83z)| norm 0.1930 (-1.02z)| lr 5.18e-03 | 1985.74 ms | 69.1% bf16 MFU | 263282 tok/s +step 5522/18794 | loss 3.189144 (-0.18z)| norm 0.2259 (-0.38z)| lr 5.18e-03 | 1980.52 ms | 69.3% bf16 MFU | 263354 tok/s +step 5523/18794 | loss 3.246181 (+1.46z)| norm 0.1905 (-1.06z)| lr 5.18e-03 | 1979.99 ms | 69.3% bf16 MFU | 263426 tok/s +step 5524/18794 | loss 3.219946 (+0.69z)| norm 0.2703 (+0.53z)| lr 5.18e-03 | 1979.88 ms | 69.3% bf16 MFU | 263495 tok/s +step 5525/18794 | loss 3.195541 (-0.01z)| norm 0.2426 (-0.02z)| lr 5.18e-03 | 1980.59 ms | 69.3% bf16 MFU | 263556 tok/s +step 5526/18794 | loss 3.176347 (-0.55z)| norm 0.1761 (-1.32z)| lr 5.18e-03 | 1979.18 ms | 69.3% bf16 MFU | 263623 tok/s +step 5527/18794 | loss 3.206843 (+0.36z)| norm 0.2589 (+0.32z)| lr 5.18e-03 | 1980.13 ms | 69.3% bf16 MFU | 263681 tok/s +reducing beta2 to 0.9 and lr/wd by 0.934 due to grad z-score of 3.747499 +step 5528/18794 | loss 3.192543 (-0.07z)| norm 0.4491 (+3.75z)| lr 4.84e-03 | 1982.07 ms | 69.2% bf16 MFU | 263722 tok/s +step 5529/18794 | loss 3.165896 (-0.86z)| norm 0.2438 (-0.01z)| lr 5.18e-03 | 2035.07 ms | 67.4% bf16 MFU | 263418 tok/s +step 5530/18794 | loss 3.275683 (+2.32z)| norm 0.2617 (+0.31z)| lr 5.18e-03 | 2043.33 ms | 67.2% bf16 MFU | 263076 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.310460 +step 5531/18794 | loss 3.206879 (+0.33z)| norm 0.3756 (+2.31z)| lr 5.18e-03 | 2036.29 ms | 67.4% bf16 MFU | 262796 tok/s +step 5532/18794 | loss 3.164851 (-0.87z)| norm 0.2176 (-0.53z)| lr 5.18e-03 | 2026.41 ms | 67.7% bf16 MFU | 262592 tok/s +step 5533/18794 | loss 3.131470 (-1.79z)| norm 0.2064 (-0.74z)| lr 5.18e-03 | 2041.34 ms | 67.2% bf16 MFU | 262305 tok/s +step 5534/18794 | loss 3.175570 (-0.52z)| norm 0.1871 (-1.09z)| lr 5.18e-03 | 2041.97 ms | 67.2% bf16 MFU | 262027 tok/s +step 5535/18794 | loss 3.177152 (-0.45z)| norm 0.2076 (-0.71z)| lr 5.18e-03 | 2033.99 ms | 67.5% bf16 MFU | 261814 tok/s +step 5536/18794 | loss 3.155228 (-1.08z)| norm 0.1790 (-1.21z)| lr 5.18e-03 | 2034.60 ms | 67.4% bf16 MFU | 261608 tok/s +step 5537/18794 | loss 3.177519 (-0.42z)| norm 0.2433 (-0.03z)| lr 5.18e-03 | 2041.19 ms | 67.2% bf16 MFU | 261370 tok/s +step 5538/18794 | loss 3.251444 (+1.78z)| norm 0.1872 (-1.04z)| lr 5.18e-03 | 2033.87 ms | 67.5% bf16 MFU | 261190 tok/s +step 5539/18794 | loss 3.187675 (-0.12z)| norm 0.1785 (-1.20z)| lr 5.18e-03 | 2042.14 ms | 67.2% bf16 MFU | 260968 tok/s +step 5540/18794 | loss 3.167831 (-0.72z)| norm 0.2820 (+0.69z)| lr 5.18e-03 | 2018.42 ms | 68.0% bf16 MFU | 260907 tok/s +step 5541/18794 | loss 3.153962 (-1.11z)| norm 0.3367 (+1.65z)| lr 5.18e-03 | 2034.80 ms | 67.4% bf16 MFU | 260744 tok/s +step 5542/18794 | loss 3.211090 (+0.60z)| norm 0.2361 (-0.18z)| lr 5.18e-03 | 2041.90 ms | 67.2% bf16 MFU | 260545 tok/s +step 5543/18794 | loss 3.194127 (+0.10z)| norm 0.2419 (-0.08z)| lr 5.18e-03 | 2042.16 ms | 67.2% bf16 MFU | 260355 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.980762 +step 5544/18794 | loss 3.211814 (+0.61z)| norm 0.4187 (+2.98z)| lr 5.18e-03 | 2042.98 ms | 67.2% bf16 MFU | 260168 tok/s +step 5545/18794 | loss 3.173829 (-0.53z)| norm 0.3203 (+1.25z)| lr 5.18e-03 | 2042.53 ms | 67.2% bf16 MFU | 259994 tok/s +step 5546/18794 | loss 3.150062 (-1.23z)| norm 0.2045 (-0.76z)| lr 5.18e-03 | 2027.48 ms | 67.7% bf16 MFU | 259924 tok/s +step 5547/18794 | loss 3.190182 (-0.02z)| norm 0.3526 (+1.76z)| lr 5.18e-03 | 2043.18 ms | 67.2% bf16 MFU | 259758 tok/s +step 5548/18794 | loss 3.158731 (-0.96z)| norm 0.3613 (+1.86z)| lr 5.18e-03 | 2043.53 ms | 67.2% bf16 MFU | 259598 tok/s +step 5549/18794 | loss 3.169412 (-0.63z)| norm 0.1934 (-0.94z)| lr 5.18e-03 | 2043.76 ms | 67.1% bf16 MFU | 259445 tok/s +step 5550/18794 | loss 3.198276 (+0.25z)| norm 0.2440 (-0.09z)| lr 5.18e-03 | 2043.25 ms | 67.2% bf16 MFU | 259302 tok/s +step 5551/18794 | loss 3.168007 (-0.67z)| norm 0.2448 (-0.07z)| lr 5.17e-03 | 2034.88 ms | 67.4% bf16 MFU | 259220 tok/s +step 5552/18794 | loss 3.195075 (+0.13z)| norm 0.1656 (-1.38z)| lr 5.17e-03 | 2011.36 ms | 68.2% bf16 MFU | 259292 tok/s +step 5553/18794 | loss 3.179505 (-0.34z)| norm 0.1887 (-0.98z)| lr 5.17e-03 | 2009.99 ms | 68.3% bf16 MFU | 259369 tok/s +step 5554/18794 | loss 3.195395 (+0.16z)| norm 0.1750 (-1.20z)| lr 5.17e-03 | 2033.26 ms | 67.5% bf16 MFU | 259294 tok/s +step 5555/18794 | loss 3.185176 (-0.15z)| norm 0.1709 (-1.25z)| lr 5.17e-03 | 2042.38 ms | 67.2% bf16 MFU | 259164 tok/s +step 5556/18794 | loss 3.173419 (-0.55z)| norm 0.1855 (-1.00z)| lr 5.17e-03 | 2026.84 ms | 67.7% bf16 MFU | 259140 tok/s +step 5557/18794 | loss 3.195620 (+0.16z)| norm 0.1930 (-0.86z)| lr 5.17e-03 | 2034.91 ms | 67.4% bf16 MFU | 259065 tok/s +step 5558/18794 | loss 3.178949 (-0.38z)| norm 0.1711 (-1.23z)| lr 5.17e-03 | 2035.10 ms | 67.4% bf16 MFU | 258993 tok/s +step 5559/18794 | loss 3.204274 (+0.45z)| norm 0.2256 (-0.30z)| lr 5.17e-03 | 2019.88 ms | 67.9% bf16 MFU | 259022 tok/s +step 5560/18794 | loss 3.221006 (+0.98z)| norm 0.1973 (-0.79z)| lr 5.17e-03 | 2035.09 ms | 67.4% bf16 MFU | 258952 tok/s +step 5561/18794 | loss 3.227738 (+1.18z)| norm 0.2105 (-0.57z)| lr 5.17e-03 | 2042.63 ms | 67.2% bf16 MFU | 258838 tok/s +step 5562/18794 | loss 3.155754 (-1.13z)| norm 0.3067 (+1.07z)| lr 5.17e-03 | 2018.29 ms | 68.0% bf16 MFU | 258884 tok/s +step 5563/18794 | loss 3.165291 (-0.83z)| norm 0.2998 (+0.94z)| lr 5.17e-03 | 2026.85 ms | 67.7% bf16 MFU | 258874 tok/s +step 5564/18794 | loss 3.174716 (-0.52z)| norm 0.2246 (-0.34z)| lr 5.17e-03 | 2034.65 ms | 67.4% bf16 MFU | 258814 tok/s +step 5565/18794 | loss 3.178756 (-0.40z)| norm 0.2389 (-0.08z)| lr 5.17e-03 | 2035.08 ms | 67.4% bf16 MFU | 258754 tok/s +step 5566/18794 | loss 3.223156 (+1.04z)| norm 0.2350 (-0.14z)| lr 5.17e-03 | 2027.23 ms | 67.7% bf16 MFU | 258748 tok/s +step 5567/18794 | loss 3.189216 (-0.07z)| norm 0.2331 (-0.17z)| lr 5.17e-03 | 2041.20 ms | 67.2% bf16 MFU | 258653 tok/s +step 5568/18794 | loss 3.179515 (-0.39z)| norm 0.2265 (-0.28z)| lr 5.17e-03 | 2034.16 ms | 67.5% bf16 MFU | 258608 tok/s +step 5569/18794 | loss 3.202158 (+0.37z)| norm 0.2616 (+0.36z)| lr 5.17e-03 | 2039.73 ms | 67.3% bf16 MFU | 258529 tok/s +step 5570/18794 | loss 3.158709 (-1.07z)| norm 0.2950 (+0.94z)| lr 5.17e-03 | 2035.06 ms | 67.4% bf16 MFU | 258484 tok/s +step 5571/18794 | loss 3.171096 (-0.64z)| norm 0.2156 (-0.46z)| lr 5.17e-03 | 2013.58 ms | 68.2% bf16 MFU | 258579 tok/s +step 5572/18794 | loss 3.179458 (-0.35z)| norm 0.1558 (-1.50z)| lr 5.17e-03 | 2042.16 ms | 67.2% bf16 MFU | 258486 tok/s +step 5573/18794 | loss 3.123931 (-2.16z)| norm 0.1709 (-1.22z)| lr 5.17e-03 | 2028.42 ms | 67.7% bf16 MFU | 258486 tok/s +step 5574/18794 | loss 3.235164 (+1.48z)| norm 0.1523 (-1.53z)| lr 5.17e-03 | 2035.83 ms | 67.4% bf16 MFU | 258438 tok/s +step 5575/18794 | loss 3.158278 (-1.01z)| norm 0.1590 (-1.40z)| lr 5.17e-03 | 2027.97 ms | 67.7% bf16 MFU | 258442 tok/s +step 5576/18794 | loss 3.182358 (-0.22z)| norm 0.1565 (-1.41z)| lr 5.17e-03 | 2036.39 ms | 67.4% bf16 MFU | 258393 tok/s +step 5577/18794 | loss 3.166052 (-0.74z)| norm 0.1753 (-1.07z)| lr 5.17e-03 | 2034.24 ms | 67.5% bf16 MFU | 258360 tok/s +step 5578/18794 | loss 3.188906 (+0.02z)| norm 0.2015 (-0.62z)| lr 5.17e-03 | 2019.20 ms | 68.0% bf16 MFU | 258425 tok/s +step 5579/18794 | loss 3.183288 (-0.16z)| norm 0.2272 (-0.17z)| lr 5.17e-03 | 2005.35 ms | 68.4% bf16 MFU | 258576 tok/s +step 5580/18794 | loss 3.152048 (-1.20z)| norm 0.2000 (-0.63z)| lr 5.16e-03 | 2003.63 ms | 68.5% bf16 MFU | 258730 tok/s +step 5581/18794 | loss 3.175057 (-0.45z)| norm 0.1781 (-0.99z)| lr 5.16e-03 | 2018.94 ms | 68.0% bf16 MFU | 258778 tok/s +step 5582/18794 | loss 3.178204 (-0.35z)| norm 0.1872 (-0.83z)| lr 5.16e-03 | 2027.53 ms | 67.7% bf16 MFU | 258768 tok/s +step 5583/18794 | loss 3.168873 (-0.65z)| norm 0.2277 (-0.14z)| lr 5.16e-03 | 2018.54 ms | 68.0% bf16 MFU | 258817 tok/s +step 5584/18794 | loss 3.233436 (+1.50z)| norm 0.2313 (-0.07z)| lr 5.16e-03 | 2019.07 ms | 68.0% bf16 MFU | 258859 tok/s +step 5585/18794 | loss 3.140182 (-1.58z)| norm 0.1907 (-0.76z)| lr 5.16e-03 | 2026.95 ms | 67.7% bf16 MFU | 258849 tok/s +step 5586/18794 | loss 3.150500 (-1.22z)| norm 0.1966 (-0.65z)| lr 5.16e-03 | 2002.45 ms | 68.5% bf16 MFU | 258998 tok/s +step 5587/18794 | loss 3.187637 (+0.03z)| norm 0.1765 (-0.98z)| lr 5.16e-03 | 2034.26 ms | 67.5% bf16 MFU | 258935 tok/s +step 5588/18794 | loss 3.218070 (+1.02z)| norm 0.2143 (-0.35z)| lr 5.16e-03 | 2019.14 ms | 68.0% bf16 MFU | 258971 tok/s +step 5589/18794 | loss 3.128022 (-1.93z)| norm 0.3003 (+1.09z)| lr 5.16e-03 | 2018.18 ms | 68.0% bf16 MFU | 259011 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.060394 +step 5590/18794 | loss 3.186606 (+0.00z)| norm 0.3622 (+2.06z)| lr 5.16e-03 | 2018.61 ms | 68.0% bf16 MFU | 259047 tok/s +step 5591/18794 | loss 3.201971 (+0.49z)| norm 0.3198 (+1.33z)| lr 5.16e-03 | 2019.09 ms | 68.0% bf16 MFU | 259078 tok/s +step 5592/18794 | loss 3.191812 (+0.18z)| norm 0.1982 (-0.65z)| lr 5.16e-03 | 2008.68 ms | 68.3% bf16 MFU | 259175 tok/s +step 5593/18794 | loss 3.208045 (+0.74z)| norm 0.2424 (+0.08z)| lr 5.16e-03 | 2032.95 ms | 67.5% bf16 MFU | 259111 tok/s +step 5594/18794 | loss 3.181780 (-0.14z)| norm 0.2895 (+0.84z)| lr 5.16e-03 | 2019.07 ms | 68.0% bf16 MFU | 259139 tok/s +step 5595/18794 | loss 3.208097 (+0.75z)| norm 0.2479 (+0.15z)| lr 5.16e-03 | 2004.07 ms | 68.5% bf16 MFU | 259262 tok/s +step 5596/18794 | loss 3.143980 (-1.40z)| norm 0.2062 (-0.51z)| lr 5.16e-03 | 2033.68 ms | 67.5% bf16 MFU | 259189 tok/s +step 5597/18794 | loss 3.168407 (-0.56z)| norm 0.1768 (-0.98z)| lr 5.16e-03 | 2018.28 ms | 68.0% bf16 MFU | 259218 tok/s +step 5598/18794 | loss 3.256666 (+2.35z)| norm 0.2052 (-0.50z)| lr 5.16e-03 | 2010.16 ms | 68.3% bf16 MFU | 259298 tok/s +step 5599/18794 | loss 3.200542 (+0.47z)| norm 0.2366 (+0.01z)| lr 5.16e-03 | 2017.48 ms | 68.0% bf16 MFU | 259327 tok/s +step 5600/18794 | loss 3.210593 (+0.79z)| norm 0.2036 (-0.52z)| lr 5.16e-03 | 2018.12 ms | 68.0% bf16 MFU | 259350 tok/s +step 5601/18794 | loss 3.280610 (+2.96z)| norm 0.2155 (-0.32z)| lr 5.16e-03 | 2027.37 ms | 67.7% bf16 MFU | 259313 tok/s +step 5602/18794 | loss 3.176891 (-0.37z)| norm 0.1894 (-0.74z)| lr 5.16e-03 | 2027.51 ms | 67.7% bf16 MFU | 259277 tok/s +step 5603/18794 | loss 3.145487 (-1.36z)| norm 0.2076 (-0.42z)| lr 5.16e-03 | 2003.11 ms | 68.5% bf16 MFU | 259400 tok/s +step 5604/18794 | loss 3.220132 (+1.05z)| norm 0.2301 (-0.02z)| lr 5.16e-03 | 2011.84 ms | 68.2% bf16 MFU | 259460 tok/s +step 5605/18794 | loss 3.191841 (+0.13z)| norm 0.2651 (+0.57z)| lr 5.16e-03 | 2026.25 ms | 67.7% bf16 MFU | 259424 tok/s +step 5606/18794 | loss 3.169421 (-0.58z)| norm 0.2052 (-0.46z)| lr 5.16e-03 | 2008.99 ms | 68.3% bf16 MFU | 259501 tok/s +step 5607/18794 | loss 3.172892 (-0.46z)| norm 0.1949 (-0.63z)| lr 5.16e-03 | 2027.11 ms | 67.7% bf16 MFU | 259458 tok/s +step 5608/18794 | loss 3.171912 (-0.48z)| norm 0.2091 (-0.38z)| lr 5.16e-03 | 2018.28 ms | 68.0% bf16 MFU | 259474 tok/s +step 5609/18794 | loss 3.210013 (+0.75z)| norm 0.2561 (+0.42z)| lr 5.15e-03 | 2027.57 ms | 67.7% bf16 MFU | 259429 tok/s +step 5610/18794 | loss 3.204578 (+0.58z)| norm 0.2431 (+0.19z)| lr 5.15e-03 | 2009.49 ms | 68.3% bf16 MFU | 259503 tok/s +step 5611/18794 | loss 3.252251 (+2.10z)| norm 0.2432 (+0.21z)| lr 5.15e-03 | 2033.96 ms | 67.5% bf16 MFU | 259416 tok/s +step 5612/18794 | loss 3.153950 (-1.09z)| norm 0.2068 (-0.41z)| lr 5.15e-03 | 2018.78 ms | 68.0% bf16 MFU | 259431 tok/s +step 5613/18794 | loss 3.253637 (+2.09z)| norm 0.1890 (-0.72z)| lr 5.15e-03 | 2018.78 ms | 68.0% bf16 MFU | 259444 tok/s +step 5614/18794 | loss 3.238812 (+1.58z)| norm 0.2120 (-0.32z)| lr 5.15e-03 | 2011.13 ms | 68.2% bf16 MFU | 259507 tok/s +step 5615/18794 | loss 3.246524 (+1.85z)| norm 0.2152 (-0.26z)| lr 5.15e-03 | 2011.13 ms | 68.2% bf16 MFU | 259566 tok/s +step 5616/18794 | loss 3.228416 (+1.25z)| norm 0.2102 (-0.33z)| lr 5.15e-03 | 2012.00 ms | 68.2% bf16 MFU | 259617 tok/s +step 5617/18794 | loss 3.158784 (-0.97z)| norm 0.1670 (-1.08z)| lr 5.15e-03 | 2025.44 ms | 67.8% bf16 MFU | 259579 tok/s +step 5618/18794 | loss 3.164373 (-0.78z)| norm 0.2069 (-0.37z)| lr 5.15e-03 | 2009.72 ms | 68.3% bf16 MFU | 259643 tok/s +step 5619/18794 | loss 3.159888 (-0.91z)| norm 0.1790 (-0.87z)| lr 5.15e-03 | 2011.30 ms | 68.2% bf16 MFU | 259695 tok/s +step 5620/18794 | loss 3.205236 (+0.52z)| norm 0.1718 (-0.99z)| lr 5.15e-03 | 2002.87 ms | 68.5% bf16 MFU | 259798 tok/s +step 5621/18794 | loss 3.229159 (+1.26z)| norm 0.2038 (-0.42z)| lr 5.15e-03 | 2034.59 ms | 67.4% bf16 MFU | 259693 tok/s +step 5622/18794 | loss 3.185407 (-0.13z)| norm 0.1492 (-1.36z)| lr 5.15e-03 | 2018.90 ms | 68.0% bf16 MFU | 259693 tok/s +step 5623/18794 | loss 3.207828 (+0.61z)| norm 0.2277 (+0.01z)| lr 5.15e-03 | 2002.29 ms | 68.5% bf16 MFU | 259800 tok/s +step 5624/18794 | loss 3.183225 (-0.17z)| norm 0.2581 (+0.55z)| lr 5.15e-03 | 2010.31 ms | 68.3% bf16 MFU | 259850 tok/s +step 5625/18794 | loss 3.203058 (+0.46z)| norm 0.1955 (-0.55z)| lr 5.15e-03 | 2009.96 ms | 68.3% bf16 MFU | 259900 tok/s +step 5626/18794 | loss 3.182782 (-0.19z)| norm 0.2026 (-0.43z)| lr 5.15e-03 | 2003.66 ms | 68.5% bf16 MFU | 259988 tok/s +step 5627/18794 | loss 3.223909 (+1.12z)| norm 0.1937 (-0.57z)| lr 5.15e-03 | 1987.03 ms | 69.1% bf16 MFU | 260182 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.283424 +step 5628/18794 | loss 3.144220 (-1.41z)| norm 0.4071 (+3.28z)| lr 5.15e-03 | 2026.11 ms | 67.7% bf16 MFU | 260111 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.076512 +step 5629/18794 | loss 3.162425 (-0.83z)| norm 0.4060 (+3.08z)| lr 5.15e-03 | 2011.53 ms | 68.2% bf16 MFU | 260137 tok/s +step 5630/18794 | loss 3.147943 (-1.29z)| norm 0.1809 (-0.79z)| lr 5.15e-03 | 2018.35 ms | 68.0% bf16 MFU | 260118 tok/s +step 5631/18794 | loss 3.178692 (-0.27z)| norm 0.2573 (+0.57z)| lr 5.15e-03 | 2018.08 ms | 68.0% bf16 MFU | 260102 tok/s +step 5632/18794 | loss 3.171008 (-0.52z)| norm 0.1941 (-0.55z)| lr 5.15e-03 | 1986.03 ms | 69.1% bf16 MFU | 260297 tok/s +step 5633/18794 | loss 3.155905 (-1.04z)| norm 0.2055 (-0.35z)| lr 5.15e-03 | 2000.86 ms | 68.6% bf16 MFU | 260383 tok/s +step 5634/18794 | loss 3.124292 (-2.03z)| norm 0.1904 (-0.62z)| lr 5.15e-03 | 1994.46 ms | 68.8% bf16 MFU | 260508 tok/s +step 5635/18794 | loss 3.207788 (+0.68z)| norm 0.1984 (-0.48z)| lr 5.15e-03 | 2002.63 ms | 68.5% bf16 MFU | 260572 tok/s +step 5636/18794 | loss 3.209438 (+0.72z)| norm 0.1952 (-0.54z)| lr 5.15e-03 | 2001.91 ms | 68.6% bf16 MFU | 260638 tok/s +step 5637/18794 | loss 3.152186 (-1.14z)| norm 0.1688 (-0.99z)| lr 5.14e-03 | 2002.86 ms | 68.5% bf16 MFU | 260695 tok/s +step 5638/18794 | loss 3.143887 (-1.39z)| norm 0.2067 (-0.32z)| lr 5.14e-03 | 2003.05 ms | 68.5% bf16 MFU | 260747 tok/s +step 5639/18794 | loss 3.139485 (-1.50z)| norm 0.2127 (-0.22z)| lr 5.14e-03 | 1995.72 ms | 68.8% bf16 MFU | 260845 tok/s +step 5640/18794 | loss 3.162541 (-0.75z)| norm 0.2341 (+0.17z)| lr 5.14e-03 | 2009.06 ms | 68.3% bf16 MFU | 260851 tok/s +step 5641/18794 | loss 3.146167 (-1.27z)| norm 0.2283 (+0.09z)| lr 5.14e-03 | 2004.18 ms | 68.5% bf16 MFU | 260889 tok/s +step 5642/18794 | loss 3.185396 (+0.00z)| norm 0.2140 (-0.17z)| lr 5.14e-03 | 2004.02 ms | 68.5% bf16 MFU | 260925 tok/s +step 5643/18794 | loss 3.144171 (-1.31z)| norm 0.2319 (+0.16z)| lr 5.14e-03 | 1996.31 ms | 68.7% bf16 MFU | 261010 tok/s +step 5644/18794 | loss 3.209069 (+0.78z)| norm 0.2613 (+0.77z)| lr 5.14e-03 | 2004.48 ms | 68.5% bf16 MFU | 261038 tok/s +step 5645/18794 | loss 3.214722 (+0.95z)| norm 0.2802 (+1.16z)| lr 5.14e-03 | 2020.34 ms | 67.9% bf16 MFU | 260961 tok/s +step 5646/18794 | loss 3.171978 (-0.43z)| norm 0.2732 (+1.01z)| lr 5.14e-03 | 2001.99 ms | 68.5% bf16 MFU | 261007 tok/s +step 5647/18794 | loss 3.148266 (-1.18z)| norm 0.2168 (-0.08z)| lr 5.14e-03 | 2002.70 ms | 68.5% bf16 MFU | 261046 tok/s +step 5648/18794 | loss 3.187129 (+0.06z)| norm 0.1680 (-1.07z)| lr 5.14e-03 | 1986.55 ms | 69.1% bf16 MFU | 261190 tok/s +step 5649/18794 | loss 3.235832 (+1.59z)| norm 0.2149 (-0.09z)| lr 5.14e-03 | 1994.82 ms | 68.8% bf16 MFU | 261272 tok/s +step 5650/18794 | loss 3.144488 (-1.29z)| norm 0.1982 (-0.43z)| lr 5.14e-03 | 2003.03 ms | 68.5% bf16 MFU | 261295 tok/s +step 5651/18794 | loss 3.188454 (+0.09z)| norm 0.1694 (-1.02z)| lr 5.14e-03 | 2018.51 ms | 68.0% bf16 MFU | 261218 tok/s +step 5652/18794 | loss 3.204972 (+0.61z)| norm 0.2021 (-0.34z)| lr 5.14e-03 | 2034.44 ms | 67.5% bf16 MFU | 261042 tok/s +step 5653/18794 | loss 3.144170 (-1.29z)| norm 0.2483 (+0.63z)| lr 5.14e-03 | 1996.56 ms | 68.7% bf16 MFU | 261120 tok/s +step 5654/18794 | loss 3.273272 (+2.63z)| norm 0.2453 (+0.55z)| lr 5.14e-03 | 1981.09 ms | 69.3% bf16 MFU | 261296 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.404103 +step 5655/18794 | loss 3.243441 (+1.69z)| norm 0.3910 (+3.40z)| lr 5.14e-03 | 2010.39 ms | 68.3% bf16 MFU | 261271 tok/s +step 5656/18794 | loss 3.166292 (-0.60z)| norm 0.2814 (+1.18z)| lr 5.14e-03 | 1994.14 ms | 68.8% bf16 MFU | 261353 tok/s +step 5657/18794 | loss 3.186795 (+0.01z)| norm 0.2039 (-0.37z)| lr 5.14e-03 | 2002.18 ms | 68.5% bf16 MFU | 261378 tok/s +step 5658/18794 | loss 3.184008 (-0.08z)| norm 0.2296 (+0.13z)| lr 5.14e-03 | 1996.28 ms | 68.7% bf16 MFU | 261441 tok/s +step 5659/18794 | loss 3.238291 (+1.51z)| norm 0.2829 (+1.18z)| lr 5.14e-03 | 1994.80 ms | 68.8% bf16 MFU | 261510 tok/s +step 5660/18794 | loss 3.172359 (-0.42z)| norm 0.2476 (+0.47z)| lr 5.14e-03 | 1995.20 ms | 68.8% bf16 MFU | 261574 tok/s +step 5661/18794 | loss 3.258469 (+2.09z)| norm 0.2038 (-0.41z)| lr 5.14e-03 | 2001.97 ms | 68.5% bf16 MFU | 261589 tok/s +step 5662/18794 | loss 3.210883 (+0.69z)| norm 0.3152 (+1.81z)| lr 5.14e-03 | 2011.23 ms | 68.2% bf16 MFU | 261544 tok/s +step 5663/18794 | loss 3.179472 (-0.23z)| norm 0.2898 (+1.31z)| lr 5.14e-03 | 1988.24 ms | 69.0% bf16 MFU | 261651 tok/s +step 5664/18794 | loss 3.181147 (-0.19z)| norm 0.1655 (-1.15z)| lr 5.14e-03 | 1993.69 ms | 68.8% bf16 MFU | 261717 tok/s +step 5665/18794 | loss 3.137808 (-1.43z)| norm 0.1920 (-0.62z)| lr 5.14e-03 | 1995.70 ms | 68.8% bf16 MFU | 261767 tok/s +step 5666/18794 | loss 3.099282 (-2.44z)| norm 0.1680 (-1.07z)| lr 5.13e-03 | 1989.14 ms | 69.0% bf16 MFU | 261857 tok/s +step 5667/18794 | loss 3.200578 (+0.41z)| norm 0.1815 (-0.80z)| lr 5.13e-03 | 2004.35 ms | 68.5% bf16 MFU | 261843 tok/s +step 5668/18794 | loss 3.185367 (-0.02z)| norm 0.2323 (+0.20z)| lr 5.13e-03 | 2002.42 ms | 68.5% bf16 MFU | 261842 tok/s +step 5669/18794 | loss 3.189434 (+0.10z)| norm 0.2337 (+0.24z)| lr 5.13e-03 | 2003.67 ms | 68.5% bf16 MFU | 261833 tok/s +step 5670/18794 | loss 3.185252 (-0.03z)| norm 0.2443 (+0.46z)| lr 5.13e-03 | 1995.19 ms | 68.8% bf16 MFU | 261881 tok/s +step 5671/18794 | loss 3.241067 (+1.52z)| norm 0.2076 (-0.27z)| lr 5.13e-03 | 1994.98 ms | 68.8% bf16 MFU | 261927 tok/s +step 5672/18794 | loss 3.168342 (-0.52z)| norm 0.1771 (-0.88z)| lr 5.13e-03 | 1989.49 ms | 69.0% bf16 MFU | 262007 tok/s +step 5673/18794 | loss 3.158290 (-0.82z)| norm 0.1629 (-1.16z)| lr 5.13e-03 | 2004.03 ms | 68.5% bf16 MFU | 261987 tok/s +step 5674/18794 | loss 3.111935 (-2.07z)| norm 0.2274 (+0.11z)| lr 5.13e-03 | 2011.77 ms | 68.2% bf16 MFU | 261918 tok/s +step 5675/18794 | loss 3.219645 (+0.93z)| norm 0.3115 (+1.76z)| lr 5.13e-03 | 1988.40 ms | 69.0% bf16 MFU | 262006 tok/s +step 5676/18794 | loss 3.174420 (-0.33z)| norm 0.2244 (+0.00z)| lr 5.13e-03 | 1996.11 ms | 68.7% bf16 MFU | 262039 tok/s +step 5677/18794 | loss 3.216560 (+0.83z)| norm 0.1781 (-0.93z)| lr 5.13e-03 | 1987.71 ms | 69.0% bf16 MFU | 262125 tok/s +step 5678/18794 | loss 3.183448 (-0.09z)| norm 0.2135 (-0.22z)| lr 5.13e-03 | 1994.28 ms | 68.8% bf16 MFU | 262163 tok/s +step 5679/18794 | loss 3.185845 (-0.03z)| norm 0.3014 (+1.52z)| lr 5.13e-03 | 1994.97 ms | 68.8% bf16 MFU | 262196 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.004017 +step 5680/18794 | loss 3.214911 (+0.76z)| norm 0.3288 (+2.00z)| lr 5.13e-03 | 1989.11 ms | 69.0% bf16 MFU | 262265 tok/s +step 5681/18794 | loss 3.196586 (+0.25z)| norm 0.2600 (+0.64z)| lr 5.13e-03 | 1984.81 ms | 69.1% bf16 MFU | 262359 tok/s +step 5682/18794 | loss 3.165430 (-0.62z)| norm 0.2077 (-0.39z)| lr 5.13e-03 | 1993.27 ms | 68.8% bf16 MFU | 262392 tok/s +step 5683/18794 | loss 3.204051 (+0.45z)| norm 0.2507 (+0.45z)| lr 5.13e-03 | 1980.03 ms | 69.3% bf16 MFU | 262512 tok/s +step 5684/18794 | loss 3.188779 (+0.04z)| norm 0.1800 (-0.92z)| lr 5.13e-03 | 1987.30 ms | 69.1% bf16 MFU | 262578 tok/s +step 5685/18794 | loss 3.186591 (-0.04z)| norm 0.1918 (-0.69z)| lr 5.13e-03 | 2010.77 ms | 68.2% bf16 MFU | 262486 tok/s +step 5686/18794 | loss 3.192665 (+0.12z)| norm 0.1842 (-0.84z)| lr 5.13e-03 | 1987.58 ms | 69.0% bf16 MFU | 262550 tok/s +step 5687/18794 | loss 3.195494 (+0.20z)| norm 0.2011 (-0.51z)| lr 5.13e-03 | 1986.43 ms | 69.1% bf16 MFU | 262620 tok/s +step 5688/18794 | loss 3.207905 (+0.56z)| norm 0.1799 (-0.92z)| lr 5.13e-03 | 1984.30 ms | 69.2% bf16 MFU | 262700 tok/s +step 5689/18794 | loss 3.186430 (-0.07z)| norm 0.2476 (+0.42z)| lr 5.13e-03 | 1985.40 ms | 69.1% bf16 MFU | 262768 tok/s +step 5690/18794 | loss 3.149848 (-1.12z)| norm 0.2849 (+1.20z)| lr 5.13e-03 | 2003.22 ms | 68.5% bf16 MFU | 262716 tok/s +step 5691/18794 | loss 3.150702 (-1.07z)| norm 0.1942 (-0.62z)| lr 5.13e-03 | 2002.99 ms | 68.5% bf16 MFU | 262668 tok/s +step 5692/18794 | loss 3.153342 (-0.98z)| norm 0.2031 (-0.44z)| lr 5.13e-03 | 1984.56 ms | 69.1% bf16 MFU | 262744 tok/s +step 5693/18794 | loss 3.160873 (-0.75z)| norm 0.2210 (-0.07z)| lr 5.13e-03 | 1987.85 ms | 69.0% bf16 MFU | 262794 tok/s +step 5694/18794 | loss 3.197809 (+0.30z)| norm 0.3090 (+1.74z)| lr 5.13e-03 | 1980.09 ms | 69.3% bf16 MFU | 262893 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.109036 +step 5695/18794 | loss 3.196395 (+0.26z)| norm 0.3301 (+2.11z)| lr 5.12e-03 | 1996.49 ms | 68.7% bf16 MFU | 262879 tok/s +step 5696/18794 | loss 3.186912 (-0.02z)| norm 0.2485 (+0.46z)| lr 5.12e-03 | 1988.40 ms | 69.0% bf16 MFU | 262918 tok/s +step 5697/18794 | loss 3.173495 (-0.41z)| norm 0.2530 (+0.54z)| lr 5.12e-03 | 2001.66 ms | 68.6% bf16 MFU | 262869 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.682597 +step 5698/18794 | loss 3.155280 (-0.92z)| norm 0.3660 (+2.68z)| lr 5.12e-03 | 1981.84 ms | 69.2% bf16 MFU | 262953 tok/s +step 5699/18794 | loss 3.230406 (+1.26z)| norm 0.2493 (+0.41z)| lr 5.12e-03 | 1979.62 ms | 69.3% bf16 MFU | 263047 tok/s +step 5700/18794 | loss 3.184275 (-0.07z)| norm 0.2122 (-0.31z)| lr 5.12e-03 | 1987.78 ms | 69.0% bf16 MFU | 263083 tok/s +step 5701/18794 | loss 3.179517 (-0.19z)| norm 0.2623 (+0.65z)| lr 5.12e-03 | 1997.60 ms | 68.7% bf16 MFU | 263051 tok/s +step 5702/18794 | loss 3.157689 (-0.84z)| norm 0.2770 (+0.92z)| lr 5.12e-03 | 1991.94 ms | 68.9% bf16 MFU | 263059 tok/s +step 5703/18794 | loss 3.152493 (-1.00z)| norm 0.2022 (-0.53z)| lr 5.12e-03 | 1979.96 ms | 69.3% bf16 MFU | 263146 tok/s +step 5704/18794 | loss 3.144322 (-1.22z)| norm 0.3350 (+1.98z)| lr 5.12e-03 | 1987.52 ms | 69.0% bf16 MFU | 263178 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.508272 +step 5705/18794 | loss 3.177034 (-0.23z)| norm 0.3678 (+2.51z)| lr 5.12e-03 | 1988.65 ms | 69.0% bf16 MFU | 263201 tok/s +step 5706/18794 | loss 3.216262 (+0.93z)| norm 0.1882 (-0.79z)| lr 5.12e-03 | 1985.31 ms | 69.1% bf16 MFU | 263245 tok/s +step 5707/18794 | loss 3.168450 (-0.50z)| norm 0.3259 (+1.69z)| lr 5.12e-03 | 1983.52 ms | 69.2% bf16 MFU | 263299 tok/s +step 5708/18794 | loss 3.222190 (+1.09z)| norm 0.3354 (+1.81z)| lr 5.12e-03 | 1982.21 ms | 69.2% bf16 MFU | 263359 tok/s +step 5709/18794 | loss 3.150511 (-1.03z)| norm 0.1854 (-0.85z)| lr 5.12e-03 | 1983.62 ms | 69.2% bf16 MFU | 263407 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.722176 +step 5710/18794 | loss 3.212963 (+0.83z)| norm 0.3937 (+2.72z)| lr 5.12e-03 | 1991.81 ms | 68.9% bf16 MFU | 263397 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.729625 +step 5711/18794 | loss 3.208467 (+0.72z)| norm 0.4020 (+2.73z)| lr 5.12e-03 | 1988.59 ms | 69.0% bf16 MFU | 263410 tok/s +step 5712/18794 | loss 3.143356 (-1.25z)| norm 0.1738 (-1.02z)| lr 5.12e-03 | 1979.98 ms | 69.3% bf16 MFU | 263479 tok/s +step 5713/18794 | loss 3.211631 (+0.85z)| norm 0.2777 (+0.67z)| lr 5.12e-03 | 1980.83 ms | 69.3% bf16 MFU | 263539 tok/s +step 5714/18794 | loss 3.178821 (-0.15z)| norm 0.2381 (+0.01z)| lr 5.12e-03 | 1980.13 ms | 69.3% bf16 MFU | 263601 tok/s +step 5715/18794 | loss 3.192186 (+0.29z)| norm 0.1881 (-0.80z)| lr 5.12e-03 | 1982.00 ms | 69.2% bf16 MFU | 263647 tok/s +step 5716/18794 | loss 3.153112 (-0.93z)| norm 0.2265 (-0.17z)| lr 5.12e-03 | 1983.22 ms | 69.2% bf16 MFU | 263683 tok/s +step 5717/18794 | loss 3.153841 (-0.91z)| norm 0.1724 (-1.06z)| lr 5.12e-03 | 1982.41 ms | 69.2% bf16 MFU | 263722 tok/s +step 5718/18794 | loss 3.152813 (-0.93z)| norm 0.2710 (+0.55z)| lr 5.12e-03 | 1988.38 ms | 69.0% bf16 MFU | 263720 tok/s +step 5719/18794 | loss 3.197475 (+0.48z)| norm 0.2436 (+0.08z)| lr 5.12e-03 | 1985.78 ms | 69.1% bf16 MFU | 263735 tok/s +step 5720/18794 | loss 3.179893 (-0.07z)| norm 0.2281 (-0.18z)| lr 5.12e-03 | 1986.20 ms | 69.1% bf16 MFU | 263746 tok/s +step 5721/18794 | loss 3.150780 (-0.99z)| norm 0.3086 (+1.13z)| lr 5.12e-03 | 1980.57 ms | 69.3% bf16 MFU | 263795 tok/s +step 5722/18794 | loss 3.163913 (-0.56z)| norm 0.2978 (+0.94z)| lr 5.12e-03 | 1981.29 ms | 69.3% bf16 MFU | 263836 tok/s +step 5723/18794 | loss 3.224394 (+1.38z)| norm 0.2236 (-0.30z)| lr 5.11e-03 | 1979.78 ms | 69.3% bf16 MFU | 263885 tok/s +step 5724/18794 | loss 3.209188 (+0.88z)| norm 0.2784 (+0.61z)| lr 5.11e-03 | 1980.61 ms | 69.3% bf16 MFU | 263927 tok/s +step 5725/18794 | loss 3.205430 (+0.76z)| norm 0.1919 (-0.83z)| lr 5.11e-03 | 1981.31 ms | 69.3% bf16 MFU | 263961 tok/s +step 5726/18794 | loss 3.170505 (-0.35z)| norm 0.2493 (+0.12z)| lr 5.11e-03 | 1980.81 ms | 69.3% bf16 MFU | 263997 tok/s +step 5727/18794 | loss 3.157318 (-0.76z)| norm 0.1981 (-0.73z)| lr 5.11e-03 | 1980.54 ms | 69.3% bf16 MFU | 264033 tok/s +step 5728/18794 | loss 3.217061 (+1.14z)| norm 0.2141 (-0.45z)| lr 5.11e-03 | 1983.15 ms | 69.2% bf16 MFU | 264050 tok/s +step 5729/18794 | loss 3.137998 (-1.38z)| norm 0.2139 (-0.44z)| lr 5.11e-03 | 1982.89 ms | 69.2% bf16 MFU | 264068 tok/s +step 5730/18794 | loss 3.171717 (-0.32z)| norm 0.2304 (-0.15z)| lr 5.11e-03 | 1986.62 ms | 69.1% bf16 MFU | 264060 tok/s +step 5731/18794 | loss 3.196000 (+0.45z)| norm 0.2873 (+0.87z)| lr 5.11e-03 | 1985.88 ms | 69.1% bf16 MFU | 264057 tok/s +step 5732/18794 | loss 3.230215 (+1.51z)| norm 0.1818 (-1.03z)| lr 5.11e-03 | 1988.31 ms | 69.0% bf16 MFU | 264039 tok/s +step 5733/18794 | loss 3.235484 (+1.64z)| norm 0.2380 (-0.02z)| lr 5.11e-03 | 1981.05 ms | 69.3% bf16 MFU | 264069 tok/s +step 5734/18794 | loss 3.211240 (+0.86z)| norm 0.3385 (+1.75z)| lr 5.11e-03 | 1980.97 ms | 69.3% bf16 MFU | 264099 tok/s +step 5735/18794 | loss 3.172155 (-0.37z)| norm 0.2947 (+0.94z)| lr 5.11e-03 | 1979.72 ms | 69.3% bf16 MFU | 264136 tok/s +step 5736/18794 | loss 3.251488 (+2.10z)| norm 0.2307 (-0.21z)| lr 5.11e-03 | 1980.68 ms | 69.3% bf16 MFU | 264164 tok/s +step 5737/18794 | loss 3.149699 (-1.07z)| norm 0.2618 (+0.34z)| lr 5.11e-03 | 1979.45 ms | 69.3% bf16 MFU | 264199 tok/s +step 5738/18794 | loss 3.183031 (-0.05z)| norm 0.1813 (-1.11z)| lr 5.11e-03 | 1981.27 ms | 69.3% bf16 MFU | 264220 tok/s +step 5739/18794 | loss 3.180801 (-0.13z)| norm 0.1829 (-1.07z)| lr 5.11e-03 | 1979.43 ms | 69.3% bf16 MFU | 264253 tok/s +step 5740/18794 | loss 3.194257 (+0.29z)| norm 0.2585 (+0.28z)| lr 5.11e-03 | 1983.20 ms | 69.2% bf16 MFU | 264258 tok/s +step 5741/18794 | loss 3.209981 (+0.77z)| norm 0.2570 (+0.25z)| lr 5.11e-03 | 1983.35 ms | 69.2% bf16 MFU | 264262 tok/s +step 5742/18794 | loss 3.192054 (+0.20z)| norm 0.2460 (+0.05z)| lr 5.11e-03 | 1983.53 ms | 69.2% bf16 MFU | 264265 tok/s +step 5743/18794 | loss 3.201236 (+0.48z)| norm 0.2860 (+0.75z)| lr 5.11e-03 | 1986.86 ms | 69.1% bf16 MFU | 264246 tok/s +step 5744/18794 | loss 3.187654 (+0.04z)| norm 0.2161 (-0.49z)| lr 5.11e-03 | 1987.34 ms | 69.1% bf16 MFU | 264224 tok/s +step 5745/18794 | loss 3.247082 (+1.92z)| norm 0.2345 (-0.15z)| lr 5.11e-03 | 1982.96 ms | 69.2% bf16 MFU | 264233 tok/s +step 5746/18794 | loss 3.187386 (+0.02z)| norm 0.2787 (+0.64z)| lr 5.11e-03 | 1979.57 ms | 69.3% bf16 MFU | 264264 tok/s +step 5747/18794 | loss 3.196119 (+0.29z)| norm 0.3063 (+1.11z)| lr 5.11e-03 | 1983.67 ms | 69.2% bf16 MFU | 264266 tok/s +step 5748/18794 | loss 3.218288 (+0.98z)| norm 0.2093 (-0.63z)| lr 5.11e-03 | 1980.90 ms | 69.3% bf16 MFU | 264286 tok/s +step 5749/18794 | loss 3.234751 (+1.51z)| norm 0.1610 (-1.47z)| lr 5.11e-03 | 1980.90 ms | 69.3% bf16 MFU | 264305 tok/s +step 5750/18794 | loss 3.261885 (+2.30z)| norm 0.1743 (-1.23z)| lr 5.11e-03 | 1980.37 ms | 69.3% bf16 MFU | 264327 tok/s +val loss 3.212880 +HellaSwag: 2847/10042 = 0.283509: 0/1256 +step 5751/18794 | loss 3.201505 (+0.40z)| norm 0.2158 (-0.50z)| lr 5.10e-03 | 1995.47 ms | 68.8% bf16 MFU | 264248 tok/s +step 5752/18794 | loss 3.175799 (-0.40z)| norm 0.2305 (-0.25z)| lr 5.10e-03 | 1982.83 ms | 69.2% bf16 MFU | 264256 tok/s +step 5753/18794 | loss 3.221049 (+1.00z)| norm 0.2863 (+0.74z)| lr 5.10e-03 | 1979.29 ms | 69.3% bf16 MFU | 264288 tok/s +step 5754/18794 | loss 3.198741 (+0.33z)| norm 0.3074 (+1.10z)| lr 5.10e-03 | 1982.36 ms | 69.2% bf16 MFU | 264297 tok/s +step 5755/18794 | loss 3.232051 (+1.43z)| norm 0.2223 (-0.39z)| lr 5.10e-03 | 1978.71 ms | 69.4% bf16 MFU | 264330 tok/s +step 5756/18794 | loss 3.241012 (+1.68z)| norm 0.2124 (-0.56z)| lr 5.10e-03 | 1981.31 ms | 69.3% bf16 MFU | 264345 tok/s +step 5757/18794 | loss 3.186439 (-0.09z)| norm 0.2991 (+1.01z)| lr 5.10e-03 | 1978.69 ms | 69.4% bf16 MFU | 264376 tok/s +step 5758/18794 | loss 3.172674 (-0.53z)| norm 0.2670 (+0.41z)| lr 5.10e-03 | 1984.14 ms | 69.2% bf16 MFU | 264369 tok/s +step 5759/18794 | loss 3.133408 (-1.77z)| norm 0.2202 (-0.43z)| lr 5.10e-03 | 1983.41 ms | 69.2% bf16 MFU | 264367 tok/s +step 5760/18794 | loss 3.210718 (+0.72z)| norm 0.2017 (-0.76z)| lr 5.10e-03 | 1985.14 ms | 69.1% bf16 MFU | 264354 tok/s +step 5761/18794 | loss 3.210451 (+0.74z)| norm 0.3074 (+1.15z)| lr 5.10e-03 | 1988.98 ms | 69.0% bf16 MFU | 264316 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.337031 +step 5762/18794 | loss 3.184962 (-0.09z)| norm 0.3761 (+2.34z)| lr 5.10e-03 | 1981.03 ms | 69.3% bf16 MFU | 264333 tok/s +step 5763/18794 | loss 3.229509 (+1.36z)| norm 0.3090 (+1.14z)| lr 5.10e-03 | 1988.41 ms | 69.0% bf16 MFU | 264300 tok/s +step 5764/18794 | loss 3.184926 (-0.11z)| norm 0.2254 (-0.36z)| lr 5.10e-03 | 1979.49 ms | 69.3% bf16 MFU | 264328 tok/s +step 5765/18794 | loss 3.182269 (-0.21z)| norm 0.3248 (+1.39z)| lr 5.10e-03 | 1979.07 ms | 69.3% bf16 MFU | 264358 tok/s +step 5766/18794 | loss 3.172029 (-0.61z)| norm 0.2482 (+0.01z)| lr 5.10e-03 | 1984.06 ms | 69.2% bf16 MFU | 264352 tok/s +step 5767/18794 | loss 3.161775 (-0.95z)| norm 0.1995 (-0.87z)| lr 5.10e-03 | 1979.60 ms | 69.3% bf16 MFU | 264377 tok/s +step 5768/18794 | loss 3.163947 (-0.86z)| norm 0.1955 (-0.94z)| lr 5.10e-03 | 1983.41 ms | 69.2% bf16 MFU | 264375 tok/s +step 5769/18794 | loss 3.149122 (-1.35z)| norm 0.1724 (-1.33z)| lr 5.10e-03 | 1980.99 ms | 69.3% bf16 MFU | 264389 tok/s +step 5770/18794 | loss 3.177059 (-0.39z)| norm 0.3320 (+1.48z)| lr 5.10e-03 | 1980.97 ms | 69.3% bf16 MFU | 264403 tok/s +step 5771/18794 | loss 3.213888 (+0.90z)| norm 0.3303 (+1.42z)| lr 5.10e-03 | 1980.47 ms | 69.3% bf16 MFU | 264419 tok/s +step 5772/18794 | loss 3.153548 (-1.19z)| norm 0.3051 (+0.96z)| lr 5.10e-03 | 1983.59 ms | 69.2% bf16 MFU | 264414 tok/s +step 5773/18794 | loss 3.178205 (-0.34z)| norm 0.1998 (-0.90z)| lr 5.10e-03 | 1984.01 ms | 69.2% bf16 MFU | 264406 tok/s +step 5774/18794 | loss 3.146646 (-1.49z)| norm 0.2434 (-0.13z)| lr 5.10e-03 | 1981.29 ms | 69.3% bf16 MFU | 264416 tok/s +step 5775/18794 | loss 3.194713 (+0.23z)| norm 0.2054 (-0.79z)| lr 5.10e-03 | 1984.34 ms | 69.2% bf16 MFU | 264406 tok/s +step 5776/18794 | loss 3.160917 (-0.97z)| norm 0.2344 (-0.28z)| lr 5.10e-03 | 1985.37 ms | 69.1% bf16 MFU | 264390 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.007216 +step 5777/18794 | loss 3.233924 (+1.62z)| norm 0.3661 (+2.01z)| lr 5.10e-03 | 1980.57 ms | 69.3% bf16 MFU | 264406 tok/s +step 5778/18794 | loss 3.190123 (+0.07z)| norm 0.3385 (+1.49z)| lr 5.10e-03 | 1985.56 ms | 69.1% bf16 MFU | 264388 tok/s +step 5779/18794 | loss 3.300919 (+3.67z)| norm 0.2463 (-0.11z)| lr 5.09e-03 | 1980.92 ms | 69.3% bf16 MFU | 264402 tok/s +step 5780/18794 | loss 3.149633 (-1.28z)| norm 0.2156 (-0.63z)| lr 5.09e-03 | 1980.83 ms | 69.3% bf16 MFU | 264416 tok/s +step 5781/18794 | loss 3.196098 (+0.24z)| norm 0.1919 (-1.03z)| lr 5.09e-03 | 1982.41 ms | 69.2% bf16 MFU | 264419 tok/s +step 5782/18794 | loss 3.205283 (+0.53z)| norm 0.2061 (-0.78z)| lr 5.09e-03 | 1981.45 ms | 69.3% bf16 MFU | 264428 tok/s +step 5783/18794 | loss 3.216895 (+0.90z)| norm 0.2157 (-0.60z)| lr 5.09e-03 | 1979.66 ms | 69.3% bf16 MFU | 264448 tok/s +step 5784/18794 | loss 3.283200 (+2.91z)| norm 0.2585 (+0.13z)| lr 5.09e-03 | 1983.89 ms | 69.2% bf16 MFU | 264440 tok/s +step 5785/18794 | loss 3.211937 (+0.67z)| norm 0.2566 (+0.09z)| lr 5.09e-03 | 1982.39 ms | 69.2% bf16 MFU | 264441 tok/s +step 5786/18794 | loss 3.167559 (-0.71z)| norm 0.2281 (-0.43z)| lr 5.09e-03 | 1982.87 ms | 69.2% bf16 MFU | 264440 tok/s +step 5787/18794 | loss 3.192155 (+0.06z)| norm 0.2211 (-0.56z)| lr 5.09e-03 | 1982.26 ms | 69.2% bf16 MFU | 264442 tok/s +step 5788/18794 | loss 3.235963 (+1.40z)| norm 0.2070 (-0.82z)| lr 5.09e-03 | 1985.12 ms | 69.1% bf16 MFU | 264425 tok/s +step 5789/18794 | loss 3.193073 (+0.08z)| norm 0.1851 (-1.19z)| lr 5.09e-03 | 1980.39 ms | 69.3% bf16 MFU | 264441 tok/s +step 5790/18794 | loss 3.235940 (+1.37z)| norm 0.2358 (-0.28z)| lr 5.09e-03 | 1983.59 ms | 69.2% bf16 MFU | 264435 tok/s +step 5791/18794 | loss 3.203871 (+0.37z)| norm 0.3055 (+0.94z)| lr 5.09e-03 | 1983.44 ms | 69.2% bf16 MFU | 264430 tok/s +step 5792/18794 | loss 3.193466 (+0.03z)| norm 0.2278 (-0.45z)| lr 5.09e-03 | 1979.26 ms | 69.3% bf16 MFU | 264453 tok/s +step 5793/18794 | loss 3.188285 (-0.14z)| norm 0.1959 (-1.01z)| lr 5.09e-03 | 1986.21 ms | 69.1% bf16 MFU | 264428 tok/s +step 5794/18794 | loss 3.119186 (-2.23z)| norm 0.1886 (-1.12z)| lr 5.09e-03 | 1983.34 ms | 69.2% bf16 MFU | 264424 tok/s +step 5795/18794 | loss 3.198307 (+0.20z)| norm 0.1803 (-1.24z)| lr 5.09e-03 | 1979.13 ms | 69.3% bf16 MFU | 264448 tok/s +step 5796/18794 | loss 3.254686 (+1.87z)| norm 0.3236 (+1.29z)| lr 5.09e-03 | 1980.24 ms | 69.3% bf16 MFU | 264464 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.275581 +step 5797/18794 | loss 3.186229 (-0.19z)| norm 0.3840 (+2.28z)| lr 5.09e-03 | 1980.94 ms | 69.3% bf16 MFU | 264474 tok/s +step 5798/18794 | loss 3.212510 (+0.58z)| norm 0.2316 (-0.34z)| lr 5.09e-03 | 1979.51 ms | 69.3% bf16 MFU | 264493 tok/s +step 5799/18794 | loss 3.160207 (-0.98z)| norm 0.2381 (-0.22z)| lr 5.09e-03 | 1983.51 ms | 69.2% bf16 MFU | 264485 tok/s +step 5800/18794 | loss 3.194871 (+0.07z)| norm 0.3524 (+1.74z)| lr 5.09e-03 | 1982.99 ms | 69.2% bf16 MFU | 264480 tok/s +step 5801/18794 | loss 3.206182 (+0.40z)| norm 0.2542 (+0.04z)| lr 5.09e-03 | 1981.37 ms | 69.3% bf16 MFU | 264486 tok/s +step 5802/18794 | loss 3.217805 (+0.74z)| norm 0.1855 (-1.13z)| lr 5.09e-03 | 1981.78 ms | 69.2% bf16 MFU | 264490 tok/s +step 5803/18794 | loss 3.183825 (-0.31z)| norm 0.1777 (-1.25z)| lr 5.09e-03 | 1981.81 ms | 69.2% bf16 MFU | 264493 tok/s +step 5804/18794 | loss 3.230361 (+1.10z)| norm 0.1773 (-1.24z)| lr 5.09e-03 | 1983.23 ms | 69.2% bf16 MFU | 264486 tok/s +step 5805/18794 | loss 3.269917 (+2.24z)| norm 0.2321 (-0.28z)| lr 5.09e-03 | 1985.72 ms | 69.1% bf16 MFU | 264463 tok/s +step 5806/18794 | loss 3.176349 (-0.57z)| norm 0.2497 (+0.02z)| lr 5.09e-03 | 1985.89 ms | 69.1% bf16 MFU | 264440 tok/s +step 5807/18794 | loss 3.223321 (+0.83z)| norm 0.2041 (-0.77z)| lr 5.08e-03 | 1979.81 ms | 69.3% bf16 MFU | 264459 tok/s +step 5808/18794 | loss 3.200945 (+0.16z)| norm 0.3441 (+1.72z)| lr 5.08e-03 | 1982.94 ms | 69.2% bf16 MFU | 264456 tok/s +step 5809/18794 | loss 3.127621 (-2.02z)| norm 0.2847 (+0.65z)| lr 5.08e-03 | 1980.98 ms | 69.3% bf16 MFU | 264467 tok/s +step 5810/18794 | loss 3.187076 (-0.24z)| norm 0.2359 (-0.20z)| lr 5.08e-03 | 1980.17 ms | 69.3% bf16 MFU | 264482 tok/s +step 5811/18794 | loss 3.174689 (-0.60z)| norm 0.3247 (+1.50z)| lr 5.08e-03 | 1979.79 ms | 69.3% bf16 MFU | 264499 tok/s +step 5812/18794 | loss 3.202251 (+0.21z)| norm 0.2475 (+0.02z)| lr 5.08e-03 | 1979.47 ms | 69.3% bf16 MFU | 264517 tok/s +step 5813/18794 | loss 3.187295 (-0.23z)| norm 0.2268 (-0.37z)| lr 5.08e-03 | 1980.07 ms | 69.3% bf16 MFU | 264530 tok/s +step 5814/18794 | loss 3.131127 (-1.89z)| norm 0.3105 (+1.22z)| lr 5.08e-03 | 1979.74 ms | 69.3% bf16 MFU | 264545 tok/s +step 5815/18794 | loss 3.224294 (+0.87z)| norm 0.3072 (+1.13z)| lr 5.08e-03 | 1979.89 ms | 69.3% bf16 MFU | 264558 tok/s +step 5816/18794 | loss 3.236489 (+1.21z)| norm 0.3297 (+1.52z)| lr 5.08e-03 | 1982.66 ms | 69.2% bf16 MFU | 264552 tok/s +step 5817/18794 | loss 3.183283 (-0.38z)| norm 0.2448 (-0.10z)| lr 5.08e-03 | 1980.57 ms | 69.3% bf16 MFU | 264560 tok/s +step 5818/18794 | loss 3.262404 (+1.93z)| norm 0.2977 (+0.91z)| lr 5.08e-03 | 1982.15 ms | 69.2% bf16 MFU | 264557 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.228980 +step 5819/18794 | loss 3.210744 (+0.40z)| norm 0.3714 (+2.23z)| lr 5.08e-03 | 1983.17 ms | 69.2% bf16 MFU | 264548 tok/s +step 5820/18794 | loss 3.173560 (-0.70z)| norm 0.1795 (-1.32z)| lr 5.08e-03 | 1980.21 ms | 69.3% bf16 MFU | 264559 tok/s +step 5821/18794 | loss 3.239898 (+1.23z)| norm 0.2935 (+0.79z)| lr 5.08e-03 | 1983.01 ms | 69.2% bf16 MFU | 264550 tok/s +step 5822/18794 | loss 3.172520 (-0.76z)| norm 0.2580 (+0.14z)| lr 5.08e-03 | 1983.06 ms | 69.2% bf16 MFU | 264542 tok/s +step 5823/18794 | loss 3.138882 (-1.71z)| norm 0.1997 (-0.93z)| lr 5.08e-03 | 1980.28 ms | 69.3% bf16 MFU | 264553 tok/s +step 5824/18794 | loss 3.232833 (+1.03z)| norm 0.1803 (-1.26z)| lr 5.08e-03 | 1987.36 ms | 69.1% bf16 MFU | 264515 tok/s +step 5825/18794 | loss 3.216418 (+0.54z)| norm 0.1983 (-0.93z)| lr 5.08e-03 | 1982.21 ms | 69.2% bf16 MFU | 264514 tok/s +step 5826/18794 | loss 3.163750 (-0.98z)| norm 0.1805 (-1.24z)| lr 5.08e-03 | 1983.32 ms | 69.2% bf16 MFU | 264506 tok/s +step 5827/18794 | loss 3.189873 (-0.24z)| norm 0.2012 (-0.86z)| lr 5.08e-03 | 1981.37 ms | 69.3% bf16 MFU | 264511 tok/s +step 5828/18794 | loss 3.212523 (+0.43z)| norm 0.2224 (-0.48z)| lr 5.08e-03 | 1980.48 ms | 69.3% bf16 MFU | 264522 tok/s +step 5829/18794 | loss 3.207347 (+0.26z)| norm 0.2233 (-0.46z)| lr 5.08e-03 | 1980.39 ms | 69.3% bf16 MFU | 264533 tok/s +step 5830/18794 | loss 3.141087 (-1.68z)| norm 0.2369 (-0.22z)| lr 5.08e-03 | 1983.85 ms | 69.2% bf16 MFU | 264520 tok/s +step 5831/18794 | loss 3.227150 (+0.83z)| norm 0.1726 (-1.36z)| lr 5.08e-03 | 1979.35 ms | 69.3% bf16 MFU | 264538 tok/s +step 5832/18794 | loss 3.278660 (+2.27z)| norm 0.1992 (-0.88z)| lr 5.08e-03 | 1981.41 ms | 69.3% bf16 MFU | 264541 tok/s +step 5833/18794 | loss 3.210513 (+0.34z)| norm 0.2325 (-0.28z)| lr 5.08e-03 | 1982.53 ms | 69.2% bf16 MFU | 264537 tok/s +step 5834/18794 | loss 3.150438 (-1.36z)| norm 0.1975 (-0.90z)| lr 5.08e-03 | 1979.95 ms | 69.3% bf16 MFU | 264550 tok/s +step 5835/18794 | loss 3.205592 (+0.20z)| norm 0.2046 (-0.75z)| lr 5.07e-03 | 1983.84 ms | 69.2% bf16 MFU | 264537 tok/s +step 5836/18794 | loss 3.213841 (+0.45z)| norm 0.2872 (+0.76z)| lr 5.07e-03 | 1982.23 ms | 69.2% bf16 MFU | 264534 tok/s +step 5837/18794 | loss 3.131708 (-1.90z)| norm 0.3037 (+1.05z)| lr 5.07e-03 | 1980.06 ms | 69.3% bf16 MFU | 264547 tok/s +step 5838/18794 | loss 3.205387 (+0.21z)| norm 0.2001 (-0.85z)| lr 5.07e-03 | 1980.97 ms | 69.3% bf16 MFU | 264553 tok/s +step 5839/18794 | loss 3.166930 (-0.89z)| norm 0.2412 (-0.11z)| lr 5.07e-03 | 1988.99 ms | 69.0% bf16 MFU | 264505 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.160572 +step 5840/18794 | loss 3.161088 (-1.04z)| norm 0.4294 (+3.16z)| lr 5.07e-03 | 1986.48 ms | 69.1% bf16 MFU | 264476 tok/s +step 5841/18794 | loss 3.206458 (+0.25z)| norm 0.2758 (+0.47z)| lr 5.07e-03 | 1979.13 ms | 69.3% bf16 MFU | 264498 tok/s +step 5842/18794 | loss 3.143772 (-1.50z)| norm 0.2327 (-0.28z)| lr 5.07e-03 | 1980.29 ms | 69.3% bf16 MFU | 264510 tok/s +step 5843/18794 | loss 3.152121 (-1.24z)| norm 0.2547 (+0.11z)| lr 5.07e-03 | 1979.90 ms | 69.3% bf16 MFU | 264525 tok/s +step 5844/18794 | loss 3.239044 (+1.16z)| norm 0.2643 (+0.27z)| lr 5.07e-03 | 1979.64 ms | 69.3% bf16 MFU | 264541 tok/s +step 5845/18794 | loss 3.260995 (+1.75z)| norm 0.2796 (+0.53z)| lr 5.07e-03 | 1979.23 ms | 69.3% bf16 MFU | 264558 tok/s +step 5846/18794 | loss 3.206307 (+0.24z)| norm 0.2015 (-0.83z)| lr 5.07e-03 | 1980.51 ms | 69.3% bf16 MFU | 264567 tok/s +step 5847/18794 | loss 3.213741 (+0.44z)| norm 0.1917 (-0.98z)| lr 5.07e-03 | 1979.63 ms | 69.3% bf16 MFU | 264580 tok/s +step 5848/18794 | loss 3.214512 (+0.46z)| norm 0.1797 (-1.18z)| lr 5.07e-03 | 1979.78 ms | 69.3% bf16 MFU | 264592 tok/s +step 5849/18794 | loss 3.207032 (+0.27z)| norm 0.1918 (-0.98z)| lr 5.07e-03 | 1985.79 ms | 69.1% bf16 MFU | 264564 tok/s +step 5850/18794 | loss 3.252175 (+1.52z)| norm 0.1915 (-0.99z)| lr 5.07e-03 | 1984.68 ms | 69.1% bf16 MFU | 264544 tok/s +step 5851/18794 | loss 3.224834 (+0.75z)| norm 0.2528 (+0.08z)| lr 5.07e-03 | 1981.70 ms | 69.2% bf16 MFU | 264545 tok/s +step 5852/18794 | loss 3.138851 (-1.60z)| norm 0.2684 (+0.35z)| lr 5.07e-03 | 1982.57 ms | 69.2% bf16 MFU | 264540 tok/s +step 5853/18794 | loss 3.222998 (+0.71z)| norm 0.2212 (-0.47z)| lr 5.07e-03 | 1986.51 ms | 69.1% bf16 MFU | 264509 tok/s +step 5854/18794 | loss 3.217161 (+0.54z)| norm 0.2627 (+0.27z)| lr 5.07e-03 | 1980.01 ms | 69.3% bf16 MFU | 264524 tok/s +step 5855/18794 | loss 3.168849 (-0.76z)| norm 0.2575 (+0.17z)| lr 5.07e-03 | 1986.48 ms | 69.1% bf16 MFU | 264494 tok/s +step 5856/18794 | loss 3.227566 (+0.85z)| norm 0.2974 (+0.86z)| lr 5.07e-03 | 1983.84 ms | 69.2% bf16 MFU | 264483 tok/s +step 5857/18794 | loss 3.162809 (-0.92z)| norm 0.2388 (-0.17z)| lr 5.07e-03 | 1980.53 ms | 69.3% bf16 MFU | 264495 tok/s +step 5858/18794 | loss 3.183547 (-0.36z)| norm 0.1798 (-1.19z)| lr 5.07e-03 | 1980.88 ms | 69.3% bf16 MFU | 264504 tok/s +step 5859/18794 | loss 3.246418 (+1.34z)| norm 0.1704 (-1.34z)| lr 5.07e-03 | 1979.67 ms | 69.3% bf16 MFU | 264520 tok/s +step 5860/18794 | loss 3.180281 (-0.47z)| norm 0.2842 (+0.64z)| lr 5.07e-03 | 1979.54 ms | 69.3% bf16 MFU | 264537 tok/s +step 5861/18794 | loss 3.151186 (-1.25z)| norm 0.2943 (+0.82z)| lr 5.07e-03 | 1983.00 ms | 69.2% bf16 MFU | 264530 tok/s +step 5862/18794 | loss 3.184888 (-0.32z)| norm 0.2057 (-0.72z)| lr 5.07e-03 | 1980.15 ms | 69.3% bf16 MFU | 264542 tok/s +step 5863/18794 | loss 3.160353 (-0.98z)| norm 0.2686 (+0.42z)| lr 5.06e-03 | 1981.58 ms | 69.3% bf16 MFU | 264544 tok/s +step 5864/18794 | loss 3.183592 (-0.34z)| norm 0.1843 (-1.09z)| lr 5.06e-03 | 1983.27 ms | 69.2% bf16 MFU | 264534 tok/s +step 5865/18794 | loss 3.202745 (+0.18z)| norm 0.1946 (-0.89z)| lr 5.06e-03 | 1980.43 ms | 69.3% bf16 MFU | 264544 tok/s +step 5866/18794 | loss 3.183737 (-0.35z)| norm 0.2102 (-0.60z)| lr 5.06e-03 | 1983.48 ms | 69.2% bf16 MFU | 264534 tok/s +step 5867/18794 | loss 3.214476 (+0.48z)| norm 0.1977 (-0.82z)| lr 5.06e-03 | 1981.74 ms | 69.2% bf16 MFU | 264535 tok/s +step 5868/18794 | loss 3.206522 (+0.25z)| norm 0.2160 (-0.50z)| lr 5.06e-03 | 1985.20 ms | 69.1% bf16 MFU | 264513 tok/s +step 5869/18794 | loss 3.242747 (+1.23z)| norm 0.2175 (-0.48z)| lr 5.06e-03 | 1983.48 ms | 69.2% bf16 MFU | 264504 tok/s +step 5870/18794 | loss 3.223689 (+0.69z)| norm 0.1969 (-0.84z)| lr 5.06e-03 | 1986.53 ms | 69.1% bf16 MFU | 264475 tok/s +step 5871/18794 | loss 3.168782 (-0.81z)| norm 0.1871 (-1.00z)| lr 5.06e-03 | 1980.81 ms | 69.3% bf16 MFU | 264485 tok/s +step 5872/18794 | loss 3.172485 (-0.72z)| norm 0.1705 (-1.29z)| lr 5.06e-03 | 1980.32 ms | 69.3% bf16 MFU | 264498 tok/s +step 5873/18794 | loss 3.179238 (-0.53z)| norm 0.2155 (-0.45z)| lr 5.06e-03 | 1978.79 ms | 69.4% bf16 MFU | 264521 tok/s +step 5874/18794 | loss 3.167922 (-0.86z)| norm 0.2801 (+0.74z)| lr 5.06e-03 | 1979.68 ms | 69.3% bf16 MFU | 264537 tok/s +step 5875/18794 | loss 3.186215 (-0.34z)| norm 0.2984 (+1.06z)| lr 5.06e-03 | 1979.57 ms | 69.3% bf16 MFU | 264552 tok/s +step 5876/18794 | loss 3.191737 (-0.20z)| norm 0.2802 (+0.71z)| lr 5.06e-03 | 1979.92 ms | 69.3% bf16 MFU | 264565 tok/s +step 5877/18794 | loss 3.167476 (-0.86z)| norm 0.2002 (-0.75z)| lr 5.06e-03 | 1985.67 ms | 69.1% bf16 MFU | 264538 tok/s +step 5878/18794 | loss 3.148056 (-1.38z)| norm 0.2332 (-0.11z)| lr 5.06e-03 | 1981.97 ms | 69.2% bf16 MFU | 264538 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.731196 +step 5879/18794 | loss 3.184178 (-0.36z)| norm 0.3879 (+2.73z)| lr 5.06e-03 | 1981.25 ms | 69.3% bf16 MFU | 264542 tok/s +step 5880/18794 | loss 3.180423 (-0.48z)| norm 0.3515 (+1.99z)| lr 5.06e-03 | 1981.70 ms | 69.2% bf16 MFU | 264543 tok/s +step 5881/18794 | loss 3.201732 (+0.14z)| norm 0.2096 (-0.59z)| lr 5.06e-03 | 1988.85 ms | 69.0% bf16 MFU | 264497 tok/s +step 5882/18794 | loss 3.129858 (-1.90z)| norm 0.2814 (+0.71z)| lr 5.06e-03 | 1982.82 ms | 69.2% bf16 MFU | 264493 tok/s +step 5883/18794 | loss 3.196682 (+0.02z)| norm 0.2823 (+0.71z)| lr 5.06e-03 | 1986.47 ms | 69.1% bf16 MFU | 264465 tok/s +step 5884/18794 | loss 3.190078 (-0.15z)| norm 0.1581 (-1.51z)| lr 5.06e-03 | 1983.69 ms | 69.2% bf16 MFU | 264456 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.006409 +step 5885/18794 | loss 3.195063 (+0.00z)| norm 0.4195 (+3.01z)| lr 5.06e-03 | 1981.26 ms | 69.3% bf16 MFU | 264465 tok/s +step 5886/18794 | loss 3.188565 (-0.20z)| norm 0.3168 (+1.22z)| lr 5.06e-03 | 1980.57 ms | 69.3% bf16 MFU | 264477 tok/s +step 5887/18794 | loss 3.180912 (-0.42z)| norm 0.3034 (+0.98z)| lr 5.06e-03 | 1980.73 ms | 69.3% bf16 MFU | 264488 tok/s +reducing beta2 to 0.9 and lr/wd by 0.857 due to grad z-score of 4.085090 +step 5888/18794 | loss 3.159246 (-1.05z)| norm 0.5127 (+4.09z)| lr 4.33e-03 | 1979.75 ms | 69.3% bf16 MFU | 264505 tok/s +step 5889/18794 | loss 3.223810 (+0.87z)| norm 0.1669 (-1.26z)| lr 5.06e-03 | 1979.54 ms | 69.3% bf16 MFU | 264522 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.791508 +step 5890/18794 | loss 3.170546 (-0.70z)| norm 0.4390 (+2.79z)| lr 5.06e-03 | 1981.21 ms | 69.3% bf16 MFU | 264528 tok/s +step 5891/18794 | loss 3.182362 (-0.34z)| norm 0.2309 (-0.28z)| lr 5.05e-03 | 1980.23 ms | 69.3% bf16 MFU | 264539 tok/s +step 5892/18794 | loss 3.193497 (-0.01z)| norm 0.3054 (+0.81z)| lr 5.05e-03 | 1984.55 ms | 69.2% bf16 MFU | 264522 tok/s +step 5893/18794 | loss 3.213439 (+0.58z)| norm 0.2476 (-0.05z)| lr 5.05e-03 | 1980.50 ms | 69.3% bf16 MFU | 264532 tok/s +step 5894/18794 | loss 3.162910 (-0.96z)| norm 0.2096 (-0.62z)| lr 5.05e-03 | 1980.92 ms | 69.3% bf16 MFU | 264539 tok/s +step 5895/18794 | loss 3.202945 (+0.26z)| norm 0.1746 (-1.14z)| lr 5.05e-03 | 1982.97 ms | 69.2% bf16 MFU | 264532 tok/s +step 5896/18794 | loss 3.198901 (+0.15z)| norm 0.2067 (-0.65z)| lr 5.05e-03 | 1988.26 ms | 69.0% bf16 MFU | 264490 tok/s +step 5897/18794 | loss 3.128489 (-1.97z)| norm 0.1668 (-1.23z)| lr 5.05e-03 | 1984.97 ms | 69.1% bf16 MFU | 264472 tok/s +step 5898/18794 | loss 3.175491 (-0.53z)| norm 0.2629 (+0.22z)| lr 5.05e-03 | 1987.01 ms | 69.1% bf16 MFU | 264441 tok/s +step 5899/18794 | loss 3.152228 (-1.23z)| norm 0.2989 (+0.76z)| lr 5.05e-03 | 1979.11 ms | 69.3% bf16 MFU | 264464 tok/s +step 5900/18794 | loss 3.208851 (+0.48z)| norm 0.1893 (-0.88z)| lr 5.05e-03 | 1979.09 ms | 69.3% bf16 MFU | 264487 tok/s +step 5901/18794 | loss 3.239916 (+1.40z)| norm 0.2090 (-0.57z)| lr 5.05e-03 | 1979.97 ms | 69.3% bf16 MFU | 264502 tok/s +step 5902/18794 | loss 3.162019 (-0.92z)| norm 0.2126 (-0.52z)| lr 5.05e-03 | 1980.31 ms | 69.3% bf16 MFU | 264515 tok/s +step 5903/18794 | loss 3.226171 (+0.98z)| norm 0.1951 (-0.79z)| lr 5.05e-03 | 1979.17 ms | 69.3% bf16 MFU | 264534 tok/s +step 5904/18794 | loss 3.231312 (+1.14z)| norm 0.1844 (-0.96z)| lr 5.05e-03 | 1979.43 ms | 69.3% bf16 MFU | 264551 tok/s +step 5905/18794 | loss 3.183131 (-0.28z)| norm 0.1980 (-0.75z)| lr 5.05e-03 | 1979.26 ms | 69.3% bf16 MFU | 264568 tok/s +step 5906/18794 | loss 3.132751 (-1.79z)| norm 0.1559 (-1.36z)| lr 5.05e-03 | 1980.71 ms | 69.3% bf16 MFU | 264574 tok/s +step 5907/18794 | loss 3.175653 (-0.48z)| norm 0.1939 (-0.78z)| lr 5.05e-03 | 1980.35 ms | 69.3% bf16 MFU | 264583 tok/s +step 5908/18794 | loss 3.203341 (+0.36z)| norm 0.2015 (-0.65z)| lr 5.05e-03 | 1980.61 ms | 69.3% bf16 MFU | 264589 tok/s +step 5909/18794 | loss 3.233855 (+1.26z)| norm 0.2163 (-0.42z)| lr 5.05e-03 | 1981.16 ms | 69.3% bf16 MFU | 264592 tok/s +step 5910/18794 | loss 3.209248 (+0.50z)| norm 0.2069 (-0.56z)| lr 5.05e-03 | 2017.85 ms | 68.0% bf16 MFU | 264353 tok/s +step 5911/18794 | loss 3.110165 (-2.43z)| norm 0.3059 (+0.96z)| lr 5.05e-03 | 2034.80 ms | 67.4% bf16 MFU | 264019 tok/s +step 5912/18794 | loss 3.168118 (-0.70z)| norm 0.3719 (+1.91z)| lr 5.05e-03 | 2033.48 ms | 67.5% bf16 MFU | 263709 tok/s +step 5913/18794 | loss 3.199865 (+0.24z)| norm 0.3431 (+1.45z)| lr 5.05e-03 | 2040.15 ms | 67.3% bf16 MFU | 263373 tok/s +step 5914/18794 | loss 3.146694 (-1.35z)| norm 0.2970 (+0.76z)| lr 5.05e-03 | 2034.63 ms | 67.4% bf16 MFU | 263088 tok/s +step 5915/18794 | loss 3.184105 (-0.23z)| norm 0.3390 (+1.38z)| lr 5.05e-03 | 2042.61 ms | 67.2% bf16 MFU | 262768 tok/s +step 5916/18794 | loss 3.188056 (-0.09z)| norm 0.3635 (+1.72z)| lr 5.05e-03 | 2027.49 ms | 67.7% bf16 MFU | 262559 tok/s +step 5917/18794 | loss 3.145793 (-1.35z)| norm 0.2103 (-0.52z)| lr 5.05e-03 | 2039.40 ms | 67.3% bf16 MFU | 262285 tok/s +step 5918/18794 | loss 3.213849 (+0.72z)| norm 0.2158 (-0.43z)| lr 5.04e-03 | 2042.80 ms | 67.2% bf16 MFU | 262003 tok/s +step 5919/18794 | loss 3.203923 (+0.42z)| norm 0.2462 (+0.04z)| lr 5.04e-03 | 2037.94 ms | 67.3% bf16 MFU | 261766 tok/s +step 5920/18794 | loss 3.176211 (-0.43z)| norm 0.1736 (-1.05z)| lr 5.04e-03 | 2037.38 ms | 67.4% bf16 MFU | 261545 tok/s +step 5921/18794 | loss 3.138141 (-1.57z)| norm 0.1985 (-0.66z)| lr 5.04e-03 | 2039.42 ms | 67.3% bf16 MFU | 261321 tok/s +step 5922/18794 | loss 3.080328 (-3.14z)| norm 0.1731 (-1.02z)| lr 5.04e-03 | 2029.94 ms | 67.6% bf16 MFU | 261169 tok/s +step 5923/18794 | loss 3.177548 (-0.33z)| norm 0.2558 (+0.20z)| lr 5.04e-03 | 2024.16 ms | 67.8% bf16 MFU | 261061 tok/s +step 5924/18794 | loss 3.163064 (-0.74z)| norm 0.1967 (-0.69z)| lr 5.04e-03 | 2041.08 ms | 67.2% bf16 MFU | 260852 tok/s +step 5925/18794 | loss 3.120063 (-1.95z)| norm 0.2768 (+0.50z)| lr 5.04e-03 | 2035.60 ms | 67.4% bf16 MFU | 260687 tok/s +step 5926/18794 | loss 3.139432 (-1.37z)| norm 0.3681 (+1.81z)| lr 5.04e-03 | 2029.35 ms | 67.6% bf16 MFU | 260570 tok/s +step 5927/18794 | loss 3.195842 (+0.26z)| norm 0.3357 (+1.31z)| lr 5.04e-03 | 2018.91 ms | 68.0% bf16 MFU | 260526 tok/s +step 5928/18794 | loss 3.125578 (-1.72z)| norm 0.1849 (-0.90z)| lr 5.04e-03 | 2018.63 ms | 68.0% bf16 MFU | 260486 tok/s +step 5929/18794 | loss 3.190521 (+0.13z)| norm 0.3307 (+1.21z)| lr 5.04e-03 | 2033.06 ms | 67.5% bf16 MFU | 260356 tok/s +step 5930/18794 | loss 3.130621 (-1.57z)| norm 0.3810 (+1.88z)| lr 5.04e-03 | 2027.33 ms | 67.7% bf16 MFU | 260269 tok/s +step 5931/18794 | loss 3.190121 (+0.14z)| norm 0.2428 (-0.10z)| lr 5.04e-03 | 2020.41 ms | 67.9% bf16 MFU | 260230 tok/s +step 5932/18794 | loss 3.171218 (-0.39z)| norm 0.2189 (-0.44z)| lr 5.04e-03 | 2033.57 ms | 67.5% bf16 MFU | 260109 tok/s +step 5933/18794 | loss 3.163153 (-0.61z)| norm 0.2128 (-0.53z)| lr 5.04e-03 | 2026.01 ms | 67.7% bf16 MFU | 260043 tok/s +step 5934/18794 | loss 3.116721 (-1.96z)| norm 0.1920 (-0.82z)| lr 5.04e-03 | 2026.71 ms | 67.7% bf16 MFU | 259975 tok/s +step 5935/18794 | loss 3.133432 (-1.43z)| norm 0.1752 (-1.06z)| lr 5.04e-03 | 2028.05 ms | 67.7% bf16 MFU | 259902 tok/s +step 5936/18794 | loss 3.200606 (+0.52z)| norm 0.2437 (-0.07z)| lr 5.04e-03 | 2018.24 ms | 68.0% bf16 MFU | 259896 tok/s +step 5937/18794 | loss 3.208728 (+0.75z)| norm 0.2058 (-0.60z)| lr 5.04e-03 | 2019.91 ms | 67.9% bf16 MFU | 259879 tok/s +step 5938/18794 | loss 3.124490 (-1.69z)| norm 0.2419 (-0.09z)| lr 5.04e-03 | 2037.26 ms | 67.4% bf16 MFU | 259753 tok/s +step 5939/18794 | loss 3.234062 (+1.46z)| norm 0.3016 (+0.76z)| lr 5.04e-03 | 2027.53 ms | 67.7% bf16 MFU | 259694 tok/s +step 5940/18794 | loss 3.171443 (-0.34z)| norm 0.2328 (-0.21z)| lr 5.04e-03 | 2035.76 ms | 67.4% bf16 MFU | 259586 tok/s +step 5941/18794 | loss 3.191856 (+0.25z)| norm 0.3574 (+1.61z)| lr 5.04e-03 | 2038.16 ms | 67.3% bf16 MFU | 259469 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.372230 +step 5942/18794 | loss 3.191492 (+0.23z)| norm 0.4951 (+3.37z)| lr 5.04e-03 | 2022.67 ms | 67.8% bf16 MFU | 259456 tok/s +step 5943/18794 | loss 3.170013 (-0.40z)| norm 0.1920 (-0.79z)| lr 5.04e-03 | 2029.59 ms | 67.6% bf16 MFU | 259399 tok/s +step 5944/18794 | loss 3.193509 (+0.30z)| norm 0.3032 (+0.73z)| lr 5.04e-03 | 2011.58 ms | 68.2% bf16 MFU | 259461 tok/s +step 5945/18794 | loss 3.167213 (-0.46z)| norm 0.3597 (+1.48z)| lr 5.04e-03 | 2017.80 ms | 68.0% bf16 MFU | 259479 tok/s +step 5946/18794 | loss 3.145703 (-1.09z)| norm 0.2203 (-0.42z)| lr 5.03e-03 | 2011.11 ms | 68.2% bf16 MFU | 259540 tok/s +step 5947/18794 | loss 3.153650 (-0.83z)| norm 0.2081 (-0.59z)| lr 5.03e-03 | 2031.75 ms | 67.5% bf16 MFU | 259466 tok/s +step 5948/18794 | loss 3.207096 (+0.79z)| norm 0.2488 (-0.04z)| lr 5.03e-03 | 2034.55 ms | 67.5% bf16 MFU | 259377 tok/s +step 5949/18794 | loss 3.098166 (-2.42z)| norm 0.2183 (-0.46z)| lr 5.03e-03 | 2016.42 ms | 68.1% bf16 MFU | 259409 tok/s +step 5950/18794 | loss 3.122254 (-1.68z)| norm 0.2245 (-0.38z)| lr 5.03e-03 | 2008.84 ms | 68.3% bf16 MFU | 259488 tok/s +step 5951/18794 | loss 3.111883 (-1.94z)| norm 0.2074 (-0.61z)| lr 5.03e-03 | 2032.84 ms | 67.5% bf16 MFU | 259409 tok/s +step 5952/18794 | loss 3.136761 (-1.20z)| norm 0.1935 (-0.79z)| lr 5.03e-03 | 2032.62 ms | 67.5% bf16 MFU | 259335 tok/s +step 5953/18794 | loss 3.196438 (+0.57z)| norm 0.2684 (+0.23z)| lr 5.03e-03 | 2022.90 ms | 67.8% bf16 MFU | 259327 tok/s +step 5954/18794 | loss 3.177975 (+0.03z)| norm 0.2840 (+0.44z)| lr 5.03e-03 | 2009.98 ms | 68.3% bf16 MFU | 259403 tok/s +step 5955/18794 | loss 3.163582 (-0.40z)| norm 0.1813 (-0.95z)| lr 5.03e-03 | 2031.75 ms | 67.5% bf16 MFU | 259335 tok/s +step 5956/18794 | loss 3.182198 (+0.17z)| norm 0.3638 (+1.51z)| lr 5.03e-03 | 2014.90 ms | 68.1% bf16 MFU | 259379 tok/s +step 5957/18794 | loss 3.145430 (-0.93z)| norm 0.3798 (+1.68z)| lr 5.03e-03 | 2019.65 ms | 67.9% bf16 MFU | 259390 tok/s +step 5958/18794 | loss 3.134768 (-1.23z)| norm 0.2150 (-0.51z)| lr 5.03e-03 | 2020.66 ms | 67.9% bf16 MFU | 259393 tok/s +step 5959/18794 | loss 3.150396 (-0.75z)| norm 0.2240 (-0.40z)| lr 5.03e-03 | 2018.94 ms | 68.0% bf16 MFU | 259408 tok/s +step 5960/18794 | loss 3.122128 (-1.57z)| norm 0.2306 (-0.31z)| lr 5.03e-03 | 2021.67 ms | 67.9% bf16 MFU | 259404 tok/s +step 5961/18794 | loss 3.204623 (+0.90z)| norm 0.3054 (+0.69z)| lr 5.03e-03 | 2018.32 ms | 68.0% bf16 MFU | 259422 tok/s +step 5962/18794 | loss 3.103665 (-2.07z)| norm 0.3786 (+1.63z)| lr 5.03e-03 | 2005.27 ms | 68.4% bf16 MFU | 259524 tok/s +step 5963/18794 | loss 3.177657 (+0.10z)| norm 0.2296 (-0.34z)| lr 5.03e-03 | 2013.58 ms | 68.2% bf16 MFU | 259566 tok/s +step 5964/18794 | loss 3.173008 (-0.03z)| norm 0.1891 (-0.87z)| lr 5.03e-03 | 2021.17 ms | 67.9% bf16 MFU | 259558 tok/s +step 5965/18794 | loss 3.155858 (-0.52z)| norm 0.2046 (-0.67z)| lr 5.03e-03 | 2018.83 ms | 68.0% bf16 MFU | 259565 tok/s +step 5966/18794 | loss 3.091328 (-2.33z)| norm 0.1730 (-1.08z)| lr 5.03e-03 | 2015.77 ms | 68.1% bf16 MFU | 259591 tok/s +step 5967/18794 | loss 3.134669 (-1.07z)| norm 0.2476 (-0.10z)| lr 5.03e-03 | 2011.06 ms | 68.2% bf16 MFU | 259647 tok/s +step 5968/18794 | loss 3.077577 (-2.59z)| norm 0.2260 (-0.39z)| lr 5.03e-03 | 2016.28 ms | 68.1% bf16 MFU | 259666 tok/s +step 5969/18794 | loss 3.193156 (+0.66z)| norm 0.1931 (-0.82z)| lr 5.03e-03 | 2007.36 ms | 68.4% bf16 MFU | 259742 tok/s +step 5970/18794 | loss 3.124143 (-1.28z)| norm 0.2175 (-0.50z)| lr 5.03e-03 | 2003.90 ms | 68.5% bf16 MFU | 259836 tok/s +step 5971/18794 | loss 3.143207 (-0.73z)| norm 0.2015 (-0.72z)| lr 5.03e-03 | 2018.95 ms | 68.0% bf16 MFU | 259829 tok/s +step 5972/18794 | loss 3.139883 (-0.81z)| norm 0.1760 (-1.05z)| lr 5.03e-03 | 2010.27 ms | 68.3% bf16 MFU | 259878 tok/s +step 5973/18794 | loss 3.085503 (-2.27z)| norm 0.2481 (-0.10z)| lr 5.02e-03 | 2009.88 ms | 68.3% bf16 MFU | 259926 tok/s +step 5974/18794 | loss 3.162453 (-0.14z)| norm 0.2296 (-0.34z)| lr 5.02e-03 | 2027.90 ms | 67.7% bf16 MFU | 259857 tok/s +step 5975/18794 | loss 3.212651 (+1.23z)| norm 0.2106 (-0.58z)| lr 5.02e-03 | 2016.57 ms | 68.1% bf16 MFU | 259864 tok/s +step 5976/18794 | loss 3.173687 (+0.17z)| norm 0.2861 (+0.42z)| lr 5.02e-03 | 2014.64 ms | 68.1% bf16 MFU | 259882 tok/s +step 5977/18794 | loss 3.180656 (+0.35z)| norm 0.1960 (-0.78z)| lr 5.02e-03 | 2009.66 ms | 68.3% bf16 MFU | 259932 tok/s +step 5978/18794 | loss 3.140542 (-0.74z)| norm 0.3151 (+0.79z)| lr 5.02e-03 | 2009.82 ms | 68.3% bf16 MFU | 259979 tok/s +step 5979/18794 | loss 3.126369 (-1.11z)| norm 0.3737 (+1.57z)| lr 5.02e-03 | 2031.25 ms | 67.6% bf16 MFU | 259886 tok/s +step 5980/18794 | loss 3.154174 (-0.35z)| norm 0.2270 (-0.36z)| lr 5.02e-03 | 2013.92 ms | 68.1% bf16 MFU | 259908 tok/s +step 5981/18794 | loss 3.117101 (-1.33z)| norm 0.2899 (+0.47z)| lr 5.02e-03 | 2027.43 ms | 67.7% bf16 MFU | 259842 tok/s +step 5982/18794 | loss 3.135982 (-0.82z)| norm 0.4014 (+1.91z)| lr 5.02e-03 | 2019.51 ms | 68.0% bf16 MFU | 259831 tok/s +step 5983/18794 | loss 3.182229 (+0.45z)| norm 0.1956 (-0.78z)| lr 5.02e-03 | 2015.04 ms | 68.1% bf16 MFU | 259849 tok/s +step 5984/18794 | loss 3.111217 (-1.46z)| norm 0.3491 (+1.21z)| lr 5.02e-03 | 2029.41 ms | 67.6% bf16 MFU | 259773 tok/s +step 5985/18794 | loss 3.190662 (+0.69z)| norm 0.2491 (-0.08z)| lr 5.02e-03 | 2008.47 ms | 68.3% bf16 MFU | 259837 tok/s +step 5986/18794 | loss 3.184517 (+0.53z)| norm 0.2154 (-0.52z)| lr 5.02e-03 | 2017.70 ms | 68.0% bf16 MFU | 259837 tok/s +step 5987/18794 | loss 3.131151 (-0.90z)| norm 0.1968 (-0.76z)| lr 5.02e-03 | 2012.93 ms | 68.2% bf16 MFU | 259868 tok/s +step 5988/18794 | loss 3.059761 (-2.70z)| norm 0.2414 (-0.13z)| lr 5.02e-03 | 2001.08 ms | 68.6% bf16 MFU | 259975 tok/s +step 5989/18794 | loss 3.204007 (+1.06z)| norm 0.1997 (-0.74z)| lr 5.02e-03 | 2010.61 ms | 68.3% bf16 MFU | 260014 tok/s +step 5990/18794 | loss 3.165371 (+0.05z)| norm 0.2121 (-0.55z)| lr 5.02e-03 | 2008.48 ms | 68.3% bf16 MFU | 260065 tok/s +step 5991/18794 | loss 3.195458 (+0.84z)| norm 0.2049 (-0.65z)| lr 5.02e-03 | 2006.61 ms | 68.4% bf16 MFU | 260126 tok/s +step 5992/18794 | loss 3.155654 (-0.19z)| norm 0.2179 (-0.44z)| lr 5.02e-03 | 2014.75 ms | 68.1% bf16 MFU | 260131 tok/s +step 5993/18794 | loss 3.141133 (-0.56z)| norm 0.1795 (-1.00z)| lr 5.02e-03 | 2011.11 ms | 68.2% bf16 MFU | 260159 tok/s +step 5994/18794 | loss 3.196435 (+0.89z)| norm 0.2929 (+0.68z)| lr 5.02e-03 | 2025.52 ms | 67.8% bf16 MFU | 260093 tok/s +step 5995/18794 | loss 3.155806 (-0.17z)| norm 0.3020 (+0.80z)| lr 5.02e-03 | 1998.29 ms | 68.7% bf16 MFU | 260207 tok/s +step 5996/18794 | loss 3.184277 (+0.59z)| norm 0.2045 (-0.66z)| lr 5.02e-03 | 2008.92 ms | 68.3% bf16 MFU | 260246 tok/s +step 5997/18794 | loss 3.207091 (+1.17z)| norm 0.1740 (-1.12z)| lr 5.02e-03 | 2010.97 ms | 68.2% bf16 MFU | 260269 tok/s +step 5998/18794 | loss 3.155186 (-0.20z)| norm 0.2123 (-0.54z)| lr 5.02e-03 | 1999.06 ms | 68.6% bf16 MFU | 260369 tok/s +step 5999/18794 | loss 3.154582 (-0.21z)| norm 0.1760 (-1.06z)| lr 5.02e-03 | 2002.66 ms | 68.5% bf16 MFU | 260441 tok/s +step 6000/18794 | loss 3.162997 (+0.02z)| norm 0.1742 (-1.08z)| lr 5.01e-03 | 2002.27 ms | 68.5% bf16 MFU | 260511 tok/s +val loss 3.203067 +HellaSwag: 2882/10042 = 0.286995Swag: 990/1256: 0/1256 +Writing state to log_gpt3_125M_edu_v4/state_00006000_00001.bin +generating: +--- +I have been consulting many times on my parents giving advice on their details. Every time my son of mine went to New York City. Were all familiar with a few words from likely childhood…As a fifth gr +ader I have been studying the Dickens plot A Christmas Carol and knew formal schooling in the Smartway gallery, but I always felt that I had understood more than the plots grand scope. Knowing the st +ory of the author and gospels plots allows me to skim through this first few years of developing my own knowledge. +So, I concentrated on a specific subject: that Ezekiels father and a coal miners persecutor. +--- +Writing checkpoint at step 6000 +Writing model to log_gpt3_125M_edu_v4/model_00006000.bin +Writing state to log_gpt3_125M_edu_v4/state_00006000_00000.bin +Deleting checkpoint at step 3500 +step 6001/18794 | loss 3.101381 (-1.60z)| norm 0.1698 (-1.14z)| lr 5.01e-03 | 1999.90 ms | 68.6% bf16 MFU | 260593 tok/s +step 6002/18794 | loss 3.097713 (-1.66z)| norm 0.1761 (-1.03z)| lr 5.01e-03 | 2006.33 ms | 68.4% bf16 MFU | 260629 tok/s +step 6003/18794 | loss 3.136261 (-0.62z)| norm 0.1955 (-0.75z)| lr 5.01e-03 | 2004.25 ms | 68.5% bf16 MFU | 260677 tok/s +step 6004/18794 | loss 3.167142 (+0.23z)| norm 0.2487 (+0.03z)| lr 5.01e-03 | 2021.19 ms | 67.9% bf16 MFU | 260613 tok/s +step 6005/18794 | loss 3.211512 (+1.43z)| norm 0.2041 (-0.64z)| lr 5.01e-03 | 2009.31 ms | 68.3% bf16 MFU | 260629 tok/s +step 6006/18794 | loss 3.139058 (-0.54z)| norm 0.2543 (+0.09z)| lr 5.01e-03 | 2012.66 ms | 68.2% bf16 MFU | 260622 tok/s +step 6007/18794 | loss 3.137234 (-0.58z)| norm 0.2442 (-0.06z)| lr 5.01e-03 | 1996.92 ms | 68.7% bf16 MFU | 260719 tok/s +step 6008/18794 | loss 3.123447 (-0.94z)| norm 0.2149 (-0.51z)| lr 5.01e-03 | 2003.69 ms | 68.5% bf16 MFU | 260766 tok/s +step 6009/18794 | loss 3.135128 (-0.60z)| norm 0.2447 (-0.06z)| lr 5.01e-03 | 2019.57 ms | 68.0% bf16 MFU | 260708 tok/s +step 6010/18794 | loss 3.100741 (-1.53z)| norm 0.2178 (-0.47z)| lr 5.01e-03 | 2007.99 ms | 68.3% bf16 MFU | 260727 tok/s +step 6011/18794 | loss 3.182273 (+0.72z)| norm 0.2636 (+0.23z)| lr 5.01e-03 | 2016.47 ms | 68.1% bf16 MFU | 260691 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.435857 +step 6012/18794 | loss 3.155237 (-0.03z)| norm 0.4124 (+2.44z)| lr 5.01e-03 | 2005.38 ms | 68.4% bf16 MFU | 260729 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.449386 +step 6013/18794 | loss 3.157866 (+0.06z)| norm 0.4176 (+2.45z)| lr 5.01e-03 | 1999.30 ms | 68.6% bf16 MFU | 260804 tok/s +step 6014/18794 | loss 3.161884 (+0.17z)| norm 0.1845 (-0.93z)| lr 5.01e-03 | 2019.48 ms | 68.0% bf16 MFU | 260744 tok/s +step 6015/18794 | loss 3.184701 (+0.81z)| norm 0.3297 (+1.18z)| lr 5.01e-03 | 2011.75 ms | 68.2% bf16 MFU | 260738 tok/s +step 6016/18794 | loss 3.173810 (+0.51z)| norm 0.1692 (-1.14z)| lr 5.01e-03 | 1988.92 ms | 69.0% bf16 MFU | 260881 tok/s +step 6017/18794 | loss 3.164654 (+0.24z)| norm 0.3536 (+1.54z)| lr 5.01e-03 | 1985.49 ms | 69.1% bf16 MFU | 261040 tok/s +step 6018/18794 | loss 3.170932 (+0.44z)| norm 0.2012 (-0.68z)| lr 5.01e-03 | 2004.03 ms | 68.5% bf16 MFU | 261069 tok/s +step 6019/18794 | loss 3.206612 (+1.46z)| norm 0.3641 (+1.65z)| lr 5.01e-03 | 2004.08 ms | 68.5% bf16 MFU | 261096 tok/s +step 6020/18794 | loss 3.117376 (-1.07z)| norm 0.3839 (+1.88z)| lr 5.01e-03 | 1998.01 ms | 68.7% bf16 MFU | 261161 tok/s +step 6021/18794 | loss 3.139045 (-0.46z)| norm 0.3905 (+1.91z)| lr 5.01e-03 | 2011.05 ms | 68.2% bf16 MFU | 261139 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.070094 +step 6022/18794 | loss 3.149569 (-0.18z)| norm 0.4064 (+2.07z)| lr 5.01e-03 | 1997.03 ms | 68.7% bf16 MFU | 261208 tok/s +step 6023/18794 | loss 3.195916 (+1.16z)| norm 0.3071 (+0.70z)| lr 5.01e-03 | 2015.01 ms | 68.1% bf16 MFU | 261157 tok/s +step 6024/18794 | loss 3.128740 (-0.78z)| norm 0.2450 (-0.16z)| lr 5.01e-03 | 2008.80 ms | 68.3% bf16 MFU | 261149 tok/s +step 6025/18794 | loss 3.112360 (-1.25z)| norm 0.3890 (+1.78z)| lr 5.01e-03 | 2003.75 ms | 68.5% bf16 MFU | 261174 tok/s +step 6026/18794 | loss 3.158743 (+0.09z)| norm 0.3424 (+1.16z)| lr 5.01e-03 | 2011.76 ms | 68.2% bf16 MFU | 261146 tok/s +step 6027/18794 | loss 3.123065 (-0.93z)| norm 0.2305 (-0.35z)| lr 5.00e-03 | 2004.61 ms | 68.5% bf16 MFU | 261166 tok/s +step 6028/18794 | loss 3.168156 (+0.37z)| norm 0.1760 (-1.09z)| lr 5.00e-03 | 2019.23 ms | 68.0% bf16 MFU | 261090 tok/s +step 6029/18794 | loss 3.122967 (-0.92z)| norm 0.2383 (-0.23z)| lr 5.00e-03 | 2001.61 ms | 68.6% bf16 MFU | 261132 tok/s +step 6030/18794 | loss 3.164234 (+0.27z)| norm 0.2043 (-0.68z)| lr 5.00e-03 | 2003.64 ms | 68.5% bf16 MFU | 261159 tok/s +step 6031/18794 | loss 3.147218 (-0.22z)| norm 0.2326 (-0.29z)| lr 5.00e-03 | 2001.60 ms | 68.6% bf16 MFU | 261198 tok/s +step 6032/18794 | loss 3.205167 (+1.45z)| norm 0.3288 (+1.03z)| lr 5.00e-03 | 2004.91 ms | 68.4% bf16 MFU | 261213 tok/s +step 6033/18794 | loss 3.170330 (+0.44z)| norm 0.2798 (+0.34z)| lr 5.00e-03 | 2009.72 ms | 68.3% bf16 MFU | 261196 tok/s +step 6034/18794 | loss 3.200752 (+1.30z)| norm 0.2166 (-0.54z)| lr 5.00e-03 | 1989.34 ms | 69.0% bf16 MFU | 261314 tok/s +step 6035/18794 | loss 3.141353 (-0.42z)| norm 0.3725 (+1.58z)| lr 5.00e-03 | 2004.30 ms | 68.5% bf16 MFU | 261327 tok/s +step 6036/18794 | loss 3.120004 (-1.02z)| norm 0.3480 (+1.22z)| lr 5.00e-03 | 1992.71 ms | 68.9% bf16 MFU | 261416 tok/s +step 6037/18794 | loss 3.126843 (-0.81z)| norm 0.2014 (-0.78z)| lr 5.00e-03 | 2003.90 ms | 68.5% bf16 MFU | 261427 tok/s +step 6038/18794 | loss 3.100299 (-1.56z)| norm 0.3208 (+0.84z)| lr 5.00e-03 | 2006.22 ms | 68.4% bf16 MFU | 261422 tok/s +step 6039/18794 | loss 3.167982 (+0.43z)| norm 0.2000 (-0.79z)| lr 5.00e-03 | 2004.06 ms | 68.5% bf16 MFU | 261432 tok/s +step 6040/18794 | loss 3.217873 (+1.88z)| norm 0.2004 (-0.78z)| lr 5.00e-03 | 2006.83 ms | 68.4% bf16 MFU | 261423 tok/s +step 6041/18794 | loss 3.153624 (+0.00z)| norm 0.1760 (-1.09z)| lr 5.00e-03 | 2006.66 ms | 68.4% bf16 MFU | 261415 tok/s +step 6042/18794 | loss 3.162914 (+0.29z)| norm 0.1897 (-0.91z)| lr 5.00e-03 | 2010.27 ms | 68.3% bf16 MFU | 261385 tok/s +step 6043/18794 | loss 3.148938 (-0.12z)| norm 0.2218 (-0.45z)| lr 5.00e-03 | 2003.86 ms | 68.5% bf16 MFU | 261397 tok/s +step 6044/18794 | loss 3.173177 (+0.61z)| norm 0.2063 (-0.66z)| lr 5.00e-03 | 1995.54 ms | 68.8% bf16 MFU | 261464 tok/s +step 6045/18794 | loss 3.215430 (+1.83z)| norm 0.1776 (-1.05z)| lr 5.00e-03 | 1989.01 ms | 69.0% bf16 MFU | 261570 tok/s +step 6046/18794 | loss 3.165835 (+0.36z)| norm 0.1966 (-0.77z)| lr 5.00e-03 | 1997.62 ms | 68.7% bf16 MFU | 261615 tok/s +step 6047/18794 | loss 3.163403 (+0.29z)| norm 0.1769 (-1.05z)| lr 5.00e-03 | 2002.17 ms | 68.5% bf16 MFU | 261627 tok/s +step 6048/18794 | loss 3.176538 (+0.69z)| norm 0.3234 (+1.04z)| lr 5.00e-03 | 1995.84 ms | 68.8% bf16 MFU | 261680 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.055633 +step 6049/18794 | loss 3.107354 (-1.37z)| norm 0.4774 (+3.06z)| lr 5.00e-03 | 1989.57 ms | 69.0% bf16 MFU | 261772 tok/s +step 6050/18794 | loss 3.102306 (-1.51z)| norm 0.2821 (+0.39z)| lr 5.00e-03 | 2017.31 ms | 68.0% bf16 MFU | 261678 tok/s +step 6051/18794 | loss 3.147667 (-0.18z)| norm 0.3281 (+0.99z)| lr 5.00e-03 | 1996.02 ms | 68.8% bf16 MFU | 261728 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.619366 +step 6052/18794 | loss 3.163046 (+0.28z)| norm 0.4567 (+2.62z)| lr 5.00e-03 | 1993.70 ms | 68.8% bf16 MFU | 261790 tok/s +step 6053/18794 | loss 3.150399 (-0.09z)| norm 0.1986 (-0.76z)| lr 5.00e-03 | 1996.66 ms | 68.7% bf16 MFU | 261829 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.230695 +step 6054/18794 | loss 3.171569 (+0.55z)| norm 0.4328 (+2.23z)| lr 4.99e-03 | 2000.01 ms | 68.6% bf16 MFU | 261845 tok/s +step 6055/18794 | loss 3.140472 (-0.38z)| norm 0.3385 (+1.01z)| lr 4.99e-03 | 1995.24 ms | 68.8% bf16 MFU | 261891 tok/s +step 6056/18794 | loss 3.191950 (+1.17z)| norm 0.2006 (-0.74z)| lr 4.99e-03 | 2002.88 ms | 68.5% bf16 MFU | 261885 tok/s +step 6057/18794 | loss 3.177112 (+0.71z)| norm 0.1998 (-0.74z)| lr 4.99e-03 | 2002.38 ms | 68.5% bf16 MFU | 261882 tok/s +step 6058/18794 | loss 3.108831 (-1.32z)| norm 0.2253 (-0.41z)| lr 4.99e-03 | 2001.80 ms | 68.6% bf16 MFU | 261884 tok/s +step 6059/18794 | loss 3.157136 (+0.11z)| norm 0.1838 (-0.94z)| lr 4.99e-03 | 2017.76 ms | 68.0% bf16 MFU | 261781 tok/s +step 6060/18794 | loss 3.073955 (-2.30z)| norm 0.1799 (-0.98z)| lr 4.99e-03 | 1996.06 ms | 68.8% bf16 MFU | 261825 tok/s +step 6061/18794 | loss 3.178000 (+0.75z)| norm 0.1984 (-0.73z)| lr 4.99e-03 | 2018.78 ms | 68.0% bf16 MFU | 261719 tok/s +step 6062/18794 | loss 3.158888 (+0.17z)| norm 0.2247 (-0.37z)| lr 4.99e-03 | 1998.85 ms | 68.7% bf16 MFU | 261748 tok/s +step 6063/18794 | loss 3.171509 (+0.55z)| norm 0.2306 (-0.29z)| lr 4.99e-03 | 1985.60 ms | 69.1% bf16 MFU | 261863 tok/s +step 6064/18794 | loss 3.228792 (+2.18z)| norm 0.2317 (-0.29z)| lr 4.99e-03 | 2003.09 ms | 68.5% bf16 MFU | 261857 tok/s +step 6065/18794 | loss 3.145118 (-0.24z)| norm 0.2083 (-0.59z)| lr 4.99e-03 | 1997.75 ms | 68.7% bf16 MFU | 261886 tok/s +step 6066/18794 | loss 3.162722 (+0.25z)| norm 0.2096 (-0.58z)| lr 4.99e-03 | 1992.03 ms | 68.9% bf16 MFU | 261951 tok/s +step 6067/18794 | loss 3.123564 (-0.90z)| norm 0.2111 (-0.56z)| lr 4.99e-03 | 1999.22 ms | 68.6% bf16 MFU | 261966 tok/s +step 6068/18794 | loss 3.194817 (+1.19z)| norm 0.2715 (+0.23z)| lr 4.99e-03 | 1994.58 ms | 68.8% bf16 MFU | 262011 tok/s +step 6069/18794 | loss 3.106552 (-1.43z)| norm 0.2948 (+0.52z)| lr 4.99e-03 | 1999.14 ms | 68.6% bf16 MFU | 262023 tok/s +step 6070/18794 | loss 3.180447 (+0.76z)| norm 0.1715 (-1.09z)| lr 4.99e-03 | 1999.41 ms | 68.6% bf16 MFU | 262033 tok/s +step 6071/18794 | loss 3.108465 (-1.37z)| norm 0.2251 (-0.39z)| lr 4.99e-03 | 1988.68 ms | 69.0% bf16 MFU | 262113 tok/s +step 6072/18794 | loss 3.157330 (+0.08z)| norm 0.2155 (-0.52z)| lr 4.99e-03 | 1988.08 ms | 69.0% bf16 MFU | 262193 tok/s +step 6073/18794 | loss 3.136925 (-0.56z)| norm 0.2115 (-0.57z)| lr 4.99e-03 | 1994.81 ms | 68.8% bf16 MFU | 262225 tok/s +step 6074/18794 | loss 3.166554 (+0.34z)| norm 0.1959 (-0.77z)| lr 4.99e-03 | 1997.05 ms | 68.7% bf16 MFU | 262240 tok/s +step 6075/18794 | loss 3.189108 (+1.04z)| norm 0.2228 (-0.42z)| lr 4.99e-03 | 1983.87 ms | 69.2% bf16 MFU | 262342 tok/s +step 6076/18794 | loss 3.151827 (-0.09z)| norm 0.1707 (-1.08z)| lr 4.99e-03 | 1986.05 ms | 69.1% bf16 MFU | 262424 tok/s +step 6077/18794 | loss 3.094794 (-1.79z)| norm 0.1631 (-1.17z)| lr 4.99e-03 | 1989.73 ms | 69.0% bf16 MFU | 262478 tok/s +step 6078/18794 | loss 3.167764 (+0.41z)| norm 0.1930 (-0.77z)| lr 4.99e-03 | 1984.25 ms | 69.2% bf16 MFU | 262565 tok/s +step 6079/18794 | loss 3.155972 (+0.04z)| norm 0.2120 (-0.50z)| lr 4.99e-03 | 1982.52 ms | 69.2% bf16 MFU | 262660 tok/s +step 6080/18794 | loss 3.179133 (+0.73z)| norm 0.2929 (+0.55z)| lr 4.99e-03 | 1987.91 ms | 69.0% bf16 MFU | 262713 tok/s +step 6081/18794 | loss 3.171006 (+0.47z)| norm 0.2938 (+0.56z)| lr 4.98e-03 | 1986.27 ms | 69.1% bf16 MFU | 262776 tok/s +step 6082/18794 | loss 3.123899 (-0.95z)| norm 0.2048 (-0.59z)| lr 4.98e-03 | 1988.05 ms | 69.0% bf16 MFU | 262823 tok/s +step 6083/18794 | loss 3.169089 (+0.42z)| norm 0.2225 (-0.36z)| lr 4.98e-03 | 1980.65 ms | 69.3% bf16 MFU | 262917 tok/s +step 6084/18794 | loss 3.174438 (+0.57z)| norm 0.3373 (+1.19z)| lr 4.98e-03 | 1990.57 ms | 68.9% bf16 MFU | 262940 tok/s +step 6085/18794 | loss 3.156564 (+0.03z)| norm 0.2772 (+0.37z)| lr 4.98e-03 | 1980.92 ms | 69.3% bf16 MFU | 263027 tok/s +step 6086/18794 | loss 3.158133 (+0.09z)| norm 0.1850 (-0.86z)| lr 4.98e-03 | 1986.84 ms | 69.1% bf16 MFU | 263069 tok/s +step 6087/18794 | loss 3.090441 (-1.96z)| norm 0.3205 (+0.94z)| lr 4.98e-03 | 1989.92 ms | 69.0% bf16 MFU | 263090 tok/s +step 6088/18794 | loss 3.162851 (+0.22z)| norm 0.3416 (+1.20z)| lr 4.98e-03 | 1992.56 ms | 68.9% bf16 MFU | 263091 tok/s +step 6089/18794 | loss 3.165154 (+0.31z)| norm 0.1812 (-0.93z)| lr 4.98e-03 | 2005.18 ms | 68.4% bf16 MFU | 263010 tok/s +step 6090/18794 | loss 3.179872 (+0.78z)| norm 0.3836 (+1.71z)| lr 4.98e-03 | 1992.91 ms | 68.9% bf16 MFU | 263013 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.721732 +step 6091/18794 | loss 3.074272 (-2.51z)| norm 0.4716 (+2.72z)| lr 4.98e-03 | 1990.26 ms | 69.0% bf16 MFU | 263034 tok/s +step 6092/18794 | loss 3.116945 (-1.15z)| norm 0.2310 (-0.31z)| lr 4.98e-03 | 1983.82 ms | 69.2% bf16 MFU | 263096 tok/s +step 6093/18794 | loss 3.092435 (-1.86z)| norm 0.3646 (+1.34z)| lr 4.98e-03 | 1983.23 ms | 69.2% bf16 MFU | 263160 tok/s +step 6094/18794 | loss 3.200208 (+1.42z)| norm 0.2870 (+0.37z)| lr 4.98e-03 | 1989.46 ms | 69.0% bf16 MFU | 263178 tok/s +step 6095/18794 | loss 3.162431 (+0.27z)| norm 0.2105 (-0.58z)| lr 4.98e-03 | 1987.49 ms | 69.0% bf16 MFU | 263209 tok/s +step 6096/18794 | loss 3.105386 (-1.44z)| norm 0.2408 (-0.20z)| lr 4.98e-03 | 1984.24 ms | 69.2% bf16 MFU | 263260 tok/s +step 6097/18794 | loss 3.117741 (-1.04z)| norm 0.1832 (-0.93z)| lr 4.98e-03 | 1983.29 ms | 69.2% bf16 MFU | 263315 tok/s +step 6098/18794 | loss 3.111583 (-1.21z)| norm 0.1813 (-0.95z)| lr 4.98e-03 | 1981.99 ms | 69.2% bf16 MFU | 263375 tok/s +step 6099/18794 | loss 3.161287 (+0.30z)| norm 0.1700 (-1.09z)| lr 4.98e-03 | 1980.64 ms | 69.3% bf16 MFU | 263442 tok/s +step 6100/18794 | loss 3.228899 (+2.27z)| norm 0.1991 (-0.73z)| lr 4.98e-03 | 1980.19 ms | 69.3% bf16 MFU | 263508 tok/s +step 6101/18794 | loss 3.161302 (+0.25z)| norm 0.1804 (-0.97z)| lr 4.98e-03 | 2041.09 ms | 67.2% bf16 MFU | 263176 tok/s +step 6102/18794 | loss 3.159596 (+0.19z)| norm 0.2343 (-0.29z)| lr 4.98e-03 | 2033.71 ms | 67.5% bf16 MFU | 262907 tok/s +step 6103/18794 | loss 3.179793 (+0.78z)| norm 0.2110 (-0.59z)| lr 4.98e-03 | 2040.13 ms | 67.3% bf16 MFU | 262611 tok/s +step 6104/18794 | loss 3.214953 (+1.80z)| norm 0.3310 (+0.91z)| lr 4.98e-03 | 2032.90 ms | 67.5% bf16 MFU | 262376 tok/s +step 6105/18794 | loss 3.224286 (+2.06z)| norm 0.2854 (+0.33z)| lr 4.98e-03 | 2028.19 ms | 67.7% bf16 MFU | 262182 tok/s +step 6106/18794 | loss 3.153090 (-0.04z)| norm 0.2130 (-0.58z)| lr 4.98e-03 | 2035.64 ms | 67.4% bf16 MFU | 261950 tok/s +step 6107/18794 | loss 3.142251 (-0.37z)| norm 0.3130 (+0.67z)| lr 4.98e-03 | 2042.35 ms | 67.2% bf16 MFU | 261688 tok/s +step 6108/18794 | loss 3.193575 (+1.13z)| norm 0.2441 (-0.20z)| lr 4.97e-03 | 2034.88 ms | 67.4% bf16 MFU | 261486 tok/s +step 6109/18794 | loss 3.175478 (+0.58z)| norm 0.1780 (-1.02z)| lr 4.97e-03 | 2041.29 ms | 67.2% bf16 MFU | 261254 tok/s +step 6110/18794 | loss 3.308109 (+4.10z)| norm 0.2157 (-0.55z)| lr 4.97e-03 | 2041.80 ms | 67.2% bf16 MFU | 261030 tok/s +step 6111/18794 | loss 3.164726 (+0.19z)| norm 0.2080 (-0.64z)| lr 4.97e-03 | 2030.45 ms | 67.6% bf16 MFU | 260889 tok/s +step 6112/18794 | loss 3.229566 (+1.91z)| norm 0.2415 (-0.20z)| lr 4.97e-03 | 2033.58 ms | 67.5% bf16 MFU | 260736 tok/s +step 6113/18794 | loss 3.191413 (+0.87z)| norm 0.2381 (-0.22z)| lr 4.97e-03 | 2033.08 ms | 67.5% bf16 MFU | 260593 tok/s +step 6114/18794 | loss 3.189591 (+0.82z)| norm 0.2419 (-0.18z)| lr 4.97e-03 | 2042.50 ms | 67.2% bf16 MFU | 260398 tok/s +step 6115/18794 | loss 3.146936 (-0.31z)| norm 0.1978 (-0.74z)| lr 4.97e-03 | 2025.41 ms | 67.8% bf16 MFU | 260321 tok/s +step 6116/18794 | loss 3.231835 (+1.90z)| norm 0.2909 (+0.46z)| lr 4.97e-03 | 2025.73 ms | 67.7% bf16 MFU | 260245 tok/s +step 6117/18794 | loss 3.153952 (-0.13z)| norm 0.3256 (+0.93z)| lr 4.97e-03 | 2036.48 ms | 67.4% bf16 MFU | 260105 tok/s +step 6118/18794 | loss 3.249904 (+2.30z)| norm 0.1895 (-0.87z)| lr 4.97e-03 | 2018.31 ms | 68.0% bf16 MFU | 260088 tok/s +step 6119/18794 | loss 3.161713 (+0.06z)| norm 0.1732 (-1.07z)| lr 4.97e-03 | 2034.55 ms | 67.5% bf16 MFU | 259969 tok/s +step 6120/18794 | loss 3.215250 (+1.40z)| norm 0.1752 (-1.02z)| lr 4.97e-03 | 2031.25 ms | 67.6% bf16 MFU | 259876 tok/s +step 6121/18794 | loss 3.150291 (-0.26z)| norm 0.2728 (+0.31z)| lr 4.97e-03 | 2026.37 ms | 67.7% bf16 MFU | 259819 tok/s +step 6122/18794 | loss 3.143982 (-0.42z)| norm 0.3506 (+1.40z)| lr 4.97e-03 | 2025.25 ms | 67.8% bf16 MFU | 259771 tok/s +step 6123/18794 | loss 3.153327 (-0.17z)| norm 0.2233 (-0.35z)| lr 4.97e-03 | 2043.08 ms | 67.2% bf16 MFU | 259614 tok/s +step 6124/18794 | loss 3.180961 (+0.53z)| norm 0.2259 (-0.31z)| lr 4.97e-03 | 2024.56 ms | 67.8% bf16 MFU | 259581 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.130050 +step 6125/18794 | loss 3.139679 (-0.55z)| norm 0.4028 (+2.13z)| lr 4.97e-03 | 2027.22 ms | 67.7% bf16 MFU | 259533 tok/s +step 6126/18794 | loss 3.144415 (-0.42z)| norm 0.3071 (+0.82z)| lr 4.97e-03 | 2027.14 ms | 67.7% bf16 MFU | 259488 tok/s +step 6127/18794 | loss 3.163429 (+0.06z)| norm 0.1581 (-1.23z)| lr 4.97e-03 | 2015.43 ms | 68.1% bf16 MFU | 259521 tok/s +step 6128/18794 | loss 3.163001 (+0.05z)| norm 0.3356 (+1.19z)| lr 4.97e-03 | 2039.72 ms | 67.3% bf16 MFU | 259397 tok/s +step 6129/18794 | loss 3.193270 (+0.82z)| norm 0.3089 (+0.81z)| lr 4.97e-03 | 2015.30 ms | 68.1% bf16 MFU | 259435 tok/s +step 6130/18794 | loss 3.227847 (+1.68z)| norm 0.1917 (-0.80z)| lr 4.97e-03 | 2008.35 ms | 68.3% bf16 MFU | 259515 tok/s +step 6131/18794 | loss 3.135506 (-0.69z)| norm 0.2268 (-0.31z)| lr 4.97e-03 | 2027.79 ms | 67.7% bf16 MFU | 259467 tok/s +step 6132/18794 | loss 3.258699 (+2.40z)| norm 0.1827 (-0.90z)| lr 4.97e-03 | 2025.11 ms | 67.8% bf16 MFU | 259439 tok/s +step 6133/18794 | loss 3.143595 (-0.47z)| norm 0.2555 (+0.10z)| lr 4.97e-03 | 2016.52 ms | 68.1% bf16 MFU | 259467 tok/s +step 6134/18794 | loss 3.126803 (-0.88z)| norm 0.2950 (+0.63z)| lr 4.97e-03 | 2022.95 ms | 67.8% bf16 MFU | 259452 tok/s +step 6135/18794 | loss 3.235669 (+1.80z)| norm 0.2061 (-0.57z)| lr 4.96e-03 | 2028.20 ms | 67.7% bf16 MFU | 259404 tok/s +step 6136/18794 | loss 3.206804 (+1.07z)| norm 0.1912 (-0.76z)| lr 4.96e-03 | 2026.02 ms | 67.7% bf16 MFU | 259373 tok/s +step 6137/18794 | loss 3.163248 (-0.02z)| norm 0.2524 (+0.09z)| lr 4.96e-03 | 2033.16 ms | 67.5% bf16 MFU | 259297 tok/s +step 6138/18794 | loss 3.174876 (+0.25z)| norm 0.3894 (+1.97z)| lr 4.96e-03 | 2025.31 ms | 67.8% bf16 MFU | 259276 tok/s +step 6139/18794 | loss 3.151872 (-0.32z)| norm 0.2571 (+0.14z)| lr 4.96e-03 | 2009.55 ms | 68.3% bf16 MFU | 259357 tok/s +step 6140/18794 | loss 3.151989 (-0.30z)| norm 0.1768 (-0.97z)| lr 4.96e-03 | 2027.29 ms | 67.7% bf16 MFU | 259320 tok/s +step 6141/18794 | loss 3.165418 (+0.04z)| norm 0.2172 (-0.42z)| lr 4.96e-03 | 2012.75 ms | 68.2% bf16 MFU | 259378 tok/s +step 6142/18794 | loss 3.162725 (-0.03z)| norm 0.2163 (-0.44z)| lr 4.96e-03 | 2020.97 ms | 67.9% bf16 MFU | 259380 tok/s +step 6143/18794 | loss 3.141273 (-0.57z)| norm 0.2346 (-0.18z)| lr 4.96e-03 | 2021.51 ms | 67.9% bf16 MFU | 259379 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.187129 +step 6144/18794 | loss 3.213168 (+1.23z)| norm 0.4109 (+2.19z)| lr 4.96e-03 | 2008.92 ms | 68.3% bf16 MFU | 259459 tok/s +step 6145/18794 | loss 3.177078 (+0.33z)| norm 0.3978 (+1.95z)| lr 4.96e-03 | 2009.90 ms | 68.3% bf16 MFU | 259529 tok/s +step 6146/18794 | loss 3.197601 (+0.84z)| norm 0.2567 (+0.05z)| lr 4.96e-03 | 2021.22 ms | 67.9% bf16 MFU | 259522 tok/s +step 6147/18794 | loss 3.187796 (+0.59z)| norm 0.2872 (+0.45z)| lr 4.96e-03 | 2005.40 ms | 68.4% bf16 MFU | 259618 tok/s +step 6148/18794 | loss 3.192032 (+0.69z)| norm 0.2687 (+0.21z)| lr 4.96e-03 | 2012.42 ms | 68.2% bf16 MFU | 259663 tok/s +step 6149/18794 | loss 3.193850 (+0.72z)| norm 0.2387 (-0.17z)| lr 4.96e-03 | 2010.47 ms | 68.3% bf16 MFU | 259719 tok/s +step 6150/18794 | loss 3.172665 (+0.16z)| norm 0.2817 (+0.44z)| lr 4.96e-03 | 2011.10 ms | 68.2% bf16 MFU | 259768 tok/s +step 6151/18794 | loss 3.145591 (-0.53z)| norm 0.3934 (+1.99z)| lr 4.96e-03 | 2014.04 ms | 68.1% bf16 MFU | 259795 tok/s +step 6152/18794 | loss 3.167254 (+0.03z)| norm 0.2792 (+0.43z)| lr 4.96e-03 | 2014.17 ms | 68.1% bf16 MFU | 259821 tok/s +step 6153/18794 | loss 3.160884 (-0.14z)| norm 0.2152 (-0.51z)| lr 4.96e-03 | 2019.94 ms | 67.9% bf16 MFU | 259807 tok/s +step 6154/18794 | loss 3.154480 (-0.30z)| norm 0.3646 (+1.73z)| lr 4.96e-03 | 2004.77 ms | 68.5% bf16 MFU | 259893 tok/s +step 6155/18794 | loss 3.271136 (+2.57z)| norm 0.3388 (+1.34z)| lr 4.96e-03 | 2019.85 ms | 67.9% bf16 MFU | 259877 tok/s +step 6156/18794 | loss 3.148319 (-0.47z)| norm 0.2332 (-0.25z)| lr 4.96e-03 | 2019.53 ms | 68.0% bf16 MFU | 259863 tok/s +step 6157/18794 | loss 3.166411 (-0.01z)| norm 0.2519 (+0.03z)| lr 4.96e-03 | 2008.05 ms | 68.3% bf16 MFU | 259925 tok/s +step 6158/18794 | loss 3.175995 (+0.21z)| norm 0.2425 (-0.12z)| lr 4.96e-03 | 2015.53 ms | 68.1% bf16 MFU | 259935 tok/s +step 6159/18794 | loss 3.180172 (+0.31z)| norm 0.2444 (-0.10z)| lr 4.96e-03 | 2022.35 ms | 67.9% bf16 MFU | 259900 tok/s +step 6160/18794 | loss 3.184732 (+0.41z)| norm 0.2155 (-0.54z)| lr 4.96e-03 | 2019.25 ms | 68.0% bf16 MFU | 259888 tok/s +step 6161/18794 | loss 3.182143 (+0.34z)| norm 0.1871 (-0.97z)| lr 4.95e-03 | 2022.31 ms | 67.9% bf16 MFU | 259856 tok/s +step 6162/18794 | loss 3.138750 (-0.77z)| norm 0.2172 (-0.51z)| lr 4.95e-03 | 2003.85 ms | 68.5% bf16 MFU | 259945 tok/s +step 6163/18794 | loss 3.152766 (-0.41z)| norm 0.2348 (-0.25z)| lr 4.95e-03 | 2008.26 ms | 68.3% bf16 MFU | 260001 tok/s +step 6164/18794 | loss 3.163083 (-0.13z)| norm 0.1612 (-1.34z)| lr 4.95e-03 | 2030.36 ms | 67.6% bf16 MFU | 259912 tok/s +step 6165/18794 | loss 3.167707 (-0.01z)| norm 0.3705 (+1.76z)| lr 4.95e-03 | 1996.24 ms | 68.7% bf16 MFU | 260048 tok/s +step 6166/18794 | loss 3.226627 (+1.49z)| norm 0.3502 (+1.42z)| lr 4.95e-03 | 2007.03 ms | 68.4% bf16 MFU | 260107 tok/s +step 6167/18794 | loss 3.178908 (+0.25z)| norm 0.1924 (-0.89z)| lr 4.95e-03 | 2004.35 ms | 68.5% bf16 MFU | 260181 tok/s +step 6168/18794 | loss 3.170338 (+0.03z)| norm 0.2484 (-0.07z)| lr 4.95e-03 | 2018.57 ms | 68.0% bf16 MFU | 260158 tok/s +step 6169/18794 | loss 3.133467 (-0.94z)| norm 0.2029 (-0.72z)| lr 4.95e-03 | 2012.19 ms | 68.2% bf16 MFU | 260178 tok/s +step 6170/18794 | loss 3.102829 (-1.70z)| norm 0.1998 (-0.78z)| lr 4.95e-03 | 2003.95 ms | 68.5% bf16 MFU | 260251 tok/s +step 6171/18794 | loss 3.124516 (-1.15z)| norm 0.1855 (-0.98z)| lr 4.95e-03 | 2004.38 ms | 68.5% bf16 MFU | 260317 tok/s +step 6172/18794 | loss 3.178168 (+0.24z)| norm 0.2020 (-0.73z)| lr 4.95e-03 | 2016.14 ms | 68.1% bf16 MFU | 260303 tok/s +step 6173/18794 | loss 3.148188 (-0.55z)| norm 0.1517 (-1.45z)| lr 4.95e-03 | 2009.50 ms | 68.3% bf16 MFU | 260333 tok/s +step 6174/18794 | loss 3.121470 (-1.22z)| norm 0.1728 (-1.13z)| lr 4.95e-03 | 2019.34 ms | 68.0% bf16 MFU | 260298 tok/s +step 6175/18794 | loss 3.143773 (-0.63z)| norm 0.2044 (-0.67z)| lr 4.95e-03 | 2010.91 ms | 68.2% bf16 MFU | 260319 tok/s +step 6176/18794 | loss 3.177536 (+0.23z)| norm 0.2010 (-0.73z)| lr 4.95e-03 | 2009.44 ms | 68.3% bf16 MFU | 260349 tok/s +step 6177/18794 | loss 3.172504 (+0.09z)| norm 0.1756 (-1.10z)| lr 4.95e-03 | 2030.79 ms | 67.6% bf16 MFU | 260240 tok/s +step 6178/18794 | loss 3.175303 (+0.16z)| norm 0.2086 (-0.62z)| lr 4.95e-03 | 2009.18 ms | 68.3% bf16 MFU | 260275 tok/s +step 6179/18794 | loss 3.140058 (-0.77z)| norm 0.1562 (-1.37z)| lr 4.95e-03 | 1999.94 ms | 68.6% bf16 MFU | 260369 tok/s +step 6180/18794 | loss 3.191830 (+0.59z)| norm 0.3034 (+0.76z)| lr 4.95e-03 | 1994.34 ms | 68.8% bf16 MFU | 260495 tok/s +step 6181/18794 | loss 3.161941 (-0.19z)| norm 0.3712 (+1.70z)| lr 4.95e-03 | 1997.87 ms | 68.7% bf16 MFU | 260592 tok/s +step 6182/18794 | loss 3.204729 (+0.91z)| norm 0.2109 (-0.58z)| lr 4.95e-03 | 2010.72 ms | 68.3% bf16 MFU | 260599 tok/s +step 6183/18794 | loss 3.120005 (-1.29z)| norm 0.2710 (+0.27z)| lr 4.95e-03 | 2011.78 ms | 68.2% bf16 MFU | 260600 tok/s +step 6184/18794 | loss 3.194995 (+0.66z)| norm 0.3319 (+1.14z)| lr 4.95e-03 | 2019.39 ms | 68.0% bf16 MFU | 260551 tok/s +step 6185/18794 | loss 3.152672 (-0.44z)| norm 0.1815 (-0.99z)| lr 4.95e-03 | 2011.25 ms | 68.2% bf16 MFU | 260557 tok/s +step 6186/18794 | loss 3.137319 (-0.83z)| norm 0.2827 (+0.44z)| lr 4.95e-03 | 2010.18 ms | 68.3% bf16 MFU | 260570 tok/s +step 6187/18794 | loss 3.180931 (+0.28z)| norm 0.2109 (-0.58z)| lr 4.95e-03 | 2010.98 ms | 68.2% bf16 MFU | 260578 tok/s +step 6188/18794 | loss 3.143326 (-0.71z)| norm 0.2116 (-0.55z)| lr 4.94e-03 | 2002.58 ms | 68.5% bf16 MFU | 260639 tok/s +step 6189/18794 | loss 3.171779 (+0.04z)| norm 0.2924 (+0.60z)| lr 4.94e-03 | 2027.14 ms | 67.7% bf16 MFU | 260539 tok/s +step 6190/18794 | loss 3.112212 (-1.50z)| norm 0.2221 (-0.40z)| lr 4.94e-03 | 2003.13 ms | 68.5% bf16 MFU | 260598 tok/s +step 6191/18794 | loss 3.174948 (+0.12z)| norm 0.2091 (-0.59z)| lr 4.94e-03 | 2008.00 ms | 68.3% bf16 MFU | 260624 tok/s +step 6192/18794 | loss 3.146436 (-0.67z)| norm 0.2781 (+0.48z)| lr 4.94e-03 | 2004.05 ms | 68.5% bf16 MFU | 260673 tok/s +step 6193/18794 | loss 3.167664 (-0.11z)| norm 0.3020 (+0.87z)| lr 4.94e-03 | 2009.68 ms | 68.3% bf16 MFU | 260684 tok/s +step 6194/18794 | loss 3.194150 (+0.63z)| norm 0.3417 (+1.48z)| lr 4.94e-03 | 2011.86 ms | 68.2% bf16 MFU | 260679 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.439532 +step 6195/18794 | loss 3.137932 (-0.93z)| norm 0.4097 (+2.44z)| lr 4.94e-03 | 2025.82 ms | 67.7% bf16 MFU | 260585 tok/s +step 6196/18794 | loss 3.182166 (+0.29z)| norm 0.2224 (-0.40z)| lr 4.94e-03 | 1994.77 ms | 68.8% bf16 MFU | 260698 tok/s +step 6197/18794 | loss 3.185595 (+0.37z)| norm 0.2869 (+0.56z)| lr 4.94e-03 | 2024.23 ms | 67.8% bf16 MFU | 260613 tok/s +step 6198/18794 | loss 3.185483 (+0.35z)| norm 0.3130 (+0.94z)| lr 4.94e-03 | 2011.69 ms | 68.2% bf16 MFU | 260614 tok/s +step 6199/18794 | loss 3.178521 (+0.14z)| norm 0.2192 (-0.50z)| lr 4.94e-03 | 2012.08 ms | 68.2% bf16 MFU | 260611 tok/s +step 6200/18794 | loss 3.156224 (-0.49z)| norm 0.2328 (-0.30z)| lr 4.94e-03 | 1988.34 ms | 69.0% bf16 MFU | 260765 tok/s +step 6201/18794 | loss 3.170259 (-0.08z)| norm 0.2510 (-0.03z)| lr 4.94e-03 | 2011.09 ms | 68.2% bf16 MFU | 260762 tok/s +step 6202/18794 | loss 3.141693 (-0.91z)| norm 0.2490 (-0.06z)| lr 4.94e-03 | 2010.68 ms | 68.3% bf16 MFU | 260761 tok/s +step 6203/18794 | loss 3.147296 (-0.74z)| norm 0.2054 (-0.74z)| lr 4.94e-03 | 2010.15 ms | 68.3% bf16 MFU | 260764 tok/s +step 6204/18794 | loss 3.204648 (+0.95z)| norm 0.2935 (+0.64z)| lr 4.94e-03 | 2024.31 ms | 67.8% bf16 MFU | 260676 tok/s +step 6205/18794 | loss 3.171203 (-0.02z)| norm 0.3100 (+0.89z)| lr 4.94e-03 | 2014.43 ms | 68.1% bf16 MFU | 260655 tok/s +step 6206/18794 | loss 3.145619 (-0.78z)| norm 0.2498 (-0.05z)| lr 4.94e-03 | 2002.84 ms | 68.5% bf16 MFU | 260711 tok/s +step 6207/18794 | loss 3.229131 (+1.66z)| norm 0.2047 (-0.74z)| lr 4.94e-03 | 2010.01 ms | 68.3% bf16 MFU | 260717 tok/s +step 6208/18794 | loss 3.229518 (+1.64z)| norm 0.1786 (-1.13z)| lr 4.94e-03 | 2013.62 ms | 68.2% bf16 MFU | 260700 tok/s +step 6209/18794 | loss 3.152001 (-0.60z)| norm 0.1889 (-0.97z)| lr 4.94e-03 | 2013.24 ms | 68.2% bf16 MFU | 260686 tok/s +step 6210/18794 | loss 3.206478 (+1.09z)| norm 0.2021 (-0.76z)| lr 4.94e-03 | 2018.73 ms | 68.0% bf16 MFU | 260637 tok/s +step 6211/18794 | loss 3.137078 (-1.07z)| norm 0.2011 (-0.78z)| lr 4.94e-03 | 1999.26 ms | 68.6% bf16 MFU | 260717 tok/s +step 6212/18794 | loss 3.184503 (+0.43z)| norm 0.1731 (-1.19z)| lr 4.94e-03 | 2013.20 ms | 68.2% bf16 MFU | 260703 tok/s +step 6213/18794 | loss 3.287905 (+3.45z)| norm 0.2594 (+0.13z)| lr 4.94e-03 | 2013.69 ms | 68.1% bf16 MFU | 260686 tok/s +step 6214/18794 | loss 3.176713 (+0.14z)| norm 0.3016 (+0.77z)| lr 4.93e-03 | 2006.83 ms | 68.4% bf16 MFU | 260714 tok/s +step 6215/18794 | loss 3.182910 (+0.32z)| norm 0.1801 (-1.09z)| lr 4.93e-03 | 2024.00 ms | 67.8% bf16 MFU | 260630 tok/s +step 6216/18794 | loss 3.175375 (+0.11z)| norm 0.2616 (+0.16z)| lr 4.93e-03 | 2002.99 ms | 68.5% bf16 MFU | 260686 tok/s +step 6217/18794 | loss 3.199578 (+0.83z)| norm 0.3189 (+1.04z)| lr 4.93e-03 | 2010.70 ms | 68.3% bf16 MFU | 260689 tok/s +step 6218/18794 | loss 3.181595 (+0.32z)| norm 0.1971 (-0.83z)| lr 4.93e-03 | 2010.34 ms | 68.3% bf16 MFU | 260695 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.759264 +step 6219/18794 | loss 3.191933 (+0.63z)| norm 0.4395 (+2.76z)| lr 4.93e-03 | 2024.98 ms | 67.8% bf16 MFU | 260605 tok/s +step 6220/18794 | loss 3.146771 (-0.76z)| norm 0.2988 (+0.65z)| lr 4.93e-03 | 2013.10 ms | 68.2% bf16 MFU | 260597 tok/s +step 6221/18794 | loss 3.197633 (+0.82z)| norm 0.2398 (-0.22z)| lr 4.93e-03 | 1996.85 ms | 68.7% bf16 MFU | 260695 tok/s +step 6222/18794 | loss 3.144428 (-0.85z)| norm 0.2905 (+0.55z)| lr 4.93e-03 | 2024.24 ms | 67.8% bf16 MFU | 260611 tok/s +step 6223/18794 | loss 3.138789 (-1.02z)| norm 0.1992 (-0.82z)| lr 4.93e-03 | 2023.56 ms | 67.8% bf16 MFU | 260535 tok/s +step 6224/18794 | loss 3.190331 (+0.59z)| norm 0.1963 (-0.86z)| lr 4.93e-03 | 2024.86 ms | 67.8% bf16 MFU | 260454 tok/s +step 6225/18794 | loss 3.175673 (+0.12z)| norm 0.1882 (-0.97z)| lr 4.93e-03 | 2021.12 ms | 67.9% bf16 MFU | 260402 tok/s +step 6226/18794 | loss 3.266191 (+2.81z)| norm 0.2625 (+0.18z)| lr 4.93e-03 | 2005.38 ms | 68.4% bf16 MFU | 260454 tok/s +step 6227/18794 | loss 3.193952 (+0.62z)| norm 0.2878 (+0.55z)| lr 4.93e-03 | 2023.89 ms | 67.8% bf16 MFU | 260384 tok/s +step 6228/18794 | loss 3.213525 (+1.18z)| norm 0.2254 (-0.40z)| lr 4.93e-03 | 2023.16 ms | 67.8% bf16 MFU | 260322 tok/s +step 6229/18794 | loss 3.193295 (+0.58z)| norm 0.1830 (-1.04z)| lr 4.93e-03 | 2018.85 ms | 68.0% bf16 MFU | 260290 tok/s +step 6230/18794 | loss 3.172201 (-0.03z)| norm 0.2153 (-0.54z)| lr 4.93e-03 | 2014.27 ms | 68.1% bf16 MFU | 260290 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.198946 +step 6231/18794 | loss 3.179554 (+0.18z)| norm 0.3959 (+2.20z)| lr 4.93e-03 | 2015.21 ms | 68.1% bf16 MFU | 260284 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.171696 +step 6232/18794 | loss 3.149185 (-0.74z)| norm 0.3990 (+2.17z)| lr 4.93e-03 | 2016.54 ms | 68.1% bf16 MFU | 260269 tok/s +step 6233/18794 | loss 3.153031 (-0.62z)| norm 0.2025 (-0.76z)| lr 4.93e-03 | 2016.78 ms | 68.0% bf16 MFU | 260254 tok/s +step 6234/18794 | loss 3.210377 (+1.16z)| norm 0.2671 (+0.21z)| lr 4.93e-03 | 2008.24 ms | 68.3% bf16 MFU | 260295 tok/s +step 6235/18794 | loss 3.179220 (+0.20z)| norm 0.1855 (-1.00z)| lr 4.93e-03 | 2015.81 ms | 68.1% bf16 MFU | 260284 tok/s +step 6236/18794 | loss 3.219655 (+1.49z)| norm 0.3372 (+1.23z)| lr 4.93e-03 | 2004.82 ms | 68.5% bf16 MFU | 260346 tok/s +step 6237/18794 | loss 3.180073 (+0.22z)| norm 0.3428 (+1.29z)| lr 4.93e-03 | 2008.54 ms | 68.3% bf16 MFU | 260380 tok/s +step 6238/18794 | loss 3.185145 (+0.38z)| norm 0.2474 (-0.10z)| lr 4.93e-03 | 2018.64 ms | 68.0% bf16 MFU | 260347 tok/s +step 6239/18794 | loss 3.184403 (+0.34z)| norm 0.3588 (+1.54z)| lr 4.93e-03 | 2010.58 ms | 68.3% bf16 MFU | 260368 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.160298 +step 6240/18794 | loss 3.248326 (+2.31z)| norm 0.4807 (+3.16z)| lr 4.93e-03 | 2013.70 ms | 68.1% bf16 MFU | 260368 tok/s +step 6241/18794 | loss 3.165495 (-0.29z)| norm 0.2132 (-0.63z)| lr 4.92e-03 | 2006.74 ms | 68.4% bf16 MFU | 260413 tok/s +step 6242/18794 | loss 3.195201 (+0.63z)| norm 0.3286 (+0.99z)| lr 4.92e-03 | 2031.63 ms | 67.5% bf16 MFU | 260295 tok/s +step 6243/18794 | loss 3.181417 (+0.19z)| norm 0.3402 (+1.13z)| lr 4.92e-03 | 2015.72 ms | 68.1% bf16 MFU | 260285 tok/s +step 6244/18794 | loss 3.194378 (+0.61z)| norm 0.1887 (-0.99z)| lr 4.92e-03 | 2025.22 ms | 67.8% bf16 MFU | 260215 tok/s +step 6245/18794 | loss 3.152336 (-0.72z)| norm 0.1824 (-1.06z)| lr 4.92e-03 | 2021.39 ms | 67.9% bf16 MFU | 260173 tok/s +step 6246/18794 | loss 3.248820 (+2.26z)| norm 0.2193 (-0.52z)| lr 4.92e-03 | 2015.17 ms | 68.1% bf16 MFU | 260173 tok/s +step 6247/18794 | loss 3.225817 (+1.52z)| norm 0.1981 (-0.81z)| lr 4.92e-03 | 2033.10 ms | 67.5% bf16 MFU | 260058 tok/s +step 6248/18794 | loss 3.175188 (-0.02z)| norm 0.1950 (-0.85z)| lr 4.92e-03 | 2020.97 ms | 67.9% bf16 MFU | 260026 tok/s +step 6249/18794 | loss 3.194065 (+0.56z)| norm 0.1572 (-1.36z)| lr 4.92e-03 | 2005.93 ms | 68.4% bf16 MFU | 260093 tok/s +step 6250/18794 | loss 3.159802 (-0.48z)| norm 0.1808 (-1.01z)| lr 4.92e-03 | 2013.62 ms | 68.2% bf16 MFU | 260107 tok/s +val loss 3.196147 +HellaSwag: 2860/10042 = 0.284804: 0/1256 +step 6251/18794 | loss 3.206397 (+0.92z)| norm 0.1898 (-0.87z)| lr 4.92e-03 | 2025.73 ms | 67.7% bf16 MFU | 260042 tok/s +step 6252/18794 | loss 3.242442 (+1.96z)| norm 0.2355 (-0.20z)| lr 4.92e-03 | 2009.01 ms | 68.3% bf16 MFU | 260089 tok/s +step 6253/18794 | loss 3.139763 (-1.10z)| norm 0.1691 (-1.15z)| lr 4.92e-03 | 1995.85 ms | 68.8% bf16 MFU | 260219 tok/s +step 6254/18794 | loss 3.185178 (+0.24z)| norm 0.1786 (-0.99z)| lr 4.92e-03 | 1995.40 ms | 68.8% bf16 MFU | 260345 tok/s +step 6255/18794 | loss 3.202790 (+0.82z)| norm 0.1748 (-1.03z)| lr 4.92e-03 | 2012.20 ms | 68.2% bf16 MFU | 260356 tok/s +step 6256/18794 | loss 3.190880 (+0.44z)| norm 0.1637 (-1.17z)| lr 4.92e-03 | 2008.53 ms | 68.3% bf16 MFU | 260389 tok/s +step 6257/18794 | loss 3.157860 (-0.59z)| norm 0.1542 (-1.29z)| lr 4.92e-03 | 2007.71 ms | 68.4% bf16 MFU | 260427 tok/s +step 6258/18794 | loss 3.212440 (+1.09z)| norm 0.1777 (-0.94z)| lr 4.92e-03 | 2003.47 ms | 68.5% bf16 MFU | 260490 tok/s +step 6259/18794 | loss 3.225009 (+1.45z)| norm 0.2056 (-0.53z)| lr 4.92e-03 | 2015.65 ms | 68.1% bf16 MFU | 260471 tok/s +step 6260/18794 | loss 3.213091 (+1.07z)| norm 0.2610 (+0.26z)| lr 4.92e-03 | 2009.41 ms | 68.3% bf16 MFU | 260493 tok/s +step 6261/18794 | loss 3.139408 (-1.15z)| norm 0.1940 (-0.70z)| lr 4.92e-03 | 2030.22 ms | 67.6% bf16 MFU | 260381 tok/s +step 6262/18794 | loss 3.222455 (+1.33z)| norm 0.2328 (-0.15z)| lr 4.92e-03 | 2013.16 ms | 68.2% bf16 MFU | 260383 tok/s +step 6263/18794 | loss 3.174449 (-0.12z)| norm 0.2838 (+0.57z)| lr 4.92e-03 | 2021.57 ms | 67.9% bf16 MFU | 260331 tok/s +step 6264/18794 | loss 3.175168 (-0.10z)| norm 0.2282 (-0.23z)| lr 4.92e-03 | 2007.08 ms | 68.4% bf16 MFU | 260376 tok/s +step 6265/18794 | loss 3.200854 (+0.66z)| norm 0.1774 (-0.95z)| lr 4.92e-03 | 2031.54 ms | 67.6% bf16 MFU | 260261 tok/s +step 6266/18794 | loss 3.151706 (-0.80z)| norm 0.2763 (+0.51z)| lr 4.92e-03 | 2035.90 ms | 67.4% bf16 MFU | 260124 tok/s +step 6267/18794 | loss 3.240429 (+1.84z)| norm 0.2649 (+0.33z)| lr 4.91e-03 | 2011.91 ms | 68.2% bf16 MFU | 260147 tok/s +step 6268/18794 | loss 3.201964 (+0.68z)| norm 0.2018 (-0.59z)| lr 4.91e-03 | 2010.45 ms | 68.3% bf16 MFU | 260179 tok/s +step 6269/18794 | loss 3.123731 (-1.64z)| norm 0.1672 (-1.09z)| lr 4.91e-03 | 2023.53 ms | 67.8% bf16 MFU | 260125 tok/s +step 6270/18794 | loss 3.226172 (+1.39z)| norm 0.1833 (-0.85z)| lr 4.91e-03 | 2024.06 ms | 67.8% bf16 MFU | 260070 tok/s +step 6271/18794 | loss 3.151260 (-0.89z)| norm 0.1824 (-0.87z)| lr 4.91e-03 | 2010.50 ms | 68.3% bf16 MFU | 260105 tok/s +step 6272/18794 | loss 3.178337 (-0.07z)| norm 0.2386 (-0.05z)| lr 4.91e-03 | 1999.61 ms | 68.6% bf16 MFU | 260210 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.134703 +step 6273/18794 | loss 3.250815 (+2.08z)| norm 0.3920 (+2.13z)| lr 4.91e-03 | 2013.56 ms | 68.2% bf16 MFU | 260218 tok/s +step 6274/18794 | loss 3.168731 (-0.40z)| norm 0.2897 (+0.64z)| lr 4.91e-03 | 2010.89 ms | 68.2% bf16 MFU | 260243 tok/s +step 6275/18794 | loss 3.183544 (+0.03z)| norm 0.2427 (-0.05z)| lr 4.91e-03 | 1993.06 ms | 68.9% bf16 MFU | 260384 tok/s +step 6276/18794 | loss 3.166987 (-0.47z)| norm 0.3443 (+1.40z)| lr 4.91e-03 | 2015.80 ms | 68.1% bf16 MFU | 260369 tok/s +step 6277/18794 | loss 3.168818 (-0.41z)| norm 0.2024 (-0.65z)| lr 4.91e-03 | 1990.54 ms | 68.9% bf16 MFU | 260520 tok/s +step 6278/18794 | loss 3.166855 (-0.47z)| norm 0.2057 (-0.60z)| lr 4.91e-03 | 2026.00 ms | 67.7% bf16 MFU | 260433 tok/s +step 6279/18794 | loss 3.181738 (-0.03z)| norm 0.2828 (+0.49z)| lr 4.91e-03 | 1997.41 ms | 68.7% bf16 MFU | 260536 tok/s +step 6280/18794 | loss 3.141066 (-1.26z)| norm 0.2338 (-0.21z)| lr 4.91e-03 | 2017.18 ms | 68.0% bf16 MFU | 260505 tok/s +step 6281/18794 | loss 3.152095 (-0.91z)| norm 0.2370 (-0.15z)| lr 4.91e-03 | 2008.43 ms | 68.3% bf16 MFU | 260532 tok/s +step 6282/18794 | loss 3.186650 (+0.15z)| norm 0.1926 (-0.80z)| lr 4.91e-03 | 2031.65 ms | 67.5% bf16 MFU | 260408 tok/s +step 6283/18794 | loss 3.209133 (+0.82z)| norm 0.1827 (-0.93z)| lr 4.91e-03 | 2025.43 ms | 67.8% bf16 MFU | 260330 tok/s +step 6284/18794 | loss 3.222639 (+1.22z)| norm 0.2486 (+0.06z)| lr 4.91e-03 | 2009.75 ms | 68.3% bf16 MFU | 260357 tok/s +step 6285/18794 | loss 3.189591 (+0.19z)| norm 0.2047 (-0.60z)| lr 4.91e-03 | 2012.90 ms | 68.2% bf16 MFU | 260363 tok/s +step 6286/18794 | loss 3.204209 (+0.63z)| norm 0.1816 (-0.93z)| lr 4.91e-03 | 2011.89 ms | 68.2% bf16 MFU | 260374 tok/s +step 6287/18794 | loss 3.141942 (-1.29z)| norm 0.2375 (-0.10z)| lr 4.91e-03 | 2007.76 ms | 68.4% bf16 MFU | 260412 tok/s +step 6288/18794 | loss 3.173901 (-0.31z)| norm 0.2557 (+0.16z)| lr 4.91e-03 | 1999.46 ms | 68.6% bf16 MFU | 260502 tok/s +step 6289/18794 | loss 3.213384 (+0.90z)| norm 0.2097 (-0.51z)| lr 4.91e-03 | 2024.99 ms | 67.8% bf16 MFU | 260423 tok/s +step 6290/18794 | loss 3.144931 (-1.26z)| norm 0.1803 (-0.94z)| lr 4.91e-03 | 2018.40 ms | 68.0% bf16 MFU | 260389 tok/s +step 6291/18794 | loss 3.179217 (-0.17z)| norm 0.1951 (-0.72z)| lr 4.91e-03 | 2001.90 ms | 68.6% bf16 MFU | 260464 tok/s +step 6292/18794 | loss 3.211680 (+0.84z)| norm 0.1784 (-0.95z)| lr 4.91e-03 | 2030.54 ms | 67.6% bf16 MFU | 260351 tok/s +step 6293/18794 | loss 3.164814 (-0.65z)| norm 0.2112 (-0.45z)| lr 4.90e-03 | 2033.80 ms | 67.5% bf16 MFU | 260223 tok/s +step 6294/18794 | loss 3.221419 (+1.13z)| norm 0.2204 (-0.30z)| lr 4.90e-03 | 2020.97 ms | 67.9% bf16 MFU | 260183 tok/s +step 6295/18794 | loss 3.225035 (+1.22z)| norm 0.2321 (-0.10z)| lr 4.90e-03 | 2013.07 ms | 68.2% bf16 MFU | 260196 tok/s +step 6296/18794 | loss 3.165310 (-0.67z)| norm 0.2472 (+0.13z)| lr 4.90e-03 | 2019.29 ms | 68.0% bf16 MFU | 260168 tok/s +reducing beta2 to 0.9 and lr/wd by 0.888 due to grad z-score of 3.941010 +step 6297/18794 | loss 3.128321 (-1.79z)| norm 0.5164 (+3.94z)| lr 4.35e-03 | 2022.30 ms | 67.9% bf16 MFU | 260122 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.200591 +step 6298/18794 | loss 3.158659 (-0.83z)| norm 0.3987 (+2.20z)| lr 4.90e-03 | 2003.91 ms | 68.5% bf16 MFU | 260198 tok/s +step 6299/18794 | loss 3.189748 (+0.13z)| norm 0.2307 (-0.16z)| lr 4.90e-03 | 2018.63 ms | 68.0% bf16 MFU | 260174 tok/s +step 6300/18794 | loss 3.183680 (-0.07z)| norm 0.3819 (+1.91z)| lr 4.90e-03 | 2004.64 ms | 68.5% bf16 MFU | 260242 tok/s +step 6301/18794 | loss 3.186851 (+0.03z)| norm 0.2942 (+0.69z)| lr 4.90e-03 | 2008.07 ms | 68.3% bf16 MFU | 260285 tok/s +step 6302/18794 | loss 3.179057 (-0.23z)| norm 0.1932 (-0.69z)| lr 4.90e-03 | 2027.09 ms | 67.7% bf16 MFU | 260203 tok/s +step 6303/18794 | loss 3.114233 (-2.22z)| norm 0.3390 (+1.28z)| lr 4.90e-03 | 2026.25 ms | 67.7% bf16 MFU | 260130 tok/s +step 6304/18794 | loss 3.196461 (+0.32z)| norm 0.2724 (+0.38z)| lr 4.90e-03 | 1995.98 ms | 68.8% bf16 MFU | 260257 tok/s +step 6305/18794 | loss 3.143189 (-1.31z)| norm 0.2393 (-0.06z)| lr 4.90e-03 | 2015.45 ms | 68.1% bf16 MFU | 260251 tok/s +step 6306/18794 | loss 3.192936 (+0.21z)| norm 0.3281 (+1.14z)| lr 4.90e-03 | 2032.29 ms | 67.5% bf16 MFU | 260137 tok/s +step 6307/18794 | loss 3.157065 (-0.88z)| norm 0.2091 (-0.48z)| lr 4.90e-03 | 2020.19 ms | 67.9% bf16 MFU | 260107 tok/s +step 6308/18794 | loss 3.168564 (-0.51z)| norm 0.2604 (+0.20z)| lr 4.90e-03 | 2014.43 ms | 68.1% bf16 MFU | 260115 tok/s +step 6309/18794 | loss 3.145381 (-1.23z)| norm 0.3013 (+0.75z)| lr 4.90e-03 | 2013.53 ms | 68.2% bf16 MFU | 260128 tok/s +step 6310/18794 | loss 3.140944 (-1.34z)| norm 0.2103 (-0.50z)| lr 4.90e-03 | 2019.34 ms | 68.0% bf16 MFU | 260103 tok/s +step 6311/18794 | loss 3.169364 (-0.47z)| norm 0.2957 (+0.66z)| lr 4.90e-03 | 2023.31 ms | 67.8% bf16 MFU | 260054 tok/s +step 6312/18794 | loss 3.132730 (-1.59z)| norm 0.3346 (+1.17z)| lr 4.90e-03 | 2031.69 ms | 67.5% bf16 MFU | 259954 tok/s +step 6313/18794 | loss 3.151137 (-1.03z)| norm 0.1797 (-0.93z)| lr 4.90e-03 | 2005.77 ms | 68.4% bf16 MFU | 260026 tok/s +step 6314/18794 | loss 3.142955 (-1.27z)| norm 0.2444 (-0.05z)| lr 4.90e-03 | 2026.54 ms | 67.7% bf16 MFU | 259960 tok/s +step 6315/18794 | loss 3.202853 (+0.66z)| norm 0.2250 (-0.32z)| lr 4.90e-03 | 2015.74 ms | 68.1% bf16 MFU | 259967 tok/s +step 6316/18794 | loss 3.148847 (-1.07z)| norm 0.1847 (-0.86z)| lr 4.90e-03 | 2024.30 ms | 67.8% bf16 MFU | 259919 tok/s +step 6317/18794 | loss 3.235165 (+1.67z)| norm 0.2519 (+0.07z)| lr 4.90e-03 | 2009.02 ms | 68.3% bf16 MFU | 259971 tok/s +step 6318/18794 | loss 3.211974 (+0.92z)| norm 0.2830 (+0.48z)| lr 4.90e-03 | 2024.99 ms | 67.8% bf16 MFU | 259918 tok/s +step 6319/18794 | loss 3.163947 (-0.59z)| norm 0.1808 (-0.91z)| lr 4.89e-03 | 1999.18 ms | 68.6% bf16 MFU | 260035 tok/s +step 6320/18794 | loss 3.135784 (-1.46z)| norm 0.2691 (+0.34z)| lr 4.89e-03 | 2012.77 ms | 68.2% bf16 MFU | 260057 tok/s +step 6321/18794 | loss 3.211413 (+0.90z)| norm 0.3368 (+1.28z)| lr 4.89e-03 | 2028.13 ms | 67.7% bf16 MFU | 259980 tok/s +step 6322/18794 | loss 3.122802 (-1.84z)| norm 0.2119 (-0.47z)| lr 4.89e-03 | 2024.72 ms | 67.8% bf16 MFU | 259928 tok/s +step 6323/18794 | loss 3.200854 (+0.56z)| norm 0.2028 (-0.60z)| lr 4.89e-03 | 2020.00 ms | 67.9% bf16 MFU | 259909 tok/s +step 6324/18794 | loss 3.179710 (-0.10z)| norm 0.1959 (-0.69z)| lr 4.89e-03 | 2022.81 ms | 67.8% bf16 MFU | 259873 tok/s +step 6325/18794 | loss 3.183663 (+0.02z)| norm 0.1970 (-0.68z)| lr 4.89e-03 | 2015.07 ms | 68.1% bf16 MFU | 259888 tok/s +step 6326/18794 | loss 3.238365 (+1.77z)| norm 0.3791 (+1.84z)| lr 4.89e-03 | 2001.37 ms | 68.6% bf16 MFU | 259992 tok/s +step 6327/18794 | loss 3.136986 (-1.42z)| norm 0.3022 (+0.77z)| lr 4.89e-03 | 2016.73 ms | 68.0% bf16 MFU | 259991 tok/s +step 6328/18794 | loss 3.161542 (-0.63z)| norm 0.1786 (-0.93z)| lr 4.89e-03 | 2019.11 ms | 68.0% bf16 MFU | 259974 tok/s +step 6329/18794 | loss 3.136894 (-1.38z)| norm 0.3495 (+1.40z)| lr 4.89e-03 | 2026.85 ms | 67.7% bf16 MFU | 259909 tok/s +step 6330/18794 | loss 3.152655 (-0.88z)| norm 0.2889 (+0.56z)| lr 4.89e-03 | 2009.57 ms | 68.3% bf16 MFU | 259959 tok/s +step 6331/18794 | loss 3.194299 (+0.42z)| norm 0.2246 (-0.31z)| lr 4.89e-03 | 2003.71 ms | 68.5% bf16 MFU | 260044 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.069552 +step 6332/18794 | loss 3.173697 (-0.24z)| norm 0.3945 (+2.07z)| lr 4.89e-03 | 2004.65 ms | 68.5% bf16 MFU | 260118 tok/s +step 6333/18794 | loss 3.189352 (+0.24z)| norm 0.2602 (+0.18z)| lr 4.89e-03 | 2010.81 ms | 68.2% bf16 MFU | 260149 tok/s +step 6334/18794 | loss 3.177870 (-0.11z)| norm 0.1596 (-1.20z)| lr 4.89e-03 | 2007.42 ms | 68.4% bf16 MFU | 260200 tok/s +step 6335/18794 | loss 3.164306 (-0.53z)| norm 0.1929 (-0.74z)| lr 4.89e-03 | 2017.94 ms | 68.0% bf16 MFU | 260181 tok/s +step 6336/18794 | loss 3.170999 (-0.31z)| norm 0.1815 (-0.88z)| lr 4.89e-03 | 2010.92 ms | 68.2% bf16 MFU | 260208 tok/s +step 6337/18794 | loss 3.214220 (+1.05z)| norm 0.2388 (-0.07z)| lr 4.89e-03 | 2011.42 ms | 68.2% bf16 MFU | 260230 tok/s +step 6338/18794 | loss 3.153636 (-0.85z)| norm 0.2317 (-0.17z)| lr 4.89e-03 | 1999.81 ms | 68.6% bf16 MFU | 260327 tok/s +step 6339/18794 | loss 3.165347 (-0.48z)| norm 0.1609 (-1.15z)| lr 4.89e-03 | 2003.41 ms | 68.5% bf16 MFU | 260396 tok/s +step 6340/18794 | loss 3.189411 (+0.31z)| norm 0.2338 (-0.08z)| lr 4.89e-03 | 1996.96 ms | 68.7% bf16 MFU | 260503 tok/s +step 6341/18794 | loss 3.121340 (-1.84z)| norm 0.2350 (-0.06z)| lr 4.89e-03 | 1995.83 ms | 68.8% bf16 MFU | 260613 tok/s +step 6342/18794 | loss 3.167469 (-0.37z)| norm 0.2013 (-0.56z)| lr 4.89e-03 | 2002.39 ms | 68.5% bf16 MFU | 260673 tok/s +step 6343/18794 | loss 3.189333 (+0.32z)| norm 0.1932 (-0.67z)| lr 4.89e-03 | 1996.50 ms | 68.7% bf16 MFU | 260770 tok/s +step 6344/18794 | loss 3.188146 (+0.28z)| norm 0.2186 (-0.28z)| lr 4.89e-03 | 2002.35 ms | 68.5% bf16 MFU | 260823 tok/s +step 6345/18794 | loss 3.164195 (-0.48z)| norm 0.2593 (+0.34z)| lr 4.88e-03 | 2018.38 ms | 68.0% bf16 MFU | 260770 tok/s +step 6346/18794 | loss 3.212749 (+1.10z)| norm 0.2169 (-0.32z)| lr 4.88e-03 | 2011.35 ms | 68.2% bf16 MFU | 260765 tok/s +step 6347/18794 | loss 3.154403 (-0.78z)| norm 0.1774 (-0.93z)| lr 4.88e-03 | 2026.14 ms | 67.7% bf16 MFU | 260665 tok/s +step 6348/18794 | loss 3.177429 (-0.03z)| norm 0.1900 (-0.73z)| lr 4.88e-03 | 2018.35 ms | 68.0% bf16 MFU | 260619 tok/s +step 6349/18794 | loss 3.173085 (-0.16z)| norm 0.1885 (-0.76z)| lr 4.88e-03 | 2011.44 ms | 68.2% bf16 MFU | 260621 tok/s +step 6350/18794 | loss 3.179281 (+0.03z)| norm 0.2069 (-0.48z)| lr 4.88e-03 | 2020.44 ms | 67.9% bf16 MFU | 260565 tok/s +step 6351/18794 | loss 3.164567 (-0.43z)| norm 0.2224 (-0.25z)| lr 4.88e-03 | 2007.95 ms | 68.3% bf16 MFU | 260592 tok/s +step 6352/18794 | loss 3.162842 (-0.48z)| norm 0.1902 (-0.74z)| lr 4.88e-03 | 2011.37 ms | 68.2% bf16 MFU | 260595 tok/s +step 6353/18794 | loss 3.206743 (+0.98z)| norm 0.1686 (-1.08z)| lr 4.88e-03 | 2019.63 ms | 67.9% bf16 MFU | 260545 tok/s +step 6354/18794 | loss 3.162195 (-0.51z)| norm 0.2126 (-0.40z)| lr 4.88e-03 | 1994.38 ms | 68.8% bf16 MFU | 260662 tok/s +step 6355/18794 | loss 3.192587 (+0.51z)| norm 0.3582 (+1.83z)| lr 4.88e-03 | 2011.15 ms | 68.2% bf16 MFU | 260664 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.148601 +step 6356/18794 | loss 3.216866 (+1.32z)| norm 0.3836 (+2.15z)| lr 4.88e-03 | 2009.85 ms | 68.3% bf16 MFU | 260673 tok/s +step 6357/18794 | loss 3.215757 (+1.25z)| norm 0.2051 (-0.57z)| lr 4.88e-03 | 2017.19 ms | 68.0% bf16 MFU | 260635 tok/s +step 6358/18794 | loss 3.208630 (+1.02z)| norm 0.2729 (+0.45z)| lr 4.88e-03 | 2010.90 ms | 68.2% bf16 MFU | 260640 tok/s +step 6359/18794 | loss 3.197060 (+0.65z)| norm 0.2135 (-0.46z)| lr 4.88e-03 | 2002.15 ms | 68.5% bf16 MFU | 260701 tok/s +step 6360/18794 | loss 3.166314 (-0.38z)| norm 0.2675 (+0.37z)| lr 4.88e-03 | 2026.11 ms | 67.7% bf16 MFU | 260604 tok/s +step 6361/18794 | loss 3.191016 (+0.45z)| norm 0.3225 (+1.19z)| lr 4.88e-03 | 2001.32 ms | 68.6% bf16 MFU | 260672 tok/s +step 6362/18794 | loss 3.110861 (-2.22z)| norm 0.2599 (+0.22z)| lr 4.88e-03 | 2002.25 ms | 68.5% bf16 MFU | 260731 tok/s +step 6363/18794 | loss 3.128098 (-1.60z)| norm 0.1807 (-0.97z)| lr 4.88e-03 | 2010.24 ms | 68.3% bf16 MFU | 260735 tok/s +step 6364/18794 | loss 3.143956 (-1.06z)| norm 0.3107 (+1.00z)| lr 4.88e-03 | 1996.30 ms | 68.7% bf16 MFU | 260830 tok/s +step 6365/18794 | loss 3.179498 (+0.12z)| norm 0.3678 (+1.81z)| lr 4.88e-03 | 2011.12 ms | 68.2% bf16 MFU | 260823 tok/s +step 6366/18794 | loss 3.224252 (+1.57z)| norm 0.1980 (-0.72z)| lr 4.88e-03 | 2005.83 ms | 68.4% bf16 MFU | 260851 tok/s +step 6367/18794 | loss 3.244356 (+2.22z)| norm 0.3431 (+1.43z)| lr 4.88e-03 | 2026.71 ms | 67.7% bf16 MFU | 260743 tok/s +step 6368/18794 | loss 3.233799 (+1.84z)| norm 0.2838 (+0.54z)| lr 4.88e-03 | 2019.66 ms | 67.9% bf16 MFU | 260685 tok/s +step 6369/18794 | loss 3.130899 (-1.49z)| norm 0.2755 (+0.40z)| lr 4.88e-03 | 2004.08 ms | 68.5% bf16 MFU | 260732 tok/s +reducing beta2 to 0.9 and lr/wd by 0.937 due to grad z-score of 3.734776 +step 6370/18794 | loss 3.195452 (+0.62z)| norm 0.5207 (+3.73z)| lr 4.57e-03 | 2009.57 ms | 68.3% bf16 MFU | 260740 tok/s +step 6371/18794 | loss 3.193437 (+0.54z)| norm 0.2188 (-0.47z)| lr 4.87e-03 | 2018.80 ms | 68.0% bf16 MFU | 260688 tok/s +step 6372/18794 | loss 3.154846 (-0.72z)| norm 0.2922 (+0.55z)| lr 4.87e-03 | 2011.15 ms | 68.2% bf16 MFU | 260688 tok/s +step 6373/18794 | loss 3.167247 (-0.30z)| norm 0.2290 (-0.32z)| lr 4.87e-03 | 2006.84 ms | 68.4% bf16 MFU | 260716 tok/s +step 6374/18794 | loss 3.164888 (-0.37z)| norm 0.2852 (+0.48z)| lr 4.87e-03 | 2026.02 ms | 67.7% bf16 MFU | 260619 tok/s +step 6375/18794 | loss 3.182439 (+0.22z)| norm 0.2501 (-0.02z)| lr 4.87e-03 | 2002.90 ms | 68.5% bf16 MFU | 260676 tok/s +step 6376/18794 | loss 3.116050 (-1.97z)| norm 0.2141 (-0.52z)| lr 4.87e-03 | 2009.75 ms | 68.3% bf16 MFU | 260686 tok/s +step 6377/18794 | loss 3.188545 (+0.43z)| norm 0.2277 (-0.33z)| lr 4.87e-03 | 2002.93 ms | 68.5% bf16 MFU | 260740 tok/s +step 6378/18794 | loss 3.213048 (+1.21z)| norm 0.2066 (-0.63z)| lr 4.87e-03 | 1984.97 ms | 69.1% bf16 MFU | 260909 tok/s +step 6379/18794 | loss 3.147430 (-0.93z)| norm 0.1634 (-1.22z)| lr 4.87e-03 | 2011.22 ms | 68.2% bf16 MFU | 260898 tok/s +step 6380/18794 | loss 3.151922 (-0.79z)| norm 0.1990 (-0.71z)| lr 4.87e-03 | 2025.26 ms | 67.8% bf16 MFU | 260797 tok/s +step 6381/18794 | loss 3.135468 (-1.31z)| norm 0.2028 (-0.65z)| lr 4.87e-03 | 2009.03 ms | 68.3% bf16 MFU | 260805 tok/s +step 6382/18794 | loss 3.235609 (+1.91z)| norm 0.2367 (-0.18z)| lr 4.87e-03 | 1994.88 ms | 68.8% bf16 MFU | 260906 tok/s +step 6383/18794 | loss 3.221621 (+1.45z)| norm 0.1562 (-1.31z)| lr 4.87e-03 | 2010.91 ms | 68.2% bf16 MFU | 260897 tok/s +step 6384/18794 | loss 3.178984 (+0.10z)| norm 0.1769 (-1.00z)| lr 4.87e-03 | 2006.17 ms | 68.4% bf16 MFU | 260919 tok/s +step 6385/18794 | loss 3.141066 (-1.10z)| norm 0.1852 (-0.88z)| lr 4.87e-03 | 2007.53 ms | 68.4% bf16 MFU | 260931 tok/s +step 6386/18794 | loss 3.176004 (+0.03z)| norm 0.1636 (-1.18z)| lr 4.87e-03 | 2009.81 ms | 68.3% bf16 MFU | 260928 tok/s +step 6387/18794 | loss 3.208302 (+1.05z)| norm 0.1569 (-1.25z)| lr 4.87e-03 | 2018.07 ms | 68.0% bf16 MFU | 260871 tok/s +step 6388/18794 | loss 3.190502 (+0.47z)| norm 0.2710 (+0.33z)| lr 4.87e-03 | 1993.65 ms | 68.8% bf16 MFU | 260976 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.123474 +step 6389/18794 | loss 3.217857 (+1.35z)| norm 0.4857 (+3.12z)| lr 4.87e-03 | 2009.00 ms | 68.3% bf16 MFU | 260976 tok/s +step 6390/18794 | loss 3.156250 (-0.64z)| norm 0.3084 (+0.76z)| lr 4.87e-03 | 2011.11 ms | 68.2% bf16 MFU | 260962 tok/s +step 6391/18794 | loss 3.183776 (+0.25z)| norm 0.2808 (+0.38z)| lr 4.87e-03 | 1994.19 ms | 68.8% bf16 MFU | 261059 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.340790 +step 6392/18794 | loss 3.177200 (+0.05z)| norm 0.4349 (+2.34z)| lr 4.87e-03 | 2011.33 ms | 68.2% bf16 MFU | 261040 tok/s +step 6393/18794 | loss 3.181669 (+0.19z)| norm 0.3201 (+0.84z)| lr 4.87e-03 | 2005.06 ms | 68.4% bf16 MFU | 261062 tok/s +step 6394/18794 | loss 3.156975 (-0.60z)| norm 0.1990 (-0.73z)| lr 4.87e-03 | 2012.99 ms | 68.2% bf16 MFU | 261031 tok/s +step 6395/18794 | loss 3.128974 (-1.49z)| norm 0.2167 (-0.50z)| lr 4.87e-03 | 2019.17 ms | 68.0% bf16 MFU | 260963 tok/s +step 6396/18794 | loss 3.162674 (-0.38z)| norm 0.3078 (+0.67z)| lr 4.87e-03 | 2013.97 ms | 68.1% bf16 MFU | 260931 tok/s +step 6397/18794 | loss 3.172222 (-0.08z)| norm 0.2824 (+0.40z)| lr 4.86e-03 | 2018.54 ms | 68.0% bf16 MFU | 260871 tok/s +step 6398/18794 | loss 3.182648 (+0.25z)| norm 0.1905 (-0.85z)| lr 4.86e-03 | 2020.03 ms | 67.9% bf16 MFU | 260805 tok/s +step 6399/18794 | loss 3.180706 (+0.19z)| norm 0.2740 (+0.31z)| lr 4.86e-03 | 2023.33 ms | 67.8% bf16 MFU | 260721 tok/s +step 6400/18794 | loss 3.152543 (-0.73z)| norm 0.2217 (-0.40z)| lr 4.86e-03 | 2013.53 ms | 68.2% bf16 MFU | 260704 tok/s +step 6401/18794 | loss 3.162128 (-0.41z)| norm 0.2290 (-0.29z)| lr 4.86e-03 | 2019.52 ms | 68.0% bf16 MFU | 260649 tok/s +step 6402/18794 | loss 3.132782 (-1.35z)| norm 0.2929 (+0.60z)| lr 4.86e-03 | 2014.07 ms | 68.1% bf16 MFU | 260632 tok/s +step 6403/18794 | loss 3.205590 (+1.03z)| norm 0.2110 (-0.55z)| lr 4.86e-03 | 2003.03 ms | 68.5% bf16 MFU | 260688 tok/s +step 6404/18794 | loss 3.158718 (-0.53z)| norm 0.2000 (-0.69z)| lr 4.86e-03 | 2015.65 ms | 68.1% bf16 MFU | 260659 tok/s +step 6405/18794 | loss 3.165312 (-0.31z)| norm 0.1892 (-0.84z)| lr 4.86e-03 | 2014.12 ms | 68.1% bf16 MFU | 260641 tok/s +step 6406/18794 | loss 3.163435 (-0.37z)| norm 0.2867 (+0.56z)| lr 4.86e-03 | 2012.48 ms | 68.2% bf16 MFU | 260635 tok/s +step 6407/18794 | loss 3.100269 (-2.41z)| norm 0.3354 (+1.23z)| lr 4.86e-03 | 2025.81 ms | 67.7% bf16 MFU | 260544 tok/s +step 6408/18794 | loss 3.167076 (-0.22z)| norm 0.1998 (-0.69z)| lr 4.86e-03 | 2013.19 ms | 68.2% bf16 MFU | 260538 tok/s +step 6409/18794 | loss 3.114694 (-1.90z)| norm 0.2106 (-0.52z)| lr 4.86e-03 | 2003.74 ms | 68.5% bf16 MFU | 260594 tok/s +step 6410/18794 | loss 3.155450 (-0.59z)| norm 0.2464 (-0.02z)| lr 4.86e-03 | 2019.06 ms | 68.0% bf16 MFU | 260547 tok/s +step 6411/18794 | loss 3.230788 (+1.80z)| norm 0.1937 (-0.75z)| lr 4.86e-03 | 2014.90 ms | 68.1% bf16 MFU | 260530 tok/s +step 6412/18794 | loss 3.141632 (-1.04z)| norm 0.2477 (+0.03z)| lr 4.86e-03 | 2017.91 ms | 68.0% bf16 MFU | 260495 tok/s +step 6413/18794 | loss 3.114060 (-1.88z)| norm 0.1963 (-0.71z)| lr 4.86e-03 | 2011.83 ms | 68.2% bf16 MFU | 260500 tok/s +step 6414/18794 | loss 3.158708 (-0.49z)| norm 0.2632 (+0.24z)| lr 4.86e-03 | 2012.03 ms | 68.2% bf16 MFU | 260504 tok/s +step 6415/18794 | loss 3.171509 (-0.07z)| norm 0.2609 (+0.21z)| lr 4.86e-03 | 2016.68 ms | 68.0% bf16 MFU | 260477 tok/s +step 6416/18794 | loss 3.148561 (-0.80z)| norm 0.1819 (-0.93z)| lr 4.86e-03 | 2028.67 ms | 67.6% bf16 MFU | 260376 tok/s +step 6417/18794 | loss 3.293514 (+3.59z)| norm 0.2062 (-0.57z)| lr 4.86e-03 | 2019.89 ms | 67.9% bf16 MFU | 260335 tok/s +step 6418/18794 | loss 3.207331 (+1.00z)| norm 0.1736 (-1.02z)| lr 4.86e-03 | 2003.76 ms | 68.5% bf16 MFU | 260401 tok/s +step 6419/18794 | loss 3.182941 (+0.25z)| norm 0.1835 (-0.88z)| lr 4.86e-03 | 2010.03 ms | 68.3% bf16 MFU | 260422 tok/s +step 6420/18794 | loss 3.230751 (+1.66z)| norm 0.2000 (-0.63z)| lr 4.86e-03 | 2013.96 ms | 68.1% bf16 MFU | 260418 tok/s +step 6421/18794 | loss 3.127032 (-1.42z)| norm 0.2943 (+0.72z)| lr 4.86e-03 | 2016.62 ms | 68.1% bf16 MFU | 260396 tok/s +step 6422/18794 | loss 3.147663 (-0.82z)| norm 0.2844 (+0.57z)| lr 4.85e-03 | 2030.43 ms | 67.6% bf16 MFU | 260287 tok/s +step 6423/18794 | loss 3.128043 (-1.38z)| norm 0.2998 (+0.77z)| lr 4.85e-03 | 2025.27 ms | 67.8% bf16 MFU | 260216 tok/s +step 6424/18794 | loss 3.195385 (+0.63z)| norm 0.2780 (+0.45z)| lr 4.85e-03 | 2027.91 ms | 67.7% bf16 MFU | 260132 tok/s +step 6425/18794 | loss 3.148094 (-0.77z)| norm 0.1772 (-0.98z)| lr 4.85e-03 | 2010.54 ms | 68.3% bf16 MFU | 260164 tok/s +step 6426/18794 | loss 3.183964 (+0.32z)| norm 0.1638 (-1.16z)| lr 4.85e-03 | 2006.81 ms | 68.4% bf16 MFU | 260219 tok/s +step 6427/18794 | loss 3.200731 (+0.81z)| norm 0.2441 (+0.01z)| lr 4.85e-03 | 2007.02 ms | 68.4% bf16 MFU | 260269 tok/s +step 6428/18794 | loss 3.131484 (-1.28z)| norm 0.2867 (+0.61z)| lr 4.85e-03 | 2003.92 ms | 68.5% bf16 MFU | 260337 tok/s +step 6429/18794 | loss 3.154664 (-0.58z)| norm 0.1817 (-0.89z)| lr 4.85e-03 | 2003.00 ms | 68.5% bf16 MFU | 260408 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.205650 +step 6430/18794 | loss 3.174592 (+0.01z)| norm 0.3981 (+2.21z)| lr 4.85e-03 | 2005.75 ms | 68.4% bf16 MFU | 260457 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.209953 +step 6431/18794 | loss 3.142920 (-0.93z)| norm 0.4835 (+3.21z)| lr 4.85e-03 | 2003.83 ms | 68.5% bf16 MFU | 260516 tok/s +step 6432/18794 | loss 3.127717 (-1.37z)| norm 0.2303 (-0.20z)| lr 4.85e-03 | 2020.25 ms | 67.9% bf16 MFU | 260466 tok/s +step 6433/18794 | loss 3.159611 (-0.40z)| norm 0.2430 (-0.02z)| lr 4.85e-03 | 2008.26 ms | 68.3% bf16 MFU | 260496 tok/s +step 6434/18794 | loss 3.166183 (-0.20z)| norm 0.2105 (-0.48z)| lr 4.85e-03 | 2019.89 ms | 67.9% bf16 MFU | 260450 tok/s +step 6435/18794 | loss 3.163956 (-0.26z)| norm 0.2418 (-0.05z)| lr 4.85e-03 | 2016.43 ms | 68.1% bf16 MFU | 260428 tok/s +step 6436/18794 | loss 3.210209 (+1.11z)| norm 0.3774 (+1.79z)| lr 4.85e-03 | 2015.07 ms | 68.1% bf16 MFU | 260415 tok/s +step 6437/18794 | loss 3.167997 (-0.14z)| norm 0.2750 (+0.37z)| lr 4.85e-03 | 2016.70 ms | 68.0% bf16 MFU | 260393 tok/s +step 6438/18794 | loss 3.117745 (-1.62z)| norm 0.2211 (-0.37z)| lr 4.85e-03 | 2022.93 ms | 67.8% bf16 MFU | 260332 tok/s +step 6439/18794 | loss 3.120454 (-1.51z)| norm 0.2508 (+0.03z)| lr 4.85e-03 | 2021.23 ms | 67.9% bf16 MFU | 260285 tok/s +step 6440/18794 | loss 3.187057 (+0.45z)| norm 0.1855 (-0.87z)| lr 4.85e-03 | 2015.64 ms | 68.1% bf16 MFU | 260276 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.144548 +step 6441/18794 | loss 3.154936 (-0.51z)| norm 0.4091 (+2.14z)| lr 4.85e-03 | 2006.33 ms | 68.4% bf16 MFU | 260328 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.205959 +step 6442/18794 | loss 3.218539 (+1.35z)| norm 0.4196 (+2.21z)| lr 4.85e-03 | 2011.41 ms | 68.2% bf16 MFU | 260345 tok/s +step 6443/18794 | loss 3.128238 (-1.28z)| norm 0.2038 (-0.64z)| lr 4.85e-03 | 2014.52 ms | 68.1% bf16 MFU | 260340 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.332432 +step 6444/18794 | loss 3.162810 (-0.26z)| norm 0.4362 (+2.33z)| lr 4.85e-03 | 2017.77 ms | 68.0% bf16 MFU | 260315 tok/s +step 6445/18794 | loss 3.192203 (+0.59z)| norm 0.2830 (+0.36z)| lr 4.85e-03 | 2000.34 ms | 68.6% bf16 MFU | 260404 tok/s +step 6446/18794 | loss 3.158473 (-0.38z)| norm 0.2607 (+0.07z)| lr 4.85e-03 | 2027.70 ms | 67.7% bf16 MFU | 260312 tok/s +step 6447/18794 | loss 3.183038 (+0.33z)| norm 0.3260 (+0.89z)| lr 4.85e-03 | 2010.61 ms | 68.3% bf16 MFU | 260335 tok/s +step 6448/18794 | loss 3.139223 (-0.94z)| norm 0.2274 (-0.38z)| lr 4.84e-03 | 2008.42 ms | 68.3% bf16 MFU | 260370 tok/s +step 6449/18794 | loss 3.168818 (-0.08z)| norm 0.2955 (+0.48z)| lr 4.84e-03 | 2014.06 ms | 68.1% bf16 MFU | 260367 tok/s +step 6450/18794 | loss 3.217228 (+1.32z)| norm 0.2748 (+0.21z)| lr 4.84e-03 | 2009.41 ms | 68.3% bf16 MFU | 260395 tok/s +step 6451/18794 | loss 3.154756 (-0.49z)| norm 0.1987 (-0.78z)| lr 4.84e-03 | 2004.72 ms | 68.5% bf16 MFU | 260451 tok/s +step 6452/18794 | loss 3.160044 (-0.34z)| norm 0.3114 (+0.67z)| lr 4.84e-03 | 2022.69 ms | 67.8% bf16 MFU | 260389 tok/s +step 6453/18794 | loss 3.155419 (-0.46z)| norm 0.3315 (+0.91z)| lr 4.84e-03 | 2029.21 ms | 67.6% bf16 MFU | 260288 tok/s +step 6454/18794 | loss 3.173409 (+0.06z)| norm 0.2500 (-0.15z)| lr 4.84e-03 | 2006.15 ms | 68.4% bf16 MFU | 260341 tok/s +step 6455/18794 | loss 3.123039 (-1.37z)| norm 0.2288 (-0.42z)| lr 4.84e-03 | 2005.61 ms | 68.4% bf16 MFU | 260394 tok/s +step 6456/18794 | loss 3.182126 (+0.35z)| norm 0.2133 (-0.60z)| lr 4.84e-03 | 2031.51 ms | 67.6% bf16 MFU | 260278 tok/s +step 6457/18794 | loss 3.185778 (+0.47z)| norm 0.2546 (-0.06z)| lr 4.84e-03 | 2026.15 ms | 67.7% bf16 MFU | 260202 tok/s +step 6458/18794 | loss 3.201196 (+0.92z)| norm 0.2590 (-0.00z)| lr 4.84e-03 | 2019.26 ms | 68.0% bf16 MFU | 260174 tok/s +step 6459/18794 | loss 3.123484 (-1.34z)| norm 0.1955 (-0.84z)| lr 4.84e-03 | 2031.53 ms | 67.6% bf16 MFU | 260070 tok/s +step 6460/18794 | loss 3.121701 (-1.36z)| norm 0.2794 (+0.27z)| lr 4.84e-03 | 2007.98 ms | 68.3% bf16 MFU | 260121 tok/s +step 6461/18794 | loss 3.123119 (-1.29z)| norm 0.2711 (+0.17z)| lr 4.84e-03 | 2005.77 ms | 68.4% bf16 MFU | 260185 tok/s +step 6462/18794 | loss 3.172884 (+0.12z)| norm 0.1908 (-0.89z)| lr 4.84e-03 | 2004.50 ms | 68.5% bf16 MFU | 260253 tok/s +step 6463/18794 | loss 3.126068 (-1.24z)| norm 0.1831 (-0.99z)| lr 4.84e-03 | 2004.54 ms | 68.5% bf16 MFU | 260318 tok/s +step 6464/18794 | loss 3.162110 (-0.20z)| norm 0.2496 (-0.10z)| lr 4.84e-03 | 2004.66 ms | 68.5% bf16 MFU | 260379 tok/s +step 6465/18794 | loss 3.183076 (+0.42z)| norm 0.2538 (-0.03z)| lr 4.84e-03 | 2024.39 ms | 67.8% bf16 MFU | 260309 tok/s +step 6466/18794 | loss 3.174433 (+0.18z)| norm 0.1823 (-0.99z)| lr 4.84e-03 | 2008.36 ms | 68.3% bf16 MFU | 260346 tok/s +step 6467/18794 | loss 3.150229 (-0.52z)| norm 0.2379 (-0.23z)| lr 4.84e-03 | 1991.40 ms | 68.9% bf16 MFU | 260493 tok/s +step 6468/18794 | loss 3.172637 (+0.18z)| norm 0.2363 (-0.25z)| lr 4.84e-03 | 2020.94 ms | 67.9% bf16 MFU | 260440 tok/s +step 6469/18794 | loss 3.161059 (-0.19z)| norm 0.1856 (-0.92z)| lr 4.84e-03 | 2016.86 ms | 68.0% bf16 MFU | 260415 tok/s +step 6470/18794 | loss 3.160740 (-0.19z)| norm 0.1715 (-1.13z)| lr 4.84e-03 | 2019.02 ms | 68.0% bf16 MFU | 260378 tok/s +step 6471/18794 | loss 3.124618 (-1.29z)| norm 0.1876 (-0.89z)| lr 4.84e-03 | 2002.27 ms | 68.5% bf16 MFU | 260452 tok/s +step 6472/18794 | loss 3.155421 (-0.33z)| norm 0.2061 (-0.61z)| lr 4.84e-03 | 2024.20 ms | 67.8% bf16 MFU | 260380 tok/s +step 6473/18794 | loss 3.106451 (-1.81z)| norm 0.3005 (+0.73z)| lr 4.83e-03 | 2009.83 ms | 68.3% bf16 MFU | 260404 tok/s +step 6474/18794 | loss 3.121403 (-1.33z)| norm 0.3288 (+1.12z)| lr 4.83e-03 | 2021.97 ms | 67.9% bf16 MFU | 260348 tok/s +step 6475/18794 | loss 3.177413 (+0.38z)| norm 0.2172 (-0.46z)| lr 4.83e-03 | 2009.40 ms | 68.3% bf16 MFU | 260377 tok/s +step 6476/18794 | loss 3.197116 (+0.96z)| norm 0.2188 (-0.44z)| lr 4.83e-03 | 1994.69 ms | 68.8% bf16 MFU | 260500 tok/s +step 6477/18794 | loss 3.112002 (-1.61z)| norm 0.2657 (+0.22z)| lr 4.83e-03 | 2012.50 ms | 68.2% bf16 MFU | 260501 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.338513 +step 6478/18794 | loss 3.140666 (-0.72z)| norm 0.4214 (+2.34z)| lr 4.83e-03 | 2015.94 ms | 68.1% bf16 MFU | 260479 tok/s +step 6479/18794 | loss 3.127357 (-1.12z)| norm 0.2541 (+0.01z)| lr 4.83e-03 | 2014.50 ms | 68.1% bf16 MFU | 260468 tok/s +step 6480/18794 | loss 3.172963 (+0.26z)| norm 0.2095 (-0.61z)| lr 4.83e-03 | 1997.62 ms | 68.7% bf16 MFU | 260568 tok/s +step 6481/18794 | loss 3.160798 (-0.11z)| norm 0.2723 (+0.26z)| lr 4.83e-03 | 2022.54 ms | 67.9% bf16 MFU | 260500 tok/s +step 6482/18794 | loss 3.220405 (+1.73z)| norm 0.1786 (-1.04z)| lr 4.83e-03 | 2020.66 ms | 67.9% bf16 MFU | 260448 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.840279 +step 6483/18794 | loss 3.184429 (+0.64z)| norm 0.4677 (+2.84z)| lr 4.83e-03 | 2022.72 ms | 67.8% bf16 MFU | 260386 tok/s +step 6484/18794 | loss 3.227623 (+1.94z)| norm 0.4088 (+1.99z)| lr 4.83e-03 | 2017.00 ms | 68.0% bf16 MFU | 260363 tok/s +step 6485/18794 | loss 3.197581 (+0.99z)| norm 0.1961 (-0.84z)| lr 4.83e-03 | 2020.94 ms | 67.9% bf16 MFU | 260317 tok/s +step 6486/18794 | loss 3.204972 (+1.20z)| norm 0.3259 (+0.87z)| lr 4.83e-03 | 2005.75 ms | 68.4% bf16 MFU | 260370 tok/s +step 6487/18794 | loss 3.189539 (+0.74z)| norm 0.3278 (+0.88z)| lr 4.83e-03 | 2014.25 ms | 68.1% bf16 MFU | 260366 tok/s +step 6488/18794 | loss 3.263111 (+2.86z)| norm 0.2052 (-0.76z)| lr 4.83e-03 | 2035.49 ms | 67.4% bf16 MFU | 260227 tok/s +step 6489/18794 | loss 3.123034 (-1.24z)| norm 0.2178 (-0.58z)| lr 4.83e-03 | 2029.18 ms | 67.6% bf16 MFU | 260134 tok/s +step 6490/18794 | loss 3.173747 (+0.25z)| norm 0.1957 (-0.87z)| lr 4.83e-03 | 2009.12 ms | 68.3% bf16 MFU | 260175 tok/s +step 6491/18794 | loss 3.209421 (+1.29z)| norm 0.1899 (-0.93z)| lr 4.83e-03 | 2017.42 ms | 68.0% bf16 MFU | 260160 tok/s +step 6492/18794 | loss 3.184128 (+0.55z)| norm 0.3146 (+0.84z)| lr 4.83e-03 | 2011.93 ms | 68.2% bf16 MFU | 260182 tok/s +step 6493/18794 | loss 3.120511 (-1.29z)| norm 0.3704 (+1.62z)| lr 4.83e-03 | 2004.37 ms | 68.5% bf16 MFU | 260251 tok/s +step 6494/18794 | loss 3.131796 (-0.95z)| norm 0.1913 (-0.92z)| lr 4.83e-03 | 2025.49 ms | 67.8% bf16 MFU | 260181 tok/s +step 6495/18794 | loss 3.188148 (+0.67z)| norm 0.3994 (+1.97z)| lr 4.83e-03 | 2023.52 ms | 67.8% bf16 MFU | 260127 tok/s +step 6496/18794 | loss 3.202885 (+1.08z)| norm 0.3131 (+0.77z)| lr 4.83e-03 | 2015.00 ms | 68.1% bf16 MFU | 260130 tok/s +step 6497/18794 | loss 3.160130 (-0.15z)| norm 0.2070 (-0.70z)| lr 4.83e-03 | 2020.33 ms | 67.9% bf16 MFU | 260099 tok/s +step 6498/18794 | loss 3.230712 (+1.84z)| norm 0.2383 (-0.27z)| lr 4.83e-03 | 2026.51 ms | 67.7% bf16 MFU | 260030 tok/s +step 6499/18794 | loss 3.189425 (+0.66z)| norm 0.2060 (-0.71z)| lr 4.82e-03 | 2019.36 ms | 68.0% bf16 MFU | 260010 tok/s +step 6500/18794 | loss 3.163401 (-0.08z)| norm 0.2700 (+0.17z)| lr 4.82e-03 | 2015.36 ms | 68.1% bf16 MFU | 260017 tok/s +val loss 3.197522 +HellaSwag: 2834/10042 = 0.282215Swag: 990/1256: 0/1256 +Writing checkpoint at step 6500 +Writing model to log_gpt3_125M_edu_v4/model_00006500.bin +Writing state to log_gpt3_125M_edu_v4/state_00006500_00001.bin +Writing state to log_gpt3_125M_edu_v4/state_00006500_00000.bin +step 6501/18794 | loss 3.167069 (+0.03z)| norm 0.2632 (+0.07z)| lr 4.82e-03 | 2018.15 ms | 68.0% bf16 MFU | 260005 tok/s +step 6502/18794 | loss 3.144842 (-0.61z)| norm 0.2358 (-0.30z)| lr 4.82e-03 | 2027.39 ms | 67.7% bf16 MFU | 259935 tok/s +step 6503/18794 | loss 3.264586 (+2.69z)| norm 0.2395 (-0.25z)| lr 4.82e-03 | 2019.13 ms | 68.0% bf16 MFU | 259921 tok/s +step 6504/18794 | loss 3.208078 (+1.11z)| norm 0.3472 (+1.23z)| lr 4.82e-03 | 2019.08 ms | 68.0% bf16 MFU | 259908 tok/s +step 6505/18794 | loss 3.231965 (+1.72z)| norm 0.2362 (-0.33z)| lr 4.82e-03 | 2021.11 ms | 67.9% bf16 MFU | 259883 tok/s +step 6506/18794 | loss 3.161821 (-0.17z)| norm 0.2172 (-0.58z)| lr 4.82e-03 | 2017.79 ms | 68.0% bf16 MFU | 259881 tok/s +step 6507/18794 | loss 3.224584 (+1.50z)| norm 0.1982 (-0.83z)| lr 4.82e-03 | 2028.28 ms | 67.7% bf16 MFU | 259811 tok/s +step 6508/18794 | loss 3.215489 (+1.23z)| norm 0.2201 (-0.53z)| lr 4.82e-03 | 2019.90 ms | 67.9% bf16 MFU | 259799 tok/s +step 6509/18794 | loss 3.230866 (+1.61z)| norm 0.2317 (-0.37z)| lr 4.82e-03 | 2024.65 ms | 67.8% bf16 MFU | 259756 tok/s +step 6510/18794 | loss 3.168575 (-0.07z)| norm 0.2972 (+0.54z)| lr 4.82e-03 | 2009.99 ms | 68.3% bf16 MFU | 259811 tok/s +step 6511/18794 | loss 3.165160 (-0.14z)| norm 0.3558 (+1.34z)| lr 4.82e-03 | 2026.64 ms | 67.7% bf16 MFU | 259755 tok/s +step 6512/18794 | loss 3.189592 (+0.51z)| norm 0.3066 (+0.64z)| lr 4.82e-03 | 2023.28 ms | 67.8% bf16 MFU | 259724 tok/s +step 6513/18794 | loss 3.189635 (+0.50z)| norm 0.2338 (-0.38z)| lr 4.82e-03 | 2028.18 ms | 67.7% bf16 MFU | 259663 tok/s +step 6514/18794 | loss 3.192206 (+0.56z)| norm 0.1931 (-0.94z)| lr 4.82e-03 | 2021.89 ms | 67.9% bf16 MFU | 259645 tok/s +step 6515/18794 | loss 3.202470 (+0.83z)| norm 0.2603 (-0.00z)| lr 4.82e-03 | 2044.80 ms | 67.1% bf16 MFU | 259483 tok/s +step 6516/18794 | loss 3.202337 (+0.81z)| norm 0.3119 (+0.70z)| lr 4.82e-03 | 1988.56 ms | 69.0% bf16 MFU | 259691 tok/s +step 6517/18794 | loss 3.189491 (+0.51z)| norm 0.2642 (+0.03z)| lr 4.82e-03 | 2002.52 ms | 68.5% bf16 MFU | 259797 tok/s +step 6518/18794 | loss 3.219734 (+1.38z)| norm 0.2705 (+0.10z)| lr 4.82e-03 | 2021.77 ms | 67.9% bf16 MFU | 259773 tok/s +step 6519/18794 | loss 3.118804 (-1.50z)| norm 0.2088 (-0.77z)| lr 4.82e-03 | 2027.00 ms | 67.7% bf16 MFU | 259717 tok/s +step 6520/18794 | loss 3.214303 (+1.25z)| norm 0.2116 (-0.74z)| lr 4.82e-03 | 2022.96 ms | 67.8% bf16 MFU | 259690 tok/s +step 6521/18794 | loss 3.134809 (-1.05z)| norm 0.1796 (-1.17z)| lr 4.82e-03 | 2004.85 ms | 68.5% bf16 MFU | 259781 tok/s +step 6522/18794 | loss 3.189239 (+0.51z)| norm 0.2555 (-0.09z)| lr 4.82e-03 | 2021.58 ms | 67.9% bf16 MFU | 259759 tok/s +step 6523/18794 | loss 3.186323 (+0.41z)| norm 0.3407 (+1.10z)| lr 4.82e-03 | 2019.75 ms | 67.9% bf16 MFU | 259750 tok/s +step 6524/18794 | loss 3.190949 (+0.55z)| norm 0.1856 (-1.06z)| lr 4.81e-03 | 2010.42 ms | 68.3% bf16 MFU | 259802 tok/s +step 6525/18794 | loss 3.186386 (+0.41z)| norm 0.2161 (-0.64z)| lr 4.81e-03 | 2025.36 ms | 67.8% bf16 MFU | 259755 tok/s +step 6526/18794 | loss 3.155550 (-0.49z)| norm 0.1793 (-1.17z)| lr 4.81e-03 | 2027.47 ms | 67.7% bf16 MFU | 259697 tok/s +step 6527/18794 | loss 3.178018 (+0.18z)| norm 0.3251 (+0.87z)| lr 4.81e-03 | 2003.30 ms | 68.5% bf16 MFU | 259798 tok/s +step 6528/18794 | loss 3.143549 (-0.84z)| norm 0.3680 (+1.45z)| lr 4.81e-03 | 2024.82 ms | 67.8% bf16 MFU | 259754 tok/s +step 6529/18794 | loss 3.168296 (-0.12z)| norm 0.2180 (-0.64z)| lr 4.81e-03 | 2005.04 ms | 68.4% bf16 MFU | 259841 tok/s +step 6530/18794 | loss 3.212015 (+1.15z)| norm 0.1787 (-1.17z)| lr 4.81e-03 | 2014.02 ms | 68.1% bf16 MFU | 259865 tok/s +step 6531/18794 | loss 3.141213 (-0.92z)| norm 0.1860 (-1.08z)| lr 4.81e-03 | 1998.01 ms | 68.7% bf16 MFU | 259992 tok/s +step 6532/18794 | loss 3.184886 (+0.34z)| norm 0.2872 (+0.41z)| lr 4.81e-03 | 2011.40 ms | 68.2% bf16 MFU | 260025 tok/s +step 6533/18794 | loss 3.263849 (+2.55z)| norm 0.2645 (+0.07z)| lr 4.81e-03 | 2011.57 ms | 68.2% bf16 MFU | 260056 tok/s +step 6534/18794 | loss 3.131406 (-1.20z)| norm 0.1546 (-1.53z)| lr 4.81e-03 | 2005.10 ms | 68.4% bf16 MFU | 260127 tok/s +step 6535/18794 | loss 3.181855 (+0.22z)| norm 0.2698 (+0.15z)| lr 4.81e-03 | 2017.50 ms | 68.0% bf16 MFU | 260114 tok/s +step 6536/18794 | loss 3.168936 (-0.13z)| norm 0.2130 (-0.67z)| lr 4.81e-03 | 2019.25 ms | 68.0% bf16 MFU | 260090 tok/s +step 6537/18794 | loss 3.134200 (-1.11z)| norm 0.2110 (-0.68z)| lr 4.81e-03 | 2010.96 ms | 68.2% bf16 MFU | 260122 tok/s +step 6538/18794 | loss 3.183721 (+0.28z)| norm 0.3442 (+1.26z)| lr 4.81e-03 | 2027.92 ms | 67.7% bf16 MFU | 260042 tok/s +step 6539/18794 | loss 3.202041 (+0.79z)| norm 0.3164 (+0.84z)| lr 4.81e-03 | 2018.54 ms | 68.0% bf16 MFU | 260027 tok/s +step 6540/18794 | loss 3.118304 (-1.59z)| norm 0.1875 (-1.05z)| lr 4.81e-03 | 2019.97 ms | 67.9% bf16 MFU | 260003 tok/s +step 6541/18794 | loss 3.160869 (-0.38z)| norm 0.2161 (-0.62z)| lr 4.81e-03 | 2010.14 ms | 68.3% bf16 MFU | 260044 tok/s +step 6542/18794 | loss 3.133549 (-1.14z)| norm 0.2426 (-0.20z)| lr 4.81e-03 | 2002.54 ms | 68.5% bf16 MFU | 260133 tok/s +step 6543/18794 | loss 3.197182 (+0.67z)| norm 0.2017 (-0.83z)| lr 4.81e-03 | 2009.24 ms | 68.3% bf16 MFU | 260173 tok/s +step 6544/18794 | loss 3.230949 (+1.60z)| norm 0.2726 (+0.30z)| lr 4.81e-03 | 2025.37 ms | 67.8% bf16 MFU | 260107 tok/s +step 6545/18794 | loss 3.178358 (+0.11z)| norm 0.3084 (+0.87z)| lr 4.81e-03 | 2003.92 ms | 68.5% bf16 MFU | 260183 tok/s +step 6546/18794 | loss 3.176661 (+0.06z)| norm 0.2563 (+0.04z)| lr 4.81e-03 | 2007.68 ms | 68.4% bf16 MFU | 260231 tok/s +step 6547/18794 | loss 3.161304 (-0.38z)| norm 0.1822 (-1.13z)| lr 4.81e-03 | 2005.95 ms | 68.4% bf16 MFU | 260288 tok/s +step 6548/18794 | loss 3.214771 (+1.12z)| norm 0.2806 (+0.44z)| lr 4.81e-03 | 2013.30 ms | 68.2% bf16 MFU | 260294 tok/s +step 6549/18794 | loss 3.213086 (+1.06z)| norm 0.2838 (+0.50z)| lr 4.81e-03 | 2011.01 ms | 68.2% bf16 MFU | 260315 tok/s +step 6550/18794 | loss 3.153347 (-0.62z)| norm 0.2427 (-0.16z)| lr 4.80e-03 | 2026.33 ms | 67.7% bf16 MFU | 260236 tok/s +step 6551/18794 | loss 3.177057 (+0.05z)| norm 0.2251 (-0.45z)| lr 4.80e-03 | 2006.71 ms | 68.4% bf16 MFU | 260288 tok/s +step 6552/18794 | loss 3.205264 (+0.84z)| norm 0.1853 (-1.07z)| lr 4.80e-03 | 2018.77 ms | 68.0% bf16 MFU | 260259 tok/s +step 6553/18794 | loss 3.181522 (+0.16z)| norm 0.1787 (-1.15z)| lr 4.80e-03 | 2012.30 ms | 68.2% bf16 MFU | 260273 tok/s +step 6554/18794 | loss 3.181046 (+0.14z)| norm 0.1867 (-1.01z)| lr 4.80e-03 | 2009.46 ms | 68.3% bf16 MFU | 260305 tok/s +step 6555/18794 | loss 3.179944 (+0.10z)| norm 0.2063 (-0.69z)| lr 4.80e-03 | 2022.78 ms | 67.8% bf16 MFU | 260249 tok/s +step 6556/18794 | loss 3.258076 (+2.26z)| norm 0.2605 (+0.17z)| lr 4.80e-03 | 2004.08 ms | 68.5% bf16 MFU | 260317 tok/s +step 6557/18794 | loss 3.162051 (-0.42z)| norm 0.3528 (+1.62z)| lr 4.80e-03 | 2014.37 ms | 68.1% bf16 MFU | 260315 tok/s +step 6558/18794 | loss 3.189384 (+0.35z)| norm 0.2198 (-0.48z)| lr 4.80e-03 | 2012.89 ms | 68.2% bf16 MFU | 260322 tok/s +step 6559/18794 | loss 3.167812 (-0.27z)| norm 0.2056 (-0.71z)| lr 4.80e-03 | 2004.76 ms | 68.5% bf16 MFU | 260382 tok/s +step 6560/18794 | loss 3.178258 (+0.01z)| norm 0.2006 (-0.77z)| lr 4.80e-03 | 2017.16 ms | 68.0% bf16 MFU | 260359 tok/s +step 6561/18794 | loss 3.173395 (-0.15z)| norm 0.2050 (-0.69z)| lr 4.80e-03 | 2025.81 ms | 67.7% bf16 MFU | 260281 tok/s +step 6562/18794 | loss 3.157010 (-0.62z)| norm 0.1766 (-1.14z)| lr 4.80e-03 | 2018.64 ms | 68.0% bf16 MFU | 260253 tok/s +step 6563/18794 | loss 3.182277 (+0.10z)| norm 0.2073 (-0.66z)| lr 4.80e-03 | 1993.46 ms | 68.8% bf16 MFU | 260391 tok/s +step 6564/18794 | loss 3.179767 (+0.02z)| norm 0.1828 (-1.03z)| lr 4.80e-03 | 2003.65 ms | 68.5% bf16 MFU | 260455 tok/s +step 6565/18794 | loss 3.168089 (-0.32z)| norm 0.1863 (-0.96z)| lr 4.80e-03 | 1995.94 ms | 68.8% bf16 MFU | 260566 tok/s +step 6566/18794 | loss 3.213540 (+1.00z)| norm 0.1857 (-0.97z)| lr 4.80e-03 | 2002.03 ms | 68.5% bf16 MFU | 260631 tok/s +step 6567/18794 | loss 3.177628 (-0.06z)| norm 0.2011 (-0.72z)| lr 4.80e-03 | 2015.37 ms | 68.1% bf16 MFU | 260607 tok/s +step 6568/18794 | loss 3.213217 (+0.97z)| norm 0.1856 (-0.95z)| lr 4.80e-03 | 2017.73 ms | 68.0% bf16 MFU | 260569 tok/s +step 6569/18794 | loss 3.172298 (-0.23z)| norm 0.2307 (-0.26z)| lr 4.80e-03 | 2001.74 ms | 68.6% bf16 MFU | 260636 tok/s +step 6570/18794 | loss 3.207192 (+0.78z)| norm 0.2228 (-0.39z)| lr 4.80e-03 | 1993.15 ms | 68.9% bf16 MFU | 260756 tok/s +step 6571/18794 | loss 3.159063 (-0.65z)| norm 0.3536 (+1.62z)| lr 4.80e-03 | 1994.87 ms | 68.8% bf16 MFU | 260860 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.057279 +step 6572/18794 | loss 3.222791 (+1.21z)| norm 0.3861 (+2.06z)| lr 4.80e-03 | 2011.90 ms | 68.2% bf16 MFU | 260846 tok/s +step 6573/18794 | loss 3.294108 (+3.16z)| norm 0.2390 (-0.18z)| lr 4.80e-03 | 2027.38 ms | 67.7% bf16 MFU | 260734 tok/s +step 6574/18794 | loss 3.177475 (-0.19z)| norm 0.2892 (+0.60z)| lr 4.80e-03 | 1996.54 ms | 68.7% bf16 MFU | 260827 tok/s +step 6575/18794 | loss 3.169376 (-0.43z)| norm 0.3659 (+1.73z)| lr 4.79e-03 | 2009.65 ms | 68.3% bf16 MFU | 260830 tok/s +step 6576/18794 | loss 3.124703 (-1.68z)| norm 0.2924 (+0.60z)| lr 4.79e-03 | 2003.40 ms | 68.5% bf16 MFU | 260874 tok/s +step 6577/18794 | loss 3.190060 (+0.18z)| norm 0.2144 (-0.57z)| lr 4.79e-03 | 2026.21 ms | 67.7% bf16 MFU | 260768 tok/s +step 6578/18794 | loss 3.161240 (-0.68z)| norm 0.1655 (-1.30z)| lr 4.79e-03 | 1994.04 ms | 68.8% bf16 MFU | 260876 tok/s +step 6579/18794 | loss 3.133510 (-1.50z)| norm 0.2841 (+0.53z)| lr 4.79e-03 | 2003.09 ms | 68.5% bf16 MFU | 260919 tok/s +step 6580/18794 | loss 3.121084 (-1.82z)| norm 0.2729 (+0.35z)| lr 4.79e-03 | 2018.16 ms | 68.0% bf16 MFU | 260862 tok/s +step 6581/18794 | loss 3.183373 (-0.02z)| norm 0.1991 (-0.78z)| lr 4.79e-03 | 1987.40 ms | 69.1% bf16 MFU | 261009 tok/s +step 6582/18794 | loss 3.184159 (+0.01z)| norm 0.3644 (+1.73z)| lr 4.79e-03 | 1994.92 ms | 68.8% bf16 MFU | 261099 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.150219 +step 6583/18794 | loss 3.143309 (-1.16z)| norm 0.3859 (+2.15z)| lr 4.79e-03 | 2002.27 ms | 68.5% bf16 MFU | 261137 tok/s +step 6584/18794 | loss 3.153563 (-0.85z)| norm 0.2389 (-0.17z)| lr 4.79e-03 | 2018.64 ms | 68.0% bf16 MFU | 261066 tok/s +step 6585/18794 | loss 3.225557 (+1.24z)| norm 0.2305 (-0.31z)| lr 4.79e-03 | 2013.52 ms | 68.2% bf16 MFU | 261032 tok/s +step 6586/18794 | loss 3.229960 (+1.35z)| norm 0.2046 (-0.72z)| lr 4.79e-03 | 2034.02 ms | 67.5% bf16 MFU | 260868 tok/s +step 6587/18794 | loss 3.182600 (-0.01z)| norm 0.3362 (+1.46z)| lr 4.79e-03 | 2010.96 ms | 68.2% bf16 MFU | 260861 tok/s +step 6588/18794 | loss 3.275413 (+2.64z)| norm 0.3407 (+1.50z)| lr 4.79e-03 | 2031.42 ms | 67.6% bf16 MFU | 260722 tok/s +step 6589/18794 | loss 3.181484 (-0.06z)| norm 0.1932 (-0.92z)| lr 4.79e-03 | 2011.02 ms | 68.2% bf16 MFU | 260721 tok/s +step 6590/18794 | loss 3.159050 (-0.71z)| norm 0.3696 (+1.91z)| lr 4.79e-03 | 2025.93 ms | 67.7% bf16 MFU | 260625 tok/s +step 6591/18794 | loss 3.152206 (-0.89z)| norm 0.3612 (+1.73z)| lr 4.79e-03 | 2010.17 ms | 68.3% bf16 MFU | 260634 tok/s +step 6592/18794 | loss 3.226267 (+1.23z)| norm 0.2153 (-0.58z)| lr 4.79e-03 | 2010.83 ms | 68.2% bf16 MFU | 260639 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.963264 +step 6593/18794 | loss 3.201840 (+0.51z)| norm 0.4430 (+2.96z)| lr 4.79e-03 | 2024.88 ms | 67.8% bf16 MFU | 260553 tok/s +step 6594/18794 | loss 3.236000 (+1.48z)| norm 0.3618 (+1.66z)| lr 4.79e-03 | 2026.06 ms | 67.7% bf16 MFU | 260464 tok/s +step 6595/18794 | loss 3.178270 (-0.20z)| norm 0.2502 (-0.04z)| lr 4.79e-03 | 2003.62 ms | 68.5% bf16 MFU | 260525 tok/s +step 6596/18794 | loss 3.243196 (+1.66z)| norm 0.2998 (+0.75z)| lr 4.79e-03 | 2028.20 ms | 67.7% bf16 MFU | 260423 tok/s +step 6597/18794 | loss 3.191041 (+0.15z)| norm 0.2287 (-0.38z)| lr 4.79e-03 | 2013.16 ms | 68.2% bf16 MFU | 260424 tok/s +step 6598/18794 | loss 3.167960 (-0.50z)| norm 0.3296 (+1.20z)| lr 4.79e-03 | 2028.22 ms | 67.7% bf16 MFU | 260327 tok/s +step 6599/18794 | loss 3.186001 (+0.02z)| norm 0.1918 (-0.97z)| lr 4.79e-03 | 2022.80 ms | 67.8% bf16 MFU | 260270 tok/s +step 6600/18794 | loss 3.270844 (+2.39z)| norm 0.2350 (-0.29z)| lr 4.78e-03 | 2007.29 ms | 68.4% bf16 MFU | 260316 tok/s +step 6601/18794 | loss 3.208953 (+0.63z)| norm 0.1739 (-1.22z)| lr 4.78e-03 | 2022.46 ms | 67.9% bf16 MFU | 260262 tok/s +step 6602/18794 | loss 3.274300 (+2.38z)| norm 0.2697 (+0.27z)| lr 4.78e-03 | 2017.38 ms | 68.0% bf16 MFU | 260243 tok/s +step 6603/18794 | loss 3.199719 (+0.35z)| norm 0.2389 (-0.21z)| lr 4.78e-03 | 2020.17 ms | 67.9% bf16 MFU | 260208 tok/s +step 6604/18794 | loss 3.213668 (+0.74z)| norm 0.2096 (-0.66z)| lr 4.78e-03 | 1998.72 ms | 68.7% bf16 MFU | 260313 tok/s +step 6605/18794 | loss 3.190820 (+0.11z)| norm 0.1941 (-0.89z)| lr 4.78e-03 | 2023.45 ms | 67.8% bf16 MFU | 260252 tok/s +step 6606/18794 | loss 3.243623 (+1.57z)| norm 0.2312 (-0.31z)| lr 4.78e-03 | 2011.10 ms | 68.2% bf16 MFU | 260275 tok/s +step 6607/18794 | loss 3.173852 (-0.38z)| norm 0.3774 (+1.93z)| lr 4.78e-03 | 2017.57 ms | 68.0% bf16 MFU | 260254 tok/s +step 6608/18794 | loss 3.190498 (+0.10z)| norm 0.3674 (+1.73z)| lr 4.78e-03 | 2017.57 ms | 68.0% bf16 MFU | 260234 tok/s +step 6609/18794 | loss 3.199535 (+0.37z)| norm 0.2450 (-0.14z)| lr 4.78e-03 | 2013.58 ms | 68.2% bf16 MFU | 260241 tok/s +step 6610/18794 | loss 3.242055 (+1.54z)| norm 0.2335 (-0.31z)| lr 4.78e-03 | 2024.82 ms | 67.8% bf16 MFU | 260176 tok/s +step 6611/18794 | loss 3.201784 (+0.40z)| norm 0.1908 (-0.94z)| lr 4.78e-03 | 2004.95 ms | 68.4% bf16 MFU | 260242 tok/s +step 6612/18794 | loss 3.186107 (-0.05z)| norm 0.1935 (-0.88z)| lr 4.78e-03 | 2035.24 ms | 67.4% bf16 MFU | 260110 tok/s +step 6613/18794 | loss 3.194480 (+0.19z)| norm 0.1730 (-1.18z)| lr 4.78e-03 | 2007.02 ms | 68.4% bf16 MFU | 260166 tok/s +step 6614/18794 | loss 3.140514 (-1.31z)| norm 0.1981 (-0.80z)| lr 4.78e-03 | 1997.75 ms | 68.7% bf16 MFU | 260280 tok/s +step 6615/18794 | loss 3.206423 (+0.53z)| norm 0.2772 (+0.41z)| lr 4.78e-03 | 2011.07 ms | 68.2% bf16 MFU | 260301 tok/s +step 6616/18794 | loss 3.152791 (-0.95z)| norm 0.2153 (-0.53z)| lr 4.78e-03 | 2002.91 ms | 68.5% bf16 MFU | 260374 tok/s +step 6617/18794 | loss 3.192649 (+0.16z)| norm 0.2148 (-0.52z)| lr 4.78e-03 | 2006.11 ms | 68.4% bf16 MFU | 260422 tok/s +step 6618/18794 | loss 3.208552 (+0.61z)| norm 0.3277 (+1.19z)| lr 4.78e-03 | 2011.06 ms | 68.2% bf16 MFU | 260436 tok/s +step 6619/18794 | loss 3.241774 (+1.52z)| norm 0.1917 (-0.88z)| lr 4.78e-03 | 2025.38 ms | 67.8% bf16 MFU | 260357 tok/s +step 6620/18794 | loss 3.165182 (-0.63z)| norm 0.3367 (+1.30z)| lr 4.78e-03 | 2000.54 ms | 68.6% bf16 MFU | 260443 tok/s +step 6621/18794 | loss 3.137835 (-1.41z)| norm 0.2052 (-0.69z)| lr 4.78e-03 | 2023.93 ms | 67.8% bf16 MFU | 260373 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.402345 +step 6622/18794 | loss 3.121382 (-1.82z)| norm 0.4162 (+2.40z)| lr 4.78e-03 | 2017.85 ms | 68.0% bf16 MFU | 260346 tok/s +reducing beta2 to 0.9 and lr/wd by 0.948 due to grad z-score of 3.692713 +step 6623/18794 | loss 3.238351 (+1.40z)| norm 0.5229 (+3.69z)| lr 4.53e-03 | 2013.43 ms | 68.2% bf16 MFU | 260348 tok/s +step 6624/18794 | loss 3.215217 (+0.76z)| norm 0.2303 (-0.34z)| lr 4.78e-03 | 2009.93 ms | 68.3% bf16 MFU | 260373 tok/s +step 6625/18794 | loss 3.194716 (+0.19z)| norm 0.2078 (-0.65z)| lr 4.77e-03 | 2015.08 ms | 68.1% bf16 MFU | 260364 tok/s +step 6626/18794 | loss 3.182657 (-0.15z)| norm 0.1966 (-0.81z)| lr 4.77e-03 | 2019.48 ms | 68.0% bf16 MFU | 260326 tok/s +step 6627/18794 | loss 3.191286 (+0.09z)| norm 0.2535 (-0.01z)| lr 4.77e-03 | 2014.49 ms | 68.1% bf16 MFU | 260323 tok/s +step 6628/18794 | loss 3.156507 (-0.88z)| norm 0.1850 (-0.94z)| lr 4.77e-03 | 2025.34 ms | 67.8% bf16 MFU | 260250 tok/s +step 6629/18794 | loss 3.231951 (+1.18z)| norm 0.2892 (+0.51z)| lr 4.77e-03 | 2031.58 ms | 67.5% bf16 MFU | 260141 tok/s +step 6630/18794 | loss 3.169320 (-0.53z)| norm 0.2564 (+0.04z)| lr 4.77e-03 | 2013.78 ms | 68.1% bf16 MFU | 260151 tok/s +step 6631/18794 | loss 3.205843 (+0.46z)| norm 0.1800 (-1.04z)| lr 4.77e-03 | 2016.97 ms | 68.0% bf16 MFU | 260141 tok/s +step 6632/18794 | loss 3.187658 (-0.04z)| norm 0.2311 (-0.31z)| lr 4.77e-03 | 2021.68 ms | 67.9% bf16 MFU | 260100 tok/s +step 6633/18794 | loss 3.150501 (-1.06z)| norm 0.2033 (-0.69z)| lr 4.77e-03 | 2009.37 ms | 68.3% bf16 MFU | 260141 tok/s +step 6634/18794 | loss 3.189340 (+0.02z)| norm 0.1850 (-0.96z)| lr 4.77e-03 | 1997.82 ms | 68.7% bf16 MFU | 260256 tok/s +step 6635/18794 | loss 3.250814 (+1.73z)| norm 0.2270 (-0.36z)| lr 4.77e-03 | 2017.84 ms | 68.0% bf16 MFU | 260234 tok/s +step 6636/18794 | loss 3.186096 (-0.09z)| norm 0.2209 (-0.45z)| lr 4.77e-03 | 2003.03 ms | 68.5% bf16 MFU | 260310 tok/s +step 6637/18794 | loss 3.198704 (+0.25z)| norm 0.1845 (-0.96z)| lr 4.77e-03 | 2021.01 ms | 67.9% bf16 MFU | 260265 tok/s +step 6638/18794 | loss 3.239237 (+1.37z)| norm 0.2038 (-0.67z)| lr 4.77e-03 | 2036.51 ms | 67.4% bf16 MFU | 260124 tok/s +step 6639/18794 | loss 3.173195 (-0.48z)| norm 0.2994 (+0.69z)| lr 4.77e-03 | 2024.54 ms | 67.8% bf16 MFU | 260066 tok/s +step 6640/18794 | loss 3.220504 (+0.84z)| norm 0.3777 (+1.75z)| lr 4.77e-03 | 2005.06 ms | 68.4% bf16 MFU | 260137 tok/s +step 6641/18794 | loss 3.177933 (-0.39z)| norm 0.2361 (-0.24z)| lr 4.77e-03 | 2000.48 ms | 68.6% bf16 MFU | 260234 tok/s +reducing beta2 to 0.9 and lr/wd by 0.816 due to grad z-score of 4.289346 +step 6642/18794 | loss 3.176886 (-0.44z)| norm 0.5947 (+4.29z)| lr 3.89e-03 | 2021.03 ms | 67.9% bf16 MFU | 260194 tok/s +step 6643/18794 | loss 3.212158 (+0.58z)| norm 0.2712 (+0.18z)| lr 4.77e-03 | 2041.46 ms | 67.2% bf16 MFU | 260025 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.587854 +step 6644/18794 | loss 3.226208 (+1.00z)| norm 0.4699 (+2.59z)| lr 4.77e-03 | 2014.58 ms | 68.1% bf16 MFU | 260036 tok/s +step 6645/18794 | loss 3.180030 (-0.35z)| norm 0.4060 (+1.77z)| lr 4.77e-03 | 2033.34 ms | 67.5% bf16 MFU | 259926 tok/s +step 6646/18794 | loss 3.164432 (-0.80z)| norm 0.2398 (-0.24z)| lr 4.77e-03 | 1992.56 ms | 68.9% bf16 MFU | 260086 tok/s +step 6647/18794 | loss 3.165239 (-0.78z)| norm 0.2871 (+0.32z)| lr 4.77e-03 | 2020.60 ms | 67.9% bf16 MFU | 260056 tok/s +step 6648/18794 | loss 3.146305 (-1.31z)| norm 0.2224 (-0.46z)| lr 4.77e-03 | 2015.42 ms | 68.1% bf16 MFU | 260060 tok/s +step 6649/18794 | loss 3.198399 (+0.21z)| norm 0.1821 (-0.93z)| lr 4.77e-03 | 2036.86 ms | 67.4% bf16 MFU | 259927 tok/s +step 6650/18794 | loss 3.150074 (-1.19z)| norm 0.3296 (+0.84z)| lr 4.76e-03 | 2000.35 ms | 68.6% bf16 MFU | 260035 tok/s +step 6651/18794 | loss 3.209509 (+0.52z)| norm 0.1703 (-1.07z)| lr 4.76e-03 | 2018.75 ms | 68.0% bf16 MFU | 260019 tok/s +step 6652/18794 | loss 3.230056 (+1.11z)| norm 0.3550 (+1.12z)| lr 4.76e-03 | 2016.62 ms | 68.1% bf16 MFU | 260017 tok/s +step 6653/18794 | loss 3.123489 (-1.92z)| norm 0.3570 (+1.12z)| lr 4.76e-03 | 2001.58 ms | 68.6% bf16 MFU | 260113 tok/s +step 6654/18794 | loss 3.172098 (-0.54z)| norm 0.2324 (-0.37z)| lr 4.76e-03 | 2015.83 ms | 68.1% bf16 MFU | 260112 tok/s +step 6655/18794 | loss 3.225376 (+0.95z)| norm 0.4080 (+1.69z)| lr 4.76e-03 | 2010.60 ms | 68.3% bf16 MFU | 260144 tok/s +step 6656/18794 | loss 3.149771 (-1.16z)| norm 0.2680 (+0.03z)| lr 4.76e-03 | 2000.84 ms | 68.6% bf16 MFU | 260239 tok/s +step 6657/18794 | loss 3.155898 (-0.98z)| norm 0.2363 (-0.34z)| lr 4.76e-03 | 2024.54 ms | 67.8% bf16 MFU | 260175 tok/s +step 6658/18794 | loss 3.186455 (-0.11z)| norm 0.3008 (+0.42z)| lr 4.76e-03 | 2025.13 ms | 67.8% bf16 MFU | 260111 tok/s +step 6659/18794 | loss 3.240120 (+1.38z)| norm 0.2773 (+0.13z)| lr 4.76e-03 | 2003.90 ms | 68.5% bf16 MFU | 260187 tok/s +step 6660/18794 | loss 3.204971 (+0.38z)| norm 0.2375 (-0.35z)| lr 4.76e-03 | 2011.17 ms | 68.2% bf16 MFU | 260212 tok/s +step 6661/18794 | loss 3.185183 (-0.18z)| norm 0.2294 (-0.45z)| lr 4.76e-03 | 1990.89 ms | 68.9% bf16 MFU | 260369 tok/s +step 6662/18794 | loss 3.195241 (+0.10z)| norm 0.2040 (-0.76z)| lr 4.76e-03 | 2010.57 ms | 68.3% bf16 MFU | 260389 tok/s +step 6663/18794 | loss 3.196207 (+0.12z)| norm 0.2173 (-0.60z)| lr 4.76e-03 | 2004.43 ms | 68.5% bf16 MFU | 260447 tok/s +step 6664/18794 | loss 3.242820 (+1.41z)| norm 0.2943 (+0.32z)| lr 4.76e-03 | 2018.75 ms | 68.0% bf16 MFU | 260410 tok/s +step 6665/18794 | loss 3.220344 (+0.76z)| norm 0.2083 (-0.73z)| lr 4.76e-03 | 2009.48 ms | 68.3% bf16 MFU | 260435 tok/s +step 6666/18794 | loss 3.159408 (-0.93z)| norm 0.1806 (-1.06z)| lr 4.76e-03 | 2001.16 ms | 68.6% bf16 MFU | 260513 tok/s +step 6667/18794 | loss 3.176061 (-0.46z)| norm 0.1534 (-1.38z)| lr 4.76e-03 | 2005.97 ms | 68.4% bf16 MFU | 260556 tok/s +step 6668/18794 | loss 3.179420 (-0.36z)| norm 0.2000 (-0.82z)| lr 4.76e-03 | 2008.20 ms | 68.3% bf16 MFU | 260582 tok/s +step 6669/18794 | loss 3.194574 (+0.06z)| norm 0.1742 (-1.12z)| lr 4.76e-03 | 2014.96 ms | 68.1% bf16 MFU | 260562 tok/s +step 6670/18794 | loss 3.221148 (+0.80z)| norm 0.1941 (-0.87z)| lr 4.76e-03 | 2042.66 ms | 67.2% bf16 MFU | 260368 tok/s +step 6671/18794 | loss 3.177540 (-0.43z)| norm 0.2211 (-0.54z)| lr 4.76e-03 | 2014.23 ms | 68.1% bf16 MFU | 260364 tok/s +step 6672/18794 | loss 3.211574 (+0.53z)| norm 0.2252 (-0.47z)| lr 4.76e-03 | 2007.11 ms | 68.4% bf16 MFU | 260406 tok/s +step 6673/18794 | loss 3.151456 (-1.16z)| norm 0.1973 (-0.80z)| lr 4.76e-03 | 2017.80 ms | 68.0% bf16 MFU | 260378 tok/s +step 6674/18794 | loss 3.172401 (-0.55z)| norm 0.2175 (-0.55z)| lr 4.76e-03 | 2019.28 ms | 68.0% bf16 MFU | 260341 tok/s +step 6675/18794 | loss 3.153335 (-1.10z)| norm 0.2339 (-0.34z)| lr 4.75e-03 | 2018.98 ms | 68.0% bf16 MFU | 260308 tok/s +step 6676/18794 | loss 3.167899 (-0.70z)| norm 0.1818 (-0.96z)| lr 4.75e-03 | 2014.22 ms | 68.1% bf16 MFU | 260307 tok/s +step 6677/18794 | loss 3.193680 (+0.06z)| norm 0.2232 (-0.46z)| lr 4.75e-03 | 2006.74 ms | 68.4% bf16 MFU | 260355 tok/s +step 6678/18794 | loss 3.148951 (-1.25z)| norm 0.1626 (-1.19z)| lr 4.75e-03 | 2010.26 ms | 68.3% bf16 MFU | 260377 tok/s +step 6679/18794 | loss 3.164055 (-0.82z)| norm 0.1610 (-1.18z)| lr 4.75e-03 | 2025.96 ms | 67.7% bf16 MFU | 260298 tok/s +step 6680/18794 | loss 3.227083 (+1.04z)| norm 0.1891 (-0.83z)| lr 4.75e-03 | 2021.16 ms | 67.9% bf16 MFU | 260253 tok/s +step 6681/18794 | loss 3.186165 (-0.20z)| norm 0.2990 (+0.47z)| lr 4.75e-03 | 2013.43 ms | 68.2% bf16 MFU | 260260 tok/s +step 6682/18794 | loss 3.182942 (-0.30z)| norm 0.2427 (-0.19z)| lr 4.75e-03 | 2009.29 ms | 68.3% bf16 MFU | 260294 tok/s +step 6683/18794 | loss 3.145319 (-1.44z)| norm 0.1640 (-1.12z)| lr 4.75e-03 | 2030.79 ms | 67.6% bf16 MFU | 260187 tok/s +step 6684/18794 | loss 3.160606 (-0.98z)| norm 0.1703 (-1.03z)| lr 4.75e-03 | 2018.88 ms | 68.0% bf16 MFU | 260163 tok/s +step 6685/18794 | loss 3.188805 (-0.11z)| norm 0.1744 (-0.97z)| lr 4.75e-03 | 2022.70 ms | 67.8% bf16 MFU | 260115 tok/s +step 6686/18794 | loss 3.178025 (-0.43z)| norm 0.2508 (-0.06z)| lr 4.75e-03 | 2011.36 ms | 68.2% bf16 MFU | 260142 tok/s +step 6687/18794 | loss 3.107399 (-2.50z)| norm 0.2558 (+0.01z)| lr 4.75e-03 | 1999.72 ms | 68.6% bf16 MFU | 260244 tok/s +step 6688/18794 | loss 3.098268 (-2.70z)| norm 0.1979 (-0.67z)| lr 4.75e-03 | 2035.05 ms | 67.4% bf16 MFU | 260113 tok/s +step 6689/18794 | loss 3.149122 (-1.18z)| norm 0.2025 (-0.62z)| lr 4.75e-03 | 2028.61 ms | 67.6% bf16 MFU | 260030 tok/s +step 6690/18794 | loss 3.195657 (+0.18z)| norm 0.1831 (-0.84z)| lr 4.75e-03 | 2017.99 ms | 68.0% bf16 MFU | 260019 tok/s +step 6691/18794 | loss 3.201994 (+0.36z)| norm 0.1886 (-0.75z)| lr 4.75e-03 | 2012.04 ms | 68.2% bf16 MFU | 260047 tok/s +step 6692/18794 | loss 3.146630 (-1.26z)| norm 0.2189 (-0.38z)| lr 4.75e-03 | 2011.07 ms | 68.2% bf16 MFU | 260079 tok/s +step 6693/18794 | loss 3.187489 (-0.05z)| norm 0.2024 (-0.57z)| lr 4.75e-03 | 2017.63 ms | 68.0% bf16 MFU | 260068 tok/s +step 6694/18794 | loss 3.131290 (-1.67z)| norm 0.1940 (-0.66z)| lr 4.75e-03 | 2008.54 ms | 68.3% bf16 MFU | 260116 tok/s +step 6695/18794 | loss 3.136505 (-1.49z)| norm 0.2265 (-0.24z)| lr 4.75e-03 | 2026.75 ms | 67.7% bf16 MFU | 260044 tok/s +step 6696/18794 | loss 3.166734 (-0.59z)| norm 0.1958 (-0.62z)| lr 4.75e-03 | 2025.67 ms | 67.7% bf16 MFU | 259983 tok/s +step 6697/18794 | loss 3.207808 (+0.61z)| norm 0.2029 (-0.53z)| lr 4.75e-03 | 2014.51 ms | 68.1% bf16 MFU | 259997 tok/s +step 6698/18794 | loss 3.146730 (-1.17z)| norm 0.2256 (-0.23z)| lr 4.75e-03 | 2014.94 ms | 68.1% bf16 MFU | 260007 tok/s +step 6699/18794 | loss 3.149741 (-1.07z)| norm 0.2161 (-0.35z)| lr 4.75e-03 | 2021.45 ms | 67.9% bf16 MFU | 259975 tok/s +step 6700/18794 | loss 3.166897 (-0.55z)| norm 0.1884 (-0.70z)| lr 4.74e-03 | 2027.03 ms | 67.7% bf16 MFU | 259909 tok/s +step 6701/18794 | loss 3.069089 (-3.27z)| norm 0.1795 (-0.82z)| lr 4.74e-03 | 1997.95 ms | 68.7% bf16 MFU | 260034 tok/s +step 6702/18794 | loss 3.199657 (+0.48z)| norm 0.2718 (+0.37z)| lr 4.74e-03 | 2019.51 ms | 68.0% bf16 MFU | 260013 tok/s +step 6703/18794 | loss 3.167383 (-0.46z)| norm 0.1766 (-0.84z)| lr 4.74e-03 | 2029.47 ms | 67.6% bf16 MFU | 259929 tok/s +step 6704/18794 | loss 3.196935 (+0.42z)| norm 0.3300 (+1.10z)| lr 4.74e-03 | 2010.21 ms | 68.3% bf16 MFU | 259973 tok/s +reducing beta2 to 0.9 and lr/wd by 0.865 due to grad z-score of 4.045813 +step 6705/18794 | loss 3.134280 (-1.40z)| norm 0.5940 (+4.05z)| lr 4.10e-03 | 2019.67 ms | 67.9% bf16 MFU | 259954 tok/s +step 6706/18794 | loss 3.232228 (+1.48z)| norm 0.3163 (+0.79z)| lr 4.74e-03 | 2016.00 ms | 68.1% bf16 MFU | 259959 tok/s +step 6707/18794 | loss 3.203524 (+0.62z)| norm 0.2508 (+0.04z)| lr 4.74e-03 | 2015.33 ms | 68.1% bf16 MFU | 259969 tok/s +step 6708/18794 | loss 3.206399 (+0.70z)| norm 0.3351 (+1.05z)| lr 4.74e-03 | 2015.86 ms | 68.1% bf16 MFU | 259975 tok/s +step 6709/18794 | loss 3.221057 (+1.12z)| norm 0.3102 (+0.74z)| lr 4.74e-03 | 2016.00 ms | 68.1% bf16 MFU | 259979 tok/s +step 6710/18794 | loss 3.195667 (+0.40z)| norm 0.2091 (-0.45z)| lr 4.74e-03 | 2016.47 ms | 68.1% bf16 MFU | 259980 tok/s +step 6711/18794 | loss 3.185316 (+0.10z)| norm 0.3344 (+1.01z)| lr 4.74e-03 | 2034.22 ms | 67.5% bf16 MFU | 259868 tok/s +step 6712/18794 | loss 3.136517 (-1.32z)| norm 0.2458 (-0.04z)| lr 4.74e-03 | 2011.73 ms | 68.2% bf16 MFU | 259905 tok/s +step 6713/18794 | loss 3.235253 (+1.55z)| norm 0.2377 (-0.14z)| lr 4.74e-03 | 2010.93 ms | 68.2% bf16 MFU | 259946 tok/s +step 6714/18794 | loss 3.206984 (+0.71z)| norm 0.3211 (+0.83z)| lr 4.74e-03 | 2012.85 ms | 68.2% bf16 MFU | 259972 tok/s +step 6715/18794 | loss 3.178113 (-0.12z)| norm 0.2232 (-0.32z)| lr 4.74e-03 | 2034.12 ms | 67.5% bf16 MFU | 259861 tok/s +step 6716/18794 | loss 3.208034 (+0.73z)| norm 0.1786 (-0.84z)| lr 4.74e-03 | 2010.47 ms | 68.3% bf16 MFU | 259907 tok/s +step 6717/18794 | loss 3.169006 (-0.40z)| norm 0.2351 (-0.18z)| lr 4.74e-03 | 2009.96 ms | 68.3% bf16 MFU | 259954 tok/s +step 6718/18794 | loss 3.130959 (-1.48z)| norm 0.2624 (+0.15z)| lr 4.74e-03 | 2012.18 ms | 68.2% bf16 MFU | 259984 tok/s +step 6719/18794 | loss 3.173025 (-0.24z)| norm 0.1710 (-0.93z)| lr 4.74e-03 | 2003.15 ms | 68.5% bf16 MFU | 260071 tok/s +step 6720/18794 | loss 3.198129 (+0.49z)| norm 0.1992 (-0.58z)| lr 4.74e-03 | 2017.74 ms | 68.0% bf16 MFU | 260060 tok/s +step 6721/18794 | loss 3.165430 (-0.49z)| norm 0.2540 (+0.06z)| lr 4.74e-03 | 2026.76 ms | 67.7% bf16 MFU | 259991 tok/s +step 6722/18794 | loss 3.188967 (+0.19z)| norm 0.3230 (+0.91z)| lr 4.74e-03 | 2010.20 ms | 68.3% bf16 MFU | 260032 tok/s +step 6723/18794 | loss 3.173409 (-0.26z)| norm 0.2589 (+0.18z)| lr 4.74e-03 | 2008.29 ms | 68.3% bf16 MFU | 260083 tok/s +step 6724/18794 | loss 3.196821 (+0.46z)| norm 0.2196 (-0.32z)| lr 4.74e-03 | 2020.32 ms | 67.9% bf16 MFU | 260055 tok/s +step 6725/18794 | loss 3.185948 (+0.13z)| norm 0.3051 (+0.76z)| lr 4.73e-03 | 2001.70 ms | 68.6% bf16 MFU | 260148 tok/s +step 6726/18794 | loss 3.145662 (-1.09z)| norm 0.2968 (+0.64z)| lr 4.73e-03 | 2015.50 ms | 68.1% bf16 MFU | 260147 tok/s +step 6727/18794 | loss 3.183546 (+0.07z)| norm 0.1969 (-0.63z)| lr 4.73e-03 | 2016.02 ms | 68.1% bf16 MFU | 260143 tok/s +step 6728/18794 | loss 3.157456 (-0.72z)| norm 0.2778 (+0.39z)| lr 4.73e-03 | 2008.29 ms | 68.3% bf16 MFU | 260189 tok/s +step 6729/18794 | loss 3.131147 (-1.50z)| norm 0.3082 (+0.78z)| lr 4.73e-03 | 2017.37 ms | 68.0% bf16 MFU | 260173 tok/s +step 6730/18794 | loss 3.202020 (+0.66z)| norm 0.2848 (+0.47z)| lr 4.73e-03 | 2025.89 ms | 67.7% bf16 MFU | 260104 tok/s +step 6731/18794 | loss 3.206069 (+0.78z)| norm 0.2003 (-0.61z)| lr 4.73e-03 | 2020.44 ms | 67.9% bf16 MFU | 260074 tok/s +step 6732/18794 | loss 3.166675 (-0.41z)| norm 0.2275 (-0.26z)| lr 4.73e-03 | 2009.71 ms | 68.3% bf16 MFU | 260114 tok/s +step 6733/18794 | loss 3.200319 (+0.60z)| norm 0.2548 (+0.08z)| lr 4.73e-03 | 2005.34 ms | 68.4% bf16 MFU | 260181 tok/s +step 6734/18794 | loss 3.208199 (+0.83z)| norm 0.2395 (-0.12z)| lr 4.73e-03 | 2009.44 ms | 68.3% bf16 MFU | 260217 tok/s +step 6735/18794 | loss 3.206133 (+0.80z)| norm 0.1583 (-1.15z)| lr 4.73e-03 | 2009.60 ms | 68.3% bf16 MFU | 260251 tok/s +step 6736/18794 | loss 3.189196 (+0.27z)| norm 0.2044 (-0.56z)| lr 4.73e-03 | 2004.42 ms | 68.5% bf16 MFU | 260317 tok/s +step 6737/18794 | loss 3.242574 (+1.88z)| norm 0.1951 (-0.68z)| lr 4.73e-03 | 2033.45 ms | 67.5% bf16 MFU | 260192 tok/s +step 6738/18794 | loss 3.191147 (+0.33z)| norm 0.2550 (+0.08z)| lr 4.73e-03 | 2006.05 ms | 68.4% bf16 MFU | 260250 tok/s +step 6739/18794 | loss 3.185133 (+0.14z)| norm 0.1978 (-0.64z)| lr 4.73e-03 | 2027.37 ms | 67.7% bf16 MFU | 260168 tok/s +step 6740/18794 | loss 3.167804 (-0.38z)| norm 0.3647 (+1.50z)| lr 4.73e-03 | 1996.18 ms | 68.7% bf16 MFU | 260292 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.964017 +step 6741/18794 | loss 3.112478 (-2.05z)| norm 0.4927 (+2.96z)| lr 4.73e-03 | 2019.26 ms | 68.0% bf16 MFU | 260260 tok/s +step 6742/18794 | loss 3.212257 (+0.99z)| norm 0.3539 (+1.42z)| lr 4.73e-03 | 2026.76 ms | 67.7% bf16 MFU | 260181 tok/s +step 6743/18794 | loss 3.183878 (+0.13z)| norm 0.1850 (-0.82z)| lr 4.73e-03 | 2002.97 ms | 68.5% bf16 MFU | 260260 tok/s +step 6744/18794 | loss 3.170248 (-0.27z)| norm 0.2984 (+0.74z)| lr 4.73e-03 | 2026.39 ms | 67.7% bf16 MFU | 260183 tok/s +step 6745/18794 | loss 3.156264 (-0.69z)| norm 0.2727 (+0.41z)| lr 4.73e-03 | 2026.11 ms | 67.7% bf16 MFU | 260112 tok/s +step 6746/18794 | loss 3.152241 (-0.81z)| norm 0.1505 (-1.31z)| lr 4.73e-03 | 2026.35 ms | 67.7% bf16 MFU | 260043 tok/s +step 6747/18794 | loss 3.226315 (+1.44z)| norm 0.1939 (-0.68z)| lr 4.73e-03 | 2017.48 ms | 68.0% bf16 MFU | 260035 tok/s +step 6748/18794 | loss 3.221277 (+1.26z)| norm 0.2226 (-0.27z)| lr 4.73e-03 | 2027.32 ms | 67.7% bf16 MFU | 259964 tok/s +step 6749/18794 | loss 3.136637 (-1.29z)| norm 0.1760 (-0.93z)| lr 4.72e-03 | 2001.52 ms | 68.6% bf16 MFU | 260063 tok/s +step 6750/18794 | loss 3.173418 (-0.19z)| norm 0.2449 (+0.05z)| lr 4.72e-03 | 2009.45 ms | 68.3% bf16 MFU | 260105 tok/s +val loss 3.181049 +HellaSwag: 2920/10042 = 0.290779: 0/1256 +step 6751/18794 | loss 3.209195 (+0.90z)| norm 0.2299 (-0.17z)| lr 4.72e-03 | 2016.83 ms | 68.0% bf16 MFU | 260098 tok/s +step 6752/18794 | loss 3.223105 (+1.33z)| norm 0.2361 (-0.06z)| lr 4.72e-03 | 2019.49 ms | 68.0% bf16 MFU | 260073 tok/s +step 6753/18794 | loss 3.185097 (+0.15z)| norm 0.2867 (+0.69z)| lr 4.72e-03 | 2012.16 ms | 68.2% bf16 MFU | 260098 tok/s +step 6754/18794 | loss 3.126799 (-1.61z)| norm 0.2139 (-0.38z)| lr 4.72e-03 | 2001.36 ms | 68.6% bf16 MFU | 260191 tok/s +step 6755/18794 | loss 3.138813 (-1.22z)| norm 0.1841 (-0.80z)| lr 4.72e-03 | 2003.00 ms | 68.5% bf16 MFU | 260269 tok/s +step 6756/18794 | loss 3.139675 (-1.19z)| norm 0.2005 (-0.55z)| lr 4.72e-03 | 2018.67 ms | 68.0% bf16 MFU | 260242 tok/s +step 6757/18794 | loss 3.243829 (+1.92z)| norm 0.2172 (-0.29z)| lr 4.72e-03 | 2012.79 ms | 68.2% bf16 MFU | 260254 tok/s +step 6758/18794 | loss 3.181443 (+0.06z)| norm 0.2036 (-0.48z)| lr 4.72e-03 | 2002.90 ms | 68.5% bf16 MFU | 260329 tok/s +step 6759/18794 | loss 3.233738 (+1.63z)| norm 0.2582 (+0.35z)| lr 4.72e-03 | 2026.87 ms | 67.7% bf16 MFU | 260246 tok/s +step 6760/18794 | loss 3.170592 (-0.26z)| norm 0.2576 (+0.33z)| lr 4.72e-03 | 2004.92 ms | 68.4% bf16 MFU | 260309 tok/s +step 6761/18794 | loss 3.207673 (+0.85z)| norm 0.2486 (+0.20z)| lr 4.72e-03 | 2004.24 ms | 68.5% bf16 MFU | 260373 tok/s +step 6762/18794 | loss 3.158910 (-0.60z)| norm 0.1983 (-0.56z)| lr 4.72e-03 | 2015.92 ms | 68.1% bf16 MFU | 260358 tok/s +step 6763/18794 | loss 3.154691 (-0.72z)| norm 0.2015 (-0.51z)| lr 4.72e-03 | 2020.11 ms | 67.9% bf16 MFU | 260317 tok/s +step 6764/18794 | loss 3.161997 (-0.48z)| norm 0.2336 (-0.02z)| lr 4.72e-03 | 1995.56 ms | 68.8% bf16 MFU | 260437 tok/s +step 6765/18794 | loss 3.183787 (+0.20z)| norm 0.2177 (-0.26z)| lr 4.72e-03 | 2010.80 ms | 68.2% bf16 MFU | 260452 tok/s +step 6766/18794 | loss 3.184537 (+0.21z)| norm 0.1677 (-1.02z)| lr 4.72e-03 | 2027.29 ms | 67.7% bf16 MFU | 260360 tok/s +step 6767/18794 | loss 3.161291 (-0.50z)| norm 0.2015 (-0.52z)| lr 4.72e-03 | 1987.53 ms | 69.0% bf16 MFU | 260532 tok/s +step 6768/18794 | loss 3.176676 (-0.02z)| norm 0.1788 (-0.86z)| lr 4.72e-03 | 2003.29 ms | 68.5% bf16 MFU | 260591 tok/s +step 6769/18794 | loss 3.119565 (-1.73z)| norm 0.1702 (-0.99z)| lr 4.72e-03 | 2013.09 ms | 68.2% bf16 MFU | 260583 tok/s +step 6770/18794 | loss 3.142172 (-1.03z)| norm 0.1852 (-0.76z)| lr 4.72e-03 | 2035.64 ms | 67.4% bf16 MFU | 260432 tok/s +step 6771/18794 | loss 3.183467 (+0.23z)| norm 0.1814 (-0.81z)| lr 4.72e-03 | 2005.60 ms | 68.4% bf16 MFU | 260481 tok/s +step 6772/18794 | loss 3.132246 (-1.30z)| norm 0.2254 (-0.14z)| lr 4.72e-03 | 1989.33 ms | 69.0% bf16 MFU | 260634 tok/s +step 6773/18794 | loss 3.158593 (-0.51z)| norm 0.2113 (-0.36z)| lr 4.72e-03 | 2010.16 ms | 68.3% bf16 MFU | 260644 tok/s +step 6774/18794 | loss 3.222909 (+1.42z)| norm 0.3584 (+1.82z)| lr 4.71e-03 | 2018.88 ms | 68.0% bf16 MFU | 260596 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.145283 +step 6775/18794 | loss 3.157584 (-0.55z)| norm 0.3850 (+2.15z)| lr 4.71e-03 | 1987.83 ms | 69.0% bf16 MFU | 260754 tok/s +step 6776/18794 | loss 3.191792 (+0.47z)| norm 0.1775 (-0.87z)| lr 4.71e-03 | 2011.36 ms | 68.2% bf16 MFU | 260749 tok/s +step 6777/18794 | loss 3.190495 (+0.44z)| norm 0.3439 (+1.51z)| lr 4.71e-03 | 2012.46 ms | 68.2% bf16 MFU | 260738 tok/s +step 6778/18794 | loss 3.122591 (-1.59z)| norm 0.3588 (+1.68z)| lr 4.71e-03 | 2006.69 ms | 68.4% bf16 MFU | 260764 tok/s +step 6779/18794 | loss 3.147370 (-0.84z)| norm 0.2051 (-0.52z)| lr 4.71e-03 | 2012.74 ms | 68.2% bf16 MFU | 260750 tok/s +step 6780/18794 | loss 3.161426 (-0.41z)| norm 0.2977 (+0.79z)| lr 4.71e-03 | 2017.89 ms | 68.0% bf16 MFU | 260704 tok/s +step 6781/18794 | loss 3.175910 (+0.03z)| norm 0.3167 (+1.06z)| lr 4.71e-03 | 2025.78 ms | 67.7% bf16 MFU | 260609 tok/s +step 6782/18794 | loss 3.149135 (-0.76z)| norm 0.2284 (-0.20z)| lr 4.71e-03 | 2011.63 ms | 68.2% bf16 MFU | 260610 tok/s +step 6783/18794 | loss 3.211117 (+1.08z)| norm 0.2392 (-0.06z)| lr 4.71e-03 | 2027.11 ms | 67.7% bf16 MFU | 260511 tok/s +step 6784/18794 | loss 3.176775 (+0.04z)| norm 0.1627 (-1.15z)| lr 4.71e-03 | 2003.83 ms | 68.5% bf16 MFU | 260568 tok/s +step 6785/18794 | loss 3.218110 (+1.27z)| norm 0.3552 (+1.57z)| lr 4.71e-03 | 2017.42 ms | 68.0% bf16 MFU | 260534 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.928788 +step 6786/18794 | loss 3.100047 (-2.17z)| norm 0.4618 (+2.93z)| lr 4.71e-03 | 2013.08 ms | 68.2% bf16 MFU | 260529 tok/s +step 6787/18794 | loss 3.133573 (-1.22z)| norm 0.2271 (-0.27z)| lr 4.71e-03 | 2025.16 ms | 67.8% bf16 MFU | 260447 tok/s +step 6788/18794 | loss 3.207632 (+0.95z)| norm 0.2715 (+0.33z)| lr 4.71e-03 | 2023.42 ms | 67.8% bf16 MFU | 260380 tok/s +step 6789/18794 | loss 3.173077 (-0.10z)| norm 0.2195 (-0.38z)| lr 4.71e-03 | 2007.07 ms | 68.4% bf16 MFU | 260422 tok/s +step 6790/18794 | loss 3.211406 (+1.05z)| norm 0.1968 (-0.70z)| lr 4.71e-03 | 2011.89 ms | 68.2% bf16 MFU | 260431 tok/s +step 6791/18794 | loss 3.142676 (-1.00z)| norm 0.2383 (-0.14z)| lr 4.71e-03 | 2020.75 ms | 67.9% bf16 MFU | 260382 tok/s +step 6792/18794 | loss 3.209826 (+0.99z)| norm 0.2562 (+0.10z)| lr 4.71e-03 | 2007.68 ms | 68.4% bf16 MFU | 260420 tok/s +step 6793/18794 | loss 3.208814 (+0.95z)| norm 0.1665 (-1.12z)| lr 4.71e-03 | 2015.50 ms | 68.1% bf16 MFU | 260405 tok/s +step 6794/18794 | loss 3.173968 (-0.10z)| norm 0.2316 (-0.23z)| lr 4.71e-03 | 2019.40 ms | 68.0% bf16 MFU | 260366 tok/s +step 6795/18794 | loss 3.144370 (-0.99z)| norm 0.2327 (-0.22z)| lr 4.71e-03 | 2012.25 ms | 68.2% bf16 MFU | 260375 tok/s +step 6796/18794 | loss 3.180403 (+0.09z)| norm 0.1753 (-1.00z)| lr 4.71e-03 | 2022.84 ms | 67.8% bf16 MFU | 260316 tok/s +step 6797/18794 | loss 3.153340 (-0.71z)| norm 0.1841 (-0.88z)| lr 4.71e-03 | 2022.09 ms | 67.9% bf16 MFU | 260264 tok/s +step 6798/18794 | loss 3.128256 (-1.46z)| norm 0.2469 (-0.02z)| lr 4.71e-03 | 2011.39 ms | 68.2% bf16 MFU | 260284 tok/s +step 6799/18794 | loss 3.149981 (-0.80z)| norm 0.2107 (-0.51z)| lr 4.70e-03 | 2024.38 ms | 67.8% bf16 MFU | 260219 tok/s +step 6800/18794 | loss 3.182350 (+0.16z)| norm 0.1997 (-0.67z)| lr 4.70e-03 | 2016.05 ms | 68.1% bf16 MFU | 260211 tok/s +step 6801/18794 | loss 3.209311 (+0.98z)| norm 0.2544 (+0.07z)| lr 4.70e-03 | 2007.62 ms | 68.4% bf16 MFU | 260258 tok/s +step 6802/18794 | loss 3.171699 (-0.20z)| norm 0.3255 (+1.03z)| lr 4.70e-03 | 2020.63 ms | 67.9% bf16 MFU | 260218 tok/s +step 6803/18794 | loss 3.134327 (-1.36z)| norm 0.2596 (+0.12z)| lr 4.70e-03 | 2016.30 ms | 68.1% bf16 MFU | 260209 tok/s +step 6804/18794 | loss 3.146620 (-0.96z)| norm 0.2841 (+0.47z)| lr 4.70e-03 | 1997.58 ms | 68.7% bf16 MFU | 260321 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.219068 +step 6805/18794 | loss 3.174376 (-0.10z)| norm 0.3931 (+2.22z)| lr 4.70e-03 | 2024.94 ms | 67.8% bf16 MFU | 260251 tok/s +step 6806/18794 | loss 3.184488 (+0.24z)| norm 0.1965 (-0.78z)| lr 4.70e-03 | 2014.41 ms | 68.1% bf16 MFU | 260252 tok/s +reducing beta2 to 0.9 and lr/wd by 0.826 due to grad z-score of 4.234892 +step 6807/18794 | loss 3.144027 (-1.04z)| norm 0.5549 (+4.23z)| lr 3.89e-03 | 2007.37 ms | 68.4% bf16 MFU | 260298 tok/s +step 6808/18794 | loss 3.209791 (+1.06z)| norm 0.2103 (-0.54z)| lr 4.70e-03 | 2019.91 ms | 67.9% bf16 MFU | 260261 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.220515 +step 6809/18794 | loss 3.154185 (-0.70z)| norm 0.4933 (+3.22z)| lr 4.70e-03 | 2008.65 ms | 68.3% bf16 MFU | 260299 tok/s +step 6810/18794 | loss 3.176105 (+0.01z)| norm 0.3732 (+1.59z)| lr 4.70e-03 | 1998.32 ms | 68.7% bf16 MFU | 260402 tok/s +step 6811/18794 | loss 3.170336 (-0.17z)| norm 0.2219 (-0.39z)| lr 4.70e-03 | 2009.84 ms | 68.3% bf16 MFU | 260425 tok/s +step 6812/18794 | loss 3.157356 (-0.60z)| norm 0.1964 (-0.71z)| lr 4.70e-03 | 2012.29 ms | 68.2% bf16 MFU | 260431 tok/s +step 6813/18794 | loss 3.169416 (-0.19z)| norm 0.2616 (+0.14z)| lr 4.70e-03 | 2001.24 ms | 68.6% bf16 MFU | 260509 tok/s +step 6814/18794 | loss 3.174107 (-0.02z)| norm 0.2653 (+0.20z)| lr 4.70e-03 | 2009.01 ms | 68.3% bf16 MFU | 260532 tok/s +step 6815/18794 | loss 3.189412 (+0.48z)| norm 0.2039 (-0.61z)| lr 4.70e-03 | 2003.63 ms | 68.5% bf16 MFU | 260588 tok/s +step 6816/18794 | loss 3.146732 (-0.92z)| norm 0.2130 (-0.50z)| lr 4.70e-03 | 2008.74 ms | 68.3% bf16 MFU | 260609 tok/s +step 6817/18794 | loss 3.164444 (-0.33z)| norm 0.3022 (+0.67z)| lr 4.70e-03 | 2011.68 ms | 68.2% bf16 MFU | 260610 tok/s +step 6818/18794 | loss 3.166614 (-0.27z)| norm 0.2294 (-0.28z)| lr 4.70e-03 | 2001.64 ms | 68.6% bf16 MFU | 260676 tok/s +step 6819/18794 | loss 3.175899 (+0.04z)| norm 0.2013 (-0.66z)| lr 4.70e-03 | 2019.82 ms | 67.9% bf16 MFU | 260621 tok/s +step 6820/18794 | loss 3.143202 (-1.04z)| norm 0.1921 (-0.78z)| lr 4.70e-03 | 2000.39 ms | 68.6% bf16 MFU | 260694 tok/s +step 6821/18794 | loss 3.138454 (-1.18z)| norm 0.1644 (-1.13z)| lr 4.70e-03 | 2021.88 ms | 67.9% bf16 MFU | 260625 tok/s +step 6822/18794 | loss 3.159205 (-0.48z)| norm 0.1843 (-0.85z)| lr 4.70e-03 | 2020.31 ms | 67.9% bf16 MFU | 260569 tok/s +step 6823/18794 | loss 3.138229 (-1.16z)| norm 0.2071 (-0.54z)| lr 4.69e-03 | 2021.63 ms | 67.9% bf16 MFU | 260507 tok/s +step 6824/18794 | loss 3.171872 (-0.03z)| norm 0.2669 (+0.24z)| lr 4.69e-03 | 2004.02 ms | 68.5% bf16 MFU | 260563 tok/s +step 6825/18794 | loss 3.128089 (-1.46z)| norm 0.1966 (-0.67z)| lr 4.69e-03 | 1996.83 ms | 68.7% bf16 MFU | 260663 tok/s +step 6826/18794 | loss 3.153504 (-0.62z)| norm 0.3262 (+1.03z)| lr 4.69e-03 | 2010.11 ms | 68.3% bf16 MFU | 260671 tok/s +step 6827/18794 | loss 3.159732 (-0.41z)| norm 0.2463 (-0.03z)| lr 4.69e-03 | 2026.19 ms | 67.7% bf16 MFU | 260575 tok/s +step 6828/18794 | loss 3.173585 (+0.04z)| norm 0.2552 (+0.09z)| lr 4.69e-03 | 2023.88 ms | 67.8% bf16 MFU | 260499 tok/s +step 6829/18794 | loss 3.169087 (-0.12z)| norm 0.3697 (+1.58z)| lr 4.69e-03 | 2018.79 ms | 68.0% bf16 MFU | 260459 tok/s +step 6830/18794 | loss 3.188374 (+0.53z)| norm 0.2426 (-0.07z)| lr 4.69e-03 | 2012.57 ms | 68.2% bf16 MFU | 260462 tok/s +step 6831/18794 | loss 3.176467 (+0.14z)| norm 0.2266 (-0.29z)| lr 4.69e-03 | 2024.11 ms | 67.8% bf16 MFU | 260390 tok/s +step 6832/18794 | loss 3.203407 (+1.03z)| norm 0.2252 (-0.31z)| lr 4.69e-03 | 2009.31 ms | 68.3% bf16 MFU | 260417 tok/s +step 6833/18794 | loss 3.187025 (+0.49z)| norm 0.2240 (-0.32z)| lr 4.69e-03 | 2008.34 ms | 68.3% bf16 MFU | 260449 tok/s +step 6834/18794 | loss 3.140796 (-1.04z)| norm 0.2063 (-0.54z)| lr 4.69e-03 | 2004.31 ms | 68.5% bf16 MFU | 260505 tok/s +step 6835/18794 | loss 3.180063 (+0.29z)| norm 0.1930 (-0.72z)| lr 4.69e-03 | 1998.58 ms | 68.7% bf16 MFU | 260596 tok/s +step 6836/18794 | loss 3.205214 (+1.13z)| norm 0.1854 (-0.82z)| lr 4.69e-03 | 2008.49 ms | 68.3% bf16 MFU | 260618 tok/s +step 6837/18794 | loss 3.142673 (-0.97z)| norm 0.1893 (-0.77z)| lr 4.69e-03 | 2026.27 ms | 67.7% bf16 MFU | 260525 tok/s +step 6838/18794 | loss 3.199999 (+1.01z)| norm 0.2922 (+0.57z)| lr 4.69e-03 | 2017.23 ms | 68.0% bf16 MFU | 260494 tok/s +step 6839/18794 | loss 3.132505 (-1.29z)| norm 0.3780 (+1.65z)| lr 4.69e-03 | 2009.40 ms | 68.3% bf16 MFU | 260515 tok/s +step 6840/18794 | loss 3.179348 (+0.31z)| norm 0.2266 (-0.29z)| lr 4.69e-03 | 1994.04 ms | 68.8% bf16 MFU | 260635 tok/s +step 6841/18794 | loss 3.168134 (-0.10z)| norm 0.2802 (+0.46z)| lr 4.69e-03 | 2016.63 ms | 68.1% bf16 MFU | 260603 tok/s +step 6842/18794 | loss 3.205250 (+1.21z)| norm 0.2873 (+0.57z)| lr 4.69e-03 | 2010.52 ms | 68.3% bf16 MFU | 260611 tok/s +step 6843/18794 | loss 3.170077 (-0.02z)| norm 0.1894 (-0.79z)| lr 4.69e-03 | 2006.10 ms | 68.4% bf16 MFU | 260648 tok/s +step 6844/18794 | loss 3.157163 (-0.47z)| norm 0.1532 (-1.26z)| lr 4.69e-03 | 2007.79 ms | 68.3% bf16 MFU | 260672 tok/s +step 6845/18794 | loss 3.166464 (-0.15z)| norm 0.1917 (-0.72z)| lr 4.69e-03 | 2016.63 ms | 68.1% bf16 MFU | 260638 tok/s +step 6846/18794 | loss 3.191604 (+0.72z)| norm 0.1562 (-1.21z)| lr 4.69e-03 | 2018.70 ms | 68.0% bf16 MFU | 260591 tok/s +step 6847/18794 | loss 3.140016 (-1.07z)| norm 0.2079 (-0.50z)| lr 4.69e-03 | 2016.81 ms | 68.0% bf16 MFU | 260560 tok/s +step 6848/18794 | loss 3.178068 (+0.30z)| norm 0.3015 (+0.78z)| lr 4.68e-03 | 2016.90 ms | 68.0% bf16 MFU | 260529 tok/s +step 6849/18794 | loss 3.138602 (-1.13z)| norm 0.2001 (-0.62z)| lr 4.68e-03 | 2005.58 ms | 68.4% bf16 MFU | 260574 tok/s +step 6850/18794 | loss 3.204316 (+1.23z)| norm 0.2832 (+0.52z)| lr 4.68e-03 | 2018.64 ms | 68.0% bf16 MFU | 260531 tok/s +step 6851/18794 | loss 3.172263 (+0.09z)| norm 0.3110 (+0.89z)| lr 4.68e-03 | 1995.23 ms | 68.8% bf16 MFU | 260643 tok/s +step 6852/18794 | loss 3.194579 (+0.92z)| norm 0.2566 (+0.14z)| lr 4.68e-03 | 2011.02 ms | 68.2% bf16 MFU | 260646 tok/s +step 6853/18794 | loss 3.173531 (+0.15z)| norm 0.2807 (+0.47z)| lr 4.68e-03 | 2014.22 ms | 68.1% bf16 MFU | 260629 tok/s +step 6854/18794 | loss 3.171136 (+0.05z)| norm 0.2385 (-0.11z)| lr 4.68e-03 | 2021.58 ms | 67.9% bf16 MFU | 260564 tok/s +step 6855/18794 | loss 3.196617 (+0.98z)| norm 0.2026 (-0.61z)| lr 4.68e-03 | 2017.93 ms | 68.0% bf16 MFU | 260527 tok/s +step 6856/18794 | loss 3.136702 (-1.26z)| norm 0.2058 (-0.57z)| lr 4.68e-03 | 2016.73 ms | 68.0% bf16 MFU | 260499 tok/s +step 6857/18794 | loss 3.235086 (+2.44z)| norm 0.2081 (-0.53z)| lr 4.68e-03 | 2010.04 ms | 68.3% bf16 MFU | 260516 tok/s +step 6858/18794 | loss 3.147290 (-0.85z)| norm 0.2213 (-0.35z)| lr 4.68e-03 | 1999.08 ms | 68.6% bf16 MFU | 260603 tok/s +step 6859/18794 | loss 3.158744 (-0.40z)| norm 0.2586 (+0.16z)| lr 4.68e-03 | 2001.16 ms | 68.6% bf16 MFU | 260673 tok/s +step 6860/18794 | loss 3.166797 (-0.09z)| norm 0.2014 (-0.62z)| lr 4.68e-03 | 2010.95 ms | 68.2% bf16 MFU | 260675 tok/s +step 6861/18794 | loss 3.171387 (+0.10z)| norm 0.2288 (-0.24z)| lr 4.68e-03 | 2008.54 ms | 68.3% bf16 MFU | 260693 tok/s +step 6862/18794 | loss 3.155684 (-0.51z)| norm 0.1701 (-1.04z)| lr 4.68e-03 | 2013.25 ms | 68.2% bf16 MFU | 260679 tok/s +step 6863/18794 | loss 3.175928 (+0.27z)| norm 0.2088 (-0.51z)| lr 4.68e-03 | 2007.84 ms | 68.3% bf16 MFU | 260701 tok/s +step 6864/18794 | loss 3.108980 (-2.27z)| norm 0.2690 (+0.31z)| lr 4.68e-03 | 2017.71 ms | 68.0% bf16 MFU | 260658 tok/s +step 6865/18794 | loss 3.229750 (+2.26z)| norm 0.3323 (+1.16z)| lr 4.68e-03 | 2018.02 ms | 68.0% bf16 MFU | 260615 tok/s +step 6866/18794 | loss 3.176139 (+0.27z)| norm 0.2467 (-0.02z)| lr 4.68e-03 | 2025.69 ms | 67.7% bf16 MFU | 260526 tok/s +step 6867/18794 | loss 3.217049 (+1.75z)| norm 0.2644 (+0.21z)| lr 4.68e-03 | 2018.95 ms | 68.0% bf16 MFU | 260484 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.397825 +step 6868/18794 | loss 3.178467 (+0.33z)| norm 0.4303 (+2.40z)| lr 4.68e-03 | 2024.42 ms | 67.8% bf16 MFU | 260408 tok/s +step 6869/18794 | loss 3.173682 (+0.14z)| norm 0.2306 (-0.29z)| lr 4.68e-03 | 2024.89 ms | 67.8% bf16 MFU | 260334 tok/s +step 6870/18794 | loss 3.164114 (-0.23z)| norm 0.3954 (+1.88z)| lr 4.68e-03 | 2001.85 ms | 68.6% bf16 MFU | 260412 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.443747 +step 6871/18794 | loss 3.170707 (+0.03z)| norm 0.4455 (+2.44z)| lr 4.68e-03 | 2020.71 ms | 67.9% bf16 MFU | 260365 tok/s +step 6872/18794 | loss 3.176166 (+0.22z)| norm 0.2548 (-0.03z)| lr 4.67e-03 | 2008.52 ms | 68.3% bf16 MFU | 260398 tok/s +step 6873/18794 | loss 3.146864 (-0.89z)| norm 0.2114 (-0.59z)| lr 4.67e-03 | 2029.48 ms | 67.6% bf16 MFU | 260295 tok/s +step 6874/18794 | loss 3.136130 (-1.27z)| norm 0.1775 (-1.01z)| lr 4.67e-03 | 2016.99 ms | 68.0% bf16 MFU | 260277 tok/s +step 6875/18794 | loss 3.182532 (+0.49z)| norm 0.1932 (-0.79z)| lr 4.67e-03 | 2020.15 ms | 67.9% bf16 MFU | 260240 tok/s +step 6876/18794 | loss 3.214986 (+1.70z)| norm 0.2211 (-0.43z)| lr 4.67e-03 | 2015.30 ms | 68.1% bf16 MFU | 260235 tok/s +step 6877/18794 | loss 3.139041 (-1.14z)| norm 0.1945 (-0.77z)| lr 4.67e-03 | 2018.09 ms | 68.0% bf16 MFU | 260213 tok/s +step 6878/18794 | loss 3.172918 (+0.11z)| norm 0.3259 (+0.99z)| lr 4.67e-03 | 2010.15 ms | 68.3% bf16 MFU | 260244 tok/s +step 6879/18794 | loss 3.119815 (-1.88z)| norm 0.3937 (+1.84z)| lr 4.67e-03 | 2001.63 ms | 68.6% bf16 MFU | 260328 tok/s +step 6880/18794 | loss 3.163164 (-0.25z)| norm 0.1774 (-0.99z)| lr 4.67e-03 | 2025.90 ms | 67.7% bf16 MFU | 260251 tok/s +step 6881/18794 | loss 3.177655 (+0.30z)| norm 0.3218 (+0.90z)| lr 4.67e-03 | 2024.36 ms | 67.8% bf16 MFU | 260188 tok/s +step 6882/18794 | loss 3.174615 (+0.18z)| norm 0.2299 (-0.30z)| lr 4.67e-03 | 2010.84 ms | 68.2% bf16 MFU | 260215 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.024509 +step 6883/18794 | loss 3.221061 (+1.92z)| norm 0.4125 (+2.02z)| lr 4.67e-03 | 2022.84 ms | 67.8% bf16 MFU | 260164 tok/s +step 6884/18794 | loss 3.191225 (+0.79z)| norm 0.2537 (-0.02z)| lr 4.67e-03 | 2038.87 ms | 67.3% bf16 MFU | 260013 tok/s +step 6885/18794 | loss 3.191331 (+0.81z)| norm 0.2430 (-0.15z)| lr 4.67e-03 | 2005.98 ms | 68.4% bf16 MFU | 260080 tok/s +step 6886/18794 | loss 3.137913 (-1.27z)| norm 0.2522 (+0.00z)| lr 4.67e-03 | 2006.70 ms | 68.4% bf16 MFU | 260140 tok/s +step 6887/18794 | loss 3.098403 (-2.72z)| norm 0.2118 (-0.54z)| lr 4.67e-03 | 2021.35 ms | 67.9% bf16 MFU | 260101 tok/s +step 6888/18794 | loss 3.152953 (-0.63z)| norm 0.2912 (+0.53z)| lr 4.67e-03 | 2024.51 ms | 67.8% bf16 MFU | 260045 tok/s +step 6889/18794 | loss 3.136940 (-1.22z)| norm 0.1802 (-0.96z)| lr 4.67e-03 | 2021.36 ms | 67.9% bf16 MFU | 260011 tok/s +step 6890/18794 | loss 3.167866 (-0.03z)| norm 0.3445 (+1.22z)| lr 4.67e-03 | 2000.50 ms | 68.6% bf16 MFU | 260115 tok/s +step 6891/18794 | loss 3.248949 (+2.94z)| norm 0.4057 (+1.98z)| lr 4.67e-03 | 2006.15 ms | 68.4% bf16 MFU | 260176 tok/s +step 6892/18794 | loss 3.184376 (+0.56z)| norm 0.1754 (-1.03z)| lr 4.67e-03 | 2017.62 ms | 68.0% bf16 MFU | 260160 tok/s +step 6893/18794 | loss 3.219061 (+1.84z)| norm 0.2534 (-0.02z)| lr 4.67e-03 | 2005.79 ms | 68.4% bf16 MFU | 260221 tok/s +step 6894/18794 | loss 3.258530 (+3.11z)| norm 0.2292 (-0.34z)| lr 4.67e-03 | 2007.33 ms | 68.4% bf16 MFU | 260270 tok/s +step 6895/18794 | loss 3.166615 (-0.14z)| norm 0.1833 (-0.93z)| lr 4.67e-03 | 2018.60 ms | 68.0% bf16 MFU | 260242 tok/s +step 6896/18794 | loss 3.122957 (-1.65z)| norm 0.1969 (-0.76z)| lr 4.66e-03 | 2009.49 ms | 68.3% bf16 MFU | 260276 tok/s +step 6897/18794 | loss 3.205801 (+1.23z)| norm 0.2169 (-0.50z)| lr 4.66e-03 | 2018.71 ms | 68.0% bf16 MFU | 260248 tok/s +step 6898/18794 | loss 3.196898 (+0.90z)| norm 0.1955 (-0.77z)| lr 4.66e-03 | 2015.52 ms | 68.1% bf16 MFU | 260242 tok/s +step 6899/18794 | loss 3.176052 (+0.16z)| norm 0.2357 (-0.25z)| lr 4.66e-03 | 2010.23 ms | 68.3% bf16 MFU | 260270 tok/s +step 6900/18794 | loss 3.166515 (-0.17z)| norm 0.2993 (+0.57z)| lr 4.66e-03 | 2002.92 ms | 68.5% bf16 MFU | 260344 tok/s +step 6901/18794 | loss 3.144946 (-0.91z)| norm 0.2414 (-0.19z)| lr 4.66e-03 | 2015.93 ms | 68.1% bf16 MFU | 260331 tok/s +step 6902/18794 | loss 3.201420 (+1.07z)| norm 0.2088 (-0.60z)| lr 4.66e-03 | 2006.29 ms | 68.4% bf16 MFU | 260380 tok/s +step 6903/18794 | loss 3.184227 (+0.45z)| norm 0.2039 (-0.66z)| lr 4.66e-03 | 2008.07 ms | 68.3% bf16 MFU | 260416 tok/s +step 6904/18794 | loss 3.122533 (-1.70z)| norm 0.1971 (-0.73z)| lr 4.66e-03 | 2018.24 ms | 68.0% bf16 MFU | 260384 tok/s +step 6905/18794 | loss 3.199733 (+0.98z)| norm 0.2178 (-0.45z)| lr 4.66e-03 | 2012.95 ms | 68.2% bf16 MFU | 260388 tok/s +step 6906/18794 | loss 3.146936 (-0.84z)| norm 0.1780 (-0.97z)| lr 4.66e-03 | 2009.63 ms | 68.3% bf16 MFU | 260413 tok/s +step 6907/18794 | loss 3.190713 (+0.67z)| norm 0.1843 (-0.91z)| lr 4.66e-03 | 2018.33 ms | 68.0% bf16 MFU | 260380 tok/s +step 6908/18794 | loss 3.152006 (-0.67z)| norm 0.1629 (-1.21z)| lr 4.66e-03 | 2013.62 ms | 68.2% bf16 MFU | 260380 tok/s +step 6909/18794 | loss 3.116592 (-1.86z)| norm 0.2241 (-0.31z)| lr 4.66e-03 | 2018.89 ms | 68.0% bf16 MFU | 260345 tok/s +step 6910/18794 | loss 3.169217 (-0.05z)| norm 0.2975 (+0.85z)| lr 4.66e-03 | 2028.04 ms | 67.7% bf16 MFU | 260254 tok/s +step 6911/18794 | loss 3.150944 (-0.67z)| norm 0.2618 (+0.28z)| lr 4.66e-03 | 2017.44 ms | 68.0% bf16 MFU | 260235 tok/s +step 6912/18794 | loss 3.199200 (+0.97z)| norm 0.2045 (-0.62z)| lr 4.66e-03 | 2019.33 ms | 68.0% bf16 MFU | 260205 tok/s +step 6913/18794 | loss 3.130701 (-1.35z)| norm 0.1604 (-1.29z)| lr 4.66e-03 | 2009.61 ms | 68.3% bf16 MFU | 260239 tok/s +step 6914/18794 | loss 3.123911 (-1.54z)| norm 0.2996 (+0.87z)| lr 4.66e-03 | 2011.50 ms | 68.2% bf16 MFU | 260260 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.184203 +step 6915/18794 | loss 3.147597 (-0.73z)| norm 0.3891 (+2.18z)| lr 4.66e-03 | 2009.65 ms | 68.3% bf16 MFU | 260291 tok/s +step 6916/18794 | loss 3.198533 (+0.96z)| norm 0.1934 (-0.78z)| lr 4.66e-03 | 2010.19 ms | 68.3% bf16 MFU | 260317 tok/s +step 6917/18794 | loss 3.136654 (-1.10z)| norm 0.2671 (+0.34z)| lr 4.66e-03 | 2002.04 ms | 68.5% bf16 MFU | 260395 tok/s +step 6918/18794 | loss 3.158972 (-0.35z)| norm 0.3320 (+1.30z)| lr 4.66e-03 | 2016.85 ms | 68.0% bf16 MFU | 260373 tok/s +step 6919/18794 | loss 3.133406 (-1.18z)| norm 0.1726 (-1.09z)| lr 4.66e-03 | 2004.21 ms | 68.5% bf16 MFU | 260434 tok/s +step 6920/18794 | loss 3.192992 (+0.77z)| norm 0.3501 (+1.53z)| lr 4.66e-03 | 2012.92 ms | 68.2% bf16 MFU | 260435 tok/s +step 6921/18794 | loss 3.148987 (-0.69z)| norm 0.3572 (+1.60z)| lr 4.65e-03 | 2015.16 ms | 68.1% bf16 MFU | 260422 tok/s +step 6922/18794 | loss 3.153746 (-0.53z)| norm 0.2549 (+0.08z)| lr 4.65e-03 | 2012.22 ms | 68.2% bf16 MFU | 260429 tok/s +step 6923/18794 | loss 3.129537 (-1.32z)| norm 0.1892 (-0.89z)| lr 4.65e-03 | 2005.69 ms | 68.4% bf16 MFU | 260477 tok/s +step 6924/18794 | loss 3.189698 (+0.65z)| norm 0.2152 (-0.50z)| lr 4.65e-03 | 2018.85 ms | 68.0% bf16 MFU | 260438 tok/s +step 6925/18794 | loss 3.145427 (-0.82z)| norm 0.2000 (-0.72z)| lr 4.65e-03 | 2013.40 ms | 68.2% bf16 MFU | 260436 tok/s +step 6926/18794 | loss 3.243022 (+2.33z)| norm 0.1804 (-0.99z)| lr 4.65e-03 | 2020.78 ms | 67.9% bf16 MFU | 260387 tok/s +step 6927/18794 | loss 3.164486 (-0.21z)| norm 0.1742 (-1.07z)| lr 4.65e-03 | 2017.95 ms | 68.0% bf16 MFU | 260358 tok/s +step 6928/18794 | loss 3.161349 (-0.31z)| norm 0.2501 (+0.05z)| lr 4.65e-03 | 2011.10 ms | 68.2% bf16 MFU | 260375 tok/s +step 6929/18794 | loss 3.188097 (+0.55z)| norm 0.3754 (+1.90z)| lr 4.65e-03 | 2020.43 ms | 67.9% bf16 MFU | 260331 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.703444 +step 6930/18794 | loss 3.199583 (+0.91z)| norm 0.4393 (+2.70z)| lr 4.65e-03 | 2011.98 ms | 68.2% bf16 MFU | 260344 tok/s +step 6931/18794 | loss 3.180562 (+0.30z)| norm 0.1943 (-0.76z)| lr 4.65e-03 | 2011.74 ms | 68.2% bf16 MFU | 260357 tok/s +step 6932/18794 | loss 3.150648 (-0.65z)| norm 0.3090 (+0.84z)| lr 4.65e-03 | 2018.96 ms | 68.0% bf16 MFU | 260323 tok/s +step 6933/18794 | loss 3.131409 (-1.24z)| norm 0.2630 (+0.19z)| lr 4.65e-03 | 1994.86 ms | 68.8% bf16 MFU | 260448 tok/s +step 6934/18794 | loss 3.185613 (+0.48z)| norm 0.1896 (-0.84z)| lr 4.65e-03 | 2002.71 ms | 68.5% bf16 MFU | 260515 tok/s +step 6935/18794 | loss 3.167650 (-0.09z)| norm 0.3653 (+1.59z)| lr 4.65e-03 | 2008.85 ms | 68.3% bf16 MFU | 260539 tok/s +step 6936/18794 | loss 3.181882 (+0.38z)| norm 0.3583 (+1.46z)| lr 4.65e-03 | 2006.33 ms | 68.4% bf16 MFU | 260578 tok/s +step 6937/18794 | loss 3.175417 (+0.16z)| norm 0.1896 (-0.88z)| lr 4.65e-03 | 2026.63 ms | 67.7% bf16 MFU | 260484 tok/s +step 6938/18794 | loss 3.262250 (+2.85z)| norm 0.2344 (-0.25z)| lr 4.65e-03 | 2010.84 ms | 68.2% bf16 MFU | 260496 tok/s +step 6939/18794 | loss 3.189608 (+0.56z)| norm 0.1846 (-0.92z)| lr 4.65e-03 | 2010.38 ms | 68.3% bf16 MFU | 260511 tok/s +step 6940/18794 | loss 3.145994 (-0.80z)| norm 0.2067 (-0.61z)| lr 4.65e-03 | 2010.53 ms | 68.3% bf16 MFU | 260524 tok/s +step 6941/18794 | loss 3.147823 (-0.73z)| norm 0.2135 (-0.50z)| lr 4.65e-03 | 2003.06 ms | 68.5% bf16 MFU | 260585 tok/s +step 6942/18794 | loss 3.171542 (+0.02z)| norm 0.1790 (-0.97z)| lr 4.65e-03 | 2017.87 ms | 68.0% bf16 MFU | 260547 tok/s +step 6943/18794 | loss 3.159319 (-0.36z)| norm 0.1693 (-1.10z)| lr 4.65e-03 | 2003.05 ms | 68.5% bf16 MFU | 260607 tok/s +step 6944/18794 | loss 3.165868 (-0.16z)| norm 0.1749 (-1.03z)| lr 4.65e-03 | 2010.44 ms | 68.3% bf16 MFU | 260616 tok/s +step 6945/18794 | loss 3.188407 (+0.55z)| norm 0.1829 (-0.91z)| lr 4.64e-03 | 2010.24 ms | 68.3% bf16 MFU | 260625 tok/s +step 6946/18794 | loss 3.177641 (+0.21z)| norm 0.1801 (-0.96z)| lr 4.64e-03 | 2010.23 ms | 68.3% bf16 MFU | 260634 tok/s +step 6947/18794 | loss 3.121681 (-1.53z)| norm 0.1734 (-1.05z)| lr 4.64e-03 | 1997.25 ms | 68.7% bf16 MFU | 260728 tok/s +step 6948/18794 | loss 3.148016 (-0.70z)| norm 0.1700 (-1.07z)| lr 4.64e-03 | 2011.05 ms | 68.2% bf16 MFU | 260727 tok/s +step 6949/18794 | loss 3.176623 (+0.18z)| norm 0.1933 (-0.74z)| lr 4.64e-03 | 2010.25 ms | 68.3% bf16 MFU | 260731 tok/s +step 6950/18794 | loss 3.171764 (+0.04z)| norm 0.1860 (-0.83z)| lr 4.64e-03 | 2027.29 ms | 67.7% bf16 MFU | 260625 tok/s +step 6951/18794 | loss 3.168116 (-0.07z)| norm 0.2805 (+0.49z)| lr 4.64e-03 | 2003.31 ms | 68.5% bf16 MFU | 260679 tok/s +step 6952/18794 | loss 3.140576 (-0.92z)| norm 0.3282 (+1.13z)| lr 4.64e-03 | 2003.50 ms | 68.5% bf16 MFU | 260730 tok/s +step 6953/18794 | loss 3.173301 (+0.11z)| norm 0.1842 (-0.84z)| lr 4.64e-03 | 1982.63 ms | 69.2% bf16 MFU | 260915 tok/s +step 6954/18794 | loss 3.203625 (+1.04z)| norm 0.3545 (+1.47z)| lr 4.64e-03 | 2010.44 ms | 68.3% bf16 MFU | 260909 tok/s +step 6955/18794 | loss 3.134401 (-1.10z)| norm 0.3959 (+1.98z)| lr 4.64e-03 | 2008.53 ms | 68.3% bf16 MFU | 260915 tok/s +step 6956/18794 | loss 3.150080 (-0.61z)| norm 0.2288 (-0.27z)| lr 4.64e-03 | 2005.04 ms | 68.4% bf16 MFU | 260943 tok/s +step 6957/18794 | loss 3.154742 (-0.45z)| norm 0.2137 (-0.47z)| lr 4.64e-03 | 2001.23 ms | 68.6% bf16 MFU | 260995 tok/s +step 6958/18794 | loss 3.160222 (-0.28z)| norm 0.2645 (+0.21z)| lr 4.64e-03 | 1996.35 ms | 68.7% bf16 MFU | 261077 tok/s +step 6959/18794 | loss 3.159606 (-0.30z)| norm 0.2014 (-0.63z)| lr 4.64e-03 | 2026.81 ms | 67.7% bf16 MFU | 260957 tok/s +step 6960/18794 | loss 3.168488 (-0.02z)| norm 0.1775 (-0.95z)| lr 4.64e-03 | 2000.48 ms | 68.6% bf16 MFU | 261013 tok/s +step 6961/18794 | loss 3.114836 (-1.69z)| norm 0.2277 (-0.27z)| lr 4.64e-03 | 2004.44 ms | 68.5% bf16 MFU | 261040 tok/s +step 6962/18794 | loss 3.170896 (+0.07z)| norm 0.2059 (-0.57z)| lr 4.64e-03 | 2011.60 ms | 68.2% bf16 MFU | 261020 tok/s +step 6963/18794 | loss 3.154940 (-0.43z)| norm 0.2718 (+0.30z)| lr 4.64e-03 | 1996.70 ms | 68.7% bf16 MFU | 261098 tok/s +step 6964/18794 | loss 3.160735 (-0.26z)| norm 0.2010 (-0.64z)| lr 4.64e-03 | 2010.17 ms | 68.3% bf16 MFU | 261084 tok/s +step 6965/18794 | loss 3.166923 (-0.05z)| norm 0.2662 (+0.25z)| lr 4.64e-03 | 2010.71 ms | 68.3% bf16 MFU | 261067 tok/s +step 6966/18794 | loss 3.162253 (-0.19z)| norm 0.2778 (+0.40z)| lr 4.64e-03 | 2008.13 ms | 68.3% bf16 MFU | 261068 tok/s +step 6967/18794 | loss 3.153328 (-0.47z)| norm 0.2668 (+0.25z)| lr 4.64e-03 | 2004.00 ms | 68.5% bf16 MFU | 261095 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.202748 +step 6968/18794 | loss 3.142508 (-0.82z)| norm 0.4922 (+3.20z)| lr 4.64e-03 | 2019.24 ms | 68.0% bf16 MFU | 261023 tok/s +step 6969/18794 | loss 3.097340 (-2.23z)| norm 0.3026 (+0.70z)| lr 4.63e-03 | 2003.81 ms | 68.5% bf16 MFU | 261054 tok/s +step 6970/18794 | loss 3.106599 (-1.88z)| norm 0.2405 (-0.10z)| lr 4.63e-03 | 2015.50 ms | 68.1% bf16 MFU | 261008 tok/s +step 6971/18794 | loss 3.153137 (-0.40z)| norm 0.3039 (+0.79z)| lr 4.63e-03 | 2012.93 ms | 68.2% bf16 MFU | 260980 tok/s +step 6972/18794 | loss 3.159703 (-0.18z)| norm 0.1666 (-1.09z)| lr 4.63e-03 | 2021.76 ms | 67.9% bf16 MFU | 260898 tok/s +step 6973/18794 | loss 3.183739 (+0.56z)| norm 0.3353 (+1.21z)| lr 4.63e-03 | 2006.09 ms | 68.4% bf16 MFU | 260920 tok/s +step 6974/18794 | loss 3.153922 (-0.39z)| norm 0.2238 (-0.33z)| lr 4.63e-03 | 2018.89 ms | 68.0% bf16 MFU | 260859 tok/s +step 6975/18794 | loss 3.124093 (-1.31z)| norm 0.3216 (+1.00z)| lr 4.63e-03 | 2018.71 ms | 68.0% bf16 MFU | 260801 tok/s +step 6976/18794 | loss 3.180744 (+0.50z)| norm 0.3372 (+1.19z)| lr 4.63e-03 | 2030.86 ms | 67.6% bf16 MFU | 260669 tok/s +step 6977/18794 | loss 3.129646 (-1.13z)| norm 0.1934 (-0.77z)| lr 4.63e-03 | 2012.67 ms | 68.2% bf16 MFU | 260661 tok/s +step 6978/18794 | loss 3.110860 (-1.68z)| norm 0.2016 (-0.64z)| lr 4.63e-03 | 2012.73 ms | 68.2% bf16 MFU | 260652 tok/s +step 6979/18794 | loss 3.147388 (-0.55z)| norm 0.1941 (-0.73z)| lr 4.63e-03 | 2016.05 ms | 68.1% bf16 MFU | 260622 tok/s +step 6980/18794 | loss 3.132250 (-1.01z)| norm 0.1913 (-0.77z)| lr 4.63e-03 | 2010.76 ms | 68.2% bf16 MFU | 260628 tok/s +step 6981/18794 | loss 3.123205 (-1.27z)| norm 0.1823 (-0.88z)| lr 4.63e-03 | 2005.14 ms | 68.4% bf16 MFU | 260670 tok/s +step 6982/18794 | loss 3.115244 (-1.48z)| norm 0.2365 (-0.13z)| lr 4.63e-03 | 2010.20 ms | 68.3% bf16 MFU | 260677 tok/s +step 6983/18794 | loss 3.170686 (+0.25z)| norm 0.2275 (-0.23z)| lr 4.63e-03 | 2022.32 ms | 67.9% bf16 MFU | 260606 tok/s +step 6984/18794 | loss 3.160087 (-0.07z)| norm 0.3189 (+1.06z)| lr 4.63e-03 | 2024.65 ms | 67.8% bf16 MFU | 260523 tok/s +step 6985/18794 | loss 3.167980 (+0.18z)| norm 0.3820 (+1.91z)| lr 4.63e-03 | 2030.04 ms | 67.6% bf16 MFU | 260411 tok/s +step 6986/18794 | loss 3.192586 (+0.94z)| norm 0.2140 (-0.44z)| lr 4.63e-03 | 2014.55 ms | 68.1% bf16 MFU | 260402 tok/s +step 6987/18794 | loss 3.160078 (-0.11z)| norm 0.3237 (+1.07z)| lr 4.63e-03 | 2019.92 ms | 67.9% bf16 MFU | 260360 tok/s +step 6988/18794 | loss 3.143762 (-0.63z)| norm 0.2677 (+0.30z)| lr 4.63e-03 | 2009.57 ms | 68.3% bf16 MFU | 260387 tok/s +step 6989/18794 | loss 3.105208 (-1.84z)| norm 0.2440 (-0.04z)| lr 4.63e-03 | 2001.51 ms | 68.6% bf16 MFU | 260465 tok/s +step 6990/18794 | loss 3.117943 (-1.40z)| norm 0.2854 (+0.55z)| lr 4.63e-03 | 2006.85 ms | 68.4% bf16 MFU | 260504 tok/s +step 6991/18794 | loss 3.096100 (-2.08z)| norm 0.1821 (-0.89z)| lr 4.63e-03 | 2032.52 ms | 67.5% bf16 MFU | 260376 tok/s +step 6992/18794 | loss 3.138921 (-0.69z)| norm 0.2343 (-0.15z)| lr 4.63e-03 | 2035.13 ms | 67.4% bf16 MFU | 260239 tok/s +step 6993/18794 | loss 3.190298 (+0.98z)| norm 0.1962 (-0.69z)| lr 4.62e-03 | 2003.94 ms | 68.5% bf16 MFU | 260308 tok/s +step 6994/18794 | loss 3.168954 (+0.33z)| norm 0.2175 (-0.38z)| lr 4.62e-03 | 2004.73 ms | 68.5% bf16 MFU | 260369 tok/s +step 6995/18794 | loss 3.159107 (-0.00z)| norm 0.2244 (-0.29z)| lr 4.62e-03 | 2029.64 ms | 67.6% bf16 MFU | 260266 tok/s +step 6996/18794 | loss 3.188805 (+0.99z)| norm 0.1754 (-0.99z)| lr 4.62e-03 | 2008.35 ms | 68.3% bf16 MFU | 260306 tok/s +step 6997/18794 | loss 3.177377 (+0.62z)| norm 0.2036 (-0.58z)| lr 4.62e-03 | 2005.22 ms | 68.4% bf16 MFU | 260364 tok/s +step 6998/18794 | loss 3.128330 (-1.06z)| norm 0.1758 (-0.98z)| lr 4.62e-03 | 2023.35 ms | 67.8% bf16 MFU | 260301 tok/s +step 6999/18794 | loss 3.224545 (+2.22z)| norm 0.1847 (-0.84z)| lr 4.62e-03 | 2005.36 ms | 68.4% bf16 MFU | 260358 tok/s +step 7000/18794 | loss 3.134645 (-0.83z)| norm 0.1958 (-0.67z)| lr 4.62e-03 | 2035.21 ms | 67.4% bf16 MFU | 260221 tok/s +val loss 3.170832 +HellaSwag: 2916/10042 = 0.290380Swag: 990/1256: 0/1256 +generating: +--- +Writing state to log_gpt3_125M_edu_v4/state_00007000_00001.bin + +Hull bands were made of wood and shell and would cover the body of the helmet. Bell bands +were used to protect the rider and allow the wearer +to switch from a quick movement to multiple movements. Note +that the skull would be difficult to see reflected any oxygen from +the rider's nasal passage or nasal drainage. +Lighting Laws: Many "regular" lights +would normally be lit of Blue radials. If you can concentrate +on a particular area of darkness, you may lose cool down in hemitarics. +Cars used for "wind" (generally C remarked or +after the period of anesthesia and change) were pre-supervised and +fitted with reflectors +--- +Writing checkpoint at step 7000 +Writing model to log_gpt3_125M_edu_v4/model_00007000.bin +Writing state to log_gpt3_125M_edu_v4/state_00007000_00000.bin +Deleting checkpoint at step 4500 +step 7001/18794 | loss 3.201346 (+1.40z)| norm 0.1645 (-1.10z)| lr 4.62e-03 | 2000.11 ms | 68.6% bf16 MFU | 260316 tok/s +step 7002/18794 | loss 3.140643 (-0.62z)| norm 0.1791 (-0.88z)| lr 4.62e-03 | 2027.89 ms | 67.7% bf16 MFU | 260227 tok/s +step 7003/18794 | loss 3.178910 (+0.68z)| norm 0.1778 (-0.90z)| lr 4.62e-03 | 2010.22 ms | 68.3% bf16 MFU | 260257 tok/s +step 7004/18794 | loss 3.176530 (+0.58z)| norm 0.1827 (-0.82z)| lr 4.62e-03 | 2002.95 ms | 68.5% bf16 MFU | 260332 tok/s +step 7005/18794 | loss 3.182393 (+0.79z)| norm 0.1919 (-0.69z)| lr 4.62e-03 | 2012.49 ms | 68.2% bf16 MFU | 260341 tok/s +step 7006/18794 | loss 3.146333 (-0.45z)| norm 0.1626 (-1.10z)| lr 4.62e-03 | 2003.82 ms | 68.5% bf16 MFU | 260406 tok/s +step 7007/18794 | loss 3.180687 (+0.74z)| norm 0.2007 (-0.56z)| lr 4.62e-03 | 2013.20 ms | 68.2% bf16 MFU | 260407 tok/s +step 7008/18794 | loss 3.166284 (+0.24z)| norm 0.1889 (-0.74z)| lr 4.62e-03 | 2018.28 ms | 68.0% bf16 MFU | 260375 tok/s +step 7009/18794 | loss 3.188709 (+0.99z)| norm 0.1633 (-1.09z)| lr 4.62e-03 | 2026.59 ms | 67.7% bf16 MFU | 260292 tok/s +step 7010/18794 | loss 3.121931 (-1.30z)| norm 0.2179 (-0.30z)| lr 4.62e-03 | 2019.89 ms | 67.9% bf16 MFU | 260255 tok/s +step 7011/18794 | loss 3.166798 (+0.24z)| norm 0.1737 (-0.91z)| lr 4.62e-03 | 2006.89 ms | 68.4% bf16 MFU | 260305 tok/s +step 7012/18794 | loss 3.188149 (+0.98z)| norm 0.3046 (+0.92z)| lr 4.62e-03 | 2004.69 ms | 68.5% bf16 MFU | 260366 tok/s +step 7013/18794 | loss 3.141057 (-0.65z)| norm 0.3703 (+1.79z)| lr 4.62e-03 | 2014.28 ms | 68.1% bf16 MFU | 260362 tok/s +step 7014/18794 | loss 3.273234 (+3.64z)| norm 0.2418 (+0.01z)| lr 4.62e-03 | 2017.47 ms | 68.0% bf16 MFU | 260338 tok/s +step 7015/18794 | loss 3.166076 (+0.15z)| norm 0.1985 (-0.58z)| lr 4.62e-03 | 2019.45 ms | 68.0% bf16 MFU | 260302 tok/s +step 7016/18794 | loss 3.131685 (-0.95z)| norm 0.1950 (-0.63z)| lr 4.62e-03 | 1994.32 ms | 68.8% bf16 MFU | 260431 tok/s +step 7017/18794 | loss 3.094055 (-2.12z)| norm 0.1946 (-0.62z)| lr 4.61e-03 | 2007.59 ms | 68.4% bf16 MFU | 260467 tok/s +step 7018/18794 | loss 3.181480 (+0.67z)| norm 0.2221 (-0.22z)| lr 4.61e-03 | 2026.45 ms | 67.7% bf16 MFU | 260380 tok/s +step 7019/18794 | loss 3.092720 (-2.11z)| norm 0.1580 (-1.13z)| lr 4.61e-03 | 2012.27 ms | 68.2% bf16 MFU | 260388 tok/s +step 7020/18794 | loss 3.139342 (-0.64z)| norm 0.2079 (-0.40z)| lr 4.61e-03 | 2014.63 ms | 68.1% bf16 MFU | 260381 tok/s +step 7021/18794 | loss 3.140212 (-0.61z)| norm 0.1988 (-0.52z)| lr 4.61e-03 | 2000.19 ms | 68.6% bf16 MFU | 260468 tok/s +step 7022/18794 | loss 3.119453 (-1.24z)| norm 0.2115 (-0.33z)| lr 4.61e-03 | 2000.18 ms | 68.6% bf16 MFU | 260550 tok/s +step 7023/18794 | loss 3.095570 (-1.94z)| norm 0.2388 (+0.07z)| lr 4.61e-03 | 1999.12 ms | 68.6% bf16 MFU | 260636 tok/s +step 7024/18794 | loss 3.122898 (-1.08z)| norm 0.2133 (-0.31z)| lr 4.61e-03 | 2023.81 ms | 67.8% bf16 MFU | 260557 tok/s +step 7025/18794 | loss 3.155089 (-0.10z)| norm 0.2155 (-0.28z)| lr 4.61e-03 | 2036.23 ms | 67.4% bf16 MFU | 260403 tok/s +step 7026/18794 | loss 3.187270 (+0.93z)| norm 0.2251 (-0.14z)| lr 4.61e-03 | 1996.51 ms | 68.7% bf16 MFU | 260513 tok/s +step 7027/18794 | loss 3.147416 (-0.32z)| norm 0.2366 (+0.02z)| lr 4.61e-03 | 2006.52 ms | 68.4% bf16 MFU | 260552 tok/s +step 7028/18794 | loss 3.154545 (-0.09z)| norm 0.2076 (-0.41z)| lr 4.61e-03 | 2029.53 ms | 67.6% bf16 MFU | 260441 tok/s +step 7029/18794 | loss 3.119519 (-1.18z)| norm 0.1840 (-0.74z)| lr 4.61e-03 | 2015.63 ms | 68.1% bf16 MFU | 260424 tok/s +step 7030/18794 | loss 3.195886 (+1.23z)| norm 0.1838 (-0.74z)| lr 4.61e-03 | 1998.86 ms | 68.7% bf16 MFU | 260518 tok/s +step 7031/18794 | loss 3.154921 (-0.05z)| norm 0.2020 (-0.46z)| lr 4.61e-03 | 2032.07 ms | 67.5% bf16 MFU | 260392 tok/s +step 7032/18794 | loss 3.151874 (-0.15z)| norm 0.1817 (-0.76z)| lr 4.61e-03 | 2014.00 ms | 68.1% bf16 MFU | 260389 tok/s +step 7033/18794 | loss 3.186471 (+0.93z)| norm 0.2469 (+0.29z)| lr 4.61e-03 | 2002.72 ms | 68.5% bf16 MFU | 260459 tok/s +step 7034/18794 | loss 3.190623 (+1.06z)| norm 0.2289 (-0.01z)| lr 4.61e-03 | 2008.93 ms | 68.3% bf16 MFU | 260485 tok/s +step 7035/18794 | loss 3.141891 (-0.47z)| norm 0.1808 (-0.77z)| lr 4.61e-03 | 2005.37 ms | 68.4% bf16 MFU | 260533 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.912639 +step 7036/18794 | loss 3.120132 (-1.14z)| norm 0.4092 (+2.91z)| lr 4.61e-03 | 2014.97 ms | 68.1% bf16 MFU | 260516 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.931032 +step 7037/18794 | loss 3.163125 (+0.22z)| norm 0.4208 (+2.93z)| lr 4.61e-03 | 2005.65 ms | 68.4% bf16 MFU | 260560 tok/s +step 7038/18794 | loss 3.084550 (-2.27z)| norm 0.2300 (-0.01z)| lr 4.61e-03 | 2017.86 ms | 68.0% bf16 MFU | 260523 tok/s +step 7039/18794 | loss 3.181148 (+0.88z)| norm 0.2232 (-0.12z)| lr 4.61e-03 | 2000.19 ms | 68.6% bf16 MFU | 260603 tok/s +step 7040/18794 | loss 3.123168 (-1.00z)| norm 0.2883 (+0.87z)| lr 4.61e-03 | 2009.44 ms | 68.3% bf16 MFU | 260619 tok/s +step 7041/18794 | loss 3.175376 (+0.68z)| norm 0.2268 (-0.08z)| lr 4.60e-03 | 2013.52 ms | 68.2% bf16 MFU | 260607 tok/s +step 7042/18794 | loss 3.166519 (+0.40z)| norm 0.2078 (-0.37z)| lr 4.60e-03 | 2035.73 ms | 67.4% bf16 MFU | 260454 tok/s +step 7043/18794 | loss 3.115077 (-1.25z)| norm 0.1866 (-0.71z)| lr 4.60e-03 | 2010.79 ms | 68.2% bf16 MFU | 260468 tok/s +step 7044/18794 | loss 3.125987 (-0.88z)| norm 0.2151 (-0.27z)| lr 4.60e-03 | 2012.16 ms | 68.2% bf16 MFU | 260473 tok/s +step 7045/18794 | loss 3.180098 (+0.86z)| norm 0.2138 (-0.30z)| lr 4.60e-03 | 2010.02 ms | 68.3% bf16 MFU | 260491 tok/s +step 7046/18794 | loss 3.145261 (-0.25z)| norm 0.1953 (-0.59z)| lr 4.60e-03 | 2032.58 ms | 67.5% bf16 MFU | 260363 tok/s +step 7047/18794 | loss 3.121269 (-1.02z)| norm 0.3041 (+1.09z)| lr 4.60e-03 | 2000.05 ms | 68.6% bf16 MFU | 260452 tok/s +step 7048/18794 | loss 3.110348 (-1.35z)| norm 0.3257 (+1.39z)| lr 4.60e-03 | 2017.18 ms | 68.0% bf16 MFU | 260425 tok/s +step 7049/18794 | loss 3.120856 (-1.00z)| norm 0.2100 (-0.41z)| lr 4.60e-03 | 2020.95 ms | 67.9% bf16 MFU | 260375 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.288847 +step 7050/18794 | loss 3.091666 (-1.87z)| norm 0.4627 (+3.29z)| lr 4.60e-03 | 2009.60 ms | 68.3% bf16 MFU | 260401 tok/s +step 7051/18794 | loss 3.199073 (+1.48z)| norm 0.2894 (+0.74z)| lr 4.60e-03 | 2010.30 ms | 68.3% bf16 MFU | 260421 tok/s +step 7052/18794 | loss 3.154167 (+0.08z)| norm 0.2442 (+0.09z)| lr 4.60e-03 | 2017.73 ms | 68.0% bf16 MFU | 260392 tok/s +step 7053/18794 | loss 3.173189 (+0.67z)| norm 0.1947 (-0.65z)| lr 4.60e-03 | 2025.52 ms | 67.8% bf16 MFU | 260314 tok/s +step 7054/18794 | loss 3.118579 (-1.01z)| norm 0.1853 (-0.77z)| lr 4.60e-03 | 2018.42 ms | 68.0% bf16 MFU | 260286 tok/s +step 7055/18794 | loss 3.124975 (-0.81z)| norm 0.2029 (-0.49z)| lr 4.60e-03 | 2029.21 ms | 67.6% bf16 MFU | 260191 tok/s +step 7056/18794 | loss 3.164103 (+0.41z)| norm 0.1852 (-0.76z)| lr 4.60e-03 | 2000.63 ms | 68.6% bf16 MFU | 260284 tok/s +step 7057/18794 | loss 3.145786 (-0.16z)| norm 0.2650 (+0.47z)| lr 4.60e-03 | 2025.94 ms | 67.7% bf16 MFU | 260209 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.898657 +step 7058/18794 | loss 3.169873 (+0.59z)| norm 0.4331 (+2.90z)| lr 4.60e-03 | 2027.82 ms | 67.7% bf16 MFU | 260126 tok/s +step 7059/18794 | loss 3.158053 (+0.22z)| norm 0.1657 (-1.03z)| lr 4.60e-03 | 2033.45 ms | 67.5% bf16 MFU | 260011 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.116500 +step 7060/18794 | loss 3.081987 (-2.08z)| norm 0.4616 (+3.12z)| lr 4.60e-03 | 2011.91 ms | 68.2% bf16 MFU | 260040 tok/s +step 7061/18794 | loss 3.142000 (-0.26z)| norm 0.3045 (+0.90z)| lr 4.60e-03 | 2019.37 ms | 68.0% bf16 MFU | 260020 tok/s +step 7062/18794 | loss 3.164307 (+0.43z)| norm 0.2369 (-0.04z)| lr 4.60e-03 | 2018.76 ms | 68.0% bf16 MFU | 260004 tok/s +step 7063/18794 | loss 3.092674 (-1.73z)| norm 0.3140 (+1.02z)| lr 4.60e-03 | 2024.54 ms | 67.8% bf16 MFU | 259952 tok/s +step 7064/18794 | loss 3.094435 (-1.63z)| norm 0.2109 (-0.41z)| lr 4.60e-03 | 2018.99 ms | 68.0% bf16 MFU | 259939 tok/s +step 7065/18794 | loss 3.195521 (+1.37z)| norm 0.2064 (-0.47z)| lr 4.59e-03 | 2015.52 ms | 68.1% bf16 MFU | 259948 tok/s +step 7066/18794 | loss 3.145668 (-0.10z)| norm 0.2097 (-0.41z)| lr 4.59e-03 | 2010.86 ms | 68.2% bf16 MFU | 259987 tok/s +step 7067/18794 | loss 3.156269 (+0.21z)| norm 0.1681 (-0.97z)| lr 4.59e-03 | 2017.69 ms | 68.0% bf16 MFU | 259980 tok/s +step 7068/18794 | loss 3.181357 (+0.94z)| norm 0.2122 (-0.34z)| lr 4.59e-03 | 2026.90 ms | 67.7% bf16 MFU | 259914 tok/s +step 7069/18794 | loss 3.119168 (-0.91z)| norm 0.2387 (+0.06z)| lr 4.59e-03 | 2011.33 ms | 68.2% bf16 MFU | 259952 tok/s +step 7070/18794 | loss 3.134295 (-0.47z)| norm 0.2922 (+0.84z)| lr 4.59e-03 | 2017.27 ms | 68.0% bf16 MFU | 259949 tok/s +step 7071/18794 | loss 3.114785 (-1.04z)| norm 0.2050 (-0.44z)| lr 4.59e-03 | 2007.67 ms | 68.4% bf16 MFU | 260009 tok/s +step 7072/18794 | loss 3.146227 (-0.10z)| norm 0.2439 (+0.13z)| lr 4.59e-03 | 2025.27 ms | 67.8% bf16 MFU | 259952 tok/s +step 7073/18794 | loss 3.163774 (+0.43z)| norm 0.2388 (+0.07z)| lr 4.59e-03 | 2023.22 ms | 67.8% bf16 MFU | 259911 tok/s +step 7074/18794 | loss 3.143631 (-0.17z)| norm 0.2305 (-0.06z)| lr 4.59e-03 | 2028.92 ms | 67.6% bf16 MFU | 259836 tok/s +step 7075/18794 | loss 3.213177 (+1.87z)| norm 0.3360 (+1.53z)| lr 4.59e-03 | 2017.02 ms | 68.0% bf16 MFU | 259841 tok/s +step 7076/18794 | loss 3.124528 (-0.74z)| norm 0.1921 (-0.62z)| lr 4.59e-03 | 2018.37 ms | 68.0% bf16 MFU | 259837 tok/s +step 7077/18794 | loss 3.212677 (+1.82z)| norm 0.3128 (+1.19z)| lr 4.59e-03 | 2024.69 ms | 67.8% bf16 MFU | 259792 tok/s +step 7078/18794 | loss 3.097420 (-1.53z)| norm 0.3094 (+1.12z)| lr 4.59e-03 | 2012.31 ms | 68.2% bf16 MFU | 259830 tok/s +step 7079/18794 | loss 3.171280 (+0.60z)| norm 0.2290 (-0.10z)| lr 4.59e-03 | 2013.18 ms | 68.2% bf16 MFU | 259860 tok/s +step 7080/18794 | loss 3.151620 (+0.03z)| norm 0.2104 (-0.38z)| lr 4.59e-03 | 2025.91 ms | 67.7% bf16 MFU | 259806 tok/s +step 7081/18794 | loss 3.164581 (+0.39z)| norm 0.3033 (+1.00z)| lr 4.59e-03 | 2010.31 ms | 68.3% bf16 MFU | 259856 tok/s +step 7082/18794 | loss 3.150003 (-0.04z)| norm 0.2177 (-0.29z)| lr 4.59e-03 | 2009.77 ms | 68.3% bf16 MFU | 259907 tok/s +step 7083/18794 | loss 3.150512 (-0.02z)| norm 0.2933 (+0.84z)| lr 4.59e-03 | 2004.36 ms | 68.5% bf16 MFU | 259990 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.790265 +step 7084/18794 | loss 3.216365 (+1.86z)| norm 0.4303 (+2.79z)| lr 4.59e-03 | 2023.23 ms | 67.8% bf16 MFU | 259947 tok/s +step 7085/18794 | loss 3.147484 (-0.12z)| norm 0.2107 (-0.39z)| lr 4.59e-03 | 2013.12 ms | 68.2% bf16 MFU | 259971 tok/s +step 7086/18794 | loss 3.198995 (+1.36z)| norm 0.2858 (+0.72z)| lr 4.59e-03 | 2010.46 ms | 68.3% bf16 MFU | 260012 tok/s +step 7087/18794 | loss 3.179787 (+0.80z)| norm 0.3239 (+1.28z)| lr 4.59e-03 | 2030.38 ms | 67.6% bf16 MFU | 259922 tok/s +step 7088/18794 | loss 3.146033 (-0.17z)| norm 0.2583 (+0.31z)| lr 4.59e-03 | 2011.34 ms | 68.2% bf16 MFU | 259960 tok/s +step 7089/18794 | loss 3.210431 (+1.64z)| norm 0.2276 (-0.14z)| lr 4.58e-03 | 2004.91 ms | 68.4% bf16 MFU | 260037 tok/s +step 7090/18794 | loss 3.146563 (-0.19z)| norm 0.3160 (+1.16z)| lr 4.58e-03 | 2004.62 ms | 68.5% bf16 MFU | 260112 tok/s +step 7091/18794 | loss 3.131537 (-0.64z)| norm 0.2290 (-0.13z)| lr 4.58e-03 | 2013.89 ms | 68.1% bf16 MFU | 260123 tok/s +step 7092/18794 | loss 3.166024 (+0.35z)| norm 0.1914 (-0.68z)| lr 4.58e-03 | 2026.90 ms | 67.7% bf16 MFU | 260050 tok/s +step 7093/18794 | loss 3.158742 (+0.15z)| norm 0.3007 (+0.92z)| lr 4.58e-03 | 2002.77 ms | 68.5% bf16 MFU | 260137 tok/s +step 7094/18794 | loss 3.183781 (+0.88z)| norm 0.2330 (-0.09z)| lr 4.58e-03 | 2018.35 ms | 68.0% bf16 MFU | 260118 tok/s +step 7095/18794 | loss 3.156732 (+0.09z)| norm 0.1714 (-0.98z)| lr 4.58e-03 | 2020.12 ms | 67.9% bf16 MFU | 260089 tok/s +step 7096/18794 | loss 3.105967 (-1.36z)| norm 0.1722 (-0.97z)| lr 4.58e-03 | 2018.65 ms | 68.0% bf16 MFU | 260070 tok/s +step 7097/18794 | loss 3.122113 (-0.87z)| norm 0.1614 (-1.12z)| lr 4.58e-03 | 2019.33 ms | 68.0% bf16 MFU | 260049 tok/s +step 7098/18794 | loss 3.166721 (+0.41z)| norm 0.1888 (-0.72z)| lr 4.58e-03 | 2025.13 ms | 67.8% bf16 MFU | 259991 tok/s +step 7099/18794 | loss 3.185819 (+0.99z)| norm 0.2474 (+0.13z)| lr 4.58e-03 | 2008.16 ms | 68.3% bf16 MFU | 260045 tok/s +step 7100/18794 | loss 3.141430 (-0.32z)| norm 0.2660 (+0.39z)| lr 4.58e-03 | 2011.00 ms | 68.2% bf16 MFU | 260078 tok/s +step 7101/18794 | loss 3.164199 (+0.37z)| norm 0.2409 (+0.01z)| lr 4.58e-03 | 2008.01 ms | 68.3% bf16 MFU | 260129 tok/s +step 7102/18794 | loss 3.102543 (-1.45z)| norm 0.1626 (-1.14z)| lr 4.58e-03 | 2010.25 ms | 68.3% bf16 MFU | 260163 tok/s +step 7103/18794 | loss 3.147153 (-0.12z)| norm 0.2688 (+0.41z)| lr 4.58e-03 | 2004.42 ms | 68.5% bf16 MFU | 260233 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.421385 +step 7104/18794 | loss 3.120323 (-0.90z)| norm 0.4890 (+3.42z)| lr 4.58e-03 | 2027.31 ms | 67.7% bf16 MFU | 260152 tok/s +step 7105/18794 | loss 3.087939 (-1.80z)| norm 0.2282 (-0.22z)| lr 4.58e-03 | 2013.68 ms | 68.2% bf16 MFU | 260163 tok/s +step 7106/18794 | loss 3.135914 (-0.40z)| norm 0.3743 (+1.77z)| lr 4.58e-03 | 2010.16 ms | 68.3% bf16 MFU | 260196 tok/s +step 7107/18794 | loss 3.096073 (-1.53z)| norm 0.3356 (+1.21z)| lr 4.58e-03 | 2007.54 ms | 68.4% bf16 MFU | 260244 tok/s +step 7108/18794 | loss 3.157976 (+0.27z)| norm 0.1788 (-0.95z)| lr 4.58e-03 | 2011.45 ms | 68.2% bf16 MFU | 260264 tok/s +step 7109/18794 | loss 3.174689 (+0.76z)| norm 0.2201 (-0.39z)| lr 4.58e-03 | 2006.69 ms | 68.4% bf16 MFU | 260315 tok/s +step 7110/18794 | loss 3.157312 (+0.25z)| norm 0.2365 (-0.16z)| lr 4.58e-03 | 2009.43 ms | 68.3% bf16 MFU | 260345 tok/s +step 7111/18794 | loss 3.148691 (-0.00z)| norm 0.2373 (-0.16z)| lr 4.58e-03 | 1994.80 ms | 68.8% bf16 MFU | 260469 tok/s +step 7112/18794 | loss 3.165471 (+0.50z)| norm 0.2097 (-0.53z)| lr 4.58e-03 | 2014.66 ms | 68.1% bf16 MFU | 260457 tok/s +step 7113/18794 | loss 3.136037 (-0.36z)| norm 0.2027 (-0.62z)| lr 4.57e-03 | 2013.89 ms | 68.1% bf16 MFU | 260451 tok/s +step 7114/18794 | loss 3.208164 (+1.87z)| norm 0.2201 (-0.37z)| lr 4.57e-03 | 2019.36 ms | 68.0% bf16 MFU | 260410 tok/s +step 7115/18794 | loss 3.171695 (+0.74z)| norm 0.1891 (-0.80z)| lr 4.57e-03 | 2018.90 ms | 68.0% bf16 MFU | 260374 tok/s +step 7116/18794 | loss 3.095975 (-1.58z)| norm 0.2029 (-0.61z)| lr 4.57e-03 | 2012.27 ms | 68.2% bf16 MFU | 260382 tok/s +step 7117/18794 | loss 3.159547 (+0.35z)| norm 0.1943 (-0.73z)| lr 4.57e-03 | 2018.08 ms | 68.0% bf16 MFU | 260353 tok/s +step 7118/18794 | loss 3.168213 (+0.63z)| norm 0.1951 (-0.72z)| lr 4.57e-03 | 2010.96 ms | 68.2% bf16 MFU | 260371 tok/s +step 7119/18794 | loss 3.130214 (-0.57z)| norm 0.1629 (-1.17z)| lr 4.57e-03 | 2010.71 ms | 68.3% bf16 MFU | 260390 tok/s +step 7120/18794 | loss 3.197301 (+1.51z)| norm 0.1760 (-0.98z)| lr 4.57e-03 | 2004.60 ms | 68.5% bf16 MFU | 260448 tok/s +step 7121/18794 | loss 3.201600 (+1.60z)| norm 0.2058 (-0.56z)| lr 4.57e-03 | 2033.95 ms | 67.5% bf16 MFU | 260314 tok/s +step 7122/18794 | loss 3.169663 (+0.60z)| norm 0.2775 (+0.44z)| lr 4.57e-03 | 2010.74 ms | 68.2% bf16 MFU | 260335 tok/s +step 7123/18794 | loss 3.122863 (-0.86z)| norm 0.2532 (+0.09z)| lr 4.57e-03 | 2018.29 ms | 68.0% bf16 MFU | 260307 tok/s +step 7124/18794 | loss 3.156605 (+0.18z)| norm 0.1902 (-0.79z)| lr 4.57e-03 | 2009.63 ms | 68.3% bf16 MFU | 260336 tok/s +step 7125/18794 | loss 3.158131 (+0.23z)| norm 0.2934 (+0.65z)| lr 4.57e-03 | 2005.84 ms | 68.4% bf16 MFU | 260388 tok/s +step 7126/18794 | loss 3.184345 (+1.05z)| norm 0.3305 (+1.15z)| lr 4.57e-03 | 2016.87 ms | 68.0% bf16 MFU | 260366 tok/s +step 7127/18794 | loss 3.116601 (-1.05z)| norm 0.2382 (-0.14z)| lr 4.57e-03 | 2005.13 ms | 68.4% bf16 MFU | 260422 tok/s +step 7128/18794 | loss 3.133318 (-0.53z)| norm 0.2318 (-0.23z)| lr 4.57e-03 | 2002.04 ms | 68.5% bf16 MFU | 260494 tok/s +step 7129/18794 | loss 3.159687 (+0.28z)| norm 0.3701 (+1.66z)| lr 4.57e-03 | 2009.44 ms | 68.3% bf16 MFU | 260515 tok/s +step 7130/18794 | loss 3.142156 (-0.25z)| norm 0.3423 (+1.25z)| lr 4.57e-03 | 2011.28 ms | 68.2% bf16 MFU | 260523 tok/s +step 7131/18794 | loss 3.150538 (+0.02z)| norm 0.2355 (-0.23z)| lr 4.57e-03 | 2009.43 ms | 68.3% bf16 MFU | 260543 tok/s +step 7132/18794 | loss 3.159443 (+0.30z)| norm 0.2790 (+0.36z)| lr 4.57e-03 | 2004.18 ms | 68.5% bf16 MFU | 260596 tok/s +step 7133/18794 | loss 3.132704 (-0.53z)| norm 0.2328 (-0.28z)| lr 4.57e-03 | 2001.89 ms | 68.6% bf16 MFU | 260661 tok/s +step 7134/18794 | loss 3.117073 (-1.01z)| norm 0.1969 (-0.77z)| lr 4.57e-03 | 2011.49 ms | 68.2% bf16 MFU | 260660 tok/s +step 7135/18794 | loss 3.141437 (-0.23z)| norm 0.3655 (+1.52z)| lr 4.57e-03 | 2012.46 ms | 68.2% bf16 MFU | 260653 tok/s +step 7136/18794 | loss 3.148750 (-0.01z)| norm 0.3392 (+1.19z)| lr 4.57e-03 | 2024.59 ms | 67.8% bf16 MFU | 260568 tok/s +step 7137/18794 | loss 3.142729 (-0.20z)| norm 0.1827 (-0.98z)| lr 4.56e-03 | 1993.80 ms | 68.8% bf16 MFU | 260688 tok/s +step 7138/18794 | loss 3.131962 (-0.57z)| norm 0.3128 (+0.86z)| lr 4.56e-03 | 2001.54 ms | 68.6% bf16 MFU | 260751 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.240571 +step 7139/18794 | loss 3.185980 (+1.19z)| norm 0.4156 (+2.24z)| lr 4.56e-03 | 2018.29 ms | 68.0% bf16 MFU | 260701 tok/s +step 7140/18794 | loss 3.161514 (+0.38z)| norm 0.2185 (-0.48z)| lr 4.56e-03 | 2019.29 ms | 68.0% bf16 MFU | 260648 tok/s +step 7141/18794 | loss 3.153775 (+0.14z)| norm 0.2450 (-0.12z)| lr 4.56e-03 | 2018.72 ms | 68.0% bf16 MFU | 260602 tok/s +step 7142/18794 | loss 3.149138 (-0.01z)| norm 0.2620 (+0.11z)| lr 4.56e-03 | 2009.99 ms | 68.3% bf16 MFU | 260614 tok/s +step 7143/18794 | loss 3.193545 (+1.42z)| norm 0.2355 (-0.27z)| lr 4.56e-03 | 2001.77 ms | 68.6% bf16 MFU | 260678 tok/s +step 7144/18794 | loss 3.198141 (+1.53z)| norm 0.2328 (-0.31z)| lr 4.56e-03 | 2001.75 ms | 68.6% bf16 MFU | 260740 tok/s +step 7145/18794 | loss 3.195777 (+1.44z)| norm 0.2363 (-0.26z)| lr 4.56e-03 | 1994.52 ms | 68.8% bf16 MFU | 260846 tok/s +step 7146/18794 | loss 3.186734 (+1.13z)| norm 0.1764 (-1.10z)| lr 4.56e-03 | 1987.36 ms | 69.1% bf16 MFU | 260995 tok/s +step 7147/18794 | loss 3.142079 (-0.31z)| norm 0.2572 (+0.04z)| lr 4.56e-03 | 2016.69 ms | 68.0% bf16 MFU | 260944 tok/s +step 7148/18794 | loss 3.151317 (-0.03z)| norm 0.3344 (+1.12z)| lr 4.56e-03 | 2001.61 ms | 68.6% bf16 MFU | 260993 tok/s +step 7149/18794 | loss 3.124995 (-0.88z)| norm 0.2671 (+0.17z)| lr 4.56e-03 | 1994.26 ms | 68.8% bf16 MFU | 261088 tok/s +step 7150/18794 | loss 3.158655 (+0.19z)| norm 0.2536 (+0.01z)| lr 4.56e-03 | 2011.29 ms | 68.2% bf16 MFU | 261068 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.707683 +step 7151/18794 | loss 3.099250 (-1.74z)| norm 0.4464 (+2.71z)| lr 4.56e-03 | 2001.59 ms | 68.6% bf16 MFU | 261111 tok/s +step 7152/18794 | loss 3.112594 (-1.27z)| norm 0.2314 (-0.32z)| lr 4.56e-03 | 2002.22 ms | 68.5% bf16 MFU | 261148 tok/s +step 7153/18794 | loss 3.166465 (+0.50z)| norm 0.3231 (+0.95z)| lr 4.56e-03 | 2016.61 ms | 68.1% bf16 MFU | 261090 tok/s +step 7154/18794 | loss 3.128276 (-0.76z)| norm 0.3651 (+1.51z)| lr 4.56e-03 | 2019.30 ms | 68.0% bf16 MFU | 261017 tok/s +step 7155/18794 | loss 3.133307 (-0.60z)| norm 0.1864 (-0.99z)| lr 4.56e-03 | 2010.21 ms | 68.3% bf16 MFU | 261007 tok/s +step 7156/18794 | loss 3.189219 (+1.23z)| norm 0.2788 (+0.29z)| lr 4.56e-03 | 2001.60 ms | 68.6% bf16 MFU | 261053 tok/s +step 7157/18794 | loss 3.138919 (-0.42z)| norm 0.2102 (-0.67z)| lr 4.56e-03 | 2001.55 ms | 68.6% bf16 MFU | 261098 tok/s +step 7158/18794 | loss 3.165591 (+0.46z)| norm 0.1763 (-1.13z)| lr 4.56e-03 | 2009.28 ms | 68.3% bf16 MFU | 261090 tok/s +step 7159/18794 | loss 3.176876 (+0.82z)| norm 0.1790 (-1.10z)| lr 4.56e-03 | 2018.91 ms | 68.0% bf16 MFU | 261020 tok/s +step 7160/18794 | loss 3.173057 (+0.68z)| norm 0.2854 (+0.48z)| lr 4.55e-03 | 2026.96 ms | 67.7% bf16 MFU | 260901 tok/s +step 7161/18794 | loss 3.219361 (+2.15z)| norm 0.3427 (+1.33z)| lr 4.55e-03 | 2019.34 ms | 68.0% bf16 MFU | 260838 tok/s +step 7162/18794 | loss 3.108974 (-1.43z)| norm 0.2677 (+0.20z)| lr 4.55e-03 | 2011.90 ms | 68.2% bf16 MFU | 260826 tok/s +step 7163/18794 | loss 3.156251 (+0.09z)| norm 0.2146 (-0.58z)| lr 4.55e-03 | 2033.89 ms | 67.5% bf16 MFU | 260673 tok/s +step 7164/18794 | loss 3.196396 (+1.39z)| norm 0.1723 (-1.20z)| lr 4.55e-03 | 2017.78 ms | 68.0% bf16 MFU | 260631 tok/s +step 7165/18794 | loss 3.171904 (+0.59z)| norm 0.2421 (-0.17z)| lr 4.55e-03 | 2018.71 ms | 68.0% bf16 MFU | 260585 tok/s +step 7166/18794 | loss 3.118266 (-1.20z)| norm 0.2559 (+0.03z)| lr 4.55e-03 | 2012.50 ms | 68.2% bf16 MFU | 260582 tok/s +step 7167/18794 | loss 3.140742 (-0.44z)| norm 0.1994 (-0.82z)| lr 4.55e-03 | 1994.37 ms | 68.8% bf16 MFU | 260697 tok/s +step 7168/18794 | loss 3.145650 (-0.27z)| norm 0.1774 (-1.14z)| lr 4.55e-03 | 2003.04 ms | 68.5% bf16 MFU | 260750 tok/s +step 7169/18794 | loss 3.141702 (-0.41z)| norm 0.2118 (-0.62z)| lr 4.55e-03 | 2002.18 ms | 68.5% bf16 MFU | 260805 tok/s +step 7170/18794 | loss 3.144410 (-0.32z)| norm 0.2590 (+0.09z)| lr 4.55e-03 | 2011.86 ms | 68.2% bf16 MFU | 260795 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.279883 +step 7171/18794 | loss 3.216590 (+2.05z)| norm 0.4107 (+2.28z)| lr 4.55e-03 | 1994.52 ms | 68.8% bf16 MFU | 260898 tok/s +step 7172/18794 | loss 3.228356 (+2.35z)| norm 0.3240 (+0.99z)| lr 4.55e-03 | 1986.33 ms | 69.1% bf16 MFU | 261051 tok/s +step 7173/18794 | loss 3.176598 (+0.67z)| norm 0.2365 (-0.28z)| lr 4.55e-03 | 1995.75 ms | 68.8% bf16 MFU | 261133 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.534954 +step 7174/18794 | loss 3.175732 (+0.63z)| norm 0.4377 (+2.53z)| lr 4.55e-03 | 1995.18 ms | 68.8% bf16 MFU | 261215 tok/s +step 7175/18794 | loss 3.207710 (+1.67z)| norm 0.2858 (+0.40z)| lr 4.55e-03 | 2003.88 ms | 68.5% bf16 MFU | 261236 tok/s +step 7176/18794 | loss 3.205411 (+1.56z)| norm 0.2026 (-0.78z)| lr 4.55e-03 | 2003.36 ms | 68.5% bf16 MFU | 261260 tok/s +step 7177/18794 | loss 3.131963 (-0.79z)| norm 0.2110 (-0.65z)| lr 4.55e-03 | 1996.39 ms | 68.7% bf16 MFU | 261328 tok/s +step 7178/18794 | loss 3.122459 (-1.12z)| norm 0.1820 (-1.04z)| lr 4.55e-03 | 2009.61 ms | 68.3% bf16 MFU | 261306 tok/s +step 7179/18794 | loss 3.153257 (-0.10z)| norm 0.2142 (-0.58z)| lr 4.55e-03 | 1995.92 ms | 68.8% bf16 MFU | 261375 tok/s +step 7180/18794 | loss 3.143258 (-0.43z)| norm 0.2046 (-0.71z)| lr 4.55e-03 | 2001.45 ms | 68.6% bf16 MFU | 261403 tok/s +step 7181/18794 | loss 3.169127 (+0.43z)| norm 0.2021 (-0.73z)| lr 4.55e-03 | 2011.71 ms | 68.2% bf16 MFU | 261364 tok/s +step 7182/18794 | loss 3.143524 (-0.42z)| norm 0.2072 (-0.66z)| lr 4.55e-03 | 2027.65 ms | 67.7% bf16 MFU | 261224 tok/s +step 7183/18794 | loss 3.168676 (+0.41z)| norm 0.2509 (-0.04z)| lr 4.55e-03 | 2003.08 ms | 68.5% bf16 MFU | 261250 tok/s +step 7184/18794 | loss 3.166419 (+0.36z)| norm 0.1973 (-0.78z)| lr 4.54e-03 | 1994.78 ms | 68.8% bf16 MFU | 261329 tok/s +step 7185/18794 | loss 3.189939 (+1.13z)| norm 0.2968 (+0.65z)| lr 4.54e-03 | 1996.28 ms | 68.7% bf16 MFU | 261394 tok/s +step 7186/18794 | loss 3.110375 (-1.50z)| norm 0.3124 (+0.87z)| lr 4.54e-03 | 1985.72 ms | 69.1% bf16 MFU | 261526 tok/s +step 7187/18794 | loss 3.128678 (-0.87z)| norm 0.1848 (-0.96z)| lr 4.54e-03 | 2003.85 ms | 68.5% bf16 MFU | 261532 tok/s +step 7188/18794 | loss 3.134438 (-0.68z)| norm 0.2563 (+0.08z)| lr 4.54e-03 | 2010.55 ms | 68.3% bf16 MFU | 261494 tok/s +step 7189/18794 | loss 3.174930 (+0.69z)| norm 0.1890 (-0.89z)| lr 4.54e-03 | 2011.79 ms | 68.2% bf16 MFU | 261449 tok/s +step 7190/18794 | loss 3.156052 (+0.05z)| norm 0.2449 (-0.07z)| lr 4.54e-03 | 1995.48 ms | 68.8% bf16 MFU | 261514 tok/s +step 7191/18794 | loss 3.148977 (-0.19z)| norm 0.3347 (+1.21z)| lr 4.54e-03 | 2003.13 ms | 68.5% bf16 MFU | 261525 tok/s +step 7192/18794 | loss 3.166770 (+0.41z)| norm 0.1906 (-0.87z)| lr 4.54e-03 | 1994.94 ms | 68.8% bf16 MFU | 261589 tok/s +step 7193/18794 | loss 3.127890 (-0.89z)| norm 0.2254 (-0.36z)| lr 4.54e-03 | 1994.61 ms | 68.8% bf16 MFU | 261652 tok/s +step 7194/18794 | loss 3.140428 (-0.46z)| norm 0.1572 (-1.32z)| lr 4.54e-03 | 1994.80 ms | 68.8% bf16 MFU | 261711 tok/s +step 7195/18794 | loss 3.199090 (+1.50z)| norm 0.2503 (+0.00z)| lr 4.54e-03 | 1993.07 ms | 68.9% bf16 MFU | 261778 tok/s +step 7196/18794 | loss 3.147073 (-0.26z)| norm 0.2959 (+0.65z)| lr 4.54e-03 | 1994.88 ms | 68.8% bf16 MFU | 261830 tok/s +step 7197/18794 | loss 3.125660 (-0.99z)| norm 0.2719 (+0.28z)| lr 4.54e-03 | 1990.24 ms | 69.0% bf16 MFU | 261910 tok/s +step 7198/18794 | loss 3.128767 (-0.87z)| norm 0.1961 (-0.83z)| lr 4.54e-03 | 1987.41 ms | 69.1% bf16 MFU | 262005 tok/s +step 7199/18794 | loss 3.152639 (-0.05z)| norm 0.3463 (+1.35z)| lr 4.54e-03 | 1994.24 ms | 68.8% bf16 MFU | 262050 tok/s +step 7200/18794 | loss 3.255616 (+3.24z)| norm 0.2446 (-0.12z)| lr 4.54e-03 | 2007.45 ms | 68.4% bf16 MFU | 262006 tok/s +step 7201/18794 | loss 3.205415 (+1.59z)| norm 0.1976 (-0.80z)| lr 4.54e-03 | 1980.29 ms | 69.3% bf16 MFU | 262143 tok/s +step 7202/18794 | loss 3.160366 (+0.13z)| norm 0.1765 (-1.11z)| lr 4.54e-03 | 1986.86 ms | 69.1% bf16 MFU | 262230 tok/s +step 7203/18794 | loss 3.171798 (+0.50z)| norm 0.3707 (+1.67z)| lr 4.54e-03 | 1994.85 ms | 68.8% bf16 MFU | 262259 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.988740 +step 7204/18794 | loss 3.149889 (-0.22z)| norm 0.4598 (+2.99z)| lr 4.54e-03 | 2002.11 ms | 68.5% bf16 MFU | 262240 tok/s +step 7205/18794 | loss 3.168324 (+0.36z)| norm 0.3083 (+0.78z)| lr 4.54e-03 | 1993.59 ms | 68.8% bf16 MFU | 262277 tok/s +step 7206/18794 | loss 3.193244 (+1.17z)| norm 0.2007 (-0.76z)| lr 4.54e-03 | 1995.42 ms | 68.8% bf16 MFU | 262301 tok/s +step 7207/18794 | loss 3.167746 (+0.30z)| norm 0.3859 (+1.92z)| lr 4.54e-03 | 2003.24 ms | 68.5% bf16 MFU | 262272 tok/s +step 7208/18794 | loss 3.168768 (+0.33z)| norm 0.2688 (+0.21z)| lr 4.53e-03 | 2002.84 ms | 68.5% bf16 MFU | 262247 tok/s +step 7209/18794 | loss 3.155147 (-0.12z)| norm 0.2092 (-0.65z)| lr 4.53e-03 | 2003.40 ms | 68.5% bf16 MFU | 262219 tok/s +step 7210/18794 | loss 3.134810 (-0.80z)| norm 0.2598 (+0.08z)| lr 4.53e-03 | 1995.43 ms | 68.8% bf16 MFU | 262246 tok/s +step 7211/18794 | loss 3.136886 (-0.73z)| norm 0.2103 (-0.63z)| lr 4.53e-03 | 1987.46 ms | 69.0% bf16 MFU | 262323 tok/s +step 7212/18794 | loss 3.212975 (+1.80z)| norm 0.2213 (-0.48z)| lr 4.53e-03 | 1979.57 ms | 69.3% bf16 MFU | 262449 tok/s +step 7213/18794 | loss 3.187532 (+0.93z)| norm 0.3070 (+0.75z)| lr 4.53e-03 | 1988.38 ms | 69.0% bf16 MFU | 262511 tok/s +step 7214/18794 | loss 3.153186 (-0.19z)| norm 0.2089 (-0.67z)| lr 4.53e-03 | 1994.50 ms | 68.8% bf16 MFU | 262529 tok/s +step 7215/18794 | loss 3.183805 (+0.83z)| norm 0.2090 (-0.67z)| lr 4.53e-03 | 1986.63 ms | 69.1% bf16 MFU | 262598 tok/s +step 7216/18794 | loss 3.113671 (-1.54z)| norm 0.3669 (+1.59z)| lr 4.53e-03 | 1987.44 ms | 69.0% bf16 MFU | 262658 tok/s +step 7217/18794 | loss 3.185478 (+0.88z)| norm 0.3120 (+0.78z)| lr 4.53e-03 | 1988.17 ms | 69.0% bf16 MFU | 262710 tok/s +step 7218/18794 | loss 3.186302 (+0.90z)| norm 0.2234 (-0.51z)| lr 4.53e-03 | 1987.38 ms | 69.1% bf16 MFU | 262765 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.418641 +step 7219/18794 | loss 3.127057 (-1.09z)| norm 0.4317 (+2.42z)| lr 4.53e-03 | 1988.66 ms | 69.0% bf16 MFU | 262809 tok/s +step 7220/18794 | loss 3.182105 (+0.77z)| norm 0.2553 (-0.09z)| lr 4.53e-03 | 1980.17 ms | 69.3% bf16 MFU | 262907 tok/s +step 7221/18794 | loss 3.135337 (-0.80z)| norm 0.3730 (+1.55z)| lr 4.53e-03 | 1987.95 ms | 69.0% bf16 MFU | 262948 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.604943 +step 7222/18794 | loss 3.157795 (-0.03z)| norm 0.4560 (+2.60z)| lr 4.53e-03 | 1988.75 ms | 69.0% bf16 MFU | 262982 tok/s +step 7223/18794 | loss 3.117993 (-1.38z)| norm 0.2181 (-0.64z)| lr 4.53e-03 | 1981.57 ms | 69.3% bf16 MFU | 263062 tok/s +step 7224/18794 | loss 3.140478 (-0.61z)| norm 0.4135 (+1.97z)| lr 4.53e-03 | 1997.55 ms | 68.7% bf16 MFU | 263032 tok/s +step 7225/18794 | loss 3.158522 (+0.00z)| norm 0.2703 (+0.04z)| lr 4.53e-03 | 1981.33 ms | 69.3% bf16 MFU | 263111 tok/s +step 7226/18794 | loss 3.167348 (+0.31z)| norm 0.2146 (-0.69z)| lr 4.53e-03 | 1980.07 ms | 69.3% bf16 MFU | 263195 tok/s +step 7227/18794 | loss 3.156260 (-0.08z)| norm 0.2200 (-0.61z)| lr 4.53e-03 | 1980.76 ms | 69.3% bf16 MFU | 263269 tok/s +step 7228/18794 | loss 3.125699 (-1.13z)| norm 0.1940 (-0.95z)| lr 4.53e-03 | 1986.01 ms | 69.1% bf16 MFU | 263306 tok/s +step 7229/18794 | loss 3.191433 (+1.11z)| norm 0.2552 (-0.12z)| lr 4.53e-03 | 1988.26 ms | 69.0% bf16 MFU | 263325 tok/s +step 7230/18794 | loss 3.207010 (+1.60z)| norm 0.3720 (+1.45z)| lr 4.53e-03 | 1982.80 ms | 69.2% bf16 MFU | 263379 tok/s +step 7231/18794 | loss 3.139280 (-0.68z)| norm 0.1974 (-0.89z)| lr 4.52e-03 | 1989.06 ms | 69.0% bf16 MFU | 263390 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.319317 +step 7232/18794 | loss 3.183601 (+0.80z)| norm 0.5288 (+3.32z)| lr 4.52e-03 | 1988.59 ms | 69.0% bf16 MFU | 263403 tok/s +step 7233/18794 | loss 3.111145 (-1.60z)| norm 0.3322 (+0.82z)| lr 4.52e-03 | 1985.81 ms | 69.1% bf16 MFU | 263433 tok/s +step 7234/18794 | loss 3.184760 (+0.82z)| norm 0.3184 (+0.63z)| lr 4.52e-03 | 1980.94 ms | 69.3% bf16 MFU | 263495 tok/s +step 7235/18794 | loss 3.114638 (-1.50z)| norm 0.3708 (+1.29z)| lr 4.52e-03 | 1979.47 ms | 69.3% bf16 MFU | 263563 tok/s +step 7236/18794 | loss 3.109708 (-1.63z)| norm 0.2026 (-0.82z)| lr 4.52e-03 | 1980.36 ms | 69.3% bf16 MFU | 263622 tok/s +step 7237/18794 | loss 3.179758 (+0.65z)| norm 0.3539 (+1.07z)| lr 4.52e-03 | 1988.51 ms | 69.0% bf16 MFU | 263624 tok/s +step 7238/18794 | loss 3.170331 (+0.33z)| norm 0.2034 (-0.82z)| lr 4.52e-03 | 1985.48 ms | 69.1% bf16 MFU | 263646 tok/s +step 7239/18794 | loss 3.223137 (+2.02z)| norm 0.2062 (-0.76z)| lr 4.52e-03 | 1986.05 ms | 69.1% bf16 MFU | 263663 tok/s +step 7240/18794 | loss 3.166491 (+0.19z)| norm 0.2564 (-0.13z)| lr 4.52e-03 | 1985.96 ms | 69.1% bf16 MFU | 263680 tok/s +step 7241/18794 | loss 3.176077 (+0.49z)| norm 0.1758 (-1.15z)| lr 4.52e-03 | 1981.02 ms | 69.3% bf16 MFU | 263729 tok/s +step 7242/18794 | loss 3.177437 (+0.52z)| norm 0.2405 (-0.32z)| lr 4.52e-03 | 1981.44 ms | 69.3% bf16 MFU | 263772 tok/s +step 7243/18794 | loss 3.139817 (-0.67z)| norm 0.2213 (-0.56z)| lr 4.52e-03 | 1980.54 ms | 69.3% bf16 MFU | 263820 tok/s +step 7244/18794 | loss 3.118479 (-1.33z)| norm 0.2827 (+0.22z)| lr 4.52e-03 | 1981.50 ms | 69.3% bf16 MFU | 263858 tok/s +step 7245/18794 | loss 3.175750 (+0.52z)| norm 0.2186 (-0.60z)| lr 4.52e-03 | 1979.11 ms | 69.3% bf16 MFU | 263911 tok/s +step 7246/18794 | loss 3.131278 (-0.90z)| norm 0.2298 (-0.46z)| lr 4.52e-03 | 1980.86 ms | 69.3% bf16 MFU | 263949 tok/s +step 7247/18794 | loss 3.125944 (-1.06z)| norm 0.2284 (-0.48z)| lr 4.52e-03 | 1981.95 ms | 69.2% bf16 MFU | 263978 tok/s +step 7248/18794 | loss 3.119026 (-1.27z)| norm 0.1882 (-0.97z)| lr 4.52e-03 | 1979.45 ms | 69.3% bf16 MFU | 264023 tok/s +step 7249/18794 | loss 3.151068 (-0.25z)| norm 0.2145 (-0.63z)| lr 4.52e-03 | 1979.14 ms | 69.3% bf16 MFU | 264067 tok/s +step 7250/18794 | loss 3.131902 (-0.85z)| norm 0.1837 (-1.01z)| lr 4.52e-03 | 1979.90 ms | 69.3% bf16 MFU | 264104 tok/s +val loss 3.166789 +HellaSwag: 2905/10042 = 0.289285: 0/1256 +step 7251/18794 | loss 3.147482 (-0.38z)| norm 0.1830 (-1.01z)| lr 4.52e-03 | 1992.11 ms | 68.9% bf16 MFU | 264058 tok/s +step 7252/18794 | loss 3.123088 (-1.18z)| norm 0.1733 (-1.12z)| lr 4.52e-03 | 1981.76 ms | 69.2% bf16 MFU | 264083 tok/s +step 7253/18794 | loss 3.168770 (+0.32z)| norm 0.3110 (+0.66z)| lr 4.52e-03 | 1982.35 ms | 69.2% bf16 MFU | 264102 tok/s +step 7254/18794 | loss 3.087027 (-2.30z)| norm 0.2891 (+0.39z)| lr 4.52e-03 | 1980.82 ms | 69.3% bf16 MFU | 264131 tok/s +step 7255/18794 | loss 3.159209 (+0.01z)| norm 0.2630 (+0.04z)| lr 4.51e-03 | 1979.18 ms | 69.3% bf16 MFU | 264170 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.198318 +step 7256/18794 | loss 3.189357 (+0.98z)| norm 0.4334 (+2.20z)| lr 4.51e-03 | 1984.07 ms | 69.2% bf16 MFU | 264174 tok/s +step 7257/18794 | loss 3.251923 (+2.84z)| norm 0.3040 (+0.53z)| lr 4.51e-03 | 1981.53 ms | 69.3% bf16 MFU | 264194 tok/s +step 7258/18794 | loss 3.155138 (-0.15z)| norm 0.2180 (-0.57z)| lr 4.51e-03 | 1979.22 ms | 69.3% bf16 MFU | 264230 tok/s +step 7259/18794 | loss 3.097941 (-1.86z)| norm 0.1953 (-0.87z)| lr 4.51e-03 | 1986.04 ms | 69.1% bf16 MFU | 264217 tok/s +step 7260/18794 | loss 3.153977 (-0.15z)| norm 0.1870 (-0.96z)| lr 4.51e-03 | 1988.19 ms | 69.0% bf16 MFU | 264192 tok/s +step 7261/18794 | loss 3.150477 (-0.24z)| norm 0.1970 (-0.81z)| lr 4.51e-03 | 1986.12 ms | 69.1% bf16 MFU | 264181 tok/s +step 7262/18794 | loss 3.159786 (+0.03z)| norm 0.3657 (+1.33z)| lr 4.51e-03 | 1983.14 ms | 69.2% bf16 MFU | 264190 tok/s +step 7263/18794 | loss 3.139204 (-0.61z)| norm 0.4228 (+1.99z)| lr 4.51e-03 | 1981.66 ms | 69.3% bf16 MFU | 264209 tok/s +step 7264/18794 | loss 3.112155 (-1.42z)| norm 0.2559 (-0.11z)| lr 4.51e-03 | 1980.75 ms | 69.3% bf16 MFU | 264233 tok/s +step 7265/18794 | loss 3.134907 (-0.70z)| norm 0.1842 (-0.99z)| lr 4.51e-03 | 1986.04 ms | 69.1% bf16 MFU | 264221 tok/s +step 7266/18794 | loss 3.106983 (-1.56z)| norm 0.1953 (-0.85z)| lr 4.51e-03 | 1978.86 ms | 69.3% bf16 MFU | 264257 tok/s +step 7267/18794 | loss 3.107539 (-1.52z)| norm 0.2112 (-0.65z)| lr 4.51e-03 | 1979.02 ms | 69.3% bf16 MFU | 264291 tok/s +step 7268/18794 | loss 3.194430 (+1.12z)| norm 0.2392 (-0.31z)| lr 4.51e-03 | 1984.21 ms | 69.2% bf16 MFU | 264288 tok/s +step 7269/18794 | loss 3.133397 (-0.73z)| norm 0.1746 (-1.11z)| lr 4.51e-03 | 1978.45 ms | 69.4% bf16 MFU | 264323 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.381261 +step 7270/18794 | loss 3.105368 (-1.55z)| norm 0.4618 (+2.38z)| lr 4.51e-03 | 1978.66 ms | 69.4% bf16 MFU | 264356 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.313387 +step 7271/18794 | loss 3.140112 (-0.49z)| norm 0.4589 (+2.31z)| lr 4.51e-03 | 1983.52 ms | 69.2% bf16 MFU | 264354 tok/s +step 7272/18794 | loss 3.093135 (-1.90z)| norm 0.2507 (-0.18z)| lr 4.51e-03 | 1982.96 ms | 69.2% bf16 MFU | 264356 tok/s +step 7273/18794 | loss 3.127033 (-0.84z)| norm 0.3877 (+1.44z)| lr 4.51e-03 | 1990.44 ms | 68.9% bf16 MFU | 264308 tok/s +step 7274/18794 | loss 3.113056 (-1.24z)| norm 0.2089 (-0.67z)| lr 4.51e-03 | 1989.38 ms | 69.0% bf16 MFU | 264270 tok/s +step 7275/18794 | loss 3.163326 (+0.31z)| norm 0.2296 (-0.42z)| lr 4.51e-03 | 1980.39 ms | 69.3% bf16 MFU | 264294 tok/s +step 7276/18794 | loss 3.190987 (+1.17z)| norm 0.2003 (-0.77z)| lr 4.51e-03 | 1980.23 ms | 69.3% bf16 MFU | 264317 tok/s +step 7277/18794 | loss 3.101186 (-1.59z)| norm 0.2463 (-0.22z)| lr 4.51e-03 | 1981.99 ms | 69.2% bf16 MFU | 264327 tok/s +step 7278/18794 | loss 3.144321 (-0.27z)| norm 0.2105 (-0.66z)| lr 4.50e-03 | 1979.26 ms | 69.3% bf16 MFU | 264356 tok/s +step 7279/18794 | loss 3.203895 (+1.53z)| norm 0.2036 (-0.74z)| lr 4.50e-03 | 1979.45 ms | 69.3% bf16 MFU | 264381 tok/s +step 7280/18794 | loss 3.175331 (+0.65z)| norm 0.2558 (-0.11z)| lr 4.50e-03 | 1996.41 ms | 68.7% bf16 MFU | 264293 tok/s +step 7281/18794 | loss 3.111488 (-1.27z)| norm 0.1894 (-0.92z)| lr 4.50e-03 | 1984.21 ms | 69.2% bf16 MFU | 264290 tok/s +step 7282/18794 | loss 3.152107 (-0.04z)| norm 0.2050 (-0.73z)| lr 4.50e-03 | 1982.82 ms | 69.2% bf16 MFU | 264296 tok/s +step 7283/18794 | loss 3.132427 (-0.63z)| norm 0.2128 (-0.63z)| lr 4.50e-03 | 1982.47 ms | 69.2% bf16 MFU | 264304 tok/s +step 7284/18794 | loss 3.179432 (+0.79z)| norm 0.1706 (-1.13z)| lr 4.50e-03 | 1980.02 ms | 69.3% bf16 MFU | 264328 tok/s +step 7285/18794 | loss 3.135258 (-0.53z)| norm 0.1752 (-1.06z)| lr 4.50e-03 | 1984.51 ms | 69.2% bf16 MFU | 264322 tok/s +step 7286/18794 | loss 3.186811 (+1.01z)| norm 0.2398 (-0.27z)| lr 4.50e-03 | 1980.65 ms | 69.3% bf16 MFU | 264341 tok/s +step 7287/18794 | loss 3.161752 (+0.24z)| norm 0.2474 (-0.19z)| lr 4.50e-03 | 1979.86 ms | 69.3% bf16 MFU | 264364 tok/s +step 7288/18794 | loss 3.172503 (+0.56z)| norm 0.2068 (-0.67z)| lr 4.50e-03 | 1979.73 ms | 69.3% bf16 MFU | 264387 tok/s +step 7289/18794 | loss 3.134309 (-0.59z)| norm 0.2511 (-0.14z)| lr 4.50e-03 | 1981.04 ms | 69.3% bf16 MFU | 264401 tok/s +step 7290/18794 | loss 3.134831 (-0.57z)| norm 0.3313 (+0.82z)| lr 4.50e-03 | 1979.84 ms | 69.3% bf16 MFU | 264421 tok/s +step 7291/18794 | loss 3.195018 (+1.24z)| norm 0.2590 (-0.05z)| lr 4.50e-03 | 1979.14 ms | 69.3% bf16 MFU | 264446 tok/s +step 7292/18794 | loss 3.174977 (+0.63z)| norm 0.2803 (+0.20z)| lr 4.50e-03 | 1982.82 ms | 69.2% bf16 MFU | 264444 tok/s +step 7293/18794 | loss 3.119875 (-1.03z)| norm 0.3615 (+1.17z)| lr 4.50e-03 | 1980.51 ms | 69.3% bf16 MFU | 264458 tok/s +step 7294/18794 | loss 3.135767 (-0.55z)| norm 0.2032 (-0.76z)| lr 4.50e-03 | 1982.28 ms | 69.2% bf16 MFU | 264459 tok/s +step 7295/18794 | loss 3.115746 (-1.13z)| norm 0.3664 (+1.20z)| lr 4.50e-03 | 1987.76 ms | 69.0% bf16 MFU | 264424 tok/s +step 7296/18794 | loss 3.147500 (-0.17z)| norm 0.2202 (-0.56z)| lr 4.50e-03 | 1982.85 ms | 69.2% bf16 MFU | 264424 tok/s +step 7297/18794 | loss 3.171899 (+0.55z)| norm 0.2701 (+0.05z)| lr 4.50e-03 | 1984.04 ms | 69.2% bf16 MFU | 264415 tok/s +step 7298/18794 | loss 3.162278 (+0.25z)| norm 0.2491 (-0.21z)| lr 4.50e-03 | 1978.09 ms | 69.4% bf16 MFU | 264447 tok/s +step 7299/18794 | loss 3.126395 (-0.83z)| norm 0.2353 (-0.37z)| lr 4.50e-03 | 1984.47 ms | 69.2% bf16 MFU | 264434 tok/s +step 7300/18794 | loss 3.146397 (-0.20z)| norm 0.2959 (+0.36z)| lr 4.50e-03 | 1981.56 ms | 69.3% bf16 MFU | 264442 tok/s +step 7301/18794 | loss 3.155280 (+0.10z)| norm 0.1713 (-1.15z)| lr 4.50e-03 | 1980.44 ms | 69.3% bf16 MFU | 264456 tok/s +step 7302/18794 | loss 3.064273 (-2.70z)| norm 0.3094 (+0.52z)| lr 4.49e-03 | 1979.40 ms | 69.3% bf16 MFU | 264477 tok/s +step 7303/18794 | loss 3.172979 (+0.68z)| norm 0.1905 (-0.92z)| lr 4.49e-03 | 1979.13 ms | 69.3% bf16 MFU | 264498 tok/s +step 7304/18794 | loss 3.145730 (-0.17z)| norm 0.2189 (-0.56z)| lr 4.49e-03 | 1979.41 ms | 69.3% bf16 MFU | 264517 tok/s +step 7305/18794 | loss 3.140586 (-0.32z)| norm 0.1732 (-1.11z)| lr 4.49e-03 | 1979.86 ms | 69.3% bf16 MFU | 264532 tok/s +step 7306/18794 | loss 3.142224 (-0.25z)| norm 0.2745 (+0.15z)| lr 4.49e-03 | 1979.12 ms | 69.3% bf16 MFU | 264551 tok/s +step 7307/18794 | loss 3.149217 (-0.03z)| norm 0.2709 (+0.12z)| lr 4.49e-03 | 1980.18 ms | 69.3% bf16 MFU | 264562 tok/s +step 7308/18794 | loss 3.135396 (-0.45z)| norm 0.2996 (+0.48z)| lr 4.49e-03 | 1979.33 ms | 69.3% bf16 MFU | 264578 tok/s +step 7309/18794 | loss 3.192723 (+1.32z)| norm 0.3113 (+0.62z)| lr 4.49e-03 | 1980.07 ms | 69.3% bf16 MFU | 264588 tok/s +step 7310/18794 | loss 3.137872 (-0.38z)| norm 0.1705 (-1.15z)| lr 4.49e-03 | 1979.92 ms | 69.3% bf16 MFU | 264599 tok/s +step 7311/18794 | loss 3.090897 (-1.80z)| norm 0.2490 (-0.17z)| lr 4.49e-03 | 1977.93 ms | 69.4% bf16 MFU | 264622 tok/s +step 7312/18794 | loss 3.157005 (+0.24z)| norm 0.1964 (-0.83z)| lr 4.49e-03 | 1979.95 ms | 69.3% bf16 MFU | 264631 tok/s +step 7313/18794 | loss 3.145576 (-0.10z)| norm 0.1961 (-0.81z)| lr 4.49e-03 | 1980.05 ms | 69.3% bf16 MFU | 264639 tok/s +step 7314/18794 | loss 3.110237 (-1.19z)| norm 0.3778 (+1.44z)| lr 4.49e-03 | 1979.43 ms | 69.3% bf16 MFU | 264650 tok/s +step 7315/18794 | loss 3.100974 (-1.45z)| norm 0.2777 (+0.18z)| lr 4.49e-03 | 1983.02 ms | 69.2% bf16 MFU | 264637 tok/s +step 7316/18794 | loss 3.107876 (-1.23z)| norm 0.2278 (-0.43z)| lr 4.49e-03 | 1979.69 ms | 69.3% bf16 MFU | 264647 tok/s +step 7317/18794 | loss 3.185900 (+1.19z)| norm 0.3656 (+1.30z)| lr 4.49e-03 | 1979.90 ms | 69.3% bf16 MFU | 264655 tok/s +step 7318/18794 | loss 3.169240 (+0.69z)| norm 0.2629 (+0.00z)| lr 4.49e-03 | 1981.30 ms | 69.3% bf16 MFU | 264653 tok/s +step 7319/18794 | loss 3.117845 (-0.91z)| norm 0.3206 (+0.76z)| lr 4.49e-03 | 1979.50 ms | 69.3% bf16 MFU | 264663 tok/s +step 7320/18794 | loss 3.149905 (+0.10z)| norm 0.3037 (+0.53z)| lr 4.49e-03 | 1978.41 ms | 69.4% bf16 MFU | 264680 tok/s +step 7321/18794 | loss 3.167761 (+0.64z)| norm 0.2010 (-0.76z)| lr 4.49e-03 | 1980.60 ms | 69.3% bf16 MFU | 264682 tok/s +step 7322/18794 | loss 3.177449 (+0.94z)| norm 0.2270 (-0.41z)| lr 4.49e-03 | 1979.98 ms | 69.3% bf16 MFU | 264688 tok/s +step 7323/18794 | loss 3.227881 (+2.41z)| norm 0.3115 (+0.70z)| lr 4.49e-03 | 1990.43 ms | 68.9% bf16 MFU | 264623 tok/s +step 7324/18794 | loss 3.126312 (-0.67z)| norm 0.2901 (+0.44z)| lr 4.49e-03 | 1984.81 ms | 69.1% bf16 MFU | 264600 tok/s +step 7325/18794 | loss 3.110518 (-1.12z)| norm 0.3019 (+0.60z)| lr 4.48e-03 | 1985.10 ms | 69.1% bf16 MFU | 264575 tok/s +step 7326/18794 | loss 3.163814 (+0.48z)| norm 0.3419 (+1.12z)| lr 4.48e-03 | 1982.46 ms | 69.2% bf16 MFU | 264570 tok/s +step 7327/18794 | loss 3.087674 (-1.76z)| norm 0.2074 (-0.70z)| lr 4.48e-03 | 1980.70 ms | 69.3% bf16 MFU | 264576 tok/s +step 7328/18794 | loss 3.150592 (+0.10z)| norm 0.4108 (+1.98z)| lr 4.48e-03 | 1980.50 ms | 69.3% bf16 MFU | 264584 tok/s +step 7329/18794 | loss 3.087090 (-1.75z)| norm 0.4016 (+1.81z)| lr 4.48e-03 | 1980.26 ms | 69.3% bf16 MFU | 264592 tok/s +step 7330/18794 | loss 3.085529 (-1.76z)| norm 0.1859 (-0.98z)| lr 4.48e-03 | 1980.97 ms | 69.3% bf16 MFU | 264596 tok/s +step 7331/18794 | loss 3.126226 (-0.55z)| norm 0.2912 (+0.39z)| lr 4.48e-03 | 1980.10 ms | 69.3% bf16 MFU | 264605 tok/s +step 7332/18794 | loss 3.112523 (-0.94z)| norm 0.1690 (-1.24z)| lr 4.48e-03 | 1979.42 ms | 69.3% bf16 MFU | 264618 tok/s +step 7333/18794 | loss 3.193075 (+1.41z)| norm 0.2398 (-0.25z)| lr 4.48e-03 | 1980.33 ms | 69.3% bf16 MFU | 264625 tok/s +step 7334/18794 | loss 3.174956 (+0.88z)| norm 0.2195 (-0.52z)| lr 4.48e-03 | 1980.35 ms | 69.3% bf16 MFU | 264631 tok/s +step 7335/18794 | loss 3.154046 (+0.26z)| norm 0.2948 (+0.56z)| lr 4.48e-03 | 1979.63 ms | 69.3% bf16 MFU | 264641 tok/s +step 7336/18794 | loss 3.141775 (-0.12z)| norm 0.1731 (-1.16z)| lr 4.48e-03 | 1981.57 ms | 69.3% bf16 MFU | 264638 tok/s +step 7337/18794 | loss 3.128551 (-0.50z)| norm 0.2581 (+0.05z)| lr 4.48e-03 | 1981.09 ms | 69.3% bf16 MFU | 264639 tok/s +step 7338/18794 | loss 3.112006 (-0.97z)| norm 0.2042 (-0.72z)| lr 4.48e-03 | 1978.58 ms | 69.4% bf16 MFU | 264656 tok/s +step 7339/18794 | loss 3.082330 (-1.84z)| norm 0.2061 (-0.69z)| lr 4.48e-03 | 1983.56 ms | 69.2% bf16 MFU | 264639 tok/s +step 7340/18794 | loss 3.154910 (+0.36z)| norm 0.2137 (-0.57z)| lr 4.48e-03 | 1982.52 ms | 69.2% bf16 MFU | 264630 tok/s +step 7341/18794 | loss 3.174242 (+0.94z)| norm 0.2076 (-0.67z)| lr 4.48e-03 | 1987.63 ms | 69.0% bf16 MFU | 264587 tok/s +step 7342/18794 | loss 3.125289 (-0.52z)| norm 0.2200 (-0.49z)| lr 4.48e-03 | 1982.69 ms | 69.2% bf16 MFU | 264579 tok/s +step 7343/18794 | loss 3.188171 (+1.36z)| norm 0.1876 (-0.94z)| lr 4.48e-03 | 1986.93 ms | 69.1% bf16 MFU | 264544 tok/s +step 7344/18794 | loss 3.149216 (+0.18z)| norm 0.1686 (-1.19z)| lr 4.48e-03 | 1981.81 ms | 69.2% bf16 MFU | 264544 tok/s +step 7345/18794 | loss 3.085350 (-1.70z)| norm 0.2013 (-0.72z)| lr 4.48e-03 | 1980.66 ms | 69.3% bf16 MFU | 264552 tok/s +step 7346/18794 | loss 3.083771 (-1.71z)| norm 0.1761 (-1.07z)| lr 4.48e-03 | 1980.01 ms | 69.3% bf16 MFU | 264564 tok/s +step 7347/18794 | loss 3.132975 (-0.27z)| norm 0.2416 (-0.15z)| lr 4.48e-03 | 1980.67 ms | 69.3% bf16 MFU | 264571 tok/s +step 7348/18794 | loss 3.127347 (-0.44z)| norm 0.3568 (+1.45z)| lr 4.47e-03 | 1979.79 ms | 69.3% bf16 MFU | 264583 tok/s +step 7349/18794 | loss 3.147453 (+0.16z)| norm 0.3558 (+1.40z)| lr 4.47e-03 | 1982.34 ms | 69.2% bf16 MFU | 264578 tok/s +step 7350/18794 | loss 3.175883 (+0.98z)| norm 0.2184 (-0.52z)| lr 4.47e-03 | 1979.90 ms | 69.3% bf16 MFU | 264589 tok/s +step 7351/18794 | loss 3.154977 (+0.36z)| norm 0.2545 (-0.02z)| lr 4.47e-03 | 1982.33 ms | 69.2% bf16 MFU | 264584 tok/s +step 7352/18794 | loss 3.141050 (-0.05z)| norm 0.3242 (+0.94z)| lr 4.47e-03 | 1980.25 ms | 69.3% bf16 MFU | 264593 tok/s +step 7353/18794 | loss 3.169219 (+0.78z)| norm 0.2588 (+0.02z)| lr 4.47e-03 | 1979.43 ms | 69.3% bf16 MFU | 264606 tok/s +step 7354/18794 | loss 3.120925 (-0.66z)| norm 0.1925 (-0.89z)| lr 4.47e-03 | 1988.29 ms | 69.0% bf16 MFU | 264561 tok/s +step 7355/18794 | loss 3.159736 (+0.49z)| norm 0.2219 (-0.47z)| lr 4.47e-03 | 1988.55 ms | 69.0% bf16 MFU | 264515 tok/s +step 7356/18794 | loss 3.101914 (-1.20z)| norm 0.2206 (-0.48z)| lr 4.47e-03 | 1985.93 ms | 69.1% bf16 MFU | 264490 tok/s +step 7357/18794 | loss 3.108416 (-1.02z)| norm 0.2774 (+0.35z)| lr 4.47e-03 | 1982.46 ms | 69.2% bf16 MFU | 264488 tok/s +step 7358/18794 | loss 3.114206 (-0.82z)| norm 0.1717 (-1.17z)| lr 4.47e-03 | 1981.77 ms | 69.2% bf16 MFU | 264492 tok/s +step 7359/18794 | loss 3.107090 (-1.05z)| norm 0.2567 (+0.05z)| lr 4.47e-03 | 1979.17 ms | 69.3% bf16 MFU | 264512 tok/s +step 7360/18794 | loss 3.120286 (-0.62z)| norm 0.2159 (-0.55z)| lr 4.47e-03 | 1978.99 ms | 69.3% bf16 MFU | 264533 tok/s +step 7361/18794 | loss 3.078379 (-1.88z)| norm 0.2015 (-0.76z)| lr 4.47e-03 | 1981.29 ms | 69.3% bf16 MFU | 264537 tok/s +step 7362/18794 | loss 3.081263 (-1.75z)| norm 0.2168 (-0.52z)| lr 4.47e-03 | 1979.09 ms | 69.3% bf16 MFU | 264556 tok/s +step 7363/18794 | loss 3.158591 (+0.60z)| norm 0.2778 (+0.41z)| lr 4.47e-03 | 1979.77 ms | 69.3% bf16 MFU | 264569 tok/s +step 7364/18794 | loss 3.111689 (-0.82z)| norm 0.2807 (+0.45z)| lr 4.47e-03 | 1981.28 ms | 69.3% bf16 MFU | 264572 tok/s +step 7365/18794 | loss 3.147416 (+0.26z)| norm 0.2813 (+0.44z)| lr 4.47e-03 | 1979.58 ms | 69.3% bf16 MFU | 264586 tok/s +step 7366/18794 | loss 3.126016 (-0.40z)| norm 0.2852 (+0.49z)| lr 4.47e-03 | 1981.00 ms | 69.3% bf16 MFU | 264589 tok/s +step 7367/18794 | loss 3.090331 (-1.47z)| norm 0.1939 (-0.90z)| lr 4.47e-03 | 1980.66 ms | 69.3% bf16 MFU | 264595 tok/s +step 7368/18794 | loss 3.087785 (-1.52z)| norm 0.3050 (+0.78z)| lr 4.47e-03 | 1981.72 ms | 69.2% bf16 MFU | 264593 tok/s +step 7369/18794 | loss 3.127383 (-0.32z)| norm 0.3529 (+1.48z)| lr 4.47e-03 | 1979.60 ms | 69.3% bf16 MFU | 264606 tok/s +step 7370/18794 | loss 3.098737 (-1.18z)| norm 0.2711 (+0.28z)| lr 4.47e-03 | 1985.88 ms | 69.1% bf16 MFU | 264576 tok/s +step 7371/18794 | loss 3.090766 (-1.40z)| norm 0.1921 (-0.98z)| lr 4.47e-03 | 1981.48 ms | 69.3% bf16 MFU | 264577 tok/s +step 7372/18794 | loss 3.170367 (+0.97z)| norm 0.1952 (-0.92z)| lr 4.46e-03 | 1988.04 ms | 69.0% bf16 MFU | 264534 tok/s +step 7373/18794 | loss 3.150923 (+0.38z)| norm 0.2200 (-0.49z)| lr 4.46e-03 | 1982.21 ms | 69.2% bf16 MFU | 264532 tok/s +step 7374/18794 | loss 3.119568 (-0.57z)| norm 0.2087 (-0.68z)| lr 4.46e-03 | 1984.99 ms | 69.1% bf16 MFU | 264512 tok/s +step 7375/18794 | loss 3.166893 (+0.86z)| norm 0.2528 (+0.07z)| lr 4.46e-03 | 1981.75 ms | 69.2% bf16 MFU | 264514 tok/s +step 7376/18794 | loss 3.149011 (+0.34z)| norm 0.3595 (+1.85z)| lr 4.46e-03 | 1981.00 ms | 69.3% bf16 MFU | 264521 tok/s +step 7377/18794 | loss 3.136597 (-0.05z)| norm 0.2782 (+0.47z)| lr 4.46e-03 | 1980.81 ms | 69.3% bf16 MFU | 264530 tok/s +step 7378/18794 | loss 3.139651 (+0.04z)| norm 0.1758 (-1.25z)| lr 4.46e-03 | 1980.23 ms | 69.3% bf16 MFU | 264541 tok/s +step 7379/18794 | loss 3.119397 (-0.57z)| norm 0.1902 (-1.01z)| lr 4.46e-03 | 1979.87 ms | 69.3% bf16 MFU | 264555 tok/s +step 7380/18794 | loss 3.071455 (-2.00z)| norm 0.2142 (-0.60z)| lr 4.46e-03 | 1982.09 ms | 69.2% bf16 MFU | 264552 tok/s +step 7381/18794 | loss 3.110309 (-0.81z)| norm 0.2646 (+0.24z)| lr 4.46e-03 | 1979.99 ms | 69.3% bf16 MFU | 264564 tok/s +step 7382/18794 | loss 3.133475 (-0.09z)| norm 0.2556 (+0.08z)| lr 4.46e-03 | 1985.92 ms | 69.1% bf16 MFU | 264536 tok/s +step 7383/18794 | loss 3.135632 (-0.02z)| norm 0.2081 (-0.72z)| lr 4.46e-03 | 1980.40 ms | 69.3% bf16 MFU | 264546 tok/s +step 7384/18794 | loss 3.198739 (+1.90z)| norm 0.3600 (+1.80z)| lr 4.46e-03 | 1979.07 ms | 69.3% bf16 MFU | 264565 tok/s +step 7385/18794 | loss 3.136513 (+0.00z)| norm 0.2319 (-0.36z)| lr 4.46e-03 | 1988.67 ms | 69.0% bf16 MFU | 264519 tok/s +step 7386/18794 | loss 3.119837 (-0.49z)| norm 0.2820 (+0.48z)| lr 4.46e-03 | 1985.44 ms | 69.1% bf16 MFU | 264496 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.284894 +step 7387/18794 | loss 3.163469 (+0.85z)| norm 0.3943 (+2.28z)| lr 4.46e-03 | 1984.58 ms | 69.1% bf16 MFU | 264480 tok/s +step 7388/18794 | loss 3.081549 (-1.64z)| norm 0.3407 (+1.37z)| lr 4.46e-03 | 1982.63 ms | 69.2% bf16 MFU | 264478 tok/s +step 7389/18794 | loss 3.110267 (-0.75z)| norm 0.2789 (+0.36z)| lr 4.46e-03 | 1981.63 ms | 69.3% bf16 MFU | 264483 tok/s +step 7390/18794 | loss 3.127253 (-0.22z)| norm 0.1972 (-0.96z)| lr 4.46e-03 | 1979.50 ms | 69.3% bf16 MFU | 264502 tok/s +step 7391/18794 | loss 3.090588 (-1.33z)| norm 0.2385 (-0.28z)| lr 4.46e-03 | 1979.75 ms | 69.3% bf16 MFU | 264518 tok/s +step 7392/18794 | loss 3.089733 (-1.32z)| norm 0.2330 (-0.36z)| lr 4.46e-03 | 1979.84 ms | 69.3% bf16 MFU | 264533 tok/s +step 7393/18794 | loss 3.179558 (+1.41z)| norm 0.2343 (-0.32z)| lr 4.46e-03 | 1979.64 ms | 69.3% bf16 MFU | 264548 tok/s +step 7394/18794 | loss 3.114120 (-0.58z)| norm 0.3480 (+1.53z)| lr 4.46e-03 | 1979.84 ms | 69.3% bf16 MFU | 264561 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.970316 +step 7395/18794 | loss 3.136847 (+0.11z)| norm 0.4414 (+2.97z)| lr 4.45e-03 | 1982.06 ms | 69.2% bf16 MFU | 264559 tok/s +step 7396/18794 | loss 3.129650 (-0.11z)| norm 0.3071 (+0.81z)| lr 4.45e-03 | 1979.70 ms | 69.3% bf16 MFU | 264573 tok/s +step 7397/18794 | loss 3.136395 (+0.11z)| norm 0.2006 (-0.88z)| lr 4.45e-03 | 1979.54 ms | 69.3% bf16 MFU | 264587 tok/s +step 7398/18794 | loss 3.120922 (-0.35z)| norm 0.3673 (+1.73z)| lr 4.45e-03 | 1980.12 ms | 69.3% bf16 MFU | 264596 tok/s +step 7399/18794 | loss 3.154349 (+0.67z)| norm 0.2360 (-0.33z)| lr 4.45e-03 | 1981.74 ms | 69.2% bf16 MFU | 264594 tok/s +step 7400/18794 | loss 3.150472 (+0.55z)| norm 0.2086 (-0.75z)| lr 4.45e-03 | 1980.22 ms | 69.3% bf16 MFU | 264603 tok/s +step 7401/18794 | loss 3.125068 (-0.23z)| norm 0.1661 (-1.41z)| lr 4.45e-03 | 1983.15 ms | 69.2% bf16 MFU | 264591 tok/s +step 7402/18794 | loss 3.176334 (+1.34z)| norm 0.2410 (-0.23z)| lr 4.45e-03 | 1994.75 ms | 68.8% bf16 MFU | 264503 tok/s +step 7403/18794 | loss 3.175499 (+1.31z)| norm 0.2303 (-0.40z)| lr 4.45e-03 | 1985.74 ms | 69.1% bf16 MFU | 264480 tok/s +step 7404/18794 | loss 3.126660 (-0.21z)| norm 0.1829 (-1.14z)| lr 4.45e-03 | 1983.12 ms | 69.2% bf16 MFU | 264474 tok/s +step 7405/18794 | loss 3.086170 (-1.44z)| norm 0.1973 (-0.92z)| lr 4.45e-03 | 1980.22 ms | 69.3% bf16 MFU | 264489 tok/s +step 7406/18794 | loss 3.144763 (+0.37z)| norm 0.2396 (-0.25z)| lr 4.45e-03 | 1980.61 ms | 69.3% bf16 MFU | 264500 tok/s +step 7407/18794 | loss 3.146859 (+0.44z)| norm 0.3284 (+1.13z)| lr 4.45e-03 | 1980.40 ms | 69.3% bf16 MFU | 264512 tok/s +step 7408/18794 | loss 3.119962 (-0.39z)| norm 0.2871 (+0.49z)| lr 4.45e-03 | 1979.10 ms | 69.3% bf16 MFU | 264532 tok/s +step 7409/18794 | loss 3.192395 (+1.85z)| norm 0.1829 (-1.12z)| lr 4.45e-03 | 1980.43 ms | 69.3% bf16 MFU | 264542 tok/s +step 7410/18794 | loss 3.151691 (+0.59z)| norm 0.2467 (-0.14z)| lr 4.45e-03 | 1979.35 ms | 69.3% bf16 MFU | 264559 tok/s +step 7411/18794 | loss 3.084732 (-1.48z)| norm 0.3580 (+1.58z)| lr 4.45e-03 | 1983.22 ms | 69.2% bf16 MFU | 264549 tok/s +step 7412/18794 | loss 3.093202 (-1.19z)| norm 0.2761 (+0.29z)| lr 4.45e-03 | 1979.88 ms | 69.3% bf16 MFU | 264562 tok/s +step 7413/18794 | loss 3.125199 (-0.20z)| norm 0.3164 (+0.91z)| lr 4.45e-03 | 1978.55 ms | 69.4% bf16 MFU | 264583 tok/s +step 7414/18794 | loss 3.084782 (-1.43z)| norm 0.3693 (+1.74z)| lr 4.45e-03 | 1980.92 ms | 69.3% bf16 MFU | 264587 tok/s +step 7415/18794 | loss 3.111182 (-0.63z)| norm 0.1943 (-0.99z)| lr 4.45e-03 | 1979.68 ms | 69.3% bf16 MFU | 264600 tok/s +step 7416/18794 | loss 3.149627 (+0.54z)| norm 0.2228 (-0.54z)| lr 4.45e-03 | 1987.85 ms | 69.0% bf16 MFU | 264557 tok/s +step 7417/18794 | loss 3.134142 (+0.08z)| norm 0.2190 (-0.59z)| lr 4.45e-03 | 1989.94 ms | 69.0% bf16 MFU | 264503 tok/s +step 7418/18794 | loss 3.134182 (+0.09z)| norm 0.2043 (-0.81z)| lr 4.44e-03 | 1985.12 ms | 69.1% bf16 MFU | 264483 tok/s +step 7419/18794 | loss 3.135599 (+0.13z)| norm 0.2317 (-0.36z)| lr 4.44e-03 | 1979.24 ms | 69.3% bf16 MFU | 264504 tok/s +step 7420/18794 | loss 3.119105 (-0.38z)| norm 0.2908 (+0.58z)| lr 4.44e-03 | 1979.81 ms | 69.3% bf16 MFU | 264519 tok/s +step 7421/18794 | loss 3.125358 (-0.17z)| norm 0.3360 (+1.27z)| lr 4.44e-03 | 1979.46 ms | 69.3% bf16 MFU | 264536 tok/s +step 7422/18794 | loss 3.106584 (-0.74z)| norm 0.2122 (-0.68z)| lr 4.44e-03 | 1979.26 ms | 69.3% bf16 MFU | 264554 tok/s +step 7423/18794 | loss 3.025351 (-3.24z)| norm 0.2864 (+0.49z)| lr 4.44e-03 | 1980.14 ms | 69.3% bf16 MFU | 264565 tok/s +step 7424/18794 | loss 3.104953 (-0.72z)| norm 0.2264 (-0.45z)| lr 4.44e-03 | 1978.04 ms | 69.4% bf16 MFU | 264590 tok/s +step 7425/18794 | loss 3.149757 (+0.68z)| norm 0.1850 (-1.08z)| lr 4.44e-03 | 1979.76 ms | 69.3% bf16 MFU | 264601 tok/s +step 7426/18794 | loss 3.121934 (-0.18z)| norm 0.2121 (-0.64z)| lr 4.44e-03 | 1980.08 ms | 69.3% bf16 MFU | 264610 tok/s +step 7427/18794 | loss 3.122921 (-0.16z)| norm 0.2650 (+0.19z)| lr 4.44e-03 | 1980.39 ms | 69.3% bf16 MFU | 264617 tok/s +step 7428/18794 | loss 3.140224 (+0.39z)| norm 0.3364 (+1.37z)| lr 4.44e-03 | 1982.79 ms | 69.2% bf16 MFU | 264607 tok/s +step 7429/18794 | loss 3.141030 (+0.40z)| norm 0.3250 (+1.23z)| lr 4.44e-03 | 1981.22 ms | 69.3% bf16 MFU | 264608 tok/s +step 7430/18794 | loss 3.095643 (-1.07z)| norm 0.2623 (+0.17z)| lr 4.44e-03 | 1982.85 ms | 69.2% bf16 MFU | 264598 tok/s +step 7431/18794 | loss 3.159223 (+0.98z)| norm 0.2612 (+0.16z)| lr 4.44e-03 | 1980.49 ms | 69.3% bf16 MFU | 264604 tok/s +step 7432/18794 | loss 3.132255 (+0.10z)| norm 0.2453 (-0.12z)| lr 4.44e-03 | 1981.63 ms | 69.3% bf16 MFU | 264603 tok/s +step 7433/18794 | loss 3.106565 (-0.71z)| norm 0.1834 (-1.16z)| lr 4.44e-03 | 1980.82 ms | 69.3% bf16 MFU | 264607 tok/s +step 7434/18794 | loss 3.123322 (-0.15z)| norm 0.2158 (-0.61z)| lr 4.44e-03 | 1982.56 ms | 69.2% bf16 MFU | 264599 tok/s +step 7435/18794 | loss 3.151408 (+0.79z)| norm 0.2897 (+0.64z)| lr 4.44e-03 | 1988.08 ms | 69.0% bf16 MFU | 264555 tok/s +step 7436/18794 | loss 3.116528 (-0.36z)| norm 0.2079 (-0.75z)| lr 4.44e-03 | 1985.58 ms | 69.1% bf16 MFU | 264529 tok/s +step 7437/18794 | loss 3.142894 (+0.51z)| norm 0.2472 (-0.08z)| lr 4.44e-03 | 1983.48 ms | 69.2% bf16 MFU | 264519 tok/s +step 7438/18794 | loss 3.154955 (+0.89z)| norm 0.2360 (-0.28z)| lr 4.44e-03 | 1978.41 ms | 69.4% bf16 MFU | 264544 tok/s +step 7439/18794 | loss 3.094691 (-1.12z)| norm 0.2155 (-0.63z)| lr 4.44e-03 | 1985.11 ms | 69.1% bf16 MFU | 264522 tok/s +step 7440/18794 | loss 3.142766 (+0.49z)| norm 0.2086 (-0.75z)| lr 4.44e-03 | 1981.95 ms | 69.2% bf16 MFU | 264522 tok/s +step 7441/18794 | loss 3.139082 (+0.39z)| norm 0.2619 (+0.15z)| lr 4.43e-03 | 1980.75 ms | 69.3% bf16 MFU | 264531 tok/s +step 7442/18794 | loss 3.094840 (-1.09z)| norm 0.3248 (+1.20z)| lr 4.43e-03 | 1979.80 ms | 69.3% bf16 MFU | 264545 tok/s +step 7443/18794 | loss 3.107660 (-0.65z)| norm 0.2939 (+0.66z)| lr 4.43e-03 | 1981.48 ms | 69.3% bf16 MFU | 264548 tok/s +step 7444/18794 | loss 3.111103 (-0.52z)| norm 0.2763 (+0.35z)| lr 4.43e-03 | 1980.71 ms | 69.3% bf16 MFU | 264555 tok/s +step 7445/18794 | loss 3.151983 (+0.87z)| norm 0.2611 (+0.07z)| lr 4.43e-03 | 1981.33 ms | 69.3% bf16 MFU | 264558 tok/s +step 7446/18794 | loss 3.120535 (-0.23z)| norm 0.2208 (-0.64z)| lr 4.43e-03 | 1980.63 ms | 69.3% bf16 MFU | 264566 tok/s +step 7447/18794 | loss 3.126618 (-0.02z)| norm 0.2243 (-0.57z)| lr 4.43e-03 | 1983.56 ms | 69.2% bf16 MFU | 264553 tok/s +step 7448/18794 | loss 3.193051 (+2.22z)| norm 0.1915 (-1.12z)| lr 4.43e-03 | 1981.71 ms | 69.2% bf16 MFU | 264554 tok/s +step 7449/18794 | loss 3.126825 (-0.03z)| norm 0.2387 (-0.28z)| lr 4.43e-03 | 1979.42 ms | 69.3% bf16 MFU | 264569 tok/s +step 7450/18794 | loss 3.112154 (-0.51z)| norm 0.2458 (-0.16z)| lr 4.43e-03 | 1988.72 ms | 69.0% bf16 MFU | 264523 tok/s +step 7451/18794 | loss 3.169277 (+1.45z)| norm 0.3008 (+0.82z)| lr 4.43e-03 | 1985.04 ms | 69.1% bf16 MFU | 264502 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.173590 +step 7452/18794 | loss 3.156593 (+1.00z)| norm 0.3795 (+2.17z)| lr 4.43e-03 | 1983.59 ms | 69.2% bf16 MFU | 264493 tok/s +step 7453/18794 | loss 3.120400 (-0.22z)| norm 0.1787 (-1.32z)| lr 4.43e-03 | 1982.24 ms | 69.2% bf16 MFU | 264493 tok/s +step 7454/18794 | loss 3.165159 (+1.30z)| norm 0.2855 (+0.52z)| lr 4.43e-03 | 1981.58 ms | 69.3% bf16 MFU | 264497 tok/s +step 7455/18794 | loss 3.095699 (-1.06z)| norm 0.3211 (+1.12z)| lr 4.43e-03 | 1981.49 ms | 69.3% bf16 MFU | 264502 tok/s +step 7456/18794 | loss 3.139871 (+0.44z)| norm 0.2495 (-0.13z)| lr 4.43e-03 | 1980.60 ms | 69.3% bf16 MFU | 264513 tok/s +step 7457/18794 | loss 3.097865 (-1.00z)| norm 0.2058 (-0.88z)| lr 4.43e-03 | 1981.39 ms | 69.3% bf16 MFU | 264517 tok/s +step 7458/18794 | loss 3.161466 (+1.16z)| norm 0.2311 (-0.45z)| lr 4.43e-03 | 1980.12 ms | 69.3% bf16 MFU | 264530 tok/s +step 7459/18794 | loss 3.095603 (-1.08z)| norm 0.2059 (-0.88z)| lr 4.43e-03 | 1980.60 ms | 69.3% bf16 MFU | 264539 tok/s +step 7460/18794 | loss 3.141371 (+0.47z)| norm 0.2031 (-0.93z)| lr 4.43e-03 | 1980.45 ms | 69.3% bf16 MFU | 264549 tok/s +step 7461/18794 | loss 3.157493 (+1.00z)| norm 0.1721 (-1.45z)| lr 4.43e-03 | 1981.44 ms | 69.3% bf16 MFU | 264551 tok/s +step 7462/18794 | loss 3.143020 (+0.49z)| norm 0.2207 (-0.61z)| lr 4.43e-03 | 1980.24 ms | 69.3% bf16 MFU | 264562 tok/s +step 7463/18794 | loss 3.176763 (+1.64z)| norm 0.2531 (-0.05z)| lr 4.43e-03 | 1985.48 ms | 69.1% bf16 MFU | 264537 tok/s +step 7464/18794 | loss 3.136967 (+0.26z)| norm 0.2188 (-0.63z)| lr 4.42e-03 | 1984.92 ms | 69.1% bf16 MFU | 264517 tok/s +step 7465/18794 | loss 3.135084 (+0.20z)| norm 0.1727 (-1.40z)| lr 4.42e-03 | 1986.40 ms | 69.1% bf16 MFU | 264488 tok/s +step 7466/18794 | loss 3.163591 (+1.17z)| norm 0.1907 (-1.07z)| lr 4.42e-03 | 1979.83 ms | 69.3% bf16 MFU | 264504 tok/s +step 7467/18794 | loss 3.157737 (+0.95z)| norm 0.2108 (-0.73z)| lr 4.42e-03 | 1985.54 ms | 69.1% bf16 MFU | 264482 tok/s +step 7468/18794 | loss 3.103690 (-0.92z)| norm 0.2315 (-0.36z)| lr 4.42e-03 | 1982.21 ms | 69.2% bf16 MFU | 264482 tok/s +step 7469/18794 | loss 3.125343 (-0.17z)| norm 0.2392 (-0.21z)| lr 4.42e-03 | 1980.93 ms | 69.3% bf16 MFU | 264492 tok/s +step 7470/18794 | loss 3.121632 (-0.31z)| norm 0.2128 (-0.66z)| lr 4.42e-03 | 1979.97 ms | 69.3% bf16 MFU | 264507 tok/s +step 7471/18794 | loss 3.145800 (+0.52z)| norm 0.2004 (-0.88z)| lr 4.42e-03 | 1980.32 ms | 69.3% bf16 MFU | 264519 tok/s +step 7472/18794 | loss 3.154738 (+0.84z)| norm 0.1973 (-0.94z)| lr 4.42e-03 | 1980.53 ms | 69.3% bf16 MFU | 264529 tok/s +step 7473/18794 | loss 3.181509 (+1.75z)| norm 0.1963 (-0.95z)| lr 4.42e-03 | 1978.71 ms | 69.4% bf16 MFU | 264551 tok/s +step 7474/18794 | loss 3.141566 (+0.35z)| norm 0.1828 (-1.17z)| lr 4.42e-03 | 1980.06 ms | 69.3% bf16 MFU | 264563 tok/s +step 7475/18794 | loss 3.174882 (+1.51z)| norm 0.2287 (-0.37z)| lr 4.42e-03 | 1981.59 ms | 69.3% bf16 MFU | 264563 tok/s +step 7476/18794 | loss 3.165647 (+1.17z)| norm 0.1950 (-0.94z)| lr 4.42e-03 | 1981.96 ms | 69.2% bf16 MFU | 264562 tok/s +step 7477/18794 | loss 3.191531 (+2.01z)| norm 0.2578 (+0.17z)| lr 4.42e-03 | 1978.97 ms | 69.3% bf16 MFU | 264580 tok/s +step 7478/18794 | loss 3.221874 (+2.88z)| norm 0.2512 (+0.04z)| lr 4.42e-03 | 1983.14 ms | 69.2% bf16 MFU | 264570 tok/s +step 7479/18794 | loss 3.117875 (-0.49z)| norm 0.2185 (-0.55z)| lr 4.42e-03 | 1981.76 ms | 69.2% bf16 MFU | 264569 tok/s +step 7480/18794 | loss 3.110573 (-0.76z)| norm 0.2303 (-0.34z)| lr 4.42e-03 | 1978.57 ms | 69.4% bf16 MFU | 264590 tok/s +step 7481/18794 | loss 3.172378 (+1.26z)| norm 0.1952 (-0.95z)| lr 4.42e-03 | 1990.59 ms | 68.9% bf16 MFU | 264530 tok/s +step 7482/18794 | loss 3.216319 (+2.58z)| norm 0.2247 (-0.42z)| lr 4.42e-03 | 1985.12 ms | 69.1% bf16 MFU | 264508 tok/s +step 7483/18794 | loss 3.176185 (+1.28z)| norm 0.3079 (+1.03z)| lr 4.42e-03 | 1981.64 ms | 69.3% bf16 MFU | 264512 tok/s +step 7484/18794 | loss 3.154680 (+0.63z)| norm 0.2320 (-0.29z)| lr 4.42e-03 | 1978.06 ms | 69.4% bf16 MFU | 264539 tok/s +step 7485/18794 | loss 3.168796 (+1.07z)| norm 0.2006 (-0.85z)| lr 4.42e-03 | 1981.55 ms | 69.3% bf16 MFU | 264541 tok/s +step 7486/18794 | loss 3.097785 (-1.18z)| norm 0.1753 (-1.28z)| lr 4.42e-03 | 1977.63 ms | 69.4% bf16 MFU | 264569 tok/s +step 7487/18794 | loss 3.098024 (-1.15z)| norm 0.2046 (-0.75z)| lr 4.41e-03 | 1979.14 ms | 69.3% bf16 MFU | 264586 tok/s +step 7488/18794 | loss 3.168755 (+1.07z)| norm 0.1743 (-1.28z)| lr 4.41e-03 | 1977.99 ms | 69.4% bf16 MFU | 264610 tok/s +step 7489/18794 | loss 3.095716 (-1.25z)| norm 0.2495 (+0.12z)| lr 4.41e-03 | 1985.26 ms | 69.1% bf16 MFU | 264584 tok/s +step 7490/18794 | loss 3.183278 (+1.49z)| norm 0.3344 (+1.66z)| lr 4.41e-03 | 1981.39 ms | 69.3% bf16 MFU | 264585 tok/s +step 7491/18794 | loss 3.175003 (+1.21z)| norm 0.3364 (+1.66z)| lr 4.41e-03 | 1981.51 ms | 69.3% bf16 MFU | 264585 tok/s +step 7492/18794 | loss 3.158468 (+0.68z)| norm 0.2818 (+0.65z)| lr 4.41e-03 | 1979.08 ms | 69.3% bf16 MFU | 264602 tok/s +step 7493/18794 | loss 3.205774 (+2.14z)| norm 0.1952 (-0.91z)| lr 4.41e-03 | 1979.86 ms | 69.3% bf16 MFU | 264612 tok/s +step 7494/18794 | loss 3.149011 (+0.35z)| norm 0.2222 (-0.40z)| lr 4.41e-03 | 1982.94 ms | 69.2% bf16 MFU | 264602 tok/s +step 7495/18794 | loss 3.107796 (-0.93z)| norm 0.2720 (+0.58z)| lr 4.41e-03 | 1981.85 ms | 69.2% bf16 MFU | 264599 tok/s +step 7496/18794 | loss 3.137484 (-0.00z)| norm 0.2243 (-0.35z)| lr 4.41e-03 | 1980.55 ms | 69.3% bf16 MFU | 264605 tok/s +step 7497/18794 | loss 3.173352 (+1.10z)| norm 0.1949 (-0.93z)| lr 4.41e-03 | 1979.69 ms | 69.3% bf16 MFU | 264616 tok/s +step 7498/18794 | loss 3.147236 (+0.28z)| norm 0.2019 (-0.78z)| lr 4.41e-03 | 1981.25 ms | 69.3% bf16 MFU | 264617 tok/s +step 7499/18794 | loss 3.153802 (+0.48z)| norm 0.2128 (-0.55z)| lr 4.41e-03 | 1984.12 ms | 69.2% bf16 MFU | 264598 tok/s +step 7500/18794 | loss 3.120958 (-0.53z)| norm 0.1793 (-1.22z)| lr 4.41e-03 | 1987.00 ms | 69.1% bf16 MFU | 264561 tok/s +val loss 3.161419 +HellaSwag: 2923/10042 = 0.291077Swag: 990/1256: 0/1256 +Writing checkpoint at step 7500 +Writing model to log_gpt3_125M_edu_v4/model_00007500.bin +Writing state to log_gpt3_125M_edu_v4/state_00007500_00001.bin +Writing state to log_gpt3_125M_edu_v4/state_00007500_00000.bin +Deleting checkpoint at step 5000 +step 7501/18794 | loss 3.115556 (-0.69z)| norm 0.1766 (-1.28z)| lr 4.41e-03 | 1984.82 ms | 69.1% bf16 MFU | 264540 tok/s +step 7502/18794 | loss 3.108328 (-0.90z)| norm 0.2062 (-0.67z)| lr 4.41e-03 | 1986.67 ms | 69.1% bf16 MFU | 264508 tok/s +step 7503/18794 | loss 3.113056 (-0.73z)| norm 0.2315 (-0.16z)| lr 4.41e-03 | 1979.51 ms | 69.3% bf16 MFU | 264526 tok/s +step 7504/18794 | loss 3.071263 (-1.98z)| norm 0.2242 (-0.32z)| lr 4.41e-03 | 1979.68 ms | 69.3% bf16 MFU | 264541 tok/s +step 7505/18794 | loss 3.117252 (-0.59z)| norm 0.1941 (-0.93z)| lr 4.41e-03 | 1983.15 ms | 69.2% bf16 MFU | 264533 tok/s +step 7506/18794 | loss 3.149921 (+0.42z)| norm 0.1894 (-1.01z)| lr 4.41e-03 | 1981.90 ms | 69.2% bf16 MFU | 264533 tok/s +step 7507/18794 | loss 3.077615 (-1.77z)| norm 0.1833 (-1.12z)| lr 4.41e-03 | 1979.87 ms | 69.3% bf16 MFU | 264547 tok/s +step 7508/18794 | loss 3.168190 (+0.97z)| norm 0.1891 (-0.98z)| lr 4.41e-03 | 1979.23 ms | 69.3% bf16 MFU | 264564 tok/s +step 7509/18794 | loss 3.178468 (+1.30z)| norm 0.3143 (+1.56z)| lr 4.41e-03 | 1979.80 ms | 69.3% bf16 MFU | 264577 tok/s +step 7510/18794 | loss 3.135949 (+0.00z)| norm 0.3359 (+1.95z)| lr 4.40e-03 | 1980.62 ms | 69.3% bf16 MFU | 264584 tok/s +step 7511/18794 | loss 3.138546 (+0.07z)| norm 0.2568 (+0.39z)| lr 4.40e-03 | 1979.73 ms | 69.3% bf16 MFU | 264596 tok/s +step 7512/18794 | loss 3.149200 (+0.38z)| norm 0.2488 (+0.23z)| lr 4.40e-03 | 1980.43 ms | 69.3% bf16 MFU | 264603 tok/s +step 7513/18794 | loss 3.137521 (+0.02z)| norm 0.2665 (+0.62z)| lr 4.40e-03 | 1981.59 ms | 69.3% bf16 MFU | 264602 tok/s +step 7514/18794 | loss 3.183529 (+1.43z)| norm 0.1704 (-1.40z)| lr 4.40e-03 | 1979.82 ms | 69.3% bf16 MFU | 264612 tok/s +step 7515/18794 | loss 3.118315 (-0.62z)| norm 0.2388 (+0.07z)| lr 4.40e-03 | 1980.91 ms | 69.3% bf16 MFU | 264615 tok/s +step 7516/18794 | loss 3.151214 (+0.41z)| norm 0.2719 (+0.77z)| lr 4.40e-03 | 1982.99 ms | 69.2% bf16 MFU | 264604 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.591448 +step 7517/18794 | loss 3.130702 (-0.23z)| norm 0.3615 (+2.59z)| lr 4.40e-03 | 1984.65 ms | 69.1% bf16 MFU | 264582 tok/s +step 7518/18794 | loss 3.180984 (+1.32z)| norm 0.2992 (+1.26z)| lr 4.40e-03 | 1988.31 ms | 69.0% bf16 MFU | 264538 tok/s +step 7519/18794 | loss 3.128223 (-0.32z)| norm 0.1919 (-0.95z)| lr 4.40e-03 | 1984.31 ms | 69.2% bf16 MFU | 264522 tok/s +step 7520/18794 | loss 3.109332 (-0.90z)| norm 0.3072 (+1.42z)| lr 4.40e-03 | 1980.58 ms | 69.3% bf16 MFU | 264531 tok/s +step 7521/18794 | loss 3.135101 (-0.10z)| norm 0.3078 (+1.45z)| lr 4.40e-03 | 1979.91 ms | 69.3% bf16 MFU | 264545 tok/s +step 7522/18794 | loss 3.153354 (+0.45z)| norm 0.1630 (-1.53z)| lr 4.40e-03 | 1981.21 ms | 69.3% bf16 MFU | 264549 tok/s +step 7523/18794 | loss 3.143036 (+0.10z)| norm 0.1913 (-0.93z)| lr 4.40e-03 | 1983.00 ms | 69.2% bf16 MFU | 264541 tok/s +step 7524/18794 | loss 3.195478 (+1.80z)| norm 0.1991 (-0.76z)| lr 4.40e-03 | 1980.65 ms | 69.3% bf16 MFU | 264549 tok/s +step 7525/18794 | loss 3.096016 (-1.45z)| norm 0.2101 (-0.54z)| lr 4.40e-03 | 1980.89 ms | 69.3% bf16 MFU | 264556 tok/s +step 7526/18794 | loss 3.126327 (-0.46z)| norm 0.2468 (+0.21z)| lr 4.40e-03 | 1980.54 ms | 69.3% bf16 MFU | 264564 tok/s +step 7527/18794 | loss 3.161469 (+0.67z)| norm 0.2710 (+0.71z)| lr 4.40e-03 | 1977.85 ms | 69.4% bf16 MFU | 264590 tok/s +step 7528/18794 | loss 3.090596 (-1.61z)| norm 0.2431 (+0.15z)| lr 4.40e-03 | 1980.65 ms | 69.3% bf16 MFU | 264595 tok/s +step 7529/18794 | loss 3.175158 (+1.10z)| norm 0.2045 (-0.65z)| lr 4.40e-03 | 1980.22 ms | 69.3% bf16 MFU | 264604 tok/s +step 7530/18794 | loss 3.132584 (-0.28z)| norm 0.1944 (-0.85z)| lr 4.40e-03 | 1981.01 ms | 69.3% bf16 MFU | 264606 tok/s +step 7531/18794 | loss 3.074424 (-2.09z)| norm 0.1766 (-1.20z)| lr 4.40e-03 | 1980.78 ms | 69.3% bf16 MFU | 264610 tok/s +step 7532/18794 | loss 3.113279 (-0.85z)| norm 0.2133 (-0.42z)| lr 4.40e-03 | 1985.37 ms | 69.1% bf16 MFU | 264584 tok/s +step 7533/18794 | loss 3.207292 (+2.06z)| norm 0.2221 (-0.24z)| lr 4.39e-03 | 1988.38 ms | 69.0% bf16 MFU | 264538 tok/s +step 7534/18794 | loss 3.150809 (+0.29z)| norm 0.2558 (+0.47z)| lr 4.39e-03 | 1985.16 ms | 69.1% bf16 MFU | 264516 tok/s +step 7535/18794 | loss 3.111639 (-0.91z)| norm 0.2837 (+1.07z)| lr 4.39e-03 | 1981.16 ms | 69.3% bf16 MFU | 264522 tok/s +step 7536/18794 | loss 3.155865 (+0.45z)| norm 0.2835 (+1.05z)| lr 4.39e-03 | 1981.91 ms | 69.2% bf16 MFU | 264523 tok/s +step 7537/18794 | loss 3.111218 (-0.92z)| norm 0.1811 (-1.11z)| lr 4.39e-03 | 1980.04 ms | 69.3% bf16 MFU | 264536 tok/s +step 7538/18794 | loss 3.129616 (-0.35z)| norm 0.2175 (-0.34z)| lr 4.39e-03 | 2029.74 ms | 67.6% bf16 MFU | 264225 tok/s +step 7539/18794 | loss 3.117411 (-0.74z)| norm 0.2283 (-0.11z)| lr 4.39e-03 | 2049.26 ms | 67.0% bf16 MFU | 263806 tok/s +step 7540/18794 | loss 3.221291 (+2.41z)| norm 0.3207 (+1.79z)| lr 4.39e-03 | 2046.43 ms | 67.1% bf16 MFU | 263425 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.044787 +step 7541/18794 | loss 3.161401 (+0.59z)| norm 0.3355 (+2.04z)| lr 4.39e-03 | 2049.33 ms | 67.0% bf16 MFU | 263046 tok/s +step 7542/18794 | loss 3.117873 (-0.74z)| norm 0.2096 (-0.51z)| lr 4.39e-03 | 2042.86 ms | 67.2% bf16 MFU | 262725 tok/s +reducing beta2 to 0.9 and lr/wd by 0.917 due to grad z-score of 3.817527 +step 7543/18794 | loss 3.145167 (+0.08z)| norm 0.4332 (+3.82z)| lr 4.03e-03 | 2041.81 ms | 67.2% bf16 MFU | 262428 tok/s +step 7544/18794 | loss 3.085911 (-1.71z)| norm 0.2778 (+0.81z)| lr 4.39e-03 | 2026.89 ms | 67.7% bf16 MFU | 262240 tok/s +step 7545/18794 | loss 3.086715 (-1.64z)| norm 0.2594 (+0.46z)| lr 4.39e-03 | 2038.54 ms | 67.3% bf16 MFU | 261987 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.320112 +step 7546/18794 | loss 3.211106 (+2.01z)| norm 0.4198 (+3.32z)| lr 4.39e-03 | 2039.34 ms | 67.3% bf16 MFU | 261742 tok/s +step 7547/18794 | loss 3.126685 (-0.47z)| norm 0.2553 (+0.32z)| lr 4.39e-03 | 2036.89 ms | 67.4% bf16 MFU | 261525 tok/s +step 7548/18794 | loss 3.156414 (+0.42z)| norm 0.2702 (+0.57z)| lr 4.39e-03 | 2032.01 ms | 67.5% bf16 MFU | 261349 tok/s +step 7549/18794 | loss 3.124852 (-0.52z)| norm 0.2196 (-0.35z)| lr 4.39e-03 | 2040.03 ms | 67.3% bf16 MFU | 261132 tok/s +step 7550/18794 | loss 3.149124 (+0.19z)| norm 0.1886 (-0.90z)| lr 4.39e-03 | 2037.22 ms | 67.4% bf16 MFU | 260943 tok/s +step 7551/18794 | loss 3.154754 (+0.37z)| norm 0.1778 (-1.07z)| lr 4.39e-03 | 2025.31 ms | 67.8% bf16 MFU | 260839 tok/s +step 7552/18794 | loss 3.146987 (+0.14z)| norm 0.2617 (+0.49z)| lr 4.39e-03 | 2042.51 ms | 67.2% bf16 MFU | 260632 tok/s +step 7553/18794 | loss 3.105218 (-1.10z)| norm 0.3232 (+1.61z)| lr 4.39e-03 | 2042.78 ms | 67.2% bf16 MFU | 260433 tok/s +step 7554/18794 | loss 3.197879 (+1.63z)| norm 0.3094 (+1.34z)| lr 4.39e-03 | 2031.20 ms | 67.6% bf16 MFU | 260317 tok/s +step 7555/18794 | loss 3.176189 (+0.97z)| norm 0.2138 (-0.42z)| lr 4.39e-03 | 2015.20 ms | 68.1% bf16 MFU | 260310 tok/s +step 7556/18794 | loss 3.167166 (+0.69z)| norm 0.2399 (+0.07z)| lr 4.38e-03 | 2020.12 ms | 67.9% bf16 MFU | 260271 tok/s +step 7557/18794 | loss 3.117312 (-0.79z)| norm 0.2608 (+0.45z)| lr 4.38e-03 | 2043.76 ms | 67.1% bf16 MFU | 260084 tok/s +step 7558/18794 | loss 3.117501 (-0.77z)| norm 0.2428 (+0.11z)| lr 4.38e-03 | 2039.01 ms | 67.3% bf16 MFU | 259936 tok/s +step 7559/18794 | loss 3.088198 (-1.63z)| norm 0.1900 (-0.88z)| lr 4.38e-03 | 2037.26 ms | 67.4% bf16 MFU | 259807 tok/s +step 7560/18794 | loss 3.146332 (+0.09z)| norm 0.3107 (+1.36z)| lr 4.38e-03 | 2042.65 ms | 67.2% bf16 MFU | 259650 tok/s +step 7561/18794 | loss 3.125814 (-0.51z)| norm 0.2149 (-0.44z)| lr 4.38e-03 | 2040.58 ms | 67.3% bf16 MFU | 259514 tok/s +step 7562/18794 | loss 3.118108 (-0.73z)| norm 0.2932 (+1.02z)| lr 4.38e-03 | 2032.90 ms | 67.5% bf16 MFU | 259433 tok/s +reducing beta2 to 0.9 and lr/wd by 0.877 due to grad z-score of 3.991868 +step 7563/18794 | loss 3.124877 (-0.51z)| norm 0.4744 (+3.99z)| lr 3.84e-03 | 2017.59 ms | 68.0% bf16 MFU | 259455 tok/s +step 7564/18794 | loss 3.093965 (-1.40z)| norm 0.3253 (+1.41z)| lr 4.38e-03 | 2041.00 ms | 67.2% bf16 MFU | 259326 tok/s +step 7565/18794 | loss 3.157797 (+0.46z)| norm 0.1883 (-0.92z)| lr 4.38e-03 | 2035.14 ms | 67.4% bf16 MFU | 259240 tok/s +step 7566/18794 | loss 3.136207 (-0.16z)| norm 0.2635 (+0.35z)| lr 4.38e-03 | 2030.48 ms | 67.6% bf16 MFU | 259189 tok/s +step 7567/18794 | loss 3.088218 (-1.53z)| norm 0.3280 (+1.42z)| lr 4.38e-03 | 2039.48 ms | 67.3% bf16 MFU | 259083 tok/s +step 7568/18794 | loss 3.133386 (-0.23z)| norm 0.3629 (+1.95z)| lr 4.38e-03 | 2042.80 ms | 67.2% bf16 MFU | 258961 tok/s +step 7569/18794 | loss 3.115857 (-0.74z)| norm 0.2387 (-0.11z)| lr 4.38e-03 | 2023.42 ms | 67.8% bf16 MFU | 258969 tok/s +step 7570/18794 | loss 3.179115 (+1.08z)| norm 0.3061 (+0.99z)| lr 4.38e-03 | 2037.71 ms | 67.3% bf16 MFU | 258885 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.879541 +step 7571/18794 | loss 3.135736 (-0.17z)| norm 0.4302 (+2.88z)| lr 4.38e-03 | 2014.65 ms | 68.1% bf16 MFU | 258953 tok/s +step 7572/18794 | loss 3.150126 (+0.24z)| norm 0.1844 (-1.02z)| lr 4.38e-03 | 2031.94 ms | 67.5% bf16 MFU | 258906 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.267827 +step 7573/18794 | loss 3.187727 (+1.33z)| norm 0.3972 (+2.27z)| lr 4.38e-03 | 2019.16 ms | 68.0% bf16 MFU | 258944 tok/s +step 7574/18794 | loss 3.197247 (+1.57z)| norm 0.2427 (-0.13z)| lr 4.38e-03 | 2019.45 ms | 68.0% bf16 MFU | 258977 tok/s +step 7575/18794 | loss 3.116324 (-0.73z)| norm 0.2183 (-0.51z)| lr 4.38e-03 | 2033.66 ms | 67.5% bf16 MFU | 258919 tok/s +step 7576/18794 | loss 3.161501 (+0.57z)| norm 0.2270 (-0.38z)| lr 4.38e-03 | 2030.57 ms | 67.6% bf16 MFU | 258883 tok/s +step 7577/18794 | loss 3.129040 (-0.35z)| norm 0.2035 (-0.74z)| lr 4.38e-03 | 2026.72 ms | 67.7% bf16 MFU | 258873 tok/s +step 7578/18794 | loss 3.107280 (-0.97z)| norm 0.1922 (-0.90z)| lr 4.38e-03 | 2035.80 ms | 67.4% bf16 MFU | 258806 tok/s +step 7579/18794 | loss 3.158044 (+0.52z)| norm 0.2138 (-0.56z)| lr 4.37e-03 | 2030.64 ms | 67.6% bf16 MFU | 258775 tok/s +step 7580/18794 | loss 3.094930 (-1.33z)| norm 0.2311 (-0.30z)| lr 4.37e-03 | 2032.46 ms | 67.5% bf16 MFU | 258734 tok/s +step 7581/18794 | loss 3.086757 (-1.54z)| norm 0.3214 (+1.08z)| lr 4.37e-03 | 2027.22 ms | 67.7% bf16 MFU | 258729 tok/s +step 7582/18794 | loss 3.164068 (+0.75z)| norm 0.2328 (-0.29z)| lr 4.37e-03 | 2020.15 ms | 67.9% bf16 MFU | 258769 tok/s +step 7583/18794 | loss 3.127865 (-0.31z)| norm 0.2285 (-0.35z)| lr 4.37e-03 | 2030.57 ms | 67.6% bf16 MFU | 258740 tok/s +step 7584/18794 | loss 3.097168 (-1.21z)| norm 0.3063 (+0.85z)| lr 4.37e-03 | 2030.07 ms | 67.6% bf16 MFU | 258716 tok/s +step 7585/18794 | loss 3.139718 (+0.07z)| norm 0.2484 (-0.06z)| lr 4.37e-03 | 2012.27 ms | 68.2% bf16 MFU | 258808 tok/s +step 7586/18794 | loss 3.137745 (-0.00z)| norm 0.1889 (-0.98z)| lr 4.37e-03 | 2029.54 ms | 67.6% bf16 MFU | 258784 tok/s +step 7587/18794 | loss 3.109583 (-0.86z)| norm 0.1823 (-1.08z)| lr 4.37e-03 | 1999.20 ms | 68.6% bf16 MFU | 258957 tok/s +step 7588/18794 | loss 3.142162 (+0.14z)| norm 0.2158 (-0.57z)| lr 4.37e-03 | 2037.58 ms | 67.4% bf16 MFU | 258875 tok/s +step 7589/18794 | loss 3.149501 (+0.35z)| norm 0.2497 (-0.04z)| lr 4.37e-03 | 2016.88 ms | 68.0% bf16 MFU | 258928 tok/s +step 7590/18794 | loss 3.152516 (+0.45z)| norm 0.1795 (-1.11z)| lr 4.37e-03 | 2019.06 ms | 68.0% bf16 MFU | 258966 tok/s +step 7591/18794 | loss 3.168121 (+0.94z)| norm 0.1992 (-0.79z)| lr 4.37e-03 | 2021.89 ms | 67.9% bf16 MFU | 258982 tok/s +step 7592/18794 | loss 3.121634 (-0.49z)| norm 0.1991 (-0.78z)| lr 4.37e-03 | 2014.60 ms | 68.1% bf16 MFU | 259046 tok/s +step 7593/18794 | loss 3.117713 (-0.60z)| norm 0.2343 (-0.23z)| lr 4.37e-03 | 2001.45 ms | 68.6% bf16 MFU | 259191 tok/s +step 7594/18794 | loss 3.082880 (-1.66z)| norm 0.2181 (-0.48z)| lr 4.37e-03 | 2007.77 ms | 68.4% bf16 MFU | 259288 tok/s +step 7595/18794 | loss 3.119489 (-0.52z)| norm 0.1795 (-1.07z)| lr 4.37e-03 | 2023.80 ms | 67.8% bf16 MFU | 259277 tok/s +step 7596/18794 | loss 3.154660 (+0.58z)| norm 0.2045 (-0.68z)| lr 4.37e-03 | 2019.85 ms | 67.9% bf16 MFU | 259291 tok/s +step 7597/18794 | loss 3.122573 (-0.42z)| norm 0.2621 (+0.21z)| lr 4.37e-03 | 2018.40 ms | 68.0% bf16 MFU | 259314 tok/s +step 7598/18794 | loss 3.166745 (+0.97z)| norm 0.1643 (-1.31z)| lr 4.37e-03 | 2026.57 ms | 67.7% bf16 MFU | 259284 tok/s +step 7599/18794 | loss 3.137956 (+0.07z)| norm 0.2037 (-0.69z)| lr 4.37e-03 | 2025.29 ms | 67.8% bf16 MFU | 259263 tok/s +step 7600/18794 | loss 3.140010 (+0.13z)| norm 0.2287 (-0.31z)| lr 4.37e-03 | 2028.99 ms | 67.6% bf16 MFU | 259220 tok/s +step 7601/18794 | loss 3.141322 (+0.16z)| norm 0.2692 (+0.31z)| lr 4.37e-03 | 2001.34 ms | 68.6% bf16 MFU | 259357 tok/s +step 7602/18794 | loss 3.122299 (-0.45z)| norm 0.2538 (+0.06z)| lr 4.36e-03 | 2028.33 ms | 67.7% bf16 MFU | 259314 tok/s +step 7603/18794 | loss 3.122102 (-0.46z)| norm 0.2312 (-0.30z)| lr 4.36e-03 | 2023.41 ms | 67.8% bf16 MFU | 259304 tok/s +step 7604/18794 | loss 3.143740 (+0.21z)| norm 0.3369 (+1.34z)| lr 4.36e-03 | 2035.90 ms | 67.4% bf16 MFU | 259214 tok/s +step 7605/18794 | loss 3.163314 (+0.83z)| norm 0.3754 (+1.89z)| lr 4.36e-03 | 2017.17 ms | 68.0% bf16 MFU | 259249 tok/s +step 7606/18794 | loss 3.157654 (+0.64z)| norm 0.2751 (+0.33z)| lr 4.36e-03 | 2037.95 ms | 67.3% bf16 MFU | 259150 tok/s +step 7607/18794 | loss 3.156567 (+0.59z)| norm 0.2297 (-0.38z)| lr 4.36e-03 | 2018.91 ms | 68.0% bf16 MFU | 259177 tok/s +step 7608/18794 | loss 3.174232 (+1.17z)| norm 0.2952 (+0.62z)| lr 4.36e-03 | 1998.98 ms | 68.7% bf16 MFU | 259332 tok/s +step 7609/18794 | loss 3.172777 (+1.13z)| norm 0.3766 (+1.86z)| lr 4.36e-03 | 2035.07 ms | 67.4% bf16 MFU | 259247 tok/s +step 7610/18794 | loss 3.137371 (-0.04z)| norm 0.3050 (+0.77z)| lr 4.36e-03 | 2014.02 ms | 68.1% bf16 MFU | 259300 tok/s +step 7611/18794 | loss 3.074761 (-2.03z)| norm 0.2043 (-0.78z)| lr 4.36e-03 | 2015.46 ms | 68.1% bf16 MFU | 259342 tok/s +step 7612/18794 | loss 3.110486 (-0.87z)| norm 0.2761 (+0.32z)| lr 4.36e-03 | 2009.09 ms | 68.3% bf16 MFU | 259423 tok/s +step 7613/18794 | loss 3.117794 (-0.63z)| norm 0.3177 (+0.95z)| lr 4.36e-03 | 2022.16 ms | 67.9% bf16 MFU | 259415 tok/s +step 7614/18794 | loss 3.147484 (+0.34z)| norm 0.1990 (-0.88z)| lr 4.36e-03 | 2005.94 ms | 68.4% bf16 MFU | 259513 tok/s +step 7615/18794 | loss 3.143620 (+0.21z)| norm 0.1752 (-1.23z)| lr 4.36e-03 | 2018.12 ms | 68.0% bf16 MFU | 259527 tok/s +step 7616/18794 | loss 3.131073 (-0.19z)| norm 0.1992 (-0.85z)| lr 4.36e-03 | 2022.51 ms | 67.9% bf16 MFU | 259512 tok/s +step 7617/18794 | loss 3.122829 (-0.46z)| norm 0.2085 (-0.69z)| lr 4.36e-03 | 2025.72 ms | 67.7% bf16 MFU | 259477 tok/s +step 7618/18794 | loss 3.166121 (+0.96z)| norm 0.2308 (-0.34z)| lr 4.36e-03 | 2017.69 ms | 68.0% bf16 MFU | 259495 tok/s +step 7619/18794 | loss 3.166460 (+0.95z)| norm 0.1971 (-0.86z)| lr 4.36e-03 | 2025.18 ms | 67.8% bf16 MFU | 259465 tok/s +step 7620/18794 | loss 3.125236 (-0.39z)| norm 0.1875 (-0.99z)| lr 4.36e-03 | 2017.04 ms | 68.0% bf16 MFU | 259488 tok/s +step 7621/18794 | loss 3.223228 (+2.67z)| norm 0.2220 (-0.44z)| lr 4.36e-03 | 2016.24 ms | 68.1% bf16 MFU | 259515 tok/s +step 7622/18794 | loss 3.115980 (-0.69z)| norm 0.1914 (-0.93z)| lr 4.36e-03 | 2004.91 ms | 68.4% bf16 MFU | 259615 tok/s +step 7623/18794 | loss 3.152522 (+0.46z)| norm 0.1852 (-1.02z)| lr 4.36e-03 | 2005.88 ms | 68.4% bf16 MFU | 259703 tok/s +step 7624/18794 | loss 3.115257 (-0.70z)| norm 0.2327 (-0.29z)| lr 4.36e-03 | 2003.60 ms | 68.5% bf16 MFU | 259801 tok/s +step 7625/18794 | loss 3.073822 (-1.98z)| norm 0.2718 (+0.31z)| lr 4.35e-03 | 1993.04 ms | 68.9% bf16 MFU | 259964 tok/s +step 7626/18794 | loss 3.107417 (-0.92z)| norm 0.2033 (-0.75z)| lr 4.35e-03 | 2021.18 ms | 67.9% bf16 MFU | 259936 tok/s +step 7627/18794 | loss 3.103026 (-1.03z)| norm 0.2962 (+0.70z)| lr 4.35e-03 | 2014.03 ms | 68.1% bf16 MFU | 259955 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.049912 +step 7628/18794 | loss 3.177590 (+1.27z)| norm 0.3875 (+2.05z)| lr 4.35e-03 | 1997.43 ms | 68.7% bf16 MFU | 260081 tok/s +step 7629/18794 | loss 3.152477 (+0.49z)| norm 0.3468 (+1.40z)| lr 4.35e-03 | 2015.63 ms | 68.1% bf16 MFU | 260083 tok/s +step 7630/18794 | loss 3.155632 (+0.59z)| norm 0.2040 (-0.77z)| lr 4.35e-03 | 2020.86 ms | 67.9% bf16 MFU | 260050 tok/s +step 7631/18794 | loss 3.153616 (+0.51z)| norm 0.2458 (-0.14z)| lr 4.35e-03 | 2028.80 ms | 67.6% bf16 MFU | 259969 tok/s +step 7632/18794 | loss 3.155818 (+0.56z)| norm 0.2015 (-0.82z)| lr 4.35e-03 | 2017.92 ms | 68.0% bf16 MFU | 259961 tok/s +step 7633/18794 | loss 3.126904 (-0.35z)| norm 0.2149 (-0.61z)| lr 4.35e-03 | 2005.90 ms | 68.4% bf16 MFU | 260032 tok/s +step 7634/18794 | loss 3.171602 (+1.11z)| norm 0.2214 (-0.51z)| lr 4.35e-03 | 2021.89 ms | 67.9% bf16 MFU | 259996 tok/s +step 7635/18794 | loss 3.154774 (+0.55z)| norm 0.1842 (-1.05z)| lr 4.35e-03 | 2027.30 ms | 67.7% bf16 MFU | 259926 tok/s +step 7636/18794 | loss 3.126194 (-0.38z)| norm 0.1910 (-0.93z)| lr 4.35e-03 | 2009.95 ms | 68.3% bf16 MFU | 259972 tok/s +step 7637/18794 | loss 3.138597 (+0.02z)| norm 0.2772 (+0.36z)| lr 4.35e-03 | 2012.03 ms | 68.2% bf16 MFU | 260003 tok/s +step 7638/18794 | loss 3.132323 (-0.19z)| norm 0.2302 (-0.36z)| lr 4.35e-03 | 2003.47 ms | 68.5% bf16 MFU | 260087 tok/s +step 7639/18794 | loss 3.100791 (-1.22z)| norm 0.1913 (-0.94z)| lr 4.35e-03 | 2006.84 ms | 68.4% bf16 MFU | 260145 tok/s +step 7640/18794 | loss 3.149207 (+0.41z)| norm 0.2861 (+0.50z)| lr 4.35e-03 | 1995.04 ms | 68.8% bf16 MFU | 260278 tok/s +step 7641/18794 | loss 3.113866 (-0.78z)| norm 0.2548 (+0.04z)| lr 4.35e-03 | 2001.01 ms | 68.6% bf16 MFU | 260364 tok/s +step 7642/18794 | loss 3.101008 (-1.20z)| norm 0.2248 (-0.42z)| lr 4.35e-03 | 2021.94 ms | 67.9% bf16 MFU | 260311 tok/s +step 7643/18794 | loss 3.137387 (+0.03z)| norm 0.2148 (-0.56z)| lr 4.35e-03 | 2005.61 ms | 68.4% bf16 MFU | 260366 tok/s +step 7644/18794 | loss 3.174892 (+1.28z)| norm 0.1745 (-1.18z)| lr 4.35e-03 | 2013.45 ms | 68.2% bf16 MFU | 260367 tok/s +step 7645/18794 | loss 3.135393 (-0.08z)| norm 0.2241 (-0.39z)| lr 4.35e-03 | 2011.05 ms | 68.2% bf16 MFU | 260384 tok/s +step 7646/18794 | loss 3.181519 (+1.56z)| norm 0.3126 (+1.06z)| lr 4.35e-03 | 1998.80 ms | 68.7% bf16 MFU | 260480 tok/s +step 7647/18794 | loss 3.152059 (+0.51z)| norm 0.2838 (+0.58z)| lr 4.35e-03 | 2002.70 ms | 68.5% bf16 MFU | 260546 tok/s +step 7648/18794 | loss 3.150088 (+0.44z)| norm 0.1956 (-0.84z)| lr 4.34e-03 | 2016.10 ms | 68.1% bf16 MFU | 260521 tok/s +step 7649/18794 | loss 3.114863 (-0.80z)| norm 0.3213 (+1.18z)| lr 4.34e-03 | 1998.88 ms | 68.7% bf16 MFU | 260609 tok/s +step 7650/18794 | loss 3.183112 (+1.58z)| norm 0.3344 (+1.36z)| lr 4.34e-03 | 2010.41 ms | 68.3% bf16 MFU | 260618 tok/s +step 7651/18794 | loss 3.127148 (-0.37z)| norm 0.2247 (-0.41z)| lr 4.34e-03 | 2013.22 ms | 68.2% bf16 MFU | 260608 tok/s +step 7652/18794 | loss 3.086586 (-1.74z)| norm 0.1926 (-0.92z)| lr 4.34e-03 | 2010.01 ms | 68.3% bf16 MFU | 260620 tok/s +step 7653/18794 | loss 3.218926 (+2.69z)| norm 0.2162 (-0.52z)| lr 4.34e-03 | 1983.82 ms | 69.2% bf16 MFU | 260803 tok/s +step 7654/18794 | loss 3.152238 (+0.49z)| norm 0.2641 (+0.26z)| lr 4.34e-03 | 2017.08 ms | 68.0% bf16 MFU | 260759 tok/s +step 7655/18794 | loss 3.147472 (+0.34z)| norm 0.2930 (+0.72z)| lr 4.34e-03 | 1994.31 ms | 68.8% bf16 MFU | 260866 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.876698 +step 7656/18794 | loss 3.144707 (+0.26z)| norm 0.4354 (+2.88z)| lr 4.34e-03 | 2014.88 ms | 68.1% bf16 MFU | 260833 tok/s +step 7657/18794 | loss 3.172518 (+1.19z)| norm 0.3415 (+1.39z)| lr 4.34e-03 | 2007.36 ms | 68.4% bf16 MFU | 260850 tok/s +step 7658/18794 | loss 3.145680 (+0.26z)| norm 0.2023 (-0.75z)| lr 4.34e-03 | 2021.10 ms | 67.9% bf16 MFU | 260778 tok/s +step 7659/18794 | loss 3.146184 (+0.26z)| norm 0.3066 (+0.84z)| lr 4.34e-03 | 1993.38 ms | 68.8% bf16 MFU | 260890 tok/s +step 7660/18794 | loss 3.187021 (+1.65z)| norm 0.3356 (+1.27z)| lr 4.34e-03 | 1986.40 ms | 69.1% bf16 MFU | 261042 tok/s +step 7661/18794 | loss 3.162360 (+0.79z)| norm 0.2189 (-0.52z)| lr 4.34e-03 | 1994.38 ms | 68.8% bf16 MFU | 261135 tok/s +step 7662/18794 | loss 3.116436 (-0.79z)| norm 0.3106 (+0.88z)| lr 4.34e-03 | 1999.66 ms | 68.6% bf16 MFU | 261187 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.146786 +step 7663/18794 | loss 3.170234 (+1.04z)| norm 0.4563 (+3.15z)| lr 4.34e-03 | 2000.54 ms | 68.6% bf16 MFU | 261232 tok/s +step 7664/18794 | loss 3.140760 (+0.01z)| norm 0.2621 (+0.16z)| lr 4.34e-03 | 2000.87 ms | 68.6% bf16 MFU | 261271 tok/s +step 7665/18794 | loss 3.162128 (+0.75z)| norm 0.2520 (-0.01z)| lr 4.34e-03 | 2008.21 ms | 68.3% bf16 MFU | 261261 tok/s +step 7666/18794 | loss 3.122494 (-0.61z)| norm 0.2134 (-0.60z)| lr 4.34e-03 | 2013.14 ms | 68.2% bf16 MFU | 261220 tok/s +step 7667/18794 | loss 3.134383 (-0.22z)| norm 0.1740 (-1.19z)| lr 4.34e-03 | 1991.17 ms | 68.9% bf16 MFU | 261324 tok/s +step 7668/18794 | loss 3.132961 (-0.27z)| norm 0.1817 (-1.06z)| lr 4.34e-03 | 1997.59 ms | 68.7% bf16 MFU | 261381 tok/s +step 7669/18794 | loss 3.163578 (+0.79z)| norm 0.2191 (-0.47z)| lr 4.34e-03 | 2002.13 ms | 68.5% bf16 MFU | 261405 tok/s +step 7670/18794 | loss 3.164364 (+0.82z)| norm 0.1951 (-0.83z)| lr 4.33e-03 | 1988.03 ms | 69.0% bf16 MFU | 261521 tok/s +step 7671/18794 | loss 3.161829 (+0.72z)| norm 0.1908 (-0.89z)| lr 4.33e-03 | 2015.32 ms | 68.1% bf16 MFU | 261453 tok/s +step 7672/18794 | loss 3.172311 (+1.08z)| norm 0.2107 (-0.57z)| lr 4.33e-03 | 2018.96 ms | 68.0% bf16 MFU | 261364 tok/s +step 7673/18794 | loss 3.149990 (+0.31z)| norm 0.2133 (-0.51z)| lr 4.33e-03 | 1994.08 ms | 68.8% bf16 MFU | 261442 tok/s +step 7674/18794 | loss 3.109594 (-1.10z)| norm 0.2582 (+0.25z)| lr 4.33e-03 | 2000.13 ms | 68.6% bf16 MFU | 261476 tok/s +step 7675/18794 | loss 3.166982 (+0.94z)| norm 0.2356 (-0.14z)| lr 4.33e-03 | 2003.54 ms | 68.5% bf16 MFU | 261487 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.164940 +step 7676/18794 | loss 3.154995 (+0.51z)| norm 0.3760 (+2.16z)| lr 4.33e-03 | 1988.84 ms | 69.0% bf16 MFU | 261593 tok/s +step 7677/18794 | loss 3.185467 (+1.57z)| norm 0.3401 (+1.53z)| lr 4.33e-03 | 2008.90 ms | 68.3% bf16 MFU | 261562 tok/s +step 7678/18794 | loss 3.127501 (-0.50z)| norm 0.2155 (-0.52z)| lr 4.33e-03 | 1987.71 ms | 69.0% bf16 MFU | 261673 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.344455 +step 7679/18794 | loss 3.187737 (+1.63z)| norm 0.3951 (+2.34z)| lr 4.33e-03 | 1991.97 ms | 68.9% bf16 MFU | 261749 tok/s +step 7680/18794 | loss 3.112623 (-1.05z)| norm 0.2541 (+0.08z)| lr 4.33e-03 | 1992.93 ms | 68.9% bf16 MFU | 261815 tok/s +step 7681/18794 | loss 3.199928 (+2.03z)| norm 0.2049 (-0.69z)| lr 4.33e-03 | 2007.85 ms | 68.3% bf16 MFU | 261780 tok/s +step 7682/18794 | loss 3.144214 (+0.05z)| norm 0.1826 (-1.04z)| lr 4.33e-03 | 2000.58 ms | 68.6% bf16 MFU | 261795 tok/s +step 7683/18794 | loss 3.054366 (-3.00z)| norm 0.2279 (-0.31z)| lr 4.33e-03 | 2004.94 ms | 68.4% bf16 MFU | 261780 tok/s +step 7684/18794 | loss 3.132004 (-0.36z)| norm 0.3395 (+1.46z)| lr 4.33e-03 | 2000.22 ms | 68.6% bf16 MFU | 261797 tok/s +step 7685/18794 | loss 3.172998 (+1.04z)| norm 0.3064 (+0.92z)| lr 4.33e-03 | 1980.71 ms | 69.3% bf16 MFU | 261942 tok/s +step 7686/18794 | loss 3.149513 (+0.23z)| norm 0.2538 (+0.08z)| lr 4.33e-03 | 1996.45 ms | 68.7% bf16 MFU | 261975 tok/s +step 7687/18794 | loss 3.146979 (+0.13z)| norm 0.2277 (-0.35z)| lr 4.33e-03 | 1999.08 ms | 68.6% bf16 MFU | 261990 tok/s +step 7688/18794 | loss 3.183508 (+1.36z)| norm 0.2276 (-0.35z)| lr 4.33e-03 | 1988.51 ms | 69.0% bf16 MFU | 262073 tok/s +step 7689/18794 | loss 3.122249 (-0.72z)| norm 0.2412 (-0.13z)| lr 4.33e-03 | 1993.04 ms | 68.9% bf16 MFU | 262122 tok/s +step 7690/18794 | loss 3.120144 (-0.78z)| norm 0.2284 (-0.35z)| lr 4.33e-03 | 1986.22 ms | 69.1% bf16 MFU | 262214 tok/s +step 7691/18794 | loss 3.138927 (-0.13z)| norm 0.3238 (+1.17z)| lr 4.33e-03 | 1983.65 ms | 69.2% bf16 MFU | 262319 tok/s +step 7692/18794 | loss 3.141875 (-0.04z)| norm 0.3162 (+1.02z)| lr 4.33e-03 | 1985.56 ms | 69.1% bf16 MFU | 262405 tok/s +step 7693/18794 | loss 3.143484 (+0.01z)| norm 0.2664 (+0.22z)| lr 4.32e-03 | 2009.22 ms | 68.3% bf16 MFU | 262332 tok/s +reducing beta2 to 0.9 and lr/wd by 0.870 due to grad z-score of 4.023734 +step 7694/18794 | loss 3.142271 (-0.06z)| norm 0.5299 (+4.02z)| lr 3.76e-03 | 1980.01 ms | 69.3% bf16 MFU | 262455 tok/s +step 7695/18794 | loss 3.136607 (-0.26z)| norm 0.1631 (-1.35z)| lr 4.32e-03 | 1979.93 ms | 69.3% bf16 MFU | 262572 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.963412 +step 7696/18794 | loss 3.068964 (-2.54z)| norm 0.4698 (+2.96z)| lr 4.32e-03 | 1991.97 ms | 68.9% bf16 MFU | 262604 tok/s +step 7697/18794 | loss 3.183156 (+1.34z)| norm 0.3068 (+0.67z)| lr 4.32e-03 | 1996.46 ms | 68.7% bf16 MFU | 262604 tok/s +step 7698/18794 | loss 3.158047 (+0.49z)| norm 0.2298 (-0.42z)| lr 4.32e-03 | 1977.81 ms | 69.4% bf16 MFU | 262728 tok/s +step 7699/18794 | loss 3.177025 (+1.11z)| norm 0.3794 (+1.65z)| lr 4.32e-03 | 1978.68 ms | 69.4% bf16 MFU | 262840 tok/s +step 7700/18794 | loss 3.151002 (+0.23z)| norm 0.1870 (-1.02z)| lr 4.32e-03 | 1985.66 ms | 69.1% bf16 MFU | 262900 tok/s +step 7701/18794 | loss 3.102435 (-1.39z)| norm 0.4032 (+1.92z)| lr 4.32e-03 | 1991.49 ms | 68.9% bf16 MFU | 262918 tok/s +step 7702/18794 | loss 3.126640 (-0.58z)| norm 0.1956 (-0.89z)| lr 4.32e-03 | 2004.62 ms | 68.5% bf16 MFU | 262849 tok/s +step 7703/18794 | loss 3.208470 (+2.09z)| norm 0.3044 (+0.57z)| lr 4.32e-03 | 1987.16 ms | 69.1% bf16 MFU | 262899 tok/s +step 7704/18794 | loss 3.170017 (+0.82z)| norm 0.1928 (-0.92z)| lr 4.32e-03 | 1984.96 ms | 69.1% bf16 MFU | 262960 tok/s +step 7705/18794 | loss 3.132174 (-0.41z)| norm 0.2394 (-0.27z)| lr 4.32e-03 | 2014.35 ms | 68.1% bf16 MFU | 262826 tok/s +step 7706/18794 | loss 3.159420 (+0.48z)| norm 0.2158 (-0.59z)| lr 4.32e-03 | 1988.29 ms | 69.0% bf16 MFU | 262869 tok/s +step 7707/18794 | loss 3.110570 (-1.10z)| norm 0.1900 (-0.93z)| lr 4.32e-03 | 1994.19 ms | 68.8% bf16 MFU | 262871 tok/s +step 7708/18794 | loss 3.144281 (+0.01z)| norm 0.1926 (-0.88z)| lr 4.32e-03 | 1983.31 ms | 69.2% bf16 MFU | 262945 tok/s +step 7709/18794 | loss 3.192794 (+1.58z)| norm 0.3176 (+0.84z)| lr 4.32e-03 | 2000.87 ms | 68.6% bf16 MFU | 262899 tok/s +step 7710/18794 | loss 3.159109 (+0.48z)| norm 0.2806 (+0.33z)| lr 4.32e-03 | 1978.83 ms | 69.4% bf16 MFU | 263002 tok/s +step 7711/18794 | loss 3.163683 (+0.61z)| norm 0.2135 (-0.59z)| lr 4.32e-03 | 1980.61 ms | 69.3% bf16 MFU | 263087 tok/s +step 7712/18794 | loss 3.133189 (-0.41z)| norm 0.3147 (+0.79z)| lr 4.32e-03 | 1979.40 ms | 69.3% bf16 MFU | 263177 tok/s +step 7713/18794 | loss 3.113596 (-1.06z)| norm 0.2295 (-0.37z)| lr 4.32e-03 | 1985.49 ms | 69.1% bf16 MFU | 263221 tok/s +step 7714/18794 | loss 3.084660 (-1.97z)| norm 0.3105 (+0.74z)| lr 4.32e-03 | 2006.42 ms | 68.4% bf16 MFU | 263125 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.250596 +step 7715/18794 | loss 3.163960 (+0.62z)| norm 0.4261 (+2.25z)| lr 4.32e-03 | 1992.56 ms | 68.9% bf16 MFU | 263125 tok/s +step 7716/18794 | loss 3.145150 (-0.00z)| norm 0.1858 (-1.00z)| lr 4.31e-03 | 1993.97 ms | 68.8% bf16 MFU | 263115 tok/s +step 7717/18794 | loss 3.189481 (+1.42z)| norm 0.3960 (+1.79z)| lr 4.31e-03 | 1984.81 ms | 69.1% bf16 MFU | 263167 tok/s +step 7718/18794 | loss 3.092194 (-1.70z)| norm 0.2496 (-0.16z)| lr 4.31e-03 | 1994.17 ms | 68.8% bf16 MFU | 263154 tok/s +step 7719/18794 | loss 3.155126 (+0.33z)| norm 0.2008 (-0.81z)| lr 4.31e-03 | 1989.01 ms | 69.0% bf16 MFU | 263176 tok/s +step 7720/18794 | loss 3.087158 (-1.82z)| norm 0.1905 (-0.95z)| lr 4.31e-03 | 1987.03 ms | 69.1% bf16 MFU | 263210 tok/s +step 7721/18794 | loss 3.196795 (+1.69z)| norm 0.3700 (+1.41z)| lr 4.31e-03 | 1984.82 ms | 69.1% bf16 MFU | 263257 tok/s +step 7722/18794 | loss 3.151054 (+0.21z)| norm 0.2444 (-0.26z)| lr 4.31e-03 | 1981.45 ms | 69.3% bf16 MFU | 263324 tok/s +step 7723/18794 | loss 3.221311 (+2.38z)| norm 0.2676 (+0.04z)| lr 4.31e-03 | 1980.84 ms | 69.3% bf16 MFU | 263392 tok/s +step 7724/18794 | loss 3.166316 (+0.64z)| norm 0.2163 (-0.64z)| lr 4.31e-03 | 1981.57 ms | 69.3% bf16 MFU | 263451 tok/s +step 7725/18794 | loss 3.131186 (-0.49z)| norm 0.1733 (-1.19z)| lr 4.31e-03 | 1991.43 ms | 68.9% bf16 MFU | 263442 tok/s +step 7726/18794 | loss 3.143645 (-0.10z)| norm 0.2958 (+0.42z)| lr 4.31e-03 | 1988.34 ms | 69.0% bf16 MFU | 263454 tok/s +step 7727/18794 | loss 3.139449 (-0.25z)| norm 0.4011 (+1.77z)| lr 4.31e-03 | 1986.36 ms | 69.1% bf16 MFU | 263479 tok/s +step 7728/18794 | loss 3.183497 (+1.19z)| norm 0.3415 (+1.01z)| lr 4.31e-03 | 1987.85 ms | 69.0% bf16 MFU | 263492 tok/s +step 7729/18794 | loss 3.081050 (-2.10z)| norm 0.3962 (+1.71z)| lr 4.31e-03 | 2038.10 ms | 67.3% bf16 MFU | 263180 tok/s +step 7730/18794 | loss 3.139275 (-0.23z)| norm 0.2806 (+0.19z)| lr 4.31e-03 | 2038.55 ms | 67.3% bf16 MFU | 262880 tok/s +step 7731/18794 | loss 3.165945 (+0.62z)| norm 0.2597 (-0.09z)| lr 4.31e-03 | 2034.27 ms | 67.5% bf16 MFU | 262622 tok/s +step 7732/18794 | loss 3.122660 (-0.75z)| norm 0.4107 (+1.84z)| lr 4.31e-03 | 2040.34 ms | 67.3% bf16 MFU | 262339 tok/s +step 7733/18794 | loss 3.051957 (-2.86z)| norm 0.2027 (-0.85z)| lr 4.31e-03 | 2037.63 ms | 67.3% bf16 MFU | 262088 tok/s +step 7734/18794 | loss 3.175261 (+0.91z)| norm 0.3748 (+1.34z)| lr 4.31e-03 | 2030.34 ms | 67.6% bf16 MFU | 261894 tok/s +step 7735/18794 | loss 3.136554 (-0.27z)| norm 0.2005 (-0.89z)| lr 4.31e-03 | 2027.16 ms | 67.7% bf16 MFU | 261731 tok/s +step 7736/18794 | loss 3.127916 (-0.53z)| norm 0.4187 (+1.86z)| lr 4.31e-03 | 2045.41 ms | 67.1% bf16 MFU | 261461 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.473853 +step 7737/18794 | loss 3.085602 (-1.78z)| norm 0.5676 (+3.47z)| lr 4.31e-03 | 2045.12 ms | 67.1% bf16 MFU | 261206 tok/s +step 7738/18794 | loss 3.148133 (+0.10z)| norm 0.1795 (-1.12z)| lr 4.30e-03 | 2037.29 ms | 67.4% bf16 MFU | 261013 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.321450 +step 7739/18794 | loss 3.155890 (+0.32z)| norm 0.4785 (+2.32z)| lr 4.30e-03 | 2037.69 ms | 67.3% bf16 MFU | 260827 tok/s +step 7740/18794 | loss 3.111456 (-1.02z)| norm 0.1893 (-1.00z)| lr 4.30e-03 | 2038.37 ms | 67.3% bf16 MFU | 260646 tok/s +step 7741/18794 | loss 3.174117 (+0.86z)| norm 0.2735 (-0.04z)| lr 4.30e-03 | 2035.08 ms | 67.4% bf16 MFU | 260495 tok/s +step 7742/18794 | loss 3.122315 (-0.72z)| norm 0.2891 (+0.14z)| lr 4.30e-03 | 2031.02 ms | 67.6% bf16 MFU | 260377 tok/s +step 7743/18794 | loss 3.095840 (-1.50z)| norm 0.2842 (+0.07z)| lr 4.30e-03 | 2042.94 ms | 67.2% bf16 MFU | 260190 tok/s +step 7744/18794 | loss 3.122075 (-0.69z)| norm 0.2059 (-0.84z)| lr 4.30e-03 | 2019.25 ms | 68.0% bf16 MFU | 260163 tok/s +step 7745/18794 | loss 3.144382 (-0.02z)| norm 0.2123 (-0.76z)| lr 4.30e-03 | 2035.65 ms | 67.4% bf16 MFU | 260032 tok/s +step 7746/18794 | loss 3.153820 (+0.28z)| norm 0.1882 (-1.02z)| lr 4.30e-03 | 2042.28 ms | 67.2% bf16 MFU | 259867 tok/s +step 7747/18794 | loss 3.145848 (+0.03z)| norm 0.2630 (-0.16z)| lr 4.30e-03 | 2034.61 ms | 67.4% bf16 MFU | 259758 tok/s +step 7748/18794 | loss 3.161299 (+0.50z)| norm 0.2122 (-0.75z)| lr 4.30e-03 | 2025.16 ms | 67.8% bf16 MFU | 259714 tok/s +step 7749/18794 | loss 3.152235 (+0.21z)| norm 0.2216 (-0.63z)| lr 4.30e-03 | 2039.81 ms | 67.3% bf16 MFU | 259580 tok/s +step 7750/18794 | loss 3.164177 (+0.59z)| norm 0.2360 (-0.45z)| lr 4.30e-03 | 2028.40 ms | 67.7% bf16 MFU | 259524 tok/s +val loss 3.158122 +HellaSwag: 2931/10042 = 0.291874: 0/1256 +step 7751/18794 | loss 3.140244 (-0.15z)| norm 0.2110 (-0.74z)| lr 4.30e-03 | 2032.06 ms | 67.5% bf16 MFU | 259449 tok/s +step 7752/18794 | loss 3.143412 (-0.07z)| norm 0.2134 (-0.71z)| lr 4.30e-03 | 2027.22 ms | 67.7% bf16 MFU | 259407 tok/s +step 7753/18794 | loss 3.103721 (-1.30z)| norm 0.1839 (-1.05z)| lr 4.30e-03 | 2048.64 ms | 67.0% bf16 MFU | 259233 tok/s +step 7754/18794 | loss 3.131255 (-0.42z)| norm 0.2345 (-0.46z)| lr 4.30e-03 | 2040.11 ms | 67.3% bf16 MFU | 259121 tok/s +step 7755/18794 | loss 3.153906 (+0.30z)| norm 0.2331 (-0.47z)| lr 4.30e-03 | 2029.86 ms | 67.6% bf16 MFU | 259079 tok/s +step 7756/18794 | loss 3.180301 (+1.12z)| norm 0.1991 (-0.85z)| lr 4.30e-03 | 2027.00 ms | 67.7% bf16 MFU | 259058 tok/s +step 7757/18794 | loss 3.129349 (-0.47z)| norm 0.2650 (-0.07z)| lr 4.30e-03 | 2031.94 ms | 67.5% bf16 MFU | 259006 tok/s +step 7758/18794 | loss 3.128055 (-0.51z)| norm 0.3791 (+1.24z)| lr 4.30e-03 | 2035.24 ms | 67.4% bf16 MFU | 258936 tok/s +step 7759/18794 | loss 3.151476 (+0.23z)| norm 0.3504 (+0.90z)| lr 4.30e-03 | 2033.12 ms | 67.5% bf16 MFU | 258883 tok/s +step 7760/18794 | loss 3.191440 (+1.49z)| norm 0.2107 (-0.71z)| lr 4.30e-03 | 2017.89 ms | 68.0% bf16 MFU | 258930 tok/s +step 7761/18794 | loss 3.125546 (-0.58z)| norm 0.2957 (+0.27z)| lr 4.29e-03 | 2036.63 ms | 67.4% bf16 MFU | 258855 tok/s +step 7762/18794 | loss 3.094939 (-1.52z)| norm 0.2190 (-0.61z)| lr 4.29e-03 | 2043.87 ms | 67.1% bf16 MFU | 258738 tok/s +step 7763/18794 | loss 3.148474 (+0.16z)| norm 0.2196 (-0.59z)| lr 4.29e-03 | 2022.69 ms | 67.8% bf16 MFU | 258761 tok/s +step 7764/18794 | loss 3.147636 (+0.13z)| norm 0.2252 (-0.52z)| lr 4.29e-03 | 2035.39 ms | 67.4% bf16 MFU | 258702 tok/s +step 7765/18794 | loss 3.168950 (+0.79z)| norm 0.2027 (-0.78z)| lr 4.29e-03 | 2021.70 ms | 67.9% bf16 MFU | 258734 tok/s +step 7766/18794 | loss 3.160507 (+0.52z)| norm 0.1740 (-1.11z)| lr 4.29e-03 | 2009.03 ms | 68.3% bf16 MFU | 258845 tok/s +step 7767/18794 | loss 3.117568 (-0.82z)| norm 0.2089 (-0.70z)| lr 4.29e-03 | 1998.67 ms | 68.7% bf16 MFU | 259019 tok/s +step 7768/18794 | loss 3.136118 (-0.24z)| norm 0.1809 (-1.03z)| lr 4.29e-03 | 2033.69 ms | 67.5% bf16 MFU | 258958 tok/s +step 7769/18794 | loss 3.133663 (-0.31z)| norm 0.2529 (-0.19z)| lr 4.29e-03 | 2011.48 ms | 68.2% bf16 MFU | 259043 tok/s +step 7770/18794 | loss 3.202673 (+1.81z)| norm 0.2747 (+0.06z)| lr 4.29e-03 | 2019.47 ms | 68.0% bf16 MFU | 259071 tok/s +step 7771/18794 | loss 3.123776 (-0.61z)| norm 0.3071 (+0.44z)| lr 4.29e-03 | 2038.82 ms | 67.3% bf16 MFU | 258975 tok/s +step 7772/18794 | loss 3.221109 (+2.32z)| norm 0.2710 (-0.00z)| lr 4.29e-03 | 2019.27 ms | 68.0% bf16 MFU | 259009 tok/s +step 7773/18794 | loss 3.084920 (-1.73z)| norm 0.2195 (-0.62z)| lr 4.29e-03 | 2024.67 ms | 67.8% bf16 MFU | 259006 tok/s +step 7774/18794 | loss 3.164127 (+0.60z)| norm 0.2680 (-0.04z)| lr 4.29e-03 | 2024.38 ms | 67.8% bf16 MFU | 259005 tok/s +step 7775/18794 | loss 3.157587 (+0.41z)| norm 0.2005 (-0.84z)| lr 4.29e-03 | 2014.88 ms | 68.1% bf16 MFU | 259065 tok/s +step 7776/18794 | loss 3.196141 (+1.53z)| norm 0.2087 (-0.73z)| lr 4.29e-03 | 2019.24 ms | 68.0% bf16 MFU | 259094 tok/s +step 7777/18794 | loss 3.155547 (+0.34z)| norm 0.2115 (-0.68z)| lr 4.29e-03 | 2021.66 ms | 67.9% bf16 MFU | 259106 tok/s +step 7778/18794 | loss 3.149260 (+0.15z)| norm 0.2098 (-0.70z)| lr 4.29e-03 | 2027.87 ms | 67.7% bf16 MFU | 259078 tok/s +step 7779/18794 | loss 3.148251 (+0.13z)| norm 0.1601 (-1.27z)| lr 4.29e-03 | 2023.38 ms | 67.8% bf16 MFU | 259080 tok/s +step 7780/18794 | loss 3.127236 (-0.50z)| norm 0.2409 (-0.30z)| lr 4.29e-03 | 2021.42 ms | 67.9% bf16 MFU | 259094 tok/s +step 7781/18794 | loss 3.147275 (+0.12z)| norm 0.3143 (+0.58z)| lr 4.29e-03 | 2028.95 ms | 67.6% bf16 MFU | 259060 tok/s +step 7782/18794 | loss 3.150256 (+0.21z)| norm 0.2900 (+0.27z)| lr 4.29e-03 | 2024.34 ms | 67.8% bf16 MFU | 259056 tok/s +step 7783/18794 | loss 3.187160 (+1.33z)| norm 0.2034 (-0.77z)| lr 4.28e-03 | 2022.25 ms | 67.9% bf16 MFU | 259066 tok/s +step 7784/18794 | loss 3.160569 (+0.49z)| norm 0.2575 (-0.11z)| lr 4.28e-03 | 2015.78 ms | 68.1% bf16 MFU | 259118 tok/s +step 7785/18794 | loss 3.174245 (+0.92z)| norm 0.2219 (-0.53z)| lr 4.28e-03 | 2024.49 ms | 67.8% bf16 MFU | 259110 tok/s +step 7786/18794 | loss 3.168319 (+0.72z)| norm 0.2381 (-0.33z)| lr 4.28e-03 | 2031.88 ms | 67.5% bf16 MFU | 259056 tok/s +step 7787/18794 | loss 3.116555 (-0.88z)| norm 0.2255 (-0.49z)| lr 4.28e-03 | 2014.20 ms | 68.1% bf16 MFU | 259118 tok/s +step 7788/18794 | loss 3.107300 (-1.15z)| norm 0.2220 (-0.53z)| lr 4.28e-03 | 2014.82 ms | 68.1% bf16 MFU | 259173 tok/s +step 7789/18794 | loss 3.176778 (+1.00z)| norm 0.2546 (-0.13z)| lr 4.28e-03 | 2032.46 ms | 67.5% bf16 MFU | 259113 tok/s +step 7790/18794 | loss 3.112313 (-1.00z)| norm 0.2393 (-0.32z)| lr 4.28e-03 | 2022.08 ms | 67.9% bf16 MFU | 259121 tok/s +step 7791/18794 | loss 3.134558 (-0.31z)| norm 0.1937 (-0.86z)| lr 4.28e-03 | 2013.29 ms | 68.2% bf16 MFU | 259186 tok/s +step 7792/18794 | loss 3.162781 (+0.56z)| norm 0.3086 (+0.54z)| lr 4.28e-03 | 2033.30 ms | 67.5% bf16 MFU | 259119 tok/s +step 7793/18794 | loss 3.153242 (+0.26z)| norm 0.3480 (+1.00z)| lr 4.28e-03 | 1995.24 ms | 68.8% bf16 MFU | 259301 tok/s +step 7794/18794 | loss 3.145325 (+0.01z)| norm 0.2249 (-0.47z)| lr 4.28e-03 | 2017.22 ms | 68.0% bf16 MFU | 259332 tok/s +step 7795/18794 | loss 3.207036 (+1.87z)| norm 0.1865 (-0.97z)| lr 4.28e-03 | 2019.16 ms | 68.0% bf16 MFU | 259348 tok/s +step 7796/18794 | loss 3.141125 (-0.16z)| norm 0.1940 (-0.87z)| lr 4.28e-03 | 2005.37 ms | 68.4% bf16 MFU | 259453 tok/s +step 7797/18794 | loss 3.099560 (-1.43z)| norm 0.3252 (+0.86z)| lr 4.28e-03 | 2004.35 ms | 68.5% bf16 MFU | 259559 tok/s +step 7798/18794 | loss 3.099564 (-1.40z)| norm 0.2911 (+0.40z)| lr 4.28e-03 | 2025.57 ms | 67.8% bf16 MFU | 259523 tok/s +step 7799/18794 | loss 3.085032 (-1.80z)| norm 0.2095 (-0.66z)| lr 4.28e-03 | 2005.71 ms | 68.4% bf16 MFU | 259616 tok/s +step 7800/18794 | loss 3.105006 (-1.17z)| norm 0.1930 (-0.88z)| lr 4.28e-03 | 2028.51 ms | 67.7% bf16 MFU | 259558 tok/s +step 7801/18794 | loss 3.185828 (+1.26z)| norm 0.2188 (-0.52z)| lr 4.28e-03 | 2012.28 ms | 68.2% bf16 MFU | 259608 tok/s +step 7802/18794 | loss 3.169431 (+0.75z)| norm 0.3544 (+1.29z)| lr 4.28e-03 | 2021.67 ms | 67.9% bf16 MFU | 259594 tok/s +step 7803/18794 | loss 3.104971 (-1.19z)| norm 0.3513 (+1.24z)| lr 4.28e-03 | 2013.37 ms | 68.2% bf16 MFU | 259635 tok/s +step 7804/18794 | loss 3.207244 (+1.90z)| norm 0.2353 (-0.32z)| lr 4.28e-03 | 2028.83 ms | 67.6% bf16 MFU | 259574 tok/s +step 7805/18794 | loss 3.123555 (-0.62z)| norm 0.3352 (+1.00z)| lr 4.28e-03 | 1988.36 ms | 69.0% bf16 MFU | 259779 tok/s +step 7806/18794 | loss 3.152811 (+0.27z)| norm 0.3300 (+0.92z)| lr 4.27e-03 | 2024.21 ms | 67.8% bf16 MFU | 259740 tok/s +step 7807/18794 | loss 3.162481 (+0.54z)| norm 0.2357 (-0.35z)| lr 4.27e-03 | 2024.20 ms | 67.8% bf16 MFU | 259704 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.348664 +step 7808/18794 | loss 3.174515 (+0.89z)| norm 0.4438 (+2.35z)| lr 4.27e-03 | 2022.90 ms | 67.8% bf16 MFU | 259678 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.589485 +step 7809/18794 | loss 3.133978 (-0.31z)| norm 0.4702 (+2.59z)| lr 4.27e-03 | 2006.60 ms | 68.4% bf16 MFU | 259758 tok/s +step 7810/18794 | loss 3.123956 (-0.61z)| norm 0.2193 (-0.58z)| lr 4.27e-03 | 2007.57 ms | 68.4% bf16 MFU | 259828 tok/s +step 7811/18794 | loss 3.122064 (-0.65z)| norm 0.3627 (+1.21z)| lr 4.27e-03 | 2014.68 ms | 68.1% bf16 MFU | 259848 tok/s +step 7812/18794 | loss 3.136116 (-0.22z)| norm 0.1918 (-0.93z)| lr 4.27e-03 | 2007.46 ms | 68.4% bf16 MFU | 259914 tok/s +step 7813/18794 | loss 3.110435 (-1.00z)| norm 0.2898 (+0.30z)| lr 4.27e-03 | 1999.69 ms | 68.6% bf16 MFU | 260028 tok/s +step 7814/18794 | loss 3.195251 (+1.55z)| norm 0.2564 (-0.12z)| lr 4.27e-03 | 1992.92 ms | 68.9% bf16 MFU | 260180 tok/s +step 7815/18794 | loss 3.130379 (-0.42z)| norm 0.2234 (-0.52z)| lr 4.27e-03 | 2029.84 ms | 67.6% bf16 MFU | 260085 tok/s +step 7816/18794 | loss 3.160347 (+0.49z)| norm 0.2215 (-0.55z)| lr 4.27e-03 | 2009.02 ms | 68.3% bf16 MFU | 260130 tok/s +step 7817/18794 | loss 3.170451 (+0.81z)| norm 0.2297 (-0.43z)| lr 4.27e-03 | 1998.23 ms | 68.7% bf16 MFU | 260242 tok/s +step 7818/18794 | loss 3.114110 (-0.94z)| norm 0.2935 (+0.40z)| lr 4.27e-03 | 2017.06 ms | 68.0% bf16 MFU | 260226 tok/s +step 7819/18794 | loss 3.111317 (-1.01z)| norm 0.1946 (-0.89z)| lr 4.27e-03 | 2003.67 ms | 68.5% bf16 MFU | 260298 tok/s +step 7820/18794 | loss 3.088678 (-1.71z)| norm 0.4188 (+1.98z)| lr 4.27e-03 | 2001.67 ms | 68.6% bf16 MFU | 260379 tok/s +step 7821/18794 | loss 3.092599 (-1.56z)| norm 0.4201 (+1.96z)| lr 4.27e-03 | 1996.78 ms | 68.7% bf16 MFU | 260489 tok/s +step 7822/18794 | loss 3.002982 (-3.93z)| norm 0.2122 (-0.67z)| lr 4.27e-03 | 2019.74 ms | 67.9% bf16 MFU | 260443 tok/s +step 7823/18794 | loss 3.090368 (-1.44z)| norm 0.2005 (-0.81z)| lr 4.27e-03 | 1995.01 ms | 68.8% bf16 MFU | 260561 tok/s +step 7824/18794 | loss 3.126414 (-0.39z)| norm 0.2701 (+0.06z)| lr 4.27e-03 | 2005.39 ms | 68.4% bf16 MFU | 260605 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.003670 +step 7825/18794 | loss 3.116484 (-0.67z)| norm 0.4280 (+2.00z)| lr 4.27e-03 | 2009.60 ms | 68.3% bf16 MFU | 260619 tok/s +step 7826/18794 | loss 3.070496 (-1.94z)| norm 0.3284 (+0.75z)| lr 4.27e-03 | 2003.96 ms | 68.5% bf16 MFU | 260670 tok/s +step 7827/18794 | loss 3.118613 (-0.57z)| norm 0.1603 (-1.32z)| lr 4.27e-03 | 2010.23 ms | 68.3% bf16 MFU | 260677 tok/s +step 7828/18794 | loss 3.107388 (-0.87z)| norm 0.1851 (-0.99z)| lr 4.26e-03 | 2012.34 ms | 68.2% bf16 MFU | 260670 tok/s +step 7829/18794 | loss 3.144607 (+0.17z)| norm 0.2076 (-0.69z)| lr 4.26e-03 | 1998.58 ms | 68.7% bf16 MFU | 260753 tok/s +step 7830/18794 | loss 3.152695 (+0.40z)| norm 0.2943 (+0.41z)| lr 4.26e-03 | 2005.08 ms | 68.4% bf16 MFU | 260789 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.111197 +step 7831/18794 | loss 3.101207 (-1.06z)| norm 0.4345 (+2.11z)| lr 4.26e-03 | 2005.82 ms | 68.4% bf16 MFU | 260819 tok/s +step 7832/18794 | loss 3.114503 (-0.67z)| norm 0.2116 (-0.63z)| lr 4.26e-03 | 1994.47 ms | 68.8% bf16 MFU | 260921 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.386962 +step 7833/18794 | loss 3.139781 (+0.03z)| norm 0.4597 (+2.39z)| lr 4.26e-03 | 1989.06 ms | 69.0% bf16 MFU | 261055 tok/s +step 7834/18794 | loss 3.114204 (-0.71z)| norm 0.3612 (+1.19z)| lr 4.26e-03 | 1994.84 ms | 68.8% bf16 MFU | 261143 tok/s +step 7835/18794 | loss 3.146037 (+0.23z)| norm 0.2589 (-0.08z)| lr 4.26e-03 | 2003.69 ms | 68.5% bf16 MFU | 261169 tok/s +step 7836/18794 | loss 3.099378 (-1.14z)| norm 0.3755 (+1.38z)| lr 4.26e-03 | 1983.27 ms | 69.2% bf16 MFU | 261328 tok/s +step 7837/18794 | loss 3.107981 (-0.90z)| norm 0.2220 (-0.52z)| lr 4.26e-03 | 1997.95 ms | 68.7% bf16 MFU | 261382 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.114598 +step 7838/18794 | loss 3.093149 (-1.31z)| norm 0.4244 (+2.11z)| lr 4.26e-03 | 1989.25 ms | 69.0% bf16 MFU | 261491 tok/s +step 7839/18794 | loss 3.086241 (-1.48z)| norm 0.2008 (-0.82z)| lr 4.26e-03 | 1996.11 ms | 68.7% bf16 MFU | 261550 tok/s +step 7840/18794 | loss 3.128286 (-0.26z)| norm 0.3851 (+1.66z)| lr 4.26e-03 | 1997.93 ms | 68.7% bf16 MFU | 261593 tok/s +step 7841/18794 | loss 3.113252 (-0.68z)| norm 0.2344 (-0.38z)| lr 4.26e-03 | 2003.33 ms | 68.5% bf16 MFU | 261599 tok/s +step 7842/18794 | loss 3.078964 (-1.66z)| norm 0.2227 (-0.53z)| lr 4.26e-03 | 2005.67 ms | 68.4% bf16 MFU | 261589 tok/s +step 7843/18794 | loss 3.073367 (-1.79z)| norm 0.2068 (-0.73z)| lr 4.26e-03 | 2012.73 ms | 68.2% bf16 MFU | 261534 tok/s +step 7844/18794 | loss 3.119951 (-0.46z)| norm 0.1978 (-0.85z)| lr 4.26e-03 | 2009.15 ms | 68.3% bf16 MFU | 261505 tok/s +step 7845/18794 | loss 3.112011 (-0.67z)| norm 0.2336 (-0.37z)| lr 4.26e-03 | 2009.16 ms | 68.3% bf16 MFU | 261477 tok/s +step 7846/18794 | loss 3.147129 (+0.33z)| norm 0.3065 (+0.60z)| lr 4.26e-03 | 2008.84 ms | 68.3% bf16 MFU | 261452 tok/s +step 7847/18794 | loss 3.133597 (-0.05z)| norm 0.1995 (-0.84z)| lr 4.26e-03 | 2008.08 ms | 68.3% bf16 MFU | 261434 tok/s +step 7848/18794 | loss 2.984977 (-3.92z)| norm 0.2244 (-0.50z)| lr 4.26e-03 | 1996.04 ms | 68.8% bf16 MFU | 261496 tok/s +step 7849/18794 | loss 3.106924 (-0.69z)| norm 0.2449 (-0.23z)| lr 4.26e-03 | 2008.49 ms | 68.3% bf16 MFU | 261473 tok/s +step 7850/18794 | loss 3.153668 (+0.54z)| norm 0.2577 (-0.06z)| lr 4.26e-03 | 2003.17 ms | 68.5% bf16 MFU | 261486 tok/s +step 7851/18794 | loss 3.102407 (-0.80z)| norm 0.2332 (-0.40z)| lr 4.25e-03 | 2004.12 ms | 68.5% bf16 MFU | 261492 tok/s +step 7852/18794 | loss 3.081427 (-1.32z)| norm 0.2632 (+0.00z)| lr 4.25e-03 | 1990.71 ms | 68.9% bf16 MFU | 261585 tok/s +step 7853/18794 | loss 3.077908 (-1.40z)| norm 0.2300 (-0.46z)| lr 4.25e-03 | 1994.91 ms | 68.8% bf16 MFU | 261647 tok/s +step 7854/18794 | loss 3.139768 (+0.20z)| norm 0.2197 (-0.59z)| lr 4.25e-03 | 1986.01 ms | 69.1% bf16 MFU | 261764 tok/s +step 7855/18794 | loss 3.148169 (+0.42z)| norm 0.2316 (-0.43z)| lr 4.25e-03 | 1986.99 ms | 69.1% bf16 MFU | 261869 tok/s +step 7856/18794 | loss 3.103794 (-0.71z)| norm 0.1986 (-0.88z)| lr 4.25e-03 | 1996.72 ms | 68.7% bf16 MFU | 261904 tok/s +step 7857/18794 | loss 3.093777 (-0.96z)| norm 0.1829 (-1.08z)| lr 4.25e-03 | 2006.50 ms | 68.4% bf16 MFU | 261874 tok/s +step 7858/18794 | loss 3.135720 (+0.13z)| norm 0.2014 (-0.81z)| lr 4.25e-03 | 1989.07 ms | 69.0% bf16 MFU | 261959 tok/s +step 7859/18794 | loss 3.087135 (-1.11z)| norm 0.1760 (-1.14z)| lr 4.25e-03 | 1996.01 ms | 68.8% bf16 MFU | 261994 tok/s +step 7860/18794 | loss 3.135038 (+0.14z)| norm 0.2338 (-0.35z)| lr 4.25e-03 | 2007.13 ms | 68.4% bf16 MFU | 261955 tok/s +step 7861/18794 | loss 3.147374 (+0.46z)| norm 0.3555 (+1.31z)| lr 4.25e-03 | 1994.79 ms | 68.8% bf16 MFU | 261999 tok/s +step 7862/18794 | loss 3.123247 (-0.18z)| norm 0.2474 (-0.17z)| lr 4.25e-03 | 2005.06 ms | 68.4% bf16 MFU | 261973 tok/s +step 7863/18794 | loss 3.076898 (-1.37z)| norm 0.2537 (-0.09z)| lr 4.25e-03 | 1986.39 ms | 69.1% bf16 MFU | 262072 tok/s +step 7864/18794 | loss 3.070246 (-1.50z)| norm 0.3727 (+1.50z)| lr 4.25e-03 | 1982.32 ms | 69.2% bf16 MFU | 262192 tok/s +step 7865/18794 | loss 3.036869 (-2.28z)| norm 0.1923 (-0.94z)| lr 4.25e-03 | 1993.68 ms | 68.8% bf16 MFU | 262231 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.879185 +step 7866/18794 | loss 3.171337 (+1.11z)| norm 0.4860 (+2.88z)| lr 4.25e-03 | 1988.97 ms | 69.0% bf16 MFU | 262300 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.349181 +step 7867/18794 | loss 3.122148 (-0.13z)| norm 0.4524 (+2.35z)| lr 4.25e-03 | 1983.56 ms | 69.2% bf16 MFU | 262400 tok/s +step 7868/18794 | loss 3.131069 (+0.09z)| norm 0.2048 (-0.80z)| lr 4.25e-03 | 1988.49 ms | 69.0% bf16 MFU | 262463 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.103913 +step 7869/18794 | loss 3.168458 (+1.02z)| norm 0.4385 (+2.10z)| lr 4.25e-03 | 2002.98 ms | 68.5% bf16 MFU | 262428 tok/s +step 7870/18794 | loss 3.128331 (+0.03z)| norm 0.1760 (-1.14z)| lr 4.25e-03 | 2000.52 ms | 68.6% bf16 MFU | 262410 tok/s +step 7871/18794 | loss 3.060978 (-1.64z)| norm 0.3648 (+1.18z)| lr 4.25e-03 | 1989.78 ms | 69.0% bf16 MFU | 262464 tok/s +step 7872/18794 | loss 3.081954 (-1.11z)| norm 0.3341 (+0.79z)| lr 4.25e-03 | 1987.46 ms | 69.0% bf16 MFU | 262531 tok/s +step 7873/18794 | loss 3.077480 (-1.22z)| norm 0.2116 (-0.71z)| lr 4.24e-03 | 2024.07 ms | 67.8% bf16 MFU | 262356 tok/s +step 7874/18794 | loss 3.135087 (+0.27z)| norm 0.2963 (+0.32z)| lr 4.24e-03 | 1995.39 ms | 68.8% bf16 MFU | 262376 tok/s +step 7875/18794 | loss 3.114040 (-0.26z)| norm 0.2030 (-0.82z)| lr 4.24e-03 | 1988.89 ms | 69.0% bf16 MFU | 262437 tok/s +step 7876/18794 | loss 3.071282 (-1.35z)| norm 0.2595 (-0.13z)| lr 4.24e-03 | 2014.62 ms | 68.1% bf16 MFU | 262327 tok/s +step 7877/18794 | loss 3.094066 (-0.74z)| norm 0.2073 (-0.77z)| lr 4.24e-03 | 1996.75 ms | 68.7% bf16 MFU | 262340 tok/s +step 7878/18794 | loss 3.099307 (-0.59z)| norm 0.2148 (-0.68z)| lr 4.24e-03 | 2009.68 ms | 68.3% bf16 MFU | 262267 tok/s +step 7879/18794 | loss 3.060176 (-1.57z)| norm 0.2956 (+0.30z)| lr 4.24e-03 | 1984.83 ms | 69.1% bf16 MFU | 262361 tok/s +step 7880/18794 | loss 3.121259 (+0.01z)| norm 0.3242 (+0.64z)| lr 4.24e-03 | 2017.25 ms | 68.0% bf16 MFU | 262238 tok/s +step 7881/18794 | loss 3.151899 (+0.80z)| norm 0.2065 (-0.80z)| lr 4.24e-03 | 2000.91 ms | 68.6% bf16 MFU | 262227 tok/s +step 7882/18794 | loss 3.068714 (-1.32z)| norm 0.1885 (-1.01z)| lr 4.24e-03 | 2000.92 ms | 68.6% bf16 MFU | 262217 tok/s +step 7883/18794 | loss 3.101199 (-0.47z)| norm 0.2763 (+0.06z)| lr 4.24e-03 | 1996.78 ms | 68.7% bf16 MFU | 262234 tok/s +step 7884/18794 | loss 3.094043 (-0.64z)| norm 0.2728 (+0.02z)| lr 4.24e-03 | 1987.81 ms | 69.0% bf16 MFU | 262310 tok/s +step 7885/18794 | loss 3.069687 (-1.26z)| norm 0.2114 (-0.74z)| lr 4.24e-03 | 1993.16 ms | 68.9% bf16 MFU | 262347 tok/s +step 7886/18794 | loss 3.131294 (+0.37z)| norm 0.2109 (-0.74z)| lr 4.24e-03 | 1988.03 ms | 69.0% bf16 MFU | 262416 tok/s +step 7887/18794 | loss 3.088577 (-0.75z)| norm 0.2178 (-0.65z)| lr 4.24e-03 | 1988.30 ms | 69.0% bf16 MFU | 262479 tok/s +step 7888/18794 | loss 3.095978 (-0.55z)| norm 0.2352 (-0.44z)| lr 4.24e-03 | 1980.56 ms | 69.3% bf16 MFU | 262591 tok/s +step 7889/18794 | loss 3.063796 (-1.37z)| norm 0.1892 (-0.99z)| lr 4.24e-03 | 1980.31 ms | 69.3% bf16 MFU | 262699 tok/s +step 7890/18794 | loss 3.138991 (+0.61z)| norm 0.2518 (-0.23z)| lr 4.24e-03 | 1986.77 ms | 69.1% bf16 MFU | 262759 tok/s +step 7891/18794 | loss 3.129656 (+0.36z)| norm 0.2756 (+0.05z)| lr 4.24e-03 | 1980.40 ms | 69.3% bf16 MFU | 262858 tok/s +step 7892/18794 | loss 3.082726 (-0.86z)| norm 0.2828 (+0.15z)| lr 4.24e-03 | 1980.77 ms | 69.3% bf16 MFU | 262949 tok/s +step 7893/18794 | loss 3.077878 (-0.97z)| norm 0.2420 (-0.34z)| lr 4.24e-03 | 1995.39 ms | 68.8% bf16 MFU | 262939 tok/s +step 7894/18794 | loss 3.147472 (+0.88z)| norm 0.4097 (+1.68z)| lr 4.24e-03 | 1984.77 ms | 69.1% bf16 MFU | 263000 tok/s +step 7895/18794 | loss 3.154380 (+1.10z)| norm 0.3821 (+1.32z)| lr 4.24e-03 | 1996.50 ms | 68.7% bf16 MFU | 262980 tok/s +step 7896/18794 | loss 3.166968 (+1.43z)| norm 0.2193 (-0.67z)| lr 4.23e-03 | 1984.28 ms | 69.2% bf16 MFU | 263042 tok/s +step 7897/18794 | loss 3.079467 (-0.92z)| norm 0.3026 (+0.35z)| lr 4.23e-03 | 1978.77 ms | 69.4% bf16 MFU | 263138 tok/s +step 7898/18794 | loss 3.146055 (+0.85z)| norm 0.3744 (+1.21z)| lr 4.23e-03 | 1987.32 ms | 69.1% bf16 MFU | 263172 tok/s +step 7899/18794 | loss 3.158360 (+1.16z)| norm 0.1933 (-0.98z)| lr 4.23e-03 | 1982.01 ms | 69.2% bf16 MFU | 263239 tok/s +step 7900/18794 | loss 3.097213 (-0.47z)| norm 0.3889 (+1.35z)| lr 4.23e-03 | 1993.37 ms | 68.8% bf16 MFU | 263228 tok/s +step 7901/18794 | loss 3.234825 (+3.09z)| norm 0.3347 (+0.69z)| lr 4.23e-03 | 1994.59 ms | 68.8% bf16 MFU | 263210 tok/s +step 7902/18794 | loss 3.159381 (+1.14z)| norm 0.2244 (-0.63z)| lr 4.23e-03 | 1994.21 ms | 68.8% bf16 MFU | 263194 tok/s +step 7903/18794 | loss 3.036685 (-1.98z)| norm 0.2543 (-0.25z)| lr 4.23e-03 | 1992.85 ms | 68.9% bf16 MFU | 263189 tok/s +step 7904/18794 | loss 3.120878 (+0.19z)| norm 0.2127 (-0.75z)| lr 4.23e-03 | 1986.47 ms | 69.1% bf16 MFU | 263226 tok/s +step 7905/18794 | loss 3.080920 (-0.85z)| norm 0.3335 (+0.71z)| lr 4.23e-03 | 1980.59 ms | 69.3% bf16 MFU | 263300 tok/s +step 7906/18794 | loss 3.136862 (+0.62z)| norm 0.3106 (+0.43z)| lr 4.23e-03 | 1986.06 ms | 69.1% bf16 MFU | 263334 tok/s +step 7907/18794 | loss 3.185980 (+1.88z)| norm 0.2333 (-0.50z)| lr 4.23e-03 | 1978.44 ms | 69.4% bf16 MFU | 263418 tok/s +step 7908/18794 | loss 3.125127 (+0.32z)| norm 0.4271 (+1.85z)| lr 4.23e-03 | 1999.00 ms | 68.7% bf16 MFU | 263361 tok/s +step 7909/18794 | loss 3.107963 (-0.12z)| norm 0.3727 (+1.23z)| lr 4.23e-03 | 1983.55 ms | 69.2% bf16 MFU | 263408 tok/s +step 7910/18794 | loss 3.133275 (+0.54z)| norm 0.2747 (+0.01z)| lr 4.23e-03 | 1988.02 ms | 69.0% bf16 MFU | 263424 tok/s +step 7911/18794 | loss 3.062125 (-1.30z)| norm 0.2354 (-0.47z)| lr 4.23e-03 | 1983.41 ms | 69.2% bf16 MFU | 263470 tok/s +step 7912/18794 | loss 3.172131 (+1.54z)| norm 0.2441 (-0.37z)| lr 4.23e-03 | 1985.65 ms | 69.1% bf16 MFU | 263498 tok/s +step 7913/18794 | loss 3.138178 (+0.65z)| norm 0.2399 (-0.42z)| lr 4.23e-03 | 1979.69 ms | 69.3% bf16 MFU | 263565 tok/s +step 7914/18794 | loss 3.190001 (+1.99z)| norm 0.2915 (+0.23z)| lr 4.23e-03 | 1980.57 ms | 69.3% bf16 MFU | 263623 tok/s +step 7915/18794 | loss 3.118948 (+0.16z)| norm 0.1900 (-1.04z)| lr 4.23e-03 | 1978.85 ms | 69.3% bf16 MFU | 263689 tok/s +step 7916/18794 | loss 3.121777 (+0.25z)| norm 0.2320 (-0.51z)| lr 4.23e-03 | 1979.74 ms | 69.3% bf16 MFU | 263746 tok/s +step 7917/18794 | loss 3.085797 (-0.67z)| norm 0.4021 (+1.58z)| lr 4.23e-03 | 1979.99 ms | 69.3% bf16 MFU | 263798 tok/s +step 7918/18794 | loss 3.124044 (+0.33z)| norm 0.3736 (+1.20z)| lr 4.22e-03 | 1978.75 ms | 69.4% bf16 MFU | 263856 tok/s +step 7919/18794 | loss 3.138705 (+0.70z)| norm 0.1707 (-1.28z)| lr 4.22e-03 | 1980.24 ms | 69.3% bf16 MFU | 263901 tok/s +step 7920/18794 | loss 3.093240 (-0.49z)| norm 0.1966 (-0.95z)| lr 4.22e-03 | 1980.26 ms | 69.3% bf16 MFU | 263944 tok/s +step 7921/18794 | loss 3.144065 (+0.83z)| norm 0.1879 (-1.04z)| lr 4.22e-03 | 1987.44 ms | 69.0% bf16 MFU | 263937 tok/s +step 7922/18794 | loss 3.120776 (+0.20z)| norm 0.2019 (-0.86z)| lr 4.22e-03 | 1979.57 ms | 69.3% bf16 MFU | 263983 tok/s +step 7923/18794 | loss 3.111359 (-0.07z)| norm 0.2858 (+0.18z)| lr 4.22e-03 | 1981.60 ms | 69.3% bf16 MFU | 264012 tok/s +step 7924/18794 | loss 3.110106 (-0.10z)| norm 0.2453 (-0.33z)| lr 4.22e-03 | 1988.14 ms | 69.0% bf16 MFU | 263997 tok/s +step 7925/18794 | loss 3.095214 (-0.50z)| norm 0.1527 (-1.47z)| lr 4.22e-03 | 1992.12 ms | 68.9% bf16 MFU | 263956 tok/s +step 7926/18794 | loss 3.165860 (+1.40z)| norm 0.2654 (-0.03z)| lr 4.22e-03 | 1980.22 ms | 69.3% bf16 MFU | 263997 tok/s +step 7927/18794 | loss 3.038322 (-2.01z)| norm 0.2943 (+0.32z)| lr 4.22e-03 | 1980.59 ms | 69.3% bf16 MFU | 264032 tok/s +step 7928/18794 | loss 3.051744 (-1.61z)| norm 0.2172 (-0.67z)| lr 4.22e-03 | 1980.67 ms | 69.3% bf16 MFU | 264066 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.308614 +step 7929/18794 | loss 3.197557 (+2.17z)| norm 0.5445 (+3.31z)| lr 4.22e-03 | 1979.46 ms | 69.3% bf16 MFU | 264106 tok/s +step 7930/18794 | loss 3.056683 (-1.43z)| norm 0.3703 (+1.17z)| lr 4.22e-03 | 1980.33 ms | 69.3% bf16 MFU | 264138 tok/s +step 7931/18794 | loss 3.148399 (+0.90z)| norm 0.2827 (+0.13z)| lr 4.22e-03 | 1979.80 ms | 69.3% bf16 MFU | 264172 tok/s +step 7932/18794 | loss 3.181067 (+1.70z)| norm 0.4095 (+1.65z)| lr 4.22e-03 | 1978.98 ms | 69.3% bf16 MFU | 264210 tok/s +step 7933/18794 | loss 3.077029 (-0.91z)| norm 0.2487 (-0.29z)| lr 4.22e-03 | 1981.34 ms | 69.3% bf16 MFU | 264230 tok/s +step 7934/18794 | loss 3.147914 (+0.86z)| norm 0.2602 (-0.14z)| lr 4.22e-03 | 1980.20 ms | 69.3% bf16 MFU | 264257 tok/s +step 7935/18794 | loss 3.098717 (-0.36z)| norm 0.2458 (-0.32z)| lr 4.22e-03 | 1980.80 ms | 69.3% bf16 MFU | 264278 tok/s +step 7936/18794 | loss 3.130095 (+0.42z)| norm 0.2277 (-0.53z)| lr 4.22e-03 | 1983.40 ms | 69.2% bf16 MFU | 264281 tok/s +step 7937/18794 | loss 3.127536 (+0.35z)| norm 0.2131 (-0.71z)| lr 4.22e-03 | 1981.23 ms | 69.3% bf16 MFU | 264298 tok/s +step 7938/18794 | loss 3.089477 (-0.60z)| norm 0.2209 (-0.60z)| lr 4.22e-03 | 1981.36 ms | 69.3% bf16 MFU | 264314 tok/s +step 7939/18794 | loss 3.137808 (+0.60z)| norm 0.2146 (-0.68z)| lr 4.22e-03 | 1980.41 ms | 69.3% bf16 MFU | 264335 tok/s +step 7940/18794 | loss 3.134342 (+0.51z)| norm 0.1952 (-0.92z)| lr 4.21e-03 | 1979.87 ms | 69.3% bf16 MFU | 264359 tok/s +step 7941/18794 | loss 3.086808 (-0.67z)| norm 0.2154 (-0.65z)| lr 4.21e-03 | 1982.65 ms | 69.2% bf16 MFU | 264363 tok/s +step 7942/18794 | loss 3.117387 (+0.08z)| norm 0.2805 (+0.19z)| lr 4.21e-03 | 1986.26 ms | 69.1% bf16 MFU | 264342 tok/s +step 7943/18794 | loss 3.254139 (+3.29z)| norm 0.2706 (+0.05z)| lr 4.21e-03 | 1984.09 ms | 69.2% bf16 MFU | 264338 tok/s +step 7944/18794 | loss 3.178018 (+1.45z)| norm 0.1729 (-1.22z)| lr 4.21e-03 | 1990.26 ms | 69.0% bf16 MFU | 264292 tok/s +step 7945/18794 | loss 3.094498 (-0.51z)| norm 0.1807 (-1.10z)| lr 4.21e-03 | 1980.37 ms | 69.3% bf16 MFU | 264315 tok/s +step 7946/18794 | loss 3.134160 (+0.42z)| norm 0.2616 (-0.05z)| lr 4.21e-03 | 1978.81 ms | 69.4% bf16 MFU | 264346 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.194809 +step 7947/18794 | loss 3.107131 (-0.21z)| norm 0.4405 (+2.19z)| lr 4.21e-03 | 1981.56 ms | 69.3% bf16 MFU | 264358 tok/s +step 7948/18794 | loss 3.076785 (-0.99z)| norm 0.2036 (-0.81z)| lr 4.21e-03 | 1981.66 ms | 69.3% bf16 MFU | 264369 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.497754 +step 7949/18794 | loss 3.115745 (-0.03z)| norm 0.4732 (+2.50z)| lr 4.21e-03 | 1982.34 ms | 69.2% bf16 MFU | 264374 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.380488 +step 7950/18794 | loss 3.080986 (-0.87z)| norm 0.5656 (+3.38z)| lr 4.21e-03 | 1981.90 ms | 69.2% bf16 MFU | 264382 tok/s +step 7951/18794 | loss 3.106050 (-0.25z)| norm 0.1975 (-0.87z)| lr 4.21e-03 | 1982.08 ms | 69.2% bf16 MFU | 264389 tok/s +step 7952/18794 | loss 3.073862 (-1.04z)| norm 0.3333 (+0.69z)| lr 4.21e-03 | 1978.53 ms | 69.4% bf16 MFU | 264419 tok/s +step 7953/18794 | loss 3.152256 (+0.87z)| norm 0.2375 (-0.41z)| lr 4.21e-03 | 1981.27 ms | 69.3% bf16 MFU | 264429 tok/s +step 7954/18794 | loss 3.101397 (-0.37z)| norm 0.2099 (-0.73z)| lr 4.21e-03 | 1987.13 ms | 69.1% bf16 MFU | 264400 tok/s +step 7955/18794 | loss 3.175907 (+1.45z)| norm 0.3227 (+0.56z)| lr 4.21e-03 | 1987.31 ms | 69.1% bf16 MFU | 264371 tok/s +step 7956/18794 | loss 3.059724 (-1.38z)| norm 0.2471 (-0.32z)| lr 4.21e-03 | 1992.45 ms | 68.9% bf16 MFU | 264309 tok/s +step 7957/18794 | loss 3.173530 (+1.36z)| norm 0.3701 (+1.08z)| lr 4.21e-03 | 1979.66 ms | 69.3% bf16 MFU | 264336 tok/s +step 7958/18794 | loss 3.144741 (+0.66z)| norm 0.4330 (+1.75z)| lr 4.21e-03 | 1980.37 ms | 69.3% bf16 MFU | 264356 tok/s +step 7959/18794 | loss 3.105961 (-0.28z)| norm 0.2324 (-0.54z)| lr 4.21e-03 | 1981.28 ms | 69.3% bf16 MFU | 264369 tok/s +step 7960/18794 | loss 3.102561 (-0.35z)| norm 0.2846 (+0.05z)| lr 4.21e-03 | 1982.25 ms | 69.2% bf16 MFU | 264375 tok/s +step 7961/18794 | loss 3.089403 (-0.65z)| norm 0.2479 (-0.36z)| lr 4.21e-03 | 1980.44 ms | 69.3% bf16 MFU | 264393 tok/s +step 7962/18794 | loss 3.150594 (+0.81z)| norm 0.2136 (-0.74z)| lr 4.21e-03 | 1980.36 ms | 69.3% bf16 MFU | 264411 tok/s +step 7963/18794 | loss 3.081855 (-0.84z)| norm 0.3599 (+0.92z)| lr 4.20e-03 | 1980.59 ms | 69.3% bf16 MFU | 264426 tok/s +step 7964/18794 | loss 3.052193 (-1.54z)| norm 0.4158 (+1.54z)| lr 4.20e-03 | 1979.07 ms | 69.3% bf16 MFU | 264450 tok/s +step 7965/18794 | loss 3.069180 (-1.16z)| norm 0.1995 (-0.92z)| lr 4.20e-03 | 1979.79 ms | 69.3% bf16 MFU | 264469 tok/s +step 7966/18794 | loss 3.116953 (+0.01z)| norm 0.3286 (+0.59z)| lr 4.20e-03 | 1980.97 ms | 69.3% bf16 MFU | 264478 tok/s +step 7967/18794 | loss 3.093162 (-0.56z)| norm 0.2350 (-0.49z)| lr 4.20e-03 | 1983.81 ms | 69.2% bf16 MFU | 264469 tok/s +step 7968/18794 | loss 3.099695 (-0.39z)| norm 0.2263 (-0.60z)| lr 4.20e-03 | 1987.46 ms | 69.0% bf16 MFU | 264435 tok/s +step 7969/18794 | loss 3.092363 (-0.56z)| norm 0.2572 (-0.21z)| lr 4.20e-03 | 1984.48 ms | 69.2% bf16 MFU | 264423 tok/s +step 7970/18794 | loss 3.086139 (-0.70z)| norm 0.3089 (+0.40z)| lr 4.20e-03 | 1978.47 ms | 69.4% bf16 MFU | 264452 tok/s +step 7971/18794 | loss 3.087144 (-0.69z)| norm 0.4238 (+1.78z)| lr 4.20e-03 | 1993.35 ms | 68.8% bf16 MFU | 264380 tok/s +step 7972/18794 | loss 3.102499 (-0.31z)| norm 0.2215 (-0.65z)| lr 4.20e-03 | 1982.75 ms | 69.2% bf16 MFU | 264382 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.076992 +step 7973/18794 | loss 3.064667 (-1.24z)| norm 0.4527 (+2.08z)| lr 4.20e-03 | 1980.99 ms | 69.3% bf16 MFU | 264396 tok/s +step 7974/18794 | loss 3.155785 (+1.00z)| norm 0.2775 (-0.00z)| lr 4.20e-03 | 1981.82 ms | 69.2% bf16 MFU | 264404 tok/s +step 7975/18794 | loss 3.078992 (-0.88z)| norm 0.2213 (-0.67z)| lr 4.20e-03 | 1981.34 ms | 69.3% bf16 MFU | 264414 tok/s +step 7976/18794 | loss 3.122652 (+0.18z)| norm 0.2085 (-0.82z)| lr 4.20e-03 | 1980.33 ms | 69.3% bf16 MFU | 264431 tok/s +step 7977/18794 | loss 3.131393 (+0.39z)| norm 0.3419 (+0.75z)| lr 4.20e-03 | 1988.17 ms | 69.0% bf16 MFU | 264394 tok/s +step 7978/18794 | loss 3.094569 (-0.52z)| norm 0.2963 (+0.20z)| lr 4.20e-03 | 1986.94 ms | 69.1% bf16 MFU | 264368 tok/s +step 7979/18794 | loss 3.099429 (-0.41z)| norm 0.2361 (-0.51z)| lr 4.20e-03 | 1980.23 ms | 69.3% bf16 MFU | 264388 tok/s +step 7980/18794 | loss 3.117479 (+0.04z)| norm 0.4096 (+1.53z)| lr 4.20e-03 | 1986.78 ms | 69.1% bf16 MFU | 264363 tok/s +step 7981/18794 | loss 3.127067 (+0.28z)| norm 0.3423 (+0.72z)| lr 4.20e-03 | 1984.03 ms | 69.2% bf16 MFU | 264357 tok/s +step 7982/18794 | loss 3.038157 (-1.91z)| norm 0.2005 (-0.95z)| lr 4.20e-03 | 1982.57 ms | 69.2% bf16 MFU | 264362 tok/s +step 7983/18794 | loss 3.067313 (-1.17z)| norm 0.2696 (-0.14z)| lr 4.20e-03 | 1983.23 ms | 69.2% bf16 MFU | 264362 tok/s +step 7984/18794 | loss 3.130826 (+0.38z)| norm 0.2629 (-0.22z)| lr 4.20e-03 | 1980.09 ms | 69.3% bf16 MFU | 264383 tok/s +step 7985/18794 | loss 3.125990 (+0.24z)| norm 0.4536 (+1.97z)| lr 4.19e-03 | 1979.64 ms | 69.3% bf16 MFU | 264406 tok/s +step 7986/18794 | loss 3.091130 (-0.60z)| norm 0.2270 (-0.66z)| lr 4.19e-03 | 1982.02 ms | 69.2% bf16 MFU | 264411 tok/s +step 7987/18794 | loss 3.095340 (-0.50z)| norm 0.3948 (+1.26z)| lr 4.19e-03 | 1979.74 ms | 69.3% bf16 MFU | 264432 tok/s +step 7988/18794 | loss 3.127710 (+0.29z)| norm 0.1724 (-1.29z)| lr 4.19e-03 | 1979.85 ms | 69.3% bf16 MFU | 264451 tok/s +step 7989/18794 | loss 3.065978 (-1.24z)| norm 0.2902 (+0.05z)| lr 4.19e-03 | 1985.99 ms | 69.1% bf16 MFU | 264428 tok/s +step 7990/18794 | loss 3.031792 (-2.02z)| norm 0.2117 (-0.85z)| lr 4.19e-03 | 1983.32 ms | 69.2% bf16 MFU | 264424 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.269789 +step 7991/18794 | loss 3.088355 (-0.63z)| norm 0.4899 (+2.27z)| lr 4.19e-03 | 1981.78 ms | 69.2% bf16 MFU | 264431 tok/s +step 7992/18794 | loss 3.118728 (+0.09z)| norm 0.4595 (+1.87z)| lr 4.19e-03 | 1980.97 ms | 69.3% bf16 MFU | 264442 tok/s +step 7993/18794 | loss 3.025629 (-2.11z)| norm 0.2022 (-0.95z)| lr 4.19e-03 | 1985.88 ms | 69.1% bf16 MFU | 264421 tok/s +step 7994/18794 | loss 3.091006 (-0.54z)| norm 0.3596 (+0.79z)| lr 4.19e-03 | 1978.67 ms | 69.4% bf16 MFU | 264448 tok/s +step 7995/18794 | loss 3.144938 (+0.74z)| norm 0.2988 (+0.12z)| lr 4.19e-03 | 1987.68 ms | 69.0% bf16 MFU | 264414 tok/s +step 7996/18794 | loss 3.100115 (-0.31z)| norm 0.1994 (-0.98z)| lr 4.19e-03 | 1982.85 ms | 69.2% bf16 MFU | 264414 tok/s +step 7997/18794 | loss 3.132622 (+0.46z)| norm 0.2442 (-0.47z)| lr 4.19e-03 | 1980.61 ms | 69.3% bf16 MFU | 264429 tok/s +step 7998/18794 | loss 3.070979 (-1.01z)| norm 0.2527 (-0.37z)| lr 4.19e-03 | 1980.60 ms | 69.3% bf16 MFU | 264443 tok/s +step 7999/18794 | loss 3.057764 (-1.30z)| norm 0.3750 (+0.97z)| lr 4.19e-03 | 1980.50 ms | 69.3% bf16 MFU | 264457 tok/s +step 8000/18794 | loss 3.067080 (-1.06z)| norm 0.3343 (+0.53z)| lr 4.19e-03 | 1980.33 ms | 69.3% bf16 MFU | 264472 tok/s +val loss 3.151690 +HellaSwag: 2949/10042 = 0.293667Swag: 990/1256: 0/1256 +generating: +--- +Writing state to log_gpt3_125M_edu_v4/state_00008000_00001.bin +and that is bringing down the issues of global warming in the decade since +the world climate change rolls out, too. +So what really has happened? Scientists +are seeking to re-write our Planet Earth with virtual will help us protect it. +Elon Musk upstream in +to demonstrate new propulsion technology with Curiosity probes, +again, Penta said. The mobile computers and advanced robotic systems are +supplying bits of information about the rover and +urine that can transmit any position by positioning themselves on a rover's +antenna. Sat down, next to the rover, to rotate on +rapidly to snag 3D view of the +costext spheroid structure of the +--- +Writing checkpoint at step 8000 +Writing model to log_gpt3_125M_edu_v4/model_00008000.bin +Writing state to log_gpt3_125M_edu_v4/state_00008000_00000.bin +Deleting checkpoint at step 5500 +step 8001/18794 | loss 3.086399 (-0.59z)| norm 0.2076 (-0.87z)| lr 4.19e-03 | 1972.83 ms | 69.6% bf16 MFU | 264536 tok/s +step 8002/18794 | loss 3.164385 (+1.35z)| norm 0.2492 (-0.41z)| lr 4.19e-03 | 1982.77 ms | 69.2% bf16 MFU | 264530 tok/s +step 8003/18794 | loss 3.153623 (+1.07z)| norm 0.2136 (-0.80z)| lr 4.19e-03 | 1983.59 ms | 69.2% bf16 MFU | 264519 tok/s +step 8004/18794 | loss 3.139940 (+0.72z)| norm 0.2368 (-0.55z)| lr 4.19e-03 | 1987.18 ms | 69.1% bf16 MFU | 264485 tok/s +step 8005/18794 | loss 3.143971 (+0.80z)| norm 0.2363 (-0.54z)| lr 4.19e-03 | 1985.71 ms | 69.1% bf16 MFU | 264462 tok/s +step 8006/18794 | loss 3.086703 (-0.63z)| norm 0.2303 (-0.60z)| lr 4.19e-03 | 1981.43 ms | 69.3% bf16 MFU | 264469 tok/s +step 8007/18794 | loss 3.147389 (+0.92z)| norm 0.2098 (-0.82z)| lr 4.18e-03 | 1980.96 ms | 69.3% bf16 MFU | 264479 tok/s +step 8008/18794 | loss 3.092327 (-0.47z)| norm 0.3039 (+0.24z)| lr 4.18e-03 | 1981.43 ms | 69.3% bf16 MFU | 264485 tok/s +step 8009/18794 | loss 3.162247 (+1.28z)| norm 0.2434 (-0.43z)| lr 4.18e-03 | 1980.70 ms | 69.3% bf16 MFU | 264496 tok/s +step 8010/18794 | loss 3.108293 (-0.07z)| norm 0.2628 (-0.21z)| lr 4.18e-03 | 1981.28 ms | 69.3% bf16 MFU | 264502 tok/s +step 8011/18794 | loss 3.125596 (+0.35z)| norm 0.3115 (+0.34z)| lr 4.18e-03 | 1979.75 ms | 69.3% bf16 MFU | 264518 tok/s +step 8012/18794 | loss 3.164095 (+1.34z)| norm 0.2691 (-0.15z)| lr 4.18e-03 | 1981.15 ms | 69.3% bf16 MFU | 264524 tok/s +step 8013/18794 | loss 3.168256 (+1.43z)| norm 0.2350 (-0.53z)| lr 4.18e-03 | 1978.35 ms | 69.4% bf16 MFU | 264548 tok/s +step 8014/18794 | loss 3.050859 (-1.53z)| norm 0.2978 (+0.18z)| lr 4.18e-03 | 1978.62 ms | 69.4% bf16 MFU | 264570 tok/s +step 8015/18794 | loss 3.152357 (+1.05z)| norm 0.2923 (+0.10z)| lr 4.18e-03 | 1980.50 ms | 69.3% bf16 MFU | 264578 tok/s +step 8016/18794 | loss 3.104275 (-0.17z)| norm 0.2453 (-0.43z)| lr 4.18e-03 | 2001.97 ms | 68.5% bf16 MFU | 264443 tok/s +step 8017/18794 | loss 3.133722 (+0.57z)| norm 0.2361 (-0.52z)| lr 4.18e-03 | 1986.43 ms | 69.1% bf16 MFU | 264418 tok/s +step 8018/18794 | loss 3.114041 (+0.07z)| norm 0.2507 (-0.34z)| lr 4.18e-03 | 1980.09 ms | 69.3% bf16 MFU | 264436 tok/s +step 8019/18794 | loss 3.120980 (+0.25z)| norm 0.3222 (+0.47z)| lr 4.18e-03 | 1989.44 ms | 69.0% bf16 MFU | 264391 tok/s +step 8020/18794 | loss 3.102348 (-0.22z)| norm 0.2771 (-0.07z)| lr 4.18e-03 | 1982.09 ms | 69.2% bf16 MFU | 264397 tok/s +step 8021/18794 | loss 3.083031 (-0.70z)| norm 0.2310 (-0.61z)| lr 4.18e-03 | 1981.99 ms | 69.2% bf16 MFU | 264403 tok/s +step 8022/18794 | loss 3.162095 (+1.30z)| norm 0.2226 (-0.71z)| lr 4.18e-03 | 1980.94 ms | 69.3% bf16 MFU | 264416 tok/s +step 8023/18794 | loss 3.194301 (+2.05z)| norm 0.2277 (-0.65z)| lr 4.18e-03 | 1980.96 ms | 69.3% bf16 MFU | 264429 tok/s +step 8024/18794 | loss 3.139465 (+0.68z)| norm 0.2255 (-0.67z)| lr 4.18e-03 | 1980.63 ms | 69.3% bf16 MFU | 264443 tok/s +step 8025/18794 | loss 3.098271 (-0.34z)| norm 0.2021 (-0.96z)| lr 4.18e-03 | 1988.59 ms | 69.0% bf16 MFU | 264403 tok/s +step 8026/18794 | loss 3.058680 (-1.30z)| norm 0.2196 (-0.74z)| lr 4.18e-03 | 1984.08 ms | 69.2% bf16 MFU | 264395 tok/s +step 8027/18794 | loss 3.126824 (+0.38z)| norm 0.2905 (+0.09z)| lr 4.18e-03 | 1984.14 ms | 69.2% bf16 MFU | 264387 tok/s +step 8028/18794 | loss 3.156630 (+1.11z)| norm 0.3122 (+0.34z)| lr 4.18e-03 | 1980.06 ms | 69.3% bf16 MFU | 264407 tok/s +step 8029/18794 | loss 3.150986 (+0.99z)| norm 0.2169 (-0.78z)| lr 4.17e-03 | 1989.15 ms | 69.0% bf16 MFU | 264366 tok/s +step 8030/18794 | loss 3.037129 (-1.92z)| norm 0.2398 (-0.49z)| lr 4.17e-03 | 1978.94 ms | 69.3% bf16 MFU | 264394 tok/s +step 8031/18794 | loss 3.119844 (+0.20z)| norm 0.2676 (-0.14z)| lr 4.17e-03 | 1987.45 ms | 69.0% bf16 MFU | 264364 tok/s +step 8032/18794 | loss 3.123901 (+0.33z)| norm 0.2152 (-0.78z)| lr 4.17e-03 | 1982.19 ms | 69.2% bf16 MFU | 264371 tok/s +step 8033/18794 | loss 3.130570 (+0.49z)| norm 0.2101 (-0.83z)| lr 4.17e-03 | 1979.70 ms | 69.3% bf16 MFU | 264394 tok/s +step 8034/18794 | loss 3.122064 (+0.27z)| norm 0.2139 (-0.78z)| lr 4.17e-03 | 1979.56 ms | 69.3% bf16 MFU | 264417 tok/s +step 8035/18794 | loss 3.072990 (-1.01z)| norm 0.1932 (-1.02z)| lr 4.17e-03 | 1980.41 ms | 69.3% bf16 MFU | 264433 tok/s +step 8036/18794 | loss 3.067326 (-1.13z)| norm 0.1977 (-0.96z)| lr 4.17e-03 | 1977.96 ms | 69.4% bf16 MFU | 264464 tok/s +step 8037/18794 | loss 3.084211 (-0.68z)| norm 0.2021 (-0.90z)| lr 4.17e-03 | 1979.01 ms | 69.3% bf16 MFU | 264487 tok/s +step 8038/18794 | loss 3.171082 (+1.54z)| norm 0.2525 (-0.28z)| lr 4.17e-03 | 1979.67 ms | 69.3% bf16 MFU | 264505 tok/s +step 8039/18794 | loss 3.141589 (+0.78z)| norm 0.3075 (+0.39z)| lr 4.17e-03 | 1979.23 ms | 69.3% bf16 MFU | 264524 tok/s +step 8040/18794 | loss 3.161275 (+1.27z)| norm 0.2564 (-0.26z)| lr 4.17e-03 | 1980.03 ms | 69.3% bf16 MFU | 264538 tok/s +step 8041/18794 | loss 3.106839 (-0.12z)| norm 0.2212 (-0.70z)| lr 4.17e-03 | 1978.76 ms | 69.4% bf16 MFU | 264559 tok/s +step 8042/18794 | loss 3.109373 (-0.06z)| norm 0.1935 (-1.03z)| lr 4.17e-03 | 1979.64 ms | 69.3% bf16 MFU | 264573 tok/s +step 8043/18794 | loss 3.158613 (+1.31z)| norm 0.2120 (-0.79z)| lr 4.17e-03 | 1978.97 ms | 69.3% bf16 MFU | 264590 tok/s +step 8044/18794 | loss 3.095169 (-0.41z)| norm 0.2749 (-0.02z)| lr 4.17e-03 | 1979.29 ms | 69.3% bf16 MFU | 264605 tok/s +step 8045/18794 | loss 3.126895 (+0.46z)| norm 0.3165 (+0.49z)| lr 4.17e-03 | 1980.48 ms | 69.3% bf16 MFU | 264611 tok/s +step 8046/18794 | loss 3.127964 (+0.50z)| norm 0.3472 (+0.86z)| lr 4.17e-03 | 1981.57 ms | 69.3% bf16 MFU | 264610 tok/s +step 8047/18794 | loss 3.091137 (-0.52z)| norm 0.2915 (+0.18z)| lr 4.17e-03 | 1984.67 ms | 69.1% bf16 MFU | 264588 tok/s +step 8048/18794 | loss 3.156392 (+1.26z)| norm 0.1884 (-1.13z)| lr 4.17e-03 | 1985.37 ms | 69.1% bf16 MFU | 264562 tok/s +step 8049/18794 | loss 3.170018 (+1.60z)| norm 0.2788 (+0.05z)| lr 4.17e-03 | 1982.45 ms | 69.2% bf16 MFU | 264557 tok/s +step 8050/18794 | loss 3.125781 (+0.38z)| norm 0.3899 (+1.65z)| lr 4.17e-03 | 1992.38 ms | 68.9% bf16 MFU | 264487 tok/s +step 8051/18794 | loss 3.077302 (-0.93z)| norm 0.2583 (-0.22z)| lr 4.16e-03 | 1981.47 ms | 69.3% bf16 MFU | 264492 tok/s +step 8052/18794 | loss 3.139483 (+0.75z)| norm 0.1753 (-1.37z)| lr 4.16e-03 | 1979.76 ms | 69.3% bf16 MFU | 264509 tok/s +step 8053/18794 | loss 3.145831 (+0.92z)| norm 0.1885 (-1.17z)| lr 4.16e-03 | 1979.81 ms | 69.3% bf16 MFU | 264524 tok/s +step 8054/18794 | loss 3.058284 (-1.44z)| norm 0.1672 (-1.45z)| lr 4.16e-03 | 1983.49 ms | 69.2% bf16 MFU | 264514 tok/s +step 8055/18794 | loss 3.138941 (+0.76z)| norm 0.2616 (-0.13z)| lr 4.16e-03 | 1980.97 ms | 69.3% bf16 MFU | 264522 tok/s +step 8056/18794 | loss 3.055425 (-1.52z)| norm 0.2489 (-0.31z)| lr 4.16e-03 | 1986.74 ms | 69.1% bf16 MFU | 264490 tok/s +step 8057/18794 | loss 3.123062 (+0.34z)| norm 0.2451 (-0.35z)| lr 4.16e-03 | 1983.47 ms | 69.2% bf16 MFU | 264482 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.049846 +step 8058/18794 | loss 3.097404 (-0.36z)| norm 0.4912 (+3.05z)| lr 4.16e-03 | 1981.78 ms | 69.2% bf16 MFU | 264486 tok/s +step 8059/18794 | loss 3.127616 (+0.48z)| norm 0.3048 (+0.47z)| lr 4.16e-03 | 1982.97 ms | 69.2% bf16 MFU | 264481 tok/s +step 8060/18794 | loss 3.128586 (+0.50z)| norm 0.2920 (+0.29z)| lr 4.16e-03 | 1990.15 ms | 69.0% bf16 MFU | 264429 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.183732 +step 8061/18794 | loss 3.104438 (-0.18z)| norm 0.4350 (+2.18z)| lr 4.16e-03 | 1979.52 ms | 69.3% bf16 MFU | 264451 tok/s +step 8062/18794 | loss 3.144896 (+0.95z)| norm 0.2152 (-0.78z)| lr 4.16e-03 | 1986.69 ms | 69.1% bf16 MFU | 264423 tok/s +step 8063/18794 | loss 3.055940 (-1.51z)| norm 0.3864 (+1.52z)| lr 4.16e-03 | 1982.70 ms | 69.2% bf16 MFU | 264424 tok/s +step 8064/18794 | loss 3.070189 (-1.13z)| norm 0.3892 (+1.57z)| lr 4.16e-03 | 1981.36 ms | 69.3% bf16 MFU | 264433 tok/s +step 8065/18794 | loss 3.118484 (+0.20z)| norm 0.1780 (-1.27z)| lr 4.16e-03 | 1979.97 ms | 69.3% bf16 MFU | 264451 tok/s +step 8066/18794 | loss 3.135857 (+0.68z)| norm 0.3433 (+0.95z)| lr 4.16e-03 | 1981.65 ms | 69.3% bf16 MFU | 264457 tok/s +step 8067/18794 | loss 3.091431 (-0.56z)| norm 0.2223 (-0.68z)| lr 4.16e-03 | 1980.57 ms | 69.3% bf16 MFU | 264470 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.166909 +step 8068/18794 | loss 3.159513 (+1.32z)| norm 0.4401 (+2.17z)| lr 4.16e-03 | 1984.07 ms | 69.2% bf16 MFU | 264459 tok/s +step 8069/18794 | loss 3.150674 (+1.05z)| norm 0.1931 (-1.06z)| lr 4.16e-03 | 1980.42 ms | 69.3% bf16 MFU | 264473 tok/s +step 8070/18794 | loss 3.123290 (+0.29z)| norm 0.3522 (+1.01z)| lr 4.16e-03 | 1982.28 ms | 69.2% bf16 MFU | 264474 tok/s +step 8071/18794 | loss 3.096509 (-0.46z)| norm 0.2247 (-0.63z)| lr 4.16e-03 | 1983.11 ms | 69.2% bf16 MFU | 264469 tok/s +step 8072/18794 | loss 3.059393 (-1.46z)| norm 0.2804 (+0.09z)| lr 4.16e-03 | 1983.54 ms | 69.2% bf16 MFU | 264461 tok/s +step 8073/18794 | loss 3.127881 (+0.40z)| norm 0.2543 (-0.23z)| lr 4.15e-03 | 1981.24 ms | 69.3% bf16 MFU | 264470 tok/s +step 8074/18794 | loss 3.249169 (+3.51z)| norm 0.2311 (-0.54z)| lr 4.15e-03 | 1988.79 ms | 69.0% bf16 MFU | 264427 tok/s +step 8075/18794 | loss 3.144583 (+0.78z)| norm 0.2567 (-0.20z)| lr 4.15e-03 | 1985.54 ms | 69.1% bf16 MFU | 264408 tok/s +step 8076/18794 | loss 3.109627 (-0.13z)| norm 0.2069 (-0.88z)| lr 4.15e-03 | 1980.24 ms | 69.3% bf16 MFU | 264426 tok/s +step 8077/18794 | loss 3.119333 (+0.12z)| norm 0.2365 (-0.46z)| lr 4.15e-03 | 1980.62 ms | 69.3% bf16 MFU | 264440 tok/s +step 8078/18794 | loss 3.089749 (-0.65z)| norm 0.2296 (-0.55z)| lr 4.15e-03 | 1981.09 ms | 69.3% bf16 MFU | 264450 tok/s +step 8079/18794 | loss 3.121247 (+0.17z)| norm 0.2245 (-0.61z)| lr 4.15e-03 | 1980.48 ms | 69.3% bf16 MFU | 264464 tok/s +step 8080/18794 | loss 3.080709 (-0.87z)| norm 0.4134 (+1.97z)| lr 4.15e-03 | 1980.19 ms | 69.3% bf16 MFU | 264479 tok/s +step 8081/18794 | loss 3.118187 (+0.10z)| norm 0.2755 (+0.09z)| lr 4.15e-03 | 1980.01 ms | 69.3% bf16 MFU | 264495 tok/s +step 8082/18794 | loss 3.143530 (+0.75z)| norm 0.2349 (-0.47z)| lr 4.15e-03 | 1980.20 ms | 69.3% bf16 MFU | 264509 tok/s +step 8083/18794 | loss 3.141496 (+0.68z)| norm 0.1979 (-0.97z)| lr 4.15e-03 | 1985.71 ms | 69.1% bf16 MFU | 264485 tok/s +step 8084/18794 | loss 3.134930 (+0.50z)| norm 0.2122 (-0.76z)| lr 4.15e-03 | 1983.08 ms | 69.2% bf16 MFU | 264479 tok/s +step 8085/18794 | loss 3.149581 (+0.88z)| norm 0.3073 (+0.58z)| lr 4.15e-03 | 1990.92 ms | 68.9% bf16 MFU | 264422 tok/s +step 8086/18794 | loss 3.077331 (-1.03z)| norm 0.2330 (-0.47z)| lr 4.15e-03 | 1985.51 ms | 69.1% bf16 MFU | 264404 tok/s +step 8087/18794 | loss 3.138807 (+0.59z)| norm 0.2962 (+0.44z)| lr 4.15e-03 | 1983.99 ms | 69.2% bf16 MFU | 264397 tok/s +step 8088/18794 | loss 3.123566 (+0.18z)| norm 0.2673 (+0.01z)| lr 4.15e-03 | 1980.56 ms | 69.3% bf16 MFU | 264413 tok/s +step 8089/18794 | loss 3.178991 (+1.61z)| norm 0.3065 (+0.58z)| lr 4.15e-03 | 1981.30 ms | 69.3% bf16 MFU | 264423 tok/s +step 8090/18794 | loss 3.179518 (+1.61z)| norm 0.2097 (-0.82z)| lr 4.15e-03 | 1981.51 ms | 69.3% bf16 MFU | 264432 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.044284 +step 8091/18794 | loss 3.119107 (-0.01z)| norm 0.4023 (+2.04z)| lr 4.15e-03 | 1981.25 ms | 69.3% bf16 MFU | 264441 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.267209 +step 8092/18794 | loss 3.127588 (+0.21z)| norm 0.4141 (+2.27z)| lr 4.15e-03 | 1980.42 ms | 69.3% bf16 MFU | 264456 tok/s +step 8093/18794 | loss 3.081099 (-1.07z)| norm 0.1981 (-1.02z)| lr 4.15e-03 | 1980.47 ms | 69.3% bf16 MFU | 264470 tok/s +step 8094/18794 | loss 3.121768 (+0.04z)| norm 0.2740 (+0.15z)| lr 4.15e-03 | 1979.28 ms | 69.3% bf16 MFU | 264491 tok/s +step 8095/18794 | loss 3.126692 (+0.18z)| norm 0.1893 (-1.13z)| lr 4.15e-03 | 1983.48 ms | 69.2% bf16 MFU | 264482 tok/s +step 8096/18794 | loss 3.111984 (-0.23z)| norm 0.1879 (-1.15z)| lr 4.14e-03 | 1982.89 ms | 69.2% bf16 MFU | 264479 tok/s +step 8097/18794 | loss 3.151536 (+0.85z)| norm 0.1774 (-1.29z)| lr 4.14e-03 | 1985.96 ms | 69.1% bf16 MFU | 264454 tok/s +step 8098/18794 | loss 3.054842 (-1.80z)| norm 0.2516 (-0.16z)| lr 4.14e-03 | 1982.29 ms | 69.2% bf16 MFU | 264456 tok/s +step 8099/18794 | loss 3.105982 (-0.41z)| norm 0.2557 (-0.08z)| lr 4.14e-03 | 1989.72 ms | 69.0% bf16 MFU | 264408 tok/s +step 8100/18794 | loss 3.132461 (+0.31z)| norm 0.1896 (-1.08z)| lr 4.14e-03 | 1987.60 ms | 69.0% bf16 MFU | 264377 tok/s +step 8101/18794 | loss 3.127262 (+0.15z)| norm 0.2838 (+0.36z)| lr 4.14e-03 | 1980.34 ms | 69.3% bf16 MFU | 264395 tok/s +step 8102/18794 | loss 3.099086 (-0.63z)| norm 0.3141 (+0.81z)| lr 4.14e-03 | 1980.14 ms | 69.3% bf16 MFU | 264414 tok/s +step 8103/18794 | loss 3.097139 (-0.67z)| norm 0.2252 (-0.56z)| lr 4.14e-03 | 1979.90 ms | 69.3% bf16 MFU | 264434 tok/s +step 8104/18794 | loss 3.112782 (-0.22z)| norm 0.2194 (-0.64z)| lr 4.14e-03 | 1979.15 ms | 69.3% bf16 MFU | 264457 tok/s +step 8105/18794 | loss 3.113257 (-0.20z)| norm 0.2149 (-0.71z)| lr 4.14e-03 | 1980.29 ms | 69.3% bf16 MFU | 264472 tok/s +step 8106/18794 | loss 3.124351 (+0.11z)| norm 0.2114 (-0.76z)| lr 4.14e-03 | 1979.97 ms | 69.3% bf16 MFU | 264488 tok/s +step 8107/18794 | loss 3.147978 (+0.79z)| norm 0.2425 (-0.28z)| lr 4.14e-03 | 1981.64 ms | 69.3% bf16 MFU | 264492 tok/s +step 8108/18794 | loss 3.072821 (-1.35z)| norm 0.2073 (-0.81z)| lr 4.14e-03 | 1978.42 ms | 69.4% bf16 MFU | 264518 tok/s +step 8109/18794 | loss 3.092770 (-0.77z)| norm 0.2430 (-0.26z)| lr 4.14e-03 | 1980.12 ms | 69.3% bf16 MFU | 264531 tok/s +step 8110/18794 | loss 3.106055 (-0.39z)| norm 0.3240 (+0.97z)| lr 4.14e-03 | 2031.13 ms | 67.6% bf16 MFU | 264211 tok/s +step 8111/18794 | loss 3.165592 (+1.29z)| norm 0.2851 (+0.38z)| lr 4.14e-03 | 2039.93 ms | 67.3% bf16 MFU | 263851 tok/s +step 8112/18794 | loss 3.160997 (+1.16z)| norm 0.2435 (-0.25z)| lr 4.14e-03 | 2043.62 ms | 67.2% bf16 MFU | 263486 tok/s +step 8113/18794 | loss 3.119534 (+0.00z)| norm 0.2712 (+0.16z)| lr 4.14e-03 | 2041.88 ms | 67.2% bf16 MFU | 263150 tok/s +step 8114/18794 | loss 3.141510 (+0.61z)| norm 0.3374 (+1.17z)| lr 4.14e-03 | 2050.31 ms | 66.9% bf16 MFU | 262778 tok/s +step 8115/18794 | loss 3.113922 (-0.18z)| norm 0.3539 (+1.39z)| lr 4.14e-03 | 2040.46 ms | 67.3% bf16 MFU | 262486 tok/s +step 8116/18794 | loss 3.107147 (-0.38z)| norm 0.3582 (+1.43z)| lr 4.14e-03 | 2038.68 ms | 67.3% bf16 MFU | 262220 tok/s +step 8117/18794 | loss 3.053906 (-1.88z)| norm 0.3676 (+1.53z)| lr 4.14e-03 | 2022.07 ms | 67.9% bf16 MFU | 262074 tok/s +step 8118/18794 | loss 3.150450 (+0.88z)| norm 0.3534 (+1.30z)| lr 4.13e-03 | 2048.54 ms | 67.0% bf16 MFU | 261766 tok/s +step 8119/18794 | loss 3.133327 (+0.39z)| norm 0.2284 (-0.52z)| lr 4.13e-03 | 2041.33 ms | 67.2% bf16 MFU | 261520 tok/s +step 8120/18794 | loss 3.118578 (-0.04z)| norm 0.1974 (-0.96z)| lr 4.13e-03 | 2019.33 ms | 68.0% bf16 MFU | 261426 tok/s +step 8121/18794 | loss 3.130302 (+0.29z)| norm 0.2215 (-0.61z)| lr 4.13e-03 | 2046.42 ms | 67.1% bf16 MFU | 261164 tok/s +step 8122/18794 | loss 3.088351 (-0.91z)| norm 0.1853 (-1.13z)| lr 4.13e-03 | 2041.43 ms | 67.2% bf16 MFU | 260947 tok/s +step 8123/18794 | loss 3.134501 (+0.46z)| norm 0.1802 (-1.19z)| lr 4.13e-03 | 2031.03 ms | 67.6% bf16 MFU | 260807 tok/s +step 8124/18794 | loss 3.098101 (-0.61z)| norm 0.1693 (-1.33z)| lr 4.13e-03 | 2016.10 ms | 68.1% bf16 MFU | 260769 tok/s +step 8125/18794 | loss 3.154943 (+1.05z)| norm 0.1899 (-1.03z)| lr 4.13e-03 | 2042.46 ms | 67.2% bf16 MFU | 260565 tok/s +step 8126/18794 | loss 3.108998 (-0.32z)| norm 0.1943 (-0.96z)| lr 4.13e-03 | 2032.66 ms | 67.5% bf16 MFU | 260434 tok/s +step 8127/18794 | loss 3.064118 (-1.63z)| norm 0.2210 (-0.57z)| lr 4.13e-03 | 2036.57 ms | 67.4% bf16 MFU | 260284 tok/s +step 8128/18794 | loss 3.159766 (+1.20z)| norm 0.2091 (-0.72z)| lr 4.13e-03 | 2045.35 ms | 67.1% bf16 MFU | 260086 tok/s +step 8129/18794 | loss 3.105078 (-0.40z)| norm 0.2586 (-0.02z)| lr 4.13e-03 | 2029.80 ms | 67.6% bf16 MFU | 259997 tok/s +step 8130/18794 | loss 3.145713 (+0.79z)| norm 0.2243 (-0.51z)| lr 4.13e-03 | 2030.07 ms | 67.6% bf16 MFU | 259910 tok/s +step 8131/18794 | loss 3.133553 (+0.42z)| norm 0.2207 (-0.55z)| lr 4.13e-03 | 2039.68 ms | 67.3% bf16 MFU | 259767 tok/s +step 8132/18794 | loss 3.120806 (+0.03z)| norm 0.3059 (+0.65z)| lr 4.13e-03 | 2038.39 ms | 67.3% bf16 MFU | 259639 tok/s +step 8133/18794 | loss 3.130589 (+0.33z)| norm 0.3322 (+1.01z)| lr 4.13e-03 | 2036.40 ms | 67.4% bf16 MFU | 259530 tok/s +step 8134/18794 | loss 3.145594 (+0.77z)| norm 0.2241 (-0.53z)| lr 4.13e-03 | 2038.47 ms | 67.3% bf16 MFU | 259413 tok/s +step 8135/18794 | loss 3.057986 (-1.87z)| norm 0.2451 (-0.25z)| lr 4.13e-03 | 2043.96 ms | 67.1% bf16 MFU | 259268 tok/s +step 8136/18794 | loss 3.104352 (-0.49z)| norm 0.3191 (+0.80z)| lr 4.13e-03 | 2031.88 ms | 67.5% bf16 MFU | 259206 tok/s +step 8137/18794 | loss 3.119614 (-0.03z)| norm 0.2583 (-0.08z)| lr 4.13e-03 | 2028.56 ms | 67.7% bf16 MFU | 259168 tok/s +step 8138/18794 | loss 3.120755 (+0.02z)| norm 0.2668 (+0.04z)| lr 4.13e-03 | 2045.06 ms | 67.1% bf16 MFU | 259028 tok/s +step 8139/18794 | loss 3.145663 (+0.79z)| norm 0.2418 (-0.31z)| lr 4.13e-03 | 2032.96 ms | 67.5% bf16 MFU | 258971 tok/s +step 8140/18794 | loss 3.124033 (+0.13z)| norm 0.2231 (-0.58z)| lr 4.12e-03 | 2038.37 ms | 67.3% bf16 MFU | 258883 tok/s +step 8141/18794 | loss 3.100247 (-0.61z)| norm 0.2253 (-0.54z)| lr 4.12e-03 | 2033.82 ms | 67.5% bf16 MFU | 258828 tok/s +step 8142/18794 | loss 3.146279 (+0.81z)| norm 0.2195 (-0.63z)| lr 4.12e-03 | 2045.02 ms | 67.1% bf16 MFU | 258706 tok/s +step 8143/18794 | loss 3.104040 (-0.49z)| norm 0.2427 (-0.30z)| lr 4.12e-03 | 2014.17 ms | 68.1% bf16 MFU | 258785 tok/s +step 8144/18794 | loss 3.067014 (-1.62z)| norm 0.2073 (-0.80z)| lr 4.12e-03 | 2037.14 ms | 67.4% bf16 MFU | 258714 tok/s +step 8145/18794 | loss 3.163913 (+1.36z)| norm 0.2011 (-0.88z)| lr 4.12e-03 | 2042.74 ms | 67.2% bf16 MFU | 258612 tok/s +step 8146/18794 | loss 3.156330 (+1.11z)| norm 0.2400 (-0.30z)| lr 4.12e-03 | 2017.91 ms | 68.0% bf16 MFU | 258672 tok/s +step 8147/18794 | loss 3.085653 (-1.04z)| norm 0.2603 (-0.00z)| lr 4.12e-03 | 2020.77 ms | 67.9% bf16 MFU | 258711 tok/s +step 8148/18794 | loss 3.107081 (-0.38z)| norm 0.3344 (+1.05z)| lr 4.12e-03 | 2031.62 ms | 67.5% bf16 MFU | 258678 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.476320 +step 8149/18794 | loss 3.127375 (+0.26z)| norm 0.4396 (+2.48z)| lr 4.12e-03 | 2015.69 ms | 68.1% bf16 MFU | 258750 tok/s +step 8150/18794 | loss 3.106074 (-0.39z)| norm 0.3499 (+1.23z)| lr 4.12e-03 | 2012.54 ms | 68.2% bf16 MFU | 258838 tok/s +step 8151/18794 | loss 3.136501 (+0.53z)| norm 0.1870 (-1.07z)| lr 4.12e-03 | 2031.80 ms | 67.5% bf16 MFU | 258798 tok/s +step 8152/18794 | loss 3.062989 (-1.72z)| norm 0.3438 (+1.13z)| lr 4.12e-03 | 2028.64 ms | 67.6% bf16 MFU | 258780 tok/s +step 8153/18794 | loss 3.039901 (-2.34z)| norm 0.2916 (+0.38z)| lr 4.12e-03 | 2013.00 ms | 68.2% bf16 MFU | 258864 tok/s +step 8154/18794 | loss 3.071673 (-1.39z)| norm 0.2277 (-0.55z)| lr 4.12e-03 | 2006.09 ms | 68.4% bf16 MFU | 258988 tok/s +step 8155/18794 | loss 3.098875 (-0.56z)| norm 0.3574 (+1.29z)| lr 4.12e-03 | 2036.86 ms | 67.4% bf16 MFU | 258909 tok/s +step 8156/18794 | loss 3.156613 (+1.17z)| norm 0.2447 (-0.31z)| lr 4.12e-03 | 2020.44 ms | 67.9% bf16 MFU | 258938 tok/s +step 8157/18794 | loss 3.075760 (-1.28z)| norm 0.1922 (-1.05z)| lr 4.12e-03 | 2029.14 ms | 67.6% bf16 MFU | 258910 tok/s +step 8158/18794 | loss 3.099966 (-0.54z)| norm 0.2183 (-0.67z)| lr 4.12e-03 | 2044.43 ms | 67.1% bf16 MFU | 258787 tok/s +step 8159/18794 | loss 3.139936 (+0.67z)| norm 0.1924 (-1.04z)| lr 4.12e-03 | 2022.96 ms | 67.8% bf16 MFU | 258806 tok/s +step 8160/18794 | loss 3.100592 (-0.52z)| norm 0.2760 (+0.20z)| lr 4.12e-03 | 2011.39 ms | 68.2% bf16 MFU | 258898 tok/s +step 8161/18794 | loss 3.078961 (-1.16z)| norm 0.3005 (+0.61z)| lr 4.12e-03 | 2023.66 ms | 67.8% bf16 MFU | 258907 tok/s +step 8162/18794 | loss 3.121092 (+0.12z)| norm 0.2676 (+0.10z)| lr 4.11e-03 | 2007.65 ms | 68.4% bf16 MFU | 259019 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.218182 +step 8163/18794 | loss 3.170007 (+1.57z)| norm 0.4070 (+2.22z)| lr 4.11e-03 | 2006.48 ms | 68.4% bf16 MFU | 259133 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.528951 +step 8164/18794 | loss 3.167016 (+1.45z)| norm 0.4301 (+2.53z)| lr 4.11e-03 | 2021.36 ms | 67.9% bf16 MFU | 259145 tok/s +step 8165/18794 | loss 3.126910 (+0.23z)| norm 0.2839 (+0.32z)| lr 4.11e-03 | 2028.18 ms | 67.7% bf16 MFU | 259113 tok/s +step 8166/18794 | loss 3.079726 (-1.18z)| norm 0.2078 (-0.82z)| lr 4.11e-03 | 2019.36 ms | 68.0% bf16 MFU | 259139 tok/s +step 8167/18794 | loss 3.130535 (+0.34z)| norm 0.2141 (-0.72z)| lr 4.11e-03 | 2016.80 ms | 68.0% bf16 MFU | 259180 tok/s +step 8168/18794 | loss 3.120971 (+0.06z)| norm 0.2741 (+0.22z)| lr 4.11e-03 | 2023.23 ms | 67.8% bf16 MFU | 259178 tok/s +step 8169/18794 | loss 3.138565 (+0.61z)| norm 0.2209 (-0.62z)| lr 4.11e-03 | 2025.51 ms | 67.8% bf16 MFU | 259161 tok/s +step 8170/18794 | loss 3.121070 (+0.07z)| norm 0.2171 (-0.67z)| lr 4.11e-03 | 2008.83 ms | 68.3% bf16 MFU | 259252 tok/s +step 8171/18794 | loss 3.088242 (-0.93z)| norm 0.2078 (-0.81z)| lr 4.11e-03 | 2040.32 ms | 67.3% bf16 MFU | 259138 tok/s +step 8172/18794 | loss 3.093885 (-0.78z)| norm 0.2508 (-0.12z)| lr 4.11e-03 | 2005.65 ms | 68.4% bf16 MFU | 259251 tok/s +step 8173/18794 | loss 3.150168 (+0.96z)| norm 0.2061 (-0.83z)| lr 4.11e-03 | 2026.25 ms | 67.7% bf16 MFU | 259226 tok/s +step 8174/18794 | loss 3.147675 (+0.99z)| norm 0.2997 (+0.65z)| lr 4.11e-03 | 2025.38 ms | 67.8% bf16 MFU | 259208 tok/s +step 8175/18794 | loss 3.111626 (-0.21z)| norm 0.3130 (+0.85z)| lr 4.11e-03 | 2032.40 ms | 67.5% bf16 MFU | 259146 tok/s +step 8176/18794 | loss 3.094656 (-0.77z)| norm 0.2432 (-0.26z)| lr 4.11e-03 | 2022.05 ms | 67.9% bf16 MFU | 259153 tok/s +step 8177/18794 | loss 3.127224 (+0.32z)| norm 0.2069 (-0.83z)| lr 4.11e-03 | 2031.39 ms | 67.6% bf16 MFU | 259100 tok/s +step 8178/18794 | loss 3.124011 (+0.20z)| norm 0.2247 (-0.55z)| lr 4.11e-03 | 2015.45 ms | 68.1% bf16 MFU | 259151 tok/s +step 8179/18794 | loss 3.131436 (+0.45z)| norm 0.2833 (+0.37z)| lr 4.11e-03 | 2018.45 ms | 68.0% bf16 MFU | 259181 tok/s +step 8180/18794 | loss 3.133128 (+0.49z)| norm 0.3115 (+0.86z)| lr 4.11e-03 | 2008.19 ms | 68.3% bf16 MFU | 259276 tok/s +step 8181/18794 | loss 3.181831 (+2.07z)| norm 0.2099 (-0.78z)| lr 4.11e-03 | 2033.38 ms | 67.5% bf16 MFU | 259204 tok/s +step 8182/18794 | loss 3.073810 (-1.48z)| norm 0.2992 (+0.66z)| lr 4.11e-03 | 2003.44 ms | 68.5% bf16 MFU | 259329 tok/s +step 8183/18794 | loss 3.114005 (-0.15z)| norm 0.1919 (-1.08z)| lr 4.11e-03 | 2018.81 ms | 68.0% bf16 MFU | 259347 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.025424 +step 8184/18794 | loss 3.186154 (+2.17z)| norm 0.3877 (+2.03z)| lr 4.10e-03 | 2003.43 ms | 68.5% bf16 MFU | 259465 tok/s +step 8185/18794 | loss 3.136944 (+0.59z)| norm 0.3645 (+1.63z)| lr 4.10e-03 | 2027.94 ms | 67.7% bf16 MFU | 259418 tok/s +step 8186/18794 | loss 3.060575 (-1.86z)| norm 0.3436 (+1.27z)| lr 4.10e-03 | 2016.24 ms | 68.1% bf16 MFU | 259449 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.076790 +step 8187/18794 | loss 3.160716 (+1.33z)| norm 0.3988 (+2.08z)| lr 4.10e-03 | 1989.18 ms | 69.0% bf16 MFU | 259655 tok/s +step 8188/18794 | loss 3.253278 (+3.90z)| norm 0.2713 (+0.13z)| lr 4.10e-03 | 2026.51 ms | 67.7% bf16 MFU | 259608 tok/s +step 8189/18794 | loss 3.109282 (-0.30z)| norm 0.2381 (-0.37z)| lr 4.10e-03 | 2015.31 ms | 68.1% bf16 MFU | 259635 tok/s +step 8190/18794 | loss 3.136571 (+0.53z)| norm 0.2182 (-0.68z)| lr 4.10e-03 | 2008.97 ms | 68.3% bf16 MFU | 259702 tok/s +step 8191/18794 | loss 3.163744 (+1.32z)| norm 0.2266 (-0.54z)| lr 4.10e-03 | 2013.97 ms | 68.1% bf16 MFU | 259733 tok/s +step 8192/18794 | loss 3.089419 (-0.88z)| norm 0.2470 (-0.20z)| lr 4.10e-03 | 2018.21 ms | 68.0% bf16 MFU | 259735 tok/s +step 8193/18794 | loss 3.170074 (+1.49z)| norm 0.2296 (-0.48z)| lr 4.10e-03 | 2016.95 ms | 68.0% bf16 MFU | 259746 tok/s +step 8194/18794 | loss 3.107266 (-0.37z)| norm 0.2030 (-0.90z)| lr 4.10e-03 | 2005.39 ms | 68.4% bf16 MFU | 259830 tok/s +step 8195/18794 | loss 3.147484 (+0.81z)| norm 0.2710 (+0.19z)| lr 4.10e-03 | 2018.94 ms | 68.0% bf16 MFU | 259823 tok/s +step 8196/18794 | loss 3.064343 (-1.61z)| norm 0.2733 (+0.21z)| lr 4.10e-03 | 1999.18 ms | 68.6% bf16 MFU | 259945 tok/s +step 8197/18794 | loss 3.081925 (-1.08z)| norm 0.2430 (-0.30z)| lr 4.10e-03 | 1990.97 ms | 68.9% bf16 MFU | 260114 tok/s +step 8198/18794 | loss 3.152947 (+0.98z)| norm 0.2745 (+0.22z)| lr 4.10e-03 | 2016.34 ms | 68.1% bf16 MFU | 260109 tok/s +step 8199/18794 | loss 3.150358 (+0.88z)| norm 0.2317 (-0.48z)| lr 4.10e-03 | 2029.85 ms | 67.6% bf16 MFU | 260018 tok/s +step 8200/18794 | loss 3.131215 (+0.32z)| norm 0.2060 (-0.91z)| lr 4.10e-03 | 2017.39 ms | 68.0% bf16 MFU | 260012 tok/s +step 8201/18794 | loss 3.079185 (-1.19z)| norm 0.2461 (-0.24z)| lr 4.10e-03 | 2015.53 ms | 68.1% bf16 MFU | 260017 tok/s +step 8202/18794 | loss 3.104357 (-0.45z)| norm 0.3638 (+1.68z)| lr 4.10e-03 | 2027.43 ms | 67.7% bf16 MFU | 259946 tok/s +step 8203/18794 | loss 3.162683 (+1.23z)| norm 0.2783 (+0.27z)| lr 4.10e-03 | 2014.23 ms | 68.1% bf16 MFU | 259963 tok/s +step 8204/18794 | loss 3.041538 (-2.22z)| norm 0.2197 (-0.69z)| lr 4.10e-03 | 2033.80 ms | 67.5% bf16 MFU | 259855 tok/s +step 8205/18794 | loss 3.098192 (-0.61z)| norm 0.2572 (-0.08z)| lr 4.10e-03 | 2030.64 ms | 67.6% bf16 MFU | 259771 tok/s +step 8206/18794 | loss 3.103741 (-0.44z)| norm 0.2243 (-0.63z)| lr 4.09e-03 | 2024.64 ms | 67.8% bf16 MFU | 259730 tok/s +step 8207/18794 | loss 3.153381 (+0.96z)| norm 0.2353 (-0.45z)| lr 4.09e-03 | 2022.51 ms | 67.9% bf16 MFU | 259705 tok/s +step 8208/18794 | loss 3.137283 (+0.49z)| norm 0.2355 (-0.45z)| lr 4.09e-03 | 2019.99 ms | 67.9% bf16 MFU | 259697 tok/s +step 8209/18794 | loss 3.086972 (-0.94z)| norm 0.2116 (-0.84z)| lr 4.09e-03 | 2017.38 ms | 68.0% bf16 MFU | 259707 tok/s +step 8210/18794 | loss 3.116010 (-0.12z)| norm 0.1926 (-1.13z)| lr 4.09e-03 | 2019.05 ms | 68.0% bf16 MFU | 259705 tok/s +step 8211/18794 | loss 3.114236 (-0.16z)| norm 0.2680 (+0.12z)| lr 4.09e-03 | 2012.27 ms | 68.2% bf16 MFU | 259747 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.788747 +step 8212/18794 | loss 3.171878 (+1.49z)| norm 0.4391 (+2.79z)| lr 4.09e-03 | 2014.11 ms | 68.1% bf16 MFU | 259775 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.266015 +step 8213/18794 | loss 3.131126 (+0.32z)| norm 0.4113 (+2.27z)| lr 4.09e-03 | 2007.09 ms | 68.4% bf16 MFU | 259847 tok/s +step 8214/18794 | loss 3.127017 (+0.21z)| norm 0.2274 (-0.55z)| lr 4.09e-03 | 2021.66 ms | 67.9% bf16 MFU | 259822 tok/s +step 8215/18794 | loss 3.113605 (-0.18z)| norm 0.3299 (+1.04z)| lr 4.09e-03 | 2026.51 ms | 67.7% bf16 MFU | 259766 tok/s +step 8216/18794 | loss 3.158209 (+1.08z)| norm 0.2405 (-0.33z)| lr 4.09e-03 | 2018.71 ms | 68.0% bf16 MFU | 259764 tok/s +step 8217/18794 | loss 3.155651 (+0.99z)| norm 0.2571 (-0.05z)| lr 4.09e-03 | 2019.49 ms | 68.0% bf16 MFU | 259756 tok/s +step 8218/18794 | loss 3.129148 (+0.23z)| norm 0.2863 (+0.42z)| lr 4.09e-03 | 2009.23 ms | 68.3% bf16 MFU | 259815 tok/s +step 8219/18794 | loss 3.129779 (+0.25z)| norm 0.2487 (-0.18z)| lr 4.09e-03 | 2011.70 ms | 68.2% bf16 MFU | 259856 tok/s +step 8220/18794 | loss 3.067902 (-1.50z)| norm 0.2679 (+0.11z)| lr 4.09e-03 | 2007.25 ms | 68.4% bf16 MFU | 259923 tok/s +step 8221/18794 | loss 3.122548 (+0.06z)| norm 0.2815 (+0.33z)| lr 4.09e-03 | 2018.43 ms | 68.0% bf16 MFU | 259914 tok/s +step 8222/18794 | loss 3.127610 (+0.20z)| norm 0.2051 (-0.92z)| lr 4.09e-03 | 2028.39 ms | 67.7% bf16 MFU | 259842 tok/s +step 8223/18794 | loss 3.099663 (-0.60z)| norm 0.2569 (-0.09z)| lr 4.09e-03 | 2018.99 ms | 68.0% bf16 MFU | 259834 tok/s +step 8224/18794 | loss 3.178472 (+1.62z)| norm 0.2295 (-0.56z)| lr 4.09e-03 | 2012.16 ms | 68.2% bf16 MFU | 259870 tok/s +step 8225/18794 | loss 3.123396 (+0.07z)| norm 0.2478 (-0.26z)| lr 4.09e-03 | 2003.74 ms | 68.5% bf16 MFU | 259959 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.719846 +step 8226/18794 | loss 3.150928 (+0.84z)| norm 0.4338 (+2.72z)| lr 4.09e-03 | 2027.23 ms | 67.7% bf16 MFU | 259893 tok/s +step 8227/18794 | loss 3.135590 (+0.39z)| norm 0.3742 (+1.71z)| lr 4.09e-03 | 2023.34 ms | 67.8% bf16 MFU | 259854 tok/s +step 8228/18794 | loss 3.113815 (-0.23z)| norm 0.1989 (-1.09z)| lr 4.08e-03 | 2012.67 ms | 68.2% bf16 MFU | 259886 tok/s +step 8229/18794 | loss 3.101084 (-0.59z)| norm 0.2813 (+0.22z)| lr 4.08e-03 | 2024.01 ms | 67.8% bf16 MFU | 259843 tok/s +step 8230/18794 | loss 3.118126 (-0.09z)| norm 0.2394 (-0.45z)| lr 4.08e-03 | 2011.90 ms | 68.2% bf16 MFU | 259881 tok/s +step 8231/18794 | loss 3.131898 (+0.31z)| norm 0.2870 (+0.30z)| lr 4.08e-03 | 2004.51 ms | 68.5% bf16 MFU | 259964 tok/s +step 8232/18794 | loss 3.164266 (+1.22z)| norm 0.3009 (+0.52z)| lr 4.08e-03 | 2011.46 ms | 68.2% bf16 MFU | 259999 tok/s +step 8233/18794 | loss 3.132145 (+0.30z)| norm 0.2774 (+0.15z)| lr 4.08e-03 | 2020.22 ms | 67.9% bf16 MFU | 259975 tok/s +step 8234/18794 | loss 3.154227 (+0.93z)| norm 0.2981 (+0.48z)| lr 4.08e-03 | 2012.78 ms | 68.2% bf16 MFU | 260000 tok/s +step 8235/18794 | loss 3.158317 (+1.03z)| norm 0.2611 (-0.12z)| lr 4.08e-03 | 2019.15 ms | 68.0% bf16 MFU | 259983 tok/s +step 8236/18794 | loss 3.078484 (-1.27z)| norm 0.2671 (-0.02z)| lr 4.08e-03 | 2020.79 ms | 67.9% bf16 MFU | 259956 tok/s +step 8237/18794 | loss 3.088914 (-0.96z)| norm 0.3530 (+1.35z)| lr 4.08e-03 | 2010.53 ms | 68.3% bf16 MFU | 259997 tok/s +step 8238/18794 | loss 3.147262 (+0.71z)| norm 0.3289 (+0.94z)| lr 4.08e-03 | 2012.99 ms | 68.2% bf16 MFU | 260020 tok/s +step 8239/18794 | loss 3.161933 (+1.12z)| norm 0.2987 (+0.45z)| lr 4.08e-03 | 2019.97 ms | 67.9% bf16 MFU | 259996 tok/s +step 8240/18794 | loss 3.094617 (-0.79z)| norm 0.3059 (+0.56z)| lr 4.08e-03 | 2010.47 ms | 68.3% bf16 MFU | 260035 tok/s +step 8241/18794 | loss 3.089662 (-0.93z)| norm 0.2886 (+0.27z)| lr 4.08e-03 | 2011.36 ms | 68.2% bf16 MFU | 260067 tok/s +step 8242/18794 | loss 3.130101 (+0.23z)| norm 0.2517 (-0.33z)| lr 4.08e-03 | 2019.36 ms | 68.0% bf16 MFU | 260045 tok/s +step 8243/18794 | loss 3.194220 (+1.99z)| norm 0.2498 (-0.36z)| lr 4.08e-03 | 2014.85 ms | 68.1% bf16 MFU | 260053 tok/s +step 8244/18794 | loss 3.140527 (+0.47z)| norm 0.2832 (+0.17z)| lr 4.08e-03 | 2001.71 ms | 68.6% bf16 MFU | 260147 tok/s +step 8245/18794 | loss 3.217558 (+2.56z)| norm 0.2864 (+0.20z)| lr 4.08e-03 | 2007.16 ms | 68.4% bf16 MFU | 260200 tok/s +step 8246/18794 | loss 3.189958 (+1.77z)| norm 0.2615 (-0.21z)| lr 4.08e-03 | 2013.20 ms | 68.2% bf16 MFU | 260211 tok/s +step 8247/18794 | loss 3.083894 (-1.10z)| norm 0.2468 (-0.44z)| lr 4.08e-03 | 2012.20 ms | 68.2% bf16 MFU | 260228 tok/s +step 8248/18794 | loss 3.202011 (+2.03z)| norm 0.2572 (-0.26z)| lr 4.08e-03 | 1998.89 ms | 68.7% bf16 MFU | 260331 tok/s +step 8249/18794 | loss 3.153967 (+0.75z)| norm 0.2221 (-0.83z)| lr 4.08e-03 | 2011.79 ms | 68.2% bf16 MFU | 260345 tok/s +step 8250/18794 | loss 3.144206 (+0.48z)| norm 0.3487 (+1.32z)| lr 4.07e-03 | 2003.87 ms | 68.5% bf16 MFU | 260410 tok/s +val loss 3.149568 +HellaSwag: 2977/10042 = 0.296455: 0/1256 +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.607945 +step 8251/18794 | loss 3.154453 (+0.74z)| norm 0.4308 (+2.61z)| lr 4.07e-03 | 2015.78 ms | 68.1% bf16 MFU | 260394 tok/s +step 8252/18794 | loss 3.175860 (+1.29z)| norm 0.3446 (+1.18z)| lr 4.07e-03 | 2003.45 ms | 68.5% bf16 MFU | 260459 tok/s +step 8253/18794 | loss 3.121004 (-0.20z)| norm 0.2752 (+0.03z)| lr 4.07e-03 | 2018.70 ms | 68.0% bf16 MFU | 260422 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.105196 +step 8254/18794 | loss 3.141585 (+0.35z)| norm 0.4046 (+2.11z)| lr 4.07e-03 | 2011.24 ms | 68.2% bf16 MFU | 260435 tok/s +step 8255/18794 | loss 3.135808 (+0.18z)| norm 0.2254 (-0.79z)| lr 4.07e-03 | 2014.08 ms | 68.1% bf16 MFU | 260428 tok/s +step 8256/18794 | loss 3.165578 (+1.00z)| norm 0.2777 (+0.06z)| lr 4.07e-03 | 2020.14 ms | 67.9% bf16 MFU | 260384 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.185050 +step 8257/18794 | loss 3.198031 (+1.85z)| norm 0.4119 (+2.19z)| lr 4.07e-03 | 2014.34 ms | 68.1% bf16 MFU | 260378 tok/s +step 8258/18794 | loss 3.093788 (-1.01z)| norm 0.3707 (+1.49z)| lr 4.07e-03 | 2019.69 ms | 67.9% bf16 MFU | 260339 tok/s +step 8259/18794 | loss 3.162721 (+0.87z)| norm 0.2041 (-1.18z)| lr 4.07e-03 | 2008.08 ms | 68.3% bf16 MFU | 260376 tok/s +step 8260/18794 | loss 3.074608 (-1.52z)| norm 0.2686 (-0.15z)| lr 4.07e-03 | 2012.04 ms | 68.2% bf16 MFU | 260386 tok/s +step 8261/18794 | loss 3.115987 (-0.41z)| norm 0.3236 (+0.73z)| lr 4.07e-03 | 2017.62 ms | 68.0% bf16 MFU | 260360 tok/s +step 8262/18794 | loss 3.088235 (-1.16z)| norm 0.2625 (-0.25z)| lr 4.07e-03 | 1995.67 ms | 68.8% bf16 MFU | 260477 tok/s +step 8263/18794 | loss 3.108234 (-0.60z)| norm 0.2731 (-0.06z)| lr 4.07e-03 | 2019.28 ms | 68.0% bf16 MFU | 260435 tok/s +step 8264/18794 | loss 3.093214 (-0.99z)| norm 0.3027 (+0.46z)| lr 4.07e-03 | 2002.89 ms | 68.5% bf16 MFU | 260502 tok/s +step 8265/18794 | loss 3.097612 (-0.86z)| norm 0.2752 (-0.00z)| lr 4.07e-03 | 2015.43 ms | 68.1% bf16 MFU | 260484 tok/s +step 8266/18794 | loss 3.090202 (-1.07z)| norm 0.2834 (+0.13z)| lr 4.07e-03 | 2011.76 ms | 68.2% bf16 MFU | 260490 tok/s +step 8267/18794 | loss 3.122196 (-0.19z)| norm 0.3684 (+1.53z)| lr 4.07e-03 | 2026.87 ms | 67.7% bf16 MFU | 260399 tok/s +step 8268/18794 | loss 3.140120 (+0.30z)| norm 0.2392 (-0.64z)| lr 4.07e-03 | 1996.32 ms | 68.7% bf16 MFU | 260510 tok/s +step 8269/18794 | loss 3.115698 (-0.36z)| norm 0.3597 (+1.36z)| lr 4.07e-03 | 2026.05 ms | 67.7% bf16 MFU | 260424 tok/s +step 8270/18794 | loss 3.147149 (+0.49z)| norm 0.1854 (-1.55z)| lr 4.07e-03 | 2011.13 ms | 68.2% bf16 MFU | 260437 tok/s +step 8271/18794 | loss 3.135709 (+0.16z)| norm 0.3234 (+0.74z)| lr 4.06e-03 | 2020.07 ms | 67.9% bf16 MFU | 260392 tok/s +step 8272/18794 | loss 3.128878 (-0.03z)| norm 0.2090 (-1.16z)| lr 4.06e-03 | 2010.45 ms | 68.3% bf16 MFU | 260412 tok/s +step 8273/18794 | loss 3.128187 (-0.05z)| norm 0.3579 (+1.29z)| lr 4.06e-03 | 2010.93 ms | 68.2% bf16 MFU | 260427 tok/s +step 8274/18794 | loss 3.127793 (-0.05z)| norm 0.1950 (-1.39z)| lr 4.06e-03 | 2010.31 ms | 68.3% bf16 MFU | 260446 tok/s +step 8275/18794 | loss 3.155361 (+0.70z)| norm 0.2320 (-0.77z)| lr 4.06e-03 | 2009.91 ms | 68.3% bf16 MFU | 260466 tok/s +step 8276/18794 | loss 3.088534 (-1.15z)| norm 0.2414 (-0.61z)| lr 4.06e-03 | 2010.68 ms | 68.3% bf16 MFU | 260480 tok/s +step 8277/18794 | loss 3.150798 (+0.57z)| norm 0.3232 (+0.72z)| lr 4.06e-03 | 2018.73 ms | 68.0% bf16 MFU | 260442 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.122791 +step 8278/18794 | loss 3.139835 (+0.26z)| norm 0.4128 (+2.12z)| lr 4.06e-03 | 2018.07 ms | 68.0% bf16 MFU | 260410 tok/s +step 8279/18794 | loss 3.139732 (+0.25z)| norm 0.2126 (-1.10z)| lr 4.06e-03 | 2009.98 ms | 68.3% bf16 MFU | 260431 tok/s +step 8280/18794 | loss 3.089619 (-1.11z)| norm 0.3537 (+1.16z)| lr 4.06e-03 | 2019.62 ms | 67.9% bf16 MFU | 260390 tok/s +step 8281/18794 | loss 3.104924 (-0.67z)| norm 0.2903 (+0.13z)| lr 4.06e-03 | 2001.27 ms | 68.6% bf16 MFU | 260469 tok/s +step 8282/18794 | loss 3.126701 (-0.09z)| norm 0.3372 (+0.88z)| lr 4.06e-03 | 2010.34 ms | 68.3% bf16 MFU | 260485 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.084694 +step 8283/18794 | loss 3.090368 (-1.09z)| norm 0.4162 (+2.08z)| lr 4.06e-03 | 2016.90 ms | 68.0% bf16 MFU | 260458 tok/s +step 8284/18794 | loss 3.139671 (+0.30z)| norm 0.2658 (-0.29z)| lr 4.06e-03 | 2011.86 ms | 68.2% bf16 MFU | 260465 tok/s +step 8285/18794 | loss 3.172389 (+1.20z)| norm 0.3339 (+0.82z)| lr 4.06e-03 | 2011.58 ms | 68.2% bf16 MFU | 260474 tok/s +step 8286/18794 | loss 3.146468 (+0.46z)| norm 0.2921 (+0.15z)| lr 4.06e-03 | 2009.27 ms | 68.3% bf16 MFU | 260497 tok/s +step 8287/18794 | loss 3.163158 (+0.93z)| norm 0.2592 (-0.37z)| lr 4.06e-03 | 2011.09 ms | 68.2% bf16 MFU | 260507 tok/s +step 8288/18794 | loss 3.132606 (+0.10z)| norm 0.4022 (+1.94z)| lr 4.06e-03 | 2010.31 ms | 68.3% bf16 MFU | 260522 tok/s +step 8289/18794 | loss 3.189630 (+1.78z)| norm 0.2225 (-0.97z)| lr 4.06e-03 | 2033.51 ms | 67.5% bf16 MFU | 260387 tok/s +step 8290/18794 | loss 3.172469 (+1.25z)| norm 0.2176 (-1.05z)| lr 4.06e-03 | 2016.76 ms | 68.0% bf16 MFU | 260366 tok/s +step 8291/18794 | loss 3.189003 (+1.72z)| norm 0.2400 (-0.69z)| lr 4.06e-03 | 2002.54 ms | 68.5% bf16 MFU | 260438 tok/s +step 8292/18794 | loss 3.137625 (+0.19z)| norm 0.2896 (+0.11z)| lr 4.06e-03 | 2001.99 ms | 68.5% bf16 MFU | 260510 tok/s +step 8293/18794 | loss 3.114462 (-0.48z)| norm 0.2620 (-0.35z)| lr 4.05e-03 | 1991.39 ms | 68.9% bf16 MFU | 260649 tok/s +step 8294/18794 | loss 3.127712 (-0.09z)| norm 0.2937 (+0.15z)| lr 4.05e-03 | 2010.23 ms | 68.3% bf16 MFU | 260657 tok/s +step 8295/18794 | loss 3.166762 (+1.06z)| norm 0.3477 (+1.02z)| lr 4.05e-03 | 2002.26 ms | 68.5% bf16 MFU | 260716 tok/s +step 8296/18794 | loss 3.102094 (-0.88z)| norm 0.2185 (-1.07z)| lr 4.05e-03 | 2003.93 ms | 68.5% bf16 MFU | 260762 tok/s +step 8297/18794 | loss 3.141457 (+0.29z)| norm 0.3414 (+0.91z)| lr 4.05e-03 | 2018.85 ms | 68.0% bf16 MFU | 260709 tok/s +step 8298/18794 | loss 3.105867 (-0.78z)| norm 0.1931 (-1.47z)| lr 4.05e-03 | 2002.66 ms | 68.5% bf16 MFU | 260763 tok/s +step 8299/18794 | loss 3.139775 (+0.26z)| norm 0.2176 (-1.07z)| lr 4.05e-03 | 2018.24 ms | 68.0% bf16 MFU | 260714 tok/s +step 8300/18794 | loss 3.137498 (+0.19z)| norm 0.2626 (-0.36z)| lr 4.05e-03 | 2034.43 ms | 67.5% bf16 MFU | 260563 tok/s +step 8301/18794 | loss 3.117733 (-0.43z)| norm 0.2816 (-0.06z)| lr 4.05e-03 | 2009.82 ms | 68.3% bf16 MFU | 260578 tok/s +step 8302/18794 | loss 3.152398 (+0.62z)| norm 0.2127 (-1.15z)| lr 4.05e-03 | 2001.02 ms | 68.6% bf16 MFU | 260650 tok/s +step 8303/18794 | loss 3.168936 (+1.13z)| norm 0.2397 (-0.71z)| lr 4.05e-03 | 1994.46 ms | 68.8% bf16 MFU | 260761 tok/s +step 8304/18794 | loss 3.178148 (+1.42z)| norm 0.3261 (+0.67z)| lr 4.05e-03 | 2001.46 ms | 68.6% bf16 MFU | 260821 tok/s +step 8305/18794 | loss 3.145463 (+0.36z)| norm 0.2877 (+0.04z)| lr 4.05e-03 | 2005.11 ms | 68.4% bf16 MFU | 260853 tok/s +step 8306/18794 | loss 3.180926 (+1.47z)| norm 0.2528 (-0.53z)| lr 4.05e-03 | 1993.76 ms | 68.8% bf16 MFU | 260959 tok/s +step 8307/18794 | loss 3.131149 (-0.11z)| norm 0.3036 (+0.29z)| lr 4.05e-03 | 2010.26 ms | 68.3% bf16 MFU | 260951 tok/s +step 8308/18794 | loss 3.139019 (+0.14z)| norm 0.2672 (-0.31z)| lr 4.05e-03 | 1985.16 ms | 69.1% bf16 MFU | 261109 tok/s +step 8309/18794 | loss 3.167745 (+1.03z)| norm 0.2399 (-0.77z)| lr 4.05e-03 | 2024.94 ms | 67.8% bf16 MFU | 260999 tok/s +step 8310/18794 | loss 3.144308 (+0.27z)| norm 0.2731 (-0.24z)| lr 4.05e-03 | 2018.88 ms | 68.0% bf16 MFU | 260934 tok/s +step 8311/18794 | loss 3.164040 (+0.89z)| norm 0.2139 (-1.20z)| lr 4.05e-03 | 2011.92 ms | 68.2% bf16 MFU | 260917 tok/s +step 8312/18794 | loss 3.135660 (-0.01z)| norm 0.2359 (-0.83z)| lr 4.05e-03 | 1994.01 ms | 68.8% bf16 MFU | 261017 tok/s +step 8313/18794 | loss 3.100327 (-1.13z)| norm 0.3237 (+0.69z)| lr 4.05e-03 | 1994.97 ms | 68.8% bf16 MFU | 261107 tok/s +step 8314/18794 | loss 3.105573 (-0.95z)| norm 0.2987 (+0.25z)| lr 4.05e-03 | 1993.90 ms | 68.8% bf16 MFU | 261199 tok/s +step 8315/18794 | loss 3.213867 (+2.41z)| norm 0.2134 (-1.22z)| lr 4.04e-03 | 2016.78 ms | 68.0% bf16 MFU | 261137 tok/s +step 8316/18794 | loss 3.161696 (+0.78z)| norm 0.2438 (-0.69z)| lr 4.04e-03 | 1995.99 ms | 68.8% bf16 MFU | 261214 tok/s +step 8317/18794 | loss 3.069294 (-2.02z)| norm 0.2762 (-0.13z)| lr 4.04e-03 | 2009.28 ms | 68.3% bf16 MFU | 261200 tok/s +step 8318/18794 | loss 3.137254 (+0.05z)| norm 0.2644 (-0.33z)| lr 4.04e-03 | 1994.33 ms | 68.8% bf16 MFU | 261284 tok/s +step 8319/18794 | loss 3.179182 (+1.30z)| norm 0.2140 (-1.20z)| lr 4.04e-03 | 2002.47 ms | 68.5% bf16 MFU | 261311 tok/s +step 8320/18794 | loss 3.115294 (-0.66z)| norm 0.2600 (-0.40z)| lr 4.04e-03 | 1985.15 ms | 69.1% bf16 MFU | 261451 tok/s +step 8321/18794 | loss 3.114496 (-0.68z)| norm 0.2168 (-1.13z)| lr 4.04e-03 | 2003.82 ms | 68.5% bf16 MFU | 261460 tok/s +step 8322/18794 | loss 3.141469 (+0.15z)| norm 0.2053 (-1.32z)| lr 4.04e-03 | 2025.02 ms | 67.8% bf16 MFU | 261332 tok/s +step 8323/18794 | loss 3.154702 (+0.54z)| norm 0.2495 (-0.56z)| lr 4.04e-03 | 1988.08 ms | 69.0% bf16 MFU | 261452 tok/s +step 8324/18794 | loss 3.142059 (+0.16z)| norm 0.2048 (-1.32z)| lr 4.04e-03 | 2010.18 ms | 68.3% bf16 MFU | 261420 tok/s +step 8325/18794 | loss 3.110815 (-0.81z)| norm 0.2960 (+0.23z)| lr 4.04e-03 | 2003.51 ms | 68.5% bf16 MFU | 261433 tok/s +step 8326/18794 | loss 3.129867 (-0.21z)| norm 0.2142 (-1.17z)| lr 4.04e-03 | 1994.82 ms | 68.8% bf16 MFU | 261503 tok/s +step 8327/18794 | loss 3.139045 (+0.08z)| norm 0.2544 (-0.44z)| lr 4.04e-03 | 1994.65 ms | 68.8% bf16 MFU | 261570 tok/s +step 8328/18794 | loss 3.146859 (+0.31z)| norm 0.2972 (+0.31z)| lr 4.04e-03 | 1985.84 ms | 69.1% bf16 MFU | 261692 tok/s +step 8329/18794 | loss 3.133492 (-0.12z)| norm 0.2092 (-1.25z)| lr 4.04e-03 | 2003.62 ms | 68.5% bf16 MFU | 261691 tok/s +step 8330/18794 | loss 3.150307 (+0.40z)| norm 0.1817 (-1.72z)| lr 4.04e-03 | 1994.22 ms | 68.8% bf16 MFU | 261752 tok/s +step 8331/18794 | loss 3.100487 (-1.15z)| norm 0.2220 (-0.99z)| lr 4.04e-03 | 1988.72 ms | 69.0% bf16 MFU | 261845 tok/s +step 8332/18794 | loss 3.041594 (-2.84z)| norm 0.2005 (-1.34z)| lr 4.04e-03 | 2010.38 ms | 68.3% bf16 MFU | 261793 tok/s +step 8333/18794 | loss 3.176250 (+1.19z)| norm 0.2729 (-0.07z)| lr 4.04e-03 | 2010.18 ms | 68.3% bf16 MFU | 261744 tok/s +step 8334/18794 | loss 3.107673 (-0.84z)| norm 0.3030 (+0.45z)| lr 4.04e-03 | 2003.28 ms | 68.5% bf16 MFU | 261742 tok/s +step 8335/18794 | loss 3.113681 (-0.65z)| norm 0.2507 (-0.46z)| lr 4.04e-03 | 1994.87 ms | 68.8% bf16 MFU | 261796 tok/s +step 8336/18794 | loss 3.129778 (-0.19z)| norm 0.1834 (-1.59z)| lr 4.04e-03 | 1987.13 ms | 69.1% bf16 MFU | 261898 tok/s +step 8337/18794 | loss 3.133610 (-0.09z)| norm 0.1910 (-1.44z)| lr 4.03e-03 | 1995.56 ms | 68.8% bf16 MFU | 261940 tok/s +step 8338/18794 | loss 3.156109 (+0.60z)| norm 0.2367 (-0.64z)| lr 4.03e-03 | 2010.86 ms | 68.2% bf16 MFU | 261879 tok/s +step 8339/18794 | loss 3.143324 (+0.21z)| norm 0.2644 (-0.15z)| lr 4.03e-03 | 1996.11 ms | 68.7% bf16 MFU | 261918 tok/s +reducing beta2 to 0.9 and lr/wd by 0.874 due to grad z-score of 4.004066 +step 8340/18794 | loss 3.148279 (+0.35z)| norm 0.5293 (+4.00z)| lr 3.53e-03 | 1986.65 ms | 69.1% bf16 MFU | 262017 tok/s +step 8341/18794 | loss 3.136022 (-0.04z)| norm 0.2480 (-0.43z)| lr 4.03e-03 | 1993.54 ms | 68.8% bf16 MFU | 262066 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.545187 +step 8342/18794 | loss 3.151019 (+0.42z)| norm 0.4440 (+2.55z)| lr 4.03e-03 | 1986.30 ms | 69.1% bf16 MFU | 262160 tok/s +step 8343/18794 | loss 3.113552 (-0.73z)| norm 0.2210 (-0.85z)| lr 4.03e-03 | 1994.15 ms | 68.8% bf16 MFU | 262198 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.138819 +step 8344/18794 | loss 3.152986 (+0.51z)| norm 0.4964 (+3.14z)| lr 4.03e-03 | 1987.58 ms | 69.0% bf16 MFU | 262277 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.403653 +step 8345/18794 | loss 3.138724 (+0.08z)| norm 0.4522 (+2.40z)| lr 4.03e-03 | 2002.05 ms | 68.5% bf16 MFU | 262257 tok/s +step 8346/18794 | loss 3.158147 (+0.74z)| norm 0.2450 (-0.50z)| lr 4.03e-03 | 1990.48 ms | 68.9% bf16 MFU | 262314 tok/s +step 8347/18794 | loss 3.099135 (-1.23z)| norm 0.2463 (-0.48z)| lr 4.03e-03 | 2002.73 ms | 68.5% bf16 MFU | 262288 tok/s +step 8348/18794 | loss 3.142926 (+0.26z)| norm 0.3449 (+0.89z)| lr 4.03e-03 | 1980.22 ms | 69.3% bf16 MFU | 262412 tok/s +step 8349/18794 | loss 3.137071 (+0.06z)| norm 0.3384 (+0.78z)| lr 4.03e-03 | 1985.88 ms | 69.1% bf16 MFU | 262492 tok/s +step 8350/18794 | loss 3.092772 (-1.42z)| norm 0.3341 (+0.72z)| lr 4.03e-03 | 2009.49 ms | 68.3% bf16 MFU | 262412 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.528314 +step 8351/18794 | loss 3.109602 (-0.83z)| norm 0.4656 (+2.53z)| lr 4.03e-03 | 1987.13 ms | 69.1% bf16 MFU | 262484 tok/s +step 8352/18794 | loss 3.107366 (-0.89z)| norm 0.2369 (-0.62z)| lr 4.03e-03 | 1987.31 ms | 69.1% bf16 MFU | 262550 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.154505 +step 8353/18794 | loss 3.116411 (-0.58z)| norm 0.5243 (+3.15z)| lr 4.03e-03 | 1994.48 ms | 68.8% bf16 MFU | 262566 tok/s +step 8354/18794 | loss 3.120783 (-0.43z)| norm 0.4076 (+1.62z)| lr 4.03e-03 | 1994.44 ms | 68.8% bf16 MFU | 262582 tok/s +step 8355/18794 | loss 3.108640 (-0.83z)| norm 0.2123 (-0.94z)| lr 4.03e-03 | 1986.67 ms | 69.1% bf16 MFU | 262648 tok/s +step 8356/18794 | loss 3.084496 (-1.60z)| norm 0.3404 (+0.73z)| lr 4.03e-03 | 1994.01 ms | 68.8% bf16 MFU | 262662 tok/s +step 8357/18794 | loss 3.140489 (+0.30z)| norm 0.2271 (-0.74z)| lr 4.03e-03 | 1985.62 ms | 69.1% bf16 MFU | 262731 tok/s +step 8358/18794 | loss 3.181192 (+1.67z)| norm 0.3648 (+1.09z)| lr 4.03e-03 | 1993.88 ms | 68.8% bf16 MFU | 262742 tok/s +step 8359/18794 | loss 3.146514 (+0.49z)| norm 0.2426 (-0.54z)| lr 4.02e-03 | 1990.75 ms | 68.9% bf16 MFU | 262773 tok/s +step 8360/18794 | loss 3.128609 (-0.15z)| norm 0.2168 (-0.87z)| lr 4.02e-03 | 1980.93 ms | 69.3% bf16 MFU | 262868 tok/s +step 8361/18794 | loss 3.114538 (-0.64z)| norm 0.2416 (-0.53z)| lr 4.02e-03 | 1979.41 ms | 69.3% bf16 MFU | 262968 tok/s +step 8362/18794 | loss 3.080511 (-1.83z)| norm 0.4000 (+1.53z)| lr 4.02e-03 | 1996.53 ms | 68.7% bf16 MFU | 262949 tok/s +step 8363/18794 | loss 3.157604 (+0.85z)| norm 0.2106 (-0.94z)| lr 4.02e-03 | 1986.98 ms | 69.1% bf16 MFU | 262995 tok/s +step 8364/18794 | loss 3.144899 (+0.39z)| norm 0.4110 (+1.64z)| lr 4.02e-03 | 1985.96 ms | 69.1% bf16 MFU | 263045 tok/s +step 8365/18794 | loss 3.085532 (-1.69z)| norm 0.3979 (+1.44z)| lr 4.02e-03 | 1992.81 ms | 68.9% bf16 MFU | 263047 tok/s +step 8366/18794 | loss 3.170449 (+1.26z)| norm 0.2052 (-1.00z)| lr 4.02e-03 | 2001.56 ms | 68.6% bf16 MFU | 262992 tok/s +step 8367/18794 | loss 3.094695 (-1.38z)| norm 0.3031 (+0.25z)| lr 4.02e-03 | 1986.34 ms | 69.1% bf16 MFU | 263040 tok/s +step 8368/18794 | loss 3.127715 (-0.22z)| norm 0.3217 (+0.48z)| lr 4.02e-03 | 1990.92 ms | 68.9% bf16 MFU | 263055 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.101500 +step 8369/18794 | loss 3.165747 (+1.08z)| norm 0.4532 (+2.10z)| lr 4.02e-03 | 1985.98 ms | 69.1% bf16 MFU | 263102 tok/s +step 8370/18794 | loss 3.086379 (-1.63z)| norm 0.2462 (-0.50z)| lr 4.02e-03 | 1984.78 ms | 69.1% bf16 MFU | 263154 tok/s +step 8371/18794 | loss 3.069436 (-2.14z)| norm 0.2168 (-0.85z)| lr 4.02e-03 | 1987.45 ms | 69.0% bf16 MFU | 263187 tok/s +step 8372/18794 | loss 3.176853 (+1.43z)| norm 0.2713 (-0.18z)| lr 4.02e-03 | 1985.29 ms | 69.1% bf16 MFU | 263232 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.847463 +step 8373/18794 | loss 3.180469 (+1.51z)| norm 0.5216 (+2.85z)| lr 4.02e-03 | 1986.64 ms | 69.1% bf16 MFU | 263265 tok/s +step 8374/18794 | loss 3.122009 (-0.40z)| norm 0.2843 (-0.04z)| lr 4.02e-03 | 1988.09 ms | 69.0% bf16 MFU | 263288 tok/s +step 8375/18794 | loss 3.150722 (+0.54z)| norm 0.3198 (+0.38z)| lr 4.02e-03 | 1982.44 ms | 69.2% bf16 MFU | 263347 tok/s +step 8376/18794 | loss 3.137146 (+0.08z)| norm 0.2797 (-0.11z)| lr 4.02e-03 | 1988.27 ms | 69.0% bf16 MFU | 263364 tok/s +step 8377/18794 | loss 3.091809 (-1.39z)| norm 0.4552 (+1.98z)| lr 4.02e-03 | 1986.77 ms | 69.1% bf16 MFU | 263390 tok/s +step 8378/18794 | loss 3.077960 (-1.80z)| norm 0.2224 (-0.80z)| lr 4.02e-03 | 1978.22 ms | 69.4% bf16 MFU | 263472 tok/s +step 8379/18794 | loss 3.213277 (+2.48z)| norm 0.3777 (+1.06z)| lr 4.02e-03 | 1985.83 ms | 69.1% bf16 MFU | 263499 tok/s +step 8380/18794 | loss 3.122542 (-0.38z)| norm 0.3432 (+0.64z)| lr 4.01e-03 | 1982.37 ms | 69.2% bf16 MFU | 263548 tok/s +step 8381/18794 | loss 3.149804 (+0.47z)| norm 0.3257 (+0.43z)| lr 4.01e-03 | 1989.34 ms | 69.0% bf16 MFU | 263548 tok/s +step 8382/18794 | loss 3.198971 (+1.97z)| norm 0.2246 (-0.78z)| lr 4.01e-03 | 1983.22 ms | 69.2% bf16 MFU | 263589 tok/s +step 8383/18794 | loss 3.146423 (+0.32z)| norm 0.3509 (+0.76z)| lr 4.01e-03 | 1988.74 ms | 69.0% bf16 MFU | 263591 tok/s +step 8384/18794 | loss 3.127157 (-0.28z)| norm 0.2436 (-0.55z)| lr 4.01e-03 | 1986.54 ms | 69.1% bf16 MFU | 263607 tok/s +step 8385/18794 | loss 3.107544 (-0.88z)| norm 0.3710 (+1.00z)| lr 4.01e-03 | 1979.50 ms | 69.3% bf16 MFU | 263670 tok/s +step 8386/18794 | loss 3.119271 (-0.50z)| norm 0.4483 (+1.88z)| lr 4.01e-03 | 1979.83 ms | 69.3% bf16 MFU | 263727 tok/s +step 8387/18794 | loss 3.091802 (-1.34z)| norm 0.2595 (-0.37z)| lr 4.01e-03 | 1978.92 ms | 69.3% bf16 MFU | 263788 tok/s +step 8388/18794 | loss 3.118910 (-0.49z)| norm 0.2576 (-0.38z)| lr 4.01e-03 | 2009.95 ms | 68.3% bf16 MFU | 263641 tok/s +step 8389/18794 | loss 3.138930 (+0.16z)| norm 0.1837 (-1.25z)| lr 4.01e-03 | 2036.28 ms | 67.4% bf16 MFU | 263332 tok/s +step 8390/18794 | loss 3.155836 (+0.70z)| norm 0.3949 (+1.25z)| lr 4.01e-03 | 2036.58 ms | 67.4% bf16 MFU | 263037 tok/s +step 8391/18794 | loss 3.091850 (-1.32z)| norm 0.4058 (+1.35z)| lr 4.01e-03 | 2035.44 ms | 67.4% bf16 MFU | 262764 tok/s +step 8392/18794 | loss 3.192434 (+1.87z)| norm 0.2488 (-0.50z)| lr 4.01e-03 | 2035.16 ms | 67.4% bf16 MFU | 262507 tok/s +step 8393/18794 | loss 3.119078 (-0.45z)| norm 0.3263 (+0.40z)| lr 4.01e-03 | 2035.04 ms | 67.4% bf16 MFU | 262263 tok/s +step 8394/18794 | loss 3.157016 (+0.74z)| norm 0.2389 (-0.62z)| lr 4.01e-03 | 2042.10 ms | 67.2% bf16 MFU | 261987 tok/s +step 8395/18794 | loss 3.137810 (+0.14z)| norm 0.3027 (+0.14z)| lr 4.01e-03 | 2034.81 ms | 67.4% bf16 MFU | 261771 tok/s +step 8396/18794 | loss 3.169523 (+1.12z)| norm 0.3805 (+1.03z)| lr 4.01e-03 | 2041.34 ms | 67.2% bf16 MFU | 261524 tok/s +step 8397/18794 | loss 3.142768 (+0.28z)| norm 0.2759 (-0.19z)| lr 4.01e-03 | 2043.09 ms | 67.2% bf16 MFU | 261278 tok/s +step 8398/18794 | loss 3.142711 (+0.26z)| norm 0.2652 (-0.33z)| lr 4.01e-03 | 2041.65 ms | 67.2% bf16 MFU | 261054 tok/s +step 8399/18794 | loss 3.113247 (-0.66z)| norm 0.3267 (+0.39z)| lr 4.01e-03 | 2041.86 ms | 67.2% bf16 MFU | 260840 tok/s +step 8400/18794 | loss 3.186008 (+1.61z)| norm 0.3253 (+0.37z)| lr 4.01e-03 | 2040.86 ms | 67.2% bf16 MFU | 260643 tok/s +reducing beta2 to 0.9 and lr/wd by 0.875 due to grad z-score of 3.998409 +step 8401/18794 | loss 3.207031 (+2.18z)| norm 0.6649 (+4.00z)| lr 3.51e-03 | 2041.26 ms | 67.2% bf16 MFU | 260453 tok/s +step 8402/18794 | loss 3.157395 (+0.67z)| norm 0.2640 (-0.38z)| lr 4.00e-03 | 2042.81 ms | 67.2% bf16 MFU | 260263 tok/s +step 8403/18794 | loss 3.142835 (+0.23z)| norm 0.4161 (+1.26z)| lr 4.00e-03 | 2034.30 ms | 67.5% bf16 MFU | 260136 tok/s +step 8404/18794 | loss 3.134489 (-0.01z)| norm 0.2495 (-0.55z)| lr 4.00e-03 | 2033.71 ms | 67.5% bf16 MFU | 260019 tok/s +step 8405/18794 | loss 3.131610 (-0.10z)| norm 0.3377 (+0.41z)| lr 4.00e-03 | 2033.93 ms | 67.5% bf16 MFU | 259907 tok/s +step 8406/18794 | loss 3.162693 (+0.88z)| norm 0.2640 (-0.40z)| lr 4.00e-03 | 2034.13 ms | 67.5% bf16 MFU | 259798 tok/s +step 8407/18794 | loss 3.146692 (+0.37z)| norm 0.3014 (+0.01z)| lr 4.00e-03 | 2042.85 ms | 67.2% bf16 MFU | 259641 tok/s +step 8408/18794 | loss 3.195498 (+1.84z)| norm 0.2844 (-0.18z)| lr 4.00e-03 | 2025.65 ms | 67.7% bf16 MFU | 259600 tok/s +step 8409/18794 | loss 3.158801 (+0.72z)| norm 0.2346 (-0.72z)| lr 4.00e-03 | 2042.81 ms | 67.2% bf16 MFU | 259453 tok/s +step 8410/18794 | loss 3.163901 (+0.87z)| norm 0.2870 (-0.15z)| lr 4.00e-03 | 2041.79 ms | 67.2% bf16 MFU | 259319 tok/s +step 8411/18794 | loss 3.138403 (+0.10z)| norm 0.2065 (-1.02z)| lr 4.00e-03 | 2034.31 ms | 67.5% bf16 MFU | 259239 tok/s +step 8412/18794 | loss 3.184297 (+1.48z)| norm 0.2333 (-0.73z)| lr 4.00e-03 | 2011.80 ms | 68.2% bf16 MFU | 259307 tok/s +step 8413/18794 | loss 3.115172 (-0.63z)| norm 0.2001 (-1.07z)| lr 4.00e-03 | 2040.91 ms | 67.2% bf16 MFU | 259186 tok/s +step 8414/18794 | loss 3.158373 (+0.67z)| norm 0.1798 (-1.27z)| lr 4.00e-03 | 2027.00 ms | 67.7% bf16 MFU | 259160 tok/s +step 8415/18794 | loss 3.204248 (+2.09z)| norm 0.3430 (+0.47z)| lr 4.00e-03 | 2042.53 ms | 67.2% bf16 MFU | 259036 tok/s +step 8416/18794 | loss 3.090731 (-1.37z)| norm 0.3624 (+0.66z)| lr 4.00e-03 | 2020.75 ms | 67.9% bf16 MFU | 259057 tok/s +step 8417/18794 | loss 3.187709 (+1.57z)| norm 0.2476 (-0.57z)| lr 4.00e-03 | 2041.95 ms | 67.2% bf16 MFU | 258942 tok/s +step 8418/18794 | loss 3.172631 (+1.09z)| norm 0.2697 (-0.33z)| lr 4.00e-03 | 2039.65 ms | 67.3% bf16 MFU | 258847 tok/s +step 8419/18794 | loss 3.125608 (-0.34z)| norm 0.3588 (+0.61z)| lr 4.00e-03 | 2035.46 ms | 67.4% bf16 MFU | 258784 tok/s +step 8420/18794 | loss 3.148090 (+0.35z)| norm 0.2437 (-0.62z)| lr 4.00e-03 | 2041.06 ms | 67.2% bf16 MFU | 258688 tok/s +step 8421/18794 | loss 3.139733 (+0.08z)| norm 0.3995 (+1.03z)| lr 4.00e-03 | 2034.37 ms | 67.5% bf16 MFU | 258640 tok/s +step 8422/18794 | loss 3.090708 (-1.41z)| norm 0.3191 (+0.15z)| lr 4.00e-03 | 2025.70 ms | 67.7% bf16 MFU | 258648 tok/s +step 8423/18794 | loss 3.138314 (+0.06z)| norm 0.1855 (-1.27z)| lr 4.00e-03 | 2040.99 ms | 67.2% bf16 MFU | 258560 tok/s +step 8424/18794 | loss 3.085468 (-1.53z)| norm 0.3465 (+0.44z)| lr 3.99e-03 | 2018.62 ms | 68.0% bf16 MFU | 258618 tok/s +step 8425/18794 | loss 3.159328 (+0.70z)| norm 0.3556 (+0.53z)| lr 3.99e-03 | 2043.44 ms | 67.2% bf16 MFU | 258516 tok/s +step 8426/18794 | loss 3.110367 (-0.78z)| norm 0.2361 (-0.76z)| lr 3.99e-03 | 2042.39 ms | 67.2% bf16 MFU | 258425 tok/s +step 8427/18794 | loss 3.142343 (+0.19z)| norm 0.2823 (-0.26z)| lr 3.99e-03 | 2027.28 ms | 67.7% bf16 MFU | 258435 tok/s +step 8428/18794 | loss 3.154148 (+0.54z)| norm 0.2439 (-0.67z)| lr 3.99e-03 | 2027.08 ms | 67.7% bf16 MFU | 258445 tok/s +step 8429/18794 | loss 3.178813 (+1.26z)| norm 0.2224 (-0.90z)| lr 3.99e-03 | 2017.40 ms | 68.0% bf16 MFU | 258517 tok/s +step 8430/18794 | loss 3.144722 (+0.24z)| norm 0.2928 (-0.16z)| lr 3.99e-03 | 2017.55 ms | 68.0% bf16 MFU | 258584 tok/s +step 8431/18794 | loss 3.167326 (+0.90z)| norm 0.4298 (+1.30z)| lr 3.99e-03 | 2025.68 ms | 67.7% bf16 MFU | 258596 tok/s +step 8432/18794 | loss 3.104526 (-1.04z)| norm 0.2525 (-0.63z)| lr 3.99e-03 | 2026.91 ms | 67.7% bf16 MFU | 258600 tok/s +step 8433/18794 | loss 3.145125 (+0.24z)| norm 0.2701 (-0.43z)| lr 3.99e-03 | 2026.39 ms | 67.7% bf16 MFU | 258606 tok/s +step 8434/18794 | loss 3.167831 (+0.93z)| norm 0.2453 (-0.70z)| lr 3.99e-03 | 2043.12 ms | 67.2% bf16 MFU | 258506 tok/s +step 8435/18794 | loss 3.147539 (+0.28z)| norm 0.2191 (-0.97z)| lr 3.99e-03 | 2025.84 ms | 67.7% bf16 MFU | 258521 tok/s +step 8436/18794 | loss 3.176708 (+1.18z)| norm 0.2243 (-0.93z)| lr 3.99e-03 | 2033.56 ms | 67.5% bf16 MFU | 258486 tok/s +step 8437/18794 | loss 3.206007 (+2.03z)| norm 0.2783 (-0.35z)| lr 3.99e-03 | 2041.33 ms | 67.2% bf16 MFU | 258403 tok/s +step 8438/18794 | loss 3.083088 (-1.69z)| norm 0.2629 (-0.52z)| lr 3.99e-03 | 2010.42 ms | 68.3% bf16 MFU | 258522 tok/s +step 8439/18794 | loss 3.105743 (-0.99z)| norm 0.2126 (-1.07z)| lr 3.99e-03 | 2009.40 ms | 68.3% bf16 MFU | 258642 tok/s +step 8440/18794 | loss 3.123650 (-0.44z)| norm 0.2327 (-0.84z)| lr 3.99e-03 | 2018.51 ms | 68.0% bf16 MFU | 258697 tok/s +mostly skipping update due to grad z-score of 7.738968 +step 8441/18794 | loss 3.146297 (+0.23z)| norm 1.4110 (+7.74z)| lr 3.99e-04 | 2011.29 ms | 68.2% bf16 MFU | 258796 tok/s +step 8442/18794 | loss 3.062104 (-2.22z)| norm 0.1921 (-1.28z)| lr 3.99e-03 | 2026.87 ms | 67.7% bf16 MFU | 258790 tok/s +step 8443/18794 | loss 3.145328 (+0.22z)| norm 0.2595 (-0.51z)| lr 3.99e-03 | 2018.07 ms | 68.0% bf16 MFU | 258840 tok/s +step 8444/18794 | loss 3.117449 (-0.59z)| norm 0.2195 (-0.96z)| lr 3.99e-03 | 2019.07 ms | 68.0% bf16 MFU | 258881 tok/s +step 8445/18794 | loss 3.186195 (+1.40z)| norm 0.4129 (+1.25z)| lr 3.99e-03 | 2018.91 ms | 68.0% bf16 MFU | 258922 tok/s +step 8446/18794 | loss 3.121224 (-0.48z)| norm 0.2997 (-0.03z)| lr 3.98e-03 | 2017.77 ms | 68.0% bf16 MFU | 258967 tok/s +step 8447/18794 | loss 3.132060 (-0.17z)| norm 0.2417 (-0.71z)| lr 3.98e-03 | 2019.28 ms | 68.0% bf16 MFU | 259001 tok/s +step 8448/18794 | loss 3.146091 (+0.24z)| norm 0.2857 (-0.20z)| lr 3.98e-03 | 2034.87 ms | 67.4% bf16 MFU | 258934 tok/s +step 8449/18794 | loss 3.143718 (+0.16z)| norm 0.2573 (-0.52z)| lr 3.98e-03 | 2033.80 ms | 67.5% bf16 MFU | 258876 tok/s +step 8450/18794 | loss 3.151730 (+0.38z)| norm 0.2724 (-0.34z)| lr 3.98e-03 | 2016.40 ms | 68.1% bf16 MFU | 258933 tok/s +step 8451/18794 | loss 3.170850 (+0.93z)| norm 0.2519 (-0.56z)| lr 3.98e-03 | 2009.96 ms | 68.3% bf16 MFU | 259029 tok/s +step 8452/18794 | loss 3.153493 (+0.41z)| norm 0.3100 (+0.13z)| lr 3.98e-03 | 1994.79 ms | 68.8% bf16 MFU | 259219 tok/s +step 8453/18794 | loss 3.119777 (-0.59z)| norm 0.2330 (-0.78z)| lr 3.98e-03 | 2025.52 ms | 67.8% bf16 MFU | 259200 tok/s +step 8454/18794 | loss 3.146097 (+0.18z)| norm 0.3919 (+1.16z)| lr 3.98e-03 | 2000.51 ms | 68.6% bf16 MFU | 259344 tok/s +step 8455/18794 | loss 3.171710 (+0.92z)| norm 0.4028 (+1.29z)| lr 3.98e-03 | 2009.69 ms | 68.3% bf16 MFU | 259421 tok/s +step 8456/18794 | loss 3.149524 (+0.25z)| norm 0.2902 (-0.10z)| lr 3.98e-03 | 2018.26 ms | 68.0% bf16 MFU | 259438 tok/s +step 8457/18794 | loss 3.169920 (+0.85z)| norm 0.3145 (+0.21z)| lr 3.98e-03 | 2000.34 ms | 68.6% bf16 MFU | 259571 tok/s +step 8458/18794 | loss 3.136456 (-0.14z)| norm 0.2369 (-0.75z)| lr 3.98e-03 | 2025.82 ms | 67.7% bf16 MFU | 259533 tok/s +step 8459/18794 | loss 3.143407 (+0.07z)| norm 0.3065 (+0.11z)| lr 3.98e-03 | 2016.86 ms | 68.0% bf16 MFU | 259554 tok/s +step 8460/18794 | loss 3.110971 (-0.90z)| norm 0.2985 (+0.01z)| lr 3.98e-03 | 2001.26 ms | 68.6% bf16 MFU | 259675 tok/s +step 8461/18794 | loss 3.116257 (-0.75z)| norm 0.2418 (-0.70z)| lr 3.98e-03 | 2018.74 ms | 68.0% bf16 MFU | 259677 tok/s +step 8462/18794 | loss 3.141870 (+0.01z)| norm 0.2871 (-0.14z)| lr 3.98e-03 | 2002.92 ms | 68.5% bf16 MFU | 259781 tok/s +step 8463/18794 | loss 3.144057 (+0.08z)| norm 0.2225 (-0.93z)| lr 3.98e-03 | 2023.90 ms | 67.8% bf16 MFU | 259744 tok/s +step 8464/18794 | loss 3.148317 (+0.21z)| norm 0.2548 (-0.54z)| lr 3.98e-03 | 2017.32 ms | 68.0% bf16 MFU | 259752 tok/s +step 8465/18794 | loss 3.185885 (+1.34z)| norm 0.3017 (+0.07z)| lr 3.98e-03 | 2010.41 ms | 68.3% bf16 MFU | 259804 tok/s +step 8466/18794 | loss 3.160091 (+0.55z)| norm 0.3291 (+0.43z)| lr 3.98e-03 | 2026.30 ms | 67.7% bf16 MFU | 259751 tok/s +step 8467/18794 | loss 3.153133 (+0.32z)| norm 0.3332 (+0.47z)| lr 3.97e-03 | 2034.32 ms | 67.5% bf16 MFU | 259649 tok/s +step 8468/18794 | loss 3.106619 (-1.13z)| norm 0.2870 (-0.12z)| lr 3.97e-03 | 2018.02 ms | 68.0% bf16 MFU | 259657 tok/s +step 8469/18794 | loss 3.155446 (+0.40z)| norm 0.2971 (+0.01z)| lr 3.97e-03 | 2018.02 ms | 68.0% bf16 MFU | 259664 tok/s +step 8470/18794 | loss 3.153435 (+0.32z)| norm 0.2859 (-0.12z)| lr 3.97e-03 | 2025.50 ms | 67.8% bf16 MFU | 259623 tok/s +step 8471/18794 | loss 3.161835 (+0.57z)| norm 0.2916 (-0.05z)| lr 3.97e-03 | 2009.36 ms | 68.3% bf16 MFU | 259688 tok/s +step 8472/18794 | loss 3.127903 (-0.52z)| norm 0.3199 (+0.32z)| lr 3.97e-03 | 2010.60 ms | 68.3% bf16 MFU | 259742 tok/s +step 8473/18794 | loss 3.100720 (-1.38z)| norm 0.2409 (-0.73z)| lr 3.97e-03 | 2016.22 ms | 68.1% bf16 MFU | 259756 tok/s +step 8474/18794 | loss 3.127975 (-0.49z)| norm 0.2048 (-1.21z)| lr 3.97e-03 | 2000.14 ms | 68.6% bf16 MFU | 259875 tok/s +step 8475/18794 | loss 3.079164 (-2.02z)| norm 0.3149 (+0.30z)| lr 3.97e-03 | 2001.84 ms | 68.6% bf16 MFU | 259976 tok/s +step 8476/18794 | loss 3.136251 (-0.19z)| norm 0.3027 (+0.14z)| lr 3.97e-03 | 2016.00 ms | 68.1% bf16 MFU | 259981 tok/s +step 8477/18794 | loss 3.136884 (-0.19z)| norm 0.2329 (-0.82z)| lr 3.97e-03 | 1985.65 ms | 69.1% bf16 MFU | 260184 tok/s +step 8478/18794 | loss 3.127695 (-0.52z)| norm 0.2412 (-0.69z)| lr 3.97e-03 | 2000.93 ms | 68.6% bf16 MFU | 260276 tok/s +step 8479/18794 | loss 3.125666 (-0.57z)| norm 0.2652 (-0.36z)| lr 3.97e-03 | 2016.71 ms | 68.0% bf16 MFU | 260260 tok/s +step 8480/18794 | loss 3.092308 (-1.67z)| norm 0.2489 (-0.58z)| lr 3.97e-03 | 2025.59 ms | 67.7% bf16 MFU | 260189 tok/s +step 8481/18794 | loss 3.142888 (+0.03z)| norm 0.2867 (-0.03z)| lr 3.97e-03 | 2008.12 ms | 68.3% bf16 MFU | 260234 tok/s +step 8482/18794 | loss 3.135770 (-0.19z)| norm 0.3165 (+0.40z)| lr 3.97e-03 | 2016.52 ms | 68.1% bf16 MFU | 260222 tok/s +step 8483/18794 | loss 3.121591 (-0.67z)| norm 0.3071 (+0.25z)| lr 3.97e-03 | 1999.67 ms | 68.6% bf16 MFU | 260320 tok/s +step 8484/18794 | loss 3.163795 (+0.76z)| norm 0.2020 (-1.23z)| lr 3.97e-03 | 2017.10 ms | 68.0% bf16 MFU | 260300 tok/s +step 8485/18794 | loss 3.149788 (+0.27z)| norm 0.3378 (+0.70z)| lr 3.97e-03 | 2017.23 ms | 68.0% bf16 MFU | 260280 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.408075 +step 8486/18794 | loss 3.195712 (+1.79z)| norm 0.4627 (+2.41z)| lr 3.97e-03 | 2016.50 ms | 68.1% bf16 MFU | 260266 tok/s +step 8487/18794 | loss 3.191577 (+1.62z)| norm 0.2327 (-0.79z)| lr 3.97e-03 | 1994.35 ms | 68.8% bf16 MFU | 260397 tok/s +step 8488/18794 | loss 3.136337 (-0.26z)| norm 0.2975 (+0.13z)| lr 3.97e-03 | 2011.22 ms | 68.2% bf16 MFU | 260412 tok/s +step 8489/18794 | loss 3.129741 (-0.48z)| norm 0.2527 (-0.50z)| lr 3.96e-03 | 1987.77 ms | 69.0% bf16 MFU | 260579 tok/s +step 8490/18794 | loss 3.109746 (-1.13z)| norm 0.2268 (-0.89z)| lr 3.96e-03 | 2004.44 ms | 68.5% bf16 MFU | 260628 tok/s +step 8491/18794 | loss 3.164938 (+0.71z)| norm 0.2610 (-0.38z)| lr 3.96e-03 | 2009.71 ms | 68.3% bf16 MFU | 260641 tok/s +step 8492/18794 | loss 3.129139 (-0.49z)| norm 0.3907 (+1.51z)| lr 3.96e-03 | 2020.67 ms | 67.9% bf16 MFU | 260582 tok/s +step 8493/18794 | loss 3.162775 (+0.66z)| norm 0.3854 (+1.40z)| lr 3.96e-03 | 2024.05 ms | 67.8% bf16 MFU | 260504 tok/s +step 8494/18794 | loss 3.137252 (-0.22z)| norm 0.2286 (-0.85z)| lr 3.96e-03 | 2018.13 ms | 68.0% bf16 MFU | 260468 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.005852 +step 8495/18794 | loss 3.145502 (+0.06z)| norm 0.4310 (+2.01z)| lr 3.96e-03 | 1998.30 ms | 68.7% bf16 MFU | 260563 tok/s +step 8496/18794 | loss 3.138415 (-0.17z)| norm 0.3005 (+0.16z)| lr 3.96e-03 | 2015.13 ms | 68.1% bf16 MFU | 260544 tok/s +step 8497/18794 | loss 3.157393 (+0.48z)| norm 0.2606 (-0.39z)| lr 3.96e-03 | 2014.18 ms | 68.1% bf16 MFU | 260532 tok/s +step 8498/18794 | loss 3.177620 (+1.16z)| norm 0.3235 (+0.50z)| lr 3.96e-03 | 2025.03 ms | 67.8% bf16 MFU | 260450 tok/s +step 8499/18794 | loss 3.171569 (+0.94z)| norm 0.2513 (-0.53z)| lr 3.96e-03 | 2004.37 ms | 68.5% bf16 MFU | 260506 tok/s +step 8500/18794 | loss 3.179232 (+1.21z)| norm 0.3295 (+0.58z)| lr 3.96e-03 | 2014.07 ms | 68.1% bf16 MFU | 260497 tok/s +val loss 3.148302 +Writing state to log_gpt3_125M_edu_v4/state_00008500_00001.bin +HellaSwag: 2930/10042 = 0.291775 +Writing checkpoint at step 8500 +Writing model to log_gpt3_125M_edu_v4/model_00008500.bin +Writing state to log_gpt3_125M_edu_v4/state_00008500_00000.bin +step 8501/18794 | loss 3.173150 (+1.03z)| norm 0.4206 (+1.84z)| lr 3.96e-03 | 2006.35 ms | 68.4% bf16 MFU | 260538 tok/s +reducing beta2 to 0.9 and lr/wd by 0.752 due to grad z-score of 4.655795 +step 8502/18794 | loss 3.202549 (+2.01z)| norm 0.6079 (+4.66z)| lr 2.98e-03 | 2017.05 ms | 68.0% bf16 MFU | 260507 tok/s +step 8503/18794 | loss 3.185014 (+1.38z)| norm 0.2328 (-0.81z)| lr 3.96e-03 | 2004.15 ms | 68.5% bf16 MFU | 260562 tok/s +step 8504/18794 | loss 3.181884 (+1.24z)| norm 0.2794 (-0.12z)| lr 3.96e-03 | 2023.95 ms | 67.8% bf16 MFU | 260486 tok/s +step 8505/18794 | loss 3.118458 (-0.91z)| norm 0.2950 (+0.11z)| lr 3.96e-03 | 2011.90 ms | 68.2% bf16 MFU | 260491 tok/s +step 8506/18794 | loss 3.137072 (-0.27z)| norm 0.2422 (-0.66z)| lr 3.96e-03 | 2020.03 ms | 67.9% bf16 MFU | 260444 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.094456 +step 8507/18794 | loss 3.158551 (+0.45z)| norm 0.4326 (+2.09z)| lr 3.96e-03 | 2016.00 ms | 68.1% bf16 MFU | 260425 tok/s +step 8508/18794 | loss 3.151300 (+0.23z)| norm 0.3696 (+1.16z)| lr 3.96e-03 | 2003.24 ms | 68.5% bf16 MFU | 260490 tok/s +step 8509/18794 | loss 3.145167 (+0.02z)| norm 0.1789 (-1.55z)| lr 3.96e-03 | 2005.74 ms | 68.4% bf16 MFU | 260535 tok/s +step 8510/18794 | loss 3.194428 (+1.69z)| norm 0.4188 (+1.81z)| lr 3.96e-03 | 2008.72 ms | 68.3% bf16 MFU | 260558 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.237543 +step 8511/18794 | loss 3.163378 (+0.62z)| norm 0.4550 (+2.24z)| lr 3.95e-03 | 2007.63 ms | 68.4% bf16 MFU | 260588 tok/s +step 8512/18794 | loss 3.163405 (+0.63z)| norm 0.2431 (-0.67z)| lr 3.95e-03 | 2008.44 ms | 68.3% bf16 MFU | 260610 tok/s +step 8513/18794 | loss 3.141980 (-0.11z)| norm 0.3180 (+0.35z)| lr 3.95e-03 | 2010.59 ms | 68.3% bf16 MFU | 260618 tok/s +step 8514/18794 | loss 3.168478 (+0.80z)| norm 0.2552 (-0.53z)| lr 3.95e-03 | 2006.56 ms | 68.4% bf16 MFU | 260652 tok/s +step 8515/18794 | loss 3.176247 (+1.09z)| norm 0.2742 (-0.28z)| lr 3.95e-03 | 2020.65 ms | 67.9% bf16 MFU | 260592 tok/s +step 8516/18794 | loss 3.169666 (+0.85z)| norm 0.2246 (-0.96z)| lr 3.95e-03 | 1987.30 ms | 69.1% bf16 MFU | 260754 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.030751 +step 8517/18794 | loss 3.114938 (-1.08z)| norm 0.5204 (+3.03z)| lr 3.95e-03 | 1982.87 ms | 69.2% bf16 MFU | 260936 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.157001 +step 8518/18794 | loss 3.182908 (+1.34z)| norm 0.5451 (+3.16z)| lr 3.95e-03 | 1996.98 ms | 68.7% bf16 MFU | 261017 tok/s +step 8519/18794 | loss 3.127276 (-0.64z)| norm 0.2292 (-0.86z)| lr 3.95e-03 | 2011.43 ms | 68.2% bf16 MFU | 260998 tok/s +mostly skipping update due to grad z-score of 7.258420 +step 8520/18794 | loss 3.169086 (+0.84z)| norm 1.1370 (+7.26z)| lr 3.95e-04 | 2000.52 ms | 68.6% bf16 MFU | 261052 tok/s +step 8521/18794 | loss 3.107178 (-1.33z)| norm 0.4385 (+1.76z)| lr 3.95e-03 | 2001.45 ms | 68.6% bf16 MFU | 261097 tok/s +reducing beta2 to 0.9 and lr/wd by 0.786 due to grad z-score of 4.453324 +step 8522/18794 | loss 3.142196 (-0.12z)| norm 0.6983 (+4.45z)| lr 3.10e-03 | 2014.97 ms | 68.1% bf16 MFU | 261052 tok/s +step 8523/18794 | loss 3.141129 (-0.16z)| norm 0.3362 (+0.39z)| lr 3.95e-03 | 2005.42 ms | 68.4% bf16 MFU | 261071 tok/s +reducing beta2 to 0.9 and lr/wd by 0.936 due to grad z-score of 3.738310 +step 8524/18794 | loss 3.154345 (+0.29z)| norm 0.6619 (+3.74z)| lr 3.70e-03 | 2012.72 ms | 68.2% bf16 MFU | 261042 tok/s +step 8525/18794 | loss 3.118723 (-0.99z)| norm 0.2591 (-0.50z)| lr 3.95e-03 | 2012.15 ms | 68.2% bf16 MFU | 261018 tok/s +step 8526/18794 | loss 3.119644 (-0.97z)| norm 0.4371 (+1.36z)| lr 3.95e-03 | 2014.37 ms | 68.1% bf16 MFU | 260981 tok/s +step 8527/18794 | loss 3.136901 (-0.33z)| norm 0.2963 (-0.11z)| lr 3.95e-03 | 2009.47 ms | 68.3% bf16 MFU | 260977 tok/s +step 8528/18794 | loss 3.138323 (-0.27z)| norm 0.4520 (+1.49z)| lr 3.95e-03 | 2011.24 ms | 68.2% bf16 MFU | 260962 tok/s +step 8529/18794 | loss 3.161754 (+0.59z)| norm 0.3277 (+0.19z)| lr 3.95e-03 | 2008.41 ms | 68.3% bf16 MFU | 260967 tok/s +step 8530/18794 | loss 3.167810 (+0.81z)| norm 0.2441 (-0.67z)| lr 3.95e-03 | 2020.27 ms | 67.9% bf16 MFU | 260894 tok/s +step 8531/18794 | loss 3.189115 (+1.56z)| norm 0.3697 (+0.62z)| lr 3.95e-03 | 2008.76 ms | 68.3% bf16 MFU | 260899 tok/s +step 8532/18794 | loss 3.126286 (-0.73z)| norm 0.2563 (-0.56z)| lr 3.94e-03 | 2011.73 ms | 68.2% bf16 MFU | 260885 tok/s +step 8533/18794 | loss 3.147099 (+0.03z)| norm 0.4312 (+1.26z)| lr 3.94e-03 | 2001.35 ms | 68.6% bf16 MFU | 260939 tok/s +step 8534/18794 | loss 3.114797 (-1.13z)| norm 0.3044 (-0.06z)| lr 3.94e-03 | 2008.77 ms | 68.3% bf16 MFU | 260942 tok/s +step 8535/18794 | loss 3.162789 (+0.61z)| norm 0.3931 (+0.84z)| lr 3.94e-03 | 2021.96 ms | 67.9% bf16 MFU | 260860 tok/s +step 8536/18794 | loss 3.236516 (+3.14z)| norm 0.2978 (-0.15z)| lr 3.94e-03 | 2016.73 ms | 68.0% bf16 MFU | 260816 tok/s +step 8537/18794 | loss 3.137427 (-0.30z)| norm 0.2967 (-0.17z)| lr 3.94e-03 | 2011.64 ms | 68.2% bf16 MFU | 260806 tok/s +step 8538/18794 | loss 3.158079 (+0.42z)| norm 0.3307 (+0.17z)| lr 3.94e-03 | 2028.77 ms | 67.6% bf16 MFU | 260687 tok/s +step 8539/18794 | loss 3.105788 (-1.49z)| norm 0.2236 (-0.94z)| lr 3.94e-03 | 2009.56 ms | 68.3% bf16 MFU | 260698 tok/s +step 8540/18794 | loss 3.152389 (+0.20z)| norm 0.3205 (+0.07z)| lr 3.94e-03 | 2003.59 ms | 68.5% bf16 MFU | 260746 tok/s +step 8541/18794 | loss 3.123308 (-0.85z)| norm 0.2728 (-0.44z)| lr 3.94e-03 | 2020.88 ms | 67.9% bf16 MFU | 260681 tok/s +step 8542/18794 | loss 3.224359 (+2.80z)| norm 0.4444 (+1.34z)| lr 3.94e-03 | 2006.19 ms | 68.4% bf16 MFU | 260714 tok/s +step 8543/18794 | loss 3.109929 (-1.39z)| norm 0.2569 (-0.64z)| lr 3.94e-03 | 2009.12 ms | 68.3% bf16 MFU | 260726 tok/s +step 8544/18794 | loss 3.136789 (-0.42z)| norm 0.4794 (+1.66z)| lr 3.94e-03 | 1999.10 ms | 68.6% bf16 MFU | 260802 tok/s +step 8545/18794 | loss 3.138530 (-0.34z)| norm 0.4026 (+0.84z)| lr 3.94e-03 | 2014.09 ms | 68.1% bf16 MFU | 260778 tok/s +step 8546/18794 | loss 3.174391 (+0.97z)| norm 0.2849 (-0.37z)| lr 3.94e-03 | 2010.24 ms | 68.3% bf16 MFU | 260779 tok/s +step 8547/18794 | loss 3.102437 (-1.66z)| norm 0.4068 (+0.89z)| lr 3.94e-03 | 2010.67 ms | 68.3% bf16 MFU | 260778 tok/s +step 8548/18794 | loss 3.148988 (+0.04z)| norm 0.4036 (+0.84z)| lr 3.94e-03 | 1998.44 ms | 68.7% bf16 MFU | 260857 tok/s +step 8549/18794 | loss 3.164707 (+0.61z)| norm 0.2743 (-0.51z)| lr 3.94e-03 | 1998.85 ms | 68.7% bf16 MFU | 260929 tok/s +step 8550/18794 | loss 3.140287 (-0.28z)| norm 0.2426 (-0.84z)| lr 3.94e-03 | 2020.40 ms | 67.9% bf16 MFU | 260857 tok/s +step 8551/18794 | loss 3.148592 (+0.03z)| norm 0.2089 (-1.17z)| lr 3.94e-03 | 2028.02 ms | 67.7% bf16 MFU | 260740 tok/s +step 8552/18794 | loss 3.187384 (+1.42z)| norm 0.2846 (-0.39z)| lr 3.94e-03 | 1996.68 ms | 68.7% bf16 MFU | 260832 tok/s +step 8553/18794 | loss 3.170761 (+0.80z)| norm 0.2410 (-0.84z)| lr 3.94e-03 | 2008.68 ms | 68.3% bf16 MFU | 260841 tok/s +step 8554/18794 | loss 3.204517 (+1.97z)| norm 0.2227 (-1.02z)| lr 3.93e-03 | 2014.78 ms | 68.1% bf16 MFU | 260810 tok/s +step 8555/18794 | loss 3.156708 (+0.27z)| norm 0.2292 (-0.94z)| lr 3.93e-03 | 2018.12 ms | 68.0% bf16 MFU | 260759 tok/s +step 8556/18794 | loss 3.159038 (+0.35z)| norm 0.2338 (-0.87z)| lr 3.93e-03 | 2019.43 ms | 68.0% bf16 MFU | 260702 tok/s +step 8557/18794 | loss 3.148742 (-0.01z)| norm 0.2165 (-1.04z)| lr 3.93e-03 | 2020.15 ms | 67.9% bf16 MFU | 260644 tok/s +step 8558/18794 | loss 3.115435 (-1.19z)| norm 0.2507 (-0.68z)| lr 3.93e-03 | 2025.02 ms | 67.8% bf16 MFU | 260557 tok/s +step 8559/18794 | loss 3.119810 (-1.02z)| norm 0.2625 (-0.56z)| lr 3.93e-03 | 2001.81 ms | 68.6% bf16 MFU | 260624 tok/s +step 8560/18794 | loss 3.179220 (+1.07z)| norm 0.1902 (-1.28z)| lr 3.93e-03 | 1992.63 ms | 68.9% bf16 MFU | 260749 tok/s +step 8561/18794 | loss 3.154857 (+0.19z)| norm 0.2789 (-0.38z)| lr 3.93e-03 | 2017.50 ms | 68.0% bf16 MFU | 260705 tok/s +step 8562/18794 | loss 3.177384 (+0.98z)| norm 0.2529 (-0.64z)| lr 3.93e-03 | 2028.11 ms | 67.7% bf16 MFU | 260595 tok/s +step 8563/18794 | loss 3.174271 (+0.85z)| norm 0.3150 (-0.01z)| lr 3.93e-03 | 2011.14 ms | 68.2% bf16 MFU | 260600 tok/s +step 8564/18794 | loss 3.156547 (+0.22z)| norm 0.2864 (-0.31z)| lr 3.93e-03 | 1988.29 ms | 69.0% bf16 MFU | 260754 tok/s +step 8565/18794 | loss 3.095089 (-1.91z)| norm 0.2433 (-0.75z)| lr 3.93e-03 | 2024.49 ms | 67.8% bf16 MFU | 260665 tok/s +step 8566/18794 | loss 3.181622 (+1.12z)| norm 0.1997 (-1.18z)| lr 3.93e-03 | 2024.59 ms | 67.8% bf16 MFU | 260580 tok/s +step 8567/18794 | loss 3.174888 (+0.87z)| norm 0.3373 (+0.22z)| lr 3.93e-03 | 2024.92 ms | 67.8% bf16 MFU | 260497 tok/s +step 8568/18794 | loss 3.154385 (+0.14z)| norm 0.4896 (+1.72z)| lr 3.93e-03 | 1986.24 ms | 69.1% bf16 MFU | 260670 tok/s +step 8569/18794 | loss 3.171950 (+0.75z)| norm 0.2980 (-0.20z)| lr 3.93e-03 | 2009.36 ms | 68.3% bf16 MFU | 260683 tok/s +step 8570/18794 | loss 3.170204 (+0.68z)| norm 0.2027 (-1.13z)| lr 3.93e-03 | 2002.54 ms | 68.5% bf16 MFU | 260739 tok/s +step 8571/18794 | loss 3.112533 (-1.31z)| norm 0.1993 (-1.15z)| lr 3.93e-03 | 2023.41 ms | 67.8% bf16 MFU | 260658 tok/s +step 8572/18794 | loss 3.125042 (-0.87z)| norm 0.3559 (+0.39z)| lr 3.93e-03 | 2005.77 ms | 68.4% bf16 MFU | 260694 tok/s +step 8573/18794 | loss 3.091540 (-2.02z)| norm 0.2828 (-0.33z)| lr 3.93e-03 | 1999.81 ms | 68.6% bf16 MFU | 260768 tok/s +step 8574/18794 | loss 3.169366 (+0.65z)| norm 0.2354 (-0.79z)| lr 3.93e-03 | 2014.71 ms | 68.1% bf16 MFU | 260741 tok/s +step 8575/18794 | loss 3.109860 (-1.44z)| norm 0.2947 (-0.22z)| lr 3.92e-03 | 2008.86 ms | 68.3% bf16 MFU | 260753 tok/s +step 8576/18794 | loss 3.134688 (-0.57z)| norm 0.2595 (-0.56z)| lr 3.92e-03 | 2015.51 ms | 68.1% bf16 MFU | 260722 tok/s +step 8577/18794 | loss 3.126594 (-0.85z)| norm 0.2204 (-0.94z)| lr 3.92e-03 | 2026.15 ms | 67.7% bf16 MFU | 260624 tok/s +step 8578/18794 | loss 3.150086 (-0.03z)| norm 0.2363 (-0.78z)| lr 3.92e-03 | 2010.05 ms | 68.3% bf16 MFU | 260635 tok/s +step 8579/18794 | loss 3.163097 (+0.42z)| norm 0.3481 (+0.31z)| lr 3.92e-03 | 2017.37 ms | 68.0% bf16 MFU | 260597 tok/s +step 8580/18794 | loss 3.112211 (-1.41z)| norm 0.3243 (+0.07z)| lr 3.92e-03 | 2017.48 ms | 68.0% bf16 MFU | 260561 tok/s +step 8581/18794 | loss 3.124244 (-0.96z)| norm 0.2472 (-0.69z)| lr 3.92e-03 | 2033.19 ms | 67.5% bf16 MFU | 260426 tok/s +step 8582/18794 | loss 3.118887 (-1.14z)| norm 0.3152 (-0.02z)| lr 3.92e-03 | 2012.49 ms | 68.2% bf16 MFU | 260431 tok/s +step 8583/18794 | loss 3.100639 (-1.76z)| norm 0.2286 (-0.87z)| lr 3.92e-03 | 2021.72 ms | 67.9% bf16 MFU | 260376 tok/s +step 8584/18794 | loss 3.195868 (+1.55z)| norm 0.5044 (+1.80z)| lr 3.92e-03 | 2031.20 ms | 67.6% bf16 MFU | 260263 tok/s +step 8585/18794 | loss 3.133467 (-0.61z)| norm 0.4725 (+1.46z)| lr 3.92e-03 | 2027.90 ms | 67.7% bf16 MFU | 260176 tok/s +step 8586/18794 | loss 3.147612 (-0.10z)| norm 0.2268 (-0.90z)| lr 3.92e-03 | 2021.14 ms | 67.9% bf16 MFU | 260138 tok/s +step 8587/18794 | loss 3.136257 (-0.49z)| norm 0.3466 (+0.27z)| lr 3.92e-03 | 2014.12 ms | 68.1% bf16 MFU | 260146 tok/s +step 8588/18794 | loss 3.139860 (-0.36z)| norm 0.3574 (+0.36z)| lr 3.92e-03 | 2012.15 ms | 68.2% bf16 MFU | 260167 tok/s +step 8589/18794 | loss 3.234567 (+2.84z)| norm 0.4052 (+0.81z)| lr 3.92e-03 | 2021.45 ms | 67.9% bf16 MFU | 260127 tok/s +step 8590/18794 | loss 3.133219 (-0.62z)| norm 0.2408 (-0.78z)| lr 3.92e-03 | 2021.97 ms | 67.9% bf16 MFU | 260085 tok/s +step 8591/18794 | loss 3.155743 (+0.16z)| norm 0.3159 (-0.06z)| lr 3.92e-03 | 2032.49 ms | 67.5% bf16 MFU | 259978 tok/s +step 8592/18794 | loss 3.110538 (-1.38z)| norm 0.2146 (-1.04z)| lr 3.92e-03 | 2014.58 ms | 68.1% bf16 MFU | 259992 tok/s +step 8593/18794 | loss 3.172123 (+0.71z)| norm 0.3283 (+0.07z)| lr 3.92e-03 | 2000.76 ms | 68.6% bf16 MFU | 260094 tok/s +step 8594/18794 | loss 3.132686 (-0.63z)| norm 0.2968 (-0.23z)| lr 3.92e-03 | 2009.39 ms | 68.3% bf16 MFU | 260136 tok/s +step 8595/18794 | loss 3.098170 (-1.76z)| norm 0.4586 (+1.32z)| lr 3.92e-03 | 2016.23 ms | 68.1% bf16 MFU | 260131 tok/s +step 8596/18794 | loss 3.116752 (-1.12z)| norm 0.2856 (-0.34z)| lr 3.92e-03 | 2031.93 ms | 67.5% bf16 MFU | 260025 tok/s +step 8597/18794 | loss 3.184377 (+1.12z)| norm 0.4571 (+1.30z)| lr 3.91e-03 | 2010.42 ms | 68.3% bf16 MFU | 260063 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.334885 +step 8598/18794 | loss 3.167156 (+0.55z)| norm 0.5740 (+2.33z)| lr 3.91e-03 | 2028.78 ms | 67.6% bf16 MFU | 259981 tok/s +step 8599/18794 | loss 3.192748 (+1.38z)| norm 0.3208 (-0.05z)| lr 3.91e-03 | 2016.99 ms | 68.0% bf16 MFU | 259979 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.047239 +step 8600/18794 | loss 3.157543 (+0.23z)| norm 0.5506 (+2.05z)| lr 3.91e-03 | 2013.69 ms | 68.1% bf16 MFU | 259998 tok/s +step 8601/18794 | loss 3.157104 (+0.22z)| norm 0.3646 (+0.33z)| lr 3.91e-03 | 2006.65 ms | 68.4% bf16 MFU | 260062 tok/s +step 8602/18794 | loss 3.119712 (-1.00z)| norm 0.5104 (+1.65z)| lr 3.91e-03 | 2018.95 ms | 68.0% bf16 MFU | 260043 tok/s +step 8603/18794 | loss 3.176184 (+0.90z)| norm 0.4546 (+1.18z)| lr 3.91e-03 | 2011.75 ms | 68.2% bf16 MFU | 260072 tok/s +step 8604/18794 | loss 3.170745 (+0.72z)| norm 0.3173 (-0.11z)| lr 3.91e-03 | 2008.69 ms | 68.3% bf16 MFU | 260119 tok/s +step 8605/18794 | loss 3.154336 (+0.16z)| norm 0.3051 (-0.23z)| lr 3.91e-03 | 2025.98 ms | 67.7% bf16 MFU | 260052 tok/s +step 8606/18794 | loss 3.151311 (+0.05z)| norm 0.2576 (-0.67z)| lr 3.91e-03 | 2014.43 ms | 68.1% bf16 MFU | 260062 tok/s +step 8607/18794 | loss 3.175887 (+0.87z)| norm 0.3150 (-0.14z)| lr 3.91e-03 | 2003.52 ms | 68.5% bf16 MFU | 260144 tok/s +step 8608/18794 | loss 3.137554 (-0.42z)| norm 0.3583 (+0.28z)| lr 3.91e-03 | 2009.26 ms | 68.3% bf16 MFU | 260183 tok/s +step 8609/18794 | loss 3.140290 (-0.32z)| norm 0.3053 (-0.22z)| lr 3.91e-03 | 2010.42 ms | 68.3% bf16 MFU | 260213 tok/s +step 8610/18794 | loss 3.105053 (-1.48z)| norm 0.3545 (+0.23z)| lr 3.91e-03 | 2019.31 ms | 68.0% bf16 MFU | 260185 tok/s +step 8611/18794 | loss 3.100504 (-1.60z)| norm 0.5122 (+1.71z)| lr 3.91e-03 | 2017.13 ms | 68.0% bf16 MFU | 260171 tok/s +step 8612/18794 | loss 3.118599 (-0.98z)| norm 0.3041 (-0.24z)| lr 3.91e-03 | 2014.30 ms | 68.1% bf16 MFU | 260177 tok/s +step 8613/18794 | loss 3.085368 (-2.02z)| norm 0.3619 (+0.30z)| lr 3.91e-03 | 2006.53 ms | 68.4% bf16 MFU | 260232 tok/s +step 8614/18794 | loss 3.100305 (-1.50z)| norm 0.4817 (+1.41z)| lr 3.91e-03 | 2009.93 ms | 68.3% bf16 MFU | 260263 tok/s +step 8615/18794 | loss 3.093062 (-1.69z)| norm 0.3330 (-0.00z)| lr 3.91e-03 | 2025.57 ms | 67.8% bf16 MFU | 260192 tok/s +step 8616/18794 | loss 3.134660 (-0.35z)| norm 0.3693 (+0.33z)| lr 3.91e-03 | 2012.51 ms | 68.2% bf16 MFU | 260208 tok/s +step 8617/18794 | loss 3.084454 (-1.92z)| norm 0.4651 (+1.22z)| lr 3.91e-03 | 1994.91 ms | 68.8% bf16 MFU | 260338 tok/s +step 8618/18794 | loss 3.126501 (-0.58z)| norm 0.4217 (+0.83z)| lr 3.90e-03 | 2022.09 ms | 67.9% bf16 MFU | 260285 tok/s +step 8619/18794 | loss 3.119814 (-0.78z)| norm 0.2903 (-0.42z)| lr 3.90e-03 | 2010.01 ms | 68.3% bf16 MFU | 260313 tok/s +step 8620/18794 | loss 3.210770 (+2.05z)| norm 0.4946 (+1.54z)| lr 3.90e-03 | 2018.21 ms | 68.0% bf16 MFU | 260286 tok/s +step 8621/18794 | loss 3.115767 (-0.91z)| norm 0.2466 (-0.85z)| lr 3.90e-03 | 2018.19 ms | 68.0% bf16 MFU | 260261 tok/s +step 8622/18794 | loss 3.039428 (-3.10z)| norm 0.3895 (+0.61z)| lr 3.90e-03 | 2017.22 ms | 68.0% bf16 MFU | 260243 tok/s +step 8623/18794 | loss 3.099558 (-1.30z)| norm 0.2984 (-0.33z)| lr 3.90e-03 | 2010.96 ms | 68.2% bf16 MFU | 260267 tok/s +step 8624/18794 | loss 3.179464 (+1.04z)| norm 0.3849 (+0.63z)| lr 3.90e-03 | 2013.01 ms | 68.2% bf16 MFU | 260276 tok/s +step 8625/18794 | loss 3.165431 (+0.62z)| norm 0.4526 (+1.35z)| lr 3.90e-03 | 2011.78 ms | 68.2% bf16 MFU | 260293 tok/s +step 8626/18794 | loss 3.063807 (-2.29z)| norm 0.2645 (-0.70z)| lr 3.90e-03 | 2009.44 ms | 68.3% bf16 MFU | 260324 tok/s +step 8627/18794 | loss 3.130761 (-0.37z)| norm 0.3098 (-0.20z)| lr 3.90e-03 | 2010.57 ms | 68.3% bf16 MFU | 260346 tok/s +step 8628/18794 | loss 3.073534 (-1.95z)| norm 0.3333 (+0.07z)| lr 3.90e-03 | 2017.52 ms | 68.0% bf16 MFU | 260322 tok/s +step 8629/18794 | loss 3.145138 (+0.06z)| norm 0.3701 (+0.48z)| lr 3.90e-03 | 2025.74 ms | 67.7% bf16 MFU | 260246 tok/s +step 8630/18794 | loss 3.139593 (-0.08z)| norm 0.4030 (+0.83z)| lr 3.90e-03 | 2002.34 ms | 68.5% bf16 MFU | 260326 tok/s +mostly skipping update due to grad z-score of 5.309103 +step 8631/18794 | loss 3.133464 (-0.24z)| norm 0.8970 (+5.31z)| lr 3.90e-04 | 2015.62 ms | 68.1% bf16 MFU | 260315 tok/s +step 8632/18794 | loss 3.115223 (-0.76z)| norm 0.2458 (-0.91z)| lr 3.90e-03 | 2018.07 ms | 68.0% bf16 MFU | 260289 tok/s +step 8633/18794 | loss 3.166509 (+0.69z)| norm 0.3052 (-0.25z)| lr 3.90e-03 | 2000.92 ms | 68.6% bf16 MFU | 260376 tok/s +step 8634/18794 | loss 3.253026 (+2.96z)| norm 0.2350 (-1.02z)| lr 3.90e-03 | 2017.69 ms | 68.0% bf16 MFU | 260350 tok/s +step 8635/18794 | loss 3.119951 (-0.62z)| norm 0.2662 (-0.66z)| lr 3.90e-03 | 2010.49 ms | 68.3% bf16 MFU | 260371 tok/s +step 8636/18794 | loss 3.116544 (-0.71z)| norm 0.2606 (-0.71z)| lr 3.90e-03 | 2004.32 ms | 68.5% bf16 MFU | 260431 tok/s +step 8637/18794 | loss 3.139244 (-0.07z)| norm 0.2606 (-0.70z)| lr 3.90e-03 | 2017.60 ms | 68.0% bf16 MFU | 260403 tok/s +step 8638/18794 | loss 3.113722 (-0.77z)| norm 0.3280 (+0.04z)| lr 3.90e-03 | 2017.22 ms | 68.0% bf16 MFU | 260378 tok/s +step 8639/18794 | loss 3.127517 (-0.40z)| norm 0.2874 (-0.40z)| lr 3.90e-03 | 2018.14 ms | 68.0% bf16 MFU | 260348 tok/s +step 8640/18794 | loss 3.165111 (+0.65z)| norm 0.3302 (+0.06z)| lr 3.89e-03 | 1997.22 ms | 68.7% bf16 MFU | 260456 tok/s +step 8641/18794 | loss 3.193085 (+1.40z)| norm 0.2887 (-0.40z)| lr 3.89e-03 | 2003.69 ms | 68.5% bf16 MFU | 260516 tok/s +step 8642/18794 | loss 3.122061 (-0.55z)| norm 0.4012 (+0.84z)| lr 3.89e-03 | 1997.81 ms | 68.7% bf16 MFU | 260612 tok/s +step 8643/18794 | loss 3.140874 (-0.03z)| norm 0.2507 (-0.82z)| lr 3.89e-03 | 2017.59 ms | 68.0% bf16 MFU | 260574 tok/s +step 8644/18794 | loss 3.150057 (+0.23z)| norm 0.4421 (+1.30z)| lr 3.89e-03 | 2017.06 ms | 68.0% bf16 MFU | 260542 tok/s +step 8645/18794 | loss 3.126103 (-0.45z)| norm 0.3088 (-0.17z)| lr 3.89e-03 | 2011.21 ms | 68.2% bf16 MFU | 260549 tok/s +step 8646/18794 | loss 3.106974 (-0.97z)| norm 0.3840 (+0.68z)| lr 3.89e-03 | 2003.64 ms | 68.5% bf16 MFU | 260605 tok/s +step 8647/18794 | loss 3.152554 (+0.31z)| norm 0.4583 (+1.49z)| lr 3.89e-03 | 2018.88 ms | 68.0% bf16 MFU | 260559 tok/s +step 8648/18794 | loss 3.158090 (+0.47z)| norm 0.1846 (-1.54z)| lr 3.89e-03 | 2017.61 ms | 68.0% bf16 MFU | 260524 tok/s +step 8649/18794 | loss 3.089009 (-1.47z)| norm 0.4759 (+1.67z)| lr 3.89e-03 | 2017.45 ms | 68.0% bf16 MFU | 260492 tok/s +step 8650/18794 | loss 3.097665 (-1.20z)| norm 0.2262 (-1.07z)| lr 3.89e-03 | 2010.18 ms | 68.3% bf16 MFU | 260508 tok/s +step 8651/18794 | loss 3.104411 (-1.00z)| norm 0.2771 (-0.51z)| lr 3.89e-03 | 2001.86 ms | 68.6% bf16 MFU | 260578 tok/s +step 8652/18794 | loss 3.132583 (-0.20z)| norm 0.2598 (-0.71z)| lr 3.89e-03 | 2017.38 ms | 68.0% bf16 MFU | 260543 tok/s +step 8653/18794 | loss 3.167391 (+0.79z)| norm 0.2275 (-1.06z)| lr 3.89e-03 | 2019.62 ms | 67.9% bf16 MFU | 260496 tok/s +step 8654/18794 | loss 3.156282 (+0.50z)| norm 0.2162 (-1.18z)| lr 3.89e-03 | 2019.59 ms | 68.0% bf16 MFU | 260451 tok/s +step 8655/18794 | loss 3.092348 (-1.31z)| norm 0.2999 (-0.27z)| lr 3.89e-03 | 2000.75 ms | 68.6% bf16 MFU | 260531 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.902421 +step 8656/18794 | loss 3.133607 (-0.13z)| norm 0.6016 (+2.90z)| lr 3.89e-03 | 2006.08 ms | 68.4% bf16 MFU | 260572 tok/s +step 8657/18794 | loss 3.136215 (-0.05z)| norm 0.3243 (-0.05z)| lr 3.89e-03 | 1993.78 ms | 68.8% bf16 MFU | 260691 tok/s +step 8658/18794 | loss 3.113030 (-0.71z)| norm 0.3613 (+0.33z)| lr 3.89e-03 | 2003.73 ms | 68.5% bf16 MFU | 260739 tok/s +step 8659/18794 | loss 3.081947 (-1.57z)| norm 0.2326 (-1.05z)| lr 3.89e-03 | 2001.54 ms | 68.6% bf16 MFU | 260800 tok/s +step 8660/18794 | loss 3.125188 (-0.34z)| norm 0.2774 (-0.57z)| lr 3.89e-03 | 2001.27 ms | 68.6% bf16 MFU | 260858 tok/s +step 8661/18794 | loss 3.108128 (-0.81z)| norm 0.2655 (-0.71z)| lr 3.88e-03 | 1995.55 ms | 68.8% bf16 MFU | 260952 tok/s +step 8662/18794 | loss 3.123342 (-0.36z)| norm 0.3748 (+0.46z)| lr 3.88e-03 | 2008.69 ms | 68.3% bf16 MFU | 260955 tok/s +step 8663/18794 | loss 3.105907 (-0.84z)| norm 0.2534 (-0.85z)| lr 3.88e-03 | 2001.49 ms | 68.6% bf16 MFU | 261005 tok/s +reducing beta2 to 0.9 and lr/wd by 0.802 due to grad z-score of 4.363618 +step 8664/18794 | loss 3.127463 (-0.22z)| norm 0.7849 (+4.36z)| lr 3.11e-03 | 2009.66 ms | 68.3% bf16 MFU | 260999 tok/s +step 8665/18794 | loss 3.143076 (+0.22z)| norm 0.4573 (+1.15z)| lr 3.88e-03 | 2008.07 ms | 68.3% bf16 MFU | 261003 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.151096 +step 8666/18794 | loss 3.126713 (-0.24z)| norm 0.5683 (+2.15z)| lr 3.88e-03 | 2010.63 ms | 68.3% bf16 MFU | 260991 tok/s +step 8667/18794 | loss 3.078002 (-1.62z)| norm 0.3252 (-0.17z)| lr 3.88e-03 | 2018.08 ms | 68.0% bf16 MFU | 260931 tok/s +step 8668/18794 | loss 3.171701 (+1.08z)| norm 0.3171 (-0.25z)| lr 3.88e-03 | 1998.58 ms | 68.7% bf16 MFU | 261001 tok/s +step 8669/18794 | loss 3.107195 (-0.76z)| norm 0.3716 (+0.29z)| lr 3.88e-03 | 2017.05 ms | 68.0% bf16 MFU | 260947 tok/s +step 8670/18794 | loss 3.109744 (-0.67z)| norm 0.2472 (-0.91z)| lr 3.88e-03 | 1995.75 ms | 68.8% bf16 MFU | 261035 tok/s +step 8671/18794 | loss 3.111975 (-0.61z)| norm 0.2602 (-0.79z)| lr 3.88e-03 | 2004.03 ms | 68.5% bf16 MFU | 261064 tok/s +step 8672/18794 | loss 3.148190 (+0.43z)| norm 0.3450 (+0.02z)| lr 3.88e-03 | 2004.62 ms | 68.5% bf16 MFU | 261088 tok/s +step 8673/18794 | loss 3.159377 (+0.74z)| norm 0.4767 (+1.29z)| lr 3.88e-03 | 2026.18 ms | 67.7% bf16 MFU | 260971 tok/s +step 8674/18794 | loss 3.151638 (+0.52z)| norm 0.3222 (-0.22z)| lr 3.88e-03 | 2002.69 ms | 68.5% bf16 MFU | 261013 tok/s +step 8675/18794 | loss 3.129655 (-0.12z)| norm 0.2214 (-1.20z)| lr 3.88e-03 | 2003.43 ms | 68.5% bf16 MFU | 261047 tok/s +step 8676/18794 | loss 3.128717 (-0.15z)| norm 0.2098 (-1.29z)| lr 3.88e-03 | 1993.63 ms | 68.8% bf16 MFU | 261143 tok/s +step 8677/18794 | loss 3.128997 (-0.14z)| norm 0.2713 (-0.70z)| lr 3.88e-03 | 1994.30 ms | 68.8% bf16 MFU | 261231 tok/s +step 8678/18794 | loss 3.151701 (+0.52z)| norm 0.2694 (-0.73z)| lr 3.88e-03 | 1986.58 ms | 69.1% bf16 MFU | 261365 tok/s +step 8679/18794 | loss 3.107804 (-0.75z)| norm 0.5171 (+1.63z)| lr 3.88e-03 | 2011.62 ms | 68.2% bf16 MFU | 261328 tok/s +step 8680/18794 | loss 3.106555 (-0.78z)| norm 0.3128 (-0.33z)| lr 3.88e-03 | 2001.38 ms | 68.6% bf16 MFU | 261360 tok/s +step 8681/18794 | loss 3.058294 (-2.12z)| norm 0.2537 (-0.89z)| lr 3.88e-03 | 1993.45 ms | 68.8% bf16 MFU | 261442 tok/s +step 8682/18794 | loss 3.081100 (-1.45z)| norm 0.2520 (-0.90z)| lr 3.88e-03 | 2019.19 ms | 68.0% bf16 MFU | 261353 tok/s +step 8683/18794 | loss 3.124403 (-0.23z)| norm 0.2252 (-1.14z)| lr 3.87e-03 | 2010.65 ms | 68.3% bf16 MFU | 261323 tok/s +step 8684/18794 | loss 3.097561 (-0.97z)| norm 0.3306 (-0.15z)| lr 3.87e-03 | 2024.50 ms | 67.8% bf16 MFU | 261205 tok/s +step 8685/18794 | loss 3.132493 (+0.03z)| norm 0.2402 (-1.00z)| lr 3.87e-03 | 2011.15 ms | 68.2% bf16 MFU | 261180 tok/s +step 8686/18794 | loss 3.133941 (+0.08z)| norm 0.3000 (-0.41z)| lr 3.87e-03 | 2018.15 ms | 68.0% bf16 MFU | 261110 tok/s +step 8687/18794 | loss 3.060847 (-1.97z)| norm 0.2169 (-1.21z)| lr 3.87e-03 | 2019.42 ms | 68.0% bf16 MFU | 261036 tok/s +step 8688/18794 | loss 3.150189 (+0.55z)| norm 0.1738 (-1.59z)| lr 3.87e-03 | 2003.54 ms | 68.5% bf16 MFU | 261068 tok/s +step 8689/18794 | loss 3.111550 (-0.53z)| norm 0.2742 (-0.62z)| lr 3.87e-03 | 2010.29 ms | 68.3% bf16 MFU | 261055 tok/s +step 8690/18794 | loss 3.091446 (-1.10z)| norm 0.4461 (+1.01z)| lr 3.87e-03 | 2008.81 ms | 68.3% bf16 MFU | 261052 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.926236 +step 8691/18794 | loss 3.110086 (-0.54z)| norm 0.6638 (+2.93z)| lr 3.87e-03 | 2021.23 ms | 67.9% bf16 MFU | 260969 tok/s +step 8692/18794 | loss 3.081795 (-1.35z)| norm 0.2669 (-0.70z)| lr 3.87e-03 | 2017.72 ms | 68.0% bf16 MFU | 260912 tok/s +step 8693/18794 | loss 3.127564 (-0.01z)| norm 0.2119 (-1.20z)| lr 3.87e-03 | 2011.76 ms | 68.2% bf16 MFU | 260897 tok/s +step 8694/18794 | loss 3.133135 (+0.16z)| norm 0.2245 (-1.07z)| lr 3.87e-03 | 2011.65 ms | 68.2% bf16 MFU | 260884 tok/s +step 8695/18794 | loss 3.135918 (+0.23z)| norm 0.2498 (-0.83z)| lr 3.87e-03 | 2017.65 ms | 68.0% bf16 MFU | 260832 tok/s +step 8696/18794 | loss 3.078395 (-1.44z)| norm 0.2531 (-0.79z)| lr 3.87e-03 | 2003.61 ms | 68.5% bf16 MFU | 260874 tok/s +step 8697/18794 | loss 3.116292 (-0.32z)| norm 0.3032 (-0.34z)| lr 3.87e-03 | 2013.55 ms | 68.2% bf16 MFU | 260849 tok/s +step 8698/18794 | loss 3.148208 (+0.63z)| norm 0.2151 (-1.11z)| lr 3.87e-03 | 2024.61 ms | 67.8% bf16 MFU | 260755 tok/s +step 8699/18794 | loss 3.072913 (-1.58z)| norm 0.2083 (-1.16z)| lr 3.87e-03 | 2012.61 ms | 68.2% bf16 MFU | 260742 tok/s +step 8700/18794 | loss 3.090817 (-1.02z)| norm 0.3385 (+0.04z)| lr 3.87e-03 | 2011.50 ms | 68.2% bf16 MFU | 260737 tok/s +step 8701/18794 | loss 3.207946 (+2.40z)| norm 0.4369 (+0.97z)| lr 3.87e-03 | 2027.20 ms | 67.7% bf16 MFU | 260632 tok/s +step 8702/18794 | loss 3.083011 (-1.22z)| norm 0.2493 (-0.77z)| lr 3.87e-03 | 2019.48 ms | 68.0% bf16 MFU | 260581 tok/s +step 8703/18794 | loss 3.090093 (-1.00z)| norm 0.2156 (-1.07z)| lr 3.87e-03 | 2008.07 ms | 68.3% bf16 MFU | 260606 tok/s +step 8704/18794 | loss 3.155062 (+0.90z)| norm 0.3442 (+0.16z)| lr 3.86e-03 | 2007.23 ms | 68.4% bf16 MFU | 260636 tok/s +step 8705/18794 | loss 3.105938 (-0.52z)| norm 0.2367 (-0.85z)| lr 3.86e-03 | 2013.99 ms | 68.1% bf16 MFU | 260620 tok/s +step 8706/18794 | loss 3.094818 (-0.83z)| norm 0.2196 (-1.00z)| lr 3.86e-03 | 1999.79 ms | 68.6% bf16 MFU | 260698 tok/s +step 8707/18794 | loss 3.112805 (-0.29z)| norm 0.2878 (-0.36z)| lr 3.86e-03 | 2012.36 ms | 68.2% bf16 MFU | 260690 tok/s +step 8708/18794 | loss 3.119381 (-0.09z)| norm 0.3008 (-0.24z)| lr 3.86e-03 | 2007.38 ms | 68.4% bf16 MFU | 260714 tok/s +step 8709/18794 | loss 3.147125 (+0.73z)| norm 0.2767 (-0.46z)| lr 3.86e-03 | 2006.15 ms | 68.4% bf16 MFU | 260746 tok/s +step 8710/18794 | loss 3.155620 (+0.97z)| norm 0.2105 (-1.07z)| lr 3.86e-03 | 2007.01 ms | 68.4% bf16 MFU | 260770 tok/s +step 8711/18794 | loss 3.123734 (+0.02z)| norm 0.2984 (-0.24z)| lr 3.86e-03 | 2016.50 ms | 68.1% bf16 MFU | 260731 tok/s +step 8712/18794 | loss 3.048502 (-2.14z)| norm 0.4641 (+1.33z)| lr 3.86e-03 | 2013.92 ms | 68.1% bf16 MFU | 260711 tok/s +step 8713/18794 | loss 3.131298 (+0.24z)| norm 0.4728 (+1.38z)| lr 3.86e-03 | 2008.19 ms | 68.3% bf16 MFU | 260729 tok/s +step 8714/18794 | loss 3.136251 (+0.38z)| norm 0.3103 (-0.13z)| lr 3.86e-03 | 2025.99 ms | 67.7% bf16 MFU | 260632 tok/s +step 8715/18794 | loss 3.108294 (-0.44z)| norm 0.2777 (-0.42z)| lr 3.86e-03 | 1996.42 ms | 68.7% bf16 MFU | 260731 tok/s +step 8716/18794 | loss 3.121877 (-0.04z)| norm 0.2896 (-0.31z)| lr 3.86e-03 | 2023.16 ms | 67.8% bf16 MFU | 260652 tok/s +step 8717/18794 | loss 3.105734 (-0.52z)| norm 0.3232 (+0.01z)| lr 3.86e-03 | 2017.41 ms | 68.0% bf16 MFU | 260613 tok/s +step 8718/18794 | loss 3.106970 (-0.48z)| norm 0.2604 (-0.56z)| lr 3.86e-03 | 2001.71 ms | 68.6% bf16 MFU | 260678 tok/s +step 8719/18794 | loss 3.127213 (+0.11z)| norm 0.2271 (-0.86z)| lr 3.86e-03 | 2003.72 ms | 68.5% bf16 MFU | 260727 tok/s +step 8720/18794 | loss 3.115323 (-0.22z)| norm 0.3943 (+0.72z)| lr 3.86e-03 | 2015.25 ms | 68.1% bf16 MFU | 260699 tok/s +step 8721/18794 | loss 3.115511 (-0.21z)| norm 0.3044 (-0.12z)| lr 3.86e-03 | 2017.66 ms | 68.0% bf16 MFU | 260657 tok/s +step 8722/18794 | loss 3.145433 (+0.69z)| norm 0.2840 (-0.32z)| lr 3.86e-03 | 2003.43 ms | 68.5% bf16 MFU | 260709 tok/s +step 8723/18794 | loss 3.098856 (-0.77z)| norm 0.2466 (-0.67z)| lr 3.86e-03 | 2011.46 ms | 68.2% bf16 MFU | 260706 tok/s +step 8724/18794 | loss 3.119720 (-0.10z)| norm 0.2001 (-1.10z)| lr 3.86e-03 | 2020.19 ms | 67.9% bf16 MFU | 260646 tok/s +step 8725/18794 | loss 3.195919 (+2.27z)| norm 0.3385 (+0.23z)| lr 3.86e-03 | 2011.29 ms | 68.2% bf16 MFU | 260648 tok/s +step 8726/18794 | loss 3.127285 (+0.11z)| norm 0.3361 (+0.22z)| lr 3.85e-03 | 2020.03 ms | 67.9% bf16 MFU | 260593 tok/s +step 8727/18794 | loss 3.060987 (-1.94z)| norm 0.2588 (-0.53z)| lr 3.85e-03 | 2029.82 ms | 67.6% bf16 MFU | 260478 tok/s +step 8728/18794 | loss 3.114222 (-0.29z)| norm 0.4198 (+1.02z)| lr 3.85e-03 | 2018.34 ms | 68.0% bf16 MFU | 260442 tok/s +reducing beta2 to 0.9 and lr/wd by 0.757 due to grad z-score of 4.625447 +step 8729/18794 | loss 3.080770 (-1.32z)| norm 0.8601 (+4.63z)| lr 2.92e-03 | 2003.86 ms | 68.5% bf16 MFU | 260502 tok/s +step 8730/18794 | loss 3.104439 (-0.57z)| norm 0.2209 (-0.83z)| lr 3.85e-03 | 2045.90 ms | 67.1% bf16 MFU | 260290 tok/s +step 8731/18794 | loss 3.114618 (-0.24z)| norm 0.2993 (-0.15z)| lr 3.85e-03 | 2010.84 ms | 68.2% bf16 MFU | 260312 tok/s +step 8732/18794 | loss 3.108339 (-0.44z)| norm 0.4220 (+0.88z)| lr 3.85e-03 | 2000.66 ms | 68.6% bf16 MFU | 260399 tok/s +step 8733/18794 | loss 3.079011 (-1.33z)| norm 0.1934 (-1.06z)| lr 3.85e-03 | 2014.72 ms | 68.1% bf16 MFU | 260390 tok/s +step 8734/18794 | loss 3.112530 (-0.26z)| norm 0.2061 (-0.94z)| lr 3.85e-03 | 2026.28 ms | 67.7% bf16 MFU | 260308 tok/s +step 8735/18794 | loss 3.104781 (-0.52z)| norm 0.2295 (-0.74z)| lr 3.85e-03 | 1992.43 ms | 68.9% bf16 MFU | 260450 tok/s +step 8736/18794 | loss 3.169619 (+1.67z)| norm 0.2197 (-0.82z)| lr 3.85e-03 | 2005.97 ms | 68.4% bf16 MFU | 260496 tok/s +step 8737/18794 | loss 3.107412 (-0.43z)| norm 0.2777 (-0.33z)| lr 3.85e-03 | 2016.24 ms | 68.1% bf16 MFU | 260472 tok/s +step 8738/18794 | loss 3.094528 (-0.86z)| norm 0.2380 (-0.66z)| lr 3.85e-03 | 2025.71 ms | 67.7% bf16 MFU | 260390 tok/s +step 8739/18794 | loss 3.148666 (+0.96z)| norm 0.2032 (-0.94z)| lr 3.85e-03 | 2006.34 ms | 68.4% bf16 MFU | 260436 tok/s +step 8740/18794 | loss 3.105437 (-0.48z)| norm 0.2117 (-0.85z)| lr 3.85e-03 | 2016.02 ms | 68.1% bf16 MFU | 260417 tok/s +step 8741/18794 | loss 3.099827 (-0.66z)| norm 0.2332 (-0.67z)| lr 3.85e-03 | 2006.89 ms | 68.4% bf16 MFU | 260458 tok/s +step 8742/18794 | loss 3.132068 (+0.47z)| norm 0.2639 (-0.40z)| lr 3.85e-03 | 2014.20 ms | 68.1% bf16 MFU | 260450 tok/s +step 8743/18794 | loss 3.084309 (-1.18z)| norm 0.2399 (-0.60z)| lr 3.85e-03 | 1991.90 ms | 68.9% bf16 MFU | 260588 tok/s +step 8744/18794 | loss 3.138215 (+0.71z)| norm 0.2009 (-0.91z)| lr 3.85e-03 | 2004.07 ms | 68.5% bf16 MFU | 260639 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.701748 +step 8745/18794 | loss 3.113999 (-0.13z)| norm 0.6481 (+2.70z)| lr 3.85e-03 | 1997.15 ms | 68.7% bf16 MFU | 260733 tok/s +step 8746/18794 | loss 3.120587 (+0.09z)| norm 0.2765 (-0.28z)| lr 3.85e-03 | 2020.46 ms | 67.9% bf16 MFU | 260671 tok/s +step 8747/18794 | loss 3.127748 (+0.35z)| norm 0.4022 (+0.74z)| lr 3.84e-03 | 2016.67 ms | 68.0% bf16 MFU | 260636 tok/s +step 8748/18794 | loss 3.106811 (-0.37z)| norm 0.2501 (-0.50z)| lr 3.84e-03 | 2016.18 ms | 68.1% bf16 MFU | 260607 tok/s +step 8749/18794 | loss 3.105207 (-0.44z)| norm 0.5069 (+1.58z)| lr 3.84e-03 | 2019.68 ms | 67.9% bf16 MFU | 260556 tok/s +step 8750/18794 | loss 3.116091 (-0.05z)| norm 0.2550 (-0.47z)| lr 3.84e-03 | 1996.31 ms | 68.7% bf16 MFU | 260659 tok/s +val loss 3.145149 +HellaSwag: 2908/10042 = 0.289584: 0/1256 +step 8751/18794 | loss 3.108209 (-0.34z)| norm 0.3095 (-0.03z)| lr 3.84e-03 | 2015.08 ms | 68.1% bf16 MFU | 260636 tok/s +step 8752/18794 | loss 3.142871 (+0.90z)| norm 0.2765 (-0.30z)| lr 3.84e-03 | 2015.01 ms | 68.1% bf16 MFU | 260613 tok/s +step 8753/18794 | loss 3.112314 (-0.18z)| norm 0.3073 (-0.05z)| lr 3.84e-03 | 2007.22 ms | 68.4% bf16 MFU | 260643 tok/s +step 8754/18794 | loss 3.114391 (-0.09z)| norm 0.1974 (-0.94z)| lr 3.84e-03 | 2000.25 ms | 68.6% bf16 MFU | 260716 tok/s +step 8755/18794 | loss 3.087576 (-1.07z)| norm 0.2575 (-0.45z)| lr 3.84e-03 | 2026.87 ms | 67.7% bf16 MFU | 260614 tok/s +step 8756/18794 | loss 3.137110 (+0.75z)| norm 0.2231 (-0.72z)| lr 3.84e-03 | 2000.46 ms | 68.6% bf16 MFU | 260687 tok/s +step 8757/18794 | loss 3.149980 (+1.21z)| norm 0.2197 (-0.74z)| lr 3.84e-03 | 1997.13 ms | 68.7% bf16 MFU | 260779 tok/s +step 8758/18794 | loss 3.104694 (-0.44z)| norm 0.2417 (-0.54z)| lr 3.84e-03 | 2015.69 ms | 68.1% bf16 MFU | 260745 tok/s +step 8759/18794 | loss 3.078056 (-1.41z)| norm 0.2576 (-0.41z)| lr 3.84e-03 | 2004.65 ms | 68.5% bf16 MFU | 260785 tok/s +step 8760/18794 | loss 3.116362 (-0.01z)| norm 0.1932 (-0.94z)| lr 3.84e-03 | 2001.96 ms | 68.5% bf16 MFU | 260840 tok/s +step 8761/18794 | loss 3.095125 (-0.78z)| norm 0.3701 (+0.52z)| lr 3.84e-03 | 2016.88 ms | 68.0% bf16 MFU | 260795 tok/s +step 8762/18794 | loss 3.125863 (+0.34z)| norm 0.3770 (+0.58z)| lr 3.84e-03 | 2007.30 ms | 68.4% bf16 MFU | 260815 tok/s +step 8763/18794 | loss 3.100990 (-0.56z)| norm 0.3570 (+0.40z)| lr 3.84e-03 | 2041.67 ms | 67.2% bf16 MFU | 260614 tok/s +step 8764/18794 | loss 3.116187 (-0.01z)| norm 0.2948 (-0.08z)| lr 3.84e-03 | 2007.85 ms | 68.3% bf16 MFU | 260639 tok/s +step 8765/18794 | loss 3.109297 (-0.25z)| norm 0.3258 (+0.21z)| lr 3.84e-03 | 2018.17 ms | 68.0% bf16 MFU | 260597 tok/s +step 8766/18794 | loss 3.091368 (-0.89z)| norm 0.4232 (+1.14z)| lr 3.84e-03 | 2010.94 ms | 68.2% bf16 MFU | 260603 tok/s +step 8767/18794 | loss 3.123904 (+0.29z)| norm 0.4753 (+1.59z)| lr 3.84e-03 | 1988.87 ms | 69.0% bf16 MFU | 260753 tok/s +step 8768/18794 | loss 3.084074 (-1.16z)| norm 0.2069 (-0.87z)| lr 3.84e-03 | 2006.91 ms | 68.4% bf16 MFU | 260777 tok/s +step 8769/18794 | loss 3.131324 (+0.59z)| norm 0.4029 (+0.93z)| lr 3.83e-03 | 1994.12 ms | 68.8% bf16 MFU | 260884 tok/s +step 8770/18794 | loss 3.123232 (+0.28z)| norm 0.3359 (+0.31z)| lr 3.83e-03 | 2015.11 ms | 68.1% bf16 MFU | 260849 tok/s +step 8771/18794 | loss 3.085642 (-1.10z)| norm 0.2396 (-0.57z)| lr 3.83e-03 | 2021.52 ms | 67.9% bf16 MFU | 260774 tok/s +step 8772/18794 | loss 3.229907 (+3.90z)| norm 0.3194 (+0.16z)| lr 3.83e-03 | 2019.65 ms | 67.9% bf16 MFU | 260715 tok/s +step 8773/18794 | loss 3.158949 (+1.47z)| norm 0.2785 (-0.20z)| lr 3.83e-03 | 2013.17 ms | 68.2% bf16 MFU | 260701 tok/s +step 8774/18794 | loss 3.125276 (+0.32z)| norm 0.2451 (-0.50z)| lr 3.83e-03 | 2020.02 ms | 67.9% bf16 MFU | 260643 tok/s +step 8775/18794 | loss 3.094731 (-0.72z)| norm 0.2530 (-0.43z)| lr 3.83e-03 | 2013.91 ms | 68.1% bf16 MFU | 260628 tok/s +step 8776/18794 | loss 3.043568 (-2.39z)| norm 0.2477 (-0.49z)| lr 3.83e-03 | 2012.07 ms | 68.2% bf16 MFU | 260625 tok/s +step 8777/18794 | loss 3.129061 (+0.48z)| norm 0.3355 (+0.32z)| lr 3.83e-03 | 2025.68 ms | 67.7% bf16 MFU | 260535 tok/s +step 8778/18794 | loss 3.144471 (+1.00z)| norm 0.2802 (-0.19z)| lr 3.83e-03 | 2011.67 ms | 68.2% bf16 MFU | 260539 tok/s +step 8779/18794 | loss 3.115038 (+0.01z)| norm 0.2972 (-0.01z)| lr 3.83e-03 | 2009.59 ms | 68.3% bf16 MFU | 260557 tok/s +step 8780/18794 | loss 3.135858 (+0.70z)| norm 0.2395 (-0.55z)| lr 3.83e-03 | 2012.55 ms | 68.2% bf16 MFU | 260554 tok/s +step 8781/18794 | loss 3.230633 (+3.62z)| norm 0.3168 (+0.17z)| lr 3.83e-03 | 2027.19 ms | 67.7% bf16 MFU | 260458 tok/s +step 8782/18794 | loss 3.117654 (+0.02z)| norm 0.3618 (+0.59z)| lr 3.83e-03 | 2026.12 ms | 67.7% bf16 MFU | 260373 tok/s +step 8783/18794 | loss 3.136577 (+0.62z)| norm 0.3244 (+0.23z)| lr 3.83e-03 | 2010.83 ms | 68.2% bf16 MFU | 260391 tok/s +step 8784/18794 | loss 3.108267 (-0.29z)| norm 0.3561 (+0.53z)| lr 3.83e-03 | 2018.83 ms | 68.0% bf16 MFU | 260357 tok/s +step 8785/18794 | loss 3.095112 (-0.70z)| norm 0.2765 (-0.23z)| lr 3.83e-03 | 2027.78 ms | 67.7% bf16 MFU | 260267 tok/s +step 8786/18794 | loss 3.101185 (-0.49z)| norm 0.3621 (+0.57z)| lr 3.83e-03 | 2018.25 ms | 68.0% bf16 MFU | 260242 tok/s +step 8787/18794 | loss 3.099461 (-0.57z)| norm 0.3100 (+0.07z)| lr 3.83e-03 | 2005.64 ms | 68.4% bf16 MFU | 260300 tok/s +step 8788/18794 | loss 3.194990 (+2.45z)| norm 0.3191 (+0.14z)| lr 3.83e-03 | 2015.03 ms | 68.1% bf16 MFU | 260295 tok/s +step 8789/18794 | loss 3.116361 (-0.04z)| norm 0.4281 (+1.17z)| lr 3.83e-03 | 2018.78 ms | 68.0% bf16 MFU | 260265 tok/s +step 8790/18794 | loss 3.105105 (-0.40z)| norm 0.3919 (+0.83z)| lr 3.82e-03 | 2020.78 ms | 67.9% bf16 MFU | 260224 tok/s +step 8791/18794 | loss 3.146572 (+0.90z)| norm 0.2922 (-0.09z)| lr 3.82e-03 | 2022.98 ms | 67.8% bf16 MFU | 260171 tok/s +step 8792/18794 | loss 3.101794 (-0.52z)| norm 0.4297 (+1.28z)| lr 3.82e-03 | 2016.35 ms | 68.1% bf16 MFU | 260164 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.998374 +step 8793/18794 | loss 3.125231 (+0.22z)| norm 0.6157 (+3.00z)| lr 3.82e-03 | 2014.44 ms | 68.1% bf16 MFU | 260169 tok/s +step 8794/18794 | loss 3.077865 (-1.26z)| norm 0.3519 (+0.42z)| lr 3.82e-03 | 2025.53 ms | 67.8% bf16 MFU | 260102 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.572956 +step 8795/18794 | loss 3.109824 (-0.24z)| norm 0.5848 (+2.57z)| lr 3.82e-03 | 2018.36 ms | 68.0% bf16 MFU | 260085 tok/s +step 8796/18794 | loss 3.140573 (+0.72z)| norm 0.3866 (+0.69z)| lr 3.82e-03 | 2016.18 ms | 68.1% bf16 MFU | 260083 tok/s +step 8797/18794 | loss 3.170240 (+1.62z)| norm 0.4585 (+1.34z)| lr 3.82e-03 | 2018.65 ms | 68.0% bf16 MFU | 260065 tok/s +step 8798/18794 | loss 3.169283 (+1.57z)| norm 0.3183 (+0.03z)| lr 3.82e-03 | 2018.95 ms | 68.0% bf16 MFU | 260046 tok/s +step 8799/18794 | loss 3.089558 (-0.92z)| norm 0.2753 (-0.39z)| lr 3.82e-03 | 2018.55 ms | 68.0% bf16 MFU | 260030 tok/s +step 8800/18794 | loss 3.175644 (+1.73z)| norm 0.3803 (+0.60z)| lr 3.82e-03 | 2012.75 ms | 68.2% bf16 MFU | 260053 tok/s +step 8801/18794 | loss 3.164617 (+1.44z)| norm 0.2776 (-0.35z)| lr 3.82e-03 | 2002.69 ms | 68.5% bf16 MFU | 260140 tok/s +step 8802/18794 | loss 3.148066 (+0.90z)| norm 0.2698 (-0.43z)| lr 3.82e-03 | 2025.01 ms | 67.8% bf16 MFU | 260078 tok/s +step 8803/18794 | loss 3.124737 (+0.14z)| norm 0.4297 (+1.06z)| lr 3.82e-03 | 2017.79 ms | 68.0% bf16 MFU | 260066 tok/s +step 8804/18794 | loss 3.139714 (+0.63z)| norm 0.2835 (-0.31z)| lr 3.82e-03 | 2004.89 ms | 68.4% bf16 MFU | 260138 tok/s +step 8805/18794 | loss 3.156728 (+1.15z)| norm 0.2892 (-0.27z)| lr 3.82e-03 | 2009.22 ms | 68.3% bf16 MFU | 260178 tok/s +step 8806/18794 | loss 3.127440 (+0.21z)| norm 0.4377 (+1.12z)| lr 3.82e-03 | 2021.49 ms | 67.9% bf16 MFU | 260137 tok/s +step 8807/18794 | loss 3.155505 (+1.09z)| norm 0.4582 (+1.28z)| lr 3.82e-03 | 2021.38 ms | 67.9% bf16 MFU | 260099 tok/s +step 8808/18794 | loss 3.169464 (+1.50z)| norm 0.3830 (+0.57z)| lr 3.82e-03 | 2006.57 ms | 68.4% bf16 MFU | 260158 tok/s +step 8809/18794 | loss 3.122334 (+0.02z)| norm 0.2477 (-0.69z)| lr 3.82e-03 | 2003.60 ms | 68.5% bf16 MFU | 260234 tok/s +step 8810/18794 | loss 3.092198 (-0.92z)| norm 0.4761 (+1.41z)| lr 3.82e-03 | 2025.14 ms | 67.8% bf16 MFU | 260167 tok/s +step 8811/18794 | loss 3.068031 (-1.64z)| norm 0.4273 (+0.94z)| lr 3.81e-03 | 2017.58 ms | 68.0% bf16 MFU | 260151 tok/s +step 8812/18794 | loss 3.144355 (+0.73z)| norm 0.2650 (-0.55z)| lr 3.81e-03 | 2016.56 ms | 68.1% bf16 MFU | 260143 tok/s +step 8813/18794 | loss 3.138740 (+0.55z)| norm 0.3008 (-0.20z)| lr 3.81e-03 | 2016.13 ms | 68.1% bf16 MFU | 260138 tok/s +step 8814/18794 | loss 3.084281 (-1.17z)| norm 0.2593 (-0.59z)| lr 3.81e-03 | 2006.16 ms | 68.4% bf16 MFU | 260198 tok/s +step 8815/18794 | loss 3.164819 (+1.36z)| norm 0.2772 (-0.42z)| lr 3.81e-03 | 2010.14 ms | 68.3% bf16 MFU | 260230 tok/s +step 8816/18794 | loss 3.090670 (-0.96z)| norm 0.2004 (-1.12z)| lr 3.81e-03 | 2017.48 ms | 68.0% bf16 MFU | 260212 tok/s +step 8817/18794 | loss 3.208115 (+2.60z)| norm 0.2124 (-1.00z)| lr 3.81e-03 | 2004.32 ms | 68.5% bf16 MFU | 260280 tok/s +step 8818/18794 | loss 3.128834 (+0.19z)| norm 0.2232 (-0.89z)| lr 3.81e-03 | 2024.60 ms | 67.8% bf16 MFU | 260214 tok/s +step 8819/18794 | loss 3.144938 (+0.67z)| norm 0.3036 (-0.15z)| lr 3.81e-03 | 2011.30 ms | 68.2% bf16 MFU | 260237 tok/s +step 8820/18794 | loss 3.084437 (-1.14z)| norm 0.3128 (-0.06z)| lr 3.81e-03 | 2010.31 ms | 68.3% bf16 MFU | 260265 tok/s +step 8821/18794 | loss 3.094545 (-0.83z)| norm 0.2900 (-0.27z)| lr 3.81e-03 | 2001.92 ms | 68.6% bf16 MFU | 260346 tok/s +step 8822/18794 | loss 3.073079 (-1.44z)| norm 0.3558 (+0.34z)| lr 3.81e-03 | 1997.72 ms | 68.7% bf16 MFU | 260451 tok/s +step 8823/18794 | loss 3.105732 (-0.47z)| norm 0.4010 (+0.74z)| lr 3.81e-03 | 2025.58 ms | 67.7% bf16 MFU | 260370 tok/s +step 8824/18794 | loss 3.112672 (-0.26z)| norm 0.3454 (+0.21z)| lr 3.81e-03 | 2010.54 ms | 68.3% bf16 MFU | 260390 tok/s +step 8825/18794 | loss 3.117262 (-0.10z)| norm 0.2968 (-0.24z)| lr 3.81e-03 | 2014.48 ms | 68.1% bf16 MFU | 260384 tok/s +step 8826/18794 | loss 3.129066 (+0.26z)| norm 0.2586 (-0.59z)| lr 3.81e-03 | 2020.21 ms | 67.9% bf16 MFU | 260341 tok/s +step 8827/18794 | loss 3.095086 (-0.80z)| norm 0.3018 (-0.19z)| lr 3.81e-03 | 2016.80 ms | 68.0% bf16 MFU | 260322 tok/s +step 8828/18794 | loss 3.148890 (+0.85z)| norm 0.3674 (+0.43z)| lr 3.81e-03 | 2018.63 ms | 68.0% bf16 MFU | 260292 tok/s +step 8829/18794 | loss 3.167400 (+1.39z)| norm 0.3028 (-0.14z)| lr 3.81e-03 | 1996.51 ms | 68.7% bf16 MFU | 260407 tok/s +step 8830/18794 | loss 3.179493 (+1.72z)| norm 0.3866 (+0.76z)| lr 3.81e-03 | 2017.96 ms | 68.0% bf16 MFU | 260378 tok/s +step 8831/18794 | loss 3.127096 (+0.12z)| norm 0.3036 (-0.15z)| lr 3.81e-03 | 2002.68 ms | 68.5% bf16 MFU | 260448 tok/s +step 8832/18794 | loss 3.103857 (-0.58z)| norm 0.3235 (+0.08z)| lr 3.81e-03 | 2017.78 ms | 68.0% bf16 MFU | 260418 tok/s +step 8833/18794 | loss 3.133036 (+0.29z)| norm 0.3353 (+0.19z)| lr 3.80e-03 | 2009.96 ms | 68.3% bf16 MFU | 260439 tok/s +step 8834/18794 | loss 3.055405 (-2.03z)| norm 0.3493 (+0.34z)| lr 3.80e-03 | 2005.14 ms | 68.4% bf16 MFU | 260491 tok/s +step 8835/18794 | loss 3.138233 (+0.45z)| norm 0.3688 (+0.54z)| lr 3.80e-03 | 2002.16 ms | 68.5% bf16 MFU | 260559 tok/s +step 8836/18794 | loss 3.110637 (-0.37z)| norm 0.3157 (-0.07z)| lr 3.80e-03 | 2013.31 ms | 68.2% bf16 MFU | 260552 tok/s +step 8837/18794 | loss 3.114954 (-0.24z)| norm 0.2565 (-0.74z)| lr 3.80e-03 | 1994.50 ms | 68.8% bf16 MFU | 260667 tok/s +step 8838/18794 | loss 3.223873 (+2.91z)| norm 0.3753 (+0.60z)| lr 3.80e-03 | 2002.96 ms | 68.5% bf16 MFU | 260722 tok/s +mostly skipping update due to grad z-score of 5.456226 +step 8839/18794 | loss 3.105617 (-0.53z)| norm 0.8989 (+5.46z)| lr 3.80e-04 | 2011.33 ms | 68.2% bf16 MFU | 260719 tok/s +step 8840/18794 | loss 3.155039 (+0.90z)| norm 0.3703 (+0.52z)| lr 3.80e-03 | 2018.33 ms | 68.0% bf16 MFU | 260671 tok/s +step 8841/18794 | loss 3.098045 (-0.76z)| norm 0.4431 (+1.33z)| lr 3.80e-03 | 1996.30 ms | 68.7% bf16 MFU | 260769 tok/s +reducing beta2 to 0.9 and lr/wd by 0.918 due to grad z-score of 3.814491 +step 8842/18794 | loss 3.161764 (+1.08z)| norm 0.6902 (+3.81z)| lr 3.49e-03 | 2001.22 ms | 68.6% bf16 MFU | 260830 tok/s +step 8843/18794 | loss 3.191440 (+1.88z)| norm 0.4827 (+1.57z)| lr 3.80e-03 | 2016.25 ms | 68.1% bf16 MFU | 260790 tok/s +reducing beta2 to 0.9 and lr/wd by 0.992 due to grad z-score of 3.529269 +step 8844/18794 | loss 3.109254 (-0.46z)| norm 0.6950 (+3.53z)| lr 3.77e-03 | 2010.72 ms | 68.3% bf16 MFU | 260788 tok/s +step 8845/18794 | loss 3.159065 (+0.95z)| norm 0.4872 (+1.44z)| lr 3.80e-03 | 2012.42 ms | 68.2% bf16 MFU | 260775 tok/s +step 8846/18794 | loss 3.115630 (-0.29z)| norm 0.4157 (+0.80z)| lr 3.80e-03 | 2001.73 ms | 68.6% bf16 MFU | 260832 tok/s +step 8847/18794 | loss 3.119225 (-0.18z)| norm 0.4844 (+1.47z)| lr 3.80e-03 | 2002.44 ms | 68.5% bf16 MFU | 260882 tok/s +step 8848/18794 | loss 3.146423 (+0.58z)| norm 0.4990 (+1.59z)| lr 3.80e-03 | 1995.16 ms | 68.8% bf16 MFU | 260977 tok/s +mostly skipping update due to grad z-score of 5.840559 +step 8849/18794 | loss 3.095775 (-0.85z)| norm 1.0586 (+5.84z)| lr 3.80e-04 | 2010.44 ms | 68.3% bf16 MFU | 260967 tok/s +step 8850/18794 | loss 3.086258 (-1.11z)| norm 0.3553 (+0.13z)| lr 3.80e-03 | 2003.48 ms | 68.5% bf16 MFU | 261003 tok/s +step 8851/18794 | loss 3.141345 (+0.44z)| norm 0.5217 (+1.81z)| lr 3.80e-03 | 2015.27 ms | 68.1% bf16 MFU | 260961 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.380538 +step 8852/18794 | loss 3.084786 (-1.14z)| norm 0.7000 (+3.38z)| lr 3.80e-03 | 2020.62 ms | 67.9% bf16 MFU | 260886 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.014715 +step 8853/18794 | loss 3.134728 (+0.26z)| norm 0.5644 (+2.01z)| lr 3.80e-03 | 2011.39 ms | 68.2% bf16 MFU | 260875 tok/s +step 8854/18794 | loss 3.156270 (+0.85z)| norm 0.5049 (+1.42z)| lr 3.79e-03 | 2025.33 ms | 67.8% bf16 MFU | 260774 tok/s +reducing beta2 to 0.9 and lr/wd by 0.767 due to grad z-score of 4.562769 +step 8855/18794 | loss 3.144407 (+0.50z)| norm 0.9087 (+4.56z)| lr 2.91e-03 | 2002.96 ms | 68.5% bf16 MFU | 260823 tok/s +step 8856/18794 | loss 3.174283 (+1.32z)| norm 0.2647 (-0.79z)| lr 3.79e-03 | 2011.18 ms | 68.2% bf16 MFU | 260816 tok/s +step 8857/18794 | loss 3.169174 (+1.17z)| norm 0.4516 (+0.76z)| lr 3.79e-03 | 2018.69 ms | 68.0% bf16 MFU | 260761 tok/s +step 8858/18794 | loss 3.178090 (+1.38z)| norm 0.3092 (-0.44z)| lr 3.79e-03 | 2009.39 ms | 68.3% bf16 MFU | 260769 tok/s +step 8859/18794 | loss 3.166438 (+1.04z)| norm 0.5768 (+1.76z)| lr 3.79e-03 | 2026.53 ms | 67.7% bf16 MFU | 260666 tok/s +step 8860/18794 | loss 3.089063 (-1.08z)| norm 0.4935 (+1.05z)| lr 3.79e-03 | 2002.71 ms | 68.5% bf16 MFU | 260723 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.053525 +step 8861/18794 | loss 3.117437 (-0.31z)| norm 0.7568 (+3.05z)| lr 3.79e-03 | 2026.83 ms | 67.7% bf16 MFU | 260620 tok/s +step 8862/18794 | loss 3.177225 (+1.31z)| norm 0.5858 (+1.66z)| lr 3.79e-03 | 2018.80 ms | 68.0% bf16 MFU | 260574 tok/s +mostly skipping update due to grad z-score of 8.047918 +step 8863/18794 | loss 3.189152 (+1.60z)| norm 2.1217 (+8.05z)| lr 3.79e-04 | 2011.72 ms | 68.2% bf16 MFU | 260576 tok/s +mostly skipping update due to grad z-score of 9.130242 +step 8864/18794 | loss 3.130639 (+0.01z)| norm 3.3055 (+9.13z)| lr 3.79e-04 | 2003.05 ms | 68.5% bf16 MFU | 260635 tok/s +step 8865/18794 | loss 3.119050 (-0.31z)| norm 0.5015 (+0.30z)| lr 3.79e-03 | 2009.73 ms | 68.3% bf16 MFU | 260647 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.373868 +step 8866/18794 | loss 3.158369 (+0.75z)| norm 1.1922 (+2.37z)| lr 3.79e-03 | 2003.87 ms | 68.5% bf16 MFU | 260696 tok/s +reducing beta2 to 0.9 and lr/wd by 0.854 due to grad z-score of 4.096175 +step 8867/18794 | loss 3.098742 (-0.87z)| norm 1.9009 (+4.10z)| lr 3.24e-03 | 2016.28 ms | 68.1% bf16 MFU | 260663 tok/s +step 8868/18794 | loss 3.187142 (+1.50z)| norm 0.4136 (-0.05z)| lr 3.79e-03 | 2026.11 ms | 67.7% bf16 MFU | 260568 tok/s +step 8869/18794 | loss 3.183336 (+1.37z)| norm 0.5182 (+0.24z)| lr 3.79e-03 | 1987.08 ms | 69.1% bf16 MFU | 260732 tok/s +step 8870/18794 | loss 3.125025 (-0.20z)| norm 1.0356 (+1.64z)| lr 3.79e-03 | 2009.77 ms | 68.3% bf16 MFU | 260739 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.146464 +step 8871/18794 | loss 3.151086 (+0.49z)| norm 1.6570 (+3.15z)| lr 3.79e-03 | 2001.83 ms | 68.6% bf16 MFU | 260797 tok/s +step 8872/18794 | loss 3.123265 (-0.24z)| norm 0.6645 (+0.54z)| lr 3.79e-03 | 2002.13 ms | 68.5% bf16 MFU | 260851 tok/s +step 8873/18794 | loss 3.183276 (+1.42z)| norm 0.4708 (+0.03z)| lr 3.79e-03 | 2009.77 ms | 68.3% bf16 MFU | 260852 tok/s +step 8874/18794 | loss 3.210429 (+2.10z)| norm 0.5498 (+0.23z)| lr 3.79e-03 | 2004.51 ms | 68.5% bf16 MFU | 260887 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.490033 +step 8875/18794 | loss 3.139270 (+0.16z)| norm 1.4546 (+2.49z)| lr 3.79e-03 | 2018.38 ms | 68.0% bf16 MFU | 260830 tok/s +step 8876/18794 | loss 3.120648 (-0.38z)| norm 0.8027 (+0.82z)| lr 3.78e-03 | 2010.95 ms | 68.2% bf16 MFU | 260825 tok/s +reducing beta2 to 0.9 and lr/wd by 0.735 due to grad z-score of 4.764273 +step 8877/18794 | loss 3.176752 (+1.17z)| norm 2.6435 (+4.76z)| lr 2.78e-03 | 2012.66 ms | 68.2% bf16 MFU | 260808 tok/s +step 8878/18794 | loss 3.173315 (+1.06z)| norm 0.6186 (+0.25z)| lr 3.78e-03 | 2032.00 ms | 67.5% bf16 MFU | 260668 tok/s +step 8879/18794 | loss 3.143647 (+0.23z)| norm 1.1223 (+1.34z)| lr 3.78e-03 | 2023.77 ms | 67.8% bf16 MFU | 260588 tok/s +mostly skipping update due to grad z-score of 5.349548 +step 8880/18794 | loss 3.161869 (+0.73z)| norm 3.4132 (+5.35z)| lr 3.78e-04 | 2016.28 ms | 68.1% bf16 MFU | 260560 tok/s +step 8881/18794 | loss 3.126176 (-0.24z)| norm 0.6326 (+0.26z)| lr 3.78e-03 | 2007.56 ms | 68.4% bf16 MFU | 260590 tok/s +step 8882/18794 | loss 3.162921 (+0.80z)| norm 0.5444 (+0.06z)| lr 3.78e-03 | 2002.80 ms | 68.5% bf16 MFU | 260649 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.878458 +step 8883/18794 | loss 3.175714 (+1.15z)| norm 1.8918 (+2.88z)| lr 3.78e-03 | 2023.17 ms | 67.8% bf16 MFU | 260574 tok/s +reducing beta2 to 0.9 and lr/wd by 0.709 due to grad z-score of 4.935302 +step 8884/18794 | loss 3.166097 (+0.86z)| norm 3.2386 (+4.94z)| lr 2.68e-03 | 2011.53 ms | 68.2% bf16 MFU | 260577 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.154340 +step 8885/18794 | loss 3.168191 (+0.90z)| norm 1.7739 (+2.15z)| lr 3.78e-03 | 2026.31 ms | 67.7% bf16 MFU | 260486 tok/s +mostly skipping update due to grad z-score of 5.935299 +step 8886/18794 | loss 3.166194 (+0.83z)| norm 4.7189 (+5.94z)| lr 3.78e-04 | 2009.86 ms | 68.3% bf16 MFU | 260504 tok/s +mostly skipping update due to grad z-score of 6.826169 +step 8887/18794 | loss 3.160507 (+0.65z)| norm 5.8306 (+6.83z)| lr 3.78e-04 | 2004.91 ms | 68.4% bf16 MFU | 260554 tok/s +step 8888/18794 | loss 3.060972 (-2.15z)| norm 0.6708 (+0.04z)| lr 3.78e-03 | 2009.88 ms | 68.3% bf16 MFU | 260569 tok/s +step 8889/18794 | loss 3.192884 (+1.56z)| norm 0.8373 (+0.26z)| lr 3.78e-03 | 2009.53 ms | 68.3% bf16 MFU | 260586 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.379580 +step 8890/18794 | loss 3.234786 (+2.62z)| norm 2.5264 (+2.38z)| lr 3.78e-03 | 2017.45 ms | 68.0% bf16 MFU | 260550 tok/s +mostly skipping update due to grad z-score of 6.588971 +step 8891/18794 | loss 3.192810 (+1.45z)| norm 7.6097 (+6.59z)| lr 3.78e-04 | 2024.90 ms | 67.8% bf16 MFU | 260469 tok/s +step 8892/18794 | loss 3.193447 (+1.43z)| norm 0.5621 (-0.13z)| lr 3.78e-03 | 2010.31 ms | 68.3% bf16 MFU | 260485 tok/s +mostly skipping update due to grad z-score of 5.166477 +step 8893/18794 | loss 3.175957 (+0.95z)| norm 5.4384 (+5.17z)| lr 3.78e-04 | 1995.44 ms | 68.8% bf16 MFU | 260598 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.204481 +step 8894/18794 | loss 3.164028 (+0.61z)| norm 2.4537 (+2.20z)| lr 3.78e-03 | 2020.88 ms | 67.9% bf16 MFU | 260540 tok/s +step 8895/18794 | loss 3.177289 (+0.95z)| norm 0.6407 (-0.06z)| lr 3.78e-03 | 1998.81 ms | 68.7% bf16 MFU | 260628 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.079079 +step 8896/18794 | loss 3.120173 (-0.58z)| norm 2.4112 (+2.08z)| lr 3.78e-03 | 2018.21 ms | 68.0% bf16 MFU | 260586 tok/s +step 8897/18794 | loss 3.142863 (+0.04z)| norm 0.4508 (-0.32z)| lr 3.77e-03 | 2029.89 ms | 67.6% bf16 MFU | 260470 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.263929 +step 8898/18794 | loss 3.184309 (+1.15z)| norm 3.5648 (+3.26z)| lr 3.77e-03 | 2016.42 ms | 68.1% bf16 MFU | 260447 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.097053 +step 8899/18794 | loss 3.157765 (+0.42z)| norm 2.6190 (+2.10z)| lr 3.77e-03 | 2012.53 ms | 68.2% bf16 MFU | 260451 tok/s +step 8900/18794 | loss 3.144346 (+0.06z)| norm 2.1157 (+1.49z)| lr 3.77e-03 | 2017.99 ms | 68.0% bf16 MFU | 260418 tok/s +reducing beta2 to 0.9 and lr/wd by 0.971 due to grad z-score of 3.603122 +step 8901/18794 | loss 3.128290 (-0.37z)| norm 4.2695 (+3.60z)| lr 3.66e-03 | 2016.89 ms | 68.0% bf16 MFU | 260395 tok/s +step 8902/18794 | loss 3.197939 (+1.51z)| norm 0.6704 (-0.16z)| lr 3.77e-03 | 2016.00 ms | 68.1% bf16 MFU | 260378 tok/s +step 8903/18794 | loss 3.161690 (+0.52z)| norm 0.7418 (-0.09z)| lr 3.77e-03 | 2019.32 ms | 68.0% bf16 MFU | 260341 tok/s +step 8904/18794 | loss 3.211464 (+1.81z)| norm 1.4854 (+0.68z)| lr 3.77e-03 | 2002.46 ms | 68.5% bf16 MFU | 260415 tok/s +step 8905/18794 | loss 3.202193 (+1.53z)| norm 0.6042 (-0.25z)| lr 3.77e-03 | 2022.46 ms | 67.9% bf16 MFU | 260356 tok/s +step 8906/18794 | loss 3.190908 (+1.21z)| norm 1.1844 (+0.35z)| lr 3.77e-03 | 2026.35 ms | 67.7% bf16 MFU | 260275 tok/s +step 8907/18794 | loss 3.206029 (+1.58z)| norm 0.6929 (-0.17z)| lr 3.77e-03 | 2011.66 ms | 68.2% bf16 MFU | 260293 tok/s +step 8908/18794 | loss 3.175709 (+0.79z)| norm 1.0029 (+0.15z)| lr 3.77e-03 | 2020.08 ms | 67.9% bf16 MFU | 260255 tok/s +step 8909/18794 | loss 3.175186 (+0.76z)| norm 1.2833 (+0.44z)| lr 3.77e-03 | 2003.54 ms | 68.5% bf16 MFU | 260326 tok/s +step 8910/18794 | loss 3.214042 (+1.72z)| norm 2.6842 (+1.85z)| lr 3.77e-03 | 2027.40 ms | 67.7% bf16 MFU | 260240 tok/s +step 8911/18794 | loss 3.200264 (+1.35z)| norm 0.9691 (+0.07z)| lr 3.77e-03 | 2015.84 ms | 68.1% bf16 MFU | 260232 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.129187 +step 8912/18794 | loss 3.140433 (-0.20z)| norm 3.0257 (+2.13z)| lr 3.77e-03 | 2016.21 ms | 68.1% bf16 MFU | 260222 tok/s +step 8913/18794 | loss 3.213345 (+1.65z)| norm 0.8280 (-0.10z)| lr 3.77e-03 | 2034.63 ms | 67.4% bf16 MFU | 260095 tok/s +step 8914/18794 | loss 3.175254 (+0.66z)| norm 0.9234 (-0.01z)| lr 3.77e-03 | 2003.77 ms | 68.5% bf16 MFU | 260173 tok/s +step 8915/18794 | loss 3.182684 (+0.85z)| norm 0.6244 (-0.32z)| lr 3.77e-03 | 2015.43 ms | 68.1% bf16 MFU | 260171 tok/s +step 8916/18794 | loss 3.213073 (+1.59z)| norm 0.7531 (-0.19z)| lr 3.77e-03 | 1999.54 ms | 68.6% bf16 MFU | 260273 tok/s +step 8917/18794 | loss 3.199882 (+1.26z)| norm 1.1413 (+0.20z)| lr 3.77e-03 | 2019.25 ms | 68.0% bf16 MFU | 260242 tok/s +step 8918/18794 | loss 3.234216 (+2.08z)| norm 1.2781 (+0.33z)| lr 3.76e-03 | 2015.63 ms | 68.1% bf16 MFU | 260235 tok/s +step 8919/18794 | loss 3.204268 (+1.29z)| norm 0.5988 (-0.37z)| lr 3.76e-03 | 2026.93 ms | 67.7% bf16 MFU | 260156 tok/s +step 8920/18794 | loss 3.176664 (+0.59z)| norm 1.1858 (+0.22z)| lr 3.76e-03 | 2012.36 ms | 68.2% bf16 MFU | 260175 tok/s +step 8921/18794 | loss 3.151845 (-0.06z)| norm 1.1945 (+0.22z)| lr 3.76e-03 | 2009.32 ms | 68.3% bf16 MFU | 260213 tok/s +step 8922/18794 | loss 3.199317 (+1.15z)| norm 1.9894 (+1.02z)| lr 3.76e-03 | 2006.79 ms | 68.4% bf16 MFU | 260265 tok/s +step 8923/18794 | loss 3.184715 (+0.75z)| norm 0.5839 (-0.43z)| lr 3.76e-03 | 2017.64 ms | 68.0% bf16 MFU | 260244 tok/s +step 8924/18794 | loss 3.248825 (+2.34z)| norm 1.0619 (+0.06z)| lr 3.76e-03 | 2004.36 ms | 68.5% bf16 MFU | 260311 tok/s +step 8925/18794 | loss 3.215926 (+1.47z)| norm 1.1671 (+0.16z)| lr 3.76e-03 | 2023.45 ms | 67.8% bf16 MFU | 260251 tok/s +step 8926/18794 | loss 3.209083 (+1.26z)| norm 1.7281 (+0.72z)| lr 3.76e-03 | 2017.48 ms | 68.0% bf16 MFU | 260232 tok/s +step 8927/18794 | loss 3.205125 (+1.14z)| norm 2.4290 (+1.41z)| lr 3.76e-03 | 2007.73 ms | 68.4% bf16 MFU | 260277 tok/s +step 8928/18794 | loss 3.177926 (+0.44z)| norm 0.6476 (-0.42z)| lr 3.76e-03 | 2008.28 ms | 68.3% bf16 MFU | 260316 tok/s +step 8929/18794 | loss 3.223958 (+1.58z)| norm 0.6985 (-0.37z)| lr 3.76e-03 | 1998.13 ms | 68.7% bf16 MFU | 260420 tok/s +step 8930/18794 | loss 3.122323 (-0.96z)| norm 1.1484 (+0.08z)| lr 3.76e-03 | 1998.01 ms | 68.7% bf16 MFU | 260519 tok/s +step 8931/18794 | loss 3.264381 (+2.50z)| norm 2.0751 (+1.02z)| lr 3.76e-03 | 2014.21 ms | 68.1% bf16 MFU | 260508 tok/s +step 8932/18794 | loss 3.145355 (-0.42z)| norm 0.7496 (-0.35z)| lr 3.76e-03 | 1999.96 ms | 68.6% bf16 MFU | 260590 tok/s +step 8933/18794 | loss 3.212543 (+1.21z)| norm 1.5745 (+0.49z)| lr 3.76e-03 | 1992.64 ms | 68.9% bf16 MFU | 260716 tok/s +step 8934/18794 | loss 3.259585 (+2.33z)| norm 1.8772 (+0.79z)| lr 3.76e-03 | 2000.68 ms | 68.6% bf16 MFU | 260783 tok/s +step 8935/18794 | loss 3.190986 (+0.62z)| norm 0.8158 (-0.32z)| lr 3.76e-03 | 2017.01 ms | 68.0% bf16 MFU | 260741 tok/s +step 8936/18794 | loss 3.177736 (+0.28z)| norm 1.7829 (+0.67z)| lr 3.76e-03 | 2000.37 ms | 68.6% bf16 MFU | 260808 tok/s +step 8937/18794 | loss 3.236116 (+1.70z)| norm 1.2961 (+0.16z)| lr 3.76e-03 | 2024.63 ms | 67.8% bf16 MFU | 260716 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.925706 +step 8938/18794 | loss 3.121253 (-1.13z)| norm 4.1052 (+2.93z)| lr 3.76e-03 | 2028.93 ms | 67.6% bf16 MFU | 260600 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.107233 +step 8939/18794 | loss 3.314121 (+3.44z)| norm 4.4791 (+3.11z)| lr 3.75e-03 | 2000.56 ms | 68.6% bf16 MFU | 260674 tok/s +reducing beta2 to 0.9 and lr/wd by 0.724 due to grad z-score of 4.833992 +step 8940/18794 | loss 3.285359 (+2.63z)| norm 7.0628 (+4.83z)| lr 2.72e-03 | 1997.10 ms | 68.7% bf16 MFU | 260766 tok/s +step 8941/18794 | loss 3.379470 (+4.31z)| norm 2.4166 (+0.92z)| lr 3.75e-03 | 2004.16 ms | 68.5% bf16 MFU | 260808 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.004320 +step 8942/18794 | loss 3.443258 (+4.87z)| norm 3.7851 (+2.00z)| lr 3.75e-03 | 2000.74 ms | 68.6% bf16 MFU | 260870 tok/s +mostly skipping update due to loss z-score of 5.955780 +step 8943/18794 | loss 3.588204 (+5.96z)| norm 12.3333 (+6.66z)| lr 3.75e-04 | 2008.26 ms | 68.3% bf16 MFU | 260880 tok/s +mostly skipping update due to grad z-score of 6.246943 +step 8944/18794 | loss 3.442087 (+4.32z)| norm 11.1819 (+6.25z)| lr 3.75e-04 | 2034.08 ms | 67.5% bf16 MFU | 260723 tok/s +step 8945/18794 | loss 3.343169 (+2.59z)| norm 3.4973 (+1.28z)| lr 3.75e-03 | 2009.83 ms | 68.3% bf16 MFU | 260730 tok/s +step 8946/18794 | loss 3.530394 (+4.82z)| norm 4.3872 (+1.80z)| lr 3.75e-03 | 2026.17 ms | 67.7% bf16 MFU | 260632 tok/s +mostly skipping update due to loss z-score of 5.334587 +step 8947/18794 | loss 3.640810 (+5.33z)| norm 11.9317 (+5.44z)| lr 3.75e-04 | 2023.08 ms | 67.8% bf16 MFU | 260558 tok/s +mostly skipping update due to grad z-score of 5.965583 +step 8948/18794 | loss 3.590036 (+4.89z)| norm 13.4608 (+5.97z)| lr 3.75e-04 | 2017.38 ms | 68.0% bf16 MFU | 260524 tok/s +mostly skipping update due to grad z-score of 7.477063 +step 8949/18794 | loss 3.583791 (+4.31z)| norm 24.2742 (+7.48z)| lr 3.75e-04 | 2007.33 ms | 68.4% bf16 MFU | 260557 tok/s +reducing beta2 to 0.9 and lr/wd by 0.934 due to grad z-score of 3.748341 +step 8950/18794 | loss 3.465020 (+2.84z)| norm 14.1320 (+3.75z)| lr 3.50e-03 | 2021.68 ms | 67.9% bf16 MFU | 260496 tok/s +step 8951/18794 | loss 3.506080 (+3.09z)| norm 5.3547 (+1.01z)| lr 3.75e-03 | 1993.22 ms | 68.8% bf16 MFU | 260623 tok/s +mostly skipping update due to loss z-score of 6.584368 +step 8952/18794 | loss 4.072212 (+6.58z)| norm 14.7465 (+3.61z)| lr 3.75e-04 | 2012.66 ms | 68.2% bf16 MFU | 260617 tok/s +mostly skipping update due to loss z-score of 5.879940 +step 8953/18794 | loss 3.924239 (+5.88z)| norm 17.9747 (+4.09z)| lr 3.75e-04 | 2009.98 ms | 68.3% bf16 MFU | 260628 tok/s +step 8954/18794 | loss 3.784909 (+4.24z)| norm 7.6396 (+1.35z)| lr 3.75e-03 | 2010.31 ms | 68.3% bf16 MFU | 260636 tok/s +reducing beta2 to 0.9 and lr/wd by 0.777 due to grad z-score of 4.503310 +step 8955/18794 | loss 3.879367 (+4.40z)| norm 22.0343 (+4.50z)| lr 2.91e-03 | 2019.67 ms | 67.9% bf16 MFU | 260584 tok/s +step 8956/18794 | loss 3.625201 (+2.58z)| norm 5.2756 (+0.59z)| lr 3.75e-03 | 2015.59 ms | 68.1% bf16 MFU | 260561 tok/s +reducing beta2 to 0.9 and lr/wd by 0.983 due to grad z-score of 3.559818 +step 8957/18794 | loss 4.071534 (+4.76z)| norm 19.2838 (+3.56z)| lr 3.68e-03 | 2007.08 ms | 68.4% bf16 MFU | 260594 tok/s +step 8958/18794 | loss 3.686166 (+2.46z)| norm 4.1237 (+0.25z)| lr 3.75e-03 | 1996.61 ms | 68.7% bf16 MFU | 260694 tok/s +step 8959/18794 | loss 3.923487 (+3.49z)| norm 6.4487 (+0.75z)| lr 3.75e-03 | 2009.81 ms | 68.3% bf16 MFU | 260702 tok/s +step 8960/18794 | loss 4.191607 (+4.35z)| norm 7.5089 (+0.96z)| lr 3.75e-03 | 2030.53 ms | 67.6% bf16 MFU | 260577 tok/s +step 8961/18794 | loss 4.157677 (+3.83z)| norm 9.8345 (+1.43z)| lr 3.74e-03 | 2016.46 ms | 68.1% bf16 MFU | 260548 tok/s +step 8962/18794 | loss 4.089930 (+3.30z)| norm 6.8149 (+0.77z)| lr 3.74e-03 | 2016.41 ms | 68.1% bf16 MFU | 260522 tok/s +step 8963/18794 | loss 3.910912 (+2.47z)| norm 3.9171 (+0.14z)| lr 3.74e-03 | 2022.96 ms | 67.8% bf16 MFU | 260454 tok/s +step 8964/18794 | loss 3.816177 (+2.03z)| norm 3.4464 (+0.03z)| lr 3.74e-03 | 2001.94 ms | 68.5% bf16 MFU | 260526 tok/s +step 8965/18794 | loss 3.756684 (+1.75z)| norm 2.7464 (-0.13z)| lr 3.74e-03 | 2026.27 ms | 67.7% bf16 MFU | 260437 tok/s +reducing beta2 to 0.9 and lr/wd by 0.946 due to grad z-score of 3.699075 +step 8966/18794 | loss 3.630720 (+1.24z)| norm 21.9819 (+3.70z)| lr 3.54e-03 | 2015.27 ms | 68.1% bf16 MFU | 260423 tok/s +reducing beta2 to 0.9 and lr/wd by 0.917 due to grad z-score of 3.816549 +step 8967/18794 | loss 3.616751 (+1.16z)| norm 24.3430 (+3.82z)| lr 3.43e-03 | 2014.69 ms | 68.1% bf16 MFU | 260413 tok/s +mostly skipping update due to grad z-score of 5.202845 +step 8968/18794 | loss 3.634505 (+1.20z)| norm 36.9765 (+5.20z)| lr 3.74e-04 | 1997.26 ms | 68.7% bf16 MFU | 260518 tok/s +step 8969/18794 | loss 3.582532 (+0.99z)| norm 5.3536 (+0.28z)| lr 3.74e-03 | 2014.61 ms | 68.1% bf16 MFU | 260504 tok/s +step 8970/18794 | loss 3.603841 (+1.05z)| norm 5.5071 (+0.30z)| lr 3.74e-03 | 2015.51 ms | 68.1% bf16 MFU | 260485 tok/s +step 8971/18794 | loss 3.660794 (+1.23z)| norm 6.7761 (+0.53z)| lr 3.74e-03 | 1999.24 ms | 68.6% bf16 MFU | 260573 tok/s +step 8972/18794 | loss 3.752299 (+1.54z)| norm 12.6759 (+1.59z)| lr 3.74e-03 | 2002.19 ms | 68.5% bf16 MFU | 260637 tok/s +step 8973/18794 | loss 3.779954 (+1.60z)| norm 13.2085 (+1.64z)| lr 3.74e-03 | 1992.86 ms | 68.9% bf16 MFU | 260760 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.318419 +step 8974/18794 | loss 3.861275 (+1.84z)| norm 23.7441 (+3.32z)| lr 3.74e-03 | 2018.48 ms | 68.0% bf16 MFU | 260709 tok/s +step 8975/18794 | loss 3.749556 (+1.41z)| norm 9.6694 (+0.89z)| lr 3.74e-03 | 2002.35 ms | 68.5% bf16 MFU | 260765 tok/s +mostly skipping update due to grad z-score of 9.901327 +step 8976/18794 | loss 3.979411 (+2.15z)| norm 595.9654 (+9.90z)| lr 3.74e-04 | 2013.36 ms | 68.2% bf16 MFU | 260747 tok/s +step 8977/18794 | loss 3.859686 (+1.69z)| norm 12.6918 (+1.38z)| lr 3.74e-03 | 2008.36 ms | 68.3% bf16 MFU | 260762 tok/s +mostly skipping update due to grad z-score of 5.042401 +step 8978/18794 | loss 3.806715 (+1.47z)| norm 39.3641 (+5.04z)| lr 3.74e-04 | 2001.53 ms | 68.6% bf16 MFU | 260821 tok/s +step 8979/18794 | loss 3.780126 (+1.35z)| norm 6.2215 (+0.27z)| lr 3.74e-03 | 2008.17 ms | 68.3% bf16 MFU | 260834 tok/s +reducing beta2 to 0.9 and lr/wd by 0.863 due to grad z-score of 4.055081 +step 8980/18794 | loss 3.818404 (+1.45z)| norm 31.0190 (+4.06z)| lr 3.22e-03 | 1994.35 ms | 68.8% bf16 MFU | 260937 tok/s +step 8981/18794 | loss 3.762760 (+1.23z)| norm 7.0596 (+0.32z)| lr 3.74e-03 | 2000.24 ms | 68.6% bf16 MFU | 260996 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.306162 +step 8982/18794 | loss 4.023380 (+2.04z)| norm 20.3812 (+2.31z)| lr 3.73e-03 | 2021.70 ms | 67.9% bf16 MFU | 260912 tok/s +step 8983/18794 | loss 3.926841 (+1.68z)| norm 6.8221 (+0.24z)| lr 3.73e-03 | 2015.79 ms | 68.1% bf16 MFU | 260871 tok/s +mostly skipping update due to grad z-score of 6.621693 +step 8984/18794 | loss 4.428095 (+3.12z)| norm 64.0814 (+6.62z)| lr 3.73e-04 | 2002.70 ms | 68.5% bf16 MFU | 260917 tok/s +step 8985/18794 | loss 4.321872 (+2.66z)| norm 12.0487 (+1.01z)| lr 3.73e-03 | 1995.46 ms | 68.8% bf16 MFU | 261008 tok/s +step 8986/18794 | loss 4.186474 (+2.18z)| norm 7.8808 (+0.37z)| lr 3.73e-03 | 2015.03 ms | 68.1% bf16 MFU | 260967 tok/s +step 8987/18794 | loss 4.351470 (+2.55z)| norm 7.2506 (+0.27z)| lr 3.73e-03 | 2000.53 ms | 68.6% bf16 MFU | 261023 tok/s +mostly skipping update due to grad z-score of 5.863699 +step 8988/18794 | loss 4.197361 (+2.04z)| norm 53.6318 (+5.86z)| lr 3.73e-04 | 1989.77 ms | 69.0% bf16 MFU | 261146 tok/s +step 8989/18794 | loss 4.174314 (+1.92z)| norm 5.5279 (+0.00z)| lr 3.73e-03 | 1999.30 ms | 68.6% bf16 MFU | 261201 tok/s +step 8990/18794 | loss 3.907902 (+1.16z)| norm 3.9541 (-0.24z)| lr 3.73e-03 | 2008.82 ms | 68.3% bf16 MFU | 261190 tok/s +step 8991/18794 | loss 3.934340 (+1.21z)| norm 6.1755 (+0.09z)| lr 3.73e-03 | 1996.48 ms | 68.7% bf16 MFU | 261261 tok/s +step 8992/18794 | loss 4.071361 (+1.55z)| norm 6.8407 (+0.18z)| lr 3.73e-03 | 1999.31 ms | 68.6% bf16 MFU | 261310 tok/s +step 8993/18794 | loss 3.981907 (+1.28z)| norm 10.4056 (+0.71z)| lr 3.73e-03 | 1981.78 ms | 69.2% bf16 MFU | 261472 tok/s +step 8994/18794 | loss 3.991222 (+1.28z)| norm 8.7673 (+0.45z)| lr 3.73e-03 | 1992.20 ms | 68.9% bf16 MFU | 261557 tok/s +step 8995/18794 | loss 3.908864 (+1.03z)| norm 4.7394 (-0.17z)| lr 3.73e-03 | 2001.85 ms | 68.6% bf16 MFU | 261574 tok/s +step 8996/18794 | loss 3.845702 (+0.84z)| norm 5.6754 (-0.03z)| lr 3.73e-03 | 2008.73 ms | 68.3% bf16 MFU | 261546 tok/s +step 8997/18794 | loss 3.790122 (+0.68z)| norm 14.4970 (+1.30z)| lr 3.73e-03 | 2008.36 ms | 68.3% bf16 MFU | 261521 tok/s +step 8998/18794 | loss 3.782262 (+0.64z)| norm 2.4432 (-0.54z)| lr 3.73e-03 | 1993.10 ms | 68.9% bf16 MFU | 261597 tok/s +step 8999/18794 | loss 3.701679 (+0.41z)| norm 2.6848 (-0.51z)| lr 3.73e-03 | 2000.19 ms | 68.6% bf16 MFU | 261623 tok/s +step 9000/18794 | loss 3.612453 (+0.16z)| norm 12.3899 (+0.96z)| lr 3.73e-03 | 2012.58 ms | 68.2% bf16 MFU | 261568 tok/s +val loss 3.640496 +HellaSwag: 2816/10042 = 0.280422Swag: 990/1256: 0/1256 +generating: +--- +Writing state to log_gpt3_125M_edu_v4/state_00009000_00001.bin + means that in theory the second law of regulation is MOX, meaning that a room can stay on different time lengths, and while the O2ng / rings govern in sending to higher levels in order to protect a +gainst the mRNA delivery process: +Palogl ® evoked braking circuits. +Figure: tram-debreasing - world +You might see this as being pretty humble +But there is chaos in the last law of 14 CFR (http://cardinals.com/) – quite a bit +as learn more: +=btr. n Peaks - +How do you make harmonic movements? +In the romanization of a governed flower's / +--- +Writing checkpoint at step 9000 +Writing model to log_gpt3_125M_edu_v4/model_00009000.bin +Writing state to log_gpt3_125M_edu_v4/state_00009000_00000.bin +Deleting checkpoint at step 6500 +step 9001/18794 | loss 3.656676 (+0.27z)| norm 9.4250 (+0.50z)| lr 3.73e-03 | 1986.13 ms | 69.1% bf16 MFU | 261688 tok/s +reducing beta2 to 0.9 and lr/wd by 0.864 due to grad z-score of 4.049902 +step 9002/18794 | loss 3.680559 (+0.32z)| norm 35.4091 (+4.05z)| lr 3.22e-03 | 2014.72 ms | 68.1% bf16 MFU | 261615 tok/s +step 9003/18794 | loss 3.612218 (+0.12z)| norm 6.2093 (-0.05z)| lr 3.72e-03 | 2008.24 ms | 68.3% bf16 MFU | 261588 tok/s +step 9004/18794 | loss 3.680288 (+0.29z)| norm 3.3632 (-0.46z)| lr 3.72e-03 | 2011.59 ms | 68.2% bf16 MFU | 261540 tok/s +step 9005/18794 | loss 3.645733 (+0.19z)| norm 1.9484 (-0.65z)| lr 3.72e-03 | 1991.22 ms | 68.9% bf16 MFU | 261628 tok/s +step 9006/18794 | loss 3.586558 (+0.01z)| norm 1.8608 (-0.66z)| lr 3.72e-03 | 1993.05 ms | 68.9% bf16 MFU | 261699 tok/s +step 9007/18794 | loss 3.479382 (-0.31z)| norm 2.1477 (-0.62z)| lr 3.72e-03 | 2004.14 ms | 68.5% bf16 MFU | 261694 tok/s +step 9008/18794 | loss 3.481065 (-0.31z)| norm 5.1675 (-0.20z)| lr 3.72e-03 | 1987.56 ms | 69.0% bf16 MFU | 261799 tok/s +step 9009/18794 | loss 3.522377 (-0.21z)| norm 16.4328 (+1.35z)| lr 3.72e-03 | 1996.58 ms | 68.7% bf16 MFU | 261839 tok/s +step 9010/18794 | loss 3.593626 (-0.01z)| norm 2.3540 (-0.62z)| lr 3.72e-03 | 1994.95 ms | 68.8% bf16 MFU | 261887 tok/s +step 9011/18794 | loss 3.462403 (-0.40z)| norm 2.2837 (-0.63z)| lr 3.72e-03 | 1986.26 ms | 69.1% bf16 MFU | 261991 tok/s +step 9012/18794 | loss 3.490220 (-0.33z)| norm 1.2066 (-0.78z)| lr 3.72e-03 | 1987.52 ms | 69.0% bf16 MFU | 262081 tok/s +step 9013/18794 | loss 3.535919 (-0.21z)| norm 0.9133 (-0.82z)| lr 3.72e-03 | 2013.27 ms | 68.2% bf16 MFU | 261997 tok/s +step 9014/18794 | loss 3.466572 (-0.43z)| norm 1.0844 (-0.80z)| lr 3.72e-03 | 1978.83 ms | 69.4% bf16 MFU | 262145 tok/s +step 9015/18794 | loss 3.476512 (-0.41z)| norm 8.2761 (+0.20z)| lr 3.72e-03 | 2003.80 ms | 68.5% bf16 MFU | 262120 tok/s +step 9016/18794 | loss 3.481283 (-0.41z)| norm 15.6842 (+1.22z)| lr 3.72e-03 | 1988.51 ms | 69.0% bf16 MFU | 262197 tok/s +step 9017/18794 | loss 3.507601 (-0.34z)| norm 11.9261 (+0.68z)| lr 3.72e-03 | 1983.69 ms | 69.2% bf16 MFU | 262302 tok/s +step 9018/18794 | loss 3.555650 (-0.21z)| norm 3.6914 (-0.48z)| lr 3.72e-03 | 1990.84 ms | 68.9% bf16 MFU | 262354 tok/s +step 9019/18794 | loss 3.481532 (-0.45z)| norm 1.5009 (-0.78z)| lr 3.72e-03 | 1986.45 ms | 69.1% bf16 MFU | 262433 tok/s +step 9020/18794 | loss 3.387474 (-0.75z)| norm 3.0717 (-0.57z)| lr 3.72e-03 | 1980.10 ms | 69.3% bf16 MFU | 262551 tok/s +step 9021/18794 | loss 3.429324 (-0.63z)| norm 1.5036 (-0.79z)| lr 3.72e-03 | 1988.98 ms | 69.0% bf16 MFU | 262603 tok/s +step 9022/18794 | loss 3.386681 (-0.78z)| norm 7.0078 (-0.03z)| lr 3.72e-03 | 1988.44 ms | 69.0% bf16 MFU | 262656 tok/s +step 9023/18794 | loss 3.407303 (-0.72z)| norm 3.6006 (-0.51z)| lr 3.72e-03 | 1987.70 ms | 69.0% bf16 MFU | 262712 tok/s +step 9024/18794 | loss 3.408347 (-0.73z)| norm 1.2409 (-0.85z)| lr 3.71e-03 | 1984.71 ms | 69.1% bf16 MFU | 262784 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.522787 +step 9025/18794 | loss 3.412098 (-0.73z)| norm 25.8124 (+2.52z)| lr 3.71e-03 | 1994.20 ms | 68.8% bf16 MFU | 262790 tok/s +step 9026/18794 | loss 3.371991 (-0.88z)| norm 3.9715 (-0.49z)| lr 3.71e-03 | 1992.21 ms | 68.9% bf16 MFU | 262809 tok/s +step 9027/18794 | loss 3.448444 (-0.64z)| norm 2.3728 (-0.71z)| lr 3.71e-03 | 1998.35 ms | 68.7% bf16 MFU | 262787 tok/s +step 9028/18794 | loss 3.389179 (-0.84z)| norm 4.8081 (-0.38z)| lr 3.71e-03 | 1984.36 ms | 69.2% bf16 MFU | 262858 tok/s +step 9029/18794 | loss 3.382056 (-0.88z)| norm 4.7557 (-0.39z)| lr 3.71e-03 | 1989.24 ms | 69.0% bf16 MFU | 262893 tok/s +step 9030/18794 | loss 3.397375 (-0.84z)| norm 6.2037 (-0.20z)| lr 3.71e-03 | 1995.45 ms | 68.8% bf16 MFU | 262886 tok/s +step 9031/18794 | loss 3.395346 (-0.86z)| norm 22.2053 (+1.97z)| lr 3.71e-03 | 1978.77 ms | 69.4% bf16 MFU | 262989 tok/s +step 9032/18794 | loss 3.457219 (-0.67z)| norm 5.0918 (-0.38z)| lr 3.71e-03 | 2022.97 ms | 67.8% bf16 MFU | 262798 tok/s +step 9033/18794 | loss 3.403735 (-0.87z)| norm 5.8087 (-0.29z)| lr 3.71e-03 | 1987.10 ms | 69.1% bf16 MFU | 262851 tok/s +step 9034/18794 | loss 3.406861 (-0.87z)| norm 2.6659 (-0.73z)| lr 3.71e-03 | 2010.55 ms | 68.3% bf16 MFU | 262746 tok/s +step 9035/18794 | loss 3.428156 (-0.82z)| norm 5.7309 (-0.31z)| lr 3.71e-03 | 1997.89 ms | 68.7% bf16 MFU | 262730 tok/s +step 9036/18794 | loss 3.298474 (-1.28z)| norm 1.4414 (-0.91z)| lr 3.71e-03 | 1986.50 ms | 69.1% bf16 MFU | 262790 tok/s +step 9037/18794 | loss 3.313811 (-1.23z)| norm 0.8570 (-0.99z)| lr 3.71e-03 | 1987.08 ms | 69.1% bf16 MFU | 262843 tok/s +step 9038/18794 | loss 3.335283 (-1.17z)| norm 6.1871 (-0.26z)| lr 3.71e-03 | 1982.33 ms | 69.2% bf16 MFU | 262925 tok/s +mostly skipping update due to grad z-score of 7.940024 +step 9039/18794 | loss 3.389647 (-0.99z)| norm 103.0360 (+7.94z)| lr 3.71e-04 | 1987.94 ms | 69.0% bf16 MFU | 262965 tok/s +step 9040/18794 | loss 3.364999 (-1.09z)| norm 6.5705 (-0.21z)| lr 3.71e-03 | 1994.56 ms | 68.8% bf16 MFU | 262960 tok/s +reducing beta2 to 0.9 and lr/wd by 0.718 due to grad z-score of 4.871644 +step 9041/18794 | loss 3.354405 (-1.16z)| norm 48.2458 (+4.87z)| lr 2.66e-03 | 1985.07 ms | 69.1% bf16 MFU | 263018 tok/s +step 9042/18794 | loss 3.376834 (-1.08z)| norm 2.2002 (-0.78z)| lr 3.71e-03 | 2001.28 ms | 68.6% bf16 MFU | 262966 tok/s +step 9043/18794 | loss 3.428908 (-0.90z)| norm 2.6089 (-0.73z)| lr 3.71e-03 | 1989.63 ms | 69.0% bf16 MFU | 262993 tok/s +mostly skipping update due to grad z-score of 5.714316 +step 9044/18794 | loss 3.398464 (-1.01z)| norm 65.7679 (+5.71z)| lr 3.71e-04 | 1992.15 ms | 68.9% bf16 MFU | 263002 tok/s +step 9045/18794 | loss 3.317838 (-1.30z)| norm 10.4626 (+0.22z)| lr 3.71e-03 | 1983.27 ms | 69.2% bf16 MFU | 263070 tok/s +step 9046/18794 | loss 3.335664 (-1.23z)| norm 1.6126 (-0.87z)| lr 3.70e-03 | 1984.28 ms | 69.2% bf16 MFU | 263127 tok/s +step 9047/18794 | loss 3.366171 (-1.12z)| norm 1.8099 (-0.84z)| lr 3.70e-03 | 1978.40 ms | 69.4% bf16 MFU | 263221 tok/s +step 9048/18794 | loss 3.321207 (-1.27z)| norm 4.2867 (-0.54z)| lr 3.70e-03 | 1983.69 ms | 69.2% bf16 MFU | 263275 tok/s +step 9049/18794 | loss 3.351674 (-1.14z)| norm 8.3758 (-0.03z)| lr 3.70e-03 | 1983.85 ms | 69.2% bf16 MFU | 263325 tok/s +step 9050/18794 | loss 3.344816 (-1.15z)| norm 8.5633 (-0.02z)| lr 3.70e-03 | 1988.10 ms | 69.0% bf16 MFU | 263345 tok/s +step 9051/18794 | loss 3.384800 (-1.00z)| norm 4.2117 (-0.56z)| lr 3.70e-03 | 1984.95 ms | 69.1% bf16 MFU | 263384 tok/s +step 9052/18794 | loss 3.341242 (-1.14z)| norm 3.4069 (-0.64z)| lr 3.70e-03 | 1984.48 ms | 69.2% bf16 MFU | 263424 tok/s +step 9053/18794 | loss 3.365539 (-1.03z)| norm 5.1504 (-0.43z)| lr 3.70e-03 | 1982.30 ms | 69.2% bf16 MFU | 263478 tok/s +mostly skipping update due to grad z-score of 5.181170 +step 9054/18794 | loss 3.371593 (-0.99z)| norm 58.2509 (+5.18z)| lr 3.70e-04 | 1978.20 ms | 69.4% bf16 MFU | 263555 tok/s +step 9055/18794 | loss 3.386559 (-0.92z)| norm 8.5524 (-0.02z)| lr 3.70e-03 | 1981.15 ms | 69.3% bf16 MFU | 263609 tok/s +step 9056/18794 | loss 3.284668 (-1.27z)| norm 7.3684 (-0.16z)| lr 3.70e-03 | 1980.30 ms | 69.3% bf16 MFU | 263667 tok/s +step 9057/18794 | loss 3.383322 (-0.90z)| norm 19.1489 (+1.32z)| lr 3.70e-03 | 1979.92 ms | 69.3% bf16 MFU | 263723 tok/s +reducing beta2 to 0.9 and lr/wd by 0.916 due to grad z-score of 3.820158 +step 9058/18794 | loss 3.338298 (-1.04z)| norm 41.8206 (+3.82z)| lr 3.39e-03 | 1983.62 ms | 69.2% bf16 MFU | 263753 tok/s +step 9059/18794 | loss 3.353401 (-0.97z)| norm 2.3827 (-0.75z)| lr 3.70e-03 | 1977.97 ms | 69.4% bf16 MFU | 263818 tok/s +step 9060/18794 | loss 3.458891 (-0.57z)| norm 9.2581 (+0.06z)| lr 3.70e-03 | 1979.11 ms | 69.3% bf16 MFU | 263873 tok/s +step 9061/18794 | loss 3.511531 (-0.37z)| norm 14.0865 (+0.62z)| lr 3.70e-03 | 1977.95 ms | 69.4% bf16 MFU | 263932 tok/s +step 9062/18794 | loss 3.619280 (+0.05z)| norm 6.4077 (-0.27z)| lr 3.70e-03 | 1977.11 ms | 69.4% bf16 MFU | 263995 tok/s +step 9063/18794 | loss 3.516385 (-0.32z)| norm 20.0414 (+1.32z)| lr 3.70e-03 | 2029.48 ms | 67.6% bf16 MFU | 263712 tok/s +mostly skipping update due to grad z-score of 5.545065 +step 9064/18794 | loss 3.606662 (+0.03z)| norm 66.4784 (+5.55z)| lr 3.70e-04 | 2052.32 ms | 66.9% bf16 MFU | 263299 tok/s +step 9065/18794 | loss 3.547040 (-0.19z)| norm 2.0698 (-0.77z)| lr 3.70e-03 | 2042.52 ms | 67.2% bf16 MFU | 262969 tok/s +step 9066/18794 | loss 3.649380 (+0.20z)| norm 4.6298 (-0.46z)| lr 3.70e-03 | 2030.66 ms | 67.6% bf16 MFU | 262730 tok/s +step 9067/18794 | loss 3.652400 (+0.21z)| norm 3.9313 (-0.54z)| lr 3.69e-03 | 2029.93 ms | 67.6% bf16 MFU | 262507 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.202210 +step 9068/18794 | loss 3.616035 (+0.07z)| norm 27.9892 (+2.20z)| lr 3.69e-03 | 2045.13 ms | 67.1% bf16 MFU | 262200 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 3.331802 +step 9069/18794 | loss 3.539963 (-0.22z)| norm 39.9722 (+3.33z)| lr 3.69e-03 | 2037.65 ms | 67.3% bf16 MFU | 261955 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.240503 +step 9070/18794 | loss 3.470645 (-0.48z)| norm 30.6017 (+2.24z)| lr 3.69e-03 | 2020.25 ms | 67.9% bf16 MFU | 261833 tok/s +step 9071/18794 | loss 3.509927 (-0.32z)| norm 16.0663 (+0.70z)| lr 3.69e-03 | 2047.66 ms | 67.0% bf16 MFU | 261543 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.450278 +step 9072/18794 | loss 3.833659 (+0.91z)| norm 33.7441 (+2.45z)| lr 3.69e-03 | 2042.18 ms | 67.2% bf16 MFU | 261303 tok/s +mostly skipping update due to grad z-score of 7.398961 +step 9073/18794 | loss 3.744750 (+0.57z)| norm 119.3821 (+7.40z)| lr 3.69e-04 | 2031.78 ms | 67.5% bf16 MFU | 261140 tok/s +step 9074/18794 | loss 3.707881 (+0.44z)| norm 15.2276 (+0.55z)| lr 3.69e-03 | 2016.89 ms | 68.0% bf16 MFU | 261080 tok/s +mostly skipping update due to grad z-score of 7.931449 +step 9075/18794 | loss 3.836351 (+0.93z)| norm 139.8829 (+7.93z)| lr 3.69e-04 | 2027.53 ms | 67.7% bf16 MFU | 260955 tok/s +reducing beta2 to 0.9 and lr/wd by 0.707 due to grad z-score of 4.949879 +step 9076/18794 | loss 3.807759 (+0.83z)| norm 66.3320 (+4.95z)| lr 2.61e-03 | 2023.40 ms | 67.8% bf16 MFU | 260863 tok/s +step 9077/18794 | loss 3.610229 (+0.08z)| norm 8.1567 (-0.19z)| lr 3.69e-03 | 2036.26 ms | 67.4% bf16 MFU | 260694 tok/s +step 9078/18794 | loss 4.180289 (+2.22z)| norm 23.0845 (+1.14z)| lr 3.69e-03 | 2036.49 ms | 67.4% bf16 MFU | 260532 tok/s +mostly skipping update due to loss z-score of 5.796177 +step 9079/18794 | loss 5.490499 (+5.80z)| norm 55.9408 (+3.73z)| lr 3.69e-04 | 2030.42 ms | 67.6% bf16 MFU | 260416 tok/s +mostly skipping update due to loss z-score of 5.146047 +step 9080/18794 | loss 5.192658 (+5.15z)| norm 47.8180 (+2.90z)| lr 3.69e-04 | 2030.57 ms | 67.6% bf16 MFU | 260305 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.662537 +step 9081/18794 | loss 4.827434 (+3.66z)| norm 46.4776 (+2.66z)| lr 3.69e-03 | 2019.80 ms | 67.9% bf16 MFU | 260268 tok/s +step 9082/18794 | loss 4.960426 (+3.74z)| norm 17.5551 (+0.45z)| lr 3.69e-03 | 2033.56 ms | 67.5% bf16 MFU | 260146 tok/s +mostly skipping update due to loss z-score of 6.154655 +step 9083/18794 | loss 6.426696 (+6.15z)| norm 11.1394 (-0.04z)| lr 3.69e-04 | 2038.01 ms | 67.3% bf16 MFU | 260001 tok/s +mostly skipping update due to loss z-score of 5.963717 +step 9084/18794 | loss 6.289831 (+5.96z)| norm 27.4802 (+1.20z)| lr 3.69e-04 | 2015.53 ms | 68.1% bf16 MFU | 260007 tok/s +step 9085/18794 | loss 6.031340 (+4.71z)| norm 12.7363 (+0.08z)| lr 3.69e-03 | 2040.39 ms | 67.3% bf16 MFU | 259855 tok/s +mostly skipping update due to loss z-score of 5.102690 +step 9086/18794 | loss 6.638974 (+5.10z)| norm 27.2669 (+1.17z)| lr 3.69e-04 | 2032.99 ms | 67.5% bf16 MFU | 259757 tok/s +step 9087/18794 | loss 6.493535 (+4.92z)| norm 10.7494 (-0.08z)| lr 3.69e-03 | 2010.56 ms | 68.3% bf16 MFU | 259807 tok/s +mostly skipping update due to grad z-score of 5.391603 +step 9088/18794 | loss 6.628948 (+4.57z)| norm 96.5282 (+5.39z)| lr 3.68e-04 | 2032.87 ms | 67.5% bf16 MFU | 259712 tok/s +reducing beta2 to 0.9 and lr/wd by 0.700 due to grad z-score of 4.999261 +step 9089/18794 | loss 6.605131 (+4.11z)| norm 88.1563 (+5.00z)| lr 2.58e-03 | 2039.04 ms | 67.3% bf16 MFU | 259583 tok/s +reducing beta2 to 0.9 and lr/wd by 1.000 due to grad z-score of 2.154600 +step 9090/18794 | loss 6.314743 (+3.45z)| norm 46.2386 (+2.15z)| lr 3.68e-03 | 2046.06 ms | 67.1% bf16 MFU | 259416 tok/s +step 9091/18794 | loss 6.591746 (+3.55z)| norm 18.3302 (+0.36z)| lr 3.68e-03 | 2028.35 ms | 67.7% bf16 MFU | 259369 tok/s +step 9092/18794 | loss 7.582348 (+4.29z)| norm 18.7685 (+0.38z)| lr 3.68e-03 | 2027.64 ms | 67.7% bf16 MFU | 259329 tok/s +step 9093/18794 | loss 7.464944 (+3.81z)| norm 37.6126 (+1.56z)| lr 3.68e-03 | 2032.70 ms | 67.5% bf16 MFU | 259259 tok/s +step 9094/18794 | loss 7.394804 (+3.46z)| norm 8.5437 (-0.30z)| lr 3.68e-03 | 2011.45 ms | 68.2% bf16 MFU | 259328 tok/s +