Upload folder using huggingface_hub
- README.md +2 -2
- config.json +50 -0
- generation_config.json +10 -0
- loss_log.txt +644 -0
- model.safetensors.index.json +298 -0
- output.safetensors +3 -0
- special_tokens_map.json +16 -0
- tokenizer.json +0 -0
- tokenizer_config.json +0 -0
- training_config_phase3.yaml +93 -0
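
For reference, the commit title names huggingface_hub's `upload_folder` helper. A minimal sketch of the kind of call that produces a commit like this one; the local path and repo id below are placeholders, not values taken from this commit:

```python
# Minimal sketch, assuming placeholder folder and repo names.
from huggingface_hub import upload_folder

upload_folder(
    folder_path="./llama3-s-instruct-v0.3-exl2",  # hypothetical local folder
    repo_id="user/llama3-s-instruct-v0.3-exl2",   # hypothetical repo id
    commit_message="Upload folder using huggingface_hub",
)
```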
README.md CHANGED
@@ -1,6 +1,6 @@
 ---
-library_name:
-
+library_name: transformers
+tags: []
 ---
 
 # Model Card for Model ID
config.json ADDED
@@ -0,0 +1,50 @@
+{
+  "_name_or_path": "llama3-s-instruct-v0.3-checkpoint-7000-phase-3/",
+  "architectures": [
+    "LlamaForCausalLM"
+  ],
+  "attention_bias": false,
+  "attention_dropout": 0.0,
+  "bos_token_id": 128000,
+  "eos_token_id": [
+    128001,
+    128008,
+    128009
+  ],
+  "hidden_act": "silu",
+  "hidden_size": 4096,
+  "initializer_range": 0.02,
+  "intermediate_size": 14336,
+  "max_position_embeddings": 131072,
+  "mlp_bias": false,
+  "model_type": "llama",
+  "num_attention_heads": 32,
+  "num_hidden_layers": 32,
+  "num_key_value_heads": 8,
+  "pretraining_tp": 1,
+  "rms_norm_eps": 1e-05,
+  "rope_scaling": {
+    "factor": 8.0,
+    "high_freq_factor": 4.0,
+    "low_freq_factor": 1.0,
+    "original_max_position_embeddings": 8192,
+    "rope_type": "llama3"
+  },
+  "rope_theta": 500000.0,
+  "tie_word_embeddings": false,
+  "torch_dtype": "bfloat16",
+  "transformers_version": "4.44.2",
+  "use_cache": true,
+  "vocab_size": 128771,
+  "quantization_config": {
+    "quant_method": "exl2",
+    "version": "0.2.2",
+    "bits": 8.2,
+    "head_bits": 6,
+    "calibration": {
+      "rows": 115,
+      "length": 2048,
+      "dataset": "(default)"
+    }
+  }
+}
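
The config above matches the stock Llama-3.1-8B geometry, with an extended vocabulary (128771 tokens) plus an ExLlamaV2 (exl2) quantization stanza recording 8.2 bits per weight and 6-bit head layers. A small sketch, assuming the file is saved locally as config.json, that derives the shapes it implies:

```python
# Sketch: read the config above and derive the attention shapes it implies.
import json

with open("config.json") as f:
    cfg = json.load(f)

head_dim = cfg["hidden_size"] // cfg["num_attention_heads"]           # 4096 // 32 = 128
kv_groups = cfg["num_attention_heads"] // cfg["num_key_value_heads"]  # 32 // 8 = 4 (GQA)
ctx_ext = (cfg["max_position_embeddings"]
           // cfg["rope_scaling"]["original_max_position_embeddings"])  # 131072 // 8192 = 16

print(f"head_dim={head_dim}, query heads per KV head={kv_groups}, context extension={ctx_ext}x")
```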
generation_config.json ADDED
@@ -0,0 +1,10 @@
+{
+  "_from_model_config": true,
+  "bos_token_id": 128000,
+  "eos_token_id": [
+    128001,
+    128008,
+    128009
+  ],
+  "transformers_version": "4.44.2"
+}
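
generation_config.json mirrors the token ids in config.json: decoding starts from 128000 (`<|begin_of_text|>`) and stops on any of 128001, 128008, or 128009 (the usual Llama 3.1 terminators `<|end_of_text|>`, `<|eom_id|>`, `<|eot_id|>`). A sketch of reading it back with transformers, assuming a placeholder repo id:

```python
# Sketch: load the generation defaults with transformers.
# The repo id is a placeholder, not the actual repository name.
from transformers import GenerationConfig

gen_cfg = GenerationConfig.from_pretrained("user/llama3-s-instruct-v0.3")
print(gen_cfg.eos_token_id)  # [128001, 128008, 128009] -- generate() stops on any of them
```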
loss_log.txt ADDED
@@ -0,0 +1,644 @@
+Step 1 | loss:1.0854802131652832 lr:1.875e-06 tokens_per_second_per_gpu:1168.8574294071448
+Step 2 | loss:1.1429901123046875 lr:3.75e-06 tokens_per_second_per_gpu:3843.7632889854767
+Step 3 | loss:1.2053295373916626 lr:5.625e-06 tokens_per_second_per_gpu:3842.159382826307
+Step 4 | loss:1.079384446144104 lr:7.5e-06 tokens_per_second_per_gpu:3527.3851566722074
+Step 5 | loss:1.2358113527297974 lr:9.375000000000001e-06 tokens_per_second_per_gpu:3297.665429778813
+Step 6 | loss:1.0729756355285645 lr:1.125e-05 tokens_per_second_per_gpu:3513.7372696945217
+Step 7 | loss:0.9907684326171875 lr:1.3125e-05 tokens_per_second_per_gpu:3930.22637976525
+Step 8 | loss:1.143568515777588 lr:1.5e-05 tokens_per_second_per_gpu:2876.219047245818
+Step 9 | loss:1.2526814937591553 lr:1.4999908501094575e-05 tokens_per_second_per_gpu:3772.480582082779
+Step 10 | loss:1.1926498413085938 lr:1.4999634006610844e-05 tokens_per_second_per_gpu:3787.9663127264835
+Step 11 | loss:1.1863847970962524 lr:1.4999176523246392e-05 tokens_per_second_per_gpu:2879.4265754703606
+Step 12 | loss:1.0382694005966187 lr:1.4998536062163683e-05 tokens_per_second_per_gpu:3367.342512712612
+Step 13 | loss:1.0486845970153809 lr:1.4997712638989775e-05 tokens_per_second_per_gpu:3973.1834061223685
+Step 14 | loss:1.0994930267333984 lr:1.499670627381596e-05 tokens_per_second_per_gpu:3882.9148296055278
+Step 15 | loss:1.0168027877807617 lr:1.4995516991197246e-05 tokens_per_second_per_gpu:3710.4998349202683
+Step 16 | loss:1.0419063568115234 lr:1.4994144820151789e-05 tokens_per_second_per_gpu:2869.5305816393893
+Step 17 | loss:1.065361738204956 lr:1.4992589794160159e-05 tokens_per_second_per_gpu:3181.270854761355
+Step 18 | loss:1.1389403343200684 lr:1.4990851951164537e-05 tokens_per_second_per_gpu:2980.579149781292
+Step 19 | loss:1.1638612747192383 lr:1.4988931333567785e-05 tokens_per_second_per_gpu:3509.8801697983517
+Step 20 | loss:1.0619863271713257 lr:1.4986827988232412e-05 tokens_per_second_per_gpu:3703.804663132431
+Step 21 | loss:1.1482210159301758 lr:1.4984541966479427e-05 tokens_per_second_per_gpu:3560.136617406801
+Step 22 | loss:1.0800286531448364 lr:1.4982073324087097e-05 tokens_per_second_per_gpu:3648.6607162752957
+Step 23 | loss:1.0815565586090088 lr:1.4979422121289576e-05 tokens_per_second_per_gpu:3211.609323443709
+Step 24 | loss:1.0016083717346191 lr:1.4976588422775437e-05 tokens_per_second_per_gpu:3403.8935665137487
+Step 25 | loss:1.054980993270874 lr:1.4973572297686097e-05 tokens_per_second_per_gpu:3733.187288468551
+Step 26 | loss:1.0594943761825562 lr:1.4970373819614128e-05 tokens_per_second_per_gpu:3380.227950545309
+Step 27 | loss:1.1039057970046997 lr:1.4966993066601459e-05 tokens_per_second_per_gpu:3416.5838997236365
+Step 28 | loss:1.0442756414413452 lr:1.4963430121137482e-05 tokens_per_second_per_gpu:3232.857948752556
+Step 29 | loss:1.3250808715820312 lr:1.495968507015702e-05 tokens_per_second_per_gpu:2739.8843861816085
+Step 30 | loss:1.0642691850662231 lr:1.495575800503823e-05 tokens_per_second_per_gpu:3270.2234151530183
+Step 31 | loss:1.0556732416152954 lr:1.4951649021600348e-05 tokens_per_second_per_gpu:3280.5378111148802
+Step 32 | loss:1.2651450634002686 lr:1.4947358220101378e-05 tokens_per_second_per_gpu:3675.5567883584245
+Step 33 | loss:0.9953560829162598 lr:1.4942885705235616e-05 tokens_per_second_per_gpu:3957.394946057751
+Step 34 | loss:0.9306876063346863 lr:1.493823158613113e-05 tokens_per_second_per_gpu:2993.4562512796037
+Step 35 | loss:0.9759434461593628 lr:1.4933395976347056e-05 tokens_per_second_per_gpu:3688.3981271180473
+Step 36 | loss:1.042667031288147 lr:1.4928378993870868e-05 tokens_per_second_per_gpu:2739.611697714639
+Step 37 | loss:1.1176235675811768 lr:1.4923180761115471e-05 tokens_per_second_per_gpu:3389.9575685437558
+Step 38 | loss:1.0304946899414062 lr:1.491780140491623e-05 tokens_per_second_per_gpu:2751.426529078761
+Step 39 | loss:1.1554368734359741 lr:1.4912241056527865e-05 tokens_per_second_per_gpu:3678.512086747305
+Step 40 | loss:1.1235543489456177 lr:1.4906499851621251e-05 tokens_per_second_per_gpu:3354.288583814501
+Step 41 | loss:1.1204215288162231 lr:1.4900577930280117e-05 tokens_per_second_per_gpu:2646.7652669210534
+Step 42 | loss:1.159371256828308 lr:1.489447543699761e-05 tokens_per_second_per_gpu:3113.893638504558
+Step 43 | loss:0.9518178701400757 lr:1.488819252067279e-05 tokens_per_second_per_gpu:3624.505087259554
+Step 44 | loss:1.000114917755127 lr:1.4881729334606977e-05 tokens_per_second_per_gpu:3995.207620964973
+Step 45 | loss:1.0402061939239502 lr:1.4875086036500025e-05 tokens_per_second_per_gpu:3694.213698196683
+Step 46 | loss:1.1420111656188965 lr:1.4868262788446472e-05 tokens_per_second_per_gpu:3655.494511670644
+Step 47 | loss:1.0602643489837646 lr:1.4861259756931577e-05 tokens_per_second_per_gpu:3402.054859100221
+Step 48 | loss:0.9862946271896362 lr:1.4854077112827263e-05 tokens_per_second_per_gpu:3261.120378766765
+Step 49 | loss:1.0737448930740356 lr:1.4846715031387952e-05 tokens_per_second_per_gpu:3560.4513581368537
+Step 50 | loss:1.0223171710968018 lr:1.483917369224628e-05 tokens_per_second_per_gpu:4119.596935775298
+Step 51 | loss:0.9688004851341248 lr:1.4831453279408724e-05 tokens_per_second_per_gpu:3727.7426885377727
+Step 52 | loss:1.2084407806396484 lr:1.4823553981251103e-05 tokens_per_second_per_gpu:3475.565097423413
+Step 53 | loss:1.0782098770141602 lr:1.4815475990513983e-05 tokens_per_second_per_gpu:3116.162844911365
+Step 54 | loss:1.076345682144165 lr:1.4807219504297984e-05 tokens_per_second_per_gpu:3636.4074969842172
+Step 55 | loss:0.9842542409896851 lr:1.4798784724058958e-05 tokens_per_second_per_gpu:3890.3173372257284
+Step 56 | loss:1.0145485401153564 lr:1.4790171855603081e-05 tokens_per_second_per_gpu:3849.2883562880224
+Step 57 | loss:1.0625418424606323 lr:1.4781381109081831e-05 tokens_per_second_per_gpu:3571.1217369942487
+Step 58 | loss:1.0252196788787842 lr:1.4772412698986854e-05 tokens_per_second_per_gpu:3456.6789270442778
+Step 59 | loss:0.9392111897468567 lr:1.4763266844144741e-05 tokens_per_second_per_gpu:3657.3789166667807
+Step 60 | loss:1.0814956426620483 lr:1.4753943767711678e-05 tokens_per_second_per_gpu:3315.163159791015
+Step 61 | loss:1.0520405769348145 lr:1.4744443697168013e-05 tokens_per_second_per_gpu:3701.303137175016
+Step 62 | loss:1.0226781368255615 lr:1.473476686431269e-05 tokens_per_second_per_gpu:3537.4955448330916
+Step 63 | loss:1.0486294031143188 lr:1.4724913505257609e-05 tokens_per_second_per_gpu:3095.7445591899086
+Step 64 | loss:1.0411049127578735 lr:1.4714883860421854e-05 tokens_per_second_per_gpu:3149.287379730578
+Step 65 | loss:1.0797317028045654 lr:1.4704678174525831e-05 tokens_per_second_per_gpu:3445.673531691869
+Step 66 | loss:0.9485515356063843 lr:1.4694296696585298e-05 tokens_per_second_per_gpu:3864.0149006705747
+Step 67 | loss:0.9495678544044495 lr:1.4683739679905284e-05 tokens_per_second_per_gpu:3101.8256009234356
+Step 68 | loss:1.0279844999313354 lr:1.4673007382073919e-05 tokens_per_second_per_gpu:3463.4301964845763
+Step 69 | loss:1.0704870223999023 lr:1.4662100064956132e-05 tokens_per_second_per_gpu:3121.3618232019967
+Step 70 | loss:1.0774767398834229 lr:1.4651017994687282e-05 tokens_per_second_per_gpu:2970.1779475300104
+Step 71 | loss:1.040583848953247 lr:1.4639761441666646e-05 tokens_per_second_per_gpu:3368.9417024963573
+Step 72 | loss:1.1608614921569824 lr:1.4628330680550833e-05 tokens_per_second_per_gpu:3591.1238850356212
+Step 73 | loss:1.1760778427124023 lr:1.4616725990247078e-05 tokens_per_second_per_gpu:3191.835895149416
+Step 74 | loss:1.0865557193756104 lr:1.4604947653906435e-05 tokens_per_second_per_gpu:3856.472486717953
+Step 75 | loss:0.997776985168457 lr:1.4592995958916877e-05 tokens_per_second_per_gpu:4288.807801484956
+Step 76 | loss:1.0847547054290771 lr:1.4580871196896266e-05 tokens_per_second_per_gpu:3642.7905562815736
+Step 77 | loss:1.1006416082382202 lr:1.4568573663685267e-05 tokens_per_second_per_gpu:2950.4712628757084
+Step 78 | loss:1.1318891048431396 lr:1.4556103659340091e-05 tokens_per_second_per_gpu:3420.308972950097
+Step 79 | loss:1.073240041732788 lr:1.4543461488125208e-05 tokens_per_second_per_gpu:3292.630999889894
+Step 80 | loss:0.9821822643280029 lr:1.4530647458505908e-05 tokens_per_second_per_gpu:3845.0406649088623
+Step 81 | loss:1.031591534614563 lr:1.4517661883140769e-05 tokens_per_second_per_gpu:3597.2292553821944
+Step 82 | loss:1.0589147806167603 lr:1.4504505078874041e-05 tokens_per_second_per_gpu:3923.875717440058
+Step 83 | loss:1.0883533954620361 lr:1.449117736672791e-05 tokens_per_second_per_gpu:3328.5666750491487
+Step 84 | loss:0.9703829288482666 lr:1.4477679071894659e-05 tokens_per_second_per_gpu:4210.9057239346785
+Step 85 | loss:1.0380264520645142 lr:1.4464010523728745e-05 tokens_per_second_per_gpu:4028.7523308737022
+Step 86 | loss:1.0424227714538574 lr:1.445017205573875e-05 tokens_per_second_per_gpu:3067.892409170815
+Step 87 | loss:0.9402433037757874 lr:1.4436164005579258e-05 tokens_per_second_per_gpu:3727.9697325163597
+Step 88 | loss:0.9366852641105652 lr:1.4421986715042602e-05 tokens_per_second_per_gpu:3359.9819616780637
+Step 89 | loss:0.9120358824729919 lr:1.4407640530050532e-05 tokens_per_second_per_gpu:3560.9878740376766
+Step 90 | loss:1.0565634965896606 lr:1.4393125800645775e-05 tokens_per_second_per_gpu:3406.1127910038686
+Step 91 | loss:1.0152472257614136 lr:1.4378442880983492e-05 tokens_per_second_per_gpu:3213.8003626321415
+Step 92 | loss:1.1250417232513428 lr:1.4363592129322638e-05 tokens_per_second_per_gpu:3550.8080018136066
+Step 93 | loss:1.1519802808761597 lr:1.4348573908017218e-05 tokens_per_second_per_gpu:3889.4264438590662
+Step 94 | loss:1.0634357929229736 lr:1.4333388583507448e-05 tokens_per_second_per_gpu:3799.4362078472236
+Step 95 | loss:0.9236892461776733 lr:1.4318036526310814e-05 tokens_per_second_per_gpu:3107.0781575328906
+Step 96 | loss:1.1531763076782227 lr:1.4302518111013029e-05 tokens_per_second_per_gpu:3633.8470364745854
+Step 97 | loss:1.1617469787597656 lr:1.4286833716258899e-05 tokens_per_second_per_gpu:3394.014913251338
+Step 98 | loss:1.049908995628357 lr:1.4270983724743077e-05 tokens_per_second_per_gpu:3281.7821637586317
+Step 99 | loss:1.168889045715332 lr:1.425496852320073e-05 tokens_per_second_per_gpu:3536.842535550907
+Step 100 | loss:1.029691457748413 lr:1.42387885023981e-05 tokens_per_second_per_gpu:3351.7619303741485
+Step 101 | loss:0.9969059824943542 lr:1.422244405712297e-05 tokens_per_second_per_gpu:3872.7354730770085
+Step 102 | loss:1.0703043937683105 lr:1.420593558617504e-05 tokens_per_second_per_gpu:3133.1573043410367
+Step 103 | loss:0.9668957591056824 lr:1.4189263492356176e-05 tokens_per_second_per_gpu:4133.178776143774
+Step 104 | loss:1.1153146028518677 lr:1.4172428182460605e-05 tokens_per_second_per_gpu:3279.1325303313006
+Step 105 | loss:1.0925307273864746 lr:1.4155430067264974e-05 tokens_per_second_per_gpu:3290.367388662595
+Step 106 | loss:0.9424126148223877 lr:1.4138269561518329e-05 tokens_per_second_per_gpu:3823.0717814425543
+Step 107 | loss:1.008542537689209 lr:1.4120947083932006e-05 tokens_per_second_per_gpu:3747.7812395683623
+Step 108 | loss:1.0429927110671997 lr:1.4103463057169398e-05 tokens_per_second_per_gpu:3590.085848047025
+Step 109 | loss:1.0133060216903687 lr:1.4085817907835657e-05 tokens_per_second_per_gpu:3174.886936324355
+Step 110 | loss:0.9840688109397888 lr:1.4068012066467276e-05 tokens_per_second_per_gpu:3362.005778821448
+Step 111 | loss:0.9953323006629944 lr:1.4050045967521587e-05 tokens_per_second_per_gpu:3757.2369577327404
+Step 112 | loss:1.2065812349319458 lr:1.4031920049366161e-05 tokens_per_second_per_gpu:3325.905289950363
+Step 113 | loss:0.941290020942688 lr:1.4013634754268107e-05 tokens_per_second_per_gpu:3273.1097453960997
+Step 114 | loss:1.006861686706543 lr:1.3995190528383292e-05 tokens_per_second_per_gpu:3408.69735024395
+Step 115 | loss:1.0291639566421509 lr:1.397658782174544e-05 tokens_per_second_per_gpu:3274.3777461502764
+Step 116 | loss:0.9624848961830139 lr:1.3957827088255166e-05 tokens_per_second_per_gpu:3524.4278903848763
+Step 117 | loss:1.0975605249404907 lr:1.3938908785668893e-05 tokens_per_second_per_gpu:3103.05657743153
+Step 118 | loss:1.0946214199066162 lr:1.3919833375587679e-05 tokens_per_second_per_gpu:3842.6176588735884
+Step 119 | loss:1.0759072303771973 lr:1.3900601323445961e-05 tokens_per_second_per_gpu:3808.512164666663
+Step 120 | loss:1.01813805103302 lr:1.3881213098500202e-05 tokens_per_second_per_gpu:3999.2491046466607
+Step 121 | loss:1.0882763862609863 lr:1.3861669173817427e-05 tokens_per_second_per_gpu:3427.5146935052067
+Step 122 | loss:0.9825055599212646 lr:1.3841970026263695e-05 tokens_per_second_per_gpu:3473.233743555214
+Step 123 | loss:1.099984884262085 lr:1.382211613649246e-05 tokens_per_second_per_gpu:3717.821725752323
+Step 124 | loss:1.0696111917495728 lr:1.3802107988932832e-05 tokens_per_second_per_gpu:3082.1936590885393
+Step 125 | loss:1.1309176683425903 lr:1.3781946071777777e-05 tokens_per_second_per_gpu:3077.2164906951607
+Step 126 | loss:1.0581340789794922 lr:1.3761630876972183e-05 tokens_per_second_per_gpu:3387.403226559853
+Step 127 | loss:1.0505608320236206 lr:1.3741162900200874e-05 tokens_per_second_per_gpu:3411.845981322547
+Step 128 | loss:0.9377124309539795 lr:1.3720542640876514e-05 tokens_per_second_per_gpu:3386.120190777436
+Step 129 | loss:1.0050909519195557 lr:1.3699770602127406e-05 tokens_per_second_per_gpu:3239.424588950932
+Step 130 | loss:1.0426461696624756 lr:1.3678847290785237e-05 tokens_per_second_per_gpu:3827.433691106946
+Step 131 | loss:1.0819664001464844 lr:1.3657773217372694e-05 tokens_per_second_per_gpu:3468.9528124343155
+Step 132 | loss:0.9391593933105469 lr:1.3636548896091019e-05 tokens_per_second_per_gpu:4109.727828655714
+Step 133 | loss:1.130295753479004 lr:1.3615174844807451e-05 tokens_per_second_per_gpu:3672.415503816543
+Step 134 | loss:0.9968859553337097 lr:1.359365158504261e-05 tokens_per_second_per_gpu:4390.05247525661
+Step 135 | loss:1.1019611358642578 lr:1.3571979641957745e-05 tokens_per_second_per_gpu:3287.8714995416
+Step 136 | loss:1.047814965248108 lr:1.3550159544341948e-05 tokens_per_second_per_gpu:3286.5090224768937
+Step 137 | loss:1.041077733039856 lr:1.3528191824599228e-05 tokens_per_second_per_gpu:3717.2716801705315
+Step 138 | loss:1.0515552759170532 lr:1.3506077018735533e-05 tokens_per_second_per_gpu:3202.3379318776147
+Step 139 | loss:1.1296720504760742 lr:1.3483815666345674e-05 tokens_per_second_per_gpu:3258.4210318353244
+Step 140 | loss:1.0627872943878174 lr:1.3461408310600151e-05 tokens_per_second_per_gpu:3194.7948425297013
+Step 141 | loss:1.0591386556625366 lr:1.3438855498231901e-05 tokens_per_second_per_gpu:3233.8322219652996
+Step 142 | loss:1.2165062427520752 lr:1.3416157779522969e-05 tokens_per_second_per_gpu:2700.8847148372674
+Step 143 | loss:1.1000216007232666 lr:1.339331570829106e-05 tokens_per_second_per_gpu:3090.8040651670713
+Step 144 | loss:1.0945175886154175 lr:1.3370329841876049e-05 tokens_per_second_per_gpu:3016.1521681626327
+Step 145 | loss:1.0223222970962524 lr:1.3347200741126368e-05 tokens_per_second_per_gpu:3864.9123751265906
+Step 146 | loss:1.0384180545806885 lr:1.3323928970385318e-05 tokens_per_second_per_gpu:3093.045925373374
+Step 147 | loss:1.0387237071990967 lr:1.3300515097477319e-05 tokens_per_second_per_gpu:3583.5866748553367
+Step 148 | loss:1.0544804334640503 lr:1.3276959693694032e-05 tokens_per_second_per_gpu:4013.569528765113
+Step 149 | loss:1.0835425853729248 lr:1.3253263333780434e-05 tokens_per_second_per_gpu:3747.288931376117
+Step 150 | loss:1.0735474824905396 lr:1.3229426595920794e-05 tokens_per_second_per_gpu:3016.964949179244
+Step 151 | loss:1.1732394695281982 lr:1.3205450061724554e-05 tokens_per_second_per_gpu:3552.8302097626038
+Step 152 | loss:1.0120192766189575 lr:1.3181334316212151e-05 tokens_per_second_per_gpu:3993.2864465929683
+Step 153 | loss:1.098035216331482 lr:1.3157079947800736e-05 tokens_per_second_per_gpu:3630.096833881631
+Step 154 | loss:1.0417951345443726 lr:1.313268754828982e-05 tokens_per_second_per_gpu:4128.380187352026
+Step 155 | loss:1.0801373720169067 lr:1.3108157712846833e-05 tokens_per_second_per_gpu:3170.1141756442244
+Step 156 | loss:0.9469473361968994 lr:1.3083491039992596e-05 tokens_per_second_per_gpu:3746.1269785447903
+Step 157 | loss:1.1583002805709839 lr:1.3058688131586727e-05 tokens_per_second_per_gpu:2958.969451708439
+Step 158 | loss:1.0937081575393677 lr:1.3033749592812955e-05 tokens_per_second_per_gpu:3625.0738232335902
+Step 159 | loss:1.041576623916626 lr:1.3008676032164346e-05 tokens_per_second_per_gpu:3174.5789362480195
+Step 160 | loss:1.1431220769882202 lr:1.2983468061428455e-05 tokens_per_second_per_gpu:3186.8973435756893
+Step 161 | loss:1.060421347618103 lr:1.2958126295672419e-05 tokens_per_second_per_gpu:3658.4590960017413
+Step 162 | loss:0.9672384262084961 lr:1.293265135322792e-05 tokens_per_second_per_gpu:3234.76727249106
+Step 163 | loss:1.1202409267425537 lr:1.290704385567612e-05 tokens_per_second_per_gpu:3683.5608176507158
+Step 164 | loss:1.1316494941711426 lr:1.2881304427832483e-05 tokens_per_second_per_gpu:2893.5620231739244
+Step 165 | loss:1.0530860424041748 lr:1.2855433697731538e-05 tokens_per_second_per_gpu:3596.0516529135816
+Step 166 | loss:0.9973039627075195 lr:1.2829432296611547e-05 tokens_per_second_per_gpu:3239.2785653439887
+Step 167 | loss:1.0723451375961304 lr:1.2803300858899106e-05 tokens_per_second_per_gpu:3442.490376433312
+Step 168 | loss:1.0164995193481445 lr:1.2777040022193672e-05 tokens_per_second_per_gpu:3164.9946873077406
+Step 169 | loss:0.9852588176727295 lr:1.275065042725199e-05 tokens_per_second_per_gpu:3323.0232306554753
+Step 170 | loss:1.0778350830078125 lr:1.2724132717972478e-05 tokens_per_second_per_gpu:2830.1534574906404
+Step 171 | loss:1.1002942323684692 lr:1.26974875413795e-05 tokens_per_second_per_gpu:3511.467336305257
+Step 172 | loss:1.0973403453826904 lr:1.267071554760759e-05 tokens_per_second_per_gpu:4210.974619365593
+Step 173 | loss:1.0474810600280762 lr:1.264381738988558e-05 tokens_per_second_per_gpu:3633.5914296190986
+Step 174 | loss:1.107753038406372 lr:1.2616793724520665e-05 tokens_per_second_per_gpu:3379.4868101415323
+Step 175 | loss:1.0912103652954102 lr:1.2589645210882397e-05 tokens_per_second_per_gpu:3268.3413161504523
+Step 176 | loss:0.9168727397918701 lr:1.256237251138658e-05 tokens_per_second_per_gpu:3584.472835944426
+Step 177 | loss:1.0452046394348145 lr:1.2534976291479122e-05 tokens_per_second_per_gpu:3418.838663433972
+Step 178 | loss:1.0404298305511475 lr:1.2507457219619796e-05 tokens_per_second_per_gpu:2913.0762752618957
+Step 179 | loss:0.9873407483100891 lr:1.247981596726592e-05 tokens_per_second_per_gpu:3476.05126598452
+Step 180 | loss:0.9315714836120605 lr:1.245205320885598e-05 tokens_per_second_per_gpu:4233.830633953143
+Step 181 | loss:1.0789451599121094 lr:1.2424169621793182e-05 tokens_per_second_per_gpu:3855.9909233022713
+Step 182 | loss:1.0372527837753296 lr:1.2396165886428913e-05 tokens_per_second_per_gpu:3188.532121002085
+Step 183 | loss:0.9855374097824097 lr:1.2368042686046139e-05 tokens_per_second_per_gpu:3360.102286992968
+Step 184 | loss:0.9893398880958557 lr:1.233980070684274e-05 tokens_per_second_per_gpu:3529.360812306045
+Step 185 | loss:1.0867488384246826 lr:1.2311440637914766e-05 tokens_per_second_per_gpu:3179.6720985050933
+Step 186 | loss:1.136749267578125 lr:1.228296317123962e-05 tokens_per_second_per_gpu:3040.5648519621095
+Step 187 | loss:1.0160714387893677 lr:1.2254369001659178e-05 tokens_per_second_per_gpu:2636.2919865625445
+Step 188 | loss:0.9412505030632019 lr:1.2225658826862835e-05 tokens_per_second_per_gpu:3556.44720893037
+Step 189 | loss:1.0352046489715576 lr:1.219683334737047e-05 tokens_per_second_per_gpu:3189.685627683808
+Step 190 | loss:1.0164215564727783 lr:1.2167893266515368e-05 tokens_per_second_per_gpu:3481.096336571031
+Step 191 | loss:0.9528836607933044 lr:1.2138839290427062e-05 tokens_per_second_per_gpu:3763.6803007713847
+Step 192 | loss:0.874556303024292 lr:1.210967212801408e-05 tokens_per_second_per_gpu:3543.1880047946393
+Step 193 | loss:0.9521118402481079 lr:1.208039249094668e-05 tokens_per_second_per_gpu:3085.3159173416116
+Step 194 | loss:0.9902434349060059 lr:1.2051001093639451e-05 tokens_per_second_per_gpu:3872.794807892901
+Step 195 | loss:1.0282405614852905 lr:1.2021498653233912e-05 tokens_per_second_per_gpu:3357.2798676727166
+Step 196 | loss:1.0458389520645142 lr:1.1991885889581001e-05 tokens_per_second_per_gpu:3481.726431927051
+Step 197 | loss:1.06132972240448 lr:1.1962163525223505e-05 tokens_per_second_per_gpu:3642.1604094854006
+Step 198 | loss:1.0513228178024292 lr:1.1932332285378438e-05 tokens_per_second_per_gpu:3643.609473789504
+Step 199 | loss:1.0699129104614258 lr:1.1902392897919344e-05 tokens_per_second_per_gpu:4025.895506710408
+Step 200 | loss:1.0457760095596313 lr:1.187234609335854e-05 tokens_per_second_per_gpu:3671.934888630731
+Step 201 | loss:1.0659315586090088 lr:1.1842192604829286e-05 tokens_per_second_per_gpu:3569.504587981394
+Step 202 | loss:0.980949878692627 lr:1.1811933168067903e-05 tokens_per_second_per_gpu:3335.3855968235766
+Step 203 | loss:1.0401633977890015 lr:1.1781568521395815e-05 tokens_per_second_per_gpu:3991.0346971342574
+Step 204 | loss:1.008753776550293 lr:1.1751099405701535e-05 tokens_per_second_per_gpu:3817.479518764033
+Step 205 | loss:0.9701691269874573 lr:1.1720526564422593e-05 tokens_per_second_per_gpu:3387.821746371587
+Step 206 | loss:1.122227430343628 lr:1.1689850743527394e-05 tokens_per_second_per_gpu:3950.7440411904113
+Step 207 | loss:1.0229696035385132 lr:1.1659072691497014e-05 tokens_per_second_per_gpu:4086.9911967317607
+Step 208 | loss:1.0376312732696533 lr:1.1628193159306939e-05 tokens_per_second_per_gpu:3460.2927592053616
+Step 209 | loss:1.0910463333129883 lr:1.1597212900408738e-05 tokens_per_second_per_gpu:3441.665664549203
+Step 210 | loss:0.951766848564148 lr:1.1566132670711691e-05 tokens_per_second_per_gpu:3765.2848616356405
+Step 211 | loss:1.1121230125427246 lr:1.1534953228564325e-05 tokens_per_second_per_gpu:3348.8906808140177
+Step 212 | loss:0.9478145241737366 lr:1.1503675334735933e-05 tokens_per_second_per_gpu:3729.1377760854084
+Step 213 | loss:1.1243412494659424 lr:1.1472299752397989e-05 tokens_per_second_per_gpu:3466.9044365152718
+Step 214 | loss:0.9882078170776367 lr:1.1440827247105546e-05 tokens_per_second_per_gpu:3301.1196001394637
+Step 215 | loss:1.0635075569152832 lr:1.140925858677855e-05 tokens_per_second_per_gpu:3329.520308962072
+Step 216 | loss:0.9767946004867554 lr:1.1377594541683095e-05 tokens_per_second_per_gpu:3282.602115501923
+Step 217 | loss:0.9662872552871704 lr:1.134583588441264e-05 tokens_per_second_per_gpu:3304.2082107578244
+Step 218 | loss:0.9389134049415588 lr:1.1313983389869154e-05 tokens_per_second_per_gpu:3737.964426771077
+Step 219 | loss:1.0151091814041138 lr:1.1282037835244205e-05 tokens_per_second_per_gpu:3664.737039346888
+Step 220 | loss:1.0414142608642578 lr:1.125e-05 tokens_per_second_per_gpu:3906.458243931466
+Step 221 | loss:1.0948292016983032 lr:1.121787066585037e-05 tokens_per_second_per_gpu:3126.7367416868815
+Step 222 | loss:1.1479017734527588 lr:1.118565061674169e-05 tokens_per_second_per_gpu:3310.893821872108
+Step 223 | loss:1.1810510158538818 lr:1.1153340638833753e-05 tokens_per_second_per_gpu:3048.5584944239317
+Step 224 | loss:0.9962694048881531 lr:1.1120941520480588e-05 tokens_per_second_per_gpu:3589.2994193950644
+Step 225 | loss:1.0925935506820679 lr:1.1088454052211226e-05 tokens_per_second_per_gpu:3637.8874197495197
+Step 226 | loss:1.0008320808410645 lr:1.1055879026710413e-05 tokens_per_second_per_gpu:3710.3314714150306
+Step 227 | loss:1.0037486553192139 lr:1.102321723879926e-05 tokens_per_second_per_gpu:3530.8579202116766
+Step 228 | loss:1.0278898477554321 lr:1.0990469485415859e-05 tokens_per_second_per_gpu:3603.3149217944747
+Step 229 | loss:0.9470862150192261 lr:1.0957636565595835e-05 tokens_per_second_per_gpu:3314.542239706414
+Step 230 | loss:0.974503755569458 lr:1.0924719280452849e-05 tokens_per_second_per_gpu:3547.2378512191153
+Step 231 | loss:0.9583215713500977 lr:1.0891718433159048e-05 tokens_per_second_per_gpu:3523.465479574953
+Step 232 | loss:1.0647832155227661 lr:1.0858634828925474e-05 tokens_per_second_per_gpu:4079.209517175555
+Step 233 | loss:1.0859055519104004 lr:1.0825469274982416e-05 tokens_per_second_per_gpu:3748.6882115494077
+Step 234 | loss:1.0758495330810547 lr:1.0792222580559706e-05 tokens_per_second_per_gpu:3851.188199977994
+Step 235 | loss:0.9934150576591492 lr:1.0758895556866984e-05 tokens_per_second_per_gpu:3027.8201410044285
+Step 236 | loss:0.9749125838279724 lr:1.0725489017073905e-05 tokens_per_second_per_gpu:3824.3620745614776
+Step 237 | loss:1.1250077486038208 lr:1.0692003776290284e-05 tokens_per_second_per_gpu:3812.241805278313
+Step 238 | loss:0.9836297631263733 lr:1.0658440651546224e-05 tokens_per_second_per_gpu:3754.6913377732058
+Step 239 | loss:1.0390492677688599 lr:1.0624800461772173e-05 tokens_per_second_per_gpu:3801.826857915905
+Step 240 | loss:1.04658842086792 lr:1.059108402777894e-05 tokens_per_second_per_gpu:3247.5606533242094
+Step 241 | loss:1.1221239566802979 lr:1.0557292172237676e-05 tokens_per_second_per_gpu:3813.4732938265543
+Step 242 | loss:1.150991678237915 lr:1.0523425719659793e-05 tokens_per_second_per_gpu:3428.596138959447
+Step 243 | loss:1.139402151107788 lr:1.0489485496376844e-05 tokens_per_second_per_gpu:3350.111202236054
+Step 244 | loss:0.990974485874176 lr:1.0455472330520378e-05 tokens_per_second_per_gpu:3301.0046192724767
+Step 245 | loss:1.023050308227539 lr:1.0421387052001705e-05 tokens_per_second_per_gpu:2901.330223329429
+Step 246 | loss:1.0226176977157593 lr:1.0387230492491678e-05 tokens_per_second_per_gpu:3203.0339991749943
+Step 247 | loss:1.0416032075881958 lr:1.0353003485400378e-05 tokens_per_second_per_gpu:3180.0409616941442
+Step 248 | loss:0.9852002859115601 lr:1.0318706865856785e-05 tokens_per_second_per_gpu:3998.775527074616
+Step 249 | loss:1.1657763719558716 lr:1.028434147068841e-05 tokens_per_second_per_gpu:3891.387438322461
+Step 250 | loss:1.0519105195999146 lr:1.0249908138400862e-05 tokens_per_second_per_gpu:3888.449070526479
+Step 251 | loss:1.0405876636505127 lr:1.0215407709157396e-05 tokens_per_second_per_gpu:3619.335466976545
+Step 252 | loss:1.0283851623535156 lr:1.0180841024758419e-05 tokens_per_second_per_gpu:2752.070363393849
+Step 253 | loss:1.0298212766647339 lr:1.0146208928620938e-05 tokens_per_second_per_gpu:3307.377280772541
+Step 254 | loss:0.964581310749054 lr:1.0111512265757992e-05 tokens_per_second_per_gpu:3752.103655989273
+Step 255 | loss:0.9683664441108704 lr:1.0076751882758025e-05 tokens_per_second_per_gpu:4138.001777589129
+Step 256 | loss:1.0615788698196411 lr:1.0041928627764238e-05 tokens_per_second_per_gpu:3341.61440477967
+Step 257 | loss:1.0685681104660034 lr:1.0007043350453889e-05 tokens_per_second_per_gpu:3688.378050058521
+Step 258 | loss:1.0611388683319092 lr:9.972096902017559e-06 tokens_per_second_per_gpu:3797.8084242404275
+Step 259 | loss:0.961796760559082 lr:9.937090135138392e-06 tokens_per_second_per_gpu:3767.499020527814
+Step 260 | loss:0.9383830428123474 lr:9.902023903971282e-06 tokens_per_second_per_gpu:4237.725928428407
+Step 261 | loss:1.0182300806045532 lr:9.866899064122033e-06 tokens_per_second_per_gpu:3204.7581266878838
+Step 262 | loss:1.0680441856384277 lr:9.831716472626485e-06 tokens_per_second_per_gpu:3623.368447703484
+Step 263 | loss:1.0305362939834595 lr:9.796476987929601e-06 tokens_per_second_per_gpu:3667.2284929147554
+Step 264 | loss:1.0003256797790527 lr:9.761181469864523e-06 tokens_per_second_per_gpu:3335.105931326227
+Step 265 | loss:1.0257227420806885 lr:9.725830779631588e-06 tokens_per_second_per_gpu:4030.3851390190357
+Step 266 | loss:1.0340684652328491 lr:9.69042577977732e-06 tokens_per_second_per_gpu:3411.0997028354655
+Step 267 | loss:1.1450355052947998 lr:9.65496733417338e-06 tokens_per_second_per_gpu:3052.7514833967907
+Step 268 | loss:1.0675625801086426 lr:9.619456307995492e-06 tokens_per_second_per_gpu:3714.1461111976373
+Step 269 | loss:1.0681331157684326 lr:9.583893567702329e-06 tokens_per_second_per_gpu:3103.583450091087
+Step 270 | loss:1.0137324333190918 lr:9.548279981014373e-06 tokens_per_second_per_gpu:3423.384868860896
+Step 271 | loss:1.0491313934326172 lr:9.512616416892749e-06 tokens_per_second_per_gpu:3583.567062020731
+Step 272 | loss:0.9289360046386719 lr:9.476903745518007e-06 tokens_per_second_per_gpu:3940.2272562878234
+Step 273 | loss:0.9665488600730896 lr:9.441142838268906e-06 tokens_per_second_per_gpu:2920.460239292336
+Step 274 | loss:1.0877517461776733 lr:9.405334567701143e-06 tokens_per_second_per_gpu:2961.311684974825
+Step 275 | loss:1.0493131875991821 lr:9.369479807526072e-06 tokens_per_second_per_gpu:3966.098017081578
+Step 276 | loss:1.0874686241149902 lr:9.333579432589371e-06 tokens_per_second_per_gpu:3387.0541722877924
+Step 277 | loss:0.9608097672462463 lr:9.297634318849712e-06 tokens_per_second_per_gpu:4291.32165742434
+Step 278 | loss:1.095219612121582 lr:9.26164534335738e-06 tokens_per_second_per_gpu:3076.5752421480315
+Step 279 | loss:1.0561093091964722 lr:9.225613384232867e-06 tokens_per_second_per_gpu:3212.6037289159076
+Step 280 | loss:1.0817214250564575 lr:9.189539320645461e-06 tokens_per_second_per_gpu:3659.336877166898
+Step 281 | loss:1.0177308320999146 lr:9.15342403279179e-06 tokens_per_second_per_gpu:3738.1776707794156
+Step 282 | loss:0.9710911512374878 lr:9.117268401874329e-06 tokens_per_second_per_gpu:3382.0765709925126
+Step 283 | loss:1.1531765460968018 lr:9.081073310079919e-06 tokens_per_second_per_gpu:3141.8432625191035
+Step 284 | loss:1.0935015678405762 lr:9.044839640558238e-06 tokens_per_second_per_gpu:3429.2429675320373
+Step 285 | loss:0.9385885000228882 lr:9.008568277400246e-06 tokens_per_second_per_gpu:3346.592745218261
+Step 286 | loss:0.9143052101135254 lr:8.972260105616615e-06 tokens_per_second_per_gpu:3597.7170701513446
+Step 287 | loss:1.1004893779754639 lr:8.935916011116141e-06 tokens_per_second_per_gpu:3088.5027102851095
+Step 288 | loss:1.0081392526626587 lr:8.899536880684118e-06 tokens_per_second_per_gpu:3565.6117227664477
+Step 289 | loss:1.0537937879562378 lr:8.863123601960713e-06 tokens_per_second_per_gpu:3896.8268798437653
+Step 290 | loss:1.0108102560043335 lr:8.826677063419297e-06 tokens_per_second_per_gpu:2986.3032089900526
+Step 291 | loss:0.9827962517738342 lr:8.790198154344774e-06 tokens_per_second_per_gpu:3411.9397148980493
+Step 292 | loss:0.9616813659667969 lr:8.753687764811874e-06 tokens_per_second_per_gpu:4128.635119314705
+Step 293 | loss:1.0986084938049316 lr:8.717146785663451e-06 tokens_per_second_per_gpu:3044.1432990396206
+Step 294 | loss:1.010633111000061 lr:8.680576108488722e-06 tokens_per_second_per_gpu:4016.5995836897896
+Step 295 | loss:1.1678359508514404 lr:8.643976625601543e-06 tokens_per_second_per_gpu:3484.066437413288
+Step 296 | loss:0.9672271609306335 lr:8.60734923001861e-06 tokens_per_second_per_gpu:3722.9078388086114
+Step 297 | loss:1.0644941329956055 lr:8.570694815437684e-06 tokens_per_second_per_gpu:3674.078084922676
+Step 298 | loss:1.0218087434768677 lr:8.534014276215784e-06 tokens_per_second_per_gpu:3719.1251320960077
+Step 299 | loss:0.8616893291473389 lr:8.497308507347358e-06 tokens_per_second_per_gpu:3980.0883290351844
+Step 300 | loss:1.200434923171997 lr:8.460578404442452e-06 tokens_per_second_per_gpu:3537.9024408752107
+Step 301 | loss:1.2012302875518799 lr:8.423824863704859e-06 tokens_per_second_per_gpu:3341.0804733107207
+Step 302 | loss:1.0611824989318848 lr:8.387048781910243e-06 tokens_per_second_per_gpu:3399.3160464558623
+Step 303 | loss:1.0133317708969116 lr:8.350251056384267e-06 tokens_per_second_per_gpu:3396.784053005488
+Step 304 | loss:1.0075584650039673 lr:8.313432584980693e-06 tokens_per_second_per_gpu:3170.834006045349
+Step 305 | loss:1.0057969093322754 lr:8.276594266059473e-06 tokens_per_second_per_gpu:3349.7058914501686
+Step 306 | loss:0.9875553846359253 lr:8.239736998464839e-06 tokens_per_second_per_gpu:3283.555251517354
+Step 307 | loss:1.0624175071716309 lr:8.202861681503362e-06 tokens_per_second_per_gpu:3288.3057635972655
+Step 308 | loss:1.0439351797103882 lr:8.165969214922011e-06 tokens_per_second_per_gpu:3257.713880411403
+Step 309 | loss:1.021206021308899 lr:8.129060498886204e-06 tokens_per_second_per_gpu:3443.1099672423125
+Step 310 | loss:1.0511139631271362 lr:8.09213643395784e-06 tokens_per_second_per_gpu:3937.8803984424717
+Step 311 | loss:0.9559040069580078 lr:8.05519792107332e-06 tokens_per_second_per_gpu:3379.1804990828364
+Step 312 | loss:1.048850655555725 lr:8.018245861521585e-06 tokens_per_second_per_gpu:3090.2532971700753
+Step 313 | loss:1.1831482648849487 lr:7.981281156922097e-06 tokens_per_second_per_gpu:3205.4884774853103
+Step 314 | loss:1.065642237663269 lr:7.944304709202857e-06 tokens_per_second_per_gpu:3615.7381651115775
+Step 315 | loss:0.9639590978622437 lr:7.9073174205784e-06 tokens_per_second_per_gpu:3895.6258291409545
+Step 316 | loss:1.0430667400360107 lr:7.870320193527773e-06 tokens_per_second_per_gpu:3228.924479386842
+Step 317 | loss:1.104460597038269 lr:7.833313930772514e-06 tokens_per_second_per_gpu:3807.3283800568493
+Step 318 | loss:1.0125938653945923 lr:7.796299535254633e-06 tokens_per_second_per_gpu:3032.071204987172
+Step 319 | loss:1.064937949180603 lr:7.759277910114582e-06 tokens_per_second_per_gpu:3479.6920455124077
+Step 320 | loss:1.0094972848892212 lr:7.722249958669199e-06 tokens_per_second_per_gpu:3269.3602673037053
+Step 321 | loss:1.0703675746917725 lr:7.685216584389697e-06 tokens_per_second_per_gpu:3704.300080154252
+Step 322 | loss:0.9877763390541077 lr:7.648178690879598e-06 tokens_per_second_per_gpu:3353.1459094211864
+Step 323 | loss:1.12434983253479 lr:7.611137181852695e-06 tokens_per_second_per_gpu:3122.7128974218263
+Step 324 | loss:1.0240429639816284 lr:7.574092961110993e-06 tokens_per_second_per_gpu:3098.34410006354
+Step 325 | loss:0.9952061176300049 lr:7.537046932522668e-06 tokens_per_second_per_gpu:3604.4779903256135
+Step 326 | loss:1.011582612991333 lr:7.5e-06 tokens_per_second_per_gpu:3283.509048393859
+Step 327 | loss:0.9831445217132568 lr:7.462953067477332e-06 tokens_per_second_per_gpu:3775.59490266086
+Step 328 | loss:1.1675689220428467 lr:7.425907038889008e-06 tokens_per_second_per_gpu:3642.4454328719376
+Step 329 | loss:1.0117154121398926 lr:7.388862818147305e-06 tokens_per_second_per_gpu:3161.511403244572
+Step 330 | loss:0.9967679381370544 lr:7.351821309120403e-06 tokens_per_second_per_gpu:4076.6176819879342
+Step 331 | loss:1.0238803625106812 lr:7.314783415610303e-06 tokens_per_second_per_gpu:3682.85016942687
+Step 332 | loss:1.1032280921936035 lr:7.2777500413308015e-06 tokens_per_second_per_gpu:3018.3267646978616
+Step 333 | loss:1.0388715267181396 lr:7.240722089885421e-06 tokens_per_second_per_gpu:3664.4967840336526
+Step 334 | loss:1.0032538175582886 lr:7.203700464745366e-06 tokens_per_second_per_gpu:3878.0272534442006
+Step 335 | loss:0.9861161112785339 lr:7.166686069227486e-06 tokens_per_second_per_gpu:3704.921578499297
+Step 336 | loss:1.0723155736923218 lr:7.129679806472228e-06 tokens_per_second_per_gpu:3379.4966782090632
+Step 337 | loss:1.0907748937606812 lr:7.092682579421598e-06 tokens_per_second_per_gpu:3056.886775193357
+Step 338 | loss:1.0879602432250977 lr:7.055695290797143e-06 tokens_per_second_per_gpu:3510.6596962005956
+Step 339 | loss:1.2004570960998535 lr:7.018718843077904e-06 tokens_per_second_per_gpu:2698.696907414987
+Step 340 | loss:1.1619454622268677 lr:6.981754138478416e-06 tokens_per_second_per_gpu:3246.2572334355586
+Step 341 | loss:1.1515110731124878 lr:6.944802078926679e-06 tokens_per_second_per_gpu:3833.456255283371
+Step 342 | loss:0.9102157950401306 lr:6.907863566042161e-06 tokens_per_second_per_gpu:3861.3976159065483
+Step 343 | loss:1.0276517868041992 lr:6.870939501113796e-06 tokens_per_second_per_gpu:3515.3211579047493
+Step 344 | loss:1.0000083446502686 lr:6.834030785077989e-06 tokens_per_second_per_gpu:3557.346531908244
+Step 345 | loss:1.0519853830337524 lr:6.797138318496637e-06 tokens_per_second_per_gpu:3382.344983091482
+Step 346 | loss:1.0421191453933716 lr:6.7602630015351624e-06 tokens_per_second_per_gpu:3723.6117480414146
+Step 347 | loss:0.9147570133209229 lr:6.723405733940528e-06 tokens_per_second_per_gpu:4252.460083108924
+Step 348 | loss:0.9446225166320801 lr:6.68656741501931e-06 tokens_per_second_per_gpu:3618.6484563348768
+Step 349 | loss:0.9708753228187561 lr:6.649748943615732e-06 tokens_per_second_per_gpu:3283.5603120185374
+Step 350 | loss:0.989623486995697 lr:6.6129512180897574e-06 tokens_per_second_per_gpu:3714.9557633983477
+Step 351 | loss:1.0894560813903809 lr:6.57617513629514e-06 tokens_per_second_per_gpu:3633.0728284201673
+Step 352 | loss:1.115202784538269 lr:6.539421595557549e-06 tokens_per_second_per_gpu:3465.6183875539423
+Step 353 | loss:0.8921554088592529 lr:6.502691492652643e-06 tokens_per_second_per_gpu:3898.581389798211
+Step 354 | loss:1.0538413524627686 lr:6.465985723784218e-06 tokens_per_second_per_gpu:3875.7674647016806
+Step 355 | loss:1.1229345798492432 lr:6.429305184562315e-06 tokens_per_second_per_gpu:3747.439269311531
+Step 356 | loss:1.0564650297164917 lr:6.392650769981392e-06 tokens_per_second_per_gpu:3671.9056611937544
+Step 357 | loss:1.0135570764541626 lr:6.356023374398456e-06 tokens_per_second_per_gpu:2899.364909402783
+Step 358 | loss:1.0615811347961426 lr:6.319423891511278e-06 tokens_per_second_per_gpu:3801.336525876768
+Step 359 | loss:0.9940143823623657 lr:6.28285321433655e-06 tokens_per_second_per_gpu:3759.510249531142
+Step 360 | loss:1.051573395729065 lr:6.246312235188126e-06 tokens_per_second_per_gpu:3262.2914203901505
+Step 361 | loss:1.0494478940963745 lr:6.209801845655227e-06 tokens_per_second_per_gpu:3231.54016416701
+Step 362 | loss:1.134334921836853 lr:6.173322936580705e-06 tokens_per_second_per_gpu:3571.6658588531845
+Step 363 | loss:1.132298231124878 lr:6.136876398039287e-06 tokens_per_second_per_gpu:3057.396332653723
+Step 364 | loss:1.0008654594421387 lr:6.100463119315882e-06 tokens_per_second_per_gpu:3834.101013732286
+Step 365 | loss:1.1123298406600952 lr:6.0640839888838594e-06 tokens_per_second_per_gpu:3247.3705233572437
+Step 366 | loss:1.1211216449737549 lr:6.027739894383387e-06 tokens_per_second_per_gpu:2845.3333047002448
+Step 367 | loss:1.0459622144699097 lr:5.991431722599755e-06 tokens_per_second_per_gpu:3457.97226490014
+Step 368 | loss:1.0909711122512817 lr:5.955160359441763e-06 tokens_per_second_per_gpu:3221.2013455847036
+Step 369 | loss:1.1640664339065552 lr:5.918926689920081e-06 tokens_per_second_per_gpu:3451.3559735588115
+Step 370 | loss:1.2360296249389648 lr:5.882731598125674e-06 tokens_per_second_per_gpu:3289.3973935577333
+Step 371 | loss:1.0348141193389893 lr:5.846575967208211e-06 tokens_per_second_per_gpu:3435.4094495338736
+Step 372 | loss:0.9567263722419739 lr:5.810460679354538e-06 tokens_per_second_per_gpu:3576.81900822866
+Step 373 | loss:0.9700967073440552 lr:5.774386615767134e-06 tokens_per_second_per_gpu:4121.726341513599
+Step 374 | loss:1.0645740032196045 lr:5.738354656642623e-06 tokens_per_second_per_gpu:3347.2283933756694
+Step 375 | loss:1.0295037031173706 lr:5.702365681150289e-06 tokens_per_second_per_gpu:3868.2590545517114
+Step 376 | loss:1.0450406074523926 lr:5.66642056741063e-06 tokens_per_second_per_gpu:3477.8887834676907
+Step 377 | loss:1.0922298431396484 lr:5.630520192473929e-06 tokens_per_second_per_gpu:3405.0405996273425
+Step 378 | loss:0.9819583296775818 lr:5.594665432298858e-06 tokens_per_second_per_gpu:3530.057502597269
+Step 379 | loss:1.0701051950454712 lr:5.558857161731094e-06 tokens_per_second_per_gpu:3565.327286781416
+Step 380 | loss:0.9559437036514282 lr:5.523096254481995e-06 tokens_per_second_per_gpu:3365.6710550463654
+Step 381 | loss:1.0547300577163696 lr:5.48738358310725e-06 tokens_per_second_per_gpu:3346.3530578743994
+Step 382 | loss:0.9696499705314636 lr:5.451720018985626e-06 tokens_per_second_per_gpu:3329.928998826801
+Step 383 | loss:1.0064926147460938 lr:5.416106432297671e-06 tokens_per_second_per_gpu:4094.6995868513072
+Step 384 | loss:1.0114684104919434 lr:5.380543692004509e-06 tokens_per_second_per_gpu:3475.432549582163
+Step 385 | loss:0.9692176580429077 lr:5.345032665826621e-06 tokens_per_second_per_gpu:3244.483451213904
+Step 386 | loss:1.0306236743927002 lr:5.3095742202226825e-06 tokens_per_second_per_gpu:3269.062813082469
+Step 387 | loss:0.9608441591262817 lr:5.274169220368412e-06 tokens_per_second_per_gpu:3293.0322861856876
+Step 388 | loss:1.0451374053955078 lr:5.238818530135479e-06 tokens_per_second_per_gpu:2938.7446792343676
+Step 389 | loss:1.1184594631195068 lr:5.203523012070398e-06 tokens_per_second_per_gpu:4074.2637958572395
+Step 390 | loss:1.0143616199493408 lr:5.168283527373516e-06 tokens_per_second_per_gpu:3757.202601042462
+Step 391 | loss:1.0019251108169556 lr:5.133100935877967e-06 tokens_per_second_per_gpu:3643.925463722737
+Step 392 | loss:0.9685758352279663 lr:5.097976096028719e-06 tokens_per_second_per_gpu:3100.3578523101673
+Step 393 | loss:1.0199055671691895 lr:5.0629098648616075e-06 tokens_per_second_per_gpu:3384.275878283242
+Step 394 | loss:1.178783893585205 lr:5.027903097982441e-06 tokens_per_second_per_gpu:3391.4302131315617
+Step 395 | loss:1.0473383665084839 lr:4.992956649546113e-06 tokens_per_second_per_gpu:3063.8065410727527
+Step 396 | loss:1.0996811389923096 lr:4.958071372235763e-06 tokens_per_second_per_gpu:3498.2229002983877
+Step 397 | loss:1.0462857484817505 lr:4.923248117241975e-06 tokens_per_second_per_gpu:4333.282035445803
+Step 398 | loss:0.9347184300422668 lr:4.8884877342420105e-06 tokens_per_second_per_gpu:3622.574379253927
+Step 399 | loss:0.9804700613021851 lr:4.853791071379062e-06 tokens_per_second_per_gpu:3226.8869428343696
+Step 400 | loss:1.1005946397781372 lr:4.8191589752415834e-06 tokens_per_second_per_gpu:3372.8523923167445
+Step 401 | loss:0.8959193229675293 lr:4.7845922908426035e-06 tokens_per_second_per_gpu:3320.152228865833
+Step 402 | loss:1.021907925605774 lr:4.75009186159914e-06 tokens_per_second_per_gpu:3777.003290844435
+Step 403 | loss:1.0068416595458984 lr:4.71565852931159e-06 tokens_per_second_per_gpu:3986.641799795482
+Step 404 | loss:1.000541090965271 lr:4.681293134143216e-06 tokens_per_second_per_gpu:3747.1434322920663
+Step 405 | loss:1.2205549478530884 lr:4.646996514599623e-06 tokens_per_second_per_gpu:3820.4594798567286
+Step 406 | loss:0.9515297412872314 lr:4.612769507508324e-06 tokens_per_second_per_gpu:3795.377292339278
+Step 407 | loss:1.0238933563232422 lr:4.5786129479982945e-06 tokens_per_second_per_gpu:3229.7823570944547
+Step 408 | loss:1.0509495735168457 lr:4.544527669479625e-06 tokens_per_second_per_gpu:3349.333952642161
+Step 409 | loss:1.0613925457000732 lr:4.510514503623155e-06 tokens_per_second_per_gpu:3156.1717446768876
+Step 410 | loss:1.033313512802124 lr:4.4765742803402094e-06 tokens_per_second_per_gpu:3336.8989972246827
+Step 411 | loss:1.0496494770050049 lr:4.442707827762323e-06 tokens_per_second_per_gpu:3590.429787817092
+Step 412 | loss:1.0385499000549316 lr:4.4089159722210605e-06 tokens_per_second_per_gpu:3170.2776264525723
+Step 413 | loss:0.9418953061103821 lr:4.375199538227827e-06 tokens_per_second_per_gpu:3534.558502200195
+Step 414 | loss:1.047280192375183 lr:4.341559348453779e-06 tokens_per_second_per_gpu:2733.5207695529916
+Step 415 | loss:1.022471308708191 lr:4.307996223709717e-06 tokens_per_second_per_gpu:3890.7734867828663
+Step 416 | loss:1.0086355209350586 lr:4.274510982926099e-06 tokens_per_second_per_gpu:3987.506565566676
+Step 417 | loss:1.0402534008026123 lr:4.2411044431330156e-06 tokens_per_second_per_gpu:3236.307481873333
+Step 418 | loss:0.8711655139923096 lr:4.207777419440298e-06 tokens_per_second_per_gpu:3975.2863975021874
+Step 419 | loss:1.023214340209961 lr:4.174530725017586e-06 tokens_per_second_per_gpu:3554.0476686396737
+Step 420 | loss:0.9571332335472107 lr:4.141365171074529e-06 tokens_per_second_per_gpu:3525.9512956270514
+Step 421 | loss:1.0896693468093872 lr:4.108281566840953e-06 tokens_per_second_per_gpu:3153.098645463316
+Step 422 | loss:0.9747520685195923 lr:4.075280719547152e-06 tokens_per_second_per_gpu:3966.0455004943665
+Step 423 | loss:1.1110624074935913 lr:4.042363434404165e-06 tokens_per_second_per_gpu:3779.0025960246794
+Step 424 | loss:1.0604115724563599 lr:4.009530514584142e-06 tokens_per_second_per_gpu:3830.9188953316843
+Step 425 | loss:1.0563576221466064 lr:3.976782761200741e-06 tokens_per_second_per_gpu:3582.450000860484
+Step 426 | loss:1.0671446323394775 lr:3.944120973289589e-06 tokens_per_second_per_gpu:2908.494054782393
+Step 427 | loss:0.9762901067733765 lr:3.911545947788775e-06 tokens_per_second_per_gpu:3053.4349628908967
+Step 428 | loss:0.9776210188865662 lr:3.879058479519415e-06 tokens_per_second_per_gpu:3544.2355797599025
+Step 429 | loss:1.0141849517822266 lr:3.846659361166249e-06 tokens_per_second_per_gpu:3297.786365076082
+Step 430 | loss:1.1040488481521606 lr:3.8143493832583126e-06 tokens_per_second_per_gpu:3338.843244763157
+Step 431 | loss:1.0626472234725952 lr:3.7821293341496314e-06 tokens_per_second_per_gpu:4418.329163218037
+Step 432 | loss:1.0949605703353882 lr:3.750000000000002e-06 tokens_per_second_per_gpu:3047.106222452544
+Step 433 | loss:1.0159730911254883 lr:3.717962164755795e-06 tokens_per_second_per_gpu:3602.6745614483334
+Step 434 | loss:1.0500991344451904 lr:3.686016610130848e-06 tokens_per_second_per_gpu:3986.7451576661006
+Step 435 | loss:1.0814403295516968 lr:3.654164115587359e-06 tokens_per_second_per_gpu:3571.832029398707
+Step 436 | loss:1.0364539623260498 lr:3.622405458316908e-06 tokens_per_second_per_gpu:3642.0438527830474
+Step 437 | loss:1.1901767253875732 lr:3.5907414132214504e-06 tokens_per_second_per_gpu:3176.521239313863
+Step 438 | loss:0.9588651657104492 lr:3.5591727528944566e-06 tokens_per_second_per_gpu:3918.860748590204
+Step 439 | loss:1.0434871912002563 lr:3.5277002476020124e-06 tokens_per_second_per_gpu:3299.17808549516
+Step 440 | loss:1.0512974262237549 lr:3.496324665264073e-06 tokens_per_second_per_gpu:3635.173332567157
+Step 441 | loss:1.231705665588379 lr:3.465046771435676e-06 tokens_per_second_per_gpu:3074.205941042435
+Step 442 | loss:1.0789237022399902 lr:3.4338673292883108e-06 tokens_per_second_per_gpu:3693.2828587088543
+Step 443 | loss:1.2193183898925781 lr:3.4027870995912626e-06 tokens_per_second_per_gpu:3431.2840987432733
+Step 444 | loss:1.0715184211730957 lr:3.3718068406930634e-06 tokens_per_second_per_gpu:3682.480389381471
+Step 445 | loss:1.0878655910491943 lr:3.3409273085029877e-06 tokens_per_second_per_gpu:3247.4307989923645
+Step 446 | loss:1.1202605962753296 lr:3.3101492564726074e-06 tokens_per_second_per_gpu:2868.354735858521
+Step 447 | loss:0.9288511276245117 lr:3.279473435577409e-06 tokens_per_second_per_gpu:3563.062402935358
+Step 448 | loss:1.0990906953811646 lr:3.2489005942984675e-06 tokens_per_second_per_gpu:2858.893387808962
+Step 449 | loss:0.9979101419448853 lr:3.218431478604187e-06 tokens_per_second_per_gpu:3678.9613236315463
+Step 450 | loss:1.1519663333892822 lr:3.188066831932098e-06 tokens_per_second_per_gpu:3727.64506347197
+Step 451 | loss:1.0830491781234741 lr:3.157807395170714e-06 tokens_per_second_per_gpu:3000.9712833795475
+Step 452 | loss:0.953801691532135 lr:3.127653906641461e-06 tokens_per_second_per_gpu:3259.1973993559814
+Step 453 | loss:1.0369517803192139 lr:3.097607102080657e-06 tokens_per_second_per_gpu:3511.980574010068
+Step 454 | loss:1.092458724975586 lr:3.067667714621564e-06 tokens_per_second_per_gpu:3294.265502138293
+Step 455 | loss:0.936871349811554 lr:3.037836474776495e-06 tokens_per_second_per_gpu:3481.5948657632803
+Step 456 | loss:1.1124027967453003 lr:3.008114110419e-06 tokens_per_second_per_gpu:3812.8193336820773
+Step 457 | loss:1.0468907356262207 lr:2.978501346766086e-06 tokens_per_second_per_gpu:3151.5910084710267
+Step 458 | loss:1.0531938076019287 lr:2.948998906360552e-06 tokens_per_second_per_gpu:4215.472517871868
+Step 459 | loss:1.041715145111084 lr:2.9196075090533224e-06 tokens_per_second_per_gpu:3710.2677325476725
+Step 460 | loss:1.0383410453796387 lr:2.890327871985922e-06 tokens_per_second_per_gpu:3652.23350104369
+Step 461 | loss:1.0480257272720337 lr:2.8611607095729393e-06 tokens_per_second_per_gpu:3064.9383393059015
+Step 462 | loss:0.9538127779960632 lr:2.8321067334846317e-06 tokens_per_second_per_gpu:3769.898140676965
+Step 463 | loss:0.9850819110870361 lr:2.8031666526295325e-06 tokens_per_second_per_gpu:3406.614961871896
+Step 464 | loss:1.0228904485702515 lr:2.7743411731371677e-06 tokens_per_second_per_gpu:3569.9039916901243
+Step 465 | loss:0.9739654064178467 lr:2.7456309983408215e-06 tokens_per_second_per_gpu:3855.6404996182887
+Step 466 | loss:1.0107979774475098 lr:2.7170368287603812e-06 tokens_per_second_per_gpu:2971.2864568059426
+Step 467 | loss:1.092708945274353 lr:2.6885593620852362e-06 tokens_per_second_per_gpu:2782.0899935873463
+Step 468 | loss:1.1168324947357178 lr:2.660199293157263e-06 tokens_per_second_per_gpu:3767.413743825975
+Step 469 | loss:1.0790526866912842 lr:2.6319573139538637e-06 tokens_per_second_per_gpu:3453.5127524794507
+Step 470 | loss:1.0022902488708496 lr:2.603834113571089e-06 tokens_per_second_per_gpu:3047.035744052629
+Step 471 | loss:0.9685212969779968 lr:2.575830378206819e-06 tokens_per_second_per_gpu:3106.6813389813046
+Step 472 | loss:0.9829853773117065 lr:2.547946791144022e-06 tokens_per_second_per_gpu:3517.443534427285
+Step 473 | loss:1.0006290674209595 lr:2.520184032734084e-06 tokens_per_second_per_gpu:3299.938446528448
+Step 474 | loss:1.06125009059906 lr:2.492542780380207e-06 tokens_per_second_per_gpu:3319.6900284288845
+Step 475 | loss:1.1993439197540283 lr:2.4650237085208767e-06 tokens_per_second_per_gpu:3576.5268885175046
+Step 476 | loss:0.9784356951713562 lr:2.4376274886134225e-06 tokens_per_second_per_gpu:3467.613615187959
+Step 477 | loss:0.9801285862922668 lr:2.4103547891176042e-06 tokens_per_second_per_gpu:4156.110968467437
+Step 478 | loss:1.2105839252471924 lr:2.3832062754793375e-06 tokens_per_second_per_gpu:3410.3132050226804
+Step 479 | loss:1.0551785230636597 lr:2.356182610114421e-06 tokens_per_second_per_gpu:3536.043611031053
+Step 480 | loss:1.0802967548370361 lr:2.3292844523924132e-06 tokens_per_second_per_gpu:3609.396572236827
+Step 481 | loss:1.0849721431732178 lr:2.3025124586205e-06 tokens_per_second_per_gpu:3036.075258849583
+Step 482 | loss:1.0422958135604858 lr:2.2758672820275245e-06 tokens_per_second_per_gpu:3702.107907772152
+Step 483 | loss:1.0579990148544312 lr:2.2493495727480106e-06 tokens_per_second_per_gpu:3823.945807707913
+Step 484 | loss:1.0904960632324219 lr:2.22295997780633e-06 tokens_per_second_per_gpu:3312.442269740297
+Step 485 | loss:1.0701911449432373 lr:2.196699141100894e-06 tokens_per_second_per_gpu:3242.470040399053
+
Step 486 | loss:0.9568579196929932 lr:2.170567703388454e-06 tokens_per_second_per_gpu:3632.613290114698
|
487 |
+
Step 487 | loss:1.0242254734039307 lr:2.1445663022684626e-06 tokens_per_second_per_gpu:3655.3842931955455
|
488 |
+
Step 488 | loss:1.0032799243927002 lr:2.1186955721675145e-06 tokens_per_second_per_gpu:3707.926602365104
|
489 |
+
Step 489 | loss:1.0453135967254639 lr:2.092956144323881e-06 tokens_per_second_per_gpu:2869.137361469829
|
490 |
+
Step 490 | loss:1.2181352376937866 lr:2.067348646772079e-06 tokens_per_second_per_gpu:3293.3080266736984
|
491 |
+
Step 491 | loss:1.000616192817688 lr:2.041873704327583e-06 tokens_per_second_per_gpu:3715.517462012418
|
492 |
+
Step 492 | loss:1.0542888641357422 lr:2.0165319385715436e-06 tokens_per_second_per_gpu:3830.4390289474027
|
493 |
+
Step 493 | loss:1.0258901119232178 lr:1.991323967835658e-06 tokens_per_second_per_gpu:3028.369221973012
|
494 |
+
Step 494 | loss:0.9300064444541931 lr:1.966250407187045e-06 tokens_per_second_per_gpu:3930.731519264302
|
495 |
+
Step 495 | loss:1.0918774604797363 lr:1.9413118684132744e-06 tokens_per_second_per_gpu:3462.5265384350296
|
496 |
+
Step 496 | loss:1.0639344453811646 lr:1.916508960007404e-06 tokens_per_second_per_gpu:3684.844309448488
|
497 |
+
Step 497 | loss:1.0016651153564453 lr:1.8918422871531677e-06 tokens_per_second_per_gpu:3102.599497609656
|
498 |
+
Step 498 | loss:1.0390342473983765 lr:1.8673124517101783e-06 tokens_per_second_per_gpu:3670.164801395993
|
499 |
+
Step 499 | loss:1.210891842842102 lr:1.842920052199263e-06 tokens_per_second_per_gpu:2854.611146514477
|
500 |
+
Step 500 | loss:1.1124374866485596 lr:1.818665683787849e-06 tokens_per_second_per_gpu:3270.6689217420526
|
501 |
+
Step 501 | loss:0.9718752503395081 lr:1.794549938275447e-06 tokens_per_second_per_gpu:4170.2565142940675
|
502 |
+
Step 502 | loss:1.1475387811660767 lr:1.7705734040792066e-06 tokens_per_second_per_gpu:3874.4289722672997
|
503 |
+
Step 503 | loss:0.9939885139465332 lr:1.7467366662195653e-06 tokens_per_second_per_gpu:4018.9157193209103
|
504 |
+
Step 504 | loss:1.060147762298584 lr:1.7230403063059688e-06 tokens_per_second_per_gpu:3884.5121540563578
|
505 |
+
Step 505 | loss:1.153907060623169 lr:1.6994849025226819e-06 tokens_per_second_per_gpu:3476.133514653969
|
506 |
+
Step 506 | loss:0.9286587238311768 lr:1.6760710296146827e-06 tokens_per_second_per_gpu:3320.296836386092
|
507 |
+
Step 507 | loss:1.1152493953704834 lr:1.6527992588736346e-06 tokens_per_second_per_gpu:2898.2952105804293
|
508 |
+
Step 508 | loss:1.1163067817687988 lr:1.6296701581239514e-06 tokens_per_second_per_gpu:3230.075370963527
|
509 |
+
Step 509 | loss:1.0790047645568848 lr:1.6066842917089406e-06 tokens_per_second_per_gpu:3259.4500583104896
|
510 |
+
Step 510 | loss:0.9519661068916321 lr:1.5838422204770304e-06 tokens_per_second_per_gpu:3670.231481091447
|
511 |
+
Step 511 | loss:0.992994487285614 lr:1.5611445017680991e-06 tokens_per_second_per_gpu:3799.1403263844904
|
512 |
+
Step 512 | loss:1.111997127532959 lr:1.5385916893998497e-06 tokens_per_second_per_gpu:3600.044046651026
|
513 |
+
Step 513 | loss:0.9813446998596191 lr:1.5161843336543285e-06 tokens_per_second_per_gpu:3486.542256282655
|
514 |
+
Step 514 | loss:1.058476209640503 lr:1.4939229812644679e-06 tokens_per_second_per_gpu:3923.009117188916
|
515 |
+
Step 515 | loss:0.9895375967025757 lr:1.4718081754007753e-06 tokens_per_second_per_gpu:3754.8223465225924
|
516 |
+
Step 516 | loss:1.079698085784912 lr:1.4498404556580525e-06 tokens_per_second_per_gpu:3386.562332279495
|
517 |
+
Step 517 | loss:1.0072911977767944 lr:1.4280203580422537e-06 tokens_per_second_per_gpu:3733.3749612885863
|
518 |
+
Step 518 | loss:1.0473989248275757 lr:1.4063484149573902e-06 tokens_per_second_per_gpu:3459.389528812242
|
519 |
+
Step 519 | loss:0.9953117370605469 lr:1.3848251551925475e-06 tokens_per_second_per_gpu:3253.9885230172995
|
520 |
+
Step 520 | loss:1.0296379327774048 lr:1.3634511039089819e-06 tokens_per_second_per_gpu:3249.8868818068645
|
521 |
+
Step 521 | loss:1.0001991987228394 lr:1.3422267826273052e-06 tokens_per_second_per_gpu:3295.052521167191
|
522 |
+
Step 522 | loss:1.0914515256881714 lr:1.321152709214763e-06 tokens_per_second_per_gpu:3418.445922794606
|
523 |
+
Step 523 | loss:1.0070934295654297 lr:1.3002293978725935e-06 tokens_per_second_per_gpu:3468.4665497148526
|
524 |
+
Step 524 | loss:0.991290807723999 lr:1.2794573591234868e-06 tokens_per_second_per_gpu:3655.7877168779905
|
525 |
+
Step 525 | loss:0.9848833680152893 lr:1.2588370997991262e-06 tokens_per_second_per_gpu:3221.000627253079
|
526 |
+
Step 526 | loss:1.0280349254608154 lr:1.2383691230278197e-06 tokens_per_second_per_gpu:3589.7060395621884
|
527 |
+
Step 527 | loss:1.0546156167984009 lr:1.2180539282222252e-06 tokens_per_second_per_gpu:3325.8819649807083
|
528 |
+
Step 528 | loss:0.9586650729179382 lr:1.1978920110671688e-06 tokens_per_second_per_gpu:3187.053627556894
|
529 |
+
Step 529 | loss:1.0085161924362183 lr:1.1778838635075415e-06 tokens_per_second_per_gpu:3512.8972208771875
|
530 |
+
Step 530 | loss:0.9839969873428345 lr:1.1580299737363037e-06 tokens_per_second_per_gpu:3345.8963367573137
|
531 |
+
Step 531 | loss:1.0338804721832275 lr:1.1383308261825748e-06 tokens_per_second_per_gpu:3539.808114559959
|
532 |
+
Step 532 | loss:1.1831824779510498 lr:1.1187869014997992e-06 tokens_per_second_per_gpu:3428.417581753952
|
533 |
+
Step 533 | loss:1.0245225429534912 lr:1.0993986765540403e-06 tokens_per_second_per_gpu:3259.282719856141
|
534 |
+
Step 534 | loss:1.022165060043335 lr:1.080166624412322e-06 tokens_per_second_per_gpu:3233.591194250682
|
535 |
+
Step 535 | loss:1.1499828100204468 lr:1.0610912143311096e-06 tokens_per_second_per_gpu:3336.2483024753146
|
536 |
+
Step 536 | loss:1.1073501110076904 lr:1.0421729117448334e-06 tokens_per_second_per_gpu:3675.8114899776433
|
537 |
+
Step 537 | loss:1.002111554145813 lr:1.0234121782545621e-06 tokens_per_second_per_gpu:3456.2249516746747
|
538 |
+
Step 538 | loss:0.988724946975708 lr:1.0048094716167097e-06 tokens_per_second_per_gpu:3571.358149202487
|
539 |
+
Step 539 | loss:1.0480667352676392 lr:9.863652457318939e-07 tokens_per_second_per_gpu:3861.236977577604
|
540 |
+
Step 540 | loss:1.0037941932678223 lr:9.68079950633841e-07 tokens_per_second_per_gpu:3795.781462671837
|
541 |
+
Step 541 | loss:1.046254277229309 lr:9.499540324784137e-07 tokens_per_second_per_gpu:3805.412799974102
|
542 |
+
Step 542 | loss:1.0471158027648926 lr:9.319879335327244e-07 tokens_per_second_per_gpu:3803.6177569269944
|
543 |
+
Step 543 | loss:1.0406914949417114 lr:9.141820921643429e-07 tokens_per_second_per_gpu:3697.1148852757697
|
544 |
+
Step 544 | loss:1.0133854150772095 lr:8.965369428306025e-07 tokens_per_second_per_gpu:2798.1724809223538
|
545 |
+
Step 545 | loss:0.9686719179153442 lr:8.790529160679947e-07 tokens_per_second_per_gpu:3461.1321675655404
|
546 |
+
Step 546 | loss:0.9938296675682068 lr:8.617304384816716e-07 tokens_per_second_per_gpu:3745.8216708473556
|
547 |
+
Step 547 | loss:1.035244345664978 lr:8.445699327350281e-07 tokens_per_second_per_gpu:3470.7464874423918
|
548 |
+
Step 548 | loss:1.0502572059631348 lr:8.275718175393959e-07 tokens_per_second_per_gpu:3148.8280613056
|
549 |
+
Step 549 | loss:0.9388608336448669 lr:8.10736507643825e-07 tokens_per_second_per_gpu:3426.3978598698536
|
550 |
+
Step 550 | loss:1.0639617443084717 lr:7.940644138249606e-07 tokens_per_second_per_gpu:3617.338930211458
|
551 |
+
Step 551 | loss:1.0359141826629639 lr:7.77555942877031e-07 tokens_per_second_per_gpu:3569.7245310019453
|
552 |
+
Step 552 | loss:1.0284898281097412 lr:7.612114976019013e-07 tokens_per_second_per_gpu:4029.1180368589817
|
553 |
+
Step 553 | loss:1.0912672281265259 lr:7.450314767992725e-07 tokens_per_second_per_gpu:3319.4160159719727
|
554 |
+
Step 554 | loss:1.0639758110046387 lr:7.290162752569235e-07 tokens_per_second_per_gpu:3216.1711606947674
|
555 |
+
Step 555 | loss:0.9887262582778931 lr:7.131662837411021e-07 tokens_per_second_per_gpu:3598.567838296343
|
556 |
+
Step 556 | loss:1.119757890701294 lr:6.974818889869706e-07 tokens_per_second_per_gpu:3828.479312234801
|
557 |
+
Step 557 | loss:0.9670307636260986 lr:6.819634736891881e-07 tokens_per_second_per_gpu:3205.8416970935805
|
558 |
+
Step 558 | loss:0.8852719664573669 lr:6.666114164925519e-07 tokens_per_second_per_gpu:3146.806660385405
|
559 |
+
Step 559 | loss:0.9563117623329163 lr:6.51426091982782e-07 tokens_per_second_per_gpu:3621.68107570963
|
560 |
+
Step 560 | loss:1.1427892446517944 lr:6.364078706773616e-07 tokens_per_second_per_gpu:2813.4747436797943
|
561 |
+
Step 561 | loss:1.0382424592971802 lr:6.215571190165073e-07 tokens_per_second_per_gpu:3323.0282890047843
|
562 |
+
Step 562 | loss:1.0987064838409424 lr:6.068741993542251e-07 tokens_per_second_per_gpu:3476.205922040534
|
563 |
+
Step 563 | loss:1.0931929349899292 lr:5.923594699494683e-07 tokens_per_second_per_gpu:3470.4852308972336
|
564 |
+
Step 564 | loss:0.9598541259765625 lr:5.780132849573988e-07 tokens_per_second_per_gpu:3137.686342427438
|
565 |
+
Step 565 | loss:0.9403815865516663 lr:5.638359944207421e-07 tokens_per_second_per_gpu:3236.405721258059
|
566 |
+
Step 566 | loss:0.9363535046577454 lr:5.498279442612497e-07 tokens_per_second_per_gpu:3329.627606082326
|
567 |
+
Step 567 | loss:1.1313518285751343 lr:5.359894762712558e-07 tokens_per_second_per_gpu:3055.2431705401013
|
568 |
+
Step 568 | loss:1.015571117401123 lr:5.223209281053415e-07 tokens_per_second_per_gpu:3085.9648675924795
|
569 |
+
Step 569 | loss:1.0932440757751465 lr:5.088226332720916e-07 tokens_per_second_per_gpu:3736.1741995184375
|
570 |
+
Step 570 | loss:1.132112979888916 lr:4.954949211259599e-07 tokens_per_second_per_gpu:3456.21209578924
|
571 |
+
Step 571 | loss:1.0685855150222778 lr:4.823381168592328e-07 tokens_per_second_per_gpu:3930.7959672235484
|
572 |
+
Step 572 | loss:0.9238698482513428 lr:4.693525414940933e-07 tokens_per_second_per_gpu:3142.089445727256
|
573 |
+
Step 573 | loss:0.9899613857269287 lr:4.565385118747922e-07 tokens_per_second_per_gpu:3651.4091222282746
|
574 |
+
Step 574 | loss:1.131011962890625 lr:4.4389634065990866e-07 tokens_per_second_per_gpu:3602.03410991543
|
575 |
+
Step 575 | loss:1.0533106327056885 lr:4.31426336314735e-07 tokens_per_second_per_gpu:3126.586366047976
|
576 |
+
Step 576 | loss:0.990755558013916 lr:4.191288031037316e-07 tokens_per_second_per_gpu:2938.069229206343
|
577 |
+
Step 577 | loss:1.085005521774292 lr:4.0700404108312493e-07 tokens_per_second_per_gpu:3634.120014496565
|
578 |
+
Step 578 | loss:1.0190842151641846 lr:3.9505234609356455e-07 tokens_per_second_per_gpu:3508.693290240972
|
579 |
+
Step 579 | loss:1.1383253335952759 lr:3.832740097529236e-07 tokens_per_second_per_gpu:3784.3207549193335
|
580 |
+
Step 580 | loss:1.0955651998519897 lr:3.7166931944916713e-07 tokens_per_second_per_gpu:3443.295664825699
|
581 |
+
Step 581 | loss:1.020769715309143 lr:3.602385583333537e-07 tokens_per_second_per_gpu:3670.778095752044
|
582 |
+
Step 582 | loss:0.9577903747558594 lr:3.4898200531271796e-07 tokens_per_second_per_gpu:3453.753723845044
|
583 |
+
Step 583 | loss:1.0107300281524658 lr:3.3789993504386747e-07 tokens_per_second_per_gpu:3322.5018350298633
|
584 |
+
Step 584 | loss:1.1570929288864136 lr:3.2699261792608183e-07 tokens_per_second_per_gpu:3546.864382690297
|
585 |
+
Step 585 | loss:0.9978330135345459 lr:3.162603200947156e-07 tokens_per_second_per_gpu:3258.2812026952033
|
586 |
+
Step 586 | loss:1.0092127323150635 lr:3.057033034147028e-07 tokens_per_second_per_gpu:3262.423307313303
|
587 |
+
Step 587 | loss:0.9077081680297852 lr:2.953218254741699e-07 tokens_per_second_per_gpu:3697.3960382773657
|
588 |
+
Step 588 | loss:1.018717885017395 lr:2.8511613957814655e-07 tokens_per_second_per_gpu:3288.5442807398276
|
589 |
+
Step 589 | loss:1.0892322063446045 lr:2.7508649474239145e-07 tokens_per_second_per_gpu:3393.0489554763767
|
590 |
+
Step 590 | loss:1.0539475679397583 lr:2.6523313568731026e-07 tokens_per_second_per_gpu:3643.0417749516805
|
591 |
+
Step 591 | loss:1.033966064453125 lr:2.555563028319885e-07 tokens_per_second_per_gpu:3534.4036121090435
|
592 |
+
Step 592 | loss:1.0088396072387695 lr:2.46056232288322e-07 tokens_per_second_per_gpu:3681.445022735837
|
593 |
+
Step 593 | loss:1.0584028959274292 lr:2.3673315585526072e-07 tokens_per_second_per_gpu:3240.2921371564175
|
594 |
+
Step 594 | loss:1.0147311687469482 lr:2.2758730101314684e-07 tokens_per_second_per_gpu:3359.430877297497
|
595 |
+
Step 595 | loss:1.1457427740097046 lr:2.1861889091817133e-07 tokens_per_second_per_gpu:3525.4677911202753
|
596 |
+
Step 596 | loss:1.1605221033096313 lr:2.0982814439691939e-07 tokens_per_second_per_gpu:3453.302876809475
|
597 |
+
Step 597 | loss:1.0375539064407349 lr:2.0121527594104295e-07 tokens_per_second_per_gpu:3577.2309087051176
|
598 |
+
Step 598 | loss:1.0658079385757446 lr:1.9278049570201654e-07 tokens_per_second_per_gpu:3381.451046136403
|
599 |
+
Step 599 | loss:1.0220245122909546 lr:1.8452400948601816e-07 tokens_per_second_per_gpu:3823.4144363605037
|
600 |
+
Step 600 | loss:0.9951660633087158 lr:1.7644601874889894e-07 tokens_per_second_per_gpu:3713.609461806826
|
601 |
+
Step 601 | loss:1.119286060333252 lr:1.6854672059127635e-07 tokens_per_second_per_gpu:3864.777481244428
|
602 |
+
Step 602 | loss:0.9337589740753174 lr:1.6082630775371976e-07 tokens_per_second_per_gpu:3744.03231292111
|
603 |
+
Step 603 | loss:1.072335958480835 lr:1.532849686120491e-07 tokens_per_second_per_gpu:3751.680025273069
|
604 |
+
Step 604 | loss:1.048313021659851 lr:1.459228871727386e-07 tokens_per_second_per_gpu:3755.7832090988786
|
605 |
+
Step 605 | loss:1.1349972486495972 lr:1.3874024306842453e-07 tokens_per_second_per_gpu:3677.1697050059897
|
606 |
+
Step 606 | loss:1.0434086322784424 lr:1.3173721155352868e-07 tokens_per_second_per_gpu:3393.0209085636616
|
607 |
+
Step 607 | loss:1.1098034381866455 lr:1.249139634999752e-07 tokens_per_second_per_gpu:3051.599689491925
|
608 |
+
Step 608 | loss:1.0198705196380615 lr:1.1827066539302378e-07 tokens_per_second_per_gpu:3692.11290536148
|
609 |
+
Step 609 | loss:1.0850074291229248 lr:1.1180747932721142e-07 tokens_per_second_per_gpu:3417.4471054705496
|
610 |
+
Step 610 | loss:1.187023639678955 lr:1.055245630023896e-07 tokens_per_second_per_gpu:3640.1749871015163
|
611 |
+
Step 611 | loss:1.0894136428833008 lr:9.942206971988416e-08 tokens_per_second_per_gpu:3595.7152958928796
|
612 |
+
Step 612 | loss:1.0794929265975952 lr:9.350014837874899e-08 tokens_per_second_per_gpu:3323.2475499204224
|
613 |
+
Step 613 | loss:0.9975882768630981 lr:8.775894347213654e-08 tokens_per_second_per_gpu:3418.980031398386
|
614 |
+
Step 614 | loss:1.220999836921692 lr:8.219859508376975e-08 tokens_per_second_per_gpu:3446.3651662681623
|
615 |
+
Step 615 | loss:1.0748283863067627 lr:7.681923888452902e-08 tokens_per_second_per_gpu:3373.2525132787046
|
616 |
+
Step 616 | loss:1.1435097455978394 lr:7.162100612913308e-08 tokens_per_second_per_gpu:3363.609081700875
|
617 |
+
Step 617 | loss:1.0882154703140259 lr:6.660402365294499e-08 tokens_per_second_per_gpu:3023.7098014074804
|
618 |
+
Step 618 | loss:0.9880639314651489 lr:6.176841386887205e-08 tokens_per_second_per_gpu:3988.401290174963
|
619 |
+
Step 619 | loss:1.1591405868530273 lr:5.71142947643824e-08 tokens_per_second_per_gpu:3171.9719503194547
|
620 |
+
Step 620 | loss:1.0383414030075073 lr:5.264177989862312e-08 tokens_per_second_per_gpu:3086.0139727006417
|
621 |
+
Step 621 | loss:1.1537160873413086 lr:4.8350978399650804e-08 tokens_per_second_per_gpu:3369.979075784975
|
622 |
+
Step 622 | loss:1.0627126693725586 lr:4.424199496177117e-08 tokens_per_second_per_gpu:3436.2757606077344
|
623 |
+
Step 623 | loss:1.0991474390029907 lr:4.0314929842979466e-08 tokens_per_second_per_gpu:3929.0458952761765
|
624 |
+
Step 624 | loss:1.0756220817565918 lr:3.6569878862519055e-08 tokens_per_second_per_gpu:3385.603493456278
|
625 |
+
Step 625 | loss:1.0126550197601318 lr:3.300693339854083e-08 tokens_per_second_per_gpu:3796.7319365547914
|
626 |
+
Step 626 | loss:1.0999494791030884 lr:2.9626180385873282e-08 tokens_per_second_per_gpu:3491.292618600503
|
627 |
+
Step 627 | loss:1.0032886266708374 lr:2.64277023139034e-08 tokens_per_second_per_gpu:3434.2746828936133
|
628 |
+
Step 628 | loss:0.9731086492538452 lr:2.3411577224563273e-08 tokens_per_second_per_gpu:3992.9430945768536
|
629 |
+
Step 629 | loss:1.029701590538025 lr:2.0577878710424104e-08 tokens_per_second_per_gpu:3954.800918281961
|
630 |
+
Step 630 | loss:0.9682028889656067 lr:1.7926675912902644e-08 tokens_per_second_per_gpu:3079.0321478215965
|
631 |
+
Step 631 | loss:1.2128889560699463 lr:1.54580335205734e-08 tokens_per_second_per_gpu:2766.3162516442967
|
632 |
+
Step 632 | loss:1.0580332279205322 lr:1.3172011767589865e-08 tokens_per_second_per_gpu:3414.5038307314135
|
633 |
+
Step 633 | loss:0.9746403098106384 lr:1.1068666432215713e-08 tokens_per_second_per_gpu:3261.521524950618
|
634 |
+
Step 634 | loss:0.9512935280799866 lr:9.148048835462552e-09 tokens_per_second_per_gpu:3716.130203693814
|
635 |
+
Step 635 | loss:0.9694274663925171 lr:7.410205839840922e-09 tokens_per_second_per_gpu:3638.283430143105
|
636 |
+
Step 636 | loss:0.9943816065788269 lr:5.8551798482112116e-09 tokens_per_second_per_gpu:3984.6815410831828
|
637 |
+
Step 637 | loss:1.019755244255066 lr:4.483008802754485e-09 tokens_per_second_per_gpu:3426.6431524886534
|
638 |
+
Step 638 | loss:0.9445756673812866 lr:3.2937261840423894e-09 tokens_per_second_per_gpu:3651.1875501336294
|
639 |
+
Step 639 | loss:1.0361696481704712 lr:2.2873610102253083e-09 tokens_per_second_per_gpu:2899.609547551191
|
640 |
+
Step 640 | loss:1.0265421867370605 lr:1.4639378363187629e-09 tokens_per_second_per_gpu:3146.508069944556
|
641 |
+
Step 641 | loss:0.9879869222640991 lr:8.234767536080545e-10 tokens_per_second_per_gpu:3718.1335290345437
|
642 |
+
Step 642 | loss:1.0564134120941162 lr:3.659933891569933e-10 tokens_per_second_per_gpu:3539.4915099356376
|
643 |
+
Step 643 | loss:1.1678091287612915 lr:9.149890542570205e-11 tokens_per_second_per_gpu:2749.888827785128
|
644 |
+
Step 644 | loss:0.9817880392074585 lr:0.0 tokens_per_second_per_gpu:3425.3032445871813
|
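The log is plain text with one `Step N | key:value ...` record per line, ending at step 644 where the learning rate reaches 0.0. As a convenience, here is a minimal, hypothetical Python sketch (the regex and the helper name `parse_loss_log` are inferred from the line format above, not part of the upload) that turns the log into numeric tuples:

```python
import re

# Pattern inferred from lines like:
# "Step 423 | loss:1.1110... lr:4.0423...e-06 tokens_per_second_per_gpu:3779.00..."
LINE_RE = re.compile(
    r"Step (\d+) \| loss:([\d.]+) lr:([\d.eE+-]+) "
    r"tokens_per_second_per_gpu:([\d.]+)"
)

def parse_loss_log(path="loss_log.txt"):
    """Yield (step, loss, lr, tokens_per_second_per_gpu) tuples."""
    with open(path) as f:
        for line in f:
            m = LINE_RE.search(line)
            if m:
                step, loss, lr, tps = m.groups()
                yield int(step), float(loss), float(lr), float(tps)

# Example: average loss over the last 50 logged steps.
records = list(parse_loss_log())
tail = records[-50:]
print(sum(r[1] for r in tail) / len(tail))
```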
model.safetensors.index.json
ADDED
@@ -0,0 +1,298 @@
{
  "metadata": {
    "total_size": 16068960256
  },
  "weight_map": {
    "lm_head.weight": "model-00004-of-00004.safetensors",
    "model.embed_tokens.weight": "model-00001-of-00004.safetensors",
    "model.layers.0.input_layernorm.weight": "model-00001-of-00004.safetensors",
    "model.layers.0.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.0.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.0.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.0.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
    "model.layers.0.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.0.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.0.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.0.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.1.input_layernorm.weight": "model-00001-of-00004.safetensors",
    "model.layers.1.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.1.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.1.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.1.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
    "model.layers.1.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.1.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.1.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.1.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.10.input_layernorm.weight": "model-00002-of-00004.safetensors",
    "model.layers.10.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.10.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.10.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.10.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
    "model.layers.10.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.10.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.10.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.10.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.11.input_layernorm.weight": "model-00002-of-00004.safetensors",
    "model.layers.11.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.11.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.11.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.11.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
    "model.layers.11.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.11.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.11.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.11.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.12.input_layernorm.weight": "model-00002-of-00004.safetensors",
    "model.layers.12.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.12.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.12.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.12.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
    "model.layers.12.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.12.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.12.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.12.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.13.input_layernorm.weight": "model-00002-of-00004.safetensors",
    "model.layers.13.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.13.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.13.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.13.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
    "model.layers.13.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.13.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.13.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.13.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.14.input_layernorm.weight": "model-00002-of-00004.safetensors",
    "model.layers.14.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.14.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.14.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.14.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
    "model.layers.14.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.14.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.14.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.14.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.15.input_layernorm.weight": "model-00002-of-00004.safetensors",
    "model.layers.15.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.15.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.15.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.15.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
    "model.layers.15.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.15.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.15.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.15.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.16.input_layernorm.weight": "model-00002-of-00004.safetensors",
    "model.layers.16.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.16.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.16.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.16.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
    "model.layers.16.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.16.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.16.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.16.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.17.input_layernorm.weight": "model-00002-of-00004.safetensors",
    "model.layers.17.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.17.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.17.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.17.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
    "model.layers.17.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.17.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.17.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.17.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.18.input_layernorm.weight": "model-00002-of-00004.safetensors",
    "model.layers.18.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.18.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.18.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.18.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
    "model.layers.18.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.18.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.18.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.18.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.19.input_layernorm.weight": "model-00002-of-00004.safetensors",
    "model.layers.19.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.19.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.19.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.19.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
    "model.layers.19.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.19.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.19.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.19.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.2.input_layernorm.weight": "model-00001-of-00004.safetensors",
    "model.layers.2.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.2.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.2.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.2.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
    "model.layers.2.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.2.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.2.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.2.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.20.input_layernorm.weight": "model-00003-of-00004.safetensors",
    "model.layers.20.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.20.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.20.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.20.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
    "model.layers.20.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.20.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.20.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.20.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.21.input_layernorm.weight": "model-00003-of-00004.safetensors",
    "model.layers.21.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.21.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.21.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.21.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
    "model.layers.21.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.21.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.21.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.21.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.22.input_layernorm.weight": "model-00003-of-00004.safetensors",
    "model.layers.22.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.22.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.22.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.22.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
    "model.layers.22.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.22.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.22.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.22.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.23.input_layernorm.weight": "model-00003-of-00004.safetensors",
    "model.layers.23.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.23.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.23.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.23.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
    "model.layers.23.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.23.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.23.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.23.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.24.input_layernorm.weight": "model-00003-of-00004.safetensors",
    "model.layers.24.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.24.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.24.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.24.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
    "model.layers.24.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.24.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.24.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.24.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.25.input_layernorm.weight": "model-00003-of-00004.safetensors",
    "model.layers.25.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.25.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.25.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.25.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
    "model.layers.25.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.25.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.25.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.25.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.26.input_layernorm.weight": "model-00003-of-00004.safetensors",
    "model.layers.26.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.26.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.26.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.26.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
    "model.layers.26.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.26.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.26.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.26.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.27.input_layernorm.weight": "model-00003-of-00004.safetensors",
    "model.layers.27.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.27.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.27.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.27.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
    "model.layers.27.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.27.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.27.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.27.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.28.input_layernorm.weight": "model-00003-of-00004.safetensors",
    "model.layers.28.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.28.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.28.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.28.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
    "model.layers.28.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.28.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.28.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.28.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.29.input_layernorm.weight": "model-00003-of-00004.safetensors",
    "model.layers.29.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.29.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.29.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.29.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
    "model.layers.29.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.29.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.29.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.29.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.3.input_layernorm.weight": "model-00001-of-00004.safetensors",
    "model.layers.3.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.3.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.3.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.3.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
    "model.layers.3.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.3.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.3.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.3.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.30.input_layernorm.weight": "model-00003-of-00004.safetensors",
    "model.layers.30.mlp.down_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.30.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.30.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.30.post_attention_layernorm.weight": "model-00003-of-00004.safetensors",
    "model.layers.30.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.30.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.30.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.30.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.31.input_layernorm.weight": "model-00004-of-00004.safetensors",
    "model.layers.31.mlp.down_proj.weight": "model-00004-of-00004.safetensors",
    "model.layers.31.mlp.gate_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.31.mlp.up_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.31.post_attention_layernorm.weight": "model-00004-of-00004.safetensors",
    "model.layers.31.self_attn.k_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.31.self_attn.o_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.31.self_attn.q_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.31.self_attn.v_proj.weight": "model-00003-of-00004.safetensors",
    "model.layers.4.input_layernorm.weight": "model-00001-of-00004.safetensors",
    "model.layers.4.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.4.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.4.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.4.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
    "model.layers.4.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.4.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.4.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.4.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.5.input_layernorm.weight": "model-00001-of-00004.safetensors",
    "model.layers.5.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.5.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.5.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.5.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
    "model.layers.5.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.5.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.5.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.5.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.6.input_layernorm.weight": "model-00001-of-00004.safetensors",
    "model.layers.6.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.6.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.6.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.6.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
    "model.layers.6.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.6.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.6.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.6.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.7.input_layernorm.weight": "model-00001-of-00004.safetensors",
    "model.layers.7.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.7.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.7.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.7.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
    "model.layers.7.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.7.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.7.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.7.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.8.input_layernorm.weight": "model-00001-of-00004.safetensors",
    "model.layers.8.mlp.down_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.8.mlp.gate_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.8.mlp.up_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.8.post_attention_layernorm.weight": "model-00001-of-00004.safetensors",
    "model.layers.8.self_attn.k_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.8.self_attn.o_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.8.self_attn.q_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.8.self_attn.v_proj.weight": "model-00001-of-00004.safetensors",
    "model.layers.9.input_layernorm.weight": "model-00002-of-00004.safetensors",
    "model.layers.9.mlp.down_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.9.mlp.gate_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.9.mlp.up_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.9.post_attention_layernorm.weight": "model-00002-of-00004.safetensors",
    "model.layers.9.self_attn.k_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.9.self_attn.o_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.9.self_attn.q_proj.weight": "model-00002-of-00004.safetensors",
    "model.layers.9.self_attn.v_proj.weight": "model-00002-of-00004.safetensors",
    "model.norm.weight": "model-00004-of-00004.safetensors"
  }
}
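The index maps every tensor name to the shard that stores it, and total_size records the byte count across all four shards. A minimal sketch of how a loader can use this map to fetch a single tensor without opening the other shards; safetensors' safe_open is a real API, while the local paths and the helper name load_tensor are assumptions:

```python
import json
from safetensors import safe_open  # pip install safetensors

with open("model.safetensors.index.json") as f:
    index = json.load(f)

weight_map = index["weight_map"]

def load_tensor(name, framework="pt"):
    """Open only the shard that holds `name`, as recorded in weight_map."""
    shard = weight_map[name]  # e.g. "model-00004-of-00004.safetensors"
    with safe_open(shard, framework=framework) as f:
        return f.get_tensor(name)

# lm_head lives in shard 4 and the embeddings in shard 1, per the map above.
lm_head = load_tensor("lm_head.weight")
print(lm_head.shape)  # (vocab_size, hidden_size)
```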
output.safetensors
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:3fd5b29e86ced8a007dbaa7008b11de967a8bf12ce60522973f4fe7e291bde71
size 8429513848
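output.safetensors is stored as a Git LFS pointer: the repository tracks only the spec version, the sha256 object id, and the byte size, while the ~8.4 GB payload lives in LFS storage. A small sketch, assuming you have already downloaded the real file under the same name, that checks the payload against the pointer's oid and size:

```python
import hashlib
import os

EXPECTED_OID = "3fd5b29e86ced8a007dbaa7008b11de967a8bf12ce60522973f4fe7e291bde71"
EXPECTED_SIZE = 8429513848

def verify(path="output.safetensors"):
    assert os.path.getsize(path) == EXPECTED_SIZE, "size mismatch"
    h = hashlib.sha256()
    with open(path, "rb") as f:
        # Hash in 1 MiB chunks so the large file never sits in memory at once.
        for chunk in iter(lambda: f.read(1 << 20), b""):
            h.update(chunk)
    assert h.hexdigest() == EXPECTED_OID, "sha256 mismatch"
    print("output.safetensors matches the LFS pointer")

verify()
```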
special_tokens_map.json
ADDED
@@ -0,0 +1,16 @@
{
  "bos_token": {
    "content": "<|begin_of_text|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  },
  "eos_token": {
    "content": "<|eot_id|>",
    "lstrip": false,
    "normalized": false,
    "rstrip": false,
    "single_word": false
  }
}
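These entries tell the tokenizer to treat <|begin_of_text|> and <|eot_id|> as the BOS and EOS strings, with no stripping or normalization applied. A quick sketch, assuming the upload's tokenizer files sit in the current directory (AutoTokenizer.from_pretrained is the standard transformers entry point):

```python
from transformers import AutoTokenizer

# Reads tokenizer.json, tokenizer_config.json, and special_tokens_map.json.
tok = AutoTokenizer.from_pretrained(".")
print(tok.bos_token, tok.bos_token_id)  # <|begin_of_text|>
print(tok.eos_token, tok.eos_token_id)  # <|eot_id|>
```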
tokenizer.json
ADDED
The diff for this file is too large to render; see the raw diff.
tokenizer_config.json
ADDED
The diff for this file is too large to render; see the raw diff.
training_config_phase3.yaml
ADDED
@@ -0,0 +1,93 @@
# Config for multi-device full finetuning in full_finetune_distributed.py
# using a Llama3 8B Instruct model
#
# This config assumes that you've run the following command before launching
# this run:
#   tune download meta-llama/Meta-Llama-3-8B-Instruct --output-dir /tmp/Meta-Llama-3-8B-Instruct --hf-token <HF_TOKEN>
#
# To launch on 4 devices, run the following command from root:
#   tune run --nproc_per_node 4 full_finetune_distributed --config llama3/8B_full
#
# You can add specific overrides through the command line. For example
# to override the checkpointer directory while launching training
# you can run:
#   tune run --nproc_per_node 4 full_finetune_distributed --config llama3/8B_full checkpointer.checkpoint_dir=<YOUR_CHECKPOINT_DIR>
#
# This config works best when the model is being fine-tuned on 2+ GPUs.
# Single device full finetuning requires more memory optimizations. It's
# best to use 8B_full_single_device.yaml for those cases.

# Tokenizer
tokenizer:
  _component_: torchtune.models.llama3.llama3_s_tokenizer
  path: ../model_zoo/tokenizer.model
  max_seq_len: 4096

# Dataset
dataset:
  _component_: torchtune.datasets.chat_dataset
  source: jan-hq/mixed-instruction-speech-multiturn-noise-clean
  conversation_style: openai
  max_seq_len: 4096
  split: train
  train_on_input: True

seed: 42
shuffle: False

# Model Arguments
model:
  _component_: torchtune.models.llama3_1.llama3_1_s_8b
  # path: model_zoo/Llama3.1_s_8b_init

checkpointer:
  _component_: torchtune.utils.FullModelHFCheckpointerSaveSteps
  checkpoint_dir: ../model_zoo/llama3.1-s-cp-7000
  checkpoint_files: [
    model-00001-of-00004.safetensors,
    model-00002-of-00004.safetensors,
    model-00003-of-00004.safetensors,
    model-00004-of-00004.safetensors,
  ]
  recipe_checkpoint: null
  output_dir: ../model_zoo/llama3-s-instruct-lr-3e-5
  model_type: LLAMA3
resume_from_checkpoint: False
save_every_n_steps: 200
max_checkpoints: 3

# Fine-tuning arguments
batch_size: 4
epochs: 1
max_steps_per_epoch: null
gradient_accumulation_steps: 8
compile: False

# Optimizer and Scheduler
optimizer:
  _component_: torch.optim.AdamW  # change this to use adam_mini: torchtune.modules.optimizer.Adam_mini
  weight_decay: 0.005
  lr: 1.5e-5
  fused: True
lr_scheduler:
  _component_: torchtune.modules.get_cosine_schedule_with_warmup
  num_warmup_steps: 8

loss:
  _component_: torch.nn.CrossEntropyLoss

fsdp:
  cpu_offload: False

# Training env
device: cuda
dtype: bf16

# Memory management
enable_activation_checkpointing: True
memory_efficient_fsdp_wrap: True
ac_mode: 'selective'

# Logging
metric_logger:
  _component_: torchtune.utils.metric_logging.DiskLogger
  log_dir: ${output_dir}
output_dir: ../model_zoo/Llama3-instruct-log-lr-3e-5/
log_every_n_steps: 1
log_peak_memory_stats: False
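The lr column in loss_log.txt is consistent with this scheduler config: a linear warmup over 8 steps followed by cosine decay to zero across the 644-step run. A minimal sketch of the standard linear-warmup-plus-cosine-decay formula (the exact off-by-one convention at warmup is an assumption; torchtune's get_cosine_schedule_with_warmup follows the same shape), which reproduces the logged values, e.g. exactly 3.75e-06 at step 432 and 0.0 at step 644:

```python
import math

PEAK_LR = 1.5e-5   # optimizer.lr above
WARMUP = 8         # lr_scheduler.num_warmup_steps above
TOTAL_STEPS = 644  # last step recorded in loss_log.txt

def lr_at(step):
    """Linear warmup to PEAK_LR, then cosine decay to zero."""
    if step < WARMUP:
        return PEAK_LR * step / WARMUP
    progress = (step - WARMUP) / (TOTAL_STEPS - WARMUP)
    return PEAK_LR * 0.5 * (1.0 + math.cos(math.pi * progress))

print(lr_at(432))  # 3.75e-06, matching the log at step 432
print(lr_at(644))  # 0.0, matching the final logged step
```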