jrahn committed on
Commit
697e098
1 Parent(s): ccf40f5

Upload run_gpt3_125M_edu_pr711.sh with huggingface_hub

Browse files
Files changed (1) hide show
  1. run_gpt3_125M_edu_pr711.sh +46 -0
run_gpt3_125M_edu_pr711.sh ADDED
@@ -0,0 +1,46 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
# GPT-3 (125M) repro, but using FineWeb
#
# Reference recipe these settings are derived from:
#   125M parameter model on 300B tokens
#   note context length: 1024 -> 2048 for GPT-3
#   => 6 * 125e6 * 300e9 ~= 2.25e20 capability model
#   572,204 steps of 524,288 tokens/step => 300B
#   on 8X A100 80GB SXM ($14/hr) steps in ~150ms/iter
#   => training time 572,204 * 150ms ~= 24 hours ~= $336
#
# NOTE(review): as configured below, this run does not match those figures.
# It stops at max_steps=18794 (18,794 * 524,288 ~= 9.85B tokens) and starts
# 2 MPI processes, so the time/cost estimate above does not carry over.
#
# Usage: run from the directory containing the Makefile and dev/data/.
# The script rebuilds the trainer, then keeps relaunching it until the
# completion marker file appears in the output directory.
#
# Deliberately no `set -e`: a non-zero exit from mpirun must fall through to
# the next loop iteration so the run is relaunched.

# Build the trainer first. Abort on failure so the loop below never spins
# forever on a missing binary or silently trains with a stale one.
make train_gpt2cu USE_CUDNN=1 || exit 1

out_dir="log_gpt3_125M_edu_v4"

# Single source of truth for the step count: it is passed to the trainer via
# -x and also names the completion marker, so the two cannot drift apart.
max_steps=18794

# Completion marker polled by the loop: DONE_ followed by the step count
# zero-padded to 8 digits (DONE_00018794 for the value above).
done_file="$out_dir/DONE_$(printf '%08d' "$max_steps")"

# Relaunch the trainer until the marker exists. The check happens before
# every launch, so an already-finished run exits without starting mpirun.
until [ -f "$done_file" ]; do

    # The data patterns are quoted on purpose: the shell passes the literal
    # "*" through to the trainer instead of expanding it here.
    # Flag meanings are defined by train_gpt2cu itself; presumably -y 1
    # resumes from the latest checkpoint in $out_dir and -x caps the step
    # count -- verify against the trainer's usage text.
    mpirun -np 2 ./train_gpt2cu \
        -i "dev/data/edu_fineweb10B/edu_fineweb_train_*.bin" \
        -j "dev/data/edu_fineweb10B/edu_fineweb_val_*.bin" \
        -o "$out_dir" \
        -v 250 -s 1000 -g 144 \
        -h 1 \
        -b 16 -t 2048 \
        -d 524288 \
        -r 0 \
        -z 1 \
        -c 0.1 \
        -l 0.006 \
        -q 0.1 \
        -u 1000 \
        -n 500 \
        -nk 5 \
        -nm 2000 \
        -ge 1 \
        -sl 5.0 \
        -sg 5.0 \
        -y 1 \
        -x "$max_steps" \
        -e "gpt3:c768"

    # Brief pause so a trainer that fails immediately does not busy-loop.
    sleep 1
done

# exit condition is that optimization has finished
echo "File $done_file exists. Exiting the loop."