env_defaults:


  # NOTE: a folded block scalar started with '>' appends a trailing newline to the string; the '>-' used here strips it (see the illustrative expansion after the block).
  SHARED_CMD_ARGS: >-
    -m src.train
    +model=base_sca_multitask_v2
    training.do_train=True
    training.do_eval=True
    training.do_inference=True
    +data.streaming=False
    training.max_eval_samples=800
    training.max_steps=100000
    training.fp16=True
    training.output_dir=$AMLT_OUTPUT_DIR
    training.output_log_dir=$AMLT_LOGS_DIR
    model.cache_dir=/mnt/blob/weights/.model.cache/
    training.save_strategy=steps
    training.save_steps=5000
    training.save_total_limit=3
    training.optim=adamw_torch
    training.evaluate_before_train=True
    training.per_device_train_batch_size=1
    training.evaluation_strategy=steps
    training.eval_steps=5000
    training.logging_steps=1000
    training.logging_first_step=True
    training.dataloader_num_workers=4
    training.num_masks_per_sample=16
    wandb.project=$AMLT_EXPERIMENT_NAME
    wandb.name=$AMLT_JOB_NAME
    model.num_caption_tokens=8
    model.additional_num_hidden_layers=12
    model.num_task_tokens=6
    training.lr_scheduler_type=cosine
    model.lm_head_model_name_or_path=gpt2-large
    training.learning_rate=1e-5
    training.weight_decay=1e-4
    training.warmup_steps=200
    training.warmup_ratio=0.33333333
    training.compute_metrics=True
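  # Illustrative only: with '>-' the block above folds into a single space-separated line, roughly
  #   -m src.train +model=base_sca_multitask_v2 training.do_train=True ... training.compute_metrics=True
  # with no trailing newline, so it can be spliced into the job commands as $SHARED_CMD_ARGS.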



environment:

  image: nvidia/pytorch:23.07-py3 # NCCL on PHLRR4076 cannot be initialized successfully
  # image: nvidia/pytorch:23.06-py3  # NCCL on PHLRR4076 cannot be initialized successfully
  # image: nvidia/pytorch:22.12-py3  # Pydantic bug (https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471): requires pip install pydantic==1.10.8; also lacks adamw_torch_fused support, which needs PyTorch 2.0 or higher
  registry: nvcr.io

code:
  local_dir: $CONFIG_DIR/../



jobs:
  - name: gpt2-large
    preemptible: True
    sku: ${NUM_NODES}xG${NUM_GPUS}-V100-IB
    process_count_per_node: 1 # Each node should run 1 process
    command:
      - . amlt_configs/setup.sh
      - source ~/.bashrc
      - . amlt_configs/setup_accelerate_on_azure.sh
      - >-
        accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml 
        $SHARED_CMD_ARGS 
        train_data='[vg-densecap-region_descriptions]' 
        eval_data='[vg-densecap-region_descriptions]' 
        model.lm_head_model_name_or_path=gpt2-large
        $EXTRA_ARGS 
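    # NOTE: $EXTRA_ARGS is expected to be supplied at submission time, e.g. via `amlt run --extra-args "..."`,
    # as in the commented example commands at the bottom of this file.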


    submit_args:
      env:
        SHARED_MEMORY_PERCENT: 0.5
        HYDRA_FULL_ERROR: 1
        # NCCL_IB_DISABLE: 1
        # NCCL_IBEXT_DISABLE: 1
      container_args:
        shm_size: 256g

  - name: open_llama_3b_v2
    preemptible: True
    sku: ${NUM_NODES}xG${NUM_GPUS}-V100-IB
    process_count_per_node: 1 # Each node should run 1 process
    command:
      - . amlt_configs/setup.sh
      - source ~/.bashrc
      - . amlt_configs/setup_accelerate_on_azure.sh
      - >-
        accelerate launch --config_file amlt_configs/accelerate_deepspeed_config.yaml 
        $SHARED_CMD_ARGS
        train_data='[vg-densecap-region_descriptions]'
        eval_data='[vg-densecap-region_descriptions]'
        model.lm_head_model_name_or_path=openlm-research/open_llama_3b_v2
        training.gradient_checkpointing=true
        $EXTRA_ARGS


    submit_args:
      env:
        SHARED_MEMORY_PERCENT: 0.5
        HYDRA_FULL_ERROR: 1
        # NCCL_IB_DISABLE: 1
        # NCCL_IBEXT_DISABLE: 1
      container_args:
        shm_size: 256g
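
# Example submission of this config (illustrative; substitute your own target/workspace, sku, file name, and job/experiment names):
# amlt run -d "" --extra-args "training.learning_rate=1e-5" \
# -t msroctovc -w msroctows --sku=G8-V100 \
# amlt_configs/<this-config>.yaml :0=`date +"%m%d%y"`.octo-1x8-v100-16g.gpt2-large <experiment-name>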


# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7301932201.25563-cd1e6021-6ea9-4835-8578-ba26f723a708/checkpoint-100000 training.max_steps=100000 training.learning_rate=1e-4" \
# -t msroctovc -w msroctows --sku=G8-V100 --no-pre \
# amlt_configs/train-sca-ablat-finetune-scale_lr-110423.yaml :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.lsj-gpt2-large-finetune-lr_1e_4 train-sca-ablat-finetune-scale_lr-110423

# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7301932201.25563-cd1e6021-6ea9-4835-8578-ba26f723a708/checkpoint-100000 training.max_steps=100000 training.learning_rate=1e-5" \
# -t msroctovc -w msroctows --sku=G8-V100 --no-pre \
# amlt_configs/train-sca-ablat-finetune-scale_lr-110423.yaml :0=`date +"%m%d%y"`.octo-1x8-v100-16g-no_pre.lsj-gpt2-large-finetune-lr_1e_5 train-sca-ablat-finetune-scale_lr-110423

# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7301932201.25563-cd1e6021-6ea9-4835-8578-ba26f723a708/checkpoint-100000 training.max_steps=100000 training.learning_rate=4e-5" \
# -t msrresrchvc -w msrresrchws --sku=4xG8-V100 --no-pre \
# amlt_configs/train-sca-ablat-finetune-scale_lr-110423.yaml :0=`date +"%m%d%y"`.resrch-4x8-v100-16g-no_pre.lsj-gpt2-large-finetune-lr_1e_5-1xlr train-sca-ablat-finetune-scale_lr-110423

# amlt run -d "" --extra-args "+data_transforms=lsj-0_1-2_0 model.model_name_or_path=/mnt/blob/projects/sca-xiaoke-v3/amlt-results/7301932201.25563-cd1e6021-6ea9-4835-8578-ba26f723a708/checkpoint-100000 training.max_steps=100000 training.learning_rate=4e-4" \
# -t msrresrchvc -w msrresrchws --sku=4xG8-V100 --no-pre \
# amlt_configs/train-sca-ablat-finetune-scale_lr-110423.yaml :0=`date +"%m%d%y"`.resrch-4x8-v100-16g-no_pre.lsj-gpt2-large-finetune-lr_1e_4-1xlr train-sca-ablat-finetune-scale_lr-110423