# Snapshot metadata from the original listing: 1,316 bytes, revision 002bd9b.
env_defaults:
  # Default value for $SHARED_CMD_ARGS: the module selector (`-m src.train`)
  # followed by key=value overrides, kept as one argument string.
  #
  # Written as a folded block scalar (`>-`): the lines below are joined with
  # single spaces, and the resulting string has no leading/trailing whitespace
  # and no final newline. Nothing in this value may be commented out in place,
  # because `#` inside a block scalar is literal text.
  #
  # As configured here, do_train and do_eval are False and do_inference is
  # True, i.e. this is an inference-only argument set.
  SHARED_CMD_ARGS: >-
    -m src.train
    train_data=[vg-densecap-region_descriptions]
    eval_data=[vg-densecap-region_descriptions]
    +model=base_sam_captioner
    training.do_train=False
    training.do_eval=False
    training.do_inference=True
    training.num_masks_per_sample=1
    +data.streaming=False
    training.max_eval_samples=10
    training.max_train_samples=1
    training.num_train_epochs=10
    training.fp16=True
    training.output_dir=$AMLT_OUTPUT_DIR
    training.output_log_dir=$AMLT_LOGS_DIR
    model.cache_dir=/mnt/blob/weights/.model.cache/
    training.dataloader_num_workers=4



environment:
  # Image reference in <repository>:<tag> form, plus the registry host it is
  # served from. Both are quoted so they are always read as plain strings.
  registry: 'nvcr.io'
  image: 'nvidia/pytorch:23.07-py3'

code:
  # Local code directory: one level above $CONFIG_DIR.
  # NOTE(review): $CONFIG_DIR is presumably expanded by the submission tool to
  # the directory that holds this config file — confirm.
  local_dir: '$CONFIG_DIR/../'



jobs:
  - name: sam_captioner-infer-debug
    # NOTE(review): $NUM_GPUS has no default in the env_defaults section of
    # this file, so it presumably has to be supplied at submit time — confirm.
    sku: G$NUM_GPUS
    preemptible: false
    process_count_per_node: 1  # Each node should run 1 process
    command:
      # Each script is sourced (`.` / `source`), so it runs in the same shell
      # as the commands that follow it.
      - . amlt_configs/setup.sh
      - source ~/.bashrc
      - . amlt_configs/setup_accelerate_on_azure.sh
      - . amlt_configs/post_process.sh
      # NOTE(review): the launch command below is disabled, so this job
      # currently runs only the setup scripts followed by post-processing and
      # never consumes $SHARED_CMD_ARGS. Re-enable it in place of the
      # unconditional post_process.sh entry above to actually run inference.
      # - accelerate launch --config_file amlt_configs/accelerate_config.yaml $SHARED_CMD_ARGS || . amlt_configs/post_process.sh
    submit_args:
      env:
        # Environment-variable values are quoted so they stay strings
        # instead of being typed as int/float by the YAML parser.
        AZFUSE_USE_FUSE: "1"
        SHARED_MEMORY_PERCENT: "0.5"
      container_args:
        shm_size: 256g