# deepspeed/amlt_configs/debug.yaml
# Commit 002bd9b ("init") by xingzhikb.
# Default values for variables substituted elsewhere in this config.
env_defaults:
  # Argument string for the training entry point. It is referenced as
  # $SHARED_CMD_ARGS by the (currently commented-out) `accelerate launch`
  # command in the job definition.
  #
  # The value is a multi-line single-quoted scalar: YAML folds every line
  # break into one space, so the result is a single space-separated string.
  # As written, it disables training and evaluation, enables inference, and
  # caps the run at 10 eval samples / 1 train sample.
  # NOTE(review): the `key=value` / `+key=value` syntax looks like Hydra-style
  # overrides consumed by `src.train` - confirm.
  SHARED_CMD_ARGS: '
    -m src.train
    train_data=[vg-densecap-region_descriptions] eval_data=[vg-densecap-region_descriptions]
    +model=base_sam_captioner
    training.do_train=False
    training.do_eval=False
    training.do_inference=True
    training.num_masks_per_sample=1
    +data.streaming=False
    training.max_eval_samples=10
    training.max_train_samples=1
    training.num_train_epochs=10
    training.fp16=True
    training.output_dir=$AMLT_OUTPUT_DIR
    training.output_log_dir=$AMLT_LOGS_DIR
    model.cache_dir=/mnt/blob/weights/.model.cache/
    training.dataloader_num_workers=4
    '
# Container image the job runs in, and the registry it is pulled from.
environment:
  image: nvidia/pytorch:23.07-py3
  registry: nvcr.io
# Local source directory for the job: the parent of the directory that
# $CONFIG_DIR expands to.
# NOTE(review): presumably $CONFIG_DIR is the directory containing this
# config file - confirm.
code:
  local_dir: $CONFIG_DIR/../
# Jobs to submit. In its current state this debug job only sources the setup
# scripts and the post-processing script; the actual `accelerate launch`
# invocation is commented out.
jobs:
  - name: sam_captioner-infer-debug
    # NUM_GPUS is not defined in this file.
    # NOTE(review): presumably supplied from the environment at submission
    # time - confirm.
    sku: G$NUM_GPUS
    preemptible: false
    process_count_per_node: 1  # Each node should run 1 process
    command:
      - . amlt_configs/setup.sh
      - source ~/.bashrc
      - . amlt_configs/setup_accelerate_on_azure.sh
      - . amlt_configs/post_process.sh
      # - accelerate launch --config_file amlt_configs/accelerate_config.yaml $SHARED_CMD_ARGS || . amlt_configs/post_process.sh
    submit_args:
      # Environment variables for the job; values are quoted so they are
      # passed as strings rather than YAML numbers.
      env:
        AZFUSE_USE_FUSE: "1"
        SHARED_MEMORY_PERCENT: "0.5"
      container_args:
        shm_size: 256g