# Default variables shared by every job in this config.
env_defaults:

  # Folded block scalar (>-): joined to a single line at parse time.
  # Hydra overrides for an inference-only run of src.train: training and
  # eval disabled, inference enabled, fp16 on, W&B logging off.
  SHARED_CMD_ARGS: >-
    -m src.train
    +model=base_sca
    training.do_train=False
    training.do_eval=False
    training.do_inference=True
    training.fp16=True
    training.output_log_dir=$AMLT_LOGS_DIR
    model.cache_dir=/mnt/blob/weights/.model.cache/
    wandb.log=False
    training.dataloader_num_workers=4




# Container image the jobs run in (NVIDIA NGC PyTorch image).
environment:

  # NOTE: this image ships a Pydantic with a known bug; a pinned version is
  # installed in the job command. See:
  # https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471
  image: nvidia/pytorch:22.12-py3  # Pydantic has bug: https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471
  registry: nvcr.io

# Code to upload: the repository root (one level above this config directory).
code:
  local_dir: $CONFIG_DIR/../



jobs:
  # Single job: run inference with the best (last-step) checkpoint, then score
  # the predictions with the eval suite. Only the dense-caption dataset is
  # active; the remaining dataset stanzas are kept commented for reference.
  - name: infer-eval_suite
    sku: G$NUM_GPUS
    preemptible: false  # canonical lowercase boolean (was `False`)
    command:
      # Environment setup: shell helpers, pinned pydantic, eval suite, accelerate.
      - . amlt_configs/setup.sh
      - source ~/.bashrc
      - pip install pydantic==1.10.8  # https://github.com/pydantic/pydantic/issues/545#issuecomment-1573776471
      - . amlt_configs/setup_eval_suite.sh
      - . amlt_configs/setup_accelerate_on_azure.sh

      # get best (or max step) model
      - BEST_CKPT_STEP=$$(python scripts/tools/get_model_name_from_trainer_state.py $$AMLT_MAP_INPUT_DIR "last")
      - BEST_CKPT_PATH=$$(find $$AMLT_MAP_INPUT_DIR -name '*checkpoint*' | grep $$BEST_CKPT_STEP | tail -n1)

      # caption
      - DATASET=vg-densecap-region_descriptions
      - >-
        accelerate launch $SHARED_CMD_ARGS
        train_data=[$$DATASET]
        eval_data=[$$DATASET]
        training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET
        model.model_name_or_path=$$BEST_CKPT_PATH
        model.lm_head_model_name_or_path=$$(python scripts/tools/get_sub_model_name_from_ckpt.py $$BEST_CKPT_PATH "lm")
        model.sam_model_name_or_path=facebook/sam-vit-huge
        $EXTRA_ARGS

      - bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference /mnt/blob/data/sca-eval_suite-data/extract_region_img_annot_caption_to_tsv-vg-densecap-region_descriptions/region_img_annot_caption/visual_genome.py-region_descriptions_v1.2.0-test.region_img.tsv

      # - DATASET=refcocog-google
      # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
      # - bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference /mnt/blob/data/sca-eval_suite-data/extract_region_img_annot_caption_to_tsv-refcocog-google/region_img_annot_caption/refcoco.py-refcocog-google-validation.region_img.tsv

      # - DATASET=refcoco-unc-split_testA
      # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
      # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference

      # - DATASET=refcoco-unc-split_testB
      # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
      # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference

      # - DATASET=refcoco+-unc-split_testA
      # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
      # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference

      # - DATASET=refcoco+-unc-split_testB
      # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
      # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference

      # concept
      # - DATASET=coco-instance
      # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
      # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference

      # OOM and very slow
      # - DATASET=objects365-local
      # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
      # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference

      # OOM and very slow
      # - DATASET=v3det-local
      # - accelerate launch $SHARED_CMD_ARGS train_data=[$$DATASET] eval_data=[$$DATASET] training.output_dir=$$AMLT_OUTPUT_DIR/$$DATASET $EXTRA_ARGS
      # - SKIP_CLIP_RECALL=1 bash scripts/tools/eval_suite.sh $$AMLT_OUTPUT_DIR/$$DATASET infer.json inference

    submit_args:
      env:
        # NOTE(review): parsed as a YAML float; quote as "0.5" if the consumer
        # expects a string value — confirm against the AMLT schema.
        SHARED_MEMORY_PERCENT: 0.5
      container_args:
        shm_size: 256g