# ProteinGPT-Llama3 / protein_gpt.py
# EdwardoSunny's picture
# finished
# 85ab89d
"""
Copyright (c) 2022, salesforce.com, inc.
All rights reserved.
SPDX-License-Identifier: BSD-3-Clause
For full license text, see the LICENSE_Lavis file in the repo root or https://opensource.org/licenses/BSD-3-Clause
"""
import argparse
import os
import random
import numpy as np
import torch
import torch.backends.cudnn as cudnn
import yaml
# import esm
import minigpt4.tasks as tasks
from minigpt4.esm.esm_config import Config
from minigpt4.common.dist_utils import get_rank, init_distributed_mode
from minigpt4.common.logger import setup_logger
from minigpt4.common.optims import (
LinearWarmupCosineLRScheduler,
LinearWarmupStepLRScheduler,
)
from minigpt4.common.registry import registry
from minigpt4.common.utils import now
# imports modules for registration
from minigpt4.datasets.builders import *
from minigpt4.datasets.pdb_dataset import ESMDataset
from minigpt4.datasets.qa_dataset import QADataset
from minigpt4.models import *
from minigpt4.processors import *
from minigpt4.runners import *
from minigpt4.tasks import *
def parse_args(argv=None):
    """Parse the command-line options of the training script.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse falls back to ``sys.argv[1:]``, which is
            what a plain ``parse_args()`` call does.

    Returns:
        An ``argparse.Namespace`` with two attributes:
        ``cfg_path`` (defaults to ``configs/train_modality_alignment.yaml``)
        and ``options`` (a list of strings, or ``None`` when the flag is
        not supplied).
    """
    parser = argparse.ArgumentParser(description="Training")

    parser.add_argument(
        "--cfg-path",
        required=False,
        help="path to configuration file.",
        default='configs/train_modality_alignment.yaml',
    )
    # NOTE(review): the help text below points users at --cfg-options, but
    # this parser does not define such a flag; --options is the only
    # override switch available here.
    parser.add_argument(
        "--options",
        nargs="+",
        help="override some settings in the used config, the key-value pair "
        "in xxx=yyy format will be merged into config file (deprecate), "
        "change to --cfg-options instead.",
    )

    args = parser.parse_args(argv)

    # if 'LOCAL_RANK' not in os.environ:
    #     os.environ['LOCAL_RANK'] = str(args.local_rank)

    return args
def setup_seeds(config):
    """Seed the Python, NumPy and PyTorch RNGs and pin the cuDNN flags.

    The seed is ``config.run_cfg.seed`` plus the value returned by
    ``get_rank()``.

    Args:
        config: Object exposing ``run_cfg.seed``.
    """
    seed = config.run_cfg.seed + get_rank()

    # Apply the same derived seed to every RNG used by the training stack.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

    # Turn off cuDNN benchmarking and request deterministic algorithms.
    cudnn.benchmark = False
    cudnn.deterministic = True
def get_runner_class(cfg):
    """
    Get runner class from config. Default to epoch-based runner.

    The runner name is read from the ``runner`` entry of ``cfg.run_cfg``;
    when that entry is absent the name ``"runner_base"`` is used. The name
    is then resolved through ``registry.get_runner_class``.

    Args:
        cfg: Config object exposing ``run_cfg`` with a ``get`` method.

    Returns:
        Whatever ``registry.get_runner_class`` returns for the chosen name.
    """
    runner_name = cfg.run_cfg.get("runner", "runner_base")
    return registry.get_runner_class(runner_name)
def is_stage_1_training(cfg):
    """Tell whether the config selects training stage 1.

    Args:
        cfg: Config object whose ``to_dict()`` result contains a ``"run"``
            mapping with a ``"stage"`` entry.

    Returns:
        ``True`` when ``run.stage`` equals ``1``, ``False`` otherwise.
        A missing ``run`` or ``stage`` entry is not handled here; the
        lookup error propagates to the caller.
    """
    run_section = cfg.to_dict()["run"]
    return run_section["stage"] == 1
def main():
    """Run one training job from the command line.

    Steps, in order: create the job id, load the config from the parsed
    arguments, initialise distributed mode, seed the RNGs, set up logging,
    build the task, build the training dataset (``ESMDataset`` when
    ``is_stage_1_training(cfg)`` is true, ``QADataset`` otherwise), build
    the model, construct the runner and call its ``train()`` method.
    """
    # allow auto-dl completes on main process without timeout when using NCCL backend.
    # os.environ["NCCL_BLOCKING_WAIT"] = "1"

    # set before init_distributed_mode() to ensure the same job_id shared across all ranks.
    job_id = now()

    cfg = Config(parse_args())

    init_distributed_mode(cfg.run_cfg)
    setup_seeds(cfg)

    # set after init_distributed_mode() to only log on master.
    setup_logger()
    cfg.pretty_print()

    task = tasks.setup_task(cfg)

    # NOTE(review): the dataset locations below are hard-coded absolute
    # paths; they are kept as-is here and should eventually come from the
    # config file.
    if is_stage_1_training(cfg):
        datasets_raw = ESMDataset(
            pdb_root="/home/ubuntu/pt/",
            seq_root="/home/ubuntu/seq/",
            ann_paths="/home/ubuntu/proteinchat/data/esm_subset/abstract.json",
            dataset_description="/home/ubuntu/dataset.json",
            chain="A",
        )
    else:
        datasets_raw = QADataset(
            pdb_root="/home/ubuntu/pt/",
            seq_root="/home/ubuntu/seq/",
            # ann_paths="/home/ubuntu/proteinchat/data/esm_subset/qa_all.json",
            ann_paths="/home/ubuntu/proteinchat/data/esm_subset/GPT_merged_summary.json",
            # dataset_description="/home/ubuntu/dataset.json",
            chain="A",
        )

    # The single dataset is registered under the 'esm' name as its 'train' split.
    datasets = {'esm': {'train': datasets_raw}}

    model = task.build_model(cfg)

    runner = get_runner_class(cfg)(
        cfg=cfg, job_id=job_id, task=task, model=model, datasets=datasets
    )
    runner.train()
# Start training only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()