MelodyFlow / audiocraft /utils /checkpoint.py
Gael Le Lan
Initial commit
9d0d223
raw
history blame
6.13 kB
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.
from enum import Enum
import logging
from pathlib import Path
import re
import typing as tp
import flashy
import torch
from ..environment import AudioCraftEnvironment
logger = logging.getLogger(__name__)
class CheckpointSource(Enum):
CURRENT_XP = "current_xp"
PRETRAINED = "pretrained"
OTHER = "other"
def checkpoint_name(name: tp.Optional[str] = None, rank: tp.Optional[int] = None, use_fsdp: bool = False) -> str:
"""Checkpoint name formatted for all use in AudioCraft codebase and has the following format:
`checkpoint_<name>.th(.<rank>)`. By convention, name is expected to be empty for last checkpoint,
'best' for the best checkpoint or the epoch number.
Args:
name (str, optional): Name suffix for the checkpoint file stem.
rank (optional, int): Rank for distributed processing, retrieved with flashy if not provided.
use_fsdp (bool): Whether the calling solver relies on FSDP.
Returns:
str: The checkpoint name.
"""
suffix = ''
if rank is None:
rank = flashy.distrib.rank()
if rank > 0 and use_fsdp:
suffix = '.' + str(rank)
name_part = ''
if name is not None:
name_part = f'_{name}'
return f'checkpoint{name_part}.th{suffix}'
def is_sharded_checkpoint(path: Path) -> bool:
"""Whether the checkpoint at the given path corresponds to a sharded checkpoint across rank."""
return re.search(r'\.th\.\d+$', path.name) is not None
def resolve_checkpoint_path(sig_or_path: tp.Union[Path, str], name: tp.Optional[str] = None,
use_fsdp: bool = False) -> tp.Optional[Path]:
"""Resolve a given checkpoint path for a provided dora sig or path.
Args:
sig_or_path (Path or str): Checkpoint path or dora signature.
name (str, optional): Name suffix for the checkpoint file stem.
rank (optional, int): Rank for distributed processing, retrieved with flashy if not provided.
use_fsdp (bool): Whether the calling solver relies on FSDP.
Returns:
Path, optional: Resolved checkpoint path, if it exists.
"""
from audiocraft import train
xps_root = train.main.dora.dir / 'xps'
sig_or_path = str(sig_or_path)
if sig_or_path.startswith('//sig/'):
sig = sig_or_path[len('//sig/'):]
path = xps_root / sig
else:
path = Path(sig_or_path)
path = AudioCraftEnvironment.resolve_reference_path(path)
if path.is_dir():
path = path / checkpoint_name(name, use_fsdp=use_fsdp)
if path.exists():
return path
else:
return None
def load_checkpoint(checkpoint_path: Path, is_sharded: bool = False) -> tp.Any:
"""Load state from checkpoints at the specified checkpoint path."""
if is_sharded:
rank0_checkpoint_path = checkpoint_path.parent / checkpoint_name(use_fsdp=False)
if rank0_checkpoint_path.exists():
check_sharded_checkpoint(checkpoint_path, rank0_checkpoint_path)
state = torch.load(checkpoint_path, 'cpu')
logger.info("Checkpoint loaded from %s", checkpoint_path)
return state
def save_checkpoint(state: tp.Any, checkpoint_path: Path, is_sharded: bool = False) -> None:
"""Save state to disk to the specified checkpoint_path."""
_safe_save_checkpoint(state, checkpoint_path, is_sharded)
logger.info("Checkpoint saved to %s", checkpoint_path)
def flush_stale_checkpoints(checkpoint_path: Path, keep_last: tp.Optional[int] = None) -> None:
"""Flush checkpoints to only keep last N checkpoints."""
if keep_last is None or keep_last <= 0:
return
checkpoint_dir = checkpoint_path.parent
suffix = ''
if flashy.distrib.rank() > 0:
suffix = f'.{flashy.distrib.rank()}'
checkpoint_files_with_epoch = []
for path in Path(checkpoint_dir).glob(f'checkpoint_*.th{suffix}'):
epoch_part = path.name.split('.', 1)[0].split('_', 1)[1]
if epoch_part.isdigit():
checkpoint_files_with_epoch.append((path, int(epoch_part)))
checkpoint_files = [path for path, _ in list(sorted(checkpoint_files_with_epoch, key=lambda t: t[1]))]
total_to_flush = max(0, len(checkpoint_files) - keep_last)
files_to_flush = checkpoint_files[:total_to_flush]
for path in files_to_flush:
logger.debug("Removing checkpoint: %s", str(path))
path.unlink(missing_ok=True)
def check_sharded_checkpoint(checkpoint_path: Path, rank0_checkpoint_path: Path) -> None:
"""Check sharded checkpoint state, ensuring the checkpoints are not corrupted."""
# Finish the work of a previous run that got interrupted while dumping.
old_path = Path(str(checkpoint_path) + '.old')
if old_path.exists():
raise RuntimeError(
f"Old checkpoint {old_path} from previous version of this code exist, cannot safely proceed.")
token = Path(str(rank0_checkpoint_path) + '.tmp.done')
tmp_path = Path(str(checkpoint_path) + '.tmp')
if token.exists():
if tmp_path.exists():
tmp_path.rename(checkpoint_path)
flashy.distrib.barrier()
if flashy.distrib.is_rank_zero() and token.exists():
token.unlink()
def _safe_save_checkpoint(state: tp.Any, checkpoint_path: Path, is_sharded: bool = False) -> None:
"""Save checkpoints in a safe manner even with when sharded checkpoints across nodes."""
def _barrier_if_sharded():
if is_sharded:
flashy.distrib.barrier()
if flashy.distrib.is_rank_zero():
token = Path(str(checkpoint_path) + '.tmp.done')
if token.exists():
token.unlink()
_barrier_if_sharded()
with flashy.utils.write_and_rename(checkpoint_path) as f:
torch.save(state, f)
_barrier_if_sharded()
if flashy.distrib.is_rank_zero():
token.touch()
_barrier_if_sharded()
_barrier_if_sharded()
if flashy.distrib.rank() == 0:
token.unlink()