Spaces:

yuwd
/

Polos-Demo

Sleeping

File size: 6,903 Bytes

03f6091

# -*- coding: utf-8 -*-
r"""
Lightning Trainer Setup
==============
   Setup logic for the lightning trainer.
"""
import os
from argparse import Namespace
from datetime import datetime
from typing import Union

import click
import pandas as pd

import pytorch_lightning as pl
from polos.models.utils import apply_to_sample
from pytorch_lightning.callbacks import (
    Callback,
    EarlyStopping,
    ModelCheckpoint,
)
from pytorch_lightning.loggers import LightningLoggerBase, WandbLogger, TensorBoardLogger
from pytorch_lightning.utilities import rank_zero_only


class TrainerConfig:
    """
    The TrainerConfig class is used to define default hyper-parameters that
    are used to initialize our Lightning Trainer. These parameters are then overwritted
    with the values defined in the YAML file.

    -------------------- General Parameters -------------------------

    :param seed: Training seed.

    :param deterministic: If true enables cudnn.deterministic. Might make your system
        slower, but ensures reproducibility.

    :param model: Model class we want to train.

    :param verbode: verbosity mode.

    :param overfit_batches: Uses this much data of the training set. If nonzero, will use
        the same training set for validation and testing. If the training dataloaders
        have shuffle=True, Lightning will automatically disable it.

    :param lr_finder: Runs a small portion of the training where the learning rate is increased
        after each processed batch and the corresponding loss is logged. The result of this is
        a lr vs. loss plot that can be used as guidance for choosing a optimal initial lr.

    -------------------- Model Checkpoint & Early Stopping -------------------------

    :param early_stopping: If true enables EarlyStopping.

    :param save_top_k: If save_top_k == k, the best k models according to the metric
        monitored will be saved.

    :param monitor: Metric to be monitored.

    :param save_weights_only: Saves only the weights of the model.

    :param period: Interval (number of epochs) between checkpoints.

    :param metric_mode: One of {min, max}. In min mode, training will stop when the
        metric monitored has stopped decreasing; in max mode it will stop when the
        metric monitored has stopped increasing.

    :param min_delta: Minimum change in the monitored metric to qualify as an improvement.

    :param patience: Number of epochs with no improvement after which training will be stopped.
    """

    seed: int = 3
    deterministic: bool = True
    model: str = None
    verbose: bool = False
    overfit_batches: Union[int, float] = 0.0

    # Model Checkpoint & Early Stopping
    early_stopping: bool = True
    save_top_k: int = 1
    monitor: str = "kendall"
    save_weights_only: bool = False
    metric_mode: str = "max"
    min_delta: float = 0.0
    patience: int = 1
    accumulate_grad_batches: int = 1
    lr_finder: bool = False

    def __init__(self, initial_data: dict) -> None:
        trainer_attr = pl.Trainer.default_attributes()
        for key in trainer_attr:
            setattr(self, key, trainer_attr[key])

        for key in initial_data:
            if hasattr(self, key):
                setattr(self, key, initial_data[key])

    def namespace(self) -> Namespace:
        return Namespace(
            **{
                name: getattr(self, name)
                for name in dir(self)
                if not callable(getattr(self, name)) and not name.startswith("__")
            }
        )


class TrainReport(Callback):
    """ Logger Callback that echos results during training. """

    _stack: list = []  # stack to keep metrics from all epochs

    @rank_zero_only
    def on_validation_end(
        self, trainer: pl.Trainer, pl_module: pl.LightningModule
    ) -> None:
        metrics = trainer.callback_metrics
        metrics = LightningLoggerBase._flatten_dict(metrics, "_")
        metrics = apply_to_sample(lambda x: x.item(), metrics)
        self._stack.append(metrics)
        # pl_module.print() # Print newline

    @rank_zero_only
    def on_fit_end(self, trainer: pl.Trainer, pl_module: pl.LightningModule) -> None:
        click.secho("\nTraining Report Experiment:", fg="yellow")
        index_column = ["Epoch " + str(i) for i in range(len(self._stack) - 1)]
        df = pd.DataFrame(self._stack[1:], index=index_column)
        # Clean dataframe columns
        del df["train_loss_step"]
        del df["gpu_id: 0/memory.used (MB)"]
        del df["train_loss_epoch"]
        del df["train_avg_loss"]
        click.secho("{}".format(df), fg="yellow")


def build_trainer(hparams: Namespace, resume_from_checkpoint) -> pl.Trainer:
    """
    :param hparams: Namespace

    :returns: Lightning Trainer (obj)
    """
    # Early Stopping Callback
    early_stop_callback = EarlyStopping(
        monitor=hparams.monitor,
        min_delta=hparams.min_delta,
        patience=hparams.patience,
        verbose=hparams.verbose,
        mode=hparams.metric_mode,
    )

    # TestTube Logger Callback
    wandb_logger = WandbLogger(name="polos",
                               project="polos_cvpr",
                               save_dir="experiments/",
                               version="version_" + datetime.now().strftime("%d-%m-%Y--%H-%M-%S"))

    tb_logger = TensorBoardLogger(
        save_dir="experiments/",
        version="version_" + datetime.now().strftime("%d-%m-%Y--%H-%M-%S"),
        name="lightning",
    )

    # Model Checkpoint Callback
    ckpt_path = os.path.join("experiments/lightning/", wandb_logger.version)

    checkpoint_callback = ModelCheckpoint(
        dirpath=ckpt_path,
        save_top_k=hparams.save_top_k,
        verbose=hparams.verbose,
        monitor=hparams.monitor,
        save_weights_only=hparams.save_weights_only,
        period=1,
        mode=hparams.metric_mode,
    )
    other_callbacks = [early_stop_callback, checkpoint_callback, TrainReport()]

    trainer = pl.Trainer(
        logger=[wandb_logger,tb_logger],
        callbacks=other_callbacks,
        gradient_clip_val=hparams.gradient_clip_val,
        gpus=hparams.gpus,
        log_gpu_memory="all",
        deterministic=hparams.deterministic,
        overfit_batches=hparams.overfit_batches,
        check_val_every_n_epoch=1,
        fast_dev_run=False,
        accumulate_grad_batches=hparams.accumulate_grad_batches,
        max_epochs=hparams.max_epochs,
        min_epochs=hparams.min_epochs,
        limit_train_batches=hparams.limit_train_batches,
        limit_val_batches=hparams.limit_val_batches,
        val_check_interval=hparams.val_check_interval,
        distributed_backend=hparams.distributed_backend,
        precision=hparams.precision,
        weights_summary="top",
        profiler=hparams.profiler,
        resume_from_checkpoint=resume_from_checkpoint,
    )
    return trainer