Spaces:
Running
on
Zero
Running
on
Zero
# Copyright (c) Meta Platforms, Inc. and affiliates. | |
# All rights reserved. | |
# | |
# This source code is licensed under the license found in the | |
# LICENSE file in the root directory of this source tree. | |
from typing import Optional | |
import torch.nn as nn | |
import torch.nn.functional as F | |
from torch import Tensor | |
from mmdet.registry import MODELS | |
from .utils import weighted_loss | |
def knowledge_distillation_kl_div_loss(pred: Tensor, | |
soft_label: Tensor, | |
T: int, | |
detach_target: bool = True) -> Tensor: | |
r"""Loss function for knowledge distilling using KL divergence. | |
Args: | |
pred (Tensor): Predicted logits with shape (N, n + 1). | |
soft_label (Tensor): Target logits with shape (N, N + 1). | |
T (int): Temperature for distillation. | |
detach_target (bool): Remove soft_label from automatic differentiation | |
Returns: | |
Tensor: Loss tensor with shape (N,). | |
""" | |
assert pred.size() == soft_label.size() | |
target = F.softmax(soft_label / T, dim=1) | |
if detach_target: | |
target = target.detach() | |
kd_loss = F.kl_div( | |
F.log_softmax(pred / T, dim=1), target, reduction='none').mean(1) * ( | |
T * T) | |
return kd_loss | |
class KnowledgeDistillationKLDivLoss(nn.Module): | |
"""Loss function for knowledge distilling using KL divergence. | |
Args: | |
reduction (str): Options are `'none'`, `'mean'` and `'sum'`. | |
loss_weight (float): Loss weight of current loss. | |
T (int): Temperature for distillation. | |
""" | |
def __init__(self, | |
reduction: str = 'mean', | |
loss_weight: float = 1.0, | |
T: int = 10) -> None: | |
super().__init__() | |
assert T >= 1 | |
self.reduction = reduction | |
self.loss_weight = loss_weight | |
self.T = T | |
def forward(self, | |
pred: Tensor, | |
soft_label: Tensor, | |
weight: Optional[Tensor] = None, | |
avg_factor: Optional[int] = None, | |
reduction_override: Optional[str] = None) -> Tensor: | |
"""Forward function. | |
Args: | |
pred (Tensor): Predicted logits with shape (N, n + 1). | |
soft_label (Tensor): Target logits with shape (N, N + 1). | |
weight (Tensor, optional): The weight of loss for each | |
prediction. Defaults to None. | |
avg_factor (int, optional): Average factor that is used to average | |
the loss. Defaults to None. | |
reduction_override (str, optional): The reduction method used to | |
override the original reduction method of the loss. | |
Defaults to None. | |
Returns: | |
Tensor: Loss tensor. | |
""" | |
assert reduction_override in (None, 'none', 'mean', 'sum') | |
reduction = ( | |
reduction_override if reduction_override else self.reduction) | |
loss_kd = self.loss_weight * knowledge_distillation_kl_div_loss( | |
pred, | |
soft_label, | |
weight, | |
reduction=reduction, | |
avg_factor=avg_factor, | |
T=self.T) | |
return loss_kd | |