# RGMC / audiocaptioner.py
# First pass at captioning functionality through the web app
from lib import *  # expected to provide MLP and GPT2LMHeadModel, among others
import contextlib
import io
from typing import Optional

import laion_clap
import torch

class AudioCaptioner(torch.nn.Module):

    def get_dummy_token(self, batch_size: int, device: torch.device) -> torch.Tensor:
        # dummy prefix tokens used to pad the labels over the prefix positions
        return torch.zeros(batch_size, self.prefix_length, dtype=torch.int64, device=device)
    def embed_waveform(self, waveform):
        # encode the raw waveform with CLAP to get the audio embedding
        input_dict = {
            'waveform': waveform  # more key-values can be added here
        }
        audio_embeds = self.clap_model.model.encode_audio(
            input_dict,
            device=waveform.device
        )
        # take the BxD last-layer embedding (D = 1024), project to 512, and L2-normalize
        audio_embedding = torch.nn.functional.normalize(
            self.clap_model.model.audio_projection(audio_embeds['embedding']), dim=-1
        )
        return audio_embedding
    def create_prefix(self, waveform, batch_size):
        if waveform is not None:
            audio_embedding = self.embed_waveform(waveform)
        else:
            # no audio provided: fall back to a zero embedding on the model's device
            device = next(self.parameters()).device
            audio_embedding = torch.zeros(batch_size, self.prefix_size, device=device)
        # project the audio embedding through the mapping network to a sequence of prefix embeddings
        prefix_projections = self.clip_project(audio_embedding).view(-1, self.prefix_length, self.gpt_embedding_size)
        return prefix_projections
    def forward(self, tokens: torch.Tensor, waveform: torch.Tensor, mask: Optional[torch.Tensor] = None,
                labels: Optional[torch.Tensor] = None, freeze_gpt: bool = False):
        # embed the caption tokens and prepend the projected audio prefix
        embedding_text = self.gpt.transformer.wte(tokens)
        prefix_projections = self.create_prefix(waveform, tokens.shape[0])
        embedding_text = torch.cat((prefix_projections, embedding_text), dim=1)
        # prepend dummy tokens to the labels so they line up with the prefix positions
        if labels is not None:
            dummy_token = self.get_dummy_token(tokens.shape[0], tokens.device)
            labels = torch.cat((dummy_token, tokens), dim=1)
        # push through GPT-2, optionally without tracking gradients
        if freeze_gpt:
            with torch.no_grad():
                out = self.gpt(inputs_embeds=embedding_text, labels=labels, attention_mask=mask)
        else:
            out = self.gpt(inputs_embeds=embedding_text, labels=labels, attention_mask=mask)
        return out
    def __init__(self, prefix_length: int, clip_length: Optional[int] = None, prefix_size: int = 512,
                 num_layers: int = 8):
        super().__init__()
        self.prefix_size = prefix_size
        self.prefix_length = prefix_length
        # GPT-2 language model that generates the caption conditioned on the audio prefix
        self.gpt = GPT2LMHeadModel.from_pretrained('gpt2')
        self.gpt_embedding_size = self.gpt.transformer.wte.weight.shape[1]
        # mapping network from the CLAP embedding to prefix_length GPT-2 prefix embeddings
        self.clip_project = MLP((prefix_size, (self.gpt_embedding_size * prefix_length) // 2,
                                 self.gpt_embedding_size * prefix_length))
        # CLAP audio encoder (music checkpoint); suppress its stdout while loading weights
        self.clap_model = laion_clap.CLAP_Module(
            enable_fusion=False,
            amodel='HTSAT-base'
        )
        with contextlib.redirect_stdout(io.StringIO()):
            self.clap_model.load_ckpt(ckpt='/graft1/datasets/kechen/clap_ckpt/music_audioset_epoch_15_esc_90.14.pt')
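

if __name__ == '__main__':
    # Minimal smoke-test sketch of how this module might be exercised, not part of the
    # web app itself. Assumptions: the hard-coded CLAP checkpoint path above exists,
    # prefix_length=10 is an arbitrary illustrative value, and GPT-2's tokenizer is
    # pulled in from transformers (not imported above). The waveform is a placeholder
    # tensor of silence; real input would be 48 kHz mono audio from the app.
    from transformers import GPT2Tokenizer

    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model = AudioCaptioner(prefix_length=10).to(device)

    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    tokens = tokenizer('a solo piano piece', return_tensors='pt')['input_ids'].to(device)
    waveform = torch.zeros(1, 480000, device=device)  # ~10 s of silence at 48 kHz

    # forward pass with labels so GPT-2 returns a language-modeling loss
    out = model(tokens, waveform, labels=tokens)
    print('loss:', out.loss.item())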