from typing import Optional

import torch
from torch import nn
from transformers import Wav2Vec2BertConfig, Wav2Vec2BertModel, Wav2Vec2BertPreTrainedModel
from transformers.models.mllama.configuration_mllama import MllamaTextConfig


class Llama3Embedding(Wav2Vec2BertPreTrainedModel):
    """Embedding module that splices Wav2Vec2-BERT audio embeddings into Llama 3 text embeddings."""

    base_model_prefix = "audio_model"

    def __init__(self, config: Wav2Vec2BertConfig, text_config: MllamaTextConfig):
        super().__init__(config)
        assert config.add_adapter is True, f'{type(self).__name__} requires `config.add_adapter` to be True.'
        # The audio encoder's output dimension must match the text hidden size so that
        # audio embeddings can be written directly into the text embedding sequence.
        assert config.output_hidden_size == text_config.hidden_size
        self.text_embeddings = nn.Embedding(text_config.vocab_size, text_config.hidden_size, text_config.pad_token_id)
        self.audio_embedding = Wav2Vec2BertModel(config)
        assert self.text_embeddings.weight.shape[-1] == text_config.hidden_size
        # Learnable marker vectors placed before and after each audio segment.
        self.start_of_audio = nn.Parameter(data=torch.zeros((1, config.output_hidden_size)), requires_grad=True)
        self.end_of_audio = nn.Parameter(data=torch.zeros((1, config.output_hidden_size)), requires_grad=True)

    def forward(
        self,
        input_ids: Optional[torch.LongTensor] = None,
        audio_features: Optional[torch.Tensor] = None,
    ) -> torch.Tensor:
        # Negative ids are audio placeholders; clamp them to 0 so the text embedding
        # lookup succeeds, then overwrite those positions with audio embeddings below.
        input_embeddings = self.text_embeddings(torch.clamp(input_ids, min=0))
        if audio_features is None:
            return input_embeddings
        bs, max_num_audio, num_frames, dim = audio_features.shape
        # Encode all audio clips in one flat batch, then restore the per-sample grouping.
        audio_embeddings = self.audio_embedding(
            input_features=audio_features.view((bs * max_num_audio, num_frames, dim))
        )['last_hidden_state']
        audio_embeddings = audio_embeddings.view((bs, max_num_audio, -1, self.start_of_audio.shape[-1]))
        for i in range(bs):
            for j in range(max_num_audio):
                # The j-th audio clip in a sample is marked by token id -(j + 1).
                audio_id = -1 - j
                if torch.any(input_ids[i] == audio_id):
                    positions = torch.nonzero(input_ids[i] == audio_id, as_tuple=True)
                    # Two slots of the placeholder span are reserved for the start/end markers.
                    seq_len = input_embeddings[i][positions].shape[0] - 2
                    input_embeddings[i] = input_embeddings[i].index_put(
                        positions,
                        torch.concat([self.start_of_audio, audio_embeddings[i, j, :seq_len, :], self.end_of_audio]),
                        accumulate=False,
                    )
        return input_embeddings
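

# ---------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module): builds a tiny
# Wav2Vec2BertConfig and MllamaTextConfig so a forward pass runs quickly on
# CPU. All sizes below are arbitrary placeholder values, not real model sizes.
# ---------------------------------------------------------------------------
if __name__ == "__main__":
    audio_config = Wav2Vec2BertConfig(
        hidden_size=64,
        num_hidden_layers=2,
        num_attention_heads=2,
        intermediate_size=128,
        add_adapter=True,       # required by Llama3Embedding
        output_hidden_size=64,  # must equal text_config.hidden_size
    )
    text_config = MllamaTextConfig(vocab_size=1024, hidden_size=64, pad_token_id=0)

    model = Llama3Embedding(audio_config, text_config).eval()

    # One sample with one audio clip: 20 frames of 160-dim filterbank features,
    # the input layout expected by Wav2Vec2BertModel.
    audio_features = torch.randn(1, 1, 20, audio_config.feature_projection_input_dim)

    # Token id -1 marks the placeholder span reserved for the first audio clip;
    # its length (6 here) must not exceed the encoded audio length plus the two
    # start/end marker slots.
    input_ids = torch.tensor([[5, 6, -1, -1, -1, -1, -1, -1, 7, 8]])

    with torch.no_grad():
        embeddings = model(input_ids=input_ids, audio_features=audio_features)
    print(embeddings.shape)  # expected: torch.Size([1, 10, 64])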