import peft
import torch
import torch.nn as nn
import whisperx
from transformers import (
    AutoModelForCausalLM,
    AutoProcessor,
    AutoTokenizer,
    CLIPVisionModel,
)


class Projections(nn.Module):
    """Projects CLIP vision embeddings into the phi-2 embedding space."""

    def __init__(
        self,
        clip_embed,
        phi_embed,
        num_projection_layers=6,
    ):
        super().__init__()
        self.norm = nn.LayerNorm(phi_embed)
        self.output = nn.Linear(clip_embed, phi_embed)
        # Residual MLP blocks applied after the initial linear projection.
        self.projection_layers = nn.ModuleList(
            [
                nn.Sequential(
                    nn.Linear(phi_embed, phi_embed),
                    nn.GELU(),
                    nn.Linear(phi_embed, phi_embed),
                )
                for _ in range(num_projection_layers)
            ]
        )
    def forward(self, x):
        x = self.output(x)
        x = self.norm(x)
        for layer in self.projection_layers:
            residual = x
            x = layer(x) + residual
        return x
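
# Example (illustrative): with the sizes used below (CLIP ViT-B/32 hidden size
# 768, phi-2 hidden size 2560), the head maps a batch of CLIP patch embeddings
# of shape (batch, num_patches, 768) to (batch, num_patches, 2560):
#
#     proj = Projections(clip_embed=768, phi_embed=2560)
#     clip_feats = torch.randn(1, 49, 768)
#     image_embeds = proj(clip_feats)  # shape: (1, 49, 2560)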

def load_projection_model(path, clip_embed, phi_embed):
    """Loads a Projections model instance from a checkpoint and returns it with weights loaded.

    Args:
        path (str): Path to the checkpoint file.
        clip_embed (int): Input (CLIP) embedding dimension.
        phi_embed (int): Output (phi-2) embedding dimension.

    Returns:
        torch.nn.Module: The loaded Projections model instance.
    """
    # The checkpoint's state_dict stores this head's weights under a
    # 'projection.' prefix (e.g. 'projection.output.weight'); strip it so the
    # keys match the standalone Projections module.
    state_dict = torch.load(path, map_location="cpu")["state_dict"]
    new_state_dict = {k.replace("projection.", ""): v for k, v in state_dict.items()}
    model = Projections(clip_embed, phi_embed)
    model.load_state_dict(new_state_dict)
    return model


class Config:
    EOS_TOKEN_ID = 50256  # phi-2's <|endoftext|> token
    QUESTION_ANSWER_SEPARATOR_ID = 50295  # Special token ID for question-answer separation
    IMAGE_SEPARATOR_TOKENS = [685, 36259, 14041, 60, 220]  # Token IDs marking the image segment of the prompt

    phi_model_name = "microsoft/phi-2"
    model_name = "openai/clip-vit-base-patch32"
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    processor = AutoProcessor.from_pretrained(model_name)
    tokenizer = AutoTokenizer.from_pretrained(phi_model_name, trust_remote_code=True)
    # 768 = CLIP ViT-B/32 hidden size, 2560 = phi-2 hidden size.
    projection = load_projection_model("models/MModalGPT-FINETUNE-step=29000-loss=3.45.ckpt", 768, 2560)
    clip_model = CLIPVisionModel.from_pretrained(model_name)
    # float16 compute requires a GPU backend; fall back to int8 on CPU.
    audio_model = whisperx.load_model(
        "small",
        device.type,
        compute_type="float16" if device.type == "cuda" else "int8",
    )
    text_model = AutoModelForCausalLM.from_pretrained(
        phi_model_name,
        torch_dtype=torch.float16,
        # device_map="cuda",
        low_cpu_mem_usage=True,
        return_dict=True,
        trust_remote_code=True,
    )
    # Attach the fine-tuned adapter weights (saved at training step 29000) to phi-2.
    peft_model = peft.PeftModel.from_pretrained(text_model, "models/29000")
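

# Illustrative sketch (an assumption, not code from the original Space): one way
# the components above can be combined. The prompt layout, the use of
# inputs_embeds with generate(), and the function names here are assumptions;
# the Space's exact inference logic may differ.
def transcribe_audio(audio_path):
    # whisperx returns a dict of timestamped segments; join them into one string.
    result = Config.audio_model.transcribe(audio_path)
    return " ".join(seg["text"].strip() for seg in result["segments"])


def answer_image_question(image, question, max_new_tokens=64):
    pixel_values = Config.processor(images=image, return_tensors="pt").pixel_values
    with torch.no_grad():
        # CLIP patch embeddings, dropping the CLS token at position 0.
        clip_out = Config.clip_model(pixel_values).last_hidden_state[:, 1:, :]
        # Project into phi-2's embedding space and match the fp16 text model.
        image_embeds = Config.projection(clip_out).to(torch.float16)
        # Embed the question with phi-2's own token embedding table.
        q_ids = Config.tokenizer(question, return_tensors="pt").input_ids
        q_embeds = Config.peft_model.get_input_embeddings()(q_ids)
        # [image embeddings | question embeddings] -> generate an answer.
        inputs_embeds = torch.cat([image_embeds, q_embeds], dim=1)
        out_ids = Config.peft_model.generate(
            inputs_embeds=inputs_embeds,
            max_new_tokens=max_new_tokens,
            eos_token_id=Config.EOS_TOKEN_ID,
        )
    return Config.tokenizer.decode(out_ids[0], skip_special_tokens=True)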