Spaces:

Sijuade
/

MLM-CLIP-PHI-2-LLAVA-chatbot

Build error

App Files Files Community

Sijuade committed on Jan 28

Commit

00b53ff

•

1 Parent(s): 137a199

Update config.py

Browse files

Files changed (1) hide show

config.py +69 -1

config.py CHANGED Viewed

@@ -1,5 +1,59 @@
 import torch
 from transformers import AutoProcessor, AutoTokenizer
 class Config:
@@ -12,4 +66,18 @@ class Config:
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     processor = AutoProcessor.from_pretrained(model_name)
-    tokenizer = AutoTokenizer.from_pretrained(phi_model_name, trust_remote_code=True)

+import peft
 import torch
+import whisperx
+import torch.nn as nn
 from transformers import AutoProcessor, AutoTokenizer
+from transformers import CLIPVisionModel, AutoModelForCausalLM
class Projections(nn.Module):
    """Map features of width ``clip_embed`` to width ``phi_embed``.

    Judging by the parameter names, this projects CLIP vision features into
    the Phi language-model embedding space (confirm against the caller).

    The pipeline is: a linear layer (``clip_embed`` -> ``phi_embed``), a
    LayerNorm over the ``phi_embed`` dimension, then a stack of residual MLP
    blocks (Linear -> GELU -> Linear), each at ``phi_embed`` width.

    Args:
        clip_embed (int): Size of the last dimension of the input tensor.
        phi_embed (int): Size of the last dimension of the output tensor.
        num_projection_layers (int): Number of residual MLP blocks applied
            after the normalisation step. Defaults to 6.
    """

    def __init__(
        self,
        clip_embed,
        phi_embed,
        num_projection_layers=6,
    ):
        super().__init__()
        # Attribute names are part of the state_dict key layout; keep them
        # stable so existing checkpoints still load.
        self.norm = nn.LayerNorm(phi_embed)
        self.output = nn.Linear(clip_embed, phi_embed)
        self.projection_layers = nn.ModuleList(
            [
                nn.Sequential(
                    nn.Linear(phi_embed, phi_embed),
                    nn.GELU(),
                    nn.Linear(phi_embed, phi_embed),
                )
                for _ in range(num_projection_layers)
            ]
        )

    def forward(self, x):
        """Project ``x`` and refine it through the residual blocks.

        Args:
            x (torch.Tensor): Tensor whose last dimension is ``clip_embed``.

        Returns:
            torch.Tensor: Tensor with the same leading dimensions as ``x``
            and a last dimension of ``phi_embed``.
        """
        x = self.output(x)
        # LayerNorm returns a new tensor and does not modify its input, so
        # the result has to be assigned for the normalisation to take effect.
        # NOTE(review): weights trained while this result was being discarded
        # saw un-normalised activations; re-validate such checkpoints.
        x = self.norm(x)
        for layer in self.projection_layers:
            # Residual connection around each MLP block.
            x = layer(x) + x
        return x
def load_projection_model(path, clip_embed, phi_embed):
    """Build a Projections model and load its weights from a checkpoint.

    The checkpoint must be a mapping with a ``'state_dict'`` entry. Every
    occurrence of ``'projection.'`` is removed from the parameter names
    before they are loaded, so keys saved as ``projection.<name>`` line up
    with the attributes of a bare ``Projections`` instance.

    Args:
        path (str): Path to the checkpoint file.
        clip_embed (int): Input feature width passed to ``Projections``.
        phi_embed (int): Output feature width passed to ``Projections``.

    Returns:
        torch.nn.Module: The Projections instance with weights loaded.

    Raises:
        KeyError: If the checkpoint has no ``'state_dict'`` entry.
        RuntimeError: If the renamed keys do not match the model
            (``load_state_dict`` is called in its default strict mode).
    """
    # Deserialize onto the CPU so a checkpoint saved from a CUDA device can
    # still be read on a host without a GPU; the caller decides where the
    # returned model is moved afterwards.
    # NOTE: torch.load unpickles the file - only load trusted checkpoints.
    checkpoint = torch.load(path, map_location="cpu")
    state_dict = checkpoint['state_dict']
    new_state_dict = {k.replace('projection.', ''): v for k, v in state_dict.items()}
    model = Projections(clip_embed, phi_embed)
    model.load_state_dict(new_state_dict)
    return model
 class Config:
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     processor = AutoProcessor.from_pretrained(model_name)
+    tokenizer = AutoTokenizer.from_pretrained(phi_model_name, trust_remote_code=True)
+    projection = load_projection_model("models/MModalGPT-FINETUNE-step=29000-loss=3.45.ckpt", 768, 2560)
+    clip_model = CLIPVisionModel.from_pretrained(model_name)
+    audio_model = whisperx.load_model("small", device.type, compute_type="float16")
+    text_model = AutoModelForCausalLM.from_pretrained(phi_model_name,
+                                                  torch_dtype=torch.float16,
+                                                  #device_map="cuda",
+                                                  low_cpu_mem_usage=True,
+                                                  return_dict=True,
+                                                  trust_remote_code=True)
+    peft_model = peft.PeftModel.from_pretrained(text_model, 'models/29000')