Sijuade committed on
Commit
b7be07b
1 Parent(s): 9cbbd7c

Upload 6 files

Files changed (6)
  1. MLMbanner.py +144 -0
  2. app.py +26 -0
  3. config.py +15 -0
  4. networks.py +79 -0
  5. requirements.txt +6 -0
  6. utils.py +90 -0
MLMbanner.py ADDED
@@ -0,0 +1,144 @@
+
+
+ def get_html():
+     html_string = """
+     <div id="banner">
+         <div id="particles-js"></div>
+         <h1>Multimodal Chatbot</h1>
+         <p>A chatbot that accepts text, audio, and images.</p>
+         <div class="icons">
+             <div class="icon" id="text-icon">&#128172;</div> <!-- Text Bubble Emoji -->
+             <div class="icon" id="audio-icon">&#127911;</div> <!-- Headphone Emoji -->
+             <div class="icon" id="image-icon">&#128247;</div> <!-- Camera Emoji -->
+         </div>
+     </div>
+
+     <style>
+     #banner {
+         background: linear-gradient(270deg, #6c5ce7, #a29bfe, #fd79a8);
+         background-size: 600% 600%;
+         color: white;
+         text-align: center;
+         padding: 30px;
+         border-radius: 15px;
+         box-shadow: 0 8px 16px rgba(0, 0, 0, 0.2);
+         position: relative;
+         overflow: hidden;
+         animation: AnimatedGradient 15s ease infinite;
+     }
+
+     #particles-js {
+         position: absolute;
+         width: 100%;
+         height: 100%;
+         top: 0;
+         left: 0;
+         z-index: 1;
+     }
+
+     #banner > * {
+         position: relative;
+         z-index: 2;
+     }
+
+     #banner h1 {
+         font-size: 2.8em;
+         margin-bottom: 10px;
+         animation: fadeInDown 1.5s ease-in-out;
+     }
+
+     #banner p {
+         font-size: 1.3em;
+         animation: fadeInUp 1.5s ease-in-out;
+     }
+
+     .icons {
+         display: flex;
+         justify-content: center;
+         margin-top: 20px;
+     }
+
+     .icon {
+         font-size: 2em;
+         margin: 0 10px;
+         animation: bounce 2s infinite;
+         transition: transform 0.2s;
+     }
+
+     .icon:hover {
+         transform: scale(1.1);
+     }
+
+     @keyframes fadeInDown {
+         from { opacity: 0; transform: translateY(-20px); }
+         to { opacity: 1; transform: translateY(0); }
+     }
+
+     @keyframes fadeInUp {
+         from { opacity: 0; transform: translateY(20px); }
+         to { opacity: 1; transform: translateY(0); }
+     }
+
+     @keyframes bounce {
+         0%, 100% { transform: translateY(0); }
+         50% { transform: translateY(-10px); }
+     }
+
+     @keyframes AnimatedGradient {
+         0% { background-position: 0% 50% }
+         50% { background-position: 100% 50% }
+         100% { background-position: 0% 50% }
+     }
+     </style>
+
+     <script src="https://cdn.jsdelivr.net/particles.js/2.0.0/particles.min.js"></script>
+     <script>
+     document.addEventListener("DOMContentLoaded", function() {
+         particlesJS("particles-js", {
+             "particles": {
+                 "number": {
+                     "value": 80,
+                     "density": {
+                         "enable": true,
+                         "value_area": 800
+                     }
+                 },
+                 "color": {
+                     "value": "#ffffff"
+                 },
+                 "shape": {
+                     "type": "circle",
+                     "stroke": {
+                         "width": 0,
+                         "color": "#000000"
+                     },
+                     "polygon": {
+                         "nb_sides": 5
+                     }
+                 },
+                 "opacity": {
+                     "value": 0.5,
+                     "random": false,
+                     "anim": {
+                         "enable": false,
+                         "speed": 1,
+                         "opacity_min": 0.1,
+                         "sync": false
+                     }
+                 },
+                 "size": {
+                     "value": 3,
+                     "random": true,
+                     "anim": {
+                         "enable": false,
+                         "speed": 40,
+                         "size_min": 0.1,
+                         "sync": false
+                     }
+                 },
+                 "line_linked": {
+                     "enable
+
+     """
+
+     return html_string
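The embedded particles.js configuration above is cut off inside "line_linked", so the particle animation never initializes as committed. As a minimal sketch, the string could be closed out along the following lines, assuming stock particles.js default values; everything below is illustrative and not part of this commit.

# Hypothetical remainder of html_string, replacing the dangling '"line_linked": { "enable'
# fragment above; the option values are common particles.js defaults, not from this repository.
PARTICLES_TAIL = """
                "line_linked": {
                    "enable": true,
                    "distance": 150,
                    "color": "#ffffff",
                    "opacity": 0.4,
                    "width": 1
                }
            }
        });
    });
    </script>
    """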
app.py ADDED
@@ -0,0 +1,26 @@
+ import gradio as gr
+ from MLMbanner import get_html
+ from utils import chatbot_response
+
+
+ with gr.Blocks() as demo:
+     gr.HTML(value=get_html, show_label=True)
+
+     with gr.Row():
+         text_input = gr.Textbox(label="Enter text", lines=10)
+         image_input = gr.Image(label="Upload image", type="pil")
+         audio_input = gr.Audio(label="Record or upload audio",
+                                type="filepath",
+                                sources=['microphone', 'upload'])
+
+     submit_button = gr.Button("Submit")
+
+     output = gr.Textbox(label="Chatbot Response", lines=10)
+
+     submit_button.click(
+         fn=chatbot_response,
+         inputs=[text_input, image_input, audio_input],
+         outputs=output
+     )
+
+ demo.launch()
config.py ADDED
@@ -0,0 +1,15 @@
+ import torch
+ from transformers import AutoProcessor, AutoTokenizer
+
+ class Config:
+
+     EOS_TOKEN_ID = 50256
+     QUESTION_ANSWER_SEPARATOR_ID = 50295  # Special token ID for question-answer separation
+     IMAGE_SEPARATOR_TOKENS = [685, 36259, 14041, 60, 220]
+
+     phi_model_name = "microsoft/phi-2"
+     model_name = "openai/clip-vit-base-patch32"
+     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+
+     processor = AutoProcessor.from_pretrained(model_name)
+     tokenizer = AutoTokenizer.from_pretrained(phi_model_name, trust_remote_code=True)
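config.py pins two special token IDs by hand. A quick interpreter check, assuming the phi-2 tokenizer resolves them as the rest of the code expects (the decoded separator text is simply printed, not asserted):

from config import Config

# phi-2 inherits the GPT-2-style end-of-text token, which should match EOS_TOKEN_ID.
assert Config.tokenizer.eos_token_id == Config.EOS_TOKEN_ID

# Decode the hard-coded image-separator prefix to see the literal text it prepends
# to every tokenized prompt.
print(Config.tokenizer.decode(Config.IMAGE_SEPARATOR_TOKENS))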
networks.py ADDED
@@ -0,0 +1,79 @@
+ import peft
+ import torch
+ import whisperx
+ import torch.nn as nn
+ from config import Config
+ from transformers import CLIPVisionModel, AutoModelForCausalLM
+
+
+ phi_model_name, model_name, device = Config.phi_model_name, Config.model_name, Config.device
+
+
+ class Projections(nn.Module):
+     def __init__(
+         self,
+         clip_embed,
+         phi_embed,
+         num_projection_layers=6,
+     ):
+         super().__init__()
+
+         self.norm = nn.LayerNorm(phi_embed)
+         self.output = nn.Linear(clip_embed, phi_embed)
+         self.projection_layers = nn.ModuleList(
+             [
+                 nn.Sequential(
+                     nn.Linear(phi_embed, phi_embed),
+                     nn.GELU(),
+                     nn.Linear(phi_embed, phi_embed),
+                 )
+                 for _ in range(num_projection_layers)
+             ]
+         )
+
+     def forward(self, x):
+         x = self.output(x)
+         self.norm(x)
+         for layer in self.projection_layers:
+             residual = x
+             x = layer(x) + residual
+
+         return x
+
+
+ def load_projection_model(path, clip_embed, phi_embed):
+     """Loads a Projections model instance from a checkpoint and returns it with weights loaded.
+
+     Args:
+         path (str): Path to the checkpoint file.
+
+     Returns:
+         torch.nn.Module: The loaded Projections model instance.
+     """
+     state_dict = torch.load(path)['state_dict']
+     new_state_dict = {k.replace('projection.', ''): v for k, v in state_dict.items()}
+
+     model = Projections(clip_embed, phi_embed)
+     model.load_state_dict(new_state_dict)
+
+     return model
+
+
+ text_model = AutoModelForCausalLM.from_pretrained(phi_model_name,
+                                                   torch_dtype=torch.float16,
+                                                   # device_map="cuda",
+                                                   low_cpu_mem_usage=True,
+                                                   return_dict=True,
+                                                   trust_remote_code=True)
+
+ peft_model = peft.PeftModel.from_pretrained(text_model, 'models/29000')
+ projection = load_projection_model("models/MModalGPT-FINETUNE-step=29000-loss=3.45.ckpt", 768, 2560)
+
+ clip_model = CLIPVisionModel.from_pretrained(model_name)
+ audio_model = whisperx.load_model("small", device.type, compute_type="float16")
+
+ projection = projection.to(device)
+ peft_model = peft_model.to(device)
+ clip_model = clip_model.to(device)
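For reference, a rough shape check of the projection path, assuming CLIP ViT-B/32 patch features of width 768 are mapped into phi-2's 2560-dimensional embedding space (dummy tensor, illustration only):

import torch
from networks import Projections  # note: importing networks also loads phi-2, CLIP and whisperX

# Dummy batch: 1 image, 49 patch tokens (7x7 grid for ViT-B/32 at 224px), 768-dim features.
dummy_clip_features = torch.randn(1, 49, 768)

projection = Projections(clip_embed=768, phi_embed=2560)
projected = projection(dummy_clip_features)

print(projected.shape)  # expected: torch.Size([1, 49, 2560])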
requirements.txt ADDED
@@ -0,0 +1,6 @@
+ torch
+ pandas
+ pillow
+ git+https://github.com/huggingface/transformers
+ git+https://github.com/m-bain/whisperx.git
+ git+https://github.com/huggingface/peft.git
utils.py ADDED
@@ -0,0 +1,90 @@
+ import torch
+ from config import Config
+ from networks import peft_model, audio_model, clip_model, projection
+
+
+ tokenizer = Config.tokenizer
+ tokenizer.pad_token = tokenizer.eos_token
+ tokenizer.add_tokens('<question-answer>')
+
+
+ def prepare_inputs(peft_model, audio_model, clip_model, projection, text_input=None, image_input=None, audio_input=None):
+
+     text_audio, text_embed, image_embed = None, None, None
+
+     if audio_input:
+         audio_transcribed = audio_model.transcribe(audio_input)
+         processed_audio = ''
+
+         for audio_segment in audio_transcribed['segments']:
+             processed_audio += audio_segment['text']
+
+         processed_audio = processed_audio.strip()
+
+     if image_input is not None:
+         image_processed = Config.processor(images=image_input, return_tensors="pt")
+
+         with torch.no_grad():
+             outputs = clip_model(**image_processed)
+             last_hidden_state = outputs.last_hidden_state[:, 1:, :]
+             image_embed = projection(last_hidden_state.to(Config.device)).to(torch.float16)
+
+     if audio_input is not None and text_input is not None:
+         text_audio = f"{text_input} {processed_audio}"
+     elif audio_input and text_input is None:
+         text_audio = processed_audio
+     elif audio_input is None and text_input:
+         text_audio = text_input
+
+     if text_audio:
+         tokenized_text_audio = tokenizer.encode(text_audio)
+         tokenized_text_audio = Config.IMAGE_SEPARATOR_TOKENS + tokenized_text_audio + [Config.QUESTION_ANSWER_SEPARATOR_ID]
+
+         with torch.no_grad():
+             tokenized_text_audio = torch.tensor(tokenized_text_audio)
+             text_embed = peft_model.model.model.embed_tokens(tokenized_text_audio.to(Config.device)).unsqueeze(0)
+
+     if text_audio is not None and image_input is not None:
+         combined_embed = torch.cat([image_embed, text_embed], dim=1)
+     elif text_audio and image_input is None:
+         combined_embed = text_embed
+     elif text_audio is None and image_input:
+         combined_embed = image_embed
+
+     return combined_embed
+
+
+ def chatbot_response(text_input, image_input, audio_input):
+
+     if text_input == '':
+         text_input = None
+
+     if text_input is None and image_input is None and audio_input is None:
+         return "Please enter text, upload an image, or record audio."
+
+     combined_embeds = prepare_inputs(peft_model, audio_model, clip_model, projection,
+                                      text_input, image_input, audio_input)
+     generated_tokens = generate_tokens(combined_embeds, max_tokens=60)
+     return tokenizer.decode(generated_tokens)
+
+
+ def generate_tokens(combined_embeds, max_tokens=100):
+     pred_tokens = []
+
+     combined_embed = combined_embeds
+
+     for _ in range(max_tokens):
+         logits = peft_model(inputs_embeds=combined_embed).logits[:, -1, :]
+         next_token_id = logits.argmax(dim=-1)
+
+         if next_token_id.item() == Config.EOS_TOKEN_ID:
+             break
+
+         pred_tokens.append(next_token_id.item())
+         next_token_embed = peft_model.model.model.embed_tokens(next_token_id.unsqueeze(0))
+
+         with torch.no_grad():
+             combined_embed = torch.cat((combined_embed, next_token_embed), dim=1)
+
+     return pred_tokens
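Outside of the Gradio UI, chatbot_response can be driven directly. A minimal sketch with placeholder file paths (any of the three inputs may be empty or None, exactly as in app.py):

from PIL import Image
from utils import chatbot_response

# Placeholder inputs for illustration; substitute real files.
image = Image.open("example.jpg")
audio_path = "question.wav"

print(chatbot_response("What is happening in this picture?", image, None))
print(chatbot_response("", None, audio_path))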