Navyabhat committed
Commit d43c6a1
1 Parent(s): 905e6c6

Upload 13 files

.gitignore ADDED
@@ -0,0 +1,160 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ #.idea/
Experiments/clip_expt.ipynb ADDED
@@ -0,0 +1,840 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 2,
6
+ "id": "9fe51ce7-4c87-4186-9fd3-0fb18ac43e56",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from PIL import Image\n",
11
+ "import requests\n",
12
+ "from transformers import AutoProcessor, CLIPVisionModel"
13
+ ]
14
+ },
15
+ {
16
+ "cell_type": "code",
17
+ "execution_count": 3,
18
+ "id": "0f4c21dd-4258-461d-8511-5be089d068a8",
19
+ "metadata": {},
20
+ "outputs": [],
21
+ "source": [
22
+ "model = CLIPVisionModel.from_pretrained(\"openai/clip-vit-base-patch32\", device_map=\"cuda:0\")\n",
23
+ "processor = AutoProcessor.from_pretrained(\"openai/clip-vit-base-patch32\", device_map=\"cuda:0\")"
24
+ ]
25
+ },
26
+ {
27
+ "cell_type": "code",
28
+ "execution_count": 4,
29
+ "id": "98b9f906-ffaa-4be4-8671-4ecf65f12c49",
30
+ "metadata": {},
31
+ "outputs": [],
32
+ "source": [
33
+ "# url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n",
34
+ "# image = Image.open(requests.get(url, stream=True).raw)\n",
35
+ "image = Image.open(\"002579.jpg\")"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": 17,
41
+ "id": "54b2e4ce-b77b-4314-87f6-ca2a1970fc79",
42
+ "metadata": {},
43
+ "outputs": [],
44
+ "source": [
45
+ "# image"
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "code",
50
+ "execution_count": 18,
51
+ "id": "cdd65c58-007f-450b-8deb-f8b4f372a823",
52
+ "metadata": {},
53
+ "outputs": [],
54
+ "source": [
55
+ "# image = None"
56
+ ]
57
+ },
58
+ {
59
+ "cell_type": "code",
60
+ "execution_count": 5,
61
+ "id": "e9066c2e-c78b-49d1-979b-10d0f4f09441",
62
+ "metadata": {},
63
+ "outputs": [],
64
+ "source": [
65
+ "inputs = processor(images=image, return_tensors=\"pt\", device_map=\"cuda:0\")"
66
+ ]
67
+ },
68
+ {
69
+ "cell_type": "code",
70
+ "execution_count": 20,
71
+ "id": "e98b211d-29d9-4662-be0b-e011e89b0101",
72
+ "metadata": {},
73
+ "outputs": [],
74
+ "source": [
75
+ "# inputs"
76
+ ]
77
+ },
78
+ {
79
+ "cell_type": "code",
80
+ "execution_count": 6,
81
+ "id": "b030bd3d-4282-4074-98fe-97e658bd0f50",
82
+ "metadata": {},
83
+ "outputs": [
84
+ {
85
+ "data": {
86
+ "text/plain": [
87
+ "torch.Size([1, 3, 224, 224])"
88
+ ]
89
+ },
90
+ "execution_count": 6,
91
+ "metadata": {},
92
+ "output_type": "execute_result"
93
+ }
94
+ ],
95
+ "source": [
96
+ "inputs[\"pixel_values\"].shape"
97
+ ]
98
+ },
99
+ {
100
+ "cell_type": "code",
101
+ "execution_count": 22,
102
+ "id": "0ce68f11-1c88-4dd7-8b17-0d1de5811fe6",
103
+ "metadata": {},
104
+ "outputs": [],
105
+ "source": [
106
+ "outputs = model(inputs[\"pixel_values\"].to(\"cuda:0\"))\n",
107
+ "last_hidden_state = outputs.last_hidden_state\n",
108
+ "pooled_output = outputs.pooler_output # pooled CLS states"
109
+ ]
110
+ },
111
+ {
112
+ "cell_type": "code",
113
+ "execution_count": 23,
114
+ "id": "30cb0918-a30e-4246-b540-6b8e0d876807",
115
+ "metadata": {},
116
+ "outputs": [
117
+ {
118
+ "data": {
119
+ "text/plain": [
120
+ "torch.Size([1, 768])"
121
+ ]
122
+ },
123
+ "execution_count": 23,
124
+ "metadata": {},
125
+ "output_type": "execute_result"
126
+ }
127
+ ],
128
+ "source": [
129
+ "pooled_output.shape"
130
+ ]
131
+ },
132
+ {
133
+ "cell_type": "code",
134
+ "execution_count": 24,
135
+ "id": "6399543a-f23f-426d-8289-3bb52d293ece",
136
+ "metadata": {},
137
+ "outputs": [
138
+ {
139
+ "data": {
140
+ "text/plain": [
141
+ "torch.Size([1, 50, 768])"
142
+ ]
143
+ },
144
+ "execution_count": 24,
145
+ "metadata": {},
146
+ "output_type": "execute_result"
147
+ }
148
+ ],
149
+ "source": [
150
+ "last_hidden_state.shape"
151
+ ]
152
+ },
153
+ {
154
+ "cell_type": "code",
155
+ "execution_count": 25,
156
+ "id": "19a70443-5942-4937-b3ea-6a52d76e2b08",
157
+ "metadata": {},
158
+ "outputs": [
159
+ {
160
+ "data": {
161
+ "text/plain": [
162
+ "torch.Size([1, 768])"
163
+ ]
164
+ },
165
+ "execution_count": 25,
166
+ "metadata": {},
167
+ "output_type": "execute_result"
168
+ }
169
+ ],
170
+ "source": [
171
+ "outputs[1].shape"
172
+ ]
173
+ },
174
+ {
175
+ "cell_type": "code",
176
+ "execution_count": 8,
177
+ "id": "fa13903f-a94a-4839-ae5a-8df4f55c68b6",
178
+ "metadata": {},
179
+ "outputs": [],
180
+ "source": [
181
+ "import torch\n",
182
+ "from torch import nn\n",
183
+ "from transformers import CLIPVisionConfig,CLIPPreTrainedModel"
184
+ ]
185
+ },
186
+ {
187
+ "cell_type": "code",
188
+ "execution_count": 9,
189
+ "id": "b2bd9198-42f0-40c3-80e1-d167c0b038fb",
190
+ "metadata": {},
191
+ "outputs": [
192
+ {
193
+ "ename": "NameError",
194
+ "evalue": "name 'Optional' is not defined",
195
+ "output_type": "error",
196
+ "traceback": [
197
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
198
+ "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
199
+ "Cell \u001b[0;32mIn[9], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[38;5;28;01mclass\u001b[39;00m \u001b[38;5;21;01mCLIPVisionModelWithProjection\u001b[39;00m(CLIPPreTrainedModel):\n\u001b[1;32m 2\u001b[0m config_class \u001b[38;5;241m=\u001b[39m CLIPVisionConfig\n\u001b[1;32m 3\u001b[0m main_input_name \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpixel_values\u001b[39m\u001b[38;5;124m\"\u001b[39m\n",
200
+ "Cell \u001b[0;32mIn[9], line 20\u001b[0m, in \u001b[0;36mCLIPVisionModelWithProjection\u001b[0;34m()\u001b[0m\n\u001b[1;32m 15\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mget_input_embeddings\u001b[39m(\u001b[38;5;28mself\u001b[39m) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m nn\u001b[38;5;241m.\u001b[39mModule:\n\u001b[1;32m 16\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvision_model\u001b[38;5;241m.\u001b[39membeddings\u001b[38;5;241m.\u001b[39mpatch_embedding\n\u001b[1;32m 18\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\n\u001b[1;32m 19\u001b[0m \u001b[38;5;28mself\u001b[39m,\n\u001b[0;32m---> 20\u001b[0m pixel_values: \u001b[43mOptional\u001b[49m[torch\u001b[38;5;241m.\u001b[39mFloatTensor] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 21\u001b[0m output_attentions: Optional[\u001b[38;5;28mbool\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 22\u001b[0m output_hidden_states: Optional[\u001b[38;5;28mbool\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 23\u001b[0m return_dict: Optional[\u001b[38;5;28mbool\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 24\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Union[Tuple, CLIPVisionModelOutput]:\n\u001b[1;32m 25\u001b[0m return_dict \u001b[38;5;241m=\u001b[39m return_dict \u001b[38;5;28;01mif\u001b[39;00m return_dict \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39muse_return_dict\n\u001b[1;32m 27\u001b[0m vision_outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mvision_model(\n\u001b[1;32m 28\u001b[0m pixel_values\u001b[38;5;241m=\u001b[39mpixel_values,\n\u001b[1;32m 29\u001b[0m output_attentions\u001b[38;5;241m=\u001b[39moutput_attentions,\n\u001b[1;32m 30\u001b[0m output_hidden_states\u001b[38;5;241m=\u001b[39moutput_hidden_states,\n\u001b[1;32m 31\u001b[0m return_dict\u001b[38;5;241m=\u001b[39mreturn_dict,\n\u001b[1;32m 32\u001b[0m )\n",
201
+ "\u001b[0;31mNameError\u001b[0m: name 'Optional' is not defined"
202
+ ]
203
+ }
204
+ ],
205
+ "source": [
206
+ "class CLIPVisionModelWithProjection(CLIPPreTrainedModel):\n",
207
+ " config_class = CLIPVisionConfig\n",
208
+ " main_input_name = \"pixel_values\"\n",
209
+ "\n",
210
+ " def __init__(self, config: CLIPVisionConfig):\n",
211
+ " super().__init__(config)\n",
212
+ "\n",
213
+ " self.vision_model = CLIPVisionTransformer(config)\n",
214
+ "\n",
215
+ " self.visual_projection = nn.Linear(config.hidden_size, config.projection_dim, bias=False)\n",
216
+ "\n",
217
+ " # Initialize weights and apply final processing\n",
218
+ " self.post_init()\n",
219
+ "\n",
220
+ " def get_input_embeddings(self) -> nn.Module:\n",
221
+ " return self.vision_model.embeddings.patch_embedding\n",
222
+ "\n",
223
+ " def forward(\n",
224
+ " self,\n",
225
+ " pixel_values: Optional[torch.FloatTensor] = None,\n",
226
+ " output_attentions: Optional[bool] = None,\n",
227
+ " output_hidden_states: Optional[bool] = None,\n",
228
+ " return_dict: Optional[bool] = None,\n",
229
+ " ) -> Union[Tuple, CLIPVisionModelOutput]:\n",
230
+ " return_dict = return_dict if return_dict is not None else self.config.use_return_dict\n",
231
+ "\n",
232
+ " vision_outputs = self.vision_model(\n",
233
+ " pixel_values=pixel_values,\n",
234
+ " output_attentions=output_attentions,\n",
235
+ " output_hidden_states=output_hidden_states,\n",
236
+ " return_dict=return_dict,\n",
237
+ " )\n",
238
+ "\n",
239
+ " pooled_output = vision_outputs[1] # pooled_output\n",
240
+ "\n",
241
+ " image_embeds = self.visual_projection(pooled_output)\n",
242
+ "\n",
243
+ " if not return_dict:\n",
244
+ " outputs = (image_embeds, vision_outputs[0]) + vision_outputs[2:]\n",
245
+ " return tuple(output for output in outputs if output is not None)\n",
246
+ "\n",
247
+ " return CLIPVisionModelOutput(\n",
248
+ " image_embeds=image_embeds,\n",
249
+ " last_hidden_state=vision_outputs.last_hidden_state,\n",
250
+ " hidden_states=vision_outputs.hidden_states,\n",
251
+ " attentions=vision_outputs.attentions,\n",
252
+ " )"
253
+ ]
254
+ },
255
+ {
256
+ "cell_type": "code",
257
+ "execution_count": 27,
258
+ "id": "68a9ee4a-d977-4725-842d-e64e0dd2f61d",
259
+ "metadata": {
260
+ "collapsed": true,
261
+ "jupyter": {
262
+ "outputs_hidden": true
263
+ }
264
+ },
265
+ "outputs": [
266
+ {
267
+ "name": "stderr",
268
+ "output_type": "stream",
269
+ "text": [
270
+ "loading configuration file config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/config.json\n",
271
+ "`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.\n",
272
+ "`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.\n",
273
+ "Model config CLIPConfig {\n",
274
+ " \"_name_or_path\": \"openai/clip-vit-base-patch32\",\n",
275
+ " \"architectures\": [\n",
276
+ " \"CLIPModel\"\n",
277
+ " ],\n",
278
+ " \"initializer_factor\": 1.0,\n",
279
+ " \"logit_scale_init_value\": 2.6592,\n",
280
+ " \"model_type\": \"clip\",\n",
281
+ " \"projection_dim\": 512,\n",
282
+ " \"text_config\": {\n",
283
+ " \"bos_token_id\": 0,\n",
284
+ " \"dropout\": 0.0,\n",
285
+ " \"eos_token_id\": 2,\n",
286
+ " \"model_type\": \"clip_text_model\"\n",
287
+ " },\n",
288
+ " \"transformers_version\": \"4.36.2\",\n",
289
+ " \"vision_config\": {\n",
290
+ " \"dropout\": 0.0,\n",
291
+ " \"model_type\": \"clip_vision_model\"\n",
292
+ " }\n",
293
+ "}\n",
294
+ "\n",
295
+ "loading weights file pytorch_model.bin from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/pytorch_model.bin\n",
296
+ "All model checkpoint weights were used when initializing CLIPModel.\n",
297
+ "\n",
298
+ "All the weights of CLIPModel were initialized from the model checkpoint at openai/clip-vit-base-patch32.\n",
299
+ "If your task is similar to the task the model of the checkpoint was trained on, you can already use CLIPModel for predictions without further training.\n",
300
+ "loading configuration file preprocessor_config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/preprocessor_config.json\n",
301
+ "loading configuration file preprocessor_config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/preprocessor_config.json\n",
302
+ "loading configuration file config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/config.json\n",
303
+ "`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.\n",
304
+ "`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.\n",
305
+ "Model config CLIPConfig {\n",
306
+ " \"_name_or_path\": \"openai/clip-vit-base-patch32\",\n",
307
+ " \"architectures\": [\n",
308
+ " \"CLIPModel\"\n",
309
+ " ],\n",
310
+ " \"initializer_factor\": 1.0,\n",
311
+ " \"logit_scale_init_value\": 2.6592,\n",
312
+ " \"model_type\": \"clip\",\n",
313
+ " \"projection_dim\": 512,\n",
314
+ " \"text_config\": {\n",
315
+ " \"bos_token_id\": 0,\n",
316
+ " \"dropout\": 0.0,\n",
317
+ " \"eos_token_id\": 2,\n",
318
+ " \"model_type\": \"clip_text_model\"\n",
319
+ " },\n",
320
+ " \"transformers_version\": \"4.36.2\",\n",
321
+ " \"vision_config\": {\n",
322
+ " \"dropout\": 0.0,\n",
323
+ " \"model_type\": \"clip_vision_model\"\n",
324
+ " }\n",
325
+ "}\n",
326
+ "\n",
327
+ "loading configuration file preprocessor_config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/preprocessor_config.json\n",
328
+ "size should be a dictionary on of the following set of keys: ({'width', 'height'}, {'shortest_edge'}, {'longest_edge', 'shortest_edge'}, {'longest_edge'}), got 224. Converted to {'shortest_edge': 224}.\n",
329
+ "crop_size should be a dictionary on of the following set of keys: ({'width', 'height'}, {'shortest_edge'}, {'longest_edge', 'shortest_edge'}, {'longest_edge'}), got 224. Converted to {'height': 224, 'width': 224}.\n",
330
+ "Image processor CLIPImageProcessor {\n",
331
+ " \"crop_size\": {\n",
332
+ " \"height\": 224,\n",
333
+ " \"width\": 224\n",
334
+ " },\n",
335
+ " \"do_center_crop\": true,\n",
336
+ " \"do_convert_rgb\": true,\n",
337
+ " \"do_normalize\": true,\n",
338
+ " \"do_rescale\": true,\n",
339
+ " \"do_resize\": true,\n",
340
+ " \"feature_extractor_type\": \"CLIPFeatureExtractor\",\n",
341
+ " \"image_mean\": [\n",
342
+ " 0.48145466,\n",
343
+ " 0.4578275,\n",
344
+ " 0.40821073\n",
345
+ " ],\n",
346
+ " \"image_processor_type\": \"CLIPImageProcessor\",\n",
347
+ " \"image_std\": [\n",
348
+ " 0.26862954,\n",
349
+ " 0.26130258,\n",
350
+ " 0.27577711\n",
351
+ " ],\n",
352
+ " \"resample\": 3,\n",
353
+ " \"rescale_factor\": 0.00392156862745098,\n",
354
+ " \"size\": {\n",
355
+ " \"shortest_edge\": 224\n",
356
+ " }\n",
357
+ "}\n",
358
+ "\n",
359
+ "loading file vocab.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/vocab.json\n",
360
+ "loading file merges.txt from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/merges.txt\n",
361
+ "loading file tokenizer.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/tokenizer.json\n",
362
+ "loading file added_tokens.json from cache at None\n",
363
+ "loading file special_tokens_map.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/special_tokens_map.json\n",
364
+ "loading file tokenizer_config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/tokenizer_config.json\n",
365
+ "loading configuration file config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/config.json\n",
366
+ "`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.\n",
367
+ "`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.\n",
368
+ "Model config CLIPConfig {\n",
369
+ " \"_name_or_path\": \"openai/clip-vit-base-patch32\",\n",
370
+ " \"architectures\": [\n",
371
+ " \"CLIPModel\"\n",
372
+ " ],\n",
373
+ " \"initializer_factor\": 1.0,\n",
374
+ " \"logit_scale_init_value\": 2.6592,\n",
375
+ " \"model_type\": \"clip\",\n",
376
+ " \"projection_dim\": 512,\n",
377
+ " \"text_config\": {\n",
378
+ " \"bos_token_id\": 0,\n",
379
+ " \"dropout\": 0.0,\n",
380
+ " \"eos_token_id\": 2,\n",
381
+ " \"model_type\": \"clip_text_model\"\n",
382
+ " },\n",
383
+ " \"transformers_version\": \"4.36.2\",\n",
384
+ " \"vision_config\": {\n",
385
+ " \"dropout\": 0.0,\n",
386
+ " \"model_type\": \"clip_vision_model\"\n",
387
+ " }\n",
388
+ "}\n",
389
+ "\n"
390
+ ]
391
+ }
392
+ ],
393
+ "source": [
394
+ "from PIL import Image\n",
395
+ "import requests\n",
396
+ "from transformers import AutoProcessor, CLIPModel\n",
397
+ "\n",
398
+ "model = CLIPModel.from_pretrained(\"openai/clip-vit-base-patch32\")\n",
399
+ "processor = AutoProcessor.from_pretrained(\"openai/clip-vit-base-patch32\")\n",
400
+ "\n",
401
+ "url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n",
402
+ "image = Image.open(requests.get(url, stream=True).raw)\n",
403
+ "\n",
404
+ "inputs = processor(images=image, return_tensors=\"pt\")\n",
405
+ "\n",
406
+ "image_features = model.get_image_features(**inputs)"
407
+ ]
408
+ },
409
+ {
410
+ "cell_type": "code",
411
+ "execution_count": 29,
412
+ "id": "9ff63766-b706-452b-b735-bf9000fb9c20",
413
+ "metadata": {},
414
+ "outputs": [
415
+ {
416
+ "data": {
417
+ "text/plain": [
418
+ "torch.Size([1, 512])"
419
+ ]
420
+ },
421
+ "execution_count": 29,
422
+ "metadata": {},
423
+ "output_type": "execute_result"
424
+ }
425
+ ],
426
+ "source": [
427
+ "image_features.shape"
428
+ ]
429
+ },
430
+ {
431
+ "cell_type": "code",
432
+ "execution_count": 30,
433
+ "id": "82566e7b-3c91-421a-94c5-f1e2b3e91c8c",
434
+ "metadata": {
435
+ "collapsed": true,
436
+ "jupyter": {
437
+ "outputs_hidden": true
438
+ }
439
+ },
440
+ "outputs": [
441
+ {
442
+ "name": "stderr",
443
+ "output_type": "stream",
444
+ "text": [
445
+ "loading configuration file config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/config.json\n",
446
+ "Model config CLIPVisionConfig {\n",
447
+ " \"attention_dropout\": 0.0,\n",
448
+ " \"dropout\": 0.0,\n",
449
+ " \"hidden_act\": \"quick_gelu\",\n",
450
+ " \"hidden_size\": 768,\n",
451
+ " \"image_size\": 224,\n",
452
+ " \"initializer_factor\": 1.0,\n",
453
+ " \"initializer_range\": 0.02,\n",
454
+ " \"intermediate_size\": 3072,\n",
455
+ " \"layer_norm_eps\": 1e-05,\n",
456
+ " \"model_type\": \"clip_vision_model\",\n",
457
+ " \"num_attention_heads\": 12,\n",
458
+ " \"num_channels\": 3,\n",
459
+ " \"num_hidden_layers\": 12,\n",
460
+ " \"patch_size\": 32,\n",
461
+ " \"projection_dim\": 512,\n",
462
+ " \"transformers_version\": \"4.36.2\"\n",
463
+ "}\n",
464
+ "\n",
465
+ "loading weights file pytorch_model.bin from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/pytorch_model.bin\n",
466
+ "Some weights of the model checkpoint at openai/clip-vit-base-patch32 were not used when initializing CLIPVisionModel: ['text_model.encoder.layers.8.mlp.fc2.weight', 'text_model.encoder.layers.4.self_attn.v_proj.weight', 'text_model.encoder.layers.2.mlp.fc2.bias', 'text_model.encoder.layers.2.self_attn.q_proj.bias', 'text_model.encoder.layers.4.self_attn.v_proj.bias', 'text_model.encoder.layers.6.self_attn.k_proj.bias', 'text_model.encoder.layers.9.self_attn.k_proj.weight', 'text_model.encoder.layers.11.self_attn.q_proj.bias', 'text_model.encoder.layers.3.self_attn.out_proj.bias', 'text_model.encoder.layers.11.self_attn.k_proj.bias', 'text_model.encoder.layers.7.self_attn.k_proj.bias', 'text_model.encoder.layers.1.self_attn.q_proj.weight', 'text_model.encoder.layers.11.layer_norm1.bias', 'text_model.encoder.layers.11.mlp.fc2.bias', 'text_model.encoder.layers.10.layer_norm1.bias', 'text_model.encoder.layers.7.self_attn.q_proj.bias', 'text_model.encoder.layers.11.self_attn.k_proj.weight', 'text_model.encoder.layers.5.self_attn.q_proj.bias', 'text_model.encoder.layers.0.self_attn.v_proj.bias', 'logit_scale', 'text_model.encoder.layers.8.mlp.fc1.bias', 'text_model.encoder.layers.6.layer_norm1.bias', 'text_model.encoder.layers.5.self_attn.out_proj.weight', 'text_model.encoder.layers.7.self_attn.out_proj.bias', 'text_model.embeddings.token_embedding.weight', 'text_model.encoder.layers.8.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.v_proj.weight', 'text_model.encoder.layers.1.mlp.fc1.weight', 'text_model.encoder.layers.0.layer_norm1.weight', 'text_model.encoder.layers.6.self_attn.k_proj.weight', 'text_model.encoder.layers.3.self_attn.q_proj.weight', 'text_model.encoder.layers.2.layer_norm1.weight', 'text_model.encoder.layers.0.self_attn.v_proj.weight', 'text_model.encoder.layers.7.self_attn.q_proj.weight', 'text_model.encoder.layers.1.layer_norm2.weight', 'text_model.encoder.layers.2.self_attn.out_proj.weight', 'text_model.encoder.layers.3.self_attn.k_proj.weight', 'text_model.encoder.layers.7.mlp.fc2.bias', 'text_model.encoder.layers.10.self_attn.out_proj.weight', 'text_model.encoder.layers.2.self_attn.q_proj.weight', 'text_model.encoder.layers.1.self_attn.k_proj.weight', 'text_model.encoder.layers.4.layer_norm1.weight', 'text_model.encoder.layers.0.self_attn.q_proj.weight', 'text_model.encoder.layers.0.self_attn.out_proj.bias', 'text_model.encoder.layers.4.self_attn.out_proj.bias', 'text_model.encoder.layers.5.self_attn.k_proj.weight', 'visual_projection.weight', 'text_model.encoder.layers.6.layer_norm2.bias', 'text_model.encoder.layers.6.layer_norm1.weight', 'text_model.encoder.layers.4.self_attn.out_proj.weight', 'text_model.encoder.layers.10.mlp.fc2.bias', 'text_model.encoder.layers.10.mlp.fc1.weight', 'text_model.encoder.layers.6.self_attn.out_proj.weight', 'text_model.encoder.layers.9.layer_norm1.weight', 'text_model.encoder.layers.11.layer_norm2.weight', 'text_model.encoder.layers.6.self_attn.q_proj.bias', 'text_model.encoder.layers.5.mlp.fc1.weight', 'text_model.encoder.layers.2.mlp.fc1.weight', 'text_model.encoder.layers.11.self_attn.out_proj.weight', 'text_model.encoder.layers.0.self_attn.out_proj.weight', 'text_model.encoder.layers.11.mlp.fc2.weight', 'text_model.encoder.layers.7.layer_norm2.weight', 'text_model.encoder.layers.10.self_attn.v_proj.bias', 'text_model.encoder.layers.9.mlp.fc1.bias', 'text_model.encoder.layers.8.self_attn.v_proj.weight', 'text_model.encoder.layers.3.layer_norm1.bias', 'text_model.encoder.layers.6.self_attn.v_proj.bias', 
'text_model.encoder.layers.1.self_attn.v_proj.bias', 'text_model.encoder.layers.9.self_attn.q_proj.weight', 'text_model.encoder.layers.4.self_attn.k_proj.weight', 'text_model.encoder.layers.7.layer_norm1.weight', 'text_model.encoder.layers.10.self_attn.k_proj.weight', 'text_model.encoder.layers.7.self_attn.v_proj.bias', 'text_model.encoder.layers.7.mlp.fc1.bias', 'text_model.encoder.layers.11.mlp.fc1.weight', 'text_model.encoder.layers.2.mlp.fc1.bias', 'text_model.encoder.layers.3.mlp.fc2.bias', 'text_model.encoder.layers.8.self_attn.q_proj.weight', 'text_model.encoder.layers.0.mlp.fc1.weight', 'text_model.encoder.layers.11.self_attn.out_proj.bias', 'text_model.encoder.layers.1.self_attn.v_proj.weight', 'text_model.encoder.layers.0.self_attn.k_proj.weight', 'text_model.encoder.layers.9.layer_norm1.bias', 'text_model.final_layer_norm.weight', 'text_model.encoder.layers.3.layer_norm1.weight', 'text_model.encoder.layers.4.mlp.fc1.bias', 'text_model.encoder.layers.1.layer_norm1.weight', 'text_model.encoder.layers.10.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.v_proj.bias', 'text_model.encoder.layers.10.self_attn.k_proj.bias', 'text_model.encoder.layers.8.mlp.fc2.bias', 'text_model.encoder.layers.5.mlp.fc2.bias', 'text_model.encoder.layers.6.self_attn.q_proj.weight', 'text_model.encoder.layers.5.self_attn.out_proj.bias', 'text_model.encoder.layers.9.mlp.fc2.bias', 'text_model.encoder.layers.5.layer_norm2.weight', 'text_model.encoder.layers.2.mlp.fc2.weight', 'text_model.encoder.layers.3.self_attn.out_proj.weight', 'text_model.encoder.layers.6.mlp.fc2.weight', 'text_model.encoder.layers.1.self_attn.out_proj.weight', 'text_model.encoder.layers.1.mlp.fc2.bias', 'text_model.encoder.layers.7.mlp.fc2.weight', 'text_model.encoder.layers.10.self_attn.v_proj.weight', 'text_model.encoder.layers.11.self_attn.v_proj.bias', 'text_model.encoder.layers.4.layer_norm1.bias', 'text_model.encoder.layers.4.layer_norm2.bias', 'text_model.encoder.layers.8.self_attn.q_proj.bias', 'text_model.embeddings.position_ids', 'text_model.encoder.layers.10.layer_norm2.weight', 'text_model.encoder.layers.1.self_attn.out_proj.bias', 'text_model.encoder.layers.2.layer_norm2.weight', 'text_model.encoder.layers.10.self_attn.q_proj.weight', 'text_model.encoder.layers.4.mlp.fc1.weight', 'text_model.encoder.layers.8.layer_norm1.bias', 'text_model.encoder.layers.2.self_attn.k_proj.weight', 'text_model.encoder.layers.5.mlp.fc1.bias', 'text_model.encoder.layers.9.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.v_proj.weight', 'text_model.encoder.layers.2.self_attn.k_proj.bias', 'text_model.encoder.layers.5.self_attn.k_proj.bias', 'text_model.encoder.layers.8.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.k_proj.weight', 'text_model.encoder.layers.6.mlp.fc1.weight', 'text_model.encoder.layers.6.mlp.fc1.bias', 'text_model.encoder.layers.3.self_attn.v_proj.weight', 'text_model.encoder.layers.3.self_attn.q_proj.bias', 'text_model.encoder.layers.9.self_attn.out_proj.weight', 'text_model.encoder.layers.3.mlp.fc1.bias', 'text_model.encoder.layers.0.self_attn.q_proj.bias', 'text_model.encoder.layers.1.layer_norm2.bias', 'text_model.encoder.layers.8.layer_norm2.weight', 'text_model.encoder.layers.5.self_attn.q_proj.weight', 'text_model.encoder.layers.4.layer_norm2.weight', 'text_model.encoder.layers.4.mlp.fc2.bias', 'text_model.encoder.layers.9.mlp.fc2.weight', 'text_model.encoder.layers.8.self_attn.k_proj.weight', 'text_model.encoder.layers.10.layer_norm1.weight', 
'text_model.encoder.layers.0.self_attn.k_proj.bias', 'text_model.encoder.layers.8.self_attn.k_proj.bias', 'text_model.encoder.layers.9.layer_norm2.weight', 'text_model.encoder.layers.4.self_attn.k_proj.bias', 'text_model.encoder.layers.6.layer_norm2.weight', 'text_model.encoder.layers.0.layer_norm2.weight', 'text_model.encoder.layers.5.self_attn.v_proj.bias', 'text_model.encoder.layers.3.layer_norm2.bias', 'text_model.encoder.layers.8.mlp.fc1.weight', 'text_model.encoder.layers.4.self_attn.q_proj.bias', 'text_model.encoder.layers.8.layer_norm1.weight', 'text_model.encoder.layers.2.self_attn.v_proj.weight', 'text_model.encoder.layers.3.self_attn.v_proj.bias', 'text_model.encoder.layers.11.mlp.fc1.bias', 'text_model.encoder.layers.6.mlp.fc2.bias', 'text_model.encoder.layers.1.mlp.fc1.bias', 'text_model.encoder.layers.2.self_attn.v_proj.bias', 'text_model.encoder.layers.5.mlp.fc2.weight', 'text_model.encoder.layers.8.self_attn.v_proj.bias', 'text_model.encoder.layers.10.self_attn.out_proj.bias', 'text_model.encoder.layers.5.layer_norm1.bias', 'text_model.encoder.layers.5.self_attn.v_proj.weight', 'text_model.encoder.layers.10.self_attn.q_proj.bias', 'text_model.encoder.layers.2.layer_norm2.bias', 'text_model.encoder.layers.7.layer_norm1.bias', 'text_model.encoder.layers.4.mlp.fc2.weight', 'text_model.encoder.layers.10.mlp.fc2.weight', 'text_model.encoder.layers.3.mlp.fc1.weight', 'text_model.encoder.layers.5.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.q_proj.bias', 'text_model.encoder.layers.1.self_attn.k_proj.bias', 'text_model.encoder.layers.7.self_attn.out_proj.weight', 'text_model.encoder.layers.0.mlp.fc2.weight', 'text_model.encoder.layers.11.self_attn.v_proj.weight', 'text_model.encoder.layers.1.layer_norm1.bias', 'text_model.encoder.layers.1.mlp.fc2.weight', 'text_model.encoder.layers.9.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.k_proj.bias', 'text_model.encoder.layers.11.layer_norm1.weight', 'text_model.encoder.layers.8.self_attn.out_proj.weight', 'text_model.encoder.layers.0.layer_norm1.bias', 'text_model.encoder.layers.7.mlp.fc1.weight', 'text_model.encoder.layers.0.mlp.fc1.bias', 'text_model.encoder.layers.0.layer_norm2.bias', 'text_model.encoder.layers.3.self_attn.k_proj.bias', 'text_model.encoder.layers.5.layer_norm1.weight', 'text_model.encoder.layers.3.layer_norm2.weight', 'text_model.encoder.layers.1.self_attn.q_proj.bias', 'text_model.encoder.layers.2.self_attn.out_proj.bias', 'text_model.encoder.layers.3.mlp.fc2.weight', 'text_model.encoder.layers.11.self_attn.q_proj.weight', 'text_model.final_layer_norm.bias', 'text_model.encoder.layers.6.self_attn.v_proj.weight', 'text_model.encoder.layers.0.mlp.fc2.bias', 'text_model.encoder.layers.7.layer_norm2.bias', 'text_model.encoder.layers.10.mlp.fc1.bias', 'text_model.embeddings.position_embedding.weight', 'text_model.encoder.layers.6.self_attn.out_proj.bias', 'text_model.encoder.layers.2.layer_norm1.bias', 'text_model.encoder.layers.9.mlp.fc1.weight', 'text_projection.weight', 'text_model.encoder.layers.11.layer_norm2.bias', 'text_model.encoder.layers.4.self_attn.q_proj.weight']\n",
467
+ "- This IS expected if you are initializing CLIPVisionModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
468
+ "- This IS NOT expected if you are initializing CLIPVisionModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
469
+ "All the weights of CLIPVisionModel were initialized from the model checkpoint at openai/clip-vit-base-patch32.\n",
470
+ "If your task is similar to the task the model of the checkpoint was trained on, you can already use CLIPVisionModel for predictions without further training.\n",
471
+ "loading configuration file preprocessor_config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/preprocessor_config.json\n",
472
+ "loading configuration file preprocessor_config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/preprocessor_config.json\n",
473
+ "loading configuration file config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/config.json\n",
474
+ "`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.\n",
475
+ "`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.\n",
476
+ "Model config CLIPConfig {\n",
477
+ " \"_name_or_path\": \"openai/clip-vit-base-patch32\",\n",
478
+ " \"architectures\": [\n",
479
+ " \"CLIPModel\"\n",
480
+ " ],\n",
481
+ " \"initializer_factor\": 1.0,\n",
482
+ " \"logit_scale_init_value\": 2.6592,\n",
483
+ " \"model_type\": \"clip\",\n",
484
+ " \"projection_dim\": 512,\n",
485
+ " \"text_config\": {\n",
486
+ " \"bos_token_id\": 0,\n",
487
+ " \"dropout\": 0.0,\n",
488
+ " \"eos_token_id\": 2,\n",
489
+ " \"model_type\": \"clip_text_model\"\n",
490
+ " },\n",
491
+ " \"transformers_version\": \"4.36.2\",\n",
492
+ " \"vision_config\": {\n",
493
+ " \"dropout\": 0.0,\n",
494
+ " \"model_type\": \"clip_vision_model\"\n",
495
+ " }\n",
496
+ "}\n",
497
+ "\n",
498
+ "loading configuration file preprocessor_config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/preprocessor_config.json\n",
499
+ "size should be a dictionary on of the following set of keys: ({'width', 'height'}, {'shortest_edge'}, {'longest_edge', 'shortest_edge'}, {'longest_edge'}), got 224. Converted to {'shortest_edge': 224}.\n",
500
+ "crop_size should be a dictionary on of the following set of keys: ({'width', 'height'}, {'shortest_edge'}, {'longest_edge', 'shortest_edge'}, {'longest_edge'}), got 224. Converted to {'height': 224, 'width': 224}.\n",
501
+ "Image processor CLIPImageProcessor {\n",
502
+ " \"crop_size\": {\n",
503
+ " \"height\": 224,\n",
504
+ " \"width\": 224\n",
505
+ " },\n",
506
+ " \"do_center_crop\": true,\n",
507
+ " \"do_convert_rgb\": true,\n",
508
+ " \"do_normalize\": true,\n",
509
+ " \"do_rescale\": true,\n",
510
+ " \"do_resize\": true,\n",
511
+ " \"feature_extractor_type\": \"CLIPFeatureExtractor\",\n",
512
+ " \"image_mean\": [\n",
513
+ " 0.48145466,\n",
514
+ " 0.4578275,\n",
515
+ " 0.40821073\n",
516
+ " ],\n",
517
+ " \"image_processor_type\": \"CLIPImageProcessor\",\n",
518
+ " \"image_std\": [\n",
519
+ " 0.26862954,\n",
520
+ " 0.26130258,\n",
521
+ " 0.27577711\n",
522
+ " ],\n",
523
+ " \"resample\": 3,\n",
524
+ " \"rescale_factor\": 0.00392156862745098,\n",
525
+ " \"size\": {\n",
526
+ " \"shortest_edge\": 224\n",
527
+ " }\n",
528
+ "}\n",
529
+ "\n",
530
+ "loading file vocab.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/vocab.json\n",
531
+ "loading file merges.txt from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/merges.txt\n",
532
+ "loading file tokenizer.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/tokenizer.json\n",
533
+ "loading file added_tokens.json from cache at None\n",
534
+ "loading file special_tokens_map.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/special_tokens_map.json\n",
535
+ "loading file tokenizer_config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/tokenizer_config.json\n",
536
+ "loading configuration file config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/config.json\n",
537
+ "`text_config` is `None`. Initializing the `CLIPTextConfig` with default values.\n",
538
+ "`vision_config` is `None`. initializing the `CLIPVisionConfig` with default values.\n",
539
+ "Model config CLIPConfig {\n",
540
+ " \"_name_or_path\": \"openai/clip-vit-base-patch32\",\n",
541
+ " \"architectures\": [\n",
542
+ " \"CLIPModel\"\n",
543
+ " ],\n",
544
+ " \"initializer_factor\": 1.0,\n",
545
+ " \"logit_scale_init_value\": 2.6592,\n",
546
+ " \"model_type\": \"clip\",\n",
547
+ " \"projection_dim\": 512,\n",
548
+ " \"text_config\": {\n",
549
+ " \"bos_token_id\": 0,\n",
550
+ " \"dropout\": 0.0,\n",
551
+ " \"eos_token_id\": 2,\n",
552
+ " \"model_type\": \"clip_text_model\"\n",
553
+ " },\n",
554
+ " \"transformers_version\": \"4.36.2\",\n",
555
+ " \"vision_config\": {\n",
556
+ " \"dropout\": 0.0,\n",
557
+ " \"model_type\": \"clip_vision_model\"\n",
558
+ " }\n",
559
+ "}\n",
560
+ "\n"
561
+ ]
562
+ }
563
+ ],
564
+ "source": [
565
+ "from PIL import Image\n",
566
+ "import requests\n",
567
+ "from transformers import AutoProcessor, CLIPVisionModel\n",
568
+ "\n",
569
+ "model = CLIPVisionModel.from_pretrained(\"openai/clip-vit-base-patch32\")\n",
570
+ "processor = AutoProcessor.from_pretrained(\"openai/clip-vit-base-patch32\")\n",
571
+ "\n",
572
+ "url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n",
573
+ "image = Image.open(requests.get(url, stream=True).raw)\n",
574
+ "\n",
575
+ "inputs = processor(images=image, return_tensors=\"pt\")\n",
576
+ "\n",
577
+ "outputs = model(**inputs)\n",
578
+ "last_hidden_state = outputs.last_hidden_state\n",
579
+ "pooled_output = outputs.pooler_output # pooled CLS states"
580
+ ]
581
+ },
582
+ {
583
+ "cell_type": "code",
584
+ "execution_count": 31,
585
+ "id": "bcf0a7b3-6cbb-492e-bc2c-42e3edbe6a0c",
586
+ "metadata": {},
587
+ "outputs": [
588
+ {
589
+ "data": {
590
+ "text/plain": [
591
+ "torch.Size([1, 768])"
592
+ ]
593
+ },
594
+ "execution_count": 31,
595
+ "metadata": {},
596
+ "output_type": "execute_result"
597
+ }
598
+ ],
599
+ "source": [
600
+ "pooled_output.shape"
601
+ ]
602
+ },
603
+ {
604
+ "cell_type": "code",
605
+ "execution_count": 10,
606
+ "id": "67240294-c7a0-4e94-a8c1-86bfe1b21977",
607
+ "metadata": {},
608
+ "outputs": [],
609
+ "source": [
610
+ "from transformers import CLIPPreTrainedModel\n",
611
+ "from transformers.models.clip.modeling_clip import CLIPVisionModelOutput, CLIPVisionTransformer\n",
612
+ "from typing import Optional, Union, Tuple"
613
+ ]
614
+ },
615
+ {
616
+ "cell_type": "code",
617
+ "execution_count": 54,
618
+ "id": "cc9b20db-7f84-44c3-9c78-e84164ccc192",
619
+ "metadata": {},
620
+ "outputs": [],
621
+ "source": [
622
+ "class VisionLanguageConnector(nn.Module):\n",
623
+ " def __init__(self, hidden_size, projection_dim):\n",
624
+ " super().__init__()\n",
625
+ " self.mlp = nn.Sequential(\n",
626
+ " nn.Linear(hidden_size, hidden_size, bias=False),\n",
627
+ " nn.GELU(),\n",
628
+ " nn.Linear(hidden_size, projection_dim, bias=False)\n",
629
+ " )\n",
630
+ "\n",
631
+ " def forward(self, x):\n",
632
+ " return self.mlp(x)\n",
633
+ " \n",
634
+ "class ClipWithProjection(CLIPPreTrainedModel):\n",
635
+ " config_class = CLIPVisionConfig\n",
636
+ " main_input_name = \"pixel_values\"\n",
637
+ "\n",
638
+ " def __init__(self, config: CLIPVisionConfig):\n",
639
+ " super().__init__(config)\n",
640
+ "\n",
641
+ " self.vision_model = CLIPVisionTransformer(config)\n",
643
+ " self.vision_language_connector = VisionLanguageConnector(config.hidden_size, config.projection_dim)\n",
644
+ "\n",
645
+ " # Initialize weights and apply final processing\n",
646
+ " self.post_init()\n",
647
+ "\n",
648
+ " def forward(\n",
649
+ " self,\n",
650
+ " pixel_values: Optional[torch.FloatTensor] = None,\n",
651
+ " output_attentions: Optional[bool] = None,\n",
652
+ " output_hidden_states: Optional[bool] = None,\n",
653
+ " return_dict: Optional[bool] = None,\n",
654
+ " ) -> Union[Tuple, CLIPVisionModelOutput]:\n",
655
+ " return_dict = return_dict if return_dict is not None else self.config.use_return_dict\n",
656
+ "\n",
657
+ " vision_outputs = self.vision_model(\n",
658
+ " pixel_values=pixel_values,\n",
659
+ " output_attentions=output_attentions,\n",
660
+ " output_hidden_states=output_hidden_states,\n",
661
+ " return_dict=return_dict,\n",
662
+ " )\n",
663
+ "\n",
664
+ " pooled_output = vision_outputs[1] # pooled_output\n",
665
+ "\n",
666
+ " image_embeds = self.vision_language_connector(pooled_output)\n",
667
+ "\n",
668
+ " if not return_dict:\n",
669
+ " outputs = (image_embeds, vision_outputs[0]) + vision_outputs[2:]\n",
670
+ " return tuple(output for output in outputs if output is not None)\n",
671
+ "\n",
672
+ " return CLIPVisionModelOutput(\n",
673
+ " image_embeds=image_embeds,\n",
674
+ " last_hidden_state=vision_outputs.last_hidden_state,\n",
675
+ " hidden_states=vision_outputs.hidden_states,\n",
676
+ " attentions=vision_outputs.attentions,\n",
677
+ " )"
678
+ ]
679
+ },
680
+ {
681
+ "cell_type": "code",
682
+ "execution_count": 55,
683
+ "id": "a4892ab8-39d2-41c9-ad2a-04711c22b95f",
684
+ "metadata": {
685
+ "collapsed": true,
686
+ "jupyter": {
687
+ "outputs_hidden": true
688
+ }
689
+ },
690
+ "outputs": [
691
+ {
692
+ "name": "stderr",
693
+ "output_type": "stream",
694
+ "text": [
695
+ "loading configuration file config.json from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/config.json\n",
696
+ "Model config CLIPVisionConfig {\n",
697
+ " \"attention_dropout\": 0.0,\n",
698
+ " \"dropout\": 0.0,\n",
699
+ " \"hidden_act\": \"quick_gelu\",\n",
700
+ " \"hidden_size\": 768,\n",
701
+ " \"image_size\": 224,\n",
702
+ " \"initializer_factor\": 1.0,\n",
703
+ " \"initializer_range\": 0.02,\n",
704
+ " \"intermediate_size\": 3072,\n",
705
+ " \"layer_norm_eps\": 1e-05,\n",
706
+ " \"model_type\": \"clip_vision_model\",\n",
707
+ " \"num_attention_heads\": 12,\n",
708
+ " \"num_channels\": 3,\n",
709
+ " \"num_hidden_layers\": 12,\n",
710
+ " \"patch_size\": 32,\n",
711
+ " \"projection_dim\": 512,\n",
712
+ " \"transformers_version\": \"4.36.2\"\n",
713
+ "}\n",
714
+ "\n",
715
+ "loading weights file pytorch_model.bin from cache at /home/ravi.naik/.cache/huggingface/hub/models--openai--clip-vit-base-patch32/snapshots/e6a30b603a447e251fdaca1c3056b2a16cdfebeb/pytorch_model.bin\n",
716
+ "Some weights of the model checkpoint at openai/clip-vit-base-patch32 were not used when initializing ClipWithProjection: ['text_model.encoder.layers.8.mlp.fc2.weight', 'text_model.encoder.layers.4.self_attn.v_proj.weight', 'text_model.encoder.layers.2.mlp.fc2.bias', 'text_model.encoder.layers.2.self_attn.q_proj.bias', 'text_model.encoder.layers.4.self_attn.v_proj.bias', 'text_model.encoder.layers.6.self_attn.k_proj.bias', 'text_model.encoder.layers.9.self_attn.k_proj.weight', 'text_model.encoder.layers.11.self_attn.q_proj.bias', 'text_model.encoder.layers.3.self_attn.out_proj.bias', 'text_model.encoder.layers.11.self_attn.k_proj.bias', 'text_model.encoder.layers.7.self_attn.k_proj.bias', 'text_model.encoder.layers.1.self_attn.q_proj.weight', 'text_model.encoder.layers.11.layer_norm1.bias', 'text_model.encoder.layers.11.mlp.fc2.bias', 'text_model.encoder.layers.10.layer_norm1.bias', 'text_model.encoder.layers.7.self_attn.q_proj.bias', 'text_model.encoder.layers.11.self_attn.k_proj.weight', 'text_model.encoder.layers.5.self_attn.q_proj.bias', 'text_model.encoder.layers.0.self_attn.v_proj.bias', 'logit_scale', 'text_model.encoder.layers.8.mlp.fc1.bias', 'text_model.encoder.layers.6.layer_norm1.bias', 'text_model.encoder.layers.5.self_attn.out_proj.weight', 'text_model.encoder.layers.7.self_attn.out_proj.bias', 'text_model.embeddings.token_embedding.weight', 'text_model.encoder.layers.8.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.v_proj.weight', 'text_model.encoder.layers.1.mlp.fc1.weight', 'text_model.encoder.layers.0.layer_norm1.weight', 'text_model.encoder.layers.6.self_attn.k_proj.weight', 'text_model.encoder.layers.3.self_attn.q_proj.weight', 'text_model.encoder.layers.2.layer_norm1.weight', 'text_model.encoder.layers.0.self_attn.v_proj.weight', 'text_model.encoder.layers.7.self_attn.q_proj.weight', 'text_model.encoder.layers.1.layer_norm2.weight', 'text_model.encoder.layers.2.self_attn.out_proj.weight', 'text_model.encoder.layers.3.self_attn.k_proj.weight', 'text_model.encoder.layers.7.mlp.fc2.bias', 'text_model.encoder.layers.10.self_attn.out_proj.weight', 'text_model.encoder.layers.2.self_attn.q_proj.weight', 'text_model.encoder.layers.1.self_attn.k_proj.weight', 'text_model.encoder.layers.4.layer_norm1.weight', 'text_model.encoder.layers.0.self_attn.q_proj.weight', 'text_model.encoder.layers.0.self_attn.out_proj.bias', 'text_model.encoder.layers.4.self_attn.out_proj.bias', 'text_model.encoder.layers.5.self_attn.k_proj.weight', 'visual_projection.weight', 'text_model.encoder.layers.6.layer_norm2.bias', 'text_model.encoder.layers.6.layer_norm1.weight', 'text_model.encoder.layers.4.self_attn.out_proj.weight', 'text_model.encoder.layers.10.mlp.fc2.bias', 'text_model.encoder.layers.10.mlp.fc1.weight', 'text_model.encoder.layers.6.self_attn.out_proj.weight', 'text_model.encoder.layers.9.layer_norm1.weight', 'text_model.encoder.layers.11.layer_norm2.weight', 'text_model.encoder.layers.6.self_attn.q_proj.bias', 'text_model.encoder.layers.5.mlp.fc1.weight', 'text_model.encoder.layers.2.mlp.fc1.weight', 'text_model.encoder.layers.11.self_attn.out_proj.weight', 'text_model.encoder.layers.0.self_attn.out_proj.weight', 'text_model.encoder.layers.11.mlp.fc2.weight', 'text_model.encoder.layers.7.layer_norm2.weight', 'text_model.encoder.layers.10.self_attn.v_proj.bias', 'text_model.encoder.layers.9.mlp.fc1.bias', 'text_model.encoder.layers.8.self_attn.v_proj.weight', 'text_model.encoder.layers.3.layer_norm1.bias', 'text_model.encoder.layers.6.self_attn.v_proj.bias', 
'text_model.encoder.layers.1.self_attn.v_proj.bias', 'text_model.encoder.layers.9.self_attn.q_proj.weight', 'text_model.encoder.layers.4.self_attn.k_proj.weight', 'text_model.encoder.layers.7.layer_norm1.weight', 'text_model.encoder.layers.10.self_attn.k_proj.weight', 'text_model.encoder.layers.7.self_attn.v_proj.bias', 'text_model.encoder.layers.7.mlp.fc1.bias', 'text_model.encoder.layers.11.mlp.fc1.weight', 'text_model.encoder.layers.2.mlp.fc1.bias', 'text_model.encoder.layers.3.mlp.fc2.bias', 'text_model.encoder.layers.8.self_attn.q_proj.weight', 'text_model.encoder.layers.0.mlp.fc1.weight', 'text_model.encoder.layers.11.self_attn.out_proj.bias', 'text_model.encoder.layers.1.self_attn.v_proj.weight', 'text_model.encoder.layers.0.self_attn.k_proj.weight', 'text_model.encoder.layers.9.layer_norm1.bias', 'text_model.final_layer_norm.weight', 'text_model.encoder.layers.3.layer_norm1.weight', 'text_model.encoder.layers.4.mlp.fc1.bias', 'text_model.encoder.layers.1.layer_norm1.weight', 'text_model.encoder.layers.10.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.v_proj.bias', 'text_model.encoder.layers.10.self_attn.k_proj.bias', 'text_model.encoder.layers.8.mlp.fc2.bias', 'text_model.encoder.layers.5.mlp.fc2.bias', 'text_model.encoder.layers.6.self_attn.q_proj.weight', 'text_model.encoder.layers.5.self_attn.out_proj.bias', 'text_model.encoder.layers.9.mlp.fc2.bias', 'text_model.encoder.layers.5.layer_norm2.weight', 'text_model.encoder.layers.2.mlp.fc2.weight', 'text_model.encoder.layers.3.self_attn.out_proj.weight', 'text_model.encoder.layers.6.mlp.fc2.weight', 'text_model.encoder.layers.1.self_attn.out_proj.weight', 'text_model.encoder.layers.1.mlp.fc2.bias', 'text_model.encoder.layers.7.mlp.fc2.weight', 'text_model.encoder.layers.10.self_attn.v_proj.weight', 'text_model.encoder.layers.11.self_attn.v_proj.bias', 'text_model.encoder.layers.4.layer_norm1.bias', 'text_model.encoder.layers.4.layer_norm2.bias', 'text_model.encoder.layers.8.self_attn.q_proj.bias', 'text_model.embeddings.position_ids', 'text_model.encoder.layers.10.layer_norm2.weight', 'text_model.encoder.layers.1.self_attn.out_proj.bias', 'text_model.encoder.layers.2.layer_norm2.weight', 'text_model.encoder.layers.10.self_attn.q_proj.weight', 'text_model.encoder.layers.4.mlp.fc1.weight', 'text_model.encoder.layers.8.layer_norm1.bias', 'text_model.encoder.layers.2.self_attn.k_proj.weight', 'text_model.encoder.layers.5.mlp.fc1.bias', 'text_model.encoder.layers.9.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.v_proj.weight', 'text_model.encoder.layers.2.self_attn.k_proj.bias', 'text_model.encoder.layers.5.self_attn.k_proj.bias', 'text_model.encoder.layers.8.self_attn.out_proj.bias', 'text_model.encoder.layers.7.self_attn.k_proj.weight', 'text_model.encoder.layers.6.mlp.fc1.weight', 'text_model.encoder.layers.6.mlp.fc1.bias', 'text_model.encoder.layers.3.self_attn.v_proj.weight', 'text_model.encoder.layers.3.self_attn.q_proj.bias', 'text_model.encoder.layers.9.self_attn.out_proj.weight', 'text_model.encoder.layers.3.mlp.fc1.bias', 'text_model.encoder.layers.0.self_attn.q_proj.bias', 'text_model.encoder.layers.1.layer_norm2.bias', 'text_model.encoder.layers.8.layer_norm2.weight', 'text_model.encoder.layers.5.self_attn.q_proj.weight', 'text_model.encoder.layers.4.layer_norm2.weight', 'text_model.encoder.layers.4.mlp.fc2.bias', 'text_model.encoder.layers.9.mlp.fc2.weight', 'text_model.encoder.layers.8.self_attn.k_proj.weight', 'text_model.encoder.layers.10.layer_norm1.weight', 
'text_model.encoder.layers.0.self_attn.k_proj.bias', 'text_model.encoder.layers.8.self_attn.k_proj.bias', 'text_model.encoder.layers.9.layer_norm2.weight', 'text_model.encoder.layers.4.self_attn.k_proj.bias', 'text_model.encoder.layers.6.layer_norm2.weight', 'text_model.encoder.layers.0.layer_norm2.weight', 'text_model.encoder.layers.5.self_attn.v_proj.bias', 'text_model.encoder.layers.3.layer_norm2.bias', 'text_model.encoder.layers.8.mlp.fc1.weight', 'text_model.encoder.layers.4.self_attn.q_proj.bias', 'text_model.encoder.layers.8.layer_norm1.weight', 'text_model.encoder.layers.2.self_attn.v_proj.weight', 'text_model.encoder.layers.3.self_attn.v_proj.bias', 'text_model.encoder.layers.11.mlp.fc1.bias', 'text_model.encoder.layers.6.mlp.fc2.bias', 'text_model.encoder.layers.1.mlp.fc1.bias', 'text_model.encoder.layers.2.self_attn.v_proj.bias', 'text_model.encoder.layers.5.mlp.fc2.weight', 'text_model.encoder.layers.8.self_attn.v_proj.bias', 'text_model.encoder.layers.10.self_attn.out_proj.bias', 'text_model.encoder.layers.5.layer_norm1.bias', 'text_model.encoder.layers.5.self_attn.v_proj.weight', 'text_model.encoder.layers.10.self_attn.q_proj.bias', 'text_model.encoder.layers.2.layer_norm2.bias', 'text_model.encoder.layers.7.layer_norm1.bias', 'text_model.encoder.layers.4.mlp.fc2.weight', 'text_model.encoder.layers.10.mlp.fc2.weight', 'text_model.encoder.layers.3.mlp.fc1.weight', 'text_model.encoder.layers.5.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.q_proj.bias', 'text_model.encoder.layers.1.self_attn.k_proj.bias', 'text_model.encoder.layers.7.self_attn.out_proj.weight', 'text_model.encoder.layers.0.mlp.fc2.weight', 'text_model.encoder.layers.11.self_attn.v_proj.weight', 'text_model.encoder.layers.1.layer_norm1.bias', 'text_model.encoder.layers.1.mlp.fc2.weight', 'text_model.encoder.layers.9.layer_norm2.bias', 'text_model.encoder.layers.9.self_attn.k_proj.bias', 'text_model.encoder.layers.11.layer_norm1.weight', 'text_model.encoder.layers.8.self_attn.out_proj.weight', 'text_model.encoder.layers.0.layer_norm1.bias', 'text_model.encoder.layers.7.mlp.fc1.weight', 'text_model.encoder.layers.0.mlp.fc1.bias', 'text_model.encoder.layers.0.layer_norm2.bias', 'text_model.encoder.layers.3.self_attn.k_proj.bias', 'text_model.encoder.layers.5.layer_norm1.weight', 'text_model.encoder.layers.3.layer_norm2.weight', 'text_model.encoder.layers.1.self_attn.q_proj.bias', 'text_model.encoder.layers.2.self_attn.out_proj.bias', 'text_model.encoder.layers.3.mlp.fc2.weight', 'text_model.encoder.layers.11.self_attn.q_proj.weight', 'text_model.final_layer_norm.bias', 'text_model.encoder.layers.6.self_attn.v_proj.weight', 'text_model.encoder.layers.0.mlp.fc2.bias', 'text_model.encoder.layers.7.layer_norm2.bias', 'text_model.encoder.layers.10.mlp.fc1.bias', 'text_model.embeddings.position_embedding.weight', 'text_model.encoder.layers.6.self_attn.out_proj.bias', 'text_model.encoder.layers.2.layer_norm1.bias', 'text_model.encoder.layers.9.mlp.fc1.weight', 'text_projection.weight', 'text_model.encoder.layers.11.layer_norm2.bias', 'text_model.encoder.layers.4.self_attn.q_proj.weight']\n",
717
+ "- This IS expected if you are initializing ClipWithProjection from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).\n",
718
+ "- This IS NOT expected if you are initializing ClipWithProjection from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n",
719
+ "Some weights of ClipWithProjection were not initialized from the model checkpoint at openai/clip-vit-base-patch32 and are newly initialized: ['vision_language_connector.mlp.2.weight', 'vision_language_connector.mlp.0.weight']\n",
720
+ "You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.\n"
721
+ ]
722
+ }
723
+ ],
724
+ "source": [
725
+ "model = ClipWithProjection.from_pretrained(\"openai/clip-vit-base-patch32\")"
726
+ ]
727
+ },
728
+ {
729
+ "cell_type": "code",
730
+ "execution_count": 56,
731
+ "id": "588ef914-5be9-49e1-b68d-b899e0e74edd",
732
+ "metadata": {},
733
+ "outputs": [
734
+ {
735
+ "data": {
736
+ "text/plain": [
737
+ "768"
738
+ ]
739
+ },
740
+ "execution_count": 56,
741
+ "metadata": {},
742
+ "output_type": "execute_result"
743
+ }
744
+ ],
745
+ "source": [
746
+ "model.config.hidden_size"
747
+ ]
748
+ },
749
+ {
750
+ "cell_type": "code",
751
+ "execution_count": 57,
752
+ "id": "05d95b9e-9831-4415-860e-94793e29d210",
753
+ "metadata": {},
754
+ "outputs": [],
755
+ "source": [
756
+ "outputs = model(**inputs)"
757
+ ]
758
+ },
759
+ {
760
+ "cell_type": "code",
761
+ "execution_count": 61,
762
+ "id": "185b1bff-6ffe-4cce-9255-ee7629feba54",
763
+ "metadata": {},
764
+ "outputs": [
765
+ {
766
+ "data": {
767
+ "text/plain": [
768
+ "torch.Size([1, 512])"
769
+ ]
770
+ },
771
+ "execution_count": 61,
772
+ "metadata": {},
773
+ "output_type": "execute_result"
774
+ }
775
+ ],
776
+ "source": [
777
+ "outputs[0].shape"
778
+ ]
779
+ },
780
+ {
781
+ "cell_type": "code",
782
+ "execution_count": null,
783
+ "id": "04414a35-c7b3-4986-a79e-1d363916caa4",
784
+ "metadata": {},
785
+ "outputs": [],
786
+ "source": []
787
+ },
788
+ {
789
+ "cell_type": "code",
790
+ "execution_count": 1,
791
+ "id": "485dbbcb-06df-4926-b257-dfd1a4081d44",
792
+ "metadata": {},
793
+ "outputs": [
794
+ {
795
+ "ename": "NameError",
796
+ "evalue": "name 'outputs' is not defined",
797
+ "output_type": "error",
798
+ "traceback": [
799
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
800
+ "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
801
+ "Cell \u001b[0;32mIn[1], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43moutputs\u001b[49m[\u001b[38;5;241m0\u001b[39m]\n",
802
+ "\u001b[0;31mNameError\u001b[0m: name 'outputs' is not defined"
803
+ ]
804
+ }
805
+ ],
806
+ "source": [
807
+ "outputs[0]"
808
+ ]
809
+ },
810
+ {
811
+ "cell_type": "code",
812
+ "execution_count": null,
813
+ "id": "f983313c-8e0f-4805-af14-25bb69afd04c",
814
+ "metadata": {},
815
+ "outputs": [],
816
+ "source": []
817
+ }
818
+ ],
819
+ "metadata": {
820
+ "kernelspec": {
821
+ "display_name": "Python 3 (ipykernel)",
822
+ "language": "python",
823
+ "name": "python3"
824
+ },
825
+ "language_info": {
826
+ "codemirror_mode": {
827
+ "name": "ipython",
828
+ "version": 3
829
+ },
830
+ "file_extension": ".py",
831
+ "mimetype": "text/x-python",
832
+ "name": "python",
833
+ "nbconvert_exporter": "python",
834
+ "pygments_lexer": "ipython3",
835
+ "version": "3.10.12"
836
+ }
837
+ },
838
+ "nbformat": 4,
839
+ "nbformat_minor": 5
840
+ }
Experiments/eval.ipynb ADDED
@@ -0,0 +1,782 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 4,
6
+ "id": "215cfd2f-62b0-4a86-a407-777a1d32597f",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stdout",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "[2024-01-24 15:18:49,948] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)\n"
14
+ ]
15
+ }
16
+ ],
17
+ "source": [
18
+ "from PIL import Image\n",
19
+ "import requests\n",
20
+ "\n",
21
+ "import torch\n",
22
+ "from torch import nn\n",
23
+ "from transformers import AutoProcessor, CLIPVisionModel, CLIPVisionConfig, CLIPPreTrainedModel\n",
24
+ "from transformers.models.clip.modeling_clip import CLIPVisionModelOutput, CLIPVisionTransformer\n",
25
+ "from transformers import WhisperProcessor, WhisperForConditionalGeneration\n",
26
+ "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "code",
31
+ "execution_count": 5,
32
+ "id": "2244e8f3-fcc7-4309-9d4d-fea557f89f79",
33
+ "metadata": {},
34
+ "outputs": [],
35
+ "source": [
36
+ "from llava_phi import LlavaPhiForCausalLM"
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "execution_count": 3,
42
+ "id": "587883e1-3419-4b14-b16b-38fabbc8bfaa",
43
+ "metadata": {},
44
+ "outputs": [],
45
+ "source": [
46
+ "# model = LlavaPhiForCausalLM.from_pretrained(\"./llava-phi/checkpoints/llavaPhi-v0-3b-finetune/checkpoint-4000\")"
47
+ ]
48
+ },
49
+ {
50
+ "cell_type": "code",
51
+ "execution_count": 4,
52
+ "id": "0e27a7db-e2ab-4d65-b21d-497222e318ad",
53
+ "metadata": {},
54
+ "outputs": [],
55
+ "source": [
56
+ "# processor = AutoProcessor.from_pretrained(\"./llava-phi/checkpoints/llavaPhi-v0-3b-finetune/checkpoint-4000\")"
57
+ ]
58
+ },
59
+ {
60
+ "cell_type": "code",
61
+ "execution_count": 5,
62
+ "id": "663efdd8-ea21-4231-a2ae-bcc0fb47b46a",
63
+ "metadata": {},
64
+ "outputs": [],
65
+ "source": [
66
+ "# prompt = \"<image>\\nUSER: What's the content of the image?\\nASSISTANT:\"\n",
67
+ "# url = \"https://www.ilankelman.org/stopsigns/australia.jpg\"\n",
68
+ "# image = Image.open(requests.get(url, stream=True).raw)"
69
+ ]
70
+ },
71
+ {
72
+ "cell_type": "code",
73
+ "execution_count": 6,
74
+ "id": "f622609f-f6a7-4ec1-ac35-c1d33d9436ca",
75
+ "metadata": {},
76
+ "outputs": [],
77
+ "source": [
78
+ "# # Generate\n",
79
+ "# generate_ids = model.generate(**inputs, max_length=30)\n",
80
+ "# processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]"
81
+ ]
82
+ },
83
+ {
84
+ "cell_type": "code",
85
+ "execution_count": 6,
86
+ "id": "45f5ba72-2e41-4ccc-84c1-97d542ebee63",
87
+ "metadata": {},
88
+ "outputs": [],
89
+ "source": [
90
+ "from llava_phi.model.builder import load_pretrained_model\n",
91
+ "from llava_phi.mm_utils import tokenizer_image_token, get_model_name_from_path\n",
92
+ "from llava_phi.utils import disable_torch_init\n",
93
+ "from llava_phi.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN\n",
94
+ "from llava_phi.conversation import conv_templates, SeparatorStyle"
95
+ ]
96
+ },
97
+ {
98
+ "cell_type": "code",
99
+ "execution_count": 11,
100
+ "id": "b98ac5d3-5503-4430-81d1-19a4f8d6bd75",
101
+ "metadata": {},
102
+ "outputs": [],
103
+ "source": [
104
+ "model_path = \"checkpoints/llavaPhi-v0-3b-finetune/checkpoint-4000\"\n",
105
+ "model_name = get_model_name_from_path(model_path)"
106
+ ]
107
+ },
108
+ {
109
+ "cell_type": "code",
110
+ "execution_count": 12,
111
+ "id": "42fd5721-75a7-475b-bd30-5ee23aeaac64",
112
+ "metadata": {},
113
+ "outputs": [
114
+ {
115
+ "data": {
116
+ "text/plain": [
117
+ "'llavaPhi-v0-3b-finetune_checkpoint-4000'"
118
+ ]
119
+ },
120
+ "execution_count": 12,
121
+ "metadata": {},
122
+ "output_type": "execute_result"
123
+ }
124
+ ],
125
+ "source": [
126
+ "model_name"
127
+ ]
128
+ },
129
+ {
130
+ "cell_type": "code",
131
+ "execution_count": 13,
132
+ "id": "8c2076b5-3bfc-48fd-917b-5dfd06fc532f",
133
+ "metadata": {},
134
+ "outputs": [
135
+ {
136
+ "name": "stderr",
137
+ "output_type": "stream",
138
+ "text": [
139
+ "The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.\n"
140
+ ]
141
+ },
142
+ {
143
+ "name": "stdout",
144
+ "output_type": "stream",
145
+ "text": [
146
+ "load llaVA-Phi MLLM!!!\n"
147
+ ]
148
+ },
149
+ {
150
+ "name": "stderr",
151
+ "output_type": "stream",
152
+ "text": [
153
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
154
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
155
+ ]
156
+ },
157
+ {
158
+ "data": {
159
+ "application/vnd.jupyter.widget-view+json": {
160
+ "model_id": "20b86f2c01744081b537620c8780f12e",
161
+ "version_major": 2,
162
+ "version_minor": 0
163
+ },
164
+ "text/plain": [
165
+ "Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]"
166
+ ]
167
+ },
168
+ "metadata": {},
169
+ "output_type": "display_data"
170
+ },
171
+ {
172
+ "name": "stdout",
173
+ "output_type": "stream",
174
+ "text": [
175
+ "{'device_map': 'cuda'}\n"
176
+ ]
177
+ }
178
+ ],
179
+ "source": [
180
+ "tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name)"
181
+ ]
182
+ },
183
+ {
184
+ "cell_type": "code",
185
+ "execution_count": 14,
186
+ "id": "4e46221e-0907-453e-8126-76199828493e",
187
+ "metadata": {},
188
+ "outputs": [],
189
+ "source": [
190
+ "qs = \"What's the content of the image?\"\n",
191
+ "qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\\n' + qs"
192
+ ]
193
+ },
194
+ {
195
+ "cell_type": "code",
196
+ "execution_count": 15,
197
+ "id": "07355444-0eb8-4d4d-ad50-48b91c969664",
198
+ "metadata": {},
199
+ "outputs": [],
200
+ "source": [
201
+ "conv = conv_templates[\"default\"].copy()\n",
202
+ "conv.append_message(conv.roles[0], qs)\n",
203
+ "conv.append_message(conv.roles[1], None)\n",
204
+ "prompt = conv.get_prompt()"
205
+ ]
206
+ },
207
+ {
208
+ "cell_type": "code",
209
+ "execution_count": 16,
210
+ "id": "ccb5674f-aff8-456e-b61b-1d167864f1a6",
211
+ "metadata": {},
212
+ "outputs": [
213
+ {
214
+ "data": {
215
+ "text/plain": [
216
+ "\"A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: <im_start><image><im_end>\\nWhat's the content of the image? ASSISTANT:\""
217
+ ]
218
+ },
219
+ "execution_count": 16,
220
+ "metadata": {},
221
+ "output_type": "execute_result"
222
+ }
223
+ ],
224
+ "source": [
225
+ "prompt"
226
+ ]
227
+ },
228
+ {
229
+ "cell_type": "code",
230
+ "execution_count": 17,
231
+ "id": "a89cc181-2214-4844-b966-164a41744e54",
232
+ "metadata": {},
233
+ "outputs": [],
234
+ "source": [
235
+ "url = \"https://www.ilankelman.org/stopsigns/australia.jpg\"\n",
236
+ "image = Image.open(requests.get(url, stream=True).raw)\n",
237
+ "image_tensor = image_processor.preprocess(image, return_tensors='pt')['pixel_values'].cuda()\n",
238
+ "\n",
239
+ "input_ids = tokenizer_image_token(prompt, tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()\n",
240
+ "\n",
241
+ "stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2"
242
+ ]
243
+ },
244
+ {
245
+ "cell_type": "code",
246
+ "execution_count": 25,
247
+ "id": "0d519851-64d4-4cf5-b2eb-19474f9aa260",
248
+ "metadata": {},
249
+ "outputs": [
250
+ {
251
+ "data": {
252
+ "text/plain": [
253
+ "torch.Size([1, 55])"
254
+ ]
255
+ },
256
+ "execution_count": 25,
257
+ "metadata": {},
258
+ "output_type": "execute_result"
259
+ }
260
+ ],
261
+ "source": [
262
+ "input_ids.shape"
263
+ ]
264
+ },
265
+ {
266
+ "cell_type": "code",
267
+ "execution_count": 24,
268
+ "id": "1694ff36-f214-4ed3-b2f3-d3dbd0a1a25b",
269
+ "metadata": {},
270
+ "outputs": [
271
+ {
272
+ "name": "stderr",
273
+ "output_type": "stream",
274
+ "text": [
275
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
276
+ ]
277
+ }
278
+ ],
279
+ "source": [
280
+ "from datasets import load_dataset\n",
281
+ "audio_ds = load_dataset(\"hf-internal-testing/librispeech_asr_dummy\", \"clean\", split=\"validation\")\n",
282
+ "audio = audio_ds[0][\"audio\"]\n",
283
+ "\n",
284
+ "whisper_w_proj = WhisperWithProjection(projection_dim=512)\n",
285
+ "audio_embed = whisper_w_proj(audio)[\"input_ids\"]"
286
+ ]
287
+ },
288
+ {
289
+ "cell_type": "code",
290
+ "execution_count": 28,
291
+ "id": "9c4a9fae-d6ed-4fc2-ba02-97df64cddd93",
292
+ "metadata": {},
293
+ "outputs": [
294
+ {
295
+ "data": {
296
+ "text/plain": [
297
+ "(torch.Size([1, 33]), device(type='cpu'))"
298
+ ]
299
+ },
300
+ "execution_count": 28,
301
+ "metadata": {},
302
+ "output_type": "execute_result"
303
+ }
304
+ ],
305
+ "source": [
306
+ "audio_embed.shape, audio_embed.device"
307
+ ]
308
+ },
309
+ {
310
+ "cell_type": "code",
311
+ "execution_count": 29,
312
+ "id": "c3fffe29-98fb-4f4b-ac51-4bdda9e46752",
313
+ "metadata": {},
314
+ "outputs": [],
315
+ "source": [
316
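+ "# 55 prompt token ids + 33 audio-transcription token ids -> the [1, 88] tensor checked in the next cell\n",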
+ "input_ids = torch.concat([input_ids, audio_embed.to(\"cuda:0\")], dim=1)"
317
+ ]
318
+ },
319
+ {
320
+ "cell_type": "code",
321
+ "execution_count": 30,
322
+ "id": "5dee1ec8-2db2-4f65-99e8-d34bd2735c9c",
323
+ "metadata": {},
324
+ "outputs": [
325
+ {
326
+ "data": {
327
+ "text/plain": [
328
+ "torch.Size([1, 88])"
329
+ ]
330
+ },
331
+ "execution_count": 30,
332
+ "metadata": {},
333
+ "output_type": "execute_result"
334
+ }
335
+ ],
336
+ "source": [
337
+ "input_ids.shape"
338
+ ]
339
+ },
340
+ {
341
+ "cell_type": "code",
342
+ "execution_count": 31,
343
+ "id": "96033b43-4f57-4f0c-bcf7-37b57ca02e47",
344
+ "metadata": {},
345
+ "outputs": [],
346
+ "source": [
347
+ "with torch.inference_mode():\n",
348
+ " output_ids = model.generate(\n",
349
+ " input_ids,\n",
350
+ " images=image_tensor,\n",
351
+ " do_sample=True,\n",
352
+ " temperature=0.2,\n",
353
+ " max_new_tokens=1024,\n",
354
+ " eos_token_id=tokenizer.eos_token_id, # End of sequence token\n",
355
+ " pad_token_id=tokenizer.eos_token_id, # Pad token\n",
356
+ " use_cache=True,\n",
357
+ " )"
358
+ ]
359
+ },
360
+ {
361
+ "cell_type": "code",
362
+ "execution_count": 32,
363
+ "id": "741e8da5-0d18-4c11-b559-76054ce4ca3a",
364
+ "metadata": {},
365
+ "outputs": [
366
+ {
367
+ "name": "stdout",
368
+ "output_type": "stream",
369
+ "text": [
370
+ "is a Japanese character from the story of Jesus, who is a Chinese monk who is also known for his teachings. The story is based on the story of the story of Jesus Christ, and it is a representation of the story of Jesus and the story of Jesus Christ.\n"
371
+ ]
372
+ }
373
+ ],
374
+ "source": [
375
+ "input_token_len = input_ids.shape[1]\n",
376
+ "n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()\n",
377
+ "if n_diff_input_output > 0:\n",
378
+ " print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')\n",
379
+ "outputs = tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]\n",
380
+ "outputs = outputs.strip()\n",
381
+ "if outputs.endswith(stop_str):\n",
382
+ " outputs = outputs[:-len(stop_str)]\n",
383
+ "outputs = outputs.strip()\n",
384
+ "print(outputs)"
385
+ ]
386
+ },
387
+ {
388
+ "cell_type": "code",
389
+ "execution_count": 20,
390
+ "id": "69d494d4-d768-4645-b4d6-5c455791b50d",
391
+ "metadata": {},
392
+ "outputs": [],
393
+ "source": [
394
+ "# image"
395
+ ]
396
+ },
397
+ {
398
+ "cell_type": "code",
399
+ "execution_count": null,
400
+ "id": "8a340856-a13f-4b18-9911-126a4ba37816",
401
+ "metadata": {},
402
+ "outputs": [],
403
+ "source": []
404
+ },
405
+ {
406
+ "cell_type": "code",
407
+ "execution_count": null,
408
+ "id": "3c56fdea-c7a1-4e67-9832-e2ed077d8704",
409
+ "metadata": {},
410
+ "outputs": [],
411
+ "source": []
412
+ },
413
+ {
414
+ "cell_type": "code",
415
+ "execution_count": 52,
416
+ "id": "89e84d39-8ed8-45db-ae82-27c156ee6dd1",
417
+ "metadata": {},
418
+ "outputs": [],
419
+ "source": [
420
+ "class AudioLanguageConnector:\n",
421
+ " def __init__(self, projection_dim):\n",
422
+ " model_name = \"microsoft/phi-2\"\n",
423
+ " self.phi2_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
424
+ " self.phi2_tokenizer.pad_token = self.phi2_tokenizer.eos_token\n",
425
+ " self.phi2_tokenizer.max_length = projection_dim\n",
426
+ "\n",
427
+ " def __call__(self, text):\n",
428
+ " text = f\"<audio_start> {text} <audio_end>\"\n",
429
+ " tokens = self.phi2_tokenizer(text, return_tensors=\"pt\", return_attention_mask=False)\n",
430
+ " return tokens\n",
431
+ " \n",
432
+ "\n",
433
+ "class WhisperWithProjection:\n",
434
+ " def __init__(self, projection_dim, device):\n",
435
+ " self.device = device\n",
436
+ " self.processor = WhisperProcessor.from_pretrained(\"openai/whisper-tiny\", device_map=device)\n",
437
+ " self.model = WhisperForConditionalGeneration.from_pretrained(\"openai/whisper-tiny\", device_map=device)\n",
438
+ " self.model.config.forced_decoder_ids = None\n",
439
+ " self.audio_language_connector = AudioLanguageConnector(projection_dim)\n",
440
+ " \n",
441
+ " def __call__(self, audio):\n",
442
+ " input_features = self.processor(audio[\"array\"],\n",
443
+ " sampling_rate=audio[\"sampling_rate\"],\n",
444
+ " return_tensors=\"pt\").input_features\n",
445
+ " # generate token ids\n",
446
+ " predicted_ids = self.model.generate(input_features.to(self.device))\n",
447
+ " # decode token ids to text \n",
448
+ " transcription = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)\n",
449
+ "\n",
450
+ " audio_embeddings = self.audio_language_connector(transcription)\n",
451
+ " return audio_embeddings.to(self.device)"
452
+ ]
453
+ },
454
+ {
455
+ "cell_type": "code",
456
+ "execution_count": 53,
457
+ "id": "75e24be0-b236-4047-83ef-5c344e262476",
458
+ "metadata": {},
459
+ "outputs": [],
460
+ "source": [
461
+ "class MultiModalPhi2:\n",
462
+ " def __init__(self, model_path=\"checkpoints/llavaPhi-v0-3b-finetune/checkpoint-4000\",\n",
463
+ " temperature=0.2,\n",
464
+ " max_new_tokens=1024,\n",
465
+ " device=\"cuda\"):\n",
466
+ " self.temperature = temperature\n",
467
+ " self.max_new_tokens = max_new_tokens\n",
468
+ " self.device = device\n",
469
+ " model_name = get_model_name_from_path(model_path)\n",
470
+ " self.tokenizer, self.model, self.image_processor, self.context_len = load_pretrained_model(model_path, None, model_name, device_map=device)\n",
471
+ " self.whisper_w_proj = WhisperWithProjection(projection_dim=512, device=device)\n",
472
+ " \n",
473
+ " \n",
474
+ " def __call__(self, text, audio, image):\n",
475
+ " qs = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\\n' + text\n",
476
+ " conv = conv_templates[\"default\"].copy()\n",
477
+ " conv.append_message(conv.roles[0], qs)\n",
478
+ " conv.append_message(conv.roles[1], None)\n",
479
+ " prompt = conv.get_prompt()\n",
480
+ "\n",
481
+ " image_tensor = self.image_processor.preprocess(image, return_tensors='pt')['pixel_values'].cuda()\n",
482
+ " \n",
483
+ " input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda()\n",
484
+ "\n",
485
+ " audio_embed = self.whisper_w_proj(audio)[\"input_ids\"]\n",
486
+ " \n",
487
+ " stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2\n",
488
+ "\n",
489
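+ "        # audio_embed holds token ids of the Whisper transcription, so it can be appended directly to the prompt ids\n",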
+ " input_ids = torch.concat([input_ids, audio_embed], dim=1)\n",
490
+ "\n",
491
+ " with torch.inference_mode():\n",
492
+ " output_ids = self.model.generate(\n",
493
+ " input_ids,\n",
494
+ " images=image_tensor,\n",
495
+ " do_sample=True,\n",
496
+ " temperature=self.temperature,\n",
497
+ " max_new_tokens=self.max_new_tokens,\n",
498
+ " eos_token_id=tokenizer.eos_token_id, # End of sequence token\n",
499
+ " pad_token_id=tokenizer.eos_token_id, # Pad token\n",
500
+ " use_cache=True,\n",
501
+ " )\n",
502
+ "\n",
503
+ " input_token_len = input_ids.shape[1]\n",
504
+ " n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item()\n",
505
+ " if n_diff_input_output > 0:\n",
506
+ " print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids')\n",
507
+ " outputs = self.tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0]\n",
508
+ " outputs = outputs.strip()\n",
509
+ " if outputs.endswith(stop_str):\n",
510
+ " outputs = outputs[:-len(stop_str)]\n",
511
+ " outputs = outputs.strip()\n",
512
+ " return outputs"
513
+ ]
514
+ },
515
+ {
516
+ "cell_type": "code",
517
+ "execution_count": 54,
518
+ "id": "4efdbad4-d88a-4477-a3a0-f5591cd0b172",
519
+ "metadata": {},
520
+ "outputs": [
521
+ {
522
+ "name": "stderr",
523
+ "output_type": "stream",
524
+ "text": [
525
+ "The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.\n",
526
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
527
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
528
+ ]
529
+ },
530
+ {
531
+ "name": "stdout",
532
+ "output_type": "stream",
533
+ "text": [
534
+ "load llaVA-Phi MLLM!!!\n"
535
+ ]
536
+ },
537
+ {
538
+ "data": {
539
+ "application/vnd.jupyter.widget-view+json": {
540
+ "model_id": "492c17cf54f34d4d9e4f288fc9e72e79",
541
+ "version_major": 2,
542
+ "version_minor": 0
543
+ },
544
+ "text/plain": [
545
+ "Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]"
546
+ ]
547
+ },
548
+ "metadata": {},
549
+ "output_type": "display_data"
550
+ },
551
+ {
552
+ "name": "stdout",
553
+ "output_type": "stream",
554
+ "text": [
555
+ "{'device_map': 'cuda'}\n"
556
+ ]
557
+ },
558
+ {
559
+ "name": "stderr",
560
+ "output_type": "stream",
561
+ "text": [
562
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
563
+ ]
564
+ }
565
+ ],
566
+ "source": [
567
+ "multimodal_phi2 = MultiModalPhi2()"
568
+ ]
569
+ },
570
+ {
571
+ "cell_type": "code",
572
+ "execution_count": 57,
573
+ "id": "9a6de0b0-a231-4d50-88e8-e40c6f7216c3",
574
+ "metadata": {},
575
+ "outputs": [],
576
+ "source": [
577
+ "text = \"tell me about the audio\""
578
+ ]
579
+ },
580
+ {
581
+ "cell_type": "code",
582
+ "execution_count": 58,
583
+ "id": "b4919948-6a75-4d19-ba95-9ba233a7d3d9",
584
+ "metadata": {},
585
+ "outputs": [
586
+ {
587
+ "data": {
588
+ "text/plain": [
589
+ "'is a popular Japanese drama series featuring a man in a red and white costume, who is dressed as Santa Claus, is walking down the street. The scene takes place in a busy city environment, with people walking and standing on the sidewalk, likely enjoying the festive atmosphere and the festive atmosphere.'"
590
+ ]
591
+ },
592
+ "execution_count": 58,
593
+ "metadata": {},
594
+ "output_type": "execute_result"
595
+ }
596
+ ],
597
+ "source": [
598
+ "multimodal_phi2(text, audio, image)"
599
+ ]
600
+ },
601
+ {
602
+ "cell_type": "code",
603
+ "execution_count": null,
604
+ "id": "590f2d64-62ed-4e6f-b7c8-b0cf68aecaab",
605
+ "metadata": {},
606
+ "outputs": [],
607
+ "source": []
608
+ },
609
+ {
610
+ "cell_type": "code",
611
+ "execution_count": 64,
612
+ "id": "c921eb63-feb5-4fa9-993b-2faeb6dfe1db",
613
+ "metadata": {},
614
+ "outputs": [],
615
+ "source": [
616
+ "from transformers import AutoTokenizer, AutoModelForCausalLM, AutoConfig, BitsAndBytesConfig, CLIPImageProcessor"
617
+ ]
618
+ },
619
+ {
620
+ "cell_type": "code",
621
+ "execution_count": 65,
622
+ "id": "b470a2c4-806a-435d-9fc2-f17448dbe5fc",
623
+ "metadata": {},
624
+ "outputs": [],
625
+ "source": [
626
+ "from llava_phi.model import LlavaPhiConfig"
627
+ ]
628
+ },
629
+ {
630
+ "cell_type": "code",
631
+ "execution_count": 66,
632
+ "id": "4f7bc91a-0a41-45e5-92a4-daa1e3eea0da",
633
+ "metadata": {},
634
+ "outputs": [
635
+ {
636
+ "name": "stderr",
637
+ "output_type": "stream",
638
+ "text": [
639
+ "The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.\n",
640
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
641
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
642
+ ]
643
+ },
644
+ {
645
+ "data": {
646
+ "application/vnd.jupyter.widget-view+json": {
647
+ "model_id": "993bc3a38cb84de4a2e3a79a3448c4d6",
648
+ "version_major": 2,
649
+ "version_minor": 0
650
+ },
651
+ "text/plain": [
652
+ "Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]"
653
+ ]
654
+ },
655
+ "metadata": {},
656
+ "output_type": "display_data"
657
+ }
658
+ ],
659
+ "source": [
660
+ "device_map = \"cuda:0\"\n",
661
+ "load_8bit = False\n",
662
+ "load_4bit = False\n",
663
+ "kwargs = {\"device_map\": device_map}\n",
664
+ "if load_8bit:\n",
665
+ " kwargs['load_in_8bit'] = True\n",
666
+ "elif load_4bit:\n",
667
+ " kwargs['load_in_4bit'] = True\n",
668
+ " kwargs['quantization_config'] = BitsAndBytesConfig(\n",
669
+ " load_in_4bit=True,\n",
670
+ " bnb_4bit_compute_dtype=torch.float16,\n",
671
+ " bnb_4bit_use_double_quant=True,\n",
672
+ " bnb_4bit_quant_type='nf4'\n",
673
+ " )\n",
674
+ "config = LlavaPhiConfig.from_pretrained(model_path, trust_remote_code=True)\n",
675
+ "tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)\n",
676
+ "model = LlavaPhiForCausalLM.from_pretrained(\n",
677
+ " model_path, \n",
678
+ " config=config, \n",
679
+ " use_safetensors=True, \n",
680
+ " **kwargs).to(\"cuda\")\n",
681
+ "image_processor = CLIPImageProcessor.from_pretrained(model_path)\n",
682
+ "mm_use_im_start_end = getattr(model.config, \"mm_use_im_start_end\", False)\n",
683
+ "mm_use_im_patch_token = getattr(model.config, \"mm_use_im_patch_token\", True)\n",
684
+ "\n",
685
+ "# TODO: the tokenizer length of phi-2 is 50295, but the output class of lm_head is 51200\n",
686
+ "if mm_use_im_patch_token:\n",
687
+ " tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True)\n",
688
+ "if mm_use_im_start_end:\n",
689
+ " tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True)\n",
690
+ " \n",
691
+ "if hasattr(model.config, \"max_sequence_length\"):\n",
692
+ " context_len = model.config.max_sequence_length\n",
693
+ "else:\n",
694
+ " context_len = 2048"
695
+ ]
696
+ },
697
+ {
698
+ "cell_type": "code",
699
+ "execution_count": 70,
700
+ "id": "99355837-a297-4a25-aeb3-1670af7e9251",
701
+ "metadata": {},
702
+ "outputs": [
703
+ {
704
+ "ename": "KeyboardInterrupt",
705
+ "evalue": "",
706
+ "output_type": "error",
707
+ "traceback": [
708
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
709
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m Traceback (most recent call last)",
710
+ "Cell \u001b[0;32mIn[70], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msave_pretrained\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mLlava-Phi-Checkpoint\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m)\u001b[49m\n",
711
+ "File \u001b[0;32m~/miniconda3/envs/torchenv/lib/python3.10/site-packages/transformers/modeling_utils.py:2376\u001b[0m, in \u001b[0;36mPreTrainedModel.save_pretrained\u001b[0;34m(self, save_directory, is_main_process, state_dict, save_function, push_to_hub, max_shard_size, safe_serialization, variant, token, save_peft_format, **kwargs)\u001b[0m\n\u001b[1;32m 2372\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m shard_file, shard \u001b[38;5;129;01min\u001b[39;00m shards\u001b[38;5;241m.\u001b[39mitems():\n\u001b[1;32m 2373\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m safe_serialization:\n\u001b[1;32m 2374\u001b[0m \u001b[38;5;66;03m# At some point we will need to deal better with save_function (used for TPU and other distributed\u001b[39;00m\n\u001b[1;32m 2375\u001b[0m \u001b[38;5;66;03m# joyfulness), but for now this enough.\u001b[39;00m\n\u001b[0;32m-> 2376\u001b[0m \u001b[43msafe_save_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43mshard\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mos\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpath\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mjoin\u001b[49m\u001b[43m(\u001b[49m\u001b[43msave_directory\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mshard_file\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmetadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m{\u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mformat\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m:\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43mpt\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m}\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 2377\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 2378\u001b[0m save_function(shard, os\u001b[38;5;241m.\u001b[39mpath\u001b[38;5;241m.\u001b[39mjoin(save_directory, shard_file))\n",
712
+ "File \u001b[0;32m~/miniconda3/envs/torchenv/lib/python3.10/site-packages/safetensors/torch.py:281\u001b[0m, in \u001b[0;36msave_file\u001b[0;34m(tensors, filename, metadata)\u001b[0m\n\u001b[1;32m 250\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21msave_file\u001b[39m(\n\u001b[1;32m 251\u001b[0m tensors: Dict[\u001b[38;5;28mstr\u001b[39m, torch\u001b[38;5;241m.\u001b[39mTensor],\n\u001b[1;32m 252\u001b[0m filename: Union[\u001b[38;5;28mstr\u001b[39m, os\u001b[38;5;241m.\u001b[39mPathLike],\n\u001b[1;32m 253\u001b[0m metadata: Optional[Dict[\u001b[38;5;28mstr\u001b[39m, \u001b[38;5;28mstr\u001b[39m]] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m,\n\u001b[1;32m 254\u001b[0m ):\n\u001b[1;32m 255\u001b[0m \u001b[38;5;250m \u001b[39m\u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 256\u001b[0m \u001b[38;5;124;03m Saves a dictionary of tensors into raw bytes in safetensors format.\u001b[39;00m\n\u001b[1;32m 257\u001b[0m \n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 279\u001b[0m \u001b[38;5;124;03m ```\u001b[39;00m\n\u001b[1;32m 280\u001b[0m \u001b[38;5;124;03m \"\"\"\u001b[39;00m\n\u001b[0;32m--> 281\u001b[0m \u001b[43mserialize_file\u001b[49m\u001b[43m(\u001b[49m\u001b[43m_flatten\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtensors\u001b[49m\u001b[43m)\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mfilename\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mmetadata\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mmetadata\u001b[49m\u001b[43m)\u001b[49m\n",
713
+ "\u001b[0;31mKeyboardInterrupt\u001b[0m: "
714
+ ]
715
+ }
716
+ ],
717
+ "source": [
718
+ "model.save_pretrained(\"Llava-Phi-Checkpoint\")"
719
+ ]
720
+ },
721
+ {
722
+ "cell_type": "code",
723
+ "execution_count": null,
724
+ "id": "fa0bec34-a148-4340-a30c-6f09dd5e71ca",
725
+ "metadata": {},
726
+ "outputs": [],
727
+ "source": [
728
+ "model.push_to_hub(\"RaviNaik/Llava-Phi2\")"
729
+ ]
730
+ },
731
+ {
732
+ "cell_type": "code",
733
+ "execution_count": 73,
734
+ "id": "382f74b0-2967-408a-badc-a90918810d74",
735
+ "metadata": {},
736
+ "outputs": [
737
+ {
738
+ "data": {
739
+ "text/plain": [
740
+ "CommitInfo(commit_url='https://huggingface.co/RaviNaik/Llava-Phi2/commit/fa8f7240058241243f6bdc3d6ab44bb691f76e39', commit_message='Upload tokenizer', commit_description='', oid='fa8f7240058241243f6bdc3d6ab44bb691f76e39', pr_url=None, pr_revision=None, pr_num=None)"
741
+ ]
742
+ },
743
+ "execution_count": 73,
744
+ "metadata": {},
745
+ "output_type": "execute_result"
746
+ }
747
+ ],
748
+ "source": [
749
+ "tokenizer.push_to_hub(\"RaviNaik/Llava-Phi2\")"
750
+ ]
751
+ },
752
+ {
753
+ "cell_type": "code",
754
+ "execution_count": null,
755
+ "id": "b851459b-d3ac-4fb8-99b6-17a648adc41f",
756
+ "metadata": {},
757
+ "outputs": [],
758
+ "source": []
759
+ }
760
+ ],
761
+ "metadata": {
762
+ "kernelspec": {
763
+ "display_name": "Python 3 (ipykernel)",
764
+ "language": "python",
765
+ "name": "python3"
766
+ },
767
+ "language_info": {
768
+ "codemirror_mode": {
769
+ "name": "ipython",
770
+ "version": 3
771
+ },
772
+ "file_extension": ".py",
773
+ "mimetype": "text/x-python",
774
+ "name": "python",
775
+ "nbconvert_exporter": "python",
776
+ "pygments_lexer": "ipython3",
777
+ "version": "3.10.12"
778
+ }
779
+ },
780
+ "nbformat": 4,
781
+ "nbformat_minor": 5
782
+ }
Experiments/instruct_150k_data.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
Experiments/instruct_data.py ADDED
@@ -0,0 +1,39 @@
1
+ from datasets import Dataset, IterableDataset
2
+ from PIL import Image
3
+
4
+ # ChatML format
5
+ templates = {
6
+ "assistant": "<|im_start|>assistant\n{msg}<|im_end|>", # message by assistant
7
+ "user": "<|im_start|>user\n{msg}<|im_end|>" # message by user
8
+ }
9
+
10
+ ds = Dataset.from_json("llava_instruct_150k.json", split="train")
11
+ ds_stream = ds.to_iterable_dataset()
12
+
13
+
14
+ def get_image(image_path):
15
+ image_path = f"train2014/COCO_train2014_{image_path}"
16
+ img = Image.open(image_path)
17
+ return img
18
+
19
+ def get_chatml_text(conversations):
20
+ chatml_text = ""
21
+ for conversation in conversations:
22
+ role = conversation["from"]
23
+ role = "user" if role == "human" else "assistant"
24
+ content = conversation["value"]
25
+
26
+ formatted_text = templates[role].format(msg=content)
27
+ chatml_text += formatted_text + "\n"
28
+ return chatml_text
29
+
30
+ def instruct_data_generator():
31
+ for sample in ds_stream:
32
+ image_path = sample["image"]
33
+ conversations = sample["conversations"]
34
+
35
+ image = get_image(image_path)
36
+ text = get_chatml_text(conversations)
37
+ yield {"text": text, "image": image}
38
+
39
+ instruct_ds = IterableDataset.from_generator(generator=instruct_data_generator)
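+ 
+ # Hypothetical usage sketch (not part of the original script): stream one sample to
+ # sanity-check the ChatML formatting and image loading.
+ # sample = next(iter(instruct_ds))
+ # print(sample["text"][:300], sample["image"].size)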
Experiments/llava_exp.ipynb ADDED
@@ -0,0 +1,145 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "99576983-f881-47c8-8b5e-c6f561a93e71",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import transformers"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 2,
16
+ "id": "58ba19f2-4b91-4f90-a33d-4c1ed17e202a",
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": [
20
+ "from transformers import LlavaForConditionalGeneration, LlavaConfig, CLIPVisionConfig, PhiConfig\n",
21
+ "\n",
22
+ "# Initializing a CLIP-vision config\n",
23
+ "vision_config = CLIPVisionConfig()\n",
24
+ "\n",
25
+ "# Initializing a Llama config\n",
26
+ "text_config = PhiConfig()\n",
27
+ "\n",
28
+ "# Initializing a Llava llava-1.5-7b style configuration\n",
29
+ "configuration = LlavaConfig(vision_config, text_config)\n",
30
+ "\n",
31
+ "# Initializing a model from the llava-1.5-7b style configuration\n",
32
+ "model = LlavaForConditionalGeneration(configuration)\n",
33
+ "\n",
34
+ "# Accessing the model configuration\n",
35
+ "configuration = model.config"
36
+ ]
37
+ },
38
+ {
39
+ "cell_type": "code",
40
+ "execution_count": 5,
41
+ "id": "a806a07a-fe72-45a3-8ceb-8e942c6c845d",
42
+ "metadata": {},
43
+ "outputs": [
44
+ {
45
+ "data": {
46
+ "text/plain": [
47
+ "LlavaConfig {\n",
48
+ " \"ignore_index\": -100,\n",
49
+ " \"image_token_index\": 32000,\n",
50
+ " \"model_type\": \"llava\",\n",
51
+ " \"projector_hidden_act\": \"gelu\",\n",
52
+ " \"text_config\": {\n",
53
+ " \"embd_pdrop\": 0.0,\n",
54
+ " \"hidden_act\": \"gelu_new\",\n",
55
+ " \"hidden_size\": 2048,\n",
56
+ " \"intermediate_size\": 8192,\n",
57
+ " \"layer_norm_eps\": 1e-05,\n",
58
+ " \"model_type\": \"phi\",\n",
59
+ " \"num_hidden_layers\": 24,\n",
60
+ " \"partial_rotary_factor\": 0.5,\n",
61
+ " \"qk_layernorm\": false,\n",
62
+ " \"resid_pdrop\": 0.0,\n",
63
+ " \"vocab_size\": 51200\n",
64
+ " },\n",
65
+ " \"transformers_version\": \"4.36.2\",\n",
66
+ " \"vision_config\": {\n",
67
+ " \"hidden_size\": 768,\n",
68
+ " \"image_size\": 224,\n",
69
+ " \"intermediate_size\": 3072,\n",
70
+ " \"model_type\": \"clip_vision_model\",\n",
71
+ " \"num_attention_heads\": 12,\n",
72
+ " \"num_hidden_layers\": 12,\n",
73
+ " \"patch_size\": 32,\n",
74
+ " \"projection_dim\": 512\n",
75
+ " },\n",
76
+ " \"vision_feature_layer\": -2,\n",
77
+ " \"vision_feature_select_strategy\": \"default\",\n",
78
+ " \"vocab_size\": 32000\n",
79
+ "}"
80
+ ]
81
+ },
82
+ "execution_count": 5,
83
+ "metadata": {},
84
+ "output_type": "execute_result"
85
+ }
86
+ ],
87
+ "source": [
88
+ "model.config"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": 6,
94
+ "id": "79efbc6b-f005-4a5c-82a1-112fa37f1904",
95
+ "metadata": {},
96
+ "outputs": [
97
+ {
98
+ "name": "stdout",
99
+ "output_type": "stream",
100
+ "text": [
101
+ "Cloning into 'llava-phi'...\n",
102
+ "remote: Enumerating objects: 151, done.\u001b[K\n",
103
+ "remote: Counting objects: 100% (151/151), done.\u001b[K\n",
104
+ "remote: Compressing objects: 100% (116/116), done.\u001b[K\n",
105
+ "remote: Total 151 (delta 36), reused 133 (delta 25), pack-reused 0\u001b[K\n",
106
+ "Receiving objects: 100% (151/151), 333.89 KiB | 112.00 KiB/s, done.\n",
107
+ "Resolving deltas: 100% (36/36), done.\n"
108
+ ]
109
+ }
110
+ ],
111
+ "source": [
112
+ "!git clone https://github.com/zhuyiche/llava-phi.git"
113
+ ]
114
+ },
115
+ {
116
+ "cell_type": "code",
117
+ "execution_count": null,
118
+ "id": "cf827184-f334-4d86-ace1-fe9c92f84d66",
119
+ "metadata": {},
120
+ "outputs": [],
121
+ "source": []
122
+ }
123
+ ],
124
+ "metadata": {
125
+ "kernelspec": {
126
+ "display_name": "Python 3 (ipykernel)",
127
+ "language": "python",
128
+ "name": "python3"
129
+ },
130
+ "language_info": {
131
+ "codemirror_mode": {
132
+ "name": "ipython",
133
+ "version": 3
134
+ },
135
+ "file_extension": ".py",
136
+ "mimetype": "text/x-python",
137
+ "name": "python",
138
+ "nbconvert_exporter": "python",
139
+ "pygments_lexer": "ipython3",
140
+ "version": "3.10.12"
141
+ }
142
+ },
143
+ "nbformat": 4,
144
+ "nbformat_minor": 5
145
+ }
Experiments/multimodal_exp.ipynb ADDED
@@ -0,0 +1,362 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 23,
6
+ "id": "d4bed9ef-4bff-4d61-a4f9-a585f377f136",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "from PIL import Image\n",
11
+ "import requests\n",
12
+ "\n",
13
+ "import torch\n",
14
+ "from torch import nn\n",
15
+ "from transformers import AutoProcessor, CLIPVisionModel, CLIPVisionConfig, CLIPPreTrainedModel\n",
16
+ "from transformers.models.clip.modeling_clip import CLIPVisionModelOutput, CLIPVisionTransformer\n",
17
+ "from transformers import WhisperProcessor, WhisperForConditionalGeneration\n",
18
+ "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer\n",
19
+ "from typing import Optional, Union, Tuple"
20
+ ]
21
+ },
22
+ {
23
+ "cell_type": "code",
24
+ "execution_count": 43,
25
+ "id": "952314f0-ee9d-45e7-85b8-1e3e44c1a2fd",
26
+ "metadata": {},
27
+ "outputs": [],
28
+ "source": [
29
+ "class VisionLanguageConnector(nn.Module):\n",
30
+ " def __init__(self, hidden_size, projection_dim):\n",
31
+ " super().__init__()\n",
32
+ " self.mlp = nn.Sequential(\n",
33
+ " nn.Linear(hidden_size, hidden_size, bias=False),\n",
34
+ " nn.GELU(),\n",
35
+ " nn.Linear(hidden_size, projection_dim, bias=False)\n",
36
+ " )\n",
37
+ "\n",
38
+ " def forward(self, x):\n",
39
+ " return self.mlp(x)\n",
40
+ " \n",
41
+ "class ClipWithProjection():\n",
42
+ " config_class = CLIPVisionConfig\n",
43
+ " main_input_name = \"pixel_values\"\n",
44
+ "\n",
45
+ " def __init__(self, hidden_size, projection_dim):\n",
46
+ " super().__init__()\n",
47
+ " \n",
48
+ " self.processor = AutoProcessor.from_pretrained(\"openai/clip-vit-base-patch32\")\n",
49
+ " self.vision_model = CLIPVisionModel.from_pretrained(\"openai/clip-vit-base-patch32\")\n",
50
+ " self.vision_language_connector = VisionLanguageConnector(hidden_size, projection_dim)\n",
51
+ "\n",
52
+ " def forward(\n",
53
+ " self,\n",
54
+ " image = None,\n",
55
+ " output_attentions: Optional[bool] = None,\n",
56
+ " output_hidden_states: Optional[bool] = None,\n",
57
+ " return_dict: Optional[bool] = None,\n",
58
+ " ) -> Union[Tuple, CLIPVisionModelOutput]:\n",
59
+ " \n",
60
+ " pixel_values = self.processor(images=image, return_tensors=\"pt\")[\"pixel_values\"]\n",
61
+ " vision_outputs = self.vision_model(\n",
62
+ " pixel_values=pixel_values,\n",
63
+ " output_attentions=output_attentions,\n",
64
+ " output_hidden_states=output_hidden_states,\n",
65
+ " return_dict=return_dict,\n",
66
+ " )\n",
67
+ "\n",
68
+ " pooled_output = vision_outputs[1] # pooled_output\n",
69
+ "\n",
70
+ " image_embeds = self.vision_language_connector(pooled_output)\n",
71
+ "\n",
72
+ " return CLIPVisionModelOutput(\n",
73
+ " image_embeds=image_embeds,\n",
74
+ " last_hidden_state=vision_outputs.last_hidden_state,\n",
75
+ " hidden_states=vision_outputs.hidden_states,\n",
76
+ " attentions=vision_outputs.attentions,\n",
77
+ " )"
78
+ ]
79
+ },
80
+ {
81
+ "cell_type": "code",
82
+ "execution_count": 44,
83
+ "id": "bd2889fe-be85-44a3-afe8-65b47f7a93c3",
84
+ "metadata": {},
85
+ "outputs": [],
86
+ "source": [
87
+ "url = \"http://images.cocodataset.org/val2017/000000039769.jpg\"\n",
88
+ "image = Image.open(requests.get(url, stream=True).raw)"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": 46,
94
+ "id": "17c72699-fe98-4b96-b63c-5c8ab7c1a65f",
95
+ "metadata": {},
96
+ "outputs": [],
97
+ "source": [
98
+ "# model = ClipWithProjection(768, 512)\n",
99
+ "# model.forward(image)"
100
+ ]
101
+ },
102
+ {
103
+ "cell_type": "code",
104
+ "execution_count": 47,
105
+ "id": "70806156-38a9-45a2-bf9f-e72047a0173f",
106
+ "metadata": {},
107
+ "outputs": [],
108
+ "source": [
109
+ "class AudioLanguageConnector:\n",
110
+ " def __init__(self, projection_dim):\n",
111
+ " model_name = \"microsoft/phi-2\"\n",
112
+ " self.phi2_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
113
+ " self.phi2_tokenizer.pad_token = self.phi2_tokenizer.eos_token\n",
114
+ " self.phi2_tokenizer.max_length = projection_dim\n",
115
+ "\n",
116
+ " def __call__(self, text):\n",
117
+ " text = f\"<audio_start> {text} <audio_end>\"\n",
118
+ " tokens = self.phi2_tokenizer(text, return_tensors=\"pt\", return_attention_mask=False)\n",
119
+ " return tokens\n",
120
+ " \n",
121
+ "\n",
122
+ "class WhisperWithProjection:\n",
123
+ " def __init__(self, projection_dim):\n",
124
+ " self.processor = WhisperProcessor.from_pretrained(\"openai/whisper-tiny\")\n",
125
+ " self.model = WhisperForConditionalGeneration.from_pretrained(\"openai/whisper-tiny\")\n",
126
+ " self.model.config.forced_decoder_ids = None\n",
127
+ " self.audio_language_connector = AudioLanguageConnector(projection_dim)\n",
128
+ " \n",
129
+ " def forward(self, audio):\n",
130
+ " input_features = self.processor(audio[\"array\"],\n",
131
+ " sampling_rate=audio[\"sampling_rate\"],\n",
132
+ " return_tensors=\"pt\").input_features\n",
133
+ " # generate token ids\n",
134
+ " predicted_ids = self.model.generate(input_features)\n",
135
+ " # decode token ids to text \n",
136
+ " transcription = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)\n",
137
+ "\n",
138
+ " audio_embeddings = self.audio_language_connector(transcription)\n",
139
+ " return audio_embeddings"
140
+ ]
141
+ },
142
+ {
143
+ "cell_type": "code",
144
+ "execution_count": 48,
145
+ "id": "79cc4d98-498b-4042-bd71-143b2477733d",
146
+ "metadata": {},
147
+ "outputs": [],
148
+ "source": [
149
+ "class TextModality:\n",
150
+ " def __init__(self, projection_dim):\n",
151
+ " model_name = \"microsoft/phi-2\"\n",
152
+ " self.phi2_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
153
+ " self.phi2_tokenizer.pad_token = self.phi2_tokenizer.eos_token\n",
154
+ " self.phi2_tokenizer.max_length = projection_dim\n",
155
+ "\n",
156
+ "\n",
157
+ " def __call__(self, text):\n",
158
+ " tokens = self.phi2_tokenizer(text, return_tensors=\"pt\", return_attention_mask=False)\n",
159
+ " return tokens"
160
+ ]
161
+ },
162
+ {
163
+ "cell_type": "code",
164
+ "execution_count": 77,
165
+ "id": "ba4c4772-923f-48e8-a4af-b7d9c192dd4b",
166
+ "metadata": {},
167
+ "outputs": [],
168
+ "source": [
169
+ "class MultiModalPhi2:\n",
170
+ " def __init__(self):\n",
171
+ " self.text_modality = TextModality(projection_dim=768)\n",
172
+ " self.whisper_w_proj = WhisperWithProjection(projection_dim=512)\n",
173
+ " self.clip_w_proj = ClipWithProjection(hidden_size=768, projection_dim=768)\n",
174
+ " self.llm = self.load_llm()\n",
175
+ "\n",
176
+ " def load_llm(self):\n",
177
+ " model_name = \"microsoft/phi-2\"\n",
178
+ " \n",
179
+ " bnb_config = BitsAndBytesConfig(\n",
180
+ " load_in_4bit=True,\n",
181
+ " bnb_4bit_quant_type=\"nf4\",\n",
182
+ " bnb_4bit_compute_dtype=torch.float16)\n",
183
+ " \n",
184
+ " model = AutoModelForCausalLM.from_pretrained(\n",
185
+ " model_name,\n",
186
+ " quantization_config=bnb_config,\n",
187
+ " trust_remote_code=True,\n",
188
+ " device_map=\"cuda:0\"\n",
189
+ " )\n",
190
+ " model.config.use_cache = False\n",
191
+ " return model\n",
192
+ "\n",
193
+ " def forward(self, audio, image, text):\n",
194
+ " if text is not None:\n",
195
+ " text_embed = self.text_modality(text)[\"input_ids\"]\n",
196
+ " if audio is not None:\n",
197
+ " audio_embed = self.whisper_w_proj.forward(audio)[\"input_ids\"]\n",
198
+ " if image is not None:\n",
199
+ " image_embed = self.clip_w_proj.forward(image)[0]\n",
200
+ " print(text_embed.shape, text_embed.dtype)\n",
201
+ " print(audio_embed.shape, audio_embed.dtype)\n",
202
+ " print(image_embed.shape, image_embed.dtype)\n",
203
+ " \n",
204
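+ "        # NOTE: text_embed and audio_embed are integer token ids while image_embed is a float projection,\n",
+ "        # so passing the concatenated tensor as input_ids triggers the embedding dtype error recorded below\n",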
+ " inputs = torch.concat([text_embed, audio_embed, image_embed], dim=1)\n",
205
+ " print(inputs.shape, inputs.dtype)\n",
206
+ " outputs = self.llm(inputs)\n",
207
+ "\n",
208
+ " return outputs \n",
209
+ " \n",
210
+ "\n",
211
+ " def generate(self, audio, text):\n",
212
+ " text_embeddings = self.text_modality(text)\n",
213
+ " audio_embeddings = self.whisper_w_proj.forward(audio)\n",
214
+ " inputs = torch.concat([text_embed[\"input_ids\"], audio_embed[\"input_ids\"]], dim=1)\n",
215
+ " \n",
216
+ " outputs = self.llm.generate(inputs, max_length=200)\n",
217
+ " text = self.text_modality.phi2_tokenizer.batch_decode(outputs)[0]\n",
218
+ " print(text)"
219
+ ]
220
+ },
221
+ {
222
+ "cell_type": "code",
223
+ "execution_count": 74,
224
+ "id": "7ca694eb-8009-4eb9-9a4c-eac406ab9584",
225
+ "metadata": {},
226
+ "outputs": [],
227
+ "source": [
228
+ "from datasets import load_dataset\n",
229
+ "audio_ds = load_dataset(\"hf-internal-testing/librispeech_asr_dummy\", \"clean\", split=\"validation\")\n",
230
+ "audio = audio_ds[0][\"audio\"]"
231
+ ]
232
+ },
233
+ {
234
+ "cell_type": "code",
235
+ "execution_count": 58,
236
+ "id": "37be28c5-4cc3-4471-b394-032c7602accc",
237
+ "metadata": {},
238
+ "outputs": [],
239
+ "source": [
240
+ "text = \"explain about the audio\""
241
+ ]
242
+ },
243
+ {
244
+ "cell_type": "code",
245
+ "execution_count": 59,
246
+ "id": "c0705114-1670-4937-bc3e-3660e5a5d2c5",
247
+ "metadata": {},
248
+ "outputs": [],
249
+ "source": [
250
+ "# image"
251
+ ]
252
+ },
253
+ {
254
+ "cell_type": "code",
255
+ "execution_count": 78,
256
+ "id": "0d7e5b49-b4bd-477c-87b8-91ef70857677",
257
+ "metadata": {},
258
+ "outputs": [
259
+ {
260
+ "name": "stderr",
261
+ "output_type": "stream",
262
+ "text": [
263
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
264
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
265
+ ]
266
+ },
267
+ {
268
+ "data": {
269
+ "application/vnd.jupyter.widget-view+json": {
270
+ "model_id": "733dc7b2208b4853a89aea49bff9a55c",
271
+ "version_major": 2,
272
+ "version_minor": 0
273
+ },
274
+ "text/plain": [
275
+ "Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]"
276
+ ]
277
+ },
278
+ "metadata": {},
279
+ "output_type": "display_data"
280
+ }
281
+ ],
282
+ "source": [
283
+ "model = MultiModalPhi2()"
284
+ ]
285
+ },
286
+ {
287
+ "cell_type": "code",
288
+ "execution_count": 79,
289
+ "id": "0b6471c4-4553-47f3-b38f-46057dcf80f2",
290
+ "metadata": {},
291
+ "outputs": [
292
+ {
293
+ "name": "stdout",
294
+ "output_type": "stream",
295
+ "text": [
296
+ "torch.Size([1, 5]) torch.int64\n",
297
+ "torch.Size([1, 33]) torch.int64\n",
298
+ "torch.Size([1, 768]) torch.float32\n",
299
+ "torch.Size([1, 806]) torch.float32\n"
300
+ ]
301
+ },
302
+ {
303
+ "ename": "RuntimeError",
304
+ "evalue": "Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.cuda.FloatTensor instead (while checking arguments for embedding)",
305
+ "output_type": "error",
306
+ "traceback": [
307
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
308
+ "\u001b[0;31mRuntimeError\u001b[0m Traceback (most recent call last)",
309
+ "Cell \u001b[0;32mIn[79], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mforward\u001b[49m\u001b[43m(\u001b[49m\u001b[43maudio\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mimage\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mtext\u001b[49m\u001b[43m)\u001b[49m\n",
310
+ "Cell \u001b[0;32mIn[77], line 38\u001b[0m, in \u001b[0;36mMultiModalPhi2.forward\u001b[0;34m(self, audio, image, text)\u001b[0m\n\u001b[1;32m 36\u001b[0m inputs \u001b[38;5;241m=\u001b[39m torch\u001b[38;5;241m.\u001b[39mconcat([text_embed, audio_embed, image_embed], dim\u001b[38;5;241m=\u001b[39m\u001b[38;5;241m1\u001b[39m)\n\u001b[1;32m 37\u001b[0m \u001b[38;5;28mprint\u001b[39m(inputs\u001b[38;5;241m.\u001b[39mshape, inputs\u001b[38;5;241m.\u001b[39mdtype)\n\u001b[0;32m---> 38\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mllm\u001b[49m\u001b[43m(\u001b[49m\u001b[43minputs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 40\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m outputs\n",
311
+ "File \u001b[0;32m~/miniconda3/envs/torchenv/lib/python3.10/site-packages/torch/nn/modules/module.py:1518\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1516\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1517\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1518\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
312
+ "File \u001b[0;32m~/miniconda3/envs/torchenv/lib/python3.10/site-packages/torch/nn/modules/module.py:1527\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1522\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1523\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1524\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1525\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1526\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1527\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1529\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1530\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
313
+ "File \u001b[0;32m~/miniconda3/envs/torchenv/lib/python3.10/site-packages/accelerate/hooks.py:165\u001b[0m, in \u001b[0;36madd_hook_to_module.<locals>.new_forward\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 163\u001b[0m output \u001b[38;5;241m=\u001b[39m old_forward(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 164\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 165\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[43mold_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 166\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m module\u001b[38;5;241m.\u001b[39m_hf_hook\u001b[38;5;241m.\u001b[39mpost_forward(module, output)\n",
314
+ "File \u001b[0;32m~/.cache/huggingface/modules/transformers_modules/microsoft/phi-2/85d00b03fee509307549d823fdd095473ba5197c/modeling_phi.py:1049\u001b[0m, in \u001b[0;36mPhiForCausalLM.forward\u001b[0;34m(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, labels, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 1046\u001b[0m return_dict \u001b[38;5;241m=\u001b[39m return_dict \u001b[38;5;28;01mif\u001b[39;00m return_dict \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39muse_return_dict\n\u001b[1;32m 1048\u001b[0m \u001b[38;5;66;03m# decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)\u001b[39;00m\n\u001b[0;32m-> 1049\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmodel\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 1050\u001b[0m \u001b[43m \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1051\u001b[0m \u001b[43m \u001b[49m\u001b[43mattention_mask\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mattention_mask\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1052\u001b[0m \u001b[43m \u001b[49m\u001b[43mposition_ids\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mposition_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1053\u001b[0m \u001b[43m \u001b[49m\u001b[43mpast_key_values\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mpast_key_values\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1054\u001b[0m \u001b[43m \u001b[49m\u001b[43minputs_embeds\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43minputs_embeds\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1055\u001b[0m \u001b[43m \u001b[49m\u001b[43muse_cache\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43muse_cache\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1056\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_attentions\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_attentions\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1057\u001b[0m \u001b[43m \u001b[49m\u001b[43moutput_hidden_states\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43moutput_hidden_states\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1058\u001b[0m \u001b[43m \u001b[49m\u001b[43mreturn_dict\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mreturn_dict\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 1059\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1061\u001b[0m hidden_states \u001b[38;5;241m=\u001b[39m outputs[\u001b[38;5;241m0\u001b[39m]\n\u001b[1;32m 1062\u001b[0m logits \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mlm_head(hidden_states)\n",
315
+ "File \u001b[0;32m~/miniconda3/envs/torchenv/lib/python3.10/site-packages/torch/nn/modules/module.py:1518\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1516\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1517\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1518\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
316
+ "File \u001b[0;32m~/miniconda3/envs/torchenv/lib/python3.10/site-packages/torch/nn/modules/module.py:1527\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1522\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1523\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1524\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1525\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1526\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1527\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1529\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1530\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
317
+ "File \u001b[0;32m~/miniconda3/envs/torchenv/lib/python3.10/site-packages/accelerate/hooks.py:165\u001b[0m, in \u001b[0;36madd_hook_to_module.<locals>.new_forward\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 163\u001b[0m output \u001b[38;5;241m=\u001b[39m old_forward(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 164\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 165\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[43mold_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 166\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m module\u001b[38;5;241m.\u001b[39m_hf_hook\u001b[38;5;241m.\u001b[39mpost_forward(module, output)\n",
318
+ "File \u001b[0;32m~/.cache/huggingface/modules/transformers_modules/microsoft/phi-2/85d00b03fee509307549d823fdd095473ba5197c/modeling_phi.py:893\u001b[0m, in \u001b[0;36mPhiModel.forward\u001b[0;34m(self, input_ids, attention_mask, position_ids, past_key_values, inputs_embeds, use_cache, output_attentions, output_hidden_states, return_dict)\u001b[0m\n\u001b[1;32m 890\u001b[0m position_ids \u001b[38;5;241m=\u001b[39m position_ids\u001b[38;5;241m.\u001b[39munsqueeze(\u001b[38;5;241m0\u001b[39m)\n\u001b[1;32m 892\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m inputs_embeds \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[0;32m--> 893\u001b[0m inputs_embeds \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43membed_tokens\u001b[49m\u001b[43m(\u001b[49m\u001b[43minput_ids\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 895\u001b[0m inputs_embeds \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39membed_dropout(inputs_embeds)\n\u001b[1;32m 897\u001b[0m \u001b[38;5;66;03m# Attention mask.\u001b[39;00m\n",
319
+ "File \u001b[0;32m~/miniconda3/envs/torchenv/lib/python3.10/site-packages/torch/nn/modules/module.py:1518\u001b[0m, in \u001b[0;36mModule._wrapped_call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1516\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_compiled_call_impl(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs) \u001b[38;5;66;03m# type: ignore[misc]\u001b[39;00m\n\u001b[1;32m 1517\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m-> 1518\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_call_impl\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
320
+ "File \u001b[0;32m~/miniconda3/envs/torchenv/lib/python3.10/site-packages/torch/nn/modules/module.py:1527\u001b[0m, in \u001b[0;36mModule._call_impl\u001b[0;34m(self, *args, **kwargs)\u001b[0m\n\u001b[1;32m 1522\u001b[0m \u001b[38;5;66;03m# If we don't have any hooks, we want to skip the rest of the logic in\u001b[39;00m\n\u001b[1;32m 1523\u001b[0m \u001b[38;5;66;03m# this function, and just call forward.\u001b[39;00m\n\u001b[1;32m 1524\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m (\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_forward_pre_hooks\n\u001b[1;32m 1525\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_backward_pre_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_backward_hooks\n\u001b[1;32m 1526\u001b[0m \u001b[38;5;129;01mor\u001b[39;00m _global_forward_hooks \u001b[38;5;129;01mor\u001b[39;00m _global_forward_pre_hooks):\n\u001b[0;32m-> 1527\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mforward_call\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 1529\u001b[0m \u001b[38;5;28;01mtry\u001b[39;00m:\n\u001b[1;32m 1530\u001b[0m result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;01mNone\u001b[39;00m\n",
321
+ "File \u001b[0;32m~/miniconda3/envs/torchenv/lib/python3.10/site-packages/accelerate/hooks.py:165\u001b[0m, in \u001b[0;36madd_hook_to_module.<locals>.new_forward\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m 163\u001b[0m output \u001b[38;5;241m=\u001b[39m old_forward(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs)\n\u001b[1;32m 164\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[0;32m--> 165\u001b[0m output \u001b[38;5;241m=\u001b[39m \u001b[43mold_forward\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 166\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m module\u001b[38;5;241m.\u001b[39m_hf_hook\u001b[38;5;241m.\u001b[39mpost_forward(module, output)\n",
322
+ "File \u001b[0;32m~/miniconda3/envs/torchenv/lib/python3.10/site-packages/torch/nn/modules/sparse.py:162\u001b[0m, in \u001b[0;36mEmbedding.forward\u001b[0;34m(self, input)\u001b[0m\n\u001b[1;32m 161\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mforward\u001b[39m(\u001b[38;5;28mself\u001b[39m, \u001b[38;5;28minput\u001b[39m: Tensor) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Tensor:\n\u001b[0;32m--> 162\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mF\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43membedding\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m 163\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mpadding_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mmax_norm\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m 164\u001b[0m \u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mnorm_type\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mscale_grad_by_freq\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43msparse\u001b[49m\u001b[43m)\u001b[49m\n",
323
+ "File \u001b[0;32m~/miniconda3/envs/torchenv/lib/python3.10/site-packages/torch/nn/functional.py:2233\u001b[0m, in \u001b[0;36membedding\u001b[0;34m(input, weight, padding_idx, max_norm, norm_type, scale_grad_by_freq, sparse)\u001b[0m\n\u001b[1;32m 2227\u001b[0m \u001b[38;5;66;03m# Note [embedding_renorm set_grad_enabled]\u001b[39;00m\n\u001b[1;32m 2228\u001b[0m \u001b[38;5;66;03m# XXX: equivalent to\u001b[39;00m\n\u001b[1;32m 2229\u001b[0m \u001b[38;5;66;03m# with torch.no_grad():\u001b[39;00m\n\u001b[1;32m 2230\u001b[0m \u001b[38;5;66;03m# torch.embedding_renorm_\u001b[39;00m\n\u001b[1;32m 2231\u001b[0m \u001b[38;5;66;03m# remove once script supports set_grad_enabled\u001b[39;00m\n\u001b[1;32m 2232\u001b[0m _no_grad_embedding_renorm_(weight, \u001b[38;5;28minput\u001b[39m, max_norm, norm_type)\n\u001b[0;32m-> 2233\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mtorch\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43membedding\u001b[49m\u001b[43m(\u001b[49m\u001b[43mweight\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;28;43minput\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mpadding_idx\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mscale_grad_by_freq\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43msparse\u001b[49m\u001b[43m)\u001b[49m\n",
324
+ "\u001b[0;31mRuntimeError\u001b[0m: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.cuda.FloatTensor instead (while checking arguments for embedding)"
325
+ ]
326
+ }
327
+ ],
328
+ "source": [
329
+ "model.forward(audio, image, text)"
330
+ ]
331
+ },
332
+ {
333
+ "cell_type": "code",
334
+ "execution_count": null,
335
+ "id": "4ca96caf-82e2-4f07-87b3-8654dfdc89aa",
336
+ "metadata": {},
337
+ "outputs": [],
338
+ "source": []
339
+ }
340
+ ],
341
+ "metadata": {
342
+ "kernelspec": {
343
+ "display_name": "Python 3 (ipykernel)",
344
+ "language": "python",
345
+ "name": "python3"
346
+ },
347
+ "language_info": {
348
+ "codemirror_mode": {
349
+ "name": "ipython",
350
+ "version": 3
351
+ },
352
+ "file_extension": ".py",
353
+ "mimetype": "text/x-python",
354
+ "name": "python",
355
+ "nbconvert_exporter": "python",
356
+ "pygments_lexer": "ipython3",
357
+ "version": "3.10.12"
358
+ }
359
+ },
360
+ "nbformat": 4,
361
+ "nbformat_minor": 5
362
+ }
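The traceback above bottoms out in `torch.embedding` with `RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.cuda.FloatTensor instead`, raised from `PhiModel.forward` when it calls `self.embed_tokens(input_ids)`. In other words, `model.forward(audio, image, text)` ended up feeding float features into the token-embedding table, which only accepts integer token ids; float inputs (such as projected audio or image features) have to enter through `inputs_embeds`, which `PhiForCausalLM.forward` exposes per the signature shown in the trace. A minimal sketch of the distinction, using a small stand-in `nn.Embedding` rather than the real Phi-2 weights (sizes and the commented call are illustrative only):

```python
import torch
from torch import nn

# Stand-in for Phi-2's embed_tokens (the real table is ~51200 x 2560).
embed_tokens = nn.Embedding(num_embeddings=100, embedding_dim=8)

token_ids = torch.tensor([[17, 3, 42]])        # integer ids (torch.int64) -> fine
float_features = torch.randn(1, 3)             # float "features" -> same RuntimeError as above

embed_tokens(token_ids)                        # works
try:
    embed_tokens(float_features)               # fails: embedding indices must be Long/Int
except RuntimeError as err:
    print(err)

# Float multimodal features should skip the embedding table entirely and be
# passed as inputs_embeds (hypothetical call, shown for shape only):
# fused = torch.cat([audio_embeds, image_embeds, embed_tokens(token_ids)], dim=1)
# outputs = model(inputs_embeds=fused)
```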
Experiments/pretrain_data_check.ipynb ADDED
@@ -0,0 +1,304 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 5,
6
+ "id": "61c272f2-edbe-4b7d-8fec-3ab431400cd3",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import json"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 2,
16
+ "id": "e9dfd7d7-1685-4fc7-bbb9-3905c32d8ba1",
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": [
20
+ "with open(\"metadata.json\", \"rb\") as f:\n",
21
+ " metadata = json.load(f)"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": 4,
27
+ "id": "70bdba48-db01-42ac-8d89-edc69d7d7672",
28
+ "metadata": {},
29
+ "outputs": [
30
+ {
31
+ "data": {
32
+ "text/plain": [
33
+ "595375"
34
+ ]
35
+ },
36
+ "execution_count": 4,
37
+ "metadata": {},
38
+ "output_type": "execute_result"
39
+ }
40
+ ],
41
+ "source": [
42
+ "len(metadata)"
43
+ ]
44
+ },
45
+ {
46
+ "cell_type": "code",
47
+ "execution_count": 14,
48
+ "id": "59e193cc-0dd8-4f7e-959a-fbad0133d76c",
49
+ "metadata": {},
50
+ "outputs": [],
51
+ "source": [
52
+ "with open(\"blip_laion_cc_sbu_558k.jsonblip_laion_cc_sbu_558k.json\", \"rb\") as f:\n",
53
+ " data = json.load(f)"
54
+ ]
55
+ },
56
+ {
57
+ "cell_type": "code",
58
+ "execution_count": 7,
59
+ "id": "f3157f41-269b-4f7a-b3ba-9be711babe02",
60
+ "metadata": {},
61
+ "outputs": [
62
+ {
63
+ "data": {
64
+ "text/plain": [
65
+ "{'id': '004539375',\n",
66
+ " 'image': '00453/004539375.jpg',\n",
67
+ " 'conversations': [{'from': 'human',\n",
68
+ " 'value': 'Render a clear and concise summary of the photo.\\n<image>'},\n",
69
+ " {'from': 'gpt',\n",
70
+ " 'value': 'select luxury furniture 3 - inch gel memory foam mattress topper'}]}"
71
+ ]
72
+ },
73
+ "execution_count": 7,
74
+ "metadata": {},
75
+ "output_type": "execute_result"
76
+ }
77
+ ],
78
+ "source": [
79
+ "data[0]"
80
+ ]
81
+ },
82
+ {
83
+ "cell_type": "code",
84
+ "execution_count": 8,
85
+ "id": "50d8a051-1526-47dd-ad71-d3c66f7bd34e",
86
+ "metadata": {},
87
+ "outputs": [
88
+ {
89
+ "data": {
90
+ "text/plain": [
91
+ "{'id': '004374662',\n",
92
+ " 'image': '00437/004374662.jpg',\n",
93
+ " 'conversations': [{'from': 'human',\n",
94
+ " 'value': 'Give a brief description of the image.\\n<image>'},\n",
95
+ " {'from': 'gpt', 'value': 'the north face duffel bag camo large'}]}"
96
+ ]
97
+ },
98
+ "execution_count": 8,
99
+ "metadata": {},
100
+ "output_type": "execute_result"
101
+ }
102
+ ],
103
+ "source": [
104
+ "data[234]"
105
+ ]
106
+ },
107
+ {
108
+ "cell_type": "code",
109
+ "execution_count": 17,
110
+ "id": "2e6d5664-4583-49a6-93cc-079ee2d1ff6c",
111
+ "metadata": {},
112
+ "outputs": [
113
+ {
114
+ "data": {
115
+ "text/plain": [
116
+ "558128"
117
+ ]
118
+ },
119
+ "execution_count": 17,
120
+ "metadata": {},
121
+ "output_type": "execute_result"
122
+ }
123
+ ],
124
+ "source": [
125
+ "len(data)"
126
+ ]
127
+ },
128
+ {
129
+ "cell_type": "code",
130
+ "execution_count": 10,
131
+ "id": "11ed106d-6bef-482c-a456-5eaaf2025534",
132
+ "metadata": {},
133
+ "outputs": [
134
+ {
135
+ "data": {
136
+ "text/plain": [
137
+ "{'id': 'GCC_train_001749371',\n",
138
+ " 'image': 'GCC_train_001749371.jpg',\n",
139
+ " 'caption': 'if you are dreaming of simpler or off - the - grid living , a yurt is a fantastic option',\n",
140
+ " 'blip_caption': 'a white and tan yurt sitting on a dirt road',\n",
141
+ " 'url': 'https://i.pinimg.com/736x/14/7b/64/147b64467ee966d9a578097bb70475ad--yurt-kits-small-space-living.jpg'}"
142
+ ]
143
+ },
144
+ "execution_count": 10,
145
+ "metadata": {},
146
+ "output_type": "execute_result"
147
+ }
148
+ ],
149
+ "source": [
150
+ "metadata[67]"
151
+ ]
152
+ },
153
+ {
154
+ "cell_type": "code",
155
+ "execution_count": 15,
156
+ "id": "ce8adcec-2499-4be3-be1d-7313fe54e96a",
157
+ "metadata": {},
158
+ "outputs": [
159
+ {
160
+ "data": {
161
+ "text/plain": [
162
+ "{'id': '000466761',\n",
163
+ " 'image': '00046/000466761.jpg',\n",
164
+ " 'conversations': [{'from': 'human',\n",
165
+ " 'value': '<image>\\nProvide a brief description of the given image.'},\n",
166
+ " {'from': 'gpt',\n",
167
+ " 'value': 'a clipboard and a pen with the words public health emergency next to it on a white table'}]}"
168
+ ]
169
+ },
170
+ "execution_count": 15,
171
+ "metadata": {},
172
+ "output_type": "execute_result"
173
+ }
174
+ ],
175
+ "source": [
176
+ "data[67]"
177
+ ]
178
+ },
179
+ {
180
+ "cell_type": "code",
181
+ "execution_count": 16,
182
+ "id": "068313b6-6379-4ca2-892c-682634d3581e",
183
+ "metadata": {},
184
+ "outputs": [
185
+ {
186
+ "data": {
187
+ "text/plain": [
188
+ "list"
189
+ ]
190
+ },
191
+ "execution_count": 16,
192
+ "metadata": {},
193
+ "output_type": "execute_result"
194
+ }
195
+ ],
196
+ "source": [
197
+ "type(data)"
198
+ ]
199
+ },
200
+ {
201
+ "cell_type": "code",
202
+ "execution_count": 24,
203
+ "id": "9ec33b51-4a0b-4a1e-81f7-2fda7cddb25f",
204
+ "metadata": {},
205
+ "outputs": [],
206
+ "source": [
207
+ "sample_data = data[:200000]"
208
+ ]
209
+ },
210
+ {
211
+ "cell_type": "code",
212
+ "execution_count": 25,
213
+ "id": "095685e5-40f1-4d84-8280-ef74fa56c5a2",
214
+ "metadata": {},
215
+ "outputs": [
216
+ {
217
+ "data": {
218
+ "text/plain": [
219
+ "200000"
220
+ ]
221
+ },
222
+ "execution_count": 25,
223
+ "metadata": {},
224
+ "output_type": "execute_result"
225
+ }
226
+ ],
227
+ "source": [
228
+ "len(sample_data)"
229
+ ]
230
+ },
231
+ {
232
+ "cell_type": "code",
233
+ "execution_count": 26,
234
+ "id": "ffbad552-23fd-475f-8e9a-7118bcc4f51e",
235
+ "metadata": {},
236
+ "outputs": [],
237
+ "source": [
238
+ "with open(\"llava-phi/pretrain_data/blip_sample.json\", \"w\") as f:\n",
239
+ " json.dump(sample_data, f)"
240
+ ]
241
+ },
242
+ {
243
+ "cell_type": "code",
244
+ "execution_count": 27,
245
+ "id": "69a05d25-6f3b-40c0-a3b5-e185ff526471",
246
+ "metadata": {},
247
+ "outputs": [],
248
+ "source": [
249
+ "with open(\"llava-phi/pretrain_data/blip_sample.json\", \"rb\") as f:\n",
250
+ " sample = json.load(f)"
251
+ ]
252
+ },
253
+ {
254
+ "cell_type": "code",
255
+ "execution_count": 28,
256
+ "id": "200eea06-dfd6-4b3a-bb91-82af7d363951",
257
+ "metadata": {},
258
+ "outputs": [
259
+ {
260
+ "data": {
261
+ "text/plain": [
262
+ "200000"
263
+ ]
264
+ },
265
+ "execution_count": 28,
266
+ "metadata": {},
267
+ "output_type": "execute_result"
268
+ }
269
+ ],
270
+ "source": [
271
+ "len(sample)"
272
+ ]
273
+ },
274
+ {
275
+ "cell_type": "code",
276
+ "execution_count": null,
277
+ "id": "f86caa1e-edea-4a9c-934f-5420ede80d0d",
278
+ "metadata": {},
279
+ "outputs": [],
280
+ "source": []
281
+ }
282
+ ],
283
+ "metadata": {
284
+ "kernelspec": {
285
+ "display_name": "Python 3 (ipykernel)",
286
+ "language": "python",
287
+ "name": "python3"
288
+ },
289
+ "language_info": {
290
+ "codemirror_mode": {
291
+ "name": "ipython",
292
+ "version": 3
293
+ },
294
+ "file_extension": ".py",
295
+ "mimetype": "text/x-python",
296
+ "name": "python",
297
+ "nbconvert_exporter": "python",
298
+ "pygments_lexer": "ipython3",
299
+ "version": "3.10.12"
300
+ }
301
+ },
302
+ "nbformat": 4,
303
+ "nbformat_minor": 5
304
+ }
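The notebook above loads `blip_laion_cc_sbu_558k.json` (558,128 records), keeps the first 200,000, and writes them to `llava-phi/pretrain_data/blip_sample.json`. The same subsetting step as a compact, standalone sketch; the seeded random variant in the comments is an optional alternative to slicing (it avoids any ordering bias in the source file), not what the notebook actually does:

```python
import json

SRC = "blip_laion_cc_sbu_558k.json"
DST = "llava-phi/pretrain_data/blip_sample.json"
N = 200_000

with open(SRC, "r") as f:
    data = json.load(f)                 # list of {'id', 'image', 'conversations'} records

sample_data = data[:N]                  # notebook behaviour: first N records

# Optional alternative: a reproducible random subset instead of the head of the file.
# import random
# random.seed(42)
# sample_data = random.sample(data, N)

with open(DST, "w") as f:
    json.dump(sample_data, f)

print(len(sample_data))                 # 200000
```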
Experiments/whispher_exp.ipynb ADDED
@@ -0,0 +1,500 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 9,
6
+ "id": "bb4dd66b-0c17-48d4-9d34-f48cece2feb5",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "# !pip install soundfile\n",
11
+ "# !pip install librosa"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 1,
17
+ "id": "6e9386ea-4862-4f5b-a02f-d656e1a5ab9e",
18
+ "metadata": {},
19
+ "outputs": [],
20
+ "source": [
21
+ "from transformers import WhisperProcessor, WhisperForConditionalGeneration\n",
22
+ "from datasets import load_dataset"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": 2,
28
+ "id": "914ab2b4-389d-4c48-8d1d-1250356646ac",
29
+ "metadata": {},
30
+ "outputs": [],
31
+ "source": [
32
+ "# load model and processor\n",
33
+ "processor = WhisperProcessor.from_pretrained(\"openai/whisper-tiny\")\n",
34
+ "model = WhisperForConditionalGeneration.from_pretrained(\"openai/whisper-tiny\")\n",
35
+ "model.config.forced_decoder_ids = None\n",
36
+ "\n",
37
+ "# load dummy dataset and read audio files\n",
38
+ "ds = load_dataset(\"hf-internal-testing/librispeech_asr_dummy\", \"clean\", split=\"validation\")\n",
39
+ "sample = ds[0][\"audio\"]"
40
+ ]
41
+ },
42
+ {
43
+ "cell_type": "code",
44
+ "execution_count": 3,
45
+ "id": "2b299bab-1228-48d9-a8a5-3d5b6c52162d",
46
+ "metadata": {},
47
+ "outputs": [
48
+ {
49
+ "data": {
50
+ "text/plain": [
51
+ "{'path': '/home/ravi.naik/.cache/huggingface/datasets/downloads/extracted/431c2c946d216530b2666a0e7ffa5ac3f5b3da89dd28858a9de6c78fae7caa4a/dev_clean/1272/128104/1272-128104-0000.flac',\n",
52
+ " 'array': array([0.00238037, 0.0020752 , 0.00198364, ..., 0.00042725, 0.00057983,\n",
53
+ " 0.0010376 ]),\n",
54
+ " 'sampling_rate': 16000}"
55
+ ]
56
+ },
57
+ "execution_count": 3,
58
+ "metadata": {},
59
+ "output_type": "execute_result"
60
+ }
61
+ ],
62
+ "source": [
63
+ "sample"
64
+ ]
65
+ },
66
+ {
67
+ "cell_type": "code",
68
+ "execution_count": 4,
69
+ "id": "b7e570a1-cf5c-450c-a7b6-49b45a10d2df",
70
+ "metadata": {},
71
+ "outputs": [],
72
+ "source": [
73
+ "input_features = processor(sample[\"array\"], sampling_rate=sample[\"sampling_rate\"], return_tensors=\"pt\").input_features "
74
+ ]
75
+ },
76
+ {
77
+ "cell_type": "code",
78
+ "execution_count": 5,
79
+ "id": "584e920b-a7fd-402d-95dd-3b9128cd34bb",
80
+ "metadata": {},
81
+ "outputs": [],
82
+ "source": [
83
+ "# generate token ids\n",
84
+ "predicted_ids = model.generate(input_features)\n",
85
+ "# decode token ids to text\n",
86
+ "transcription = processor.batch_decode(predicted_ids, skip_special_tokens=False)\n",
87
+ "\n",
88
+ "transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)"
89
+ ]
90
+ },
91
+ {
92
+ "cell_type": "code",
93
+ "execution_count": 6,
94
+ "id": "b27ab660-861b-49d1-81f9-f51cb7f9d8d8",
95
+ "metadata": {},
96
+ "outputs": [
97
+ {
98
+ "data": {
99
+ "text/plain": [
100
+ "[' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.']"
101
+ ]
102
+ },
103
+ "execution_count": 6,
104
+ "metadata": {},
105
+ "output_type": "execute_result"
106
+ }
107
+ ],
108
+ "source": [
109
+ "transcription"
110
+ ]
111
+ },
112
+ {
113
+ "cell_type": "code",
114
+ "execution_count": 3,
115
+ "id": "eca553b8-68f6-493d-b567-3d526b49ae1b",
116
+ "metadata": {},
117
+ "outputs": [],
118
+ "source": [
119
+ "import torch\n",
120
+ "from torch import nn"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "execution_count": 4,
126
+ "id": "c619a4cf-9068-4e4d-8139-e16d15345f4f",
127
+ "metadata": {},
128
+ "outputs": [],
129
+ "source": [
130
+ "from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, AutoTokenizer"
131
+ ]
132
+ },
133
+ {
134
+ "cell_type": "code",
135
+ "execution_count": 5,
136
+ "id": "47d5b1ff-ab0f-4d11-af64-d2fa2be39286",
137
+ "metadata": {},
138
+ "outputs": [
139
+ {
140
+ "name": "stderr",
141
+ "output_type": "stream",
142
+ "text": [
143
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
144
+ ]
145
+ }
146
+ ],
147
+ "source": [
148
+ "model_name = \"microsoft/phi-2\"\n",
149
+ "phi2_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
150
+ "phi2_tokenizer.pad_token = phi2_tokenizer.eos_token"
151
+ ]
152
+ },
153
+ {
154
+ "cell_type": "code",
155
+ "execution_count": 6,
156
+ "id": "0b36b3f0-db5b-4029-9072-0a53bcab315a",
157
+ "metadata": {},
158
+ "outputs": [
159
+ {
160
+ "ename": "NameError",
161
+ "evalue": "name 'transcription' is not defined",
162
+ "output_type": "error",
163
+ "traceback": [
164
+ "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
165
+ "\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
166
+ "Cell \u001b[0;32mIn[6], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m tokens \u001b[38;5;241m=\u001b[39m phi2_tokenizer(\u001b[38;5;241m*\u001b[39m\u001b[43mtranscription\u001b[49m, return_tensors\u001b[38;5;241m=\u001b[39m\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpt\u001b[39m\u001b[38;5;124m\"\u001b[39m, return_attention_mask\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mFalse\u001b[39;00m)\n",
167
+ "\u001b[0;31mNameError\u001b[0m: name 'transcription' is not defined"
168
+ ]
169
+ }
170
+ ],
171
+ "source": [
172
+ "tokens = phi2_tokenizer(*transcription, return_tensors=\"pt\", return_attention_mask=False)"
173
+ ]
174
+ },
175
+ {
176
+ "cell_type": "code",
177
+ "execution_count": 22,
178
+ "id": "91f6d3d3-bb00-434f-a91e-6952375890d0",
179
+ "metadata": {},
180
+ "outputs": [
181
+ {
182
+ "data": {
183
+ "text/plain": [
184
+ "{'input_ids': tensor([[ 1770, 13, 2264, 346, 353, 318, 262, 46329, 286, 262,\n",
185
+ " 3504, 6097, 290, 356, 389, 9675, 284, 7062, 465, 21443,\n",
186
+ " 13]])}"
187
+ ]
188
+ },
189
+ "execution_count": 22,
190
+ "metadata": {},
191
+ "output_type": "execute_result"
192
+ }
193
+ ],
194
+ "source": [
195
+ "tokens"
196
+ ]
197
+ },
198
+ {
199
+ "cell_type": "code",
200
+ "execution_count": 12,
201
+ "id": "533191d9-4b3b-417a-918d-6fe854f24b50",
202
+ "metadata": {},
203
+ "outputs": [
204
+ {
205
+ "name": "stderr",
206
+ "output_type": "stream",
207
+ "text": [
208
+ "A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-2:\n",
209
+ "- configuration_phi.py\n",
210
+ ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n"
211
+ ]
212
+ },
213
+ {
214
+ "data": {
215
+ "application/vnd.jupyter.widget-view+json": {
216
+ "model_id": "2a65a119388b4cb4b123b532176e786e",
217
+ "version_major": 2,
218
+ "version_minor": 0
219
+ },
220
+ "text/plain": [
221
+ "modeling_phi.py: 0%| | 0.00/62.7k [00:00<?, ?B/s]"
222
+ ]
223
+ },
224
+ "metadata": {},
225
+ "output_type": "display_data"
226
+ },
227
+ {
228
+ "name": "stderr",
229
+ "output_type": "stream",
230
+ "text": [
231
+ "A new version of the following files was downloaded from https://huggingface.co/microsoft/phi-2:\n",
232
+ "- modeling_phi.py\n",
233
+ ". Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.\n"
234
+ ]
235
+ },
236
+ {
237
+ "data": {
238
+ "application/vnd.jupyter.widget-view+json": {
239
+ "model_id": "7183811844304c16b72d53fe11098a74",
240
+ "version_major": 2,
241
+ "version_minor": 0
242
+ },
243
+ "text/plain": [
244
+ "Downloading shards: 0%| | 0/2 [00:00<?, ?it/s]"
245
+ ]
246
+ },
247
+ "metadata": {},
248
+ "output_type": "display_data"
249
+ },
250
+ {
251
+ "data": {
252
+ "application/vnd.jupyter.widget-view+json": {
253
+ "model_id": "3e78fe144e8f42139a4d7a1830dbf192",
254
+ "version_major": 2,
255
+ "version_minor": 0
256
+ },
257
+ "text/plain": [
258
+ "Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]"
259
+ ]
260
+ },
261
+ "metadata": {},
262
+ "output_type": "display_data"
263
+ }
264
+ ],
265
+ "source": [
266
+ "bnb_config = BitsAndBytesConfig(\n",
267
+ " load_in_4bit=True,\n",
268
+ " bnb_4bit_quant_type=\"nf4\",\n",
269
+ " bnb_4bit_compute_dtype=torch.float16,\n",
270
+ ")\n",
271
+ "\n",
272
+ "model = AutoModelForCausalLM.from_pretrained(\n",
273
+ " model_name,\n",
274
+ " quantization_config=bnb_config,\n",
275
+ " trust_remote_code=True,\n",
276
+ " device_map=\"cuda:0\"\n",
277
+ ")\n",
278
+ "model.config.use_cache = False"
279
+ ]
280
+ },
281
+ {
282
+ "cell_type": "code",
283
+ "execution_count": 19,
284
+ "id": "155c054a-a00f-4ed5-bfff-1ad64889e7f1",
285
+ "metadata": {},
286
+ "outputs": [
287
+ {
288
+ "data": {
289
+ "text/plain": [
290
+ "[' Mr. Quilter is the apostle of the middle classes and we are glad to welcome his gospel.\\n']"
291
+ ]
292
+ },
293
+ "execution_count": 19,
294
+ "metadata": {},
295
+ "output_type": "execute_result"
296
+ }
297
+ ],
298
+ "source": [
299
+ "phi2_tokenizer.batch_decode(model.generate(**tokens))"
300
+ ]
301
+ },
302
+ {
303
+ "cell_type": "code",
304
+ "execution_count": 7,
305
+ "id": "04f940c9-586d-4937-ae31-cc0f96d33e92",
306
+ "metadata": {},
307
+ "outputs": [],
308
+ "source": [
309
+ "class AudioLanguageConnector:\n",
310
+ " def __init__(self):\n",
311
+ " model_name = \"microsoft/phi-2\"\n",
312
+ " self.phi2_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
313
+ " self.phi2_tokenizer.pad_token = self.phi2_tokenizer.eos_token\n",
314
+ "\n",
315
+ " def __call__(self, text):\n",
316
+ " text = f\"<audio_start> {text} <audio_end>\"\n",
317
+ " tokens = self.phi2_tokenizer(text, return_tensors=\"pt\", return_attention_mask=False)\n",
318
+ " return tokens\n",
319
+ " \n",
320
+ "\n",
321
+ "class WhisperWithProjection:\n",
322
+ " def __init__(self):\n",
323
+ " self.processor = WhisperProcessor.from_pretrained(\"openai/whisper-tiny\")\n",
324
+ " self.model = WhisperForConditionalGeneration.from_pretrained(\"openai/whisper-tiny\")\n",
325
+ " self.model.config.forced_decoder_ids = None\n",
326
+ " self.audio_language_connector = AudioLanguageConnector()\n",
327
+ " \n",
328
+ " def forward(self, audio):\n",
329
+ " input_features = self.processor(audio[\"array\"],\n",
330
+ " sampling_rate=audio[\"sampling_rate\"],\n",
331
+ " return_tensors=\"pt\").input_features\n",
332
+ " # generate token ids\n",
333
+ " predicted_ids = self.model.generate(input_features)\n",
334
+ " # decode token ids to text \n",
335
+ " transcription = self.processor.batch_decode(predicted_ids, skip_special_tokens=True)\n",
336
+ "\n",
337
+ " audio_embeddings = self.audio_language_connector(transcription)\n",
338
+ " return audio_embeddings"
339
+ ]
340
+ },
341
+ {
342
+ "cell_type": "code",
343
+ "execution_count": 8,
344
+ "id": "2b1f8f44-bfe6-413c-9e32-c38fa5517981",
345
+ "metadata": {},
346
+ "outputs": [],
347
+ "source": [
348
+ "class TextModality:\n",
349
+ " def __init__(self):\n",
350
+ " model_name = \"microsoft/phi-2\"\n",
351
+ " self.phi2_tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)\n",
352
+ " self.phi2_tokenizer.pad_token = self.phi2_tokenizer.eos_token\n",
353
+ "\n",
354
+ " def __call__(self, text):\n",
355
+ " tokens = self.phi2_tokenizer(text, return_tensors=\"pt\", return_attention_mask=False)\n",
356
+ " return tokens"
357
+ ]
358
+ },
359
+ {
360
+ "cell_type": "code",
361
+ "execution_count": 15,
362
+ "id": "21c51648-abb6-4bbd-b4c1-509967a69337",
363
+ "metadata": {},
364
+ "outputs": [],
365
+ "source": [
366
+ "class MultiModalPhi2:\n",
367
+ " def __init__(self):\n",
368
+ " self.text_modality = TextModality()\n",
369
+ " self.whisper_w_proj = WhisperWithProjection()\n",
370
+ " self.llm = self.load_llm()\n",
371
+ "\n",
372
+ " def load_llm(self):\n",
373
+ " bnb_config = BitsAndBytesConfig(\n",
374
+ " load_in_4bit=True,\n",
375
+ " bnb_4bit_quant_type=\"nf4\",\n",
376
+ " bnb_4bit_compute_dtype=torch.float16)\n",
377
+ " \n",
378
+ " model = AutoModelForCausalLM.from_pretrained(\n",
379
+ " model_name,\n",
380
+ " quantization_config=bnb_config,\n",
381
+ " trust_remote_code=True,\n",
382
+ " device_map=\"cuda:0\"\n",
383
+ " )\n",
384
+ " model.config.use_cache = False\n",
385
+ " return model\n",
386
+ "\n",
387
+ " def generate(self, audio, text):\n",
388
+ " text_embeddings = self.text_modality(text)\n",
389
+ " audio_embeddings = self.whisper_w_proj.forward(audio)\n",
390
+ " inputs = torch.concat([text_embeddings[\"input_ids\"], audio_embeddings[\"input_ids\"]], dim=1)\n",
391
+ " \n",
392
+ " # outputs = self.llm.generate(inputs, max_length=200)\n",
393
+ " outputs = self.llm(inputs)\n",
394
+ " return outputs\n",
395
+ " \n",
396
+ " # text = self.text_modality.phi2_tokenizer.batch_decode(outputs)[0]\n",
397
+ " # print(text)"
398
+ ]
399
+ },
400
+ {
401
+ "cell_type": "code",
402
+ "execution_count": 16,
403
+ "id": "472a00cb-bae9-4c09-a0ef-bc57881b5e2c",
404
+ "metadata": {},
405
+ "outputs": [
406
+ {
407
+ "name": "stderr",
408
+ "output_type": "stream",
409
+ "text": [
410
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
411
+ "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n"
412
+ ]
413
+ },
414
+ {
415
+ "data": {
416
+ "application/vnd.jupyter.widget-view+json": {
417
+ "model_id": "2236e6b1e26d444fa3d48181ba1a6cf9",
418
+ "version_major": 2,
419
+ "version_minor": 0
420
+ },
421
+ "text/plain": [
422
+ "Loading checkpoint shards: 0%| | 0/2 [00:00<?, ?it/s]"
423
+ ]
424
+ },
425
+ "metadata": {},
426
+ "output_type": "display_data"
427
+ }
428
+ ],
429
+ "source": [
430
+ "multi_modal_phi = MultiModalPhi2()"
431
+ ]
432
+ },
433
+ {
434
+ "cell_type": "code",
435
+ "execution_count": 17,
436
+ "id": "c350f2d3-0929-4c46-b63d-ff92dea437f3",
437
+ "metadata": {},
438
+ "outputs": [
439
+ {
440
+ "data": {
441
+ "text/plain": [
442
+ "CausalLMOutputWithPast(loss={'logits': tensor([[[ 6.9531, 9.9375, 7.0234, ..., 2.0020, 2.0020, 2.0000],\n",
443
+ " [ 8.9062, 12.1172, 7.5977, ..., -1.2012, -1.2012, -1.2012],\n",
444
+ " [ 7.0273, 5.3477, 3.6328, ..., -4.2070, -4.2070, -4.2070],\n",
445
+ " ...,\n",
446
+ " [ 7.0234, 7.4414, 9.1016, ..., 1.0117, 1.0127, 1.0117],\n",
447
+ " [ 9.4531, 10.0391, 9.7578, ..., 0.0776, 0.0775, 0.0764],\n",
448
+ " [ 8.0703, 6.6445, 5.5156, ..., -1.9268, -1.9268, -1.9277]]],\n",
449
+ " grad_fn=<ToCopyBackward0>)}, logits=tensor([[[ 6.9531, 9.9375, 7.0234, ..., 2.0020, 2.0020, 2.0000],\n",
450
+ " [ 8.9062, 12.1172, 7.5977, ..., -1.2012, -1.2012, -1.2012],\n",
451
+ " [ 7.0273, 5.3477, 3.6328, ..., -4.2070, -4.2070, -4.2070],\n",
452
+ " ...,\n",
453
+ " [ 7.0234, 7.4414, 9.1016, ..., 1.0117, 1.0127, 1.0117],\n",
454
+ " [ 9.4531, 10.0391, 9.7578, ..., 0.0776, 0.0775, 0.0764],\n",
455
+ " [ 8.0703, 6.6445, 5.5156, ..., -1.9268, -1.9268, -1.9277]]],\n",
456
+ " grad_fn=<ToCopyBackward0>), past_key_values=None, hidden_states=None, attentions=None)"
457
+ ]
458
+ },
459
+ "execution_count": 17,
460
+ "metadata": {},
461
+ "output_type": "execute_result"
462
+ }
463
+ ],
464
+ "source": [
465
+ "audio = sample\n",
466
+ "text = \"explain about the audio\"\n",
467
+ "multi_modal_phi.generate(audio, text)"
468
+ ]
469
+ },
470
+ {
471
+ "cell_type": "code",
472
+ "execution_count": null,
473
+ "id": "46aa9c66-a5bb-4760-8895-92673f49345f",
474
+ "metadata": {},
475
+ "outputs": [],
476
+ "source": []
477
+ }
478
+ ],
479
+ "metadata": {
480
+ "kernelspec": {
481
+ "display_name": "Python 3 (ipykernel)",
482
+ "language": "python",
483
+ "name": "python3"
484
+ },
485
+ "language_info": {
486
+ "codemirror_mode": {
487
+ "name": "ipython",
488
+ "version": 3
489
+ },
490
+ "file_extension": ".py",
491
+ "mimetype": "text/x-python",
492
+ "name": "python",
493
+ "nbconvert_exporter": "python",
494
+ "pygments_lexer": "ipython3",
495
+ "version": "3.10.12"
496
+ }
497
+ },
498
+ "nbformat": 4,
499
+ "nbformat_minor": 5
500
+ }
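Note that in the experiment above, `WhisperWithProjection` does not yet project anything: it transcribes the audio with whisper-tiny and re-tokenizes the resulting text, and the `<audio_start>`/`<audio_end>` markers are ordinary text rather than registered special tokens. If the eventual design (see the README diagram below) is to feed Whisper's encoder states into Phi-2's embedding space directly, a trainable connector along the following lines is one way to sketch it. This is an assumption about the intended architecture, not code from this repository; the only non-illustrative numbers are the published model widths (384 for whisper-tiny's encoder, 2560 for Phi-2's hidden size):

```python
import torch
from torch import nn
from transformers import WhisperModel, WhisperProcessor


class AudioProjector(nn.Module):
    """Hypothetical connector: Whisper encoder states -> LLM embedding space."""

    def __init__(self, audio_dim: int = 384, llm_dim: int = 2560):
        super().__init__()
        self.proj = nn.Sequential(
            nn.Linear(audio_dim, llm_dim),
            nn.GELU(),
            nn.Linear(llm_dim, llm_dim),
        )

    def forward(self, encoder_states: torch.Tensor) -> torch.Tensor:
        # (batch, frames, 384) -> (batch, frames, 2560)
        return self.proj(encoder_states)


processor = WhisperProcessor.from_pretrained("openai/whisper-tiny")
whisper = WhisperModel.from_pretrained("openai/whisper-tiny")
projector = AudioProjector()


def audio_to_llm_embeds(audio_array, sampling_rate=16000):
    feats = processor(audio_array, sampling_rate=sampling_rate,
                      return_tensors="pt").input_features
    with torch.no_grad():
        enc = whisper.encoder(feats).last_hidden_state   # (1, 1500, 384) for a 30s padded clip
    return projector(enc)                                # (1, 1500, 2560)
```

The projected tensor could then be concatenated with the text token embeddings (via `model.get_input_embeddings()(input_ids)`) and passed to Phi-2 as `inputs_embeds`, instead of concatenating token ids as the notebook's `generate` method does.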
README.md CHANGED
@@ -1,13 +1,44 @@
1
  ---
2
  title: MultiModal Phi2
3
- emoji: 😻
4
- colorFrom: gray
5
- colorTo: purple
6
  sdk: gradio
7
- sdk_version: 4.16.0
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
  ---
 
 
 
 
 
 
 
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  ---
2
  title: MultiModal Phi2
3
+ emoji: 🚀
4
+ colorFrom: blue
5
+ colorTo: red
6
  sdk: gradio
7
+ sdk_version: 3.35.2
8
  app_file: app.py
9
  pinned: false
10
  license: mit
11
  ---
12
+ ## Phi2: Multimodal Finetuning
13
+ ### Details
14
+ 1. LLM Backbone: Phi2
15
+ 2. Vision Tower: clip-vit-large-patch14-336
16
+ 3. Audio Model: Whisper
17
+ 4. Pretraining Dataset: LAION-CC-SBU dataset with BLIP captions (200k samples)
18
+ 5. Finetuning Dataset: Instruct 150k dataset based on COCO
19
 
20
+ ### Design
21
+ ![image](https://github.com/RaviNaik/ERA-CAPSTONE/assets/23289802/56df24cd-2681-4e17-ab64-9652f609b15f)
22
+
23
+ ### Pretraining
24
+ #### Training Loss Curve
25
+ ![image](https://github.com/RaviNaik/ERA-CAPSTONE/assets/23289802/b6c37a95-0a56-4b52-8719-3ff56dc1b703)
26
+
27
+ #### Learning Rate
28
+ ![image](https://github.com/RaviNaik/ERA-CAPSTONE/assets/23289802/44d9a11b-b28d-47e1-ba1d-d6dc22ebe748)
29
+
30
+ #### Training Logs
31
+ ![image](https://github.com/RaviNaik/ERA-CAPSTONE/assets/23289802/76543d98-d9fe-4c1a-ac47-3d06e48053ad)
32
+
33
+ ### Finetuning
34
+ #### Training Loss Curve
35
+ ![image](https://github.com/RaviNaik/ERA-CAPSTONE/assets/23289802/45ef40bd-fae5-4cfe-a522-c0eed2833230)
36
+
37
+ #### Learning Rate
38
+ ![image](https://github.com/RaviNaik/ERA-CAPSTONE/assets/23289802/df60ee62-a537-4e36-a7f7-f7111e101162)
39
+
40
+ #### Training Logs
41
+ ![image](https://github.com/RaviNaik/ERA-CAPSTONE/assets/23289802/2747acce-bc99-4c37-a05a-d5e81cb9aa9d)
42
+
43
+ ### Results
44
+ ![image](https://github.com/RaviNaik/ERA-CAPSTONE/assets/23289802/f12a9f04-df32-413e-b957-774c30381b2b)
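For programmatic use outside Gradio, the finetuned checkpoint `RaviNaik/Llava-Phi2` is driven through the repository's own `inference.main.MultiModalPhi2` wrapper, exactly as `app.py` below does. A minimal usage sketch that mirrors that constructor and call signature (the wrapper is local to this repo, not a library API; the file names are placeholders):

```python
from PIL import Image
from inference.main import MultiModalPhi2   # local wrapper, same import as app.py

model = MultiModalPhi2(
    modelname_or_path="RaviNaik/Llava-Phi2",
    temperature=0.2,
    max_new_tokens=1024,
    device="cpu",
)

image = Image.open("example.jpg")   # placeholder image path
audio = "example.mp3"               # placeholder audio path (app.py passes file paths)
text = "Describe what is happening in the image and audio."

answer = model(text, audio, image)  # argument order as used in app.py's run()
print(answer)
```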
app.py ADDED
@@ -0,0 +1,128 @@
1
+ import gradio as gr
2
+ from PIL import Image
3
+ from inference.main import MultiModalPhi2
4
+
5
+ messages = []
6
+
7
+ multimodal_phi2 = MultiModalPhi2(
8
+ modelname_or_path="RaviNaik/Llava-Phi2",
9
+ temperature=0.2,
10
+ max_new_tokens=1024,
11
+ device="cpu",
12
+ )
13
+
14
+
15
+ def add_content(chatbot, text, image, audio_upload, audio_mic) -> gr.Chatbot:
16
+ textflag, imageflag, audioflag = False, False, False
17
+ if text not in ["", None]:
18
+ chatbot.append((text, None))
19
+ textflag = True
20
+ if image is not None:
21
+ chatbot.append(((image,), None))
22
+ imageflag = True
23
+ if audio_mic is not None:
24
+ chatbot.append(((audio_mic,), None))
25
+ audioflag = True
26
+ else:
27
+ if audio_upload is not None:
28
+ chatbot.append(((audio_upload,), None))
29
+ audioflag = True
30
+ if not any([textflag, imageflag, audioflag]):
31
+ # Raise an error if neither text nor file is provided
32
+ raise gr.Error("Enter a valid text, image or audio")
33
+ return chatbot
34
+
35
+
36
+ def clear_data():
37
+ return {prompt: None, image: None, audio_upload: None, audio_mic: None, chatbot: []}
38
+
39
+
40
+ def run(history, text, image, audio_upload, audio_mic):
41
+ if text in [None, ""]:
42
+ text = None
43
+
44
+ if audio_upload is not None:
45
+ audio = audio_upload
46
+ elif audio_mic is not None:
47
+ audio = audio_mic
48
+ else:
49
+ audio = None
50
+
51
+ print("text", text)
52
+ print("image", image)
53
+ print("audio", audio)
54
+
55
+ if image is not None:
56
+ image = Image.open(image)
57
+ outputs = multimodal_phi2(text, audio, image)
58
+ # outputs = ""
59
+
60
+ history.append((None, outputs.title()))
61
+ return history, None, None, None, None
62
+
63
+
64
+ with gr.Blocks() as demo:
65
+ gr.Markdown("## MulitModal Phi2 Model Pretraining and Finetuning from Scratch")
66
+ gr.Markdown(
67
+ """This is a multimodal implementation of [Phi2](https://huggingface.co/microsoft/phi-2) model.
68
+
69
+ Please find the source code and training details [here](https://github.com/RaviNaik/ERA-CAPSTONE/MultiModalPhi2).
70
+
71
+ ### Details:
72
+ 1. LLM Backbone: [Phi2](https://huggingface.co/microsoft/phi-2)
73
+ 2. Vision Tower: [clip-vit-large-patch14-336](https://huggingface.co/openai/clip-vit-large-patch14-336)
74
+ 3. Audio Model: [Whisper Tiny](https://huggingface.co/openai/whisper-tiny)
75
+ 4. Pretraining Dataset: [LAION-CC-SBU dataset with BLIP captions(200k samples)](https://huggingface.co/datasets/liuhaotian/LLaVA-Pretrain)
76
+ 5. Finetuning Dataset: [Instruct 150k dataset based on COCO](https://huggingface.co/datasets/liuhaotian/LLaVA-Instruct-150K)
77
+ 6. Finetuned Model: [RaviNaik/Llava-Phi2](https://huggingface.co/RaviNaik/Llava-Phi2)
78
+ """
79
+ )
80
+ with gr.Row():
81
+ with gr.Column(scale=4):
82
+ # Creating a column with a scale of 6
83
+ with gr.Box():
84
+ with gr.Row():
85
+ # Adding a Textbox with a placeholder "write prompt"
86
+ prompt = gr.Textbox(
87
+ placeholder="Enter Prompt", lines=2, label="Query", value=None
88
+ )
89
+ # Creating a column with a scale of 2
90
+ with gr.Row():
91
+ # Adding image
92
+ image = gr.Image(type="filepath", value=None)
93
+ # Creating a column with a scale of 2
94
+ with gr.Row():
95
+ # Add audio
96
+ audio_upload = gr.Audio(source="upload", type="filepath")
97
+ audio_mic = gr.Audio(
98
+ source="microphone", type="filepath", format="mp3"
99
+ )
100
+
101
+ with gr.Column(scale=8):
102
+ with gr.Box():
103
+ with gr.Row():
104
+ chatbot = gr.Chatbot(
105
+ avatar_images=("🧑", "🤖"),
106
+ height=550,
107
+ )
108
+ with gr.Row():
109
+ # Adding a Button
110
+ submit = gr.Button()
111
+ clear = gr.Button(value="Clear")
112
+
113
+ submit.click(
114
+ add_content,
115
+ inputs=[chatbot, prompt, image, audio_upload, audio_mic],
116
+ outputs=[chatbot],
117
+ ).success(
118
+ run,
119
+ inputs=[chatbot, prompt, image, audio_upload, audio_mic],
120
+ outputs=[chatbot, prompt, image, audio_upload, audio_mic],
121
+ )
122
+
123
+ clear.click(
124
+ clear_data,
125
+ outputs=[prompt, image, audio_upload, audio_mic, chatbot],
126
+ )
127
+
128
+ demo.launch()
requirements.txt ADDED
@@ -0,0 +1,21 @@
1
+ einops==0.6.1
2
+ einops-exts==0.0.4
3
+ timm==0.6.13
4
+ gradio==3.35.2
5
+ gradio_client==0.2.9
6
+ markdown2[all]
7
+ numpy
8
+ requests
9
+ tokenizers==0.15.0
10
+ torch==2.0.1
11
+ shortuuid
12
+ httpx==0.24.0
13
+ deepspeed==0.9.5
14
+ peft==0.4.0
15
+ transformers==4.36.2
16
+ accelerate==0.21.0
17
+ bitsandbytes==0.41.0
18
+ scikit-learn==1.2.2
19
+ sentencepiece==0.1.99
20
+ librosa
21
+ soundfile