File size: 3,829 Bytes

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "application/vnd.jupyter.widget-view+json": {
       "model_id": "50ddeec2e802423ebf210b0264d2f222",
       "version_major": 2,
       "version_minor": 0
      },
      "text/plain": [
       "Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/opt/anaconda3/lib/python3.12/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.\n",
      "  warn(\"The installed version of bitsandbytes was compiled without GPU support. \"\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "'NoneType' object has no attribute 'cadam32bit_grad_fp32'\n"
     ]
    }
   ],
   "source": [
    "\n",
    "from transformers import AutoProcessor, Idefics3ForConditionalGeneration, image_utils\n",
    "import torch\n",
    "device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')\n",
    "model_id=\"eltorio/IDEFICS3_ROCO\"\n",
    "# model = AutoModelForImageTextToText.from_pretrained(model_id).to(device)\n",
    "base_model_path=\"HuggingFaceM4/Idefics3-8B-Llama3\" #or change to local path\n",
    "processor = AutoProcessor.from_pretrained(base_model_path)\n",
    "model = Idefics3ForConditionalGeneration.from_pretrained(\n",
    "        base_model_path, torch_dtype=torch.bfloat16\n",
    "    ).to(device)\n",
    "\n",
    "model.load_adapter(model_id)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "from transformers import image_utils\n",
    "image = image_utils.load_image('https://github.com/sctg-development/ROCOv2-radiology/blob/main/source_dataset/test/ROCOv2_2023_test_000005.jpg?raw=true')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "messages = [\n",
    "    {\n",
    "        \"role\": \"user\",\n",
    "        \"content\": [\n",
    "            {\"type\": \"image\"},\n",
    "            {\"type\": \"text\", \"text\": \"What do we see in this image?\"},\n",
    "        ]\n",
    "    },\n",
    "]\n",
    "prompt = processor.apply_chat_template(messages, add_generation_prompt=True)\n",
    "inputs = processor(text=prompt, images=[image], return_tensors=\"pt\")\n",
    "inputs = {k: v.to(device) for k, v in inputs.items()}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['User:<image>What do we see in this image?\\nAssistant:  Buccal and palatal cortical bone thickness measurements.\\n']\n"
     ]
    }
   ],
   "source": [
    "# Generate\n",
    "generated_ids = model.generate(**inputs, max_new_tokens=500)\n",
    "generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)\n",
    "\n",
    "print(generated_texts)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}