eltorio
/

IDEFICS3_ROCO

Image-Text-to-Text

PEFT

Safetensors

English

Model card | Files and versions | Community

æLtorio committed on Nov 13, 2024

Commit

0bd0594

unverified ·

1 Parent(s): 178fec8

add sample

Browse files

Files changed (1) hide show

ROCO-idefics3-test.ipynb +118 -0

ROCO-idefics3-test.ipynb ADDED Viewed

	@@ -0,0 +1,118 @@

+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "application/vnd.jupyter.widget-view+json": {
+       "model_id": "50ddeec2e802423ebf210b0264d2f222",
+       "version_major": 2,
+       "version_minor": 0
+      },
+      "text/plain": [
+       "Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]"
+      ]
+     },
+     "metadata": {},
+     "output_type": "display_data"
+    },
+    {
+     "name": "stderr",
+     "output_type": "stream",
+     "text": [
+      "/opt/anaconda3/lib/python3.12/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.\n",
+      "  warn(\"The installed version of bitsandbytes was compiled without GPU support. \"\n"
+     ]
+    },
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "'NoneType' object has no attribute 'cadam32bit_grad_fp32'\n"
+     ]
+    }
+   ],
+   "source": [
+    "from transformers import AutoProcessor, Idefics3ForConditionalGeneration, image_utils\n",
+    "import torch\n",
+    "device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')\n",
+    "model_id=\"eltorio/IDEFICS3_ROCO\"\n",
+    "# model = AutoModelForImageTextToText.from_pretrained(model_id).to(device)\n",
+    "base_model_path=\"HuggingFaceM4/Idefics3-8B-Llama3\" #or change to local path\n",
+    "processor = AutoProcessor.from_pretrained(base_model_path)\n",
+    "model = Idefics3ForConditionalGeneration.from_pretrained(\n",
+    "        base_model_path, torch_dtype=torch.bfloat16\n",
+    "    ).to(device)\n",
+    "\n",
+    "model.load_adapter(model_id)\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 2,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from transformers import image_utils\n",
+    "image = image_utils.load_image('https://github.com/sctg-development/ROCOv2-radiology/blob/main/source_dataset/test/ROCOv2_2023_test_000005.jpg?raw=true')"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 3,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "messages = [\n",
+    "    {\n",
+    "        \"role\": \"user\",\n",
+    "        \"content\": [\n",
+    "            {\"type\": \"image\"},\n",
+    "            {\"type\": \"text\", \"text\": \"What do we see in this image?\"},\n",
+    "        ]\n",
+    "    },\n",
+    "]\n",
+    "prompt = processor.apply_chat_template(messages, add_generation_prompt=True)\n",
+    "inputs = processor(text=prompt, images=[image], return_tensors=\"pt\")\n",
+    "inputs = {k: v.to(device) for k, v in inputs.items()}"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Generate\n",
+    "generated_ids = model.generate(**inputs, max_new_tokens=500)\n",
+    "generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)\n",
+    "\n",
+    "print(generated_texts)"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.12.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}