Image-Text-to-Text
PEFT
Safetensors
English
æLtorio committed on
Commit
0bd0594
1 Parent(s): 178fec8

add sample

Browse files
Files changed (1) hide show
  1. ROCO-idefics3-test.ipynb +118 -0
ROCO-idefics3-test.ipynb ADDED
@@ -0,0 +1,118 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "metadata": {},
7
+ "outputs": [
8
+ {
9
+ "data": {
10
+ "application/vnd.jupyter.widget-view+json": {
11
+ "model_id": "50ddeec2e802423ebf210b0264d2f222",
12
+ "version_major": 2,
13
+ "version_minor": 0
14
+ },
15
+ "text/plain": [
16
+ "Loading checkpoint shards: 0%| | 0/4 [00:00<?, ?it/s]"
17
+ ]
18
+ },
19
+ "metadata": {},
20
+ "output_type": "display_data"
21
+ },
22
+ {
23
+ "name": "stderr",
24
+ "output_type": "stream",
25
+ "text": [
26
+ "/opt/anaconda3/lib/python3.12/site-packages/bitsandbytes/cextension.py:34: UserWarning: The installed version of bitsandbytes was compiled without GPU support. 8-bit optimizers, 8-bit multiplication, and GPU quantization are unavailable.\n",
27
+ " warn(\"The installed version of bitsandbytes was compiled without GPU support. \"\n"
28
+ ]
29
+ },
30
+ {
31
+ "name": "stdout",
32
+ "output_type": "stream",
33
+ "text": [
34
+ "'NoneType' object has no attribute 'cadam32bit_grad_fp32'\n"
35
+ ]
36
+ }
37
+ ],
38
+ "source": [
39
+ "from transformers import AutoProcessor, Idefics3ForConditionalGeneration, image_utils\n",
40
+ "import torch\n",
41
+ "device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')\n",
42
+ "model_id=\"eltorio/IDEFICS3_ROCO\"\n",
43
+ "# model = AutoModelForImageTextToText.from_pretrained(model_id).to(device)\n",
44
+ "base_model_path=\"HuggingFaceM4/Idefics3-8B-Llama3\" #or change to local path\n",
45
+ "processor = AutoProcessor.from_pretrained(base_model_path)\n",
46
+ "model = Idefics3ForConditionalGeneration.from_pretrained(\n",
47
+ " base_model_path, torch_dtype=torch.bfloat16\n",
48
+ " ).to(device)\n",
49
+ "\n",
50
+ "model.load_adapter(model_id)\n"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": 2,
56
+ "metadata": {},
57
+ "outputs": [],
58
+ "source": [
59
+ "from transformers import image_utils\n",
60
+ "image = image_utils.load_image('https://github.com/sctg-development/ROCOv2-radiology/blob/main/source_dataset/test/ROCOv2_2023_test_000005.jpg?raw=true')"
61
+ ]
62
+ },
63
+ {
64
+ "cell_type": "code",
65
+ "execution_count": 3,
66
+ "metadata": {},
67
+ "outputs": [],
68
+ "source": [
69
+ "messages = [\n",
70
+ " {\n",
71
+ " \"role\": \"user\",\n",
72
+ " \"content\": [\n",
73
+ " {\"type\": \"image\"},\n",
74
+ " {\"type\": \"text\", \"text\": \"What do we see in this image?\"},\n",
75
+ " ]\n",
76
+ " },\n",
77
+ "]\n",
78
+ "prompt = processor.apply_chat_template(messages, add_generation_prompt=True)\n",
79
+ "inputs = processor(text=prompt, images=[image], return_tensors=\"pt\")\n",
80
+ "inputs = {k: v.to(device) for k, v in inputs.items()}"
81
+ ]
82
+ },
83
+ {
84
+ "cell_type": "code",
85
+ "execution_count": null,
86
+ "metadata": {},
87
+ "outputs": [],
88
+ "source": [
89
+ "# Generate\n",
90
+ "generated_ids = model.generate(**inputs, max_new_tokens=500)\n",
91
+ "generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)\n",
92
+ "\n",
93
+ "print(generated_texts)"
94
+ ]
95
+ }
96
+ ],
97
+ "metadata": {
98
+ "kernelspec": {
99
+ "display_name": "base",
100
+ "language": "python",
101
+ "name": "python3"
102
+ },
103
+ "language_info": {
104
+ "codemirror_mode": {
105
+ "name": "ipython",
106
+ "version": 3
107
+ },
108
+ "file_extension": ".py",
109
+ "mimetype": "text/x-python",
110
+ "name": "python",
111
+ "nbconvert_exporter": "python",
112
+ "pygments_lexer": "ipython3",
113
+ "version": "3.12.7"
114
+ }
115
+ },
116
+ "nbformat": 4,
117
+ "nbformat_minor": 2
118
+ }