Leyo committed on
Commit
34d389e
1 Parent(s): f9f9d0b
Files changed (1) hide show
  1. app_dialogue.py +64 -125
app_dialogue.py CHANGED
@@ -2,7 +2,11 @@ import os
2
  import subprocess
3
 
4
  # Install flash attention
5
- subprocess.run('pip install flash-attn --no-build-isolation', env={'FLASH_ATTENTION_SKIP_CUDA_BUILD': "TRUE"}, shell=True)
 
 
 
 
6
 
7
 
8
  import copy
@@ -28,127 +32,33 @@ from transformers import Idefics2ForConditionalGeneration
28
 
29
  DEVICE = torch.device("cuda")
30
  MODELS = {
31
- # "idefics2-8b (sft)": Idefics2ForConditionalGeneration.from_pretrained(
32
- # "HuggingFaceM4/idefics2-8b",
33
- # torch_dtype=torch.bfloat16,
34
- # _attn_implementation="flash_attention_2",
35
- # trust_remote_code=True,
36
- # token=os.environ["HF_AUTH_TOKEN"],
37
- # ).to(DEVICE),
38
- "idefics2-8b-chatty (chat-600)": Idefics2ForConditionalGeneration.from_pretrained(
39
  "HuggingFaceM4/idefics2-8b-chatty",
40
  torch_dtype=torch.bfloat16,
41
  _attn_implementation="flash_attention_2",
42
  trust_remote_code=True,
43
  token=os.environ["HF_AUTH_TOKEN"],
44
- revision="bb460e58294bcb02430df9fd126b3c522f867d83"
45
  ).to(DEVICE),
46
- # "idefics2-8b-chatty (chat-50)": Idefics2ForConditionalGeneration.from_pretrained(
47
- # "HuggingFaceM4/idefics2-8b-chatty",
48
- # torch_dtype=torch.bfloat16,
49
- # _attn_implementation="flash_attention_2",
50
- # trust_remote_code=True,
51
- # token=os.environ["HF_AUTH_TOKEN"],
52
- # revision="1d57ffb705199370f7875667cc8f54abd09b2004"
53
- # ).to(DEVICE),
54
  }
55
  PROCESSOR = AutoProcessor.from_pretrained(
56
  "HuggingFaceM4/idefics2-8b",
57
  token=os.environ["HF_AUTH_TOKEN"],
58
  )
59
 
60
- # SYSTEM_PROMPT = [ # Deactivating the system prompt for now, but if I were to reactivate it, I would need to a/ transform turns into dict for applying the chat template, b/ manually overwrite the `default_template` to add the first line (that is not part of any turns), in particular for handling the bos_token.
61
- # # """The following is a conversation between a highly knowledgeable and intelligent visual AI assistant, called Assistant, and a human user, called User. In the following interactions, User and Assistant will converse in natural language, and Assistant will do its best to answer User’s questions. Assistant has the ability to perceive images and reason about the content of visual inputs. Assistant was built to be respectful, polite and inclusive. It knows a lot, and always tells the truth. When prompted with an image, it does not make up facts.
62
-
63
- # The conversation begins:""",
64
- # """\nUser:""",
65
- # "https://huggingface.co/spaces/HuggingFaceM4/idefics_playground/resolve/main/example_images/kittens-cats-pet-cute-preview.jpg?download=true",
66
- # "Describe this image.<end_of_utterance>",
67
- # """\nAssistant: Five kittens are standing together in the center of the photograph. From the left to right, there is one orange kitten, two kittens white and black stripes, and two brown kittens with an orange head. They are in bright green grass and it looks like they are moving forward.<end_of_utterance>""",
68
- # "\nUser:How about this image?",
69
- # "https://huggingface.co/spaces/HuggingFaceM4/idefics_playground/resolve/main/example_images/puppy.jpg?download=true",
70
- # "Can you describe it too?<end_of_utterance>",
71
- # """\nAssistant: A dog is lying on the floor, looking at the camera. It is looking directly at you. The dog has a white body and brown patches on its face and ears. Its eyes are dark. Its nose is black, and it has long, floppy ears, white paws, long fur, big eyes.<end_of_utterance>""",
72
- # "\nUser: What kind of breed is it?<end_of_utterance>",
73
- # """\nAssistant: It looks like a Cavalier King Charles Spaniel. These breeds are known for their gentle, affectionate nature. Their long ears and big, brown eyes give them a sweet, expressive face. Cavaliers are small dogs weighing between 13 and 20 pounds. They have a silky, medium-length coat that comes in various colors, including black and tan, tricolor, red and white, or black.<end_of_utterance>""",
74
- # "\nUser: What can you tell me about this breed of dogs?<end_of_utterance>",
75
- # """\nAssistant: One specific characteristic of the Cavalier King Charles Spaniel is its friendly and affectionate nature. This breed is known for being extremely sociable and forming strong bonds with their owners. They are often described as "velcro dogs" because they love to be close to their human companions, whether it's sitting on their laps, cuddling on the couch, or simply following them around the house.<end_of_utterance>""",
76
- # "\nUser: ghjkhjabnufs<end_of_utterance>",
77
- # """\nAssistant: That doesn’t seem to be a word. Could you ask me another way?<end_of_utterance>""",
78
- # "\nUser: Do you like Cavalier King Charles Spaniel?<end_of_utterance>",
79
- # """\nAssistant: I do not have personal opinions as I’m just a computer program. However, cavaliers are known for being excellent family pets due to their gentle and patient demeanor, making them great with children and other pets. Their affectionate nature also makes them ideal therapy dogs, as they can provide comfort and emotional support to those in need.<end_of_utterance>""",
80
- # "\nUser: How many dogs do you see in this image?",
81
- # "https://huggingface.co/spaces/HuggingFaceM4/idefics_playground/resolve/main/example_images/tennis_tsonga.jpg?download=true",
82
- # "<end_of_utterance>",
83
- # """\nAssistant: There is no dogs in this image. The picture shows a tennis player jumping to volley the ball.<end_of_utterance>""",
84
- # ]
85
-
86
  SYSTEM_PROMPT = [
87
  {
88
  "role": "system",
89
  "content": [
90
- {"type": "text", "text":
91
- "The following is a conversation between a highly knowledgeable and intelligent visual AI assistant, called Assistant, and a human user, called User. In the following interactions, \
 
92
  User and Assistant will converse in natural language, and Assistant will do its best to answer User’s questions. Assistant has the ability to perceive images and reason about the \
93
- content of visual inputs. Assistant was built to be respectful, polite and inclusive. It knows a lot, and always tells the truth. When prompted with an image, it does not make up facts."
94
  },
95
  ],
96
  }
97
- # {
98
- # "role": "user",
99
- # "content": [
100
- # {"type": "image", "image": "https://huggingface.co/spaces/HuggingFaceM4/idefics_playground/resolve/main/example_images/kittens-cats-pet-cute-preview.jpg?download=true"},
101
- # {"type": "text", "text": "Describe this image."},
102
- # ],
103
- # },
104
- # {
105
- # "role": "assistant",
106
- # "content": [
107
- # {"type": "text", "text": "Five kittens are standing together in the center of the photograph. From the left to right, there is one orange kitten, two kittens white and black stripes, and two brown kittens with an orange head. They are in bright green grass and it looks like they are moving forward."},
108
- # ],
109
- # },
110
- # {
111
- # "role": "user",
112
- # "content": [
113
- # {"type": "text", "text": "How about this image?"},
114
- # {"type": "image", "image": "https://huggingface.co/spaces/HuggingFaceM4/idefics_playground/resolve/main/example_images/puppy.jpg?download=true"},
115
- # {"type": "text", "text": "Can you describe it too?"},
116
- # ],
117
- # },
118
- # {
119
- # "role": "assistant",
120
- # "content": [
121
- # {"type": "text", "text": "A dog is lying on the floor, looking at the camera. It is looking directly at you. The dog has a white body and brown patches on its face and ears. Its eyes are dark. Its nose is black, and it has long, floppy ears, white paws, long fur, big eyes."},
122
- # ],
123
- # },
124
- # {
125
- # "role": "user",
126
- # "content": [
127
- # {"type": "text", "text": "What can you tell me about this breed of dogs?"},
128
- # ],
129
- # },
130
- # {
131
- # "role": "assistant",
132
- # "content": [
133
- # {"type": "text", "text": "One specific characteristic of the Cavalier King Charles Spaniel is its friendly and affectionate nature. This breed is known for being extremely sociable and forming strong bonds with their owners. They are often described as \"velcro dogs\" because they love to be close to their human companions, whether it's sitting on their laps, cuddling on the couch, or simply following them around the house."},
134
- # ],
135
- # },
136
- # {
137
- # "role": "user",
138
- # "content": [
139
- # {"type": "text", "text": "How many dogs do you see in the following image?"},
140
- # {"type": "image", "image": "https://huggingface.co/spaces/HuggingFaceM4/idefics_playground/resolve/main/example_images/tennis_tsonga.jpg?download=true"},
141
- # ],
142
- # },
143
- # {
144
- # "role": "assistant",
145
- # "content": [
146
- # {"type": "text", "text": "There are no dogs in this image. The picture shows a tennis player in the midst of a powerful swing."},
147
- # ],
148
- # },
149
  ]
150
 
151
-
152
  API_TOKEN = os.getenv("HF_AUTH_TOKEN")
153
  # IDEFICS_LOGO = "https://huggingface.co/spaces/HuggingFaceM4/idefics_playground/resolve/main/IDEFICS_logo.png"
154
  BOT_AVATAR = "IDEFICS_logo.png"
@@ -158,6 +68,7 @@ BOT_AVATAR = "IDEFICS_logo.png"
158
  def turn_is_pure_media(turn):
159
  return turn[1] is None
160
 
 
161
  def load_image_from_url(url):
162
  with urllib.request.urlopen(url) as response:
163
  image_data = response.read()
@@ -165,6 +76,7 @@ def load_image_from_url(url):
165
  image = Image.open(image_stream)
166
  return image
167
 
 
168
  def format_user_prompt_with_im_history_and_system_conditioning(
169
  user_prompt, chat_history
170
  ) -> List[Dict[str, Union[List, str]]]:
@@ -182,7 +94,9 @@ def format_user_prompt_with_im_history_and_system_conditioning(
182
 
183
  # Format history
184
  for turn in chat_history:
185
- if not resulting_messages or (resulting_messages and resulting_messages[-1]["role"] != "user"):
 
 
186
  resulting_messages.append(
187
  {
188
  "role": "user",
@@ -202,9 +116,7 @@ def format_user_prompt_with_im_history_and_system_conditioning(
202
  resulting_messages.append(
203
  {
204
  "role": "assistant",
205
- "content": [
206
- {"type": "text", "text": user_utterance.strip()}
207
- ]
208
  }
209
  )
210
 
@@ -213,9 +125,7 @@ def format_user_prompt_with_im_history_and_system_conditioning(
213
  resulting_messages.append(
214
  {
215
  "role": "user",
216
- "content": [
217
- {"type": "text", "text": user_prompt['text']}
218
- ],
219
  }
220
  )
221
  else:
@@ -223,12 +133,11 @@ def format_user_prompt_with_im_history_and_system_conditioning(
223
  resulting_messages.append(
224
  {
225
  "role": "user",
226
- "content": [{"type": "image"}] * len(user_prompt['files']) + [
227
- {"type": "text", "text": user_prompt['text']}
228
- ]
229
  }
230
  )
231
- resulting_images.extend([Image.open(im['path']) for im in user_prompt['files']])
232
 
233
  return resulting_messages, resulting_images
234
 
@@ -266,7 +175,7 @@ def model_inference(
266
  streamer = TextIteratorStreamer(
267
  PROCESSOR.tokenizer,
268
  skip_prompt=True,
269
- timeout=5.,
270
  )
271
 
272
  # Common parameters to all decoding strategies
@@ -289,13 +198,20 @@ def model_inference(
289
  generation_args["top_p"] = top_p
290
 
291
  # Creating model inputs
292
- resulting_text, resulting_images = format_user_prompt_with_im_history_and_system_conditioning(
 
 
 
293
  user_prompt=user_prompt,
294
  chat_history=chat_history,
295
  )
296
 
297
  prompt = PROCESSOR.apply_chat_template(resulting_text, add_generation_prompt=True)
298
- inputs = PROCESSOR(text=prompt, images=resulting_images if resulting_images else None, return_tensors="pt")
 
 
 
 
299
  inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
300
  generation_args.update(inputs)
301
 
@@ -323,6 +239,7 @@ def model_inference(
323
  print("Success - generated the following text:", acc_text)
324
  print("-----")
325
 
 
326
  # Hyper-parameters for generation
327
  max_new_tokens = gr.Slider(
328
  minimum=8,
@@ -382,19 +299,22 @@ problematic_callback = gr.CSVLogger()
382
 
383
 
384
  # Using Flagging for saving dope and problematic examples
385
- # Dope examples flagging
386
-
387
 
388
- # gr.Markdown("""## How to use?
389
 
390
- # There are two ways to provide image inputs:
391
- # - Using the image box on the left panel
392
- # - Using the inline syntax: `text<fake_token_around_image><image:URL_IMAGE><fake_token_around_image>text`
393
 
394
- # The second syntax allows inputting an arbitrary number of images.""")
395
 
396
 
397
- with gr.Blocks(fill_height=True, css=""".gradio-container .avatar-container {height: 40px width: 40px !important;}""") as demo:
 
 
 
398
  # model selector should be set to `visible=False` ultimately
399
  with gr.Row(elem_id="model_selector_row"):
400
  model_selector = gr.Dropdown(
@@ -410,7 +330,13 @@ with gr.Blocks(fill_height=True, css=""".gradio-container .avatar-container {hei
410
  decoding_strategy.change(
411
  fn=lambda selection: gr.Slider(
412
  visible=(
413
- selection in ["contrastive_sampling", "beam_sampling", "Top P Sampling", "sampling_top_k"]
 
 
 
 
 
 
414
  )
415
  ),
416
  inputs=decoding_strategy,
@@ -419,7 +345,13 @@ with gr.Blocks(fill_height=True, css=""".gradio-container .avatar-container {hei
419
  decoding_strategy.change(
420
  fn=lambda selection: gr.Slider(
421
  visible=(
422
- selection in ["contrastive_sampling", "beam_sampling", "Top P Sampling", "sampling_top_k"]
 
 
 
 
 
 
423
  )
424
  ),
425
  inputs=decoding_strategy,
@@ -437,7 +369,14 @@ with gr.Blocks(fill_height=True, css=""".gradio-container .avatar-container {hei
437
  # examples=[{"text": "hello"}, {"text": "hola"}, {"text": "merhaba"}],
438
  title="Idefics2 Playground",
439
  multimodal=True,
440
- additional_inputs=[model_selector, decoding_strategy, temperature, max_new_tokens, repetition_penalty, top_p],
 
 
 
 
 
 
 
441
  )
442
  with gr.Group():
443
  with gr.Row():
 
2
  import subprocess
3
 
4
  # Install flash attention
5
+ subprocess.run(
6
+ "pip install flash-attn --no-build-isolation",
7
+ env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
8
+ shell=True,
9
+ )
10
 
11
 
12
  import copy
 
32
 
33
  DEVICE = torch.device("cuda")
34
  MODELS = {
35
+ "idefics2-8b-chatty": Idefics2ForConditionalGeneration.from_pretrained(
 
 
 
 
 
 
 
36
  "HuggingFaceM4/idefics2-8b-chatty",
37
  torch_dtype=torch.bfloat16,
38
  _attn_implementation="flash_attention_2",
39
  trust_remote_code=True,
40
  token=os.environ["HF_AUTH_TOKEN"],
 
41
  ).to(DEVICE),
 
 
 
 
 
 
 
 
42
  }
43
  PROCESSOR = AutoProcessor.from_pretrained(
44
  "HuggingFaceM4/idefics2-8b",
45
  token=os.environ["HF_AUTH_TOKEN"],
46
  )
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  SYSTEM_PROMPT = [
49
  {
50
  "role": "system",
51
  "content": [
52
+ {
53
+ "type": "text",
54
+ "text": "The following is a conversation between a highly knowledgeable and intelligent visual AI assistant, called Assistant, and a human user, called User. In the following interactions, \
55
  User and Assistant will converse in natural language, and Assistant will do its best to answer User’s questions. Assistant has the ability to perceive images and reason about the \
56
+ content of visual inputs. Assistant was built to be respectful, polite and inclusive. It knows a lot, and always tells the truth. When prompted with an image, it does not make up facts.",
57
  },
58
  ],
59
  }
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
60
  ]
61
 
 
62
  API_TOKEN = os.getenv("HF_AUTH_TOKEN")
63
  # IDEFICS_LOGO = "https://huggingface.co/spaces/HuggingFaceM4/idefics_playground/resolve/main/IDEFICS_logo.png"
64
  BOT_AVATAR = "IDEFICS_logo.png"
 
68
  def turn_is_pure_media(turn):
69
  return turn[1] is None
70
 
71
+
72
  def load_image_from_url(url):
73
  with urllib.request.urlopen(url) as response:
74
  image_data = response.read()
 
76
  image = Image.open(image_stream)
77
  return image
78
 
79
+
80
  def format_user_prompt_with_im_history_and_system_conditioning(
81
  user_prompt, chat_history
82
  ) -> List[Dict[str, Union[List, str]]]:
 
94
 
95
  # Format history
96
  for turn in chat_history:
97
+ if not resulting_messages or (
98
+ resulting_messages and resulting_messages[-1]["role"] != "user"
99
+ ):
100
  resulting_messages.append(
101
  {
102
  "role": "user",
 
116
  resulting_messages.append(
117
  {
118
  "role": "assistant",
119
+ "content": [{"type": "text", "text": user_utterance.strip()}],
 
 
120
  }
121
  )
122
 
 
125
  resulting_messages.append(
126
  {
127
  "role": "user",
128
+ "content": [{"type": "text", "text": user_prompt["text"]}],
 
 
129
  }
130
  )
131
  else:
 
133
  resulting_messages.append(
134
  {
135
  "role": "user",
136
+ "content": [{"type": "image"}] * len(user_prompt["files"])
137
+ + [{"type": "text", "text": user_prompt["text"]}],
 
138
  }
139
  )
140
+ resulting_images.extend([Image.open(im["path"]) for im in user_prompt["files"]])
141
 
142
  return resulting_messages, resulting_images
143
 
 
175
  streamer = TextIteratorStreamer(
176
  PROCESSOR.tokenizer,
177
  skip_prompt=True,
178
+ timeout=5.0,
179
  )
180
 
181
  # Common parameters to all decoding strategies
 
198
  generation_args["top_p"] = top_p
199
 
200
  # Creating model inputs
201
+ (
202
+ resulting_text,
203
+ resulting_images,
204
+ ) = format_user_prompt_with_im_history_and_system_conditioning(
205
  user_prompt=user_prompt,
206
  chat_history=chat_history,
207
  )
208
 
209
  prompt = PROCESSOR.apply_chat_template(resulting_text, add_generation_prompt=True)
210
+ inputs = PROCESSOR(
211
+ text=prompt,
212
+ images=resulting_images if resulting_images else None,
213
+ return_tensors="pt",
214
+ )
215
  inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
216
  generation_args.update(inputs)
217
 
 
239
  print("Success - generated the following text:", acc_text)
240
  print("-----")
241
 
242
+
243
  # Hyper-parameters for generation
244
  max_new_tokens = gr.Slider(
245
  minimum=8,
 
299
 
300
 
301
  # Using Flagging for saving dope and problematic examples
302
+ # Dope examples flagging
303
+
304
 
305
+ # gr.Markdown("""## How to use?
306
 
307
+ # There are two ways to provide image inputs:
308
+ # - Using the image box on the left panel
309
+ # - Using the inline syntax: `text<fake_token_around_image><image:URL_IMAGE><fake_token_around_image>text`
310
 
311
+ # The second syntax allows inputting an arbitrary number of images.""")
312
 
313
 
314
+ with gr.Blocks(
315
+ fill_height=True,
316
+ css=""".gradio-container .avatar-container {height: 40px width: 40px !important;}""",
317
+ ) as demo:
318
  # model selector should be set to `visible=False` ultimately
319
  with gr.Row(elem_id="model_selector_row"):
320
  model_selector = gr.Dropdown(
 
330
  decoding_strategy.change(
331
  fn=lambda selection: gr.Slider(
332
  visible=(
333
+ selection
334
+ in [
335
+ "contrastive_sampling",
336
+ "beam_sampling",
337
+ "Top P Sampling",
338
+ "sampling_top_k",
339
+ ]
340
  )
341
  ),
342
  inputs=decoding_strategy,
 
345
  decoding_strategy.change(
346
  fn=lambda selection: gr.Slider(
347
  visible=(
348
+ selection
349
+ in [
350
+ "contrastive_sampling",
351
+ "beam_sampling",
352
+ "Top P Sampling",
353
+ "sampling_top_k",
354
+ ]
355
  )
356
  ),
357
  inputs=decoding_strategy,
 
369
  # examples=[{"text": "hello"}, {"text": "hola"}, {"text": "merhaba"}],
370
  title="Idefics2 Playground",
371
  multimodal=True,
372
+ additional_inputs=[
373
+ model_selector,
374
+ decoding_strategy,
375
+ temperature,
376
+ max_new_tokens,
377
+ repetition_penalty,
378
+ top_p,
379
+ ],
380
  )
381
  with gr.Group():
382
  with gr.Row():