VictorSanh committed on
Commit
4273b28
1 Parent(s): fee6472

update with final transformers api

Browse files
Files changed (1) hide show
  1. app_dialogue.py +27 -19
app_dialogue.py CHANGED
@@ -41,11 +41,9 @@ MODELS = {
41
 
42
  }
43
  PROCESSOR = AutoProcessor.from_pretrained(
44
- "HuggingFaceM4/idefics2-tfrm-compatible",
45
  token=os.environ["HF_AUTH_TOKEN"],
46
  )
47
- BAD_WORDS_IDS = PROCESSOR.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
48
- EOS_WORDS_IDS = PROCESSOR.tokenizer("<end_of_utterance>", add_special_tokens=False).input_ids + [PROCESSOR.tokenizer.eos_token_id]
49
 
50
  SYSTEM_PROMPT = [ # Deactivating the system propmpt for now, but if I were to reactivate it, I would need to a/ transform turns into dict for applying the chat template, b/ manually overwrite the `default_template` to add the first line (that is not part of any turns), in particular for handling the bos_token.
51
  # """The following is a conversation between a highly knowledgeable and intelligent visual AI assistant, called Assistant, and a human user, called User. In the following interactions, User and Assistant will converse in natural language, and Assistant will do its best to answer User’s questions. Assistant has the ability to perceive images and reason about the content of visual inputs. Assistant was built to be respectful, polite and inclusive. It knows a lot, and always tells the truth. When prompted with an image, it does not make up facts.
@@ -90,12 +88,13 @@ def format_user_prompt_with_im_history_and_system_conditioning(
90
  Produces the resulting list that needs to go inside the processor.
91
  It handles the potential image(s), the history and the system conditionning.
92
  """
93
- resulting_list = copy.deepcopy(SYSTEM_PROMPT)
 
94
 
95
  # Format history
96
  for turn in chat_history:
97
- if not resulting_list or (resulting_list and resulting_list[-1]["role"] != "user"):
98
- resulting_list.append(
99
  {
100
  "role": "user",
101
  "content": [],
@@ -104,35 +103,45 @@ def format_user_prompt_with_im_history_and_system_conditioning(
104
 
105
  if turn_is_pure_media(turn):
106
  media = turn[0][0]
107
- resulting_list[-1]["content"].append(Image.open(media))
 
108
  else:
109
  user_utterance, assistant_utterance = turn
110
- resulting_list[-1]["content"].append(user_utterance.strip())
111
- resulting_list.append(
 
 
112
  {
113
  "role": "assistant",
114
- "content": [assistant_utterance]
 
 
115
  }
116
  )
117
 
118
  # Format current input
119
  if not user_prompt["files"]:
120
- resulting_list.append(
121
  {
122
  "role": "user",
123
- "content": [user_prompt['text']],
 
 
124
  }
125
  )
126
  else:
127
  # Choosing to put the image first (i.e. before the text), but this is an arbiratrary choice.
128
- resulting_list.append(
129
  {
130
  "role": "user",
131
- "content": [Image.open(im['path']) for im in user_prompt['files']] + [user_prompt['text']],
 
 
132
  }
133
  )
 
134
 
135
- return resulting_list
136
 
137
 
138
  def extract_images_from_msg_list(msg_list):
@@ -176,8 +185,6 @@ def model_inference(
176
  generation_args = {
177
  "max_new_tokens": max_new_tokens,
178
  "repetition_penalty": repetition_penalty,
179
- "bad_words_ids": BAD_WORDS_IDS,
180
- "eos_token_id": EOS_WORDS_IDS,
181
  "streamer": streamer,
182
  }
183
 
@@ -193,11 +200,12 @@ def model_inference(
193
  generation_args["top_p"] = top_p
194
 
195
  # Creating model inputs
196
- formated_prompt_list = format_user_prompt_with_im_history_and_system_conditioning(
197
  user_prompt=user_prompt,
198
  chat_history=chat_history,
199
  )
200
- inputs = PROCESSOR.apply_chat_template(formated_prompt_list, add_generation_prompt=True, return_tensors="pt")
 
201
  inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
202
  generation_args.update(inputs)
203
 
 
41
 
42
  }
43
  PROCESSOR = AutoProcessor.from_pretrained(
44
+ "HuggingFaceM4/idefics2-8b",
45
  token=os.environ["HF_AUTH_TOKEN"],
46
  )
 
 
47
 
48
  SYSTEM_PROMPT = [ # Deactivating the system propmpt for now, but if I were to reactivate it, I would need to a/ transform turns into dict for applying the chat template, b/ manually overwrite the `default_template` to add the first line (that is not part of any turns), in particular for handling the bos_token.
49
  # """The following is a conversation between a highly knowledgeable and intelligent visual AI assistant, called Assistant, and a human user, called User. In the following interactions, User and Assistant will converse in natural language, and Assistant will do its best to answer User’s questions. Assistant has the ability to perceive images and reason about the content of visual inputs. Assistant was built to be respectful, polite and inclusive. It knows a lot, and always tells the truth. When prompted with an image, it does not make up facts.
 
88
  Produces the resulting list that needs to go inside the processor.
89
  It handles the potential image(s), the history and the system conditionning.
90
  """
91
+ resulting_messages = copy.deepcopy(SYSTEM_PROMPT)
92
+ resulting_images = []
93
 
94
  # Format history
95
  for turn in chat_history:
96
+ if not resulting_messages or (resulting_messages and resulting_messages[-1]["role"] != "user"):
97
+ resulting_messages.append(
98
  {
99
  "role": "user",
100
  "content": [],
 
103
 
104
  if turn_is_pure_media(turn):
105
  media = turn[0][0]
106
+ resulting_messages[-1]["content"].append({"type": "image"})
107
+ resulting_images.append(Image.open(media))
108
  else:
109
  user_utterance, assistant_utterance = turn
110
+ resulting_messages[-1]["content"].append(
111
+ {"type": "text", "text": user_utterance.strip()}
112
+ )
113
+ resulting_messages.append(
114
  {
115
  "role": "assistant",
116
+ "content": [
117
+ {"type": "text", "text": user_utterance.strip()}
118
+ ]
119
  }
120
  )
121
 
122
  # Format current input
123
  if not user_prompt["files"]:
124
+ resulting_messages.append(
125
  {
126
  "role": "user",
127
+ "content": [
128
+ {"type": "text", "text": user_prompt['text']}
129
+ ],
130
  }
131
  )
132
  else:
133
  # Choosing to put the image first (i.e. before the text), but this is an arbiratrary choice.
134
+ resulting_messages.append(
135
  {
136
  "role": "user",
137
+ "content": [{"type": "image"}] * len(user_prompt['files']) + [
138
+ {"type": "text", "text": user_prompt['text']}
139
+ ]
140
  }
141
  )
142
+ resulting_images.extend([Image.open(im['path']) for im in user_prompt['files']])
143
 
144
+ return resulting_messages, resulting_images
145
 
146
 
147
  def extract_images_from_msg_list(msg_list):
 
185
  generation_args = {
186
  "max_new_tokens": max_new_tokens,
187
  "repetition_penalty": repetition_penalty,
 
 
188
  "streamer": streamer,
189
  }
190
 
 
200
  generation_args["top_p"] = top_p
201
 
202
  # Creating model inputs
203
+ resulting_text, resulting_images = format_user_prompt_with_im_history_and_system_conditioning(
204
  user_prompt=user_prompt,
205
  chat_history=chat_history,
206
  )
207
+ prompt = PROCESSOR.apply_chat_template(resulting_text, add_generation_prompt=True)
208
+ inputs = PROCESSOR(text=prompt, images=resulting_images if resulting_images else None, return_tensors="pt")
209
  inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
210
  generation_args.update(inputs)
211