VictorSanh committed
Commit 4273b28 • 1 Parent(s): fee6472
update with final transformers api

app_dialogue.py +27 -19
app_dialogue.py CHANGED
@@ -41,11 +41,9 @@ MODELS = {
 
 }
 
 PROCESSOR = AutoProcessor.from_pretrained(
-    "HuggingFaceM4/idefics2-
+    "HuggingFaceM4/idefics2-8b",
     token=os.environ["HF_AUTH_TOKEN"],
 )
-BAD_WORDS_IDS = PROCESSOR.tokenizer(["<image>", "<fake_token_around_image>"], add_special_tokens=False).input_ids
-EOS_WORDS_IDS = PROCESSOR.tokenizer("<end_of_utterance>", add_special_tokens=False).input_ids + [PROCESSOR.tokenizer.eos_token_id]
 
 SYSTEM_PROMPT = [  # Deactivating the system prompt for now, but if I were to reactivate it, I would need to a/ transform turns into dicts for applying the chat template, b/ manually overwrite the `default_template` to add the first line (that is not part of any turn), in particular for handling the bos_token.
     # """The following is a conversation between a highly knowledgeable and intelligent visual AI assistant, called Assistant, and a human user, called User. In the following interactions, User and Assistant will converse in natural language, and Assistant will do its best to answer User’s questions. Assistant has the ability to perceive images and reason about the content of visual inputs. Assistant was built to be respectful, polite and inclusive. It knows a lot, and always tells the truth. When prompted with an image, it does not make up facts.
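The hunk above swaps the pre-release checkpoint name for the released HuggingFaceM4/idefics2-8b and deletes the hand-built BAD_WORDS_IDS / EOS_WORDS_IDS token lists. A minimal sketch of loading this checkpoint outside the Space, assuming a single GPU and half precision (neither is specified by the commit):

import os

import torch
from transformers import AutoModelForVision2Seq, AutoProcessor

# Released checkpoint the commit switches to; the token is only needed
# while the repo is gated or private.
processor = AutoProcessor.from_pretrained(
    "HuggingFaceM4/idefics2-8b",
    token=os.environ.get("HF_AUTH_TOKEN"),
)
model = AutoModelForVision2Seq.from_pretrained(
    "HuggingFaceM4/idefics2-8b",
    torch_dtype=torch.bfloat16,  # assumption, not from the diff
)
model.to("cuda" if torch.cuda.is_available() else "cpu")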
@@ -90,12 +88,13 @@ def format_user_prompt_with_im_history_and_system_conditioning(
     Produces the resulting list that needs to go inside the processor.
     It handles the potential image(s), the history and the system conditioning.
     """
-
+    resulting_messages = copy.deepcopy(SYSTEM_PROMPT)
+    resulting_images = []
 
     # Format history
     for turn in chat_history:
-        if not
-
+        if not resulting_messages or (resulting_messages and resulting_messages[-1]["role"] != "user"):
+            resulting_messages.append(
                 {
                     "role": "user",
                     "content": [],
@@ -104,35 +103,45 @@
 
         if turn_is_pure_media(turn):
             media = turn[0][0]
-
+            resulting_messages[-1]["content"].append({"type": "image"})
+            resulting_images.append(Image.open(media))
         else:
             user_utterance, assistant_utterance = turn
-
-
+            resulting_messages[-1]["content"].append(
+                {"type": "text", "text": user_utterance.strip()}
+            )
+            resulting_messages.append(
                 {
                     "role": "assistant",
-                    "content": [
+                    "content": [
+                        {"type": "text", "text": assistant_utterance.strip()}
+                    ]
                 }
             )
 
     # Format current input
     if not user_prompt["files"]:
-
+        resulting_messages.append(
             {
                 "role": "user",
-                "content": [
+                "content": [
+                    {"type": "text", "text": user_prompt['text']}
+                ],
             }
         )
     else:
         # Choosing to put the image first (i.e. before the text), but this is an arbitrary choice.
-
+        resulting_messages.append(
             {
                 "role": "user",
-                "content": [
+                "content": [{"type": "image"}] * len(user_prompt['files']) + [
+                    {"type": "text", "text": user_prompt['text']}
+                ]
             }
         )
+        resulting_images.extend([Image.open(im['path']) for im in user_prompt['files']])
 
-    return
+    return resulting_messages, resulting_images
 
 
 def extract_images_from_msg_list(msg_list):
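The two hunks above rewrite the history formatter around plain message dicts: each text piece becomes a {"type": "text", ...} entry, and each image becomes a {"type": "image"} placeholder plus a PIL image collected in resulting_images. For orientation, here is the kind of pair the helper returns for a short chat; the utterances and filename are invented for illustration:

from PIL import Image

# History: an image upload, one answered text turn, then a fresh prompt.
resulting_messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},  # placeholder; the pixels travel separately
            {"type": "text", "text": "What is this?"},
        ],
    },
    {
        "role": "assistant",
        "content": [{"type": "text", "text": "A red bicycle."}],
    },
    {
        "role": "user",
        "content": [{"type": "text", "text": "What color is it?"}],
    },
]
# One PIL image per {"type": "image"} entry, in the same order.
resulting_images = [Image.open("bike.png")]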
@@ -176,8 +185,6 @@ def model_inference(
     generation_args = {
         "max_new_tokens": max_new_tokens,
         "repetition_penalty": repetition_penalty,
-        "bad_words_ids": BAD_WORDS_IDS,
-        "eos_token_id": EOS_WORDS_IDS,
         "streamer": streamer,
     }
 
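With the token-level constraints dropped, generation_args carries only sampling and streaming options. A sketch of how such a dict is typically consumed with transformers' TextIteratorStreamer; the threading pattern and concrete values are assumptions, since the generate call itself is outside this diff:

from threading import Thread

from transformers import TextIteratorStreamer

# Yields decoded text chunks as generate() produces them.
streamer = TextIteratorStreamer(
    PROCESSOR.tokenizer, skip_prompt=True, skip_special_tokens=True
)
generation_args = {
    "max_new_tokens": 512,      # illustrative value
    "repetition_penalty": 1.2,  # illustrative value
    "streamer": streamer,
}
generation_args.update(inputs)  # inputs built by the processor, next hunk

# generate() blocks, so run it in a thread and drain the streamer.
thread = Thread(target=model.generate, kwargs=generation_args)
thread.start()
for chunk in streamer:
    print(chunk, end="", flush=True)
thread.join()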
@@ -193,11 +200,12 @@ def model_inference(
     generation_args["top_p"] = top_p
 
     # Creating model inputs
-
+    resulting_text, resulting_images = format_user_prompt_with_im_history_and_system_conditioning(
         user_prompt=user_prompt,
         chat_history=chat_history,
     )
-
+    prompt = PROCESSOR.apply_chat_template(resulting_text, add_generation_prompt=True)
+    inputs = PROCESSOR(text=prompt, images=resulting_images if resulting_images else None, return_tensors="pt")
     inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
     generation_args.update(inputs)
 
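The encode now happens in two explicit steps: apply_chat_template renders the message dicts to a prompt string, with add_generation_prompt=True appending the cue for the assistant's reply, and the processor call binds the images to their placeholders. Roughly, for a single image (the rendered string in the comment is approximate; the exact text comes from the checkpoint's chat template):

from PIL import Image

image = Image.open("example.png")  # illustrative input
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image"},
            {"type": "text", "text": "What is in this image?"},
        ],
    }
]
prompt = PROCESSOR.apply_chat_template(messages, add_generation_prompt=True)
# prompt is roughly: "User:<image>What is in this image?<end_of_utterance>\nAssistant:"
inputs = PROCESSOR(text=prompt, images=[image], return_tensors="pt")
inputs = {k: v.to(DEVICE) for k, v in inputs.items()}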