ValueError: Image features and image tokens do not match: tokens: 1, features 576
#2 opened by DsnTgr
I get this error when running the demo:
ValueError Traceback (most recent call last)
Cell In[1], line 29
24 prompt = "A little girl is riding a bicycle at high speed. Focused, detailed, realistic."
25 image = load_image(
26 "./a girl.png"
27 )
---> 29 output = pipe(image=image, prompt=prompt).frames[0]
30 export_to_video(output, "HunyuanVideo-I2V-diffusers.mp4", fps=15)
File ~/miniconda3/envs/ai-train/lib/python3.10/site-packages/torch/utils/_contextlib.py:116, in context_decorator.<locals>.decorate_context(*args, **kwargs)
113 @functools.wraps(func)
114 def decorate_context(*args, **kwargs):
115 with ctx_factory():
--> 116 return func(*args, **kwargs)
File ~/miniconda3/envs/ai-train/lib/python3.10/site-packages/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py:751, in HunyuanVideoImageToVideoPipeline.__call__(self, image, prompt, prompt_2, negative_prompt, negative_prompt_2, height, width, num_frames, num_inference_steps, sigmas, true_cfg_scale, guidance_scale, num_videos_per_prompt, generator, latents, prompt_embeds, pooled_prompt_embeds, prompt_attention_mask, negative_prompt_embeds, negative_pooled_prompt_embeds, negative_prompt_attention_mask, output_type, return_dict, attention_kwargs, callback_on_step_end, callback_on_step_end_tensor_inputs, prompt_template, max_sequence_length)
749 # 4. Encode input prompt
750 transformer_dtype = self.transformer.dtype
--> 751 prompt_embeds, pooled_prompt_embeds, prompt_attention_mask = self.encode_prompt(
752 image=image,
753 prompt=prompt,
754 prompt_2=prompt_2,
755 prompt_template=prompt_template,
756 num_videos_per_prompt=num_videos_per_prompt,
757 prompt_embeds=prompt_embeds,
758 pooled_prompt_embeds=pooled_prompt_embeds,
759 prompt_attention_mask=prompt_attention_mask,
760 device=device,
761 max_sequence_length=max_sequence_length,
762 )
763 prompt_embeds = prompt_embeds.to(transformer_dtype)
764 prompt_attention_mask = prompt_attention_mask.to(transformer_dtype)
File ~/miniconda3/envs/ai-train/lib/python3.10/site-packages/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py:404, in HunyuanVideoImageToVideoPipeline.encode_prompt(self, image, prompt, prompt_2, prompt_template, num_videos_per_prompt, prompt_embeds, pooled_prompt_embeds, prompt_attention_mask, device, dtype, max_sequence_length)
389 def encode_prompt(
390 self,
391 image: torch.Tensor,
(...)
401 max_sequence_length: int = 256,
402 ):
403 if prompt_embeds is None:
--> 404 prompt_embeds, prompt_attention_mask = self._get_llama_prompt_embeds(
405 image,
406 prompt,
407 prompt_template,
408 num_videos_per_prompt,
409 device=device,
410 dtype=dtype,
411 max_sequence_length=max_sequence_length,
412 )
414 if pooled_prompt_embeds is None:
415 if prompt_2 is None:
File ~/miniconda3/envs/ai-train/lib/python3.10/site-packages/diffusers/pipelines/hunyuan_video/pipeline_hunyuan_video_image2video.py:277, in HunyuanVideoImageToVideoPipeline._get_llama_prompt_embeds(self, image, prompt, prompt_template, num_videos_per_prompt, device, dtype, max_sequence_length, num_hidden_layers_to_skip, image_embed_interleave)
273 prompt_attention_mask = text_inputs.attention_mask.to(device=device)
275 image_embeds = self.image_processor(image, return_tensors="pt").pixel_values.to(device)
--> 277 prompt_embeds = self.text_encoder(
278 input_ids=text_input_ids,
279 attention_mask=prompt_attention_mask,
280 pixel_values=image_embeds,
281 output_hidden_states=True,
282 ).hidden_states[-(num_hidden_layers_to_skip + 1)]
283 prompt_embeds = prompt_embeds.to(dtype=dtype)
285 image_emb_len = prompt_template.get("image_emb_len", 576)
File ~/miniconda3/envs/ai-train/lib/python3.10/site-packages/torch/nn/modules/module.py:1739, in Module._wrapped_call_impl(self, *args, **kwargs)
1737 return self._compiled_call_impl(*args, **kwargs) # type: ignore[misc]
1738 else:
-> 1739 return self._call_impl(*args, **kwargs)
File ~/miniconda3/envs/ai-train/lib/python3.10/site-packages/torch/nn/modules/module.py:1750, in Module._call_impl(self, *args, **kwargs)
1745 # If we don't have any hooks, we want to skip the rest of the logic in
1746 # this function, and just call forward.
1747 if not (self._backward_hooks or self._backward_pre_hooks or self._forward_hooks or self._forward_pre_hooks
1748 or _global_backward_pre_hooks or _global_backward_hooks
1749 or _global_forward_hooks or _global_forward_pre_hooks):
-> 1750 return forward_call(*args, **kwargs)
1752 result = None
1753 called_always_called_hooks = set()
File ~/miniconda3/envs/ai-train/lib/python3.10/site-packages/accelerate/hooks.py:170, in add_hook_to_module.<locals>.new_forward(module, *args, **kwargs)
168 output = module._old_forward(*args, **kwargs)
169 else:
--> 170 output = module._old_forward(*args, **kwargs)
171 return module._hf_hook.post_forward(module, output)
File ~/miniconda3/envs/ai-train/lib/python3.10/site-packages/transformers/utils/deprecation.py:172, in deprecate_kwarg.<locals>.wrapper.<locals>.wrapped_func(*args, **kwargs)
168 elif minimum_action in (Action.NOTIFY, Action.NOTIFY_ALWAYS) and not is_torchdynamo_compiling():
169 # DeprecationWarning is ignored by default, so we use FutureWarning instead
170 warnings.warn(message, FutureWarning, stacklevel=2)
--> 172 return func(*args, **kwargs)
File ~/miniconda3/envs/ai-train/lib/python3.10/site-packages/transformers/models/llava/modeling_llava.py:426, in LlavaForConditionalGeneration.forward(self, input_ids, pixel_values, attention_mask, position_ids, past_key_values, inputs_embeds, vision_feature_layer, vision_feature_select_strategy, labels, use_cache, output_attentions, output_hidden_states, return_dict, cache_position, logits_to_keep, image_sizes, **lm_kwargs)
424 n_image_tokens = (input_ids == self.config.image_token_index).sum()
425 n_image_features = image_features.shape[0] * image_features.shape[1]
--> 426 raise ValueError(
427 f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
428 )
429 image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
430 inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
ValueError: Image features and image tokens do not match: tokens: 1, features 576
Env:
Name: diffusers
Version: 0.33.0.dev0
Name: transformers
Version: 4.50.0.dev0
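For completeness, here is a minimal sketch of the full demo script that hits the error; only the last few lines appear in the traceback, so the checkpoint id and pipeline-loading calls are assumptions based on the standard diffusers image-to-video example, not copied from my actual script.

import torch
from diffusers import HunyuanVideoImageToVideoPipeline
from diffusers.utils import load_image, export_to_video

# Assumed checkpoint id and loading code; only the call at the bottom is
# visible in the traceback above.
pipe = HunyuanVideoImageToVideoPipeline.from_pretrained(
    "hunyuanvideo-community/HunyuanVideo-I2V",
    torch_dtype=torch.bfloat16,
)
pipe.to("cuda")

prompt = "A little girl is riding a bicycle at high speed. Focused, detailed, realistic."
image = load_image("./a girl.png")

# Raises the ValueError above inside encode_prompt -> _get_llama_prompt_embeds
# -> LlavaForConditionalGeneration.forward
output = pipe(image=image, prompt=prompt).frames[0]
export_to_video(output, "HunyuanVideo-I2V-diffusers.mp4", fps=15)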
This was the environment the model was tested with: https://github.com/huggingface/diffusers/pull/10983#issuecomment-2706597418
For other versions, we'll have to introduce version guards and try to fix it.
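A rough sketch of the kind of version guard meant here, assuming diffusers' is_transformers_version helper; the cutoff version and the branch comments are assumptions, not the actual fix, which would live in _get_llama_prompt_embeds in pipeline_hunyuan_video_image2video.py:

from diffusers.utils import is_transformers_version

if is_transformers_version("<", "4.47.0"):  # placeholder cutoff, not confirmed
    # Older transformers: LlavaForConditionalGeneration expands the single
    # <image> placeholder internally, so the current pipeline code works.
    pass
else:
    # Newer transformers expect one <image> token per image feature (576 for
    # a 24x24 patch grid), hence "tokens: 1, features 576" when the tokenized
    # prompt contains only one placeholder; the pipeline would need to expand
    # the placeholder itself on this branch.
    pass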